cobweb-launcher 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ import time
2
+ from hashlib import md5
3
+ from base.log import log
4
+ from base.bbb import Queue, Seed
5
+ from base.interface import SchedulerInterface, StorerInterface
6
+ # from pympler import asizeof
7
+
8
+
9
class Scheduler:
    """Thread entry points that feed seeds into the crawl pipeline.

    The methods read ``self.stop``, ``self.size``, ``self.length``,
    ``self.queue`` and ``self.schedule``. None of them are defined in this
    class, so the concrete class it is combined with must supply them.
    """

    def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
        """Keep producing seeds with ``self.schedule()`` until ``self.stop`` is truthy.

        :param ready_seed_length: zero-argument callable; its result is
            compared against ``self.size``.
        :param get_scheduler_lock: zero-argument callable; scheduling only
            happens when it returns a truthy value.
        :param add_seed: callable that receives whatever ``self.schedule()``
            returns.
        :raises Exception: if this object's class is not a subclass of
            ``SchedulerInterface``.
        """
        cls = self.__class__
        if not issubclass(cls, SchedulerInterface):
            raise Exception("not have schedule function!")

        # A class literally named "Default" never schedules: flag it as
        # stopped and leave straight away.
        if cls.__name__ == "Default":
            self.stop = True
            return None

        while True:
            if self.stop:
                break
            if ready_seed_length() > self.size:
                # Enough seeds are already waiting; back off before re-checking.
                time.sleep(15)
                continue
            if get_scheduler_lock():
                add_seed(self.schedule())

        log.info("close thread: schedule_seed")

    def schedule_task(self, stop, get_seed, ready_seed_length):
        """Move seeds into ``self.queue`` until ``stop`` is set.

        :param stop: object with an ``is_set()`` method that ends the loop.
        :param get_seed: callable taking ``self.length``; its result is pushed
            onto ``self.queue``.
        :param ready_seed_length: zero-argument callable; a falsy result means
            there is nothing to fetch yet.
        """
        time.sleep(3)
        while not stop.is_set():
            if not ready_seed_length():
                pause = 15
            elif self.queue.length >= self.length:
                pause = 3
            else:
                self.queue.push(get_seed(self.length))
                continue
            time.sleep(pause)
        log.info("close thread: schedule_task")
46
+
47
+
48
class Spider:
    """Worker that pops seeds from a queue and runs the crawl callback on them."""

    def __init__(self, queue, max_retries=5):
        """
        :param queue: seed queue; seeds are popped from it and new or retried
            seeds are pushed back onto it.
        :param max_retries: a seed whose ``_retry`` counter exceeds this value
            is handed to ``del_seed`` instead of being crawled.
        """
        # One entry is pushed per seed being processed and popped when done.
        self.spider_in_progress = Queue()
        self.max_retries = max_retries
        self.queue = queue

    def spider_task(self, stop, func, item, del_seed):
        """Process seeds until ``stop`` is set.

        For every seed, ``func(item, seed)`` is iterated and each yielded
        value is handled by kind:

        * a value with a truthy ``table_name`` attribute: its ``struct_data``
          is collected for storage;
        * a ``Seed``: pushed onto the seed queue;
        * a ``list``/``tuple``: each element is wrapped in ``Seed`` if needed
          and the resulting list is pushed onto the seed queue;
        * a ``bool``: recorded as the crawl status of the current seed.

        :param stop: object with an ``is_set()`` method that ends the loop.
        :param func: crawl callback, called as ``func(item, seed)``.
        :param item: passed through to ``func`` unchanged.
        :param del_seed: callable invoked as ``del_seed(seed, spider_status=...)``.
        """
        while not stop.is_set():
            seed = self.queue.pop()
            if not seed:
                time.sleep(3)
                continue
            if seed._retry > self.max_retries:
                del_seed(seed, spider_status=False)
                continue
            try:
                self.spider_in_progress.push(1)
                log.info("spider seed: " + str(seed))
                status = None
                store_queue = None
                store_data = []
                for it in func(item, seed):
                    if getattr(it, "table_name", None):
                        # Compare against None rather than relying on
                        # truthiness: a queue object may evaluate as falsy
                        # while it is empty.
                        if store_queue is None:
                            store_queue = it.queue()
                        store_data.append(it.struct_data)
                    elif isinstance(it, Seed):
                        self.queue.push(it)
                    elif isinstance(it, (list, tuple)):
                        self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
                    elif isinstance(it, bool):
                        status = it

                if store_queue is not None and store_data:
                    # The seed is appended after its data before the batch is
                    # pushed to the store queue.
                    store_data.append(seed)
                    store_queue.push(store_data)

                if status is not None:
                    if status:
                        del_seed(seed, spider_status=True)
                    else:
                        seed._retry += 1
                        self.queue.push(seed)

            except Exception as e:
                # Any failure counts as one retry; the seed goes back on the queue.
                seed._retry += 1
                self.queue.push(seed)
                log.info(str(e))
            finally:
                self.spider_in_progress.pop()
        log.info("close thread: spider")
100
+
101
+
102
class Storer:
    """Thread entry point that drains a store queue and persists batches.

    Reads ``self.table``, ``self.queue``, ``self.length`` and ``self.store``;
    none are defined here, so the concrete class must supply them.
    """

    def store_task(self, stop, last, reset_seed, set_storer):
        """Persist queued data in batches until ``stop`` is set.

        :param stop: object with an ``is_set()`` method that ends the loop.
        :param last: object with an ``is_set()`` method; when set, the queue
            is drained even if it holds no more than ``self.length`` entries.
        :param reset_seed: called with the batch's seeds when ``self.store``
            returns a falsy value.
        :param set_storer: called as ``set_storer(store_key_id, seeds)`` when
            ``self.store`` returns a truthy value.
        :raises Exception: if the instance has no truthy ``store`` attribute.
        """
        cls = self.__class__
        if not issubclass(cls, StorerInterface):
            return None

        if not getattr(self, "store", None):
            raise Exception("not have store function!")

        storer_name = cls.__name__ + self.table
        # Key derived from class name + table, passed to set_storer.
        store_key_id = md5(storer_name.encode()).hexdigest()

        while not stop.is_set():
            if not (last.is_set() or self.queue.length > self.length):
                time.sleep(3)
                continue

            seeds = []
            data_list = []
            data = self.queue.pop()
            while data:
                if not isinstance(data, Seed):
                    data_list.append(data)
                else:
                    seeds.append(data)
                    # Only stop on a seed, and only once the batch is large
                    # enough.
                    if len(data_list) >= self.length:
                        break
                data = self.queue.pop()

            if not data_list:
                time.sleep(3)
                continue

            if self.store(data_list):
                set_storer(store_key_id, seeds)
            else:
                reset_seed(seeds)

        log.info(f"close thread: {storer_name}")
File without changes
@@ -0,0 +1,104 @@
1
+ import time
2
+ # from pympler import asizeof
3
+ from single.nest import Queue
4
+ from single.nest import struct_queue_name
5
+ from single.nest import SchedulerInterface, StorerInterface
6
+
7
+
8
+ # class Transceiver:
9
class Distributor:
    """Holds the seed queue plus one dynamically created queue per storer.

    Every queue is stored as an instance attribute, so the attribute names
    are the queue names.
    """

    def __init__(self):
        self.seed_queue = Queue()

    @property
    def queue_names(self):
        """Return the names of all instance attributes as a tuple."""
        return tuple(vars(self))

    @property
    def used_memory(self):
        """Return an approximate deep size of this object in bytes.

        The estimate uses only the standard library: it sums
        ``sys.getsizeof`` over this object and everything reachable through
        instance ``__dict__`` attributes and the built-in containers
        (dict, list, tuple, set, frozenset, deque). Objects that keep data
        elsewhere (for example in ``__slots__``) are counted shallowly.
        """
        import sys
        from collections import deque

        module_type = type(sys)
        visited = set()
        pending = [self]
        total = 0
        while pending:
            obj = pending.pop()
            if id(obj) in visited:
                continue
            visited.add(id(obj))
            total += sys.getsizeof(obj)
            if isinstance(obj, dict):
                pending.extend(obj.keys())
                pending.extend(obj.values())
            elif isinstance(obj, (list, tuple, set, frozenset, deque)):
                pending.extend(obj)
            elif hasattr(obj, "__dict__") and not isinstance(obj, (type, module_type)):
                # Follow instance attributes, but never wander into classes
                # or modules, which are shared and not owned by this object.
                pending.append(vars(obj))
        return total

    def create_queue(self, queue_name: str):
        """Create a new empty queue stored under the attribute ``queue_name``."""
        setattr(self, queue_name, Queue())

    def get_queue(self, queue_name: str):
        """Return the attribute named ``queue_name``.

        :raises AttributeError: if no such attribute exists.
        """
        return getattr(self, queue_name)

    def deal_item(self, item):
        """Route one produced item to the matching queue.

        Objects whose class is named ``Seed`` go to ``seed_queue``. Objects
        with a truthy ``table_name`` have their ``serialization`` pushed to
        the queue named by ``struct_queue_name(class_name, table_name)``.
        Anything else is ignored.
        """
        icn = item.__class__.__name__
        if icn == "Seed":
            self.seed_queue.push(item)
        elif getattr(item, "table_name", None):
            queue_name = struct_queue_name(icn, item.table_name)
            getattr(self, queue_name).push(item.serialization)

    def distribute(self, callback, *args, **kwargs):
        """Call ``callback(*args, **kwargs)`` and route every item it produces.

        :return: ``None`` when the callback's result is falsy, otherwise
            ``True`` after all items have been passed to ``deal_item``.
        """
        iterable = callback(*args, **kwargs)
        if not iterable:
            return None
        for result in iterable:
            self.deal_item(result)
        return True
43
+
44
+
45
class Scheduler:
    """Thread entry point that keeps the seed queue topped up.

    Reads ``self.stop``, ``self.queue``, ``self.length`` and ``self.schedule``;
    none are defined here, so the concrete class must supply them.
    """

    def schedule_task(self, distribute):
        """Feed ``self.schedule`` to ``distribute`` until ``self.stop`` is truthy.

        :param distribute: callable that receives ``self.schedule`` whenever
            the queue holds fewer than ``self.length`` entries.
        :raises Exception: if the instance has no truthy ``schedule`` attribute.
        """
        if not issubclass(self.__class__, SchedulerInterface):
            return None

        if not getattr(self, "schedule", None):
            raise Exception("not have schedule function!")

        while True:
            if self.stop:
                return None

            has_room = self.queue.length < self.length
            if has_room:
                distribute(self.schedule)
                continue

            # Queue is full enough: print the marker line and wait.
            print("------------")
            time.sleep(15)
63
+
64
+
65
class Spider:
    """Worker that pops seeds and hands them to the crawl callback."""

    def __init__(self, queue):
        """
        :param queue: seed queue this worker pops from.
        """
        self.queue = queue
        # One entry is pushed per seed being processed and popped when done.
        self.spider_in_progress = Queue()

    def spider_task(self, stop_event, distribute, func, item):
        """Process seeds until ``stop_event`` is set.

        :param stop_event: object with an ``is_set()`` method that ends the loop.
        :param distribute: called as ``distribute(func, item, seed)`` per seed.
        :param func: crawl callback, forwarded to ``distribute``.
        :param item: forwarded to ``distribute`` unchanged.
        """
        while True:
            if stop_event.is_set():
                break
            seed = self.queue.pop()
            if seed:
                try:
                    self.spider_in_progress.push(1)
                    distribute(func, item, seed)
                except Exception as err:
                    # Errors are printed and the seed is not retried here.
                    print(err)
                finally:
                    self.spider_in_progress.pop()
            else:
                # Nothing to do right now; wait before polling again.
                time.sleep(3)
84
+
85
+
86
class Storer:
    """Thread entry point that drains a store queue in batches.

    Reads ``self.queue``, ``self.length`` and ``self.store``; none are defined
    here, so the concrete class must supply them.
    """

    def store_task(self, stop_event, last_event, distribute):
        """Persist queued data in batches until ``stop_event`` is set.

        A batch is taken when ``last_event`` is set or the queue holds more
        than ``self.length`` entries. Each batch has at most ``self.length``
        entries and is passed on as ``distribute(self.store, batch)``.

        :param stop_event: object with an ``is_set()`` method that ends the loop.
        :param last_event: object with an ``is_set()`` method that forces a
            flush of whatever is queued.
        :param distribute: callable invoked as ``distribute(self.store, data_list)``.
        :raises Exception: if the instance has no truthy ``store`` attribute.
        """
        if not issubclass(self.__class__, StorerInterface):
            return None

        if not getattr(self, "store", None):
            raise Exception("not have store function!")

        while not stop_event.is_set():
            if last_event.is_set() or self.queue.length > self.length:
                batch_size = min(self.queue.length, self.length)
                data_list = [self.queue.pop() for _ in range(batch_size)]
                if data_list:
                    distribute(self.store, data_list)
                    # More data may be waiting; check again without sleeping.
                    continue

            # Nothing to store: sleep instead of spinning on the queue length,
            # which would otherwise keep a CPU core busy.
            time.sleep(3)
cobweb/single/nest.py ADDED
@@ -0,0 +1,153 @@
1
+ import time
2
+ import threading
3
+
4
+ from single.nest import Seed, DBItem
5
+ from single.nest import struct_queue_name, restore_table_name
6
+ from single.nest import Distributor, Scheduler, Spider, Storer
7
+
8
+
9
def init_task_seed(seeds):
    """Yield a ``Seed`` for each initial seed value.

    :param seeds: a list/tuple of seed values, or a single ``str``/``dict``.
        Falsy input and any other type produce no seeds.
    """
    if not seeds:
        return None
    if isinstance(seeds, (list, tuple)):
        yield from (Seed(entry) for entry in seeds)
    elif isinstance(seeds, (str, dict)):
        yield Seed(seeds)
17
+
18
+
19
def parse_storer_info(storer_info):
    """Group storer descriptions by the name of their ``DB`` class.

    :param storer_info: a single object whose class is named ``StorerInfo``,
        or a list/tuple of such objects. Anything else yields an empty dict.
    :return: ``{db_name: {"StorerDB": DB, "db_args_list": [info[1:], ...]}}``
    """
    if storer_info.__class__.__name__ == 'StorerInfo':
        infos = [storer_info]
    elif isinstance(storer_info, (tuple, list)):
        infos = storer_info
    else:
        infos = []

    grouped = {}
    for info in infos:
        db_cls = info.DB
        key = db_cls.__name__
        if key not in grouped:
            grouped[key] = {"StorerDB": db_cls, "db_args_list": []}
        # Everything after the first field is kept as this storer's arguments.
        grouped[key]["db_args_list"].append(info[1:])
    return grouped
31
+
32
+
33
def check(stop_event, last_event, distributor, scheduler, spider, storer_list):
    """Watch the pipeline and set ``stop_event`` once everything has drained.

    Every 3 seconds the scheduler, seed queue and in-progress counter are
    inspected. When all look idle, ``last_event`` is set, the function waits
    10 seconds, and then checks every storer queue. If any still holds data,
    ``last_event`` is cleared and monitoring continues; otherwise the loop
    ends and ``stop_event`` is set.
    """
    finished = False
    while not finished:
        time.sleep(3)
        pipeline_idle = (
            scheduler.stop
            and not distributor.seed_queue.length
            and not spider.spider_in_progress.length
        )
        if not pipeline_idle:
            continue

        last_event.set()
        time.sleep(10)
        if any(storer.queue.length for storer in storer_list):
            # Storers still have work; go back to normal monitoring.
            last_event.clear()
        else:
            finished = True
    stop_event.set()
52
+
53
+
54
def cobweb(task):
    """
    Task launch decorator.

    :param task: task configuration. This function reads ``task.SchedulerInfo``,
        ``task.storer_info``, ``task.start_seed`` and ``task.spider_num``.
    :return: a decorator that starts the scheduler, spider, storer and monitor
        threads for the decorated function and returns that function unchanged.
    """
    def decorator(func):
        """
        Wire up and start all worker threads for ``func``.

        ``func`` is handed to the spider threads together with ``item``, a
        namespace class that gets one attribute per storer backend name.
        The original note describes the call as::

            func(Item, seed)
            Item:
                Item.Textfile()
                Item.Console()
        """
        # project task_name start_seed spider_num queue_length scheduler_info storer_info

        storer_list = []

        # Program-end event: passed to the storers and to `check`.
        last_event = threading.Event()
        # Stop-collection event: passed to spiders, storers and `check`.
        stop_event = threading.Event()

        # Create the distributor.
        distributor = Distributor()

        # Build the scheduler class dynamically from the configured backend.
        SchedulerDB, table, sql, length, size = task.SchedulerInfo
        SchedulerTmp = type('Scheduler', (Scheduler, SchedulerDB), {})

        # Initialise the scheduler.
        scheduler = SchedulerTmp(table=table, sql=sql, length=length, size=size, queue=distributor.seed_queue)

        # Initialise the spider.
        spider = Spider(queue=distributor.seed_queue)

        # Parse the storer configuration.
        storer_data = parse_storer_info(task.storer_info)

        # Namespace class that will carry one item class per storer backend.
        item = type("item", (object,), {})
        for db_name, db_entry in storer_data.items():
            # Build the storer class dynamically from the configured backend.
            StorerDB = db_entry["StorerDB"]
            StorerTmp = type('Storer', (Storer, StorerDB), {})
            for storer_db_args in db_entry["db_args_list"]:
                table, fields, length = storer_db_args
                if not getattr(item, db_name, None):
                    instance = type(db_name, (DBItem,), {})
                    setattr(item, db_name, instance)
                # Register the table and its fields on the item class.
                getattr(item, db_name).init_item(table, fields)
                # Create the storage queue.
                storer_queue = struct_queue_name(db_name, table)
                distributor.create_queue(queue_name=storer_queue)
                queue = distributor.get_queue(queue_name=storer_queue)
                # Initialise the storer.
                table_name = restore_table_name(table_name=table)
                storer = StorerTmp(table=table_name, fields=fields, length=length, queue=queue)
                storer_list.append(storer)

        # Push the initial seeds.
        distributor.distribute(init_task_seed, seeds=task.start_seed)

        # Start the scheduler.
        threading.Thread(
            target=scheduler.schedule_task,
            args=(distributor.distribute,),
            name="single_scheduler_task"
        ).start()

        # Start the spiders.
        for index in range(task.spider_num):
            threading.Thread(
                target=spider.spider_task,
                args=(stop_event, distributor.distribute, func, item),
                name=f"single_spider_task:{index}"
            ).start()

        # Start the storers.
        for storer in storer_list:
            threading.Thread(
                target=storer.store_task,
                args=(stop_event, last_event, distributor.distribute),
                name=f"single_store_task:{storer.table}",
            ).start()

        # Start the monitor thread.
        threading.Thread(
            target=check, name="check",
            args=(
                stop_event, last_event, distributor,
                scheduler, spider, storer_list
            )
        ).start()

        # Return the function so the decorated name is not rebound to None.
        return func
    return decorator
150
+
151
+
152
+
153
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Juannie
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,45 @@
1
+ Metadata-Version: 2.1
2
+ Name: cobweb-launcher
3
+ Version: 0.0.1
4
+ Summary: spider_hole
5
+ Home-page: https://github.com/Juannie-PP/cobweb
6
+ Author: Juannie-PP
7
+ Author-email: 2604868278@qq.com
8
+ License: MIT
9
+ Keywords: cobweb
10
+ Classifier: Programming Language :: Python :: 3
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: requests (>=2.19.1)
15
+ Requires-Dist: oss2 (>=2.18.1)
16
+ Requires-Dist: redis (>=4.4.4)
17
+ Requires-Dist: aliyun-log-python-sdk
18
+
19
+ # cobweb
20
+
21
+ > 通用爬虫框架: 1.单机模式采集框架;2.分布式采集框架
22
+ >
23
+ > 5部分
24
+ >
25
+ > 1. starter -- 启动器
26
+ >
27
+ > 2. scheduler -- 调度器
28
+ >
29
+ > 3. distributor -- 分发器
30
+ >
31
+ > 4. storer -- 存储器
32
+ >
33
+ > 5. utils -- 工具函数
34
+ >
35
+
36
+ To do:
37
+ - 队列优化完善,使用queue的机制wait()同步各模块执行?
38
+ - 日志功能完善,单机模式调度和保存数据写入文件,结构化输出各任务日志
39
+ - 去重过滤(布隆过滤器等)
40
+ - 防丢失(单机模式可以通过日志文件进行检查种子)
41
+ - 自定义数据库的功能
42
+ - excel、mysql、redis数据完善
43
+
44
+
45
+ ![img.png](https://image-luyuan.oss-cn-hangzhou.aliyuncs.com/image/D2388CDC-B9E5-4CE4-9F2C-7D173763B6A8.png)
@@ -0,0 +1,37 @@
1
+ cobweb/__init__.py,sha256=E6b7Sfy6WJTinGMY8xUpnx_zAH1U_TGmdjt_wOEpi8I,57
2
+ cobweb/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ cobweb/base/bbb.py,sha256=iX2Xn5AukRussL8xHKTGFfgODWTMaFA3JnvQWd02XDQ,5615
4
+ cobweb/base/config.py,sha256=qZY26tziCeENdopGyVSTCyd_8B30S37GbWx0-_g7EiY,5357
5
+ cobweb/base/decorators.py,sha256=gb2puZLRHv_348ZTIAFLhKuJWKZ6a9_tzduCS8p1-UQ,3089
6
+ cobweb/base/hash_table.py,sha256=-EPHRMNOHHbdjxZJUuDxghUfwrbMA05sqrldHAgrIco,1885
7
+ cobweb/base/interface.py,sha256=LM6C0eh-d1b2CxjtiHKfP8I3XhhlYQR5r-3MD6TMIc4,1037
8
+ cobweb/base/log.py,sha256=Gb3_y4IzTo5pJohTggBCU9rK6-ZN3hgTOHkoXHyN6CU,2384
9
+ cobweb/base/queue_tmp.py,sha256=NS4qBHKq2o-R78Jpv5xp7TtOHIMg8y0livilTVK49M8,1527
10
+ cobweb/base/request.py,sha256=dHTR7qMHbIIW4ggpTAg4io1TBAYH77teYU4bmcWPXH0,2318
11
+ cobweb/base/task.py,sha256=ztgNh4_tgy95pe3REBfMLKkwf7HaShvp-fdRIWJiJXo,1230
12
+ cobweb/base/utils.py,sha256=NSSgCBE4u1yTpXZrjg8RIepYedo4ZdM38rhDObVfRhI,325
13
+ cobweb/db/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ cobweb/db/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ cobweb/db/base/client_db.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
16
+ cobweb/db/base/oss_db.py,sha256=LYRsh26-Fttc6sjc9oOVsCd93jZSja6dtRc8G2Em-1E,3812
17
+ cobweb/db/base/redis_db.py,sha256=2-YMrTpiLNGwA7_bt62HYKevVeJr1Y_n88JKjPg1V3s,7636
18
+ cobweb/db/base/redis_dbv3.py,sha256=u-Tmexl0nrYVVRCCbxAjcH6fyRx7CP4J7iW4BdO7q98,8354
19
+ cobweb/db/scheduler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ cobweb/db/scheduler/default.py,sha256=m-zzC2cbHGxplEW5OoB9Vj3nJm30Xl0sAQ94wpPb7Yw,122
21
+ cobweb/db/scheduler/textfile.py,sha256=EiOxV8h99ouIr2HvmpM9B90QY3hqGNPMeQEnps_RG-c,869
22
+ cobweb/db/storer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
+ cobweb/db/storer/console.py,sha256=QbHnZ3ou0AR6c35iVy2hYEbKlYYTnFaD8-bUODeNt14,210
24
+ cobweb/db/storer/loghub.py,sha256=Pb0OwVteIllYjL2cIyBlc1WWau1PHn9K1rlAGrr0M3k,1815
25
+ cobweb/db/storer/redis.py,sha256=jK_RirqgSaV4aIWSuySIm5f1ZfZiULqFj2kman2H-Qw,440
26
+ cobweb/db/storer/textfile.py,sha256=yAvtbPkScjZ298H25kWsI0MDg2JuI2Im4m2qmPEUNTM,443
27
+ cobweb/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
+ cobweb/distributed/launcher.py,sha256=IV2jd1hLyt1YyhyN3leSMrtbN5APsmvqblr2NXy18xg,6163
29
+ cobweb/distributed/models.py,sha256=7ypYQaiHP91LbPE0u5Lb-9LDazg74UW7KacK4-ai1tM,4438
30
+ cobweb/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
+ cobweb/single/models.py,sha256=lu8teNWnWcUwZFra8XmqyhzOAf3UyuEztwBr1Ne6pUs,2898
32
+ cobweb/single/nest.py,sha256=mL8q9a5BjtoeUyzXCIVw_vyUsNY8ltbvQpYIIpZEDFU,5012
33
+ cobweb_launcher-0.0.1.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
34
+ cobweb_launcher-0.0.1.dist-info/METADATA,sha256=jyujfg_Grc9wymEL9UlDkJvU1zS_N7IC7ZxIpirY6jM,1208
35
+ cobweb_launcher-0.0.1.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
36
+ cobweb_launcher-0.0.1.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
37
+ cobweb_launcher-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.40.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ cobweb