cobweb-launcher 0.0.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,140 @@
1
+ import time
2
+ from hashlib import md5
3
+ from base.log import log
4
+ from base.bbb import Queue, Seed
5
+ from base.interface import SchedulerInterface, StorerInterface
6
+ # from pympler import asizeof
7
+
8
+
9
class Scheduler:
    """Mixin holding the scheduling thread loops.

    The loops read ``stop``, ``size``, ``length``, ``queue`` and call
    ``schedule()`` on ``self``; those are expected to come from the class
    this mixin is combined with.
    """

    def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
        """Feed newly scheduled seeds to ``add_seed`` until ``self.stop`` is truthy.

        :param ready_seed_length: callable returning the current number of ready seeds
        :param get_scheduler_lock: callable; a truthy result permits one scheduling round
        :param add_seed: callable receiving the result of ``self.schedule()``
        :raises Exception: when the instance is not a ``SchedulerInterface``
        """
        if not isinstance(self, SchedulerInterface):
            raise Exception("not have schedule function!")

        # A class literally named "Default" never schedules: mark it stopped and leave.
        if type(self).__name__ == "Default":
            self.stop = True
            return None

        while not self.stop:
            if ready_seed_length() > self.size:
                # More ready seeds than the configured size: back off.
                time.sleep(15)
            elif get_scheduler_lock():
                add_seed(self.schedule())

        log.info("close thread: schedule_seed")

    def schedule_task(self, stop, get_seed, ready_seed_length):
        """Move ready seeds into ``self.queue`` until ``stop`` is set.

        :param stop: event-like object exposing ``is_set()``
        :param get_seed: callable taking a count and returning seeds to push
        :param ready_seed_length: callable returning the current number of ready seeds
        """
        time.sleep(3)
        while not stop.is_set():
            if not ready_seed_length():
                # Nothing ready yet.
                pause = 15
            elif self.queue.length >= self.length:
                # Local queue already holds at least ``self.length`` entries.
                pause = 3
            else:
                self.queue.push(get_seed(self.length))
                continue
            time.sleep(pause)
        log.info("close thread: schedule_task")
46
+
47
+
48
class Spider:
    """Worker that pops seeds from a queue and runs the crawl function on them."""

    def __init__(self, queue, max_retries=5):
        self.queue = queue
        self.max_retries = max_retries
        # One marker is pushed per seed while it is being processed.
        self.spider_in_progress = Queue()

    def spider_task(self, stop, func, item, del_seed):
        """Process seeds from ``self.queue`` until ``stop`` is set.

        :param stop: event-like object exposing ``is_set()``
        :param func: crawl callable, iterated as ``func(item, seed)``
        :param item: object handed through to ``func`` unchanged
        :param del_seed: callable invoked as ``del_seed(seed, spider_status=<bool>)``
        """
        while not stop.is_set():
            seed = self.queue.pop()
            if not seed:
                time.sleep(3)
                continue
            if seed._retry > self.max_retries:
                # Retry budget exhausted: report the seed as failed.
                del_seed(seed, spider_status=False)
                continue

            try:
                self.spider_in_progress.push(1)
                log.info("spider seed: " + str(seed))
                outcome = None
                target_queue = None
                records = []

                for produced in func(item, seed):
                    if getattr(produced, "table_name", None):
                        # Storable item: the first one seen decides the target queue.
                        if not target_queue:
                            target_queue = produced.queue()
                        records.append(produced.struct_data)
                    elif isinstance(produced, Seed):
                        self.queue.push(produced)
                    elif isinstance(produced, (list, tuple)):
                        follow_ups = []
                        for entry in produced:
                            follow_ups.append(entry if isinstance(entry, Seed) else Seed(entry))
                        self.queue.push(follow_ups)
                    elif isinstance(produced, bool):
                        outcome = produced

                if target_queue and records:
                    # The seed is appended after its records in the same push.
                    records.append(seed)
                    target_queue.push(records)

                if outcome is not None:
                    if not outcome:
                        seed._retry += 1
                        self.queue.push(seed)
                    else:
                        del_seed(seed, spider_status=True)

            except Exception as e:
                # Any failure counts as one retry; the seed goes back on the queue.
                seed._retry += 1
                self.queue.push(seed)
                log.info(str(e))
            finally:
                self.spider_in_progress.pop()
        log.info("close thread: spider")
100
+
101
+
102
class Storer:
    """Mixin holding the storing thread loop.

    Reads ``table``, ``queue``, ``length`` and calls ``store()`` on ``self``.
    """

    def store_task(self, stop, last, reset_seed, set_storer):
        """Drain ``self.queue`` in batches and hand them to ``self.store``.

        :param stop: event-like object exposing ``is_set()``; ends the loop
        :param last: event-like object; when set, batches are flushed regardless of size
        :param reset_seed: callable receiving the batch's seeds when ``store`` is falsy
        :param set_storer: callable receiving ``(store_key_id, seeds)`` when ``store`` is truthy
        :returns: ``None`` immediately when the instance is not a ``StorerInterface``
        :raises Exception: when no usable ``store`` attribute exists
        """
        if not isinstance(self, StorerInterface):
            return None

        if not getattr(self, "store", None):
            raise Exception("not have store function!")

        storer_name = type(self).__name__ + self.table
        store_key_id = md5(storer_name.encode()).hexdigest()

        while not stop.is_set():

            if last.is_set() or self.queue.length > self.length:
                seeds = []
                data_list = []

                data = self.queue.pop()
                while data:
                    if not isinstance(data, Seed):
                        data_list.append(data)
                    else:
                        seeds.append(data)
                        # A batch is only closed right after a seed entry.
                        if len(data_list) >= self.length:
                            break
                    data = self.queue.pop()

                if data_list:
                    if self.store(data_list):
                        set_storer(store_key_id, seeds)
                    else:
                        reset_seed(seeds)
                    # Something was stored: look for the next batch right away.
                    continue

            time.sleep(3)

        log.info(f"close thread: {storer_name}")
File without changes
@@ -0,0 +1,104 @@
1
+ import time
2
+ # from pympler import asizeof
3
+ from single.nest import Queue
4
+ from single.nest import struct_queue_name
5
+ from single.nest import SchedulerInterface, StorerInterface
6
+
7
+
8
+ # class Transceiver:
9
class Distributor:
    """Route crawl results into named in-memory queues.

    Each queue is stored as an instance attribute. ``seed_queue`` is created
    by ``__init__``; further queues are added with :meth:`create_queue`.
    """

    def __init__(self):
        self.seed_queue = Queue()

    @property
    def queue_names(self):
        """Return the names of all instance attributes as a tuple."""
        return tuple(self.__dict__)

    @property
    def used_memory(self):
        """Return an estimate, in bytes, of the memory held by this distributor.

        The module-level ``pympler`` import is commented out, so the name
        ``asizeof`` was undefined here and every access raised ``NameError``.
        ``pympler`` is now imported lazily; when it is not installed the
        property falls back to a shallow ``sys.getsizeof`` sum over the
        instance and its attributes, which does not follow references into
        the queues' contents.
        """
        try:
            from pympler import asizeof
        except ImportError:
            import sys
            return sys.getsizeof(self) + sum(
                sys.getsizeof(value) for value in vars(self).values()
            )
        return asizeof.asizeof(self)

    def create_queue(self, queue_name: str):
        """Create a new ``Queue`` and store it under attribute ``queue_name``."""
        setattr(self, queue_name, Queue())

    def get_queue(self, queue_name: str):
        """Return the attribute named ``queue_name``.

        :raises AttributeError: when no such attribute exists
        """
        return getattr(self, queue_name)

    def deal_item(self, item):
        """Push ``item`` onto the queue that matches its kind.

        Objects whose class is named ``"Seed"`` go to ``seed_queue``. Objects
        with a truthy ``table_name`` have their ``serialization`` pushed to
        the queue named by ``struct_queue_name(class_name, table_name)``.
        Anything else is ignored.
        """
        icn = item.__class__.__name__
        if icn == "Seed":
            self.seed_queue.push(item)
        elif getattr(item, "table_name", None):
            queue_name = struct_queue_name(icn, item.table_name)
            getattr(self, queue_name).push(item.serialization)

    def distribute(self, callback, *args, **kwargs):
        """Call ``callback(*args, **kwargs)`` and dispatch every result it yields.

        :returns: ``None`` when the callback's return value is falsy,
            otherwise ``True`` after all results were passed to
            :meth:`deal_item`
        """
        iterable = callback(*args, **kwargs)
        if not iterable:
            return None
        for result in iterable:
            self.deal_item(result)
        return True
43
+
44
+
45
class Scheduler:
    """Mixin holding the single-process scheduling loop.

    Reads ``stop``, ``queue``, ``length`` and ``schedule`` from ``self``.
    """

    def schedule_task(self, distribute):
        """Keep the seed queue topped up until ``self.stop`` is truthy.

        :param distribute: callable invoked as ``distribute(self.schedule)``
        :returns: ``None`` immediately when the instance is not a ``SchedulerInterface``
        :raises Exception: when no usable ``schedule`` attribute exists
        """
        if not isinstance(self, SchedulerInterface):
            return None

        if not getattr(self, "schedule", None):
            raise Exception("not have schedule function!")

        while not self.stop:
            if self.queue.length >= self.length:
                # Queue already holds at least ``self.length`` entries: wait.
                print("------------")
                time.sleep(15)
                continue
            distribute(self.schedule)
63
+
64
+
65
class Spider:
    """Worker that pops seeds and hands them to the crawl function via a distributor."""

    def __init__(self, queue):
        # One marker is pushed per seed while it is being processed.
        self.spider_in_progress = Queue()
        self.queue = queue

    def spider_task(self, stop_event, distribute, func, item):
        """Process seeds from ``self.queue`` until ``stop_event`` is set.

        :param stop_event: event-like object exposing ``is_set()``
        :param distribute: callable invoked as ``distribute(func, item, seed)``
        :param func: crawl callable forwarded to ``distribute``
        :param item: object forwarded to ``distribute`` unchanged
        """
        while not stop_event.is_set():
            seed = self.queue.pop()
            if seed:
                try:
                    self.spider_in_progress.push(1)
                    distribute(func, item, seed)
                except Exception as e:
                    # Failures are only printed; the seed is not re-queued here.
                    print(e)
                finally:
                    self.spider_in_progress.pop()
            else:
                # Queue was empty: idle before polling again.
                time.sleep(3)
84
+
85
+
86
class Storer:
    """Mixin holding the single-process storing loop.

    Reads ``queue``, ``length`` and ``store`` from ``self``.
    """

    def store_task(self, stop_event, last_event, distribute):
        """Drain ``self.queue`` in batches of at most ``self.length`` items.

        A batch is taken when ``last_event`` is set or the queue holds more
        than ``self.length`` entries, and is dispatched as
        ``distribute(self.store, batch)``.

        :param stop_event: event-like object exposing ``is_set()``; ends the loop
        :param last_event: event-like object; when set, flush regardless of queue size
        :param distribute: callable invoked as ``distribute(self.store, data_list)``
        :returns: ``None`` immediately when the instance is not a ``StorerInterface``
        :raises Exception: when no usable ``store`` attribute exists
        """
        if not issubclass(self.__class__, StorerInterface):
            return None

        if not getattr(self, "store", None):
            raise Exception("not have store function!")

        while not stop_event.is_set():
            if last_event.is_set() or self.queue.length > self.length:
                data_length = min(self.queue.length, self.length)
                data_list = [self.queue.pop() for _ in range(data_length)]
                if data_list:
                    distribute(self.store, data_list)
                    # More data may be waiting: check again without idling.
                    continue
            # Nothing to store right now. Without this pause the loop spun at
            # full CPU; 3 s matches the idle sleep used by the spider loop.
            time.sleep(3)
cobweb/single/nest.py ADDED
@@ -0,0 +1,153 @@
1
+ import time
2
+ import threading
3
+
4
+ from single.nest import Seed, DBItem
5
+ from single.nest import struct_queue_name, restore_table_name
6
+ from single.nest import Distributor, Scheduler, Spider, Storer
7
+
8
+
9
def init_task_seed(seeds):
    """Yield a ``Seed`` for every initial seed description.

    :param seeds: a list/tuple of seed values, or a single ``str``/``dict``;
        falsy input and any other type yield nothing
    """
    if not seeds:
        return None
    if isinstance(seeds, (list, tuple)):
        yield from (Seed(entry) for entry in seeds)
    elif isinstance(seeds, (str, dict)):
        yield Seed(seeds)
17
+
18
+
19
def parse_storer_info(storer_info):
    """Group storer descriptions by the name of their DB class.

    :param storer_info: a single info record, or a list/tuple of them. A
        record exposes a ``DB`` attribute (a class) and supports slicing;
        everything after its first element is kept as the storer arguments.
    :returns: ``{db_name: {"StorerDB": <DB class>, "db_args_list": [args, ...]}}``;
        an empty dict for any other input
    """
    # A single record used to be recognised only by the exact class name
    # 'StorerInfo'. A subclass instance (still a tuple) was then iterated
    # field by field and failed on ``.DB``. Any object exposing ``DB`` is now
    # treated as one record.
    is_single = (
        storer_info.__class__.__name__ == 'StorerInfo'
        or hasattr(storer_info, "DB")
    )
    if is_single:
        storer_info_list = [storer_info]
    elif isinstance(storer_info, (tuple, list)):
        storer_info_list = storer_info
    else:
        storer_info_list = []

    storer_data = {}
    for info in storer_info_list:
        db_name = info.DB.__name__
        entry = storer_data.setdefault(
            db_name, {"StorerDB": info.DB, "db_args_list": []}
        )
        entry["db_args_list"].append(info[1:])
    return storer_data
31
+
32
+
33
def check(stop_event, last_event, distributor, scheduler, spider, storer_list):
    """Watch the pipeline and set ``stop_event`` once everything has drained.

    Every 3 seconds: when the scheduler has stopped, the seed queue is empty
    and no seed is in progress, ``last_event`` is set and the storers get 10
    seconds to flush. If any storer queue still holds data afterwards,
    ``last_event`` is cleared and polling resumes.
    """
    drained = False
    while not drained:
        time.sleep(3)
        pipeline_idle = (
            scheduler.stop
            and not distributor.seed_queue.length
            and not spider.spider_in_progress.length
        )
        if not pipeline_idle:
            continue

        last_event.set()
        time.sleep(10)
        if any(storer.queue.length for storer in storer_list):
            # Data is still pending somewhere: go back to normal polling.
            last_event.clear()
        else:
            drained = True
    stop_event.set()
52
+
53
+
54
def cobweb(task):
    """Task launch decorator.

    Decorating a crawl function builds the distributor, scheduler, spider and
    storers described by ``task`` and starts their threads immediately.

    :param task: task configuration; ``SchedulerInfo``, ``storer_info``,
        ``start_seed`` and ``spider_num`` are read from it
    :returns: the decorator to apply to the crawl function
    """
    def decorator(func):
        """Start all worker threads for ``func`` and return ``func``.

        ``func`` is called as ``func(item, seed)``. ``item`` carries one
        attribute per configured storer DB class, named after that class
        (e.g. ``item.Textfile``, ``item.Console``).

        The original returned nothing, which rebound the decorated name to
        ``None``; ``func`` is now returned so it stays usable.
        """
        storer_list = []

        # Program-finished event: set once crawling looks done so storers flush.
        last_event = threading.Event()
        # Stop event: ends the spider and storer loops.
        stop_event = threading.Event()

        # Create the distributor.
        distributor = Distributor()

        # Build the scheduler class dynamically from the configured backend.
        SchedulerDB, table, sql, length, size = task.SchedulerInfo
        SchedulerTmp = type('Scheduler', (Scheduler, SchedulerDB), {})

        # Initialise the scheduler.
        scheduler = SchedulerTmp(table=table, sql=sql, length=length, size=size, queue=distributor.seed_queue)

        # Initialise the spider.
        spider = Spider(queue=distributor.seed_queue)

        # Parse the storer configuration.
        storer_data = parse_storer_info(task.storer_info)

        # Namespace class handed to ``func``; gets one attribute per storer DB.
        item = type("item", (object,), {})
        for db_name, db_entry in storer_data.items():
            # Build the storer class dynamically from the configured backend.
            StorerDB = db_entry["StorerDB"]
            StorerTmp = type('Storer', (Storer, StorerDB), {})
            for storer_table, storer_fields, storer_length in db_entry["db_args_list"]:
                if not getattr(item, db_name, None):
                    instance = type(db_name, (DBItem,), {})
                    setattr(item, db_name, instance)
                # Register the table and its fields on the item class.
                getattr(item, db_name).init_item(storer_table, storer_fields)
                # Create the storage queue.
                storer_queue = struct_queue_name(db_name, storer_table)
                distributor.create_queue(queue_name=storer_queue)
                queue = distributor.get_queue(queue_name=storer_queue)
                # Initialise the storer.
                table_name = restore_table_name(table_name=storer_table)
                storer = StorerTmp(table=table_name, fields=storer_fields, length=storer_length, queue=queue)
                storer_list.append(storer)

        # Push the initial seeds.
        distributor.distribute(init_task_seed, seeds=task.start_seed)

        # Start the scheduler.
        threading.Thread(
            target=scheduler.schedule_task,
            args=(distributor.distribute,),
            name="single_scheduler_task"
        ).start()

        # Start the spiders.
        for index in range(task.spider_num):
            threading.Thread(
                target=spider.spider_task,
                args=(stop_event, distributor.distribute, func, item),
                name=f"single_spider_task:{index}"
            ).start()

        # Start the storers.
        for storer in storer_list:
            threading.Thread(
                target=storer.store_task,
                args=(stop_event, last_event, distributor.distribute),
                name=f"single_store_task:{storer.table}",
            ).start()

        # Start the watchdog that decides when everything is finished.
        threading.Thread(
            target=check, name="check",
            args=(
                stop_event, last_event, distributor,
                scheduler, spider, storer_list
            )
        ).start()

        return func
    return decorator
150
+
151
+
152
+
153
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Juannie
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,45 @@
1
+ Metadata-Version: 2.1
2
+ Name: cobweb-launcher
3
+ Version: 0.0.1
4
+ Summary: spider_hole
5
+ Home-page: https://github.com/Juannie-PP/cobweb
6
+ Author: Juannie-PP
7
+ Author-email: 2604868278@qq.com
8
+ License: MIT
9
+ Keywords: cobweb
10
+ Classifier: Programming Language :: Python :: 3
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: requests (>=2.19.1)
15
+ Requires-Dist: oss2 (>=2.18.1)
16
+ Requires-Dist: redis (>=4.4.4)
17
+ Requires-Dist: aliyun-log-python-sdk
18
+
19
+ # cobweb
20
+
21
+ > 通用爬虫框架: 1.单机模式采集框架;2.分布式采集框架
22
+ >
23
+ > 5部分
24
+ >
25
+ > 1. starter -- 启动器
26
+ >
27
+ > 2. scheduler -- 调度器
28
+ >
29
+ > 3. distributor -- 分发器
30
+ >
31
+ > 4. storer -- 存储器
32
+ >
33
+ > 5. utils -- 工具函数
34
+ >
35
+
36
+ ## TODO
37
+ - 队列优化完善,使用queue的机制wait()同步各模块执行?
38
+ - 日志功能完善,单机模式调度和保存数据写入文件,结构化输出各任务日志
39
+ - 去重过滤(布隆过滤器等)
40
+ - 防丢失(单机模式可以通过日志文件进行检查种子)
41
+ - 自定义数据库的功能
42
+ - excel、mysql、redis数据完善
43
+
44
+
45
+ ![img.png](https://image-luyuan.oss-cn-hangzhou.aliyuncs.com/image/D2388CDC-B9E5-4CE4-9F2C-7D173763B6A8.png)
@@ -0,0 +1,37 @@
1
+ cobweb/__init__.py,sha256=E6b7Sfy6WJTinGMY8xUpnx_zAH1U_TGmdjt_wOEpi8I,57
2
+ cobweb/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ cobweb/base/bbb.py,sha256=iX2Xn5AukRussL8xHKTGFfgODWTMaFA3JnvQWd02XDQ,5615
4
+ cobweb/base/config.py,sha256=qZY26tziCeENdopGyVSTCyd_8B30S37GbWx0-_g7EiY,5357
5
+ cobweb/base/decorators.py,sha256=gb2puZLRHv_348ZTIAFLhKuJWKZ6a9_tzduCS8p1-UQ,3089
6
+ cobweb/base/hash_table.py,sha256=-EPHRMNOHHbdjxZJUuDxghUfwrbMA05sqrldHAgrIco,1885
7
+ cobweb/base/interface.py,sha256=LM6C0eh-d1b2CxjtiHKfP8I3XhhlYQR5r-3MD6TMIc4,1037
8
+ cobweb/base/log.py,sha256=Gb3_y4IzTo5pJohTggBCU9rK6-ZN3hgTOHkoXHyN6CU,2384
9
+ cobweb/base/queue_tmp.py,sha256=NS4qBHKq2o-R78Jpv5xp7TtOHIMg8y0livilTVK49M8,1527
10
+ cobweb/base/request.py,sha256=dHTR7qMHbIIW4ggpTAg4io1TBAYH77teYU4bmcWPXH0,2318
11
+ cobweb/base/task.py,sha256=ztgNh4_tgy95pe3REBfMLKkwf7HaShvp-fdRIWJiJXo,1230
12
+ cobweb/base/utils.py,sha256=NSSgCBE4u1yTpXZrjg8RIepYedo4ZdM38rhDObVfRhI,325
13
+ cobweb/db/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ cobweb/db/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ cobweb/db/base/client_db.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
16
+ cobweb/db/base/oss_db.py,sha256=LYRsh26-Fttc6sjc9oOVsCd93jZSja6dtRc8G2Em-1E,3812
17
+ cobweb/db/base/redis_db.py,sha256=2-YMrTpiLNGwA7_bt62HYKevVeJr1Y_n88JKjPg1V3s,7636
18
+ cobweb/db/base/redis_dbv3.py,sha256=u-Tmexl0nrYVVRCCbxAjcH6fyRx7CP4J7iW4BdO7q98,8354
19
+ cobweb/db/scheduler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ cobweb/db/scheduler/default.py,sha256=m-zzC2cbHGxplEW5OoB9Vj3nJm30Xl0sAQ94wpPb7Yw,122
21
+ cobweb/db/scheduler/textfile.py,sha256=EiOxV8h99ouIr2HvmpM9B90QY3hqGNPMeQEnps_RG-c,869
22
+ cobweb/db/storer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
+ cobweb/db/storer/console.py,sha256=QbHnZ3ou0AR6c35iVy2hYEbKlYYTnFaD8-bUODeNt14,210
24
+ cobweb/db/storer/loghub.py,sha256=Pb0OwVteIllYjL2cIyBlc1WWau1PHn9K1rlAGrr0M3k,1815
25
+ cobweb/db/storer/redis.py,sha256=jK_RirqgSaV4aIWSuySIm5f1ZfZiULqFj2kman2H-Qw,440
26
+ cobweb/db/storer/textfile.py,sha256=yAvtbPkScjZ298H25kWsI0MDg2JuI2Im4m2qmPEUNTM,443
27
+ cobweb/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
+ cobweb/distributed/launcher.py,sha256=IV2jd1hLyt1YyhyN3leSMrtbN5APsmvqblr2NXy18xg,6163
29
+ cobweb/distributed/models.py,sha256=7ypYQaiHP91LbPE0u5Lb-9LDazg74UW7KacK4-ai1tM,4438
30
+ cobweb/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
+ cobweb/single/models.py,sha256=lu8teNWnWcUwZFra8XmqyhzOAf3UyuEztwBr1Ne6pUs,2898
32
+ cobweb/single/nest.py,sha256=mL8q9a5BjtoeUyzXCIVw_vyUsNY8ltbvQpYIIpZEDFU,5012
33
+ cobweb_launcher-0.0.1.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
34
+ cobweb_launcher-0.0.1.dist-info/METADATA,sha256=jyujfg_Grc9wymEL9UlDkJvU1zS_N7IC7ZxIpirY6jM,1208
35
+ cobweb_launcher-0.0.1.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
36
+ cobweb_launcher-0.0.1.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
37
+ cobweb_launcher-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.40.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ cobweb