cobweb-launcher 1.2.49__py3-none-any.whl → 1.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/base/__init__.py +141 -4
- cobweb/base/basic.py +28 -82
- cobweb/base/common_queue.py +13 -0
- cobweb/base/dotting.py +1 -1
- cobweb/base/request.py +14 -2
- cobweb/base/seed.py +10 -6
- cobweb/constant.py +16 -0
- cobweb/crawlers/crawler.py +51 -181
- cobweb/db/redis_db.py +28 -0
- cobweb/launchers/__init__.py +2 -2
- cobweb/launchers/launcher.py +110 -141
- cobweb/launchers/launcher_api.py +66 -114
- cobweb/launchers/launcher_pro.py +76 -194
- cobweb/pipelines/base_pipeline.py +54 -0
- cobweb/pipelines/loghub_pipeline.py +34 -0
- cobweb/pipelines/pipeline.py +25 -49
- cobweb/schedulers/__init__.py +0 -2
- cobweb/schedulers/scheduler_redis.py +5 -8
- cobweb/setting.py +29 -6
- cobweb/utils/dotting.py +10 -42
- cobweb_/__init__.py +2 -0
- cobweb_/base/__init__.py +9 -0
- cobweb_/base/common_queue.py +30 -0
- cobweb_/base/decorators.py +40 -0
- cobweb_/base/item.py +46 -0
- cobweb_/base/log.py +94 -0
- cobweb_/base/request.py +82 -0
- cobweb_/base/response.py +23 -0
- cobweb_/base/seed.py +114 -0
- cobweb_/constant.py +94 -0
- cobweb_/crawlers/__init__.py +1 -0
- cobweb_/crawlers/crawler.py +184 -0
- cobweb_/db/__init__.py +2 -0
- cobweb_/db/api_db.py +82 -0
- cobweb_/db/redis_db.py +130 -0
- cobweb_/exceptions/__init__.py +1 -0
- cobweb_/exceptions/oss_db_exception.py +28 -0
- cobweb_/launchers/__init__.py +3 -0
- cobweb_/launchers/launcher.py +235 -0
- cobweb_/launchers/launcher_air.py +88 -0
- cobweb_/launchers/launcher_api.py +221 -0
- cobweb_/launchers/launcher_pro.py +222 -0
- cobweb_/pipelines/__init__.py +3 -0
- cobweb_/pipelines/pipeline.py +69 -0
- cobweb_/pipelines/pipeline_console.py +22 -0
- cobweb_/pipelines/pipeline_loghub.py +34 -0
- cobweb_/setting.py +74 -0
- cobweb_/utils/__init__.py +5 -0
- cobweb_/utils/bloom.py +58 -0
- cobweb_/utils/dotting.py +32 -0
- cobweb_/utils/oss.py +94 -0
- cobweb_/utils/tools.py +42 -0
- {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/METADATA +1 -1
- cobweb_launcher-1.3.2.dist-info/RECORD +110 -0
- cobweb_launcher-1.3.2.dist-info/top_level.txt +2 -0
- cobweb_new/__init__.py +2 -0
- cobweb_new/base/__init__.py +72 -0
- cobweb_new/base/common_queue.py +53 -0
- cobweb_new/base/decorators.py +72 -0
- cobweb_new/base/item.py +46 -0
- cobweb_new/base/log.py +94 -0
- cobweb_new/base/request.py +82 -0
- cobweb_new/base/response.py +23 -0
- cobweb_new/base/seed.py +118 -0
- cobweb_new/constant.py +105 -0
- cobweb_new/crawlers/__init__.py +1 -0
- cobweb_new/crawlers/crawler-new.py +85 -0
- cobweb_new/crawlers/crawler.py +170 -0
- cobweb_new/db/__init__.py +2 -0
- cobweb_new/db/api_db.py +82 -0
- cobweb_new/db/redis_db.py +158 -0
- cobweb_new/exceptions/__init__.py +1 -0
- cobweb_new/exceptions/oss_db_exception.py +28 -0
- cobweb_new/launchers/__init__.py +3 -0
- cobweb_new/launchers/launcher.py +237 -0
- cobweb_new/launchers/launcher_air.py +88 -0
- cobweb_new/launchers/launcher_api.py +161 -0
- cobweb_new/launchers/launcher_pro.py +96 -0
- cobweb_new/launchers/tesss.py +47 -0
- cobweb_new/pipelines/__init__.py +3 -0
- cobweb_new/pipelines/pipeline.py +68 -0
- cobweb_new/pipelines/pipeline_console.py +22 -0
- cobweb_new/pipelines/pipeline_loghub.py +34 -0
- cobweb_new/setting.py +95 -0
- cobweb_new/utils/__init__.py +5 -0
- cobweb_new/utils/bloom.py +58 -0
- cobweb_new/utils/oss.py +94 -0
- cobweb_new/utils/tools.py +42 -0
- cobweb/schedulers/scheduler_api.py +0 -72
- cobweb_launcher-1.2.49.dist-info/RECORD +0 -44
- cobweb_launcher-1.2.49.dist-info/top_level.txt +0 -1
- {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,237 @@
|
|
1
|
+
import importlib
import inspect
import threading
import time

from inspect import isgenerator
from typing import Callable, Union

from cobweb import setting
# fix: was `from constant import ...` — must be the package-qualified path,
# consistent with the sibling modules (launcher_air imports cobweb.constant).
from cobweb.constant import DealModel, LogTemplate
from cobweb.utils import dynamic_load_class
from cobweb.base import Seed, Queue, logger, TaskQueue
|
13
|
+
|
14
|
+
|
15
|
+
class Launcher(threading.Thread):
    """Base launcher thread.

    Merges user settings into ``cobweb.setting``, registers crawler/pipeline
    worker threads and supervises them until the task finishes.  Subclasses
    provide the scheduling threads via ``_init_schedule_thread``.
    """

    __CUSTOM_FUNC__ = {}

    def __init__(self, task, project, custom_setting=None, **kwargs):
        """
        :param task: task name (used for redis keys / log messages).
        :param project: project name, combined with ``task``.
        :param custom_setting: a dict, a module, or an importable module path
            whose attributes override ``cobweb.setting`` (upper-cased).
        :param kwargs: extra overrides; highest priority.
        """
        super().__init__()
        self.task = task
        self.project = project
        self.custom_func = dict()
        self.app_time = int(time.time())

        # Merge order: custom_setting (dict or module) first, then kwargs.
        _setting = dict()

        if custom_setting:
            if isinstance(custom_setting, dict):
                _setting = custom_setting
            else:
                if isinstance(custom_setting, str):
                    custom_setting = importlib.import_module(custom_setting)
                if not inspect.ismodule(custom_setting):
                    # fix: bare `raise Exception` carried no information;
                    # TypeError is still caught by callers catching Exception.
                    raise TypeError(
                        "custom_setting must be a dict, a module, "
                        "or an importable module path"
                    )
                for k, v in custom_setting.__dict__.items():
                    if not k.startswith("__") and not inspect.ismodule(v):
                        _setting[k] = v

        _setting.update(**kwargs)

        for k, v in _setting.items():
            setattr(setting, k.upper(), v)

        self.scheduling_wait_time = setting.SCHEDULING_WAIT_TIME
        self.inserting_wait_time = setting.INSERTING_WAIT_TIME
        self.removing_wait_time = setting.REMOVING_WAIT_TIME

        self.scheduling_size = setting.SCHEDULING_SIZE
        self.inserting_size = setting.INSERTING_SIZE
        self.removing_size = setting.REMOVING_SIZE

        self.todo_queue_size = setting.TODO_QUEUE_SIZE
        self.seed_queue_size = setting.SEED_QUEUE_SIZE
        self.request_queue_size = setting.REQUEST_QUEUE_SIZE
        self.download_queue_size = setting.DOWNLOAD_QUEUE_SIZE
        self.response_queue_size = setting.RESPONSE_QUEUE_SIZE
        self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
        self.delete_queue_size = setting.DELETE_QUEUE_SIZE
        self.done_queue_size = setting.DONE_QUEUE_SIZE

        self.stop = threading.Event()   # stop event: set when the task ends
        self.pause = threading.Event()  # pause event: set while idle

        self.crawler_path = setting.CRAWLER
        self.pipeline_path = setting.PIPELINE

        self._threads = []

        # In-flight item registries per redis key suffix ('todo'/'download').
        self._task_info = dict(todo={}, download={})

        # fix: `doing_seeds` is read by `_polling` (and subclasses) but was
        # never initialized here.
        self.doing_seeds = {}

        self.before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS

        self.todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
        self.new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
        self.done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
        self.upload_queue_wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
        self.seed_reset_seconds = setting.SEED_RESET_SECONDS

        self.spider_max_retries = setting.SPIDER_MAX_RETRIES
        self.spider_thread_num = setting.SPIDER_THREAD_NUM
        self.spider_time_sleep = setting.SPIDER_TIME_SLEEP
        self.spider_max_count = setting.SPIDER_MAX_COUNT
        self.time_window = setting.TIME_WINDOW

        self.done_model = setting.DONE_MODEL
        self.task_model = setting.TASK_MODEL

        self.filter_field = setting.FILTER_FIELD

    @staticmethod
    def insert_seed(seed: Union[Seed, dict]):
        """Push a seed (dict is coerced to Seed) onto the SEED queue."""
        if isinstance(seed, dict):
            seed = Seed(seed)
        TaskQueue.SEED.push(seed)

    @property
    def request(self):
        """
        Register a custom request function.
        use case:
            from cobweb.base import Request, BaseItem
            @launcher.request
            def request(seed: Seed) -> Union[Request, BaseItem]:
                ...
                yield Request(seed.url, seed)
        """
        def decorator(func):
            self.custom_func['request'] = func
            # fix: the decorator must return the function, otherwise the
            # decorated name is rebound to None at the call site.
            return func
        return decorator

    @property
    def download(self):
        """
        Register a custom download function.
        use case:
            from cobweb.base import Request, Response, Seed, BaseItem
            @launcher.download
            def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
                ...
                yield Response(item.seed, response)
        """
        def decorator(func):
            self.custom_func['download'] = func
            return func  # fix: keep the decorated name bound to the function
        return decorator

    @property
    def parse(self):
        """
        Register a custom parse function; xxxItem is a user-defined storage item.
        use case:
            from cobweb.base import Request, Response
            @launcher.parse
            def parse(item: Response) -> BaseItem:
                ...
                yield xxxItem(seed, **kwargs)
        """
        def decorator(func):
            self.custom_func['parse'] = func
            return func  # fix: keep the decorated name bound to the function
        return decorator

    def remove_working_items(self, key, items):
        """Drop finished items from the in-flight registry for ``key``
        ('todo' or 'download')."""
        for item in items:
            self._task_info[key].pop(item, None)

    def check_alive(self):
        """Start registered worker threads and keep supervising until stop."""
        while not self.stop.is_set():
            if not self.pause.is_set():
                for thread in self._threads:
                    # fix: a Thread object can only be started once; guard on
                    # ``ident`` so a finished thread does not make ``start()``
                    # raise RuntimeError on every supervision pass.
                    if not thread.is_alive() and thread.ident is None:
                        thread.start()
            time.sleep(1)

    def _add_thread(self, func, num=1, obj=None, name=None, args=()):
        """Register ``num`` (unstarted) threads running ``func(obj, *args)``.

        Threads are only created here; ``check_alive`` starts them.
        """
        obj = obj or self
        # fix: `a + name or b` parsed as `(a + name) or b`, raising TypeError
        # whenever ``name`` is None (the common case).
        name = obj.__class__.__name__ + (name or func.__name__)
        for i in range(num):
            func_name = name + "_" + str(i) if num > 1 else name
            self._threads.append(threading.Thread(name=func_name, target=func, args=(obj,) + args))

    def _init_schedule_thread(self):
        """Hook for subclasses to register their scheduling threads."""
        ...

    def _polling(self):
        """Report queue state every 10s; pause when everything is drained,
        resume when TODO refills, stop for non-resident tasks once idle."""
        check_empty_times = 0
        while not self.stop.is_set():
            if TaskQueue.is_empty():
                if self.pause.is_set():
                    run_time = int(time.time()) - self.app_time
                    if not self.task_model and run_time > self.before_scheduler_wait_seconds:
                        logger.info("Done! ready to close thread...")
                        self.stop.set()
                    else:
                        logger.info("pause! waiting for resume...")
                elif check_empty_times > 2:
                    # Three consecutive empty checks: consider the task idle.
                    logger.info("pause! waiting for resume...")
                    self.doing_seeds = {}
                    self.pause.set()
                else:
                    logger.info(
                        "check whether the task is complete, "
                        f"reset times {3 - check_empty_times}"
                    )
                    check_empty_times += 1
            elif TaskQueue.TODO.length:
                logger.info(f"Recovery {self.task} task run!")
                check_empty_times = 0
                self.pause.clear()
            else:
                logger.info(LogTemplate.launcher_polling.format(
                    task=self.task,
                    doing_len=len(self.doing_seeds.keys()),
                    todo_len=TaskQueue.TODO.length,
                    done_len=TaskQueue.DONE.length,
                    upload_len=TaskQueue.UPLOAD.length,
                ))

            time.sleep(10)

        logger.info("Done! Ready to close thread...")

    def run(self):
        """Load crawler/pipeline classes, register worker threads, supervise."""
        Crawler = dynamic_load_class(self.crawler_path)
        Pipeline = dynamic_load_class(self.pipeline_path)

        crawler = Crawler(
            stop=self.stop, pause=self.pause,
            thread_num=self.spider_thread_num,
            time_sleep=self.spider_time_sleep,
            custom_func=self.custom_func
        )

        pipeline = Pipeline(
            stop=self.stop, pause=self.pause,
            # fix: `upload_queue_max_size` was never defined in __init__;
            # the configured attribute is `upload_queue_size`.
            upload_size=self.upload_queue_size,
            wait_seconds=self.upload_queue_wait_seconds
        )

        self._add_thread(obj=crawler, func=crawler.build_request_item)
        self._add_thread(obj=crawler, func=crawler.build_download_item, num=self.spider_thread_num)
        self._add_thread(obj=crawler, func=crawler.build_parse_item)

        # NOTE(review): the original created the pipeline Thread but nothing in
        # view ever started it; register it so check_alive starts it — confirm
        # against the Pipeline implementation.
        self._threads.append(pipeline)

        self._init_schedule_thread()
        self.check_alive()
|
235
|
+
|
236
|
+
|
237
|
+
|
@@ -0,0 +1,88 @@
|
|
1
|
+
import time
|
2
|
+
|
3
|
+
from cobweb.base import logger
|
4
|
+
from cobweb.constant import LogTemplate
|
5
|
+
from .launcher import Launcher, check_pause
|
6
|
+
|
7
|
+
|
8
|
+
class LauncherAir(Launcher):
    """Standalone launcher: seeds live only in the in-process queues
    (``__LAUNCHER_QUEUE__``), no external store behind them."""

    # def _scheduler(self):
    #     if self.start_seeds:
    #         self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)

    @check_pause
    def _insert(self):
        """Move freshly produced seeds from the 'new' queue into 'todo'."""
        batch = {}
        below_limit = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
        for _ in range(self._new_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['new'].pop()
            if not seed:
                break
            batch[seed.to_string] = seed.params.priority
        if batch:
            self.__LAUNCHER_QUEUE__['todo'].push(batch)
        if below_limit:
            time.sleep(self._new_queue_wait_seconds)

    @check_pause
    def _delete(self):
        """Drain the 'done' queue and drop those seeds from the doing registry."""
        finished = []
        below_limit = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size

        for _ in range(self._done_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['done'].pop()
            if not seed:
                break
            finished.append(seed.to_string)

        if finished:
            self._remove_doing_seeds(finished)

        if below_limit:
            time.sleep(self._done_queue_wait_seconds)

    def _polling(self):
        """Supervise the queues: pause when idle, stop when the task is done
        (non-resident mode), and log progress otherwise."""
        empty_rounds = 0

        while not self._stop.is_set():

            busy_queues = sum(
                1 for q in self.__LAUNCHER_QUEUE__.values() if q.length != 0
            )
            wait_seconds = 30

            if busy_queues == 0:
                wait_seconds = 3
                if self._pause.is_set():
                    empty_rounds = 0
                    if not self._task_model:
                        logger.info("Done! Ready to close thread...")
                        self._stop.set()
                elif empty_rounds > 2:
                    self.__DOING__ = {}
                    self._pause.set()
                else:
                    logger.info(
                        "check whether the task is complete, "
                        f"reset times {3 - empty_rounds}"
                    )
                    empty_rounds += 1
            elif self._pause.is_set():
                self._pause.clear()
                self._execute()
            else:
                logger.info(LogTemplate.launcher_air_polling.format(
                    task=self.task,
                    doing_len=len(self.__DOING__.keys()),
                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
                ))

            time.sleep(wait_seconds)
|
87
|
+
|
88
|
+
|
@@ -0,0 +1,161 @@
|
|
1
|
+
import time
import threading

from cobweb.db import ApiDB
# fix: normalized spacing (was `TaskQueue,logger`)
from cobweb.base import Seed, TaskQueue, logger, stop, pause
from cobweb.constant import DealModel

from .launcher import Launcher
|
8
|
+
|
9
|
+
|
10
|
+
class LauncherApi(Launcher):
    """Launcher backed by the HTTP API database (``ApiDB``): seeds are held in
    a remote sorted set, with heartbeat, speed control and statistics keys."""

    def __init__(self, task, project, custom_setting=None, **kwargs):
        super().__init__(task, project, custom_setting, **kwargs)
        self._db = ApiDB()

        # Redis-style key layout; "{project:task}" hash-tags keep the keys
        # in one cluster slot.
        self._todo_key = "{%s:%s}:todo" % (project, task)
        self._done_key = "{%s:%s}:done" % (project, task)
        self._fail_key = "{%s:%s}:fail" % (project, task)
        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)

        self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
        self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
        self._speed_control_key = "speed_control:%s_%s" % (project, task)

        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)

        # Set once another worker's heartbeat disappears; gates _heartbeat.
        self._heartbeat_start_event = threading.Event()

    @property
    def heartbeat(self):
        """True while any worker keeps the heartbeat key alive."""
        return self._db.exists(self._heartbeat_key)

    def statistics(self, key, count):
        """Accumulate a counter; the first write for a non-resident task gets
        a 30-day TTL so stale counters expire."""
        if not self.task_model and not self._db.exists(key):
            self._db.setex(key, 86400 * 30, int(count))
        else:
            self._db.incrby(key, count)

    def _get_seed(self) -> Seed:
        """
        Pop a seed from the TODO queue, rate limited.

        Uses a counter with a ``self.time_window``-second TTL; once it exceeds
        ``self.spider_max_count`` within the window, back off for half the
        remaining TTL.

        :return: a Seed, or None when throttled / queue empty.
        """
        if TaskQueue.TODO.length and not self._db.auto_incr(
            self._speed_control_key,
            t=self.time_window,
            limit=self.spider_max_count
        ):
            expire_time = self._db.ttl(self._speed_control_key)
            logger.info(f"Too fast! Please wait {expire_time} seconds...")
            time.sleep(expire_time / 2)
            return None
        return TaskQueue.TODO.pop()

    @stop
    def _reset(self):
        """Re-queue expired in-flight seeds (negative scores encode the claim
        time); only one worker does this at a time via a 120s lock."""
        if self._db.lock(self._reset_lock_key, t=120):

            # With a live heartbeat only reclaim seeds older than
            # seed_reset_seconds; without one, reclaim everything pending.
            _min = -int(time.time()) + self.seed_reset_seconds \
                if self.heartbeat else "-inf"

            self._db.members(self._todo_key, 0, _min=_min, _max="(0")

            if not self.heartbeat:
                self._heartbeat_start_event.set()

            self._db.delete(self._reset_lock_key)

        time.sleep(30)

    @stop
    def _refresh(self):
        """Refresh the claim timestamp of in-flight seeds so _reset does not
        hand them to another worker."""
        if self.doing_seeds:
            refresh_time = int(time.time())
            # score = -(claim time) - priority/1000: recency plus priority.
            seeds = {k: -refresh_time - v / 1e3 for k, v in self.doing_seeds.items()}
            self._db.zadd(self._todo_key, item=seeds, xx=True)
        time.sleep(3)

    @stop
    def _scheduler(self):
        """Claim up to todo_queue_size pending seeds (score in [0, 1000)) from
        the remote queue and track them in ``doing_seeds``."""
        if not self._db.zcount(self._todo_key, 0, "(1000"):
            # NOTE(review): `scheduler_wait_seconds` is not defined by the
            # visible base __init__ (it defines `scheduling_wait_time`) —
            # confirm the intended attribute.
            time.sleep(self.scheduler_wait_seconds)
        elif TaskQueue.TODO.length >= self.todo_queue_size:
            time.sleep(self.todo_queue_full_wait_seconds)
        else:
            members = self._db.members(
                self._todo_key, int(time.time()),
                count=self.todo_queue_size,
                _min=0, _max="(1000"
            )
            for member, priority in members:
                seed = Seed(member, priority=priority)
                TaskQueue.TODO.push(seed)
                self.doing_seeds[seed.to_string] = seed.params.priority

    @pause
    def _heartbeat(self):
        """Keep the heartbeat key alive (5s TTL, refreshed every 3s)."""
        if self._heartbeat_start_event.is_set():
            self._db.setex(self._heartbeat_key, t=5)
        time.sleep(3)

    @pause
    def _insert(self):
        """Push newly produced seeds into the remote TODO sorted set."""
        # NOTE(review): `new_queue_max_size` is not set by the visible base
        # __init__ — confirm where it comes from.
        seeds = {}
        for _ in range(self.new_queue_max_size):
            if seed := TaskQueue.SEED.pop():
                seeds[seed.to_string] = seed.params.priority
            else:
                # fix: stop when the queue is drained instead of spinning
                # through the whole range popping None.
                break
        if seeds:
            self._db.zadd(self._todo_key, seeds, nx=True)
        if TaskQueue.SEED.length < self.new_queue_max_size:
            time.sleep(self.new_queue_wait_seconds)

    @pause
    def _delete(self):
        """Finish seeds from the DONE queue: move them to the done/fail sets
        (or just remove them), update statistics, and clear the doing index."""
        seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
        # NOTE(review): `done_queue_max_size` is not set by the visible base
        # __init__ — confirm the intended attribute.
        status = TaskQueue.DONE.length < self.done_queue_max_size

        for _ in range(self.done_queue_max_size):
            seed = TaskQueue.DONE.pop()
            if not seed:
                break
            if seed.params.seed_status == DealModel.fail:
                seed_info["failed"].append(seed.to_string)
            elif self.done_model == 1:
                # done_model 1: archive successes into the done set.
                seed_info["succeed"].append(seed.to_string)
            else:
                seed_info["common"].append(seed.to_string)
            seed_info['count'] += 1

        if seed_info["count"]:

            succeed_count = int(self._db.zrem(self._todo_key, *seed_info["common"]) or 0)
            succeed_count += int(self._db.done([self._todo_key, self._done_key], *seed_info["succeed"]) or 0)
            failed_count = int(self._db.done([self._todo_key, self._fail_key], *seed_info["failed"]) or 0)

            if failed_count:
                self.statistics(self._statistics_fail_key, failed_count)
            if succeed_count:
                self.statistics(self._statistics_done_key, succeed_count)

            # NOTE(review): `_remove_doing_seeds` is not defined by the
            # visible base (which has `remove_working_items`) — confirm.
            self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])

        if status:
            time.sleep(self.done_queue_wait_seconds)
|
161
|
+
|
@@ -0,0 +1,96 @@
|
|
1
|
+
import time

# fix: `from base import ...` / `from schedulers... import ...` were missing
# the `cobweb.` package prefix (both modules live under cobweb/ per the
# package layout).
from cobweb.base import TaskQueue
from cobweb.base import decorators
from cobweb.schedulers.scheduler_redis import RedisScheduler

from .launcher import Launcher
|
7
|
+
|
8
|
+
|
9
|
+
class LauncherPro(Launcher):
    """Redis-backed launcher: moves seeds/requests between the local
    TaskQueues and redis sorted sets through a ``RedisScheduler``."""

    def __init__(self, task, project, custom_setting=None, **kwargs):
        super().__init__(task, project, custom_setting, **kwargs)
        self._redis_download = "{%s:%s}:download" % (project, task)
        self._redis_todo = "{%s:%s}:todo" % (project, task)
        self._scheduler = RedisScheduler(task, project)

    @decorators.stop
    def _schedule(self):
        """Pull items from redis into the local TODO/DOWNLOAD queues while
        they have room; spin fast whenever anything was fetched."""
        thread_sleep = self.scheduling_wait_time
        for q, key, size in [
            (TaskQueue.TODO, self._redis_todo, self.todo_queue_size),
            (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size),
        ]:
            if q.length < size:
                for item in self._scheduler.schedule(
                    key, self.scheduling_size
                ):
                    q.push(item)
                    thread_sleep = 0.1
        time.sleep(thread_sleep)

    @decorators.pause
    def _heartbeat(self):
        """Refresh this worker's heartbeat every 3s while working."""
        if self._scheduler.working.is_set():
            self._scheduler.set_heartbeat()
        time.sleep(3)

    @decorators.pause
    def _reset(self):
        """Reclaim expired in-flight seeds back into the redis queues."""
        self._scheduler.reset(
            keys=[self._redis_todo, self._redis_download],
            reset_time=self.seed_reset_seconds
        )
        time.sleep(15)

    @decorators.pause
    def _insert(self):
        """Flush locally produced seeds/requests into redis in batches."""
        thread_sleep = 0.1
        for q, key, size in [
            (TaskQueue.SEED, self._redis_todo, self.seed_queue_size),
            (TaskQueue.REQUEST, self._redis_download, self.request_queue_size),
        ]:
            items = {}
            # fix: the original `while item := q.pop() and cond:` bound `item`
            # to the boolean of `q.pop() and cond` (PEP 572: `:=` binds the
            # whole right-hand expression), so `item.to_string` blew up on
            # `True`.  Parenthesize the walrus and test the size first.
            while len(items) < self.inserting_size and (item := q.pop()):
                items[item.to_string] = item.params.priority
            if q.length >= size:
                thread_sleep = self.inserting_wait_time
            if items:  # fix: skip the redis round-trip for empty batches
                self._scheduler.insert(key, items)
        time.sleep(thread_sleep)

    @decorators.pause
    def _refresh(self):
        """Refresh claim timestamps of in-flight items so _reset keeps off."""
        self._scheduler.refresh(self._redis_todo, self._task_info["todo"])
        self._scheduler.refresh(self._redis_download, self._task_info["download"])
        time.sleep(3)

    @decorators.pause
    def _remove(self):
        """Delete finished items from redis and the in-flight registry."""
        thread_sleep = self.removing_wait_time
        for q, key, size in [
            (TaskQueue.DELETE, self._redis_todo, self.delete_queue_size),
            (TaskQueue.DONE, self._redis_download, self.done_queue_size),
        ]:
            items = []
            # fix: same walrus-precedence bug as in _insert (items collected
            # `True` values instead of popped items).
            while len(items) < self.removing_size and (item := q.pop()):
                items.append(item)
            if items:  # fix: avoid delete/remove calls with empty batches
                self._scheduler.delete(key, *items)
                # key is "{project:task}:todo" / ":download" — the suffix
                # matches the _task_info registry keys.
                self.remove_working_items(key.split(":")[-1], items)
            if q.length >= size:
                thread_sleep = 0.1
        time.sleep(thread_sleep)

    def _init_schedule_thread(self):
        """Register every scheduling thread; ``check_alive`` starts them."""
        self._add_thread(func=self._heartbeat)
        self._add_thread(func=self._reset)
        self._add_thread(func=self._refresh)
        self._add_thread(func=self._schedule)
        self._add_thread(func=self._insert)
        self._add_thread(func=self._remove)
        self._add_thread(func=self._polling)
|
@@ -0,0 +1,47 @@
|
|
1
|
+
import threading
import time
from functools import wraps


def add_thread(num=1):
    """Decorator factory: calling the decorated method appends ``num``
    *unstarted* Thread objects (targeting the wrapped function, with the
    instance as first arg) to ``self._threads``.  Callers start them."""
    def decorator(func):
        @wraps(func)
        def wrapper(self, *args):
            for i in range(num):
                name = func.__name__ + "_" + str(i) if num > 1 else func.__name__
                self._threads.append(threading.Thread(name=name, target=func, args=(self,) + args))
        return wrapper

    return decorator


def pause(func):
    """Run ``func`` forever, printing (and swallowing) any exception and
    sleeping 0.1s between iterations.  NOTE: never returns by design."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        while True:
            try:
                func(*args, **kwargs)
            except Exception as e:
                print(str(e))
            finally:
                time.sleep(0.1)

    return wrapper


class TTT:
    # class-level registry shared by @add_thread
    _threads = []

    @add_thread()
    @pause
    def tt(self):
        print("hello")
        time.sleep(1)


if __name__ == "__main__":
    # fix: the demo ran at module level, so merely importing this file
    # started a non-daemon thread stuck in pause()'s infinite loop.
    tttt = TTT()
    tttt.tt()
    print(TTT._threads)

    for _ in TTT._threads:
        _.start()
|
@@ -0,0 +1,68 @@
|
|
1
|
+
import time
|
2
|
+
import threading
|
3
|
+
|
4
|
+
from abc import ABC, abstractmethod
|
5
|
+
|
6
|
+
from cobweb.utils import TaskQueue
|
7
|
+
from cobweb.base import BaseItem, logger
|
8
|
+
|
9
|
+
|
10
|
+
class Pipeline(threading.Thread, ABC):
    """Upload worker thread.

    Batches items off ``TaskQueue.UPLOAD``, turns each into a record with
    ``build``, groups the records by table and ships them with ``upload``.
    If any part of a round fails, every seed in that round is marked failed;
    either way the seeds are handed to ``TaskQueue.DONE``.
    """

    def __init__(
        self,
        stop: threading.Event,
        pause: threading.Event,
        upload_size: int,
        wait_seconds: int
    ):
        super().__init__()
        self._stop = stop
        self._pause = pause

        self.upload_size = upload_size    # target batch size per round
        self.wait_seconds = wait_seconds  # idle sleep between rounds

    @abstractmethod
    def build(self, item: BaseItem) -> dict:
        """Convert one item into an uploadable record."""

    @abstractmethod
    def upload(self, table: str, data: list) -> bool:
        """Persist a list of records into ``table``."""

    def run(self):
        while not self._stop.is_set():
            if not TaskQueue.UPLOAD.length:
                time.sleep(self.wait_seconds)
                continue
            if TaskQueue.UPLOAD.length < self.upload_size:
                # Below a full batch: give producers a moment to catch up.
                time.sleep(self.wait_seconds)

            ok = True
            tables, seeds = {}, []
            try:
                for _ in range(self.upload_size):
                    item = TaskQueue.UPLOAD.pop()
                    if not item:
                        break
                    seeds.append(item.seed)
                    record = self.build(item)
                    tables.setdefault(item.table, []).append(record)
                for table, rows in tables.items():
                    try:
                        self.upload(table, rows)
                    except Exception as e:
                        logger.info(e)
                        ok = False
            except Exception as e:
                logger.info(e)
                ok = False

            if not ok:
                for seed in seeds:
                    seed.params.seed_status = "deal model: fail"
            if seeds:
                TaskQueue.DONE.push(seeds)

        logger.info("upload pipeline close!")
|
67
|
+
|
68
|
+
|