cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.42__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- cobweb/__init__.py +2 -11
- cobweb/base/__init__.py +9 -0
- cobweb/base/basic.py +297 -0
- cobweb/base/common_queue.py +30 -0
- cobweb/base/decorators.py +40 -0
- cobweb/base/dotting.py +35 -0
- cobweb/base/item.py +46 -0
- cobweb/{log.py → base/log.py} +4 -6
- cobweb/base/request.py +82 -0
- cobweb/base/response.py +23 -0
- cobweb/base/seed.py +114 -0
- cobweb/constant.py +94 -0
- cobweb/crawlers/__init__.py +1 -0
- cobweb/crawlers/base_crawler.py +144 -0
- cobweb/crawlers/crawler.py +212 -0
- cobweb/crawlers/file_crawler.py +98 -0
- cobweb/db/__init__.py +2 -2
- cobweb/db/api_db.py +82 -0
- cobweb/db/redis_db.py +125 -218
- cobweb/exceptions/__init__.py +1 -0
- cobweb/exceptions/oss_db_exception.py +28 -0
- cobweb/launchers/__init__.py +3 -0
- cobweb/launchers/launcher.py +235 -0
- cobweb/launchers/launcher_air.py +88 -0
- cobweb/launchers/launcher_api.py +209 -0
- cobweb/launchers/launcher_pro.py +208 -0
- cobweb/pipelines/__init__.py +3 -0
- cobweb/pipelines/pipeline.py +69 -0
- cobweb/pipelines/pipeline_console.py +22 -0
- cobweb/pipelines/pipeline_loghub.py +34 -0
- cobweb/schedulers/__init__.py +3 -0
- cobweb/schedulers/scheduler_api.py +72 -0
- cobweb/schedulers/scheduler_redis.py +72 -0
- cobweb/setting.py +67 -6
- cobweb/utils/__init__.py +5 -0
- cobweb/utils/bloom.py +58 -0
- cobweb/utils/dotting.py +32 -0
- cobweb/utils/oss.py +94 -0
- cobweb/utils/tools.py +42 -0
- cobweb_launcher-1.2.42.dist-info/METADATA +205 -0
- cobweb_launcher-1.2.42.dist-info/RECORD +44 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/WHEEL +1 -1
- cobweb/bbb.py +0 -191
- cobweb/db/oss_db.py +0 -127
- cobweb/db/scheduler/__init__.py +0 -0
- cobweb/db/scheduler/default.py +0 -8
- cobweb/db/scheduler/textfile.py +0 -27
- cobweb/db/storer/__init__.py +0 -0
- cobweb/db/storer/console.py +0 -9
- cobweb/db/storer/loghub.py +0 -54
- cobweb/db/storer/redis.py +0 -15
- cobweb/db/storer/textfile.py +0 -15
- cobweb/decorators.py +0 -16
- cobweb/distributed/__init__.py +0 -0
- cobweb/distributed/launcher.py +0 -243
- cobweb/distributed/models.py +0 -143
- cobweb/interface.py +0 -34
- cobweb/single/__init__.py +0 -0
- cobweb/single/launcher.py +0 -231
- cobweb/single/models.py +0 -134
- cobweb/single/nest.py +0 -153
- cobweb/task.py +0 -50
- cobweb/utils.py +0 -90
- cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
- cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,235 @@
|
|
1
|
+
import time
|
2
|
+
import inspect
|
3
|
+
import threading
|
4
|
+
import importlib
|
5
|
+
from functools import wraps
|
6
|
+
|
7
|
+
|
8
|
+
from cobweb import setting
|
9
|
+
from cobweb.base import Seed, Queue, logger
|
10
|
+
from cobweb.utils.tools import dynamic_load_class
|
11
|
+
|
12
|
+
|
13
|
+
def check_pause(func):
    """Decorator: repeatedly invoke *func* until the launcher's pause event is set.

    Any exception raised by *func* is logged and swallowed so the worker
    thread keeps running; a short sleep after every iteration yields the CPU.
    """
    @wraps(func)
    def looped(self, *args, **kwargs):
        # Keep cycling as long as the launcher has not been paused.
        while not self._pause.is_set():
            try:
                func(self, *args, **kwargs)
            except Exception as e:
                # Best-effort worker: log and continue rather than die.
                logger.info(f"{func.__name__}: {e}")
            finally:
                time.sleep(0.1)

    return looped
|
25
|
+
|
26
|
+
|
27
|
+
class Launcher(threading.Thread):
    """Base launcher thread: wires a Crawler and a Pipeline together around
    four in-process queues and exposes hook points (request/download/parse)
    for user-supplied functions.

    Subclasses (air/api/pro variants) implement the scheduling methods
    (_reset/_scheduler/_insert/_refresh/_delete/_polling), which are all
    no-ops here.
    """

    # Initial seeds to enqueue at startup (subclass/user populated).
    # NOTE(review): these are CLASS-level mutable attributes, shared by all
    # Launcher instances in a process — confirm single-instance usage.
    SEEDS = []

    # seed.to_string -> priority, for seeds currently being processed.
    __DOING__ = {}

    # Registry of user-overridable crawl hooks, filled by the
    # request/download/parse decorator properties below.
    __CUSTOM_FUNC__ = {
        # "download": None,
        # "request": None,
        # "parse": None,
    }

    # In-process hand-off queues between scheduler, crawler and pipeline.
    __LAUNCHER_QUEUE__ = {
        "new": Queue(),
        "todo": Queue(),
        "done": Queue(),
        "upload": Queue()
    }

    # Names of the scheduling worker methods started by _execute().
    __LAUNCHER_FUNC__ = [
        "_reset",
        "_scheduler",
        "_insert",
        "_refresh",
        "_delete",
    ]

    def __init__(self, task, project, custom_setting=None, **kwargs):
        """Build a launcher for *task* under *project*.

        custom_setting may be a dict, a module, or a dotted module path
        (str); its upper-cased keys are written onto the global `setting`
        module, then individual values are cached on the instance.
        Extra **kwargs override custom_setting entries.
        """
        super().__init__()
        self.task = task
        self.project = project

        self._app_time = int(time.time())
        self._stop = threading.Event()   # stop (shutdown) event
        self._pause = threading.Event()  # pause event

        _setting = dict()

        if custom_setting:
            if isinstance(custom_setting, dict):
                _setting = custom_setting
            else:
                if isinstance(custom_setting, str):
                    # Dotted path: import it to get a settings module.
                    custom_setting = importlib.import_module(custom_setting)
                if not inspect.ismodule(custom_setting):
                    # Unsupported settings type.
                    raise Exception
                for k, v in custom_setting.__dict__.items():
                    # Skip dunders and nested module objects.
                    if not k.startswith("__") and not inspect.ismodule(v):
                        _setting[k] = v

        # Keyword overrides win over custom_setting values.
        _setting.update(**kwargs)

        # Mutate the shared `setting` module so the rest of the framework
        # (crawler, pipeline, db helpers) sees the merged configuration.
        for k, v in _setting.items():
            setattr(setting, k.upper(), v)

        # Crawler/Pipeline classes are resolved from dotted paths in setting.
        self._Crawler = dynamic_load_class(setting.CRAWLER)
        self._Pipeline = dynamic_load_class(setting.PIPELINE)

        # Wait intervals (seconds) for the various worker loops.
        self._before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS
        self._scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
        self._todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
        self._new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
        self._done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
        self._upload_queue_wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
        self._seed_reset_seconds = setting.SEED_RESET_SECONDS

        # Queue capacity limits.
        self._todo_queue_size = setting.TODO_QUEUE_SIZE
        self._new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
        self._done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
        self._upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE

        # Spider behaviour knobs.
        self._spider_max_retries = setting.SPIDER_MAX_RETRIES
        self._spider_thread_num = setting.SPIDER_THREAD_NUM
        self._spider_time_sleep = setting.SPIDER_TIME_SLEEP
        self._spider_max_count = setting.SPIDER_MAX_COUNT
        self._time_window = setting.TIME_WINDOW

        # Run-model flags (exact semantics defined in cobweb.setting —
        # presumably "stay alive as a resident task" vs "run once"; confirm).
        self._done_model = setting.DONE_MODEL
        self._task_model = setting.TASK_MODEL

        self._filter_field = setting.FILTER_FIELD

    @property
    def request(self):
        """Register a custom request function.

        use case:
            from cobweb.base import Request, BaseItem
            @launcher.request
            def request(seed: Seed) -> Union[Request, BaseItem]:
                ...
                yield Request(seed.url, seed)
        """
        def decorator(func):
            # NOTE(review): does not return func — the decorated name
            # becomes None at the call site; registry-only usage assumed.
            self.__CUSTOM_FUNC__["request"] = func
        return decorator

    @property
    def download(self):
        """Register a custom download function.

        use case:
            from cobweb.base import Request, Response, Seed, BaseItem
            @launcher.download
            def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
                ...
                yield Response(item.seed, response)
        """
        def decorator(func):
            self.__CUSTOM_FUNC__["download"] = func
        return decorator

    @property
    def parse(self):
        """Register a custom parse function; xxxItem is the user-defined
        storage item type.

        use case:
            from cobweb.base import Request, Response
            @launcher.parse
            def parse(item: Response) -> BaseItem:
                ...
                yield xxxItem(seed, **kwargs)
        """
        def decorator(func):
            self.__CUSTOM_FUNC__["parse"] = func
        return decorator

    def start_seeds(self):
        # Wrap the raw SEEDS entries and enqueue them for crawling.
        seeds = [Seed(seed) for seed in self.SEEDS]
        self.__LAUNCHER_QUEUE__['todo'].push(seeds)
        return seeds

    def _remove_doing_seeds(self, seeds):
        # Drop finished seeds from the in-flight index (missing keys ignored).
        for seed in seeds:
            self.__DOING__.pop(seed, None)
        # logger.info("remove %s seeds from __DOING__" % len(seeds))

    def _get_seed(self) -> Seed:
        # Next seed for the crawler (None semantics depend on Queue.pop).
        return self.__LAUNCHER_QUEUE__["todo"].pop()

    def _set_seed(self, seed, **kwargs):
        # Return a seed to the todo queue (e.g. for retry).
        self.__LAUNCHER_QUEUE__["todo"].push(seed, **kwargs)

    def _upload_data(self, data, **kwargs):
        # Hand parsed data to the pipeline via the upload queue.
        self.__LAUNCHER_QUEUE__["upload"].push(data, **kwargs)

    def _add_seed(self, seed, **kwargs):
        # Newly discovered seed: goes through the "new" queue first.
        self.__LAUNCHER_QUEUE__["new"].push(seed, **kwargs)

    def _delete_seed(self, seed, **kwargs):
        # Mark a seed as completed by pushing it onto the done queue.
        self.__LAUNCHER_QUEUE__["done"].push(seed, **kwargs)

    def _execute(self):
        # Start one daemon-style worker thread per scheduling method,
        # staggered by one second.
        for func_name in self.__LAUNCHER_FUNC__:
            threading.Thread(name=func_name, target=getattr(self, func_name)).start()
            time.sleep(1)

    def run(self):
        """Thread entry point: start heartbeat, seed the queues, start the
        crawler and pipeline threads, then the scheduling workers, and
        finally block in the polling loop."""
        threading.Thread(target=self._execute_heartbeat).start()

        self.start_seeds()

        # The crawler receives callables (not the queues themselves) so it
        # stays decoupled from the queue layout.
        self._Crawler(
            task=self.task, project=self.project,
            stop=self._stop, pause=self._pause,
            # launcher_queue=self.__LAUNCHER_QUEUE__,
            get_seed=self._get_seed,
            set_seed=self._set_seed,
            add_seed=self._add_seed,
            delete_seed=self._delete_seed,
            upload_data=self._upload_data,
            custom_func=self.__CUSTOM_FUNC__,
            thread_num=self._spider_thread_num,
            max_retries=self._spider_max_retries,
            time_sleep=self._spider_time_sleep
        ).start()

        self._Pipeline(
            stop=self._stop, pause=self._pause,
            upload=self.__LAUNCHER_QUEUE__["upload"],
            done=self.__LAUNCHER_QUEUE__["done"],
            upload_size=self._upload_queue_max_size,
            wait_seconds=self._upload_queue_wait_seconds
        ).start()

        self._execute()
        self._polling()

    # The methods below are scheduling hooks; concrete launchers override
    # them. They are intentionally no-ops in the base class.

    def _execute_heartbeat(self):
        pass

    def _reset(self):
        pass

    def _scheduler(self):
        pass

    def _insert(self):
        pass

    def _refresh(self):
        pass

    def _delete(self):
        pass

    def _polling(self):
        pass
|
235
|
+
|
@@ -0,0 +1,88 @@
|
|
1
|
+
import time
|
2
|
+
|
3
|
+
from cobweb.base import logger
|
4
|
+
from cobweb.constant import LogTemplate
|
5
|
+
from .launcher import Launcher, check_pause
|
6
|
+
|
7
|
+
|
8
|
+
class LauncherAir(Launcher):
    """Standalone (no external storage) launcher: seeds move only between
    the in-process queues; completion is detected by all queues draining."""

    # def _scheduler(self):
    #     if self.start_seeds:
    #         self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)

    @check_pause
    def _insert(self):
        """Drain the 'new' queue and feed discovered seeds back to 'todo'."""
        seeds = {}
        # Queue was below capacity before draining — safe to sleep after.
        status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
        for _ in range(self._new_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['new'].pop()
            if not seed:
                break
            seeds[seed.to_string] = seed.params.priority
        if seeds:
            # NOTE(review): pushes a {seed_string: priority} dict, while
            # elsewhere 'todo' holds Seed objects (see start_seeds /
            # LauncherApi._scheduler). Looks copy-pasted from the API
            # variant's redis zadd payload — confirm Queue.push semantics.
            self.__LAUNCHER_QUEUE__['todo'].push(seeds)
        if status:
            time.sleep(self._new_queue_wait_seconds)

    @check_pause
    def _delete(self):
        """Drain the 'done' queue and drop those seeds from __DOING__."""
        seeds = []
        status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size

        for _ in range(self._done_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['done'].pop()
            if not seed:
                break
            seeds.append(seed.to_string)

        if seeds:
            self._remove_doing_seeds(seeds)

        if status:
            time.sleep(self._done_queue_wait_seconds)

    def _polling(self):
        """Main watchdog loop: detect an idle system (all queues empty three
        checks in a row), pause the workers, and — unless running as a
        resident task — stop the launcher."""

        # NOTE(review): "emtpy" is a typo kept as-is (local variable only).
        check_emtpy_times = 0

        while not self._stop.is_set():

            queue_not_empty_count = 0
            pooling_wait_seconds = 30

            for q in self.__LAUNCHER_QUEUE__.values():
                if q.length != 0:
                    queue_not_empty_count += 1

            if queue_not_empty_count == 0:
                # Everything drained: poll faster while deciding what to do.
                pooling_wait_seconds = 3
                if self._pause.is_set():
                    check_emtpy_times = 0
                    if not self._task_model:
                        # One-shot task: work is done, shut down.
                        logger.info("Done! Ready to close thread...")
                        self._stop.set()
                elif check_emtpy_times > 2:
                    # Empty on 3 consecutive checks: pause the workers.
                    self.__DOING__ = {}
                    self._pause.set()
                else:
                    logger.info(
                        "check whether the task is complete, "
                        f"reset times {3 - check_emtpy_times}"
                    )
                    check_emtpy_times += 1
            elif self._pause.is_set():
                # Work reappeared while paused: resume the worker threads.
                self._pause.clear()
                self._execute()
            else:
                # Normal operation: log queue depths.
                logger.info(LogTemplate.launcher_air_polling.format(
                    task=self.task,
                    doing_len=len(self.__DOING__.keys()),
                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
                ))

            time.sleep(pooling_wait_seconds)
|
87
|
+
|
88
|
+
|
@@ -0,0 +1,209 @@
|
|
1
|
+
import time
|
2
|
+
import threading
|
3
|
+
|
4
|
+
from cobweb.db import ApiDB
|
5
|
+
from cobweb.base import Seed, logger
|
6
|
+
from cobweb.constant import DealModel, LogTemplate
|
7
|
+
from .launcher import Launcher, check_pause
|
8
|
+
|
9
|
+
|
10
|
+
class LauncherApi(Launcher):
    """Launcher backed by an HTTP API over a redis-like store (ApiDB).

    Seeds live in a sorted set keyed by score: [0, 1000) = ready to crawl,
    negative scores = in-flight (score encodes -claim_time - priority/1000).
    Distributed workers coordinate via a heartbeat key and a reset lock.
    """

    def __init__(self, task, project, custom_setting=None, **kwargs):
        super().__init__(task, project, custom_setting, **kwargs)
        self._db = ApiDB()

        # Hash-tagged keys ({...}) keep task keys in one redis-cluster slot.
        self._todo_key = "{%s:%s}:todo" % (project, task)
        self._done_key = "{%s:%s}:done" % (project, task)
        self._fail_key = "{%s:%s}:fail" % (project, task)
        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)

        self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
        self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
        self._speed_control_key = "speed_control:%s_%s" % (project, task)

        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)

        # self._bf_key = "bloom_%s_%s" % (project, task)
        # self._bf = BloomFilter(self._bf_key)

        # Set once this worker wins the first reset and owns the heartbeat.
        self._heartbeat_start_event = threading.Event()
        self._redis_queue_empty_event = threading.Event()

    @property
    def heartbeat(self):
        # True while any worker's heartbeat key is alive.
        return self._db.exists(self._heartbeat_key)

    def statistics(self, key, count):
        """Accumulate a done/fail counter; one-shot tasks get a 30-day TTL
        on first write, otherwise plain increment."""
        if not self._task_model and not self._db.exists(key):
            self._db.setex(key, 86400 * 30, int(count))
        else:
            self._db.incrby(key, count)

    def _get_seed(self) -> Seed:
        """
        Fetch a seed from the local todo queue, with rate limiting.

        A counter in a time window of self._time_window seconds is checked
        against the threshold (self._spider_max_count); when exceeded,
        sleep out half the window's remaining TTL and return None.

        :return: a Seed when allowed, otherwise None
        """
        if (self.__LAUNCHER_QUEUE__["todo"].length and
                not self._db.auto_incr(self._speed_control_key, t=self._time_window, limit=self._spider_max_count)):
            expire_time = self._db.ttl(self._speed_control_key)
            logger.info(f"Too fast! Please wait {expire_time} seconds...")
            time.sleep(expire_time / 2)
            return None
        seed = self.__LAUNCHER_QUEUE__["todo"].pop()
        return seed

    @check_pause
    def _execute_heartbeat(self):
        # Refresh a 5s-TTL heartbeat every 3s while this worker owns it.
        if self._heartbeat_start_event.is_set():
            self._db.setex(self._heartbeat_key, 5)
        time.sleep(3)

    @check_pause
    def _reset(self):
        """
        Requeue expired in-flight seeds back into the ready range.
        """
        reset_wait_seconds = 30
        # Only one worker performs the reset at a time (120s lock TTL).
        if self._db.lock(self._reset_lock_key, t=120):

            # With a live heartbeat only reclaim seeds older than
            # _seed_reset_seconds; with no heartbeat reclaim everything.
            _min = -int(time.time()) + self._seed_reset_seconds \
                if self.heartbeat else "-inf"

            # members() with score 0 presumably rescores the matched
            # in-flight range back to "ready" — confirm ApiDB semantics.
            self._db.members(self._todo_key, 0, _min=_min, _max="(0")

            if not self.heartbeat:
                # First worker up: take over heartbeat publishing.
                self._heartbeat_start_event.set()

            self._db.delete(self._reset_lock_key)

        time.sleep(reset_wait_seconds)

    @check_pause
    def _scheduler(self):
        """
        Claim ready seeds from the remote sorted set into the local todo
        queue, registering them in the __DOING__ index.
        """
        if not self._db.zcount(self._todo_key, 0, "(1000"):
            # Nothing ready remotely.
            time.sleep(self._scheduler_wait_seconds)
        elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
            # Local queue full; back off.
            time.sleep(self._todo_queue_full_wait_seconds)
        else:
            # Claim up to _todo_queue_size seeds; passing the current time
            # presumably rescores them as in-flight — confirm ApiDB.members.
            members = self._db.members(
                self._todo_key, int(time.time()),
                count=self._todo_queue_size,
                _min=0, _max="(1000"
            )
            for member, priority in members:
                seed = Seed(member, priority=priority)
                self.__LAUNCHER_QUEUE__['todo'].push(seed)
                self.__DOING__[seed.to_string] = seed.params.priority

    @check_pause
    def _insert(self):
        """
        Push newly discovered seeds into the remote todo sorted set.
        """
        seeds = {}
        status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
        for _ in range(self._new_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['new'].pop()
            # NOTE(review): no `break` on empty pop (unlike _delete and the
            # air variant) — spins through the whole range; confirm intent.
            if seed:
                seeds[seed.to_string] = seed.params.priority
        if seeds:
            # nx=True: never overwrite an existing seed's score.
            self._db.zadd(self._todo_key, seeds, nx=True)
        if status:
            time.sleep(self._new_queue_wait_seconds)

    @check_pause
    def _refresh(self):
        """
        Refresh the claim timestamp of in-flight seeds so _reset on other
        workers does not reclaim them while we are still crawling.
        """
        if self.__DOING__:
            refresh_time = int(time.time())
            # Score = -now - priority/1000 (more recent claim = lower score).
            seeds = {k: -refresh_time - v / 1000 for k, v in self.__DOING__.items()}
            # xx=True: only touch seeds that still exist in the set.
            self._db.zadd(self._todo_key, item=seeds, xx=True)
        time.sleep(15)

    @check_pause
    def _delete(self):
        """
        Remove completed seeds from the remote set and the __DOING__ index.
        """
        # seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}

        seed_list = []
        status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size

        for _ in range(self._done_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['done'].pop()
            if not seed:
                break
            seed_list.append(seed.to_string)

        if seed_list:

            self._db.zrem(self._todo_key, *seed_list)
            self._remove_doing_seeds(seed_list)

        if status:
            time.sleep(self._done_queue_wait_seconds)

    def _polling(self):
        """Watchdog loop: pause workers when both local queues and the
        remote ready range drain, resume when remote work reappears, and
        stop entirely for one-shot tasks."""
        # Grace flag: don't declare "done" before the scheduler ever ran.
        wait_scheduler_execute = True
        # NOTE(review): "emtpy" is a typo kept as-is (local variable only).
        check_emtpy_times = 0
        while not self._stop.is_set():
            queue_not_empty_count = 0
            pooling_wait_seconds = 30

            for q in self.__LAUNCHER_QUEUE__.values():
                if q.length != 0:
                    queue_not_empty_count += 1
                    wait_scheduler_execute = False

            if queue_not_empty_count == 0:
                pooling_wait_seconds = 3
                if self._pause.is_set():
                    check_emtpy_times = 0
                    # Stop only if work ever started, or the startup grace
                    # period has elapsed.
                    if not self._task_model and (
                        not wait_scheduler_execute or
                        int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
                    ):
                        logger.info("Done! ready to close thread...")
                        self._stop.set()

                    elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
                        # Remote work reappeared: restart the workers.
                        logger.info(f"Recovery {self.task} task run!")
                        self._pause.clear()
                        self._execute()
                    else:
                        logger.info("pause! waiting for resume...")
                elif check_emtpy_times > 2:
                    # Locally idle three checks in a row; pause only when
                    # the remote set is empty too (incl. in-flight seeds).
                    self.__DOING__ = {}
                    if not self._db.zcount(self._todo_key, _min="-inf", _max="(1000"):
                        self._pause.set()
                else:
                    logger.info(
                        "check whether the task is complete, "
                        f"reset times {3 - check_emtpy_times}"
                    )
                    check_emtpy_times += 1
            else:
                # Normal operation: log local and remote queue depths.
                logger.info(LogTemplate.launcher_pro_polling.format(
                    task=self.task,
                    doing_len=len(self.__DOING__.keys()),
                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
                    redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
                    redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
                    redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
                ))

            time.sleep(pooling_wait_seconds)

        logger.info("Done! Ready to close thread...")
|
209
|
+
|