cobweb-launcher 1.3.15__py3-none-any.whl → 3.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/__init__.py +1 -1
- cobweb/base/__init__.py +4 -149
- cobweb/base/common_queue.py +0 -13
- cobweb/base/request.py +2 -14
- cobweb/base/seed.py +16 -12
- cobweb/constant.py +0 -16
- cobweb/crawlers/crawler.py +3 -85
- cobweb/db/redis_db.py +109 -52
- cobweb/launchers/__init__.py +8 -2
- cobweb/launchers/distributor.py +171 -0
- cobweb/launchers/launcher.py +87 -131
- cobweb/launchers/uploader.py +65 -0
- cobweb/pipelines/pipeline.py +3 -36
- cobweb/schedulers/__init__.py +1 -3
- cobweb/schedulers/launcher_air.py +93 -0
- cobweb/schedulers/launcher_api.py +225 -0
- cobweb/schedulers/scheduler.py +85 -0
- cobweb/schedulers/scheduler_with_redis.py +177 -0
- cobweb/setting.py +15 -32
- cobweb/utils/__init__.py +2 -1
- cobweb/utils/decorators.py +43 -0
- cobweb/utils/dotting.py +55 -0
- cobweb/utils/oss.py +28 -9
- {cobweb_launcher-1.3.15.dist-info → cobweb_launcher-3.1.1.dist-info}/METADATA +1 -1
- cobweb_launcher-3.1.1.dist-info/RECORD +41 -0
- cobweb/base/basic.py +0 -297
- cobweb/base/dotting.py +0 -35
- cobweb/launchers/launcher_air.py +0 -88
- cobweb/launchers/launcher_api.py +0 -89
- cobweb/launchers/launcher_pro.py +0 -88
- cobweb/schedulers/scheduler_api.py +0 -72
- cobweb/schedulers/scheduler_redis.py +0 -72
- cobweb_launcher-1.3.15.dist-info/RECORD +0 -40
- {cobweb_launcher-1.3.15.dist-info → cobweb_launcher-3.1.1.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.3.15.dist-info → cobweb_launcher-3.1.1.dist-info}/WHEEL +0 -0
- {cobweb_launcher-1.3.15.dist-info → cobweb_launcher-3.1.1.dist-info}/top_level.txt +0 -0
cobweb/schedulers/launcher_api.py
ADDED
@@ -0,0 +1,225 @@
import time
import threading

from cobweb.db import ApiDB
from cobweb.base import Seed, logger
from cobweb.constant import DealModel, LogTemplate
from .launcher import Launcher, check_pause


class LauncherApi(Launcher):

    def __init__(self, task, project, custom_setting=None, **kwargs):
        super().__init__(task, project, custom_setting, **kwargs)
        self._db = ApiDB()

        self._todo_key = "{%s:%s}:todo" % (project, task)
        self._done_key = "{%s:%s}:done" % (project, task)
        self._fail_key = "{%s:%s}:fail" % (project, task)
        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)

        self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
        self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
        self._speed_control_key = "speed_control:%s_%s" % (project, task)

        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)

        # self._bf_key = "bloom_%s_%s" % (project, task)
        # self._bf = BloomFilter(self._bf_key)

        self._heartbeat_start_event = threading.Event()
        self._redis_queue_empty_event = threading.Event()

    @property
    def heartbeat(self):
        return self._db.exists(self._heartbeat_key)

    def statistics(self, key, count):
        if not self._task_model and not self._db.exists(key):
            self._db.setex(key, 86400 * 30, int(count))
        else:
            self._db.incrby(key, count)

    def _get_seed(self) -> Seed:
        """
        Fetch a seed from the local queue, subject to rate control:
        within a time window of self._time_window seconds, the crawl
        volume may not exceed the threshold (self._spider_max_count).
        :return: a Seed when available, otherwise None.
        """
        if (self._speed_control and self.__LAUNCHER_QUEUE__["todo"].length and
                not self._db.auto_incr(self._speed_control_key, t=self._time_window, limit=self._spider_max_count)):
            expire_time = self._db.ttl(self._speed_control_key)
            if isinstance(expire_time, int) and expire_time <= -1:
                self._db.delete(self._speed_control_key)
            elif isinstance(expire_time, int):
                logger.info(f"Too fast! Please wait {expire_time} seconds...")
                time.sleep(expire_time / 2)
            return None
        seed = self.__LAUNCHER_QUEUE__["todo"].pop()
        return seed

    @check_pause
    def _execute_heartbeat(self):
        if self._heartbeat_start_event.is_set():
            self._db.setex(self._heartbeat_key, 5)
        time.sleep(3)

    @check_pause
    def _reset(self):
        """
        Check for expired seeds and requeue them in the redis cache.
        """
        reset_wait_seconds = 30
        if self._db.lock(self._reset_lock_key, t=120):

            _min = -int(time.time()) + self._seed_reset_seconds \
                if self.heartbeat else "-inf"

            self._db.members(self._todo_key, 0, _min=_min, _max="(0")

            if not self.heartbeat:
                self._heartbeat_start_event.set()

            self._db.delete(self._reset_lock_key)

        time.sleep(reset_wait_seconds)

    @check_pause
    def _scheduler(self):
        """
        Scheduling: pull seeds from the redis queue and track them in the
        doing dict.
        """
        if not self._db.zcount(self._todo_key, 0, "(1000"):
            time.sleep(self._scheduler_wait_seconds)
        elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
            time.sleep(self._todo_queue_full_wait_seconds)
        else:
            members = self._db.members(
                self._todo_key, int(time.time()),
                count=self._todo_queue_size,
                _min=0, _max="(1000"
            )
            for member, priority in members:
                seed = Seed(member, priority=priority)
                self.__LAUNCHER_QUEUE__['todo'].push(seed)
                self.__DOING__[seed.to_string] = seed.params.priority

    @check_pause
    def _insert(self):
        """
        Push newly generated seeds into the redis queue.
        """
        new_seeds = {}
        del_seeds = set()
        status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
        for _ in range(self._new_queue_max_size):
            seed_tuple = self.__LAUNCHER_QUEUE__['new'].pop()
            if not seed_tuple:
                break
            seed, new_seed = seed_tuple
            new_seeds[new_seed.to_string] = new_seed.params.priority
            del_seeds.add(seed)
        if new_seeds:
            self._db.zadd(self._todo_key, new_seeds, nx=True)
        if del_seeds:
            self.__LAUNCHER_QUEUE__['done'].push(list(del_seeds))
        if status:
            time.sleep(self._new_queue_wait_seconds)

    @check_pause
    def _refresh(self):
        """
        Refresh the expiry of in-flight (doing) seeds so _reset does not
        requeue them.
        """
        if self.__DOING__:
            refresh_time = int(time.time())
            seeds = {k: -refresh_time - v / 1000 for k, v in self.__DOING__.items()}
            self._db.zadd(self._todo_key, item=seeds, xx=True)
        time.sleep(15)

    @check_pause
    def _delete(self):
        """
        Remove finished seeds from the queue, route them to the succeed or
        fail queue by status, and drop them from the doing dict.
        """
        # seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}

        seed_list = []
        status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size

        for _ in range(self._done_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['done'].pop()
            if not seed:
                break
            seed_list.append(seed.to_string)

        if seed_list:
            self._db.zrem(self._todo_key, *seed_list)
            self._remove_doing_seeds(seed_list)

        if status:
            time.sleep(self._done_queue_wait_seconds)

    def _polling(self):
        wait_scheduler_execute = True
        check_empty_times = 0
        while not self._stop.is_set():
            queue_not_empty_count = 0
            polling_wait_seconds = 30

            for q in self.__LAUNCHER_QUEUE__.values():
                if q.length != 0:
                    queue_not_empty_count += 1
                    wait_scheduler_execute = False

            if queue_not_empty_count == 0:
                polling_wait_seconds = 3
                if self._pause.is_set():
                    check_empty_times = 0
                    if not self._task_model and (
                        not wait_scheduler_execute or
                        int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
                    ):
                        logger.info("Done! ready to close thread...")
                        self._stop.set()

                    elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
                        logger.info(f"Recovery {self.task} task run!")
                        self._pause.clear()
                        self._execute()
                    else:
                        logger.info("pause! waiting for resume...")
                elif check_empty_times > 2:
                    self.__DOING__ = {}
                    seed_count = self._db.zcard(self._todo_key)
                    logger.info(f"Seeds left in queue: {seed_count}")
                    if not seed_count:
                        logger.info("Done! pause set...")
                        self._pause.set()
                    else:
                        self._pause.clear()
                else:
                    logger.info(
                        "check whether the task is complete, "
                        f"reset times {3 - check_empty_times}"
                    )
                    check_empty_times += 1
            else:
                if self._pause.is_set():
                    self._pause.clear()
                logger.info(LogTemplate.launcher_pro_polling.format(
                    task=self.task,
                    doing_len=len(self.__DOING__.keys()),
                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
                    redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
                    redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
                    redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
                ))

            time.sleep(polling_wait_seconds)

        logger.info("Done! Ready to close thread...")
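Everything above revolves around a single sorted set (`{project:task}:todo`) whose score doubles as state: a pending seed carries its priority as a score in `[0, 1000)`, `_refresh` flips claimed seeds to a negative score derived from the claim time, and `_reset` requeues any negative-scored member older than the reset window. A minimal sketch of that encoding in plain Python, with no Redis involved (the helper names here are illustrative, not cobweb APIs):

```python
import time


def claim_score(priority: float, claimed_at: int) -> float:
    # What _refresh writes for an in-flight seed: negative, so it falls
    # outside the [0, 1000) range that _scheduler()/schedule() scan.
    return -claimed_at - priority / 1000


def reset_window(reset_seconds: int, now: int):
    # Score range [-now + reset_seconds, 0) that _reset rescans: it matches
    # exactly the seeds claimed more than reset_seconds ago.
    return -now + reset_seconds, "(0"


now = int(time.time())
stale = claim_score(priority=300, claimed_at=now - 120)  # claimed 2 min ago
fresh = claim_score(priority=300, claimed_at=now)        # claimed just now
_min, _max = reset_window(reset_seconds=60, now=now)
print(stale >= _min, fresh >= _min)  # True False -> only the stale seed is requeued
```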
cobweb/schedulers/scheduler.py
ADDED
@@ -0,0 +1,85 @@
import threading

from cobweb import setting
from typing import Callable
from cobweb.base import Queue
from abc import ABC, abstractmethod


class Scheduler(ABC, threading.Thread):

    __LAUNCHER_FUNC__ = ["_reset", "_scheduler", "_insert", "_refresh", "_delete"]

    def __init__(
            self,
            task,
            project,
            stop: threading.Event,
            pause: threading.Event,
            new: Queue,
            todo: Queue,
            done: Queue,
            upload: Queue,
            register: Callable
    ):
        super().__init__()
        self.task = task
        self.project = project

        self.task_model = setting.TASK_MODEL
        self.seed_reset_seconds = setting.SEED_RESET_SECONDS
        self.scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
        self.new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
        self.done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
        self.todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
        self.before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS

        self.todo_queue_size = setting.TODO_QUEUE_SIZE
        self.new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
        self.done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
        self.upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE

        self.stop = stop
        self.pause = pause

        self.new = new
        self.todo = todo
        self.done = done
        self.upload = upload

        self.register = register

        self.working_seeds = dict()

    def is_empty(self):
        if self.new.length == 0 and self.todo.length == 0 and self.done.length == 0 and self.upload.length == 0:
            return True
        else:
            return False

    def remove_working_seeds(self, seeds: list = None):
        for seed in seeds:
            if seed in self.working_seeds:
                self.working_seeds.pop(seed)

    @abstractmethod
    def reset(self):
        ...

    @abstractmethod
    def schedule(self):
        ...

    @abstractmethod
    def insert(self):
        ...

    @abstractmethod
    def refresh(self):
        ...

    @abstractmethod
    def delete(self):
        ...
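`Scheduler` mixes `ABC` into `threading.Thread`: `run()` is left to subclasses, and the injected `register` callable is how a subclass hands its loop methods (`reset`, `insert`, ...) back to the launcher for execution in worker threads. The launcher side of `register` is not shown in this hunk; a plausible minimal implementation, assuming one daemon thread per registered function, could look like:

```python
import threading


def make_register():
    """Illustrative stand-in for the register callable the launcher injects;
    assumption: each registered function runs in its own daemon thread."""
    threads = []

    def register(func, tag="launcher"):
        t = threading.Thread(target=func, name=f"{tag}:{func.__name__}", daemon=True)
        threads.append(t)
        t.start()

    return register, threads
```

Combined with the `check_pause` decorator in cobweb/utils/decorators.py below, each registered method then loops until the shared `pause` event is set.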
cobweb/schedulers/scheduler_with_redis.py
ADDED
@@ -0,0 +1,177 @@
import os
import time
import threading
from typing import Callable
from cobweb.db import RedisDB, ApiDB
from cobweb.utils import check_pause
from cobweb.base import Queue, Seed, logger
from cobweb.constant import LogTemplate
from .scheduler import Scheduler

# REDIS_API=1 selects the HTTP ApiDB backend; default is the direct Redis client
use_api = bool(int(os.getenv("REDIS_API", "0")))


class RedisScheduler(Scheduler):

    def __init__(
            self,
            task,
            project,
            stop: threading.Event,
            pause: threading.Event,
            new: Queue,
            todo: Queue,
            done: Queue,
            upload: Queue,
            register: Callable
    ):
        super().__init__(task, project, stop, pause, new, todo, done, upload, register)
        self.todo_key = "{%s:%s}:todo" % (project, task)
        self.done_key = "{%s:%s}:done" % (project, task)
        self.fail_key = "{%s:%s}:fail" % (project, task)
        self.heartbeat_key = "heartbeat:%s_%s" % (project, task)
        self.speed_control_key = "speed_control:%s_%s" % (project, task)
        self.reset_lock_key = "lock:reset:%s_%s" % (project, task)
        self.redis_queue_empty_event = threading.Event()
        self.db = ApiDB() if use_api else RedisDB()

    @check_pause
    def reset(self):
        """
        Check for expired seeds and requeue them in the redis cache.
        """
        reset_wait_seconds = 30
        if self.db.lock(self.reset_lock_key, t=60):

            _min = -int(time.time()) + self.seed_reset_seconds
            self.db.members(self.todo_key, 0, _min=_min, _max="(0")
            self.db.delete(self.reset_lock_key)

        time.sleep(reset_wait_seconds)

    @check_pause
    def schedule(self):
        """
        Scheduling: pull seeds from the redis queue and track them in the
        working dict.
        """
        if not self.db.zcount(self.todo_key, 0, "(1000"):
            time.sleep(self.scheduler_wait_seconds)
        elif self.todo.length >= self.todo_queue_size:
            time.sleep(self.todo_queue_full_wait_seconds)
        else:
            members = self.db.members(
                self.todo_key, int(time.time()),
                count=self.todo_queue_size,
                _min=0, _max="(1000"
            )
            for member, priority in members:
                seed = Seed(member, priority=priority)
                self.working_seeds[seed.to_string] = seed.params.priority
                self.todo.push(seed)

    @check_pause
    def insert(self):
        """
        Push newly generated seeds into the redis queue.
        """
        new_seeds = {}
        del_seeds = set()
        status = self.new.length < self.new_queue_max_size
        for _ in range(self.new_queue_max_size):
            seed_tuple = self.new.pop()
            if not seed_tuple:
                break
            seed, new_seed = seed_tuple
            new_seeds[new_seed.to_string] = new_seed.params.priority
            del_seeds.add(seed)
        if new_seeds:
            self.db.zadd(self.todo_key, new_seeds, nx=True)
        if del_seeds:
            self.done.push(list(del_seeds))
        if status:
            time.sleep(self.new_queue_wait_seconds)

    @check_pause
    def refresh(self):
        """
        Refresh the expiry of in-flight (working) seeds so reset does not
        requeue them.
        """
        if self.working_seeds:
            refresh_time = int(time.time())
            seeds = {k: -refresh_time - v / 1000 for k, v in self.working_seeds.items()}
            self.db.zadd(self.todo_key, item=seeds, xx=True)
        time.sleep(3)

    @check_pause
    def delete(self):
        """
        Remove finished seeds from the redis queue and drop them from the
        working dict.
        """
        seed_list = []
        status = self.done.length < self.done_queue_max_size

        for _ in range(self.done_queue_max_size):
            seed = self.done.pop()
            if not seed:
                break
            seed_list.append(seed.to_string)

        if seed_list:
            self.db.zrem(self.todo_key, *seed_list)
            self.remove_working_seeds(seed_list)

        if status:
            time.sleep(self.done_queue_wait_seconds)

    def run(self):
        start_time = int(time.time())

        self.register(self.reset, tag="scheduler")
        self.register(self.insert, tag="scheduler")
        self.register(self.delete, tag="scheduler")
        self.register(self.refresh, tag="scheduler")
        self.register(self.schedule, tag="scheduler")

        while not self.stop.is_set():
            working_count = len(self.working_seeds.keys())
            memory_count = self.db.zcount(self.todo_key, "-inf", "(0")
            todo_count = self.db.zcount(self.todo_key, 0, "(1000")
            all_count = self.db.zcard(self.todo_key)

            if self.is_empty():
                if self.pause.is_set():
                    execute_time = int(time.time()) - start_time
                    if not self.task_model and execute_time > self.before_scheduler_wait_seconds:
                        logger.info("Done! ready to close thread...")
                        self.stop.set()
                    elif todo_count:
                        logger.info(f"Recovery {self.task} task run! todo seeds count: {todo_count}, queue length: {all_count}")
                        self.pause.clear()
                        # self.execute()
                    else:
                        logger.info("pause! waiting for resume...")
                else:
                    if all_count:
                        logger.info(f"todo seeds count: {todo_count}, queue length: {all_count}")
                        self.pause.clear()  # seeds remain in redis: keep running
                    else:
                        logger.info("Done! pause set...")
                        self.pause.set()  # queues and redis drained: pause the task
            else:
                if self.pause.is_set():
                    self.pause.clear()
                logger.info(LogTemplate.launcher_pro_polling.format(
                    task=self.task,
                    doing_len=working_count,
                    todo_len=self.todo.length,
                    done_len=self.done.length,
                    redis_seed_count=all_count,
                    redis_todo_len=todo_count,
                    redis_doing_len=memory_count,
                    upload_len=self.upload.length,
                ))

            time.sleep(30)

        logger.info("Done! Ready to close thread...")
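The `RedisDB.members`/`ApiDB.members` helper is not part of this diff, but from its call sites it reads as "fetch up to `count` members in a score range, then re-score them with the negated second argument" — scheduling and claiming in one step. A rough redis-py equivalent of that claim step, offered only as a sketch (the real helper presumably does this atomically, e.g. via a Lua script, which this naive version does not):

```python
import time

import redis

r = redis.Redis(decode_responses=True)


def claim(todo_key: str, count: int = 100):
    # Fetch pending seeds: score in [0, 1000), lowest priority value first.
    members = r.zrangebyscore(todo_key, 0, "(1000",
                              start=0, num=count, withscores=True)
    now = int(time.time())
    if members:
        # Flip scores negative so concurrent workers skip these seeds;
        # xx=True updates existing members only, mirroring zadd(..., xx=True).
        r.zadd(todo_key, {m: -now - s / 1000 for m, s in members}, xx=True)
    return members
```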
cobweb/setting.py
CHANGED
@@ -30,49 +30,29 @@ OSS_MIN_UPLOAD_SIZE = 1024
 # Crawler selection
 CRAWLER = "cobweb.crawlers.Crawler"
 
-#
-PIPELINE = "cobweb.pipelines.
+# Data pipeline
+PIPELINE = "cobweb.pipelines.Console"
+
+# Scheduler
+SCHEDULER = "cobweb.schedulers.RedisScheduler"
 
 
 # Launcher wait times
 
 BEFORE_SCHEDULER_WAIT_SECONDS = 60  # wait before scheduling; only applies to one-off tasks
-
-
-INSERTING_WAIT_TIME = 30  # INSERT ITEM wait time
-REMOVING_WAIT_TIME = 5  # REMOVE ITEM wait time
-RESET_WAIT_TIME = 30  # RESET ITEM wait time
-UPLOAD_WAIT_TIME = 15  # upload wait time
-
-TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait when the queue is full
+SCHEDULER_WAIT_SECONDS = 15  # scheduler wait time
+TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait when the todo queue is full
 NEW_QUEUE_WAIT_SECONDS = 30  # new queue wait time
 DONE_QUEUE_WAIT_SECONDS = 5  # done queue wait time
 UPLOAD_QUEUE_WAIT_SECONDS = 15  # upload queue wait time
-SEED_RESET_SECONDS =
+SEED_RESET_SECONDS = 60  # seed reset interval
 
 
 # Launcher queue sizes
-
-
-
-
-# SEED = Queue()  # queue of newly added task seeds
-# TODO = Queue()  # task seed queue
-# REQUEST = Queue()  # request queue
-# DOWNLOAD = Queue()  # download task queue
-# RESPONSE = Queue()  # response queue
-# DONE = Queue()  # download-finished queue
-# UPLOAD = Queue()  # task upload queue
-# DELETE = Queue()  # task delete queue
-
-SEED_QUEUE_SIZE = 100  # seed queue size
-TODO_QUEUE_SIZE = 100  # todo queue size
-REQUEST_QUEUE_SIZE = 100  # new queue size
-DOWNLOAD_QUEUE_SIZE = 100  # done queue size
-RESPONSE_QUEUE_SIZE = 100  # response queue size
-DONE_QUEUE_SIZE = 100  # done queue size
-UPLOAD_QUEUE_SIZE = 100  # upload queue size
-DELETE_QUEUE_SIZE = 100  # delete queue size
+TODO_QUEUE_SIZE = 100  # todo queue size
+NEW_QUEUE_MAX_SIZE = 100  # new queue max size
+DONE_QUEUE_MAX_SIZE = 100  # done queue max size
+UPLOAD_QUEUE_MAX_SIZE = 100  # upload queue max size
 
 # DONE_MODEL in (0, 1), seed completion mode
 DONE_MODEL = 0  # 0: on success the seed is removed from the queue directly, on failure it goes to the fail queue; 1: on success it goes to the done queue, on failure to the fail queue
@@ -81,6 +61,7 @@ DONE_MODEL = 0
 SPIDER_THREAD_NUM = 10
 SPIDER_MAX_RETRIES = 5
 SPIDER_TIME_SLEEP = 10
+RECORD_FAILED_SPIDER = False
 
 SPIDER_MAX_COUNT = 1000  # max number of requests within the time window
 TIME_WINDOW = 60  # fixed rate-control window (seconds)
@@ -88,6 +69,8 @@ TIME_WINDOW = 60
 # Task mode
 TASK_MODEL = 0  # 0: one-off, 1: resident
 
+# Rate control
+SPEED_CONTROL = 1  # 0: off, 1: on
 
 # Bloom filter
 CAPACITY = 100000000
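Taken together, `SPEED_CONTROL = 1` with the defaults above caps a task at `SPIDER_MAX_COUNT` requests per `TIME_WINDOW` seconds, i.e. roughly 1000/60 ≈ 17 requests per second across all spider threads. How per-task overrides are merged is not visible in this diff, but the `Launcher(task, project, custom_setting=None, **kwargs)` signature suggests a dict keyed by these setting names; a hypothetical call (import path assumed from the file list):

```python
from cobweb.launchers import Launcher  # assumed export; see cobweb/launchers/__init__.py

# Hypothetical override dict: keys mirror the module-level names in
# cobweb/setting.py; exact merge semantics are not shown in this diff.
app = Launcher(
    task="news",
    project="demo",
    custom_setting={
        "SPIDER_THREAD_NUM": 20,
        "SPIDER_MAX_COUNT": 600,  # ~10 requests/second over the 60 s window
        "TASK_MODEL": 1,          # resident task instead of one-off
    },
)
```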
cobweb/utils/decorators.py
ADDED
@@ -0,0 +1,43 @@
import time
from functools import wraps

from cobweb.base import logger


def decorator_oss_db(exception, retries=3):
    def decorator(func):
        @wraps(func)
        def wrapper(callback_func, *args, **kwargs):
            result = None
            for i in range(retries):
                msg = None
                try:
                    return func(callback_func, *args, **kwargs)
                except Exception as e:
                    result = None
                    msg = e
                finally:
                    if result:
                        return result

                    if i >= 2 and msg:
                        raise exception(msg)

        return wrapper

    return decorator


def check_pause(func):
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        while not self.pause.is_set():
            try:
                func(self, *args, **kwargs)
            except Exception as e:
                logger.info(f"{func.__name__}: " + str(e))
            finally:
                time.sleep(0.1)
        logger.info(f"pause: {func.__name__} thread close ...")

    return wrapper
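`check_pause` is the glue between the scheduler methods above and the thread model: it wraps a method in a loop that keeps calling it (logging, never propagating, exceptions) until the owning object's `pause` event is set. A toy illustration with a made-up `Worker` class:

```python
import threading
import time

from cobweb.utils import check_pause  # importable from cobweb.utils per scheduler_with_redis.py


class Worker:
    def __init__(self):
        self.pause = threading.Event()  # check_pause expects this attribute
        self.ticks = 0

    @check_pause
    def tick(self):
        # Body runs repeatedly, with a 0.1 s breather added by the decorator.
        self.ticks += 1


w = Worker()
t = threading.Thread(target=w.tick, daemon=True)
t.start()
time.sleep(1)
w.pause.set()  # loop exits; decorator logs "pause: tick thread close ..."
t.join()
```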
cobweb/utils/dotting.py
ADDED
@@ -0,0 +1,55 @@
import json
import time

from aliyun.log import LogClient, LogItem, PutLogsRequest

from cobweb.base import Queue, logger
from cobweb import setting


class LoghubDot:

    def __init__(self):
        self.client = LogClient(**setting.LOGHUB_CONFIG)
        self.queue = Queue()

    def build(self, topic, **kwargs):

        temp = {}
        log_item = LogItem()
        for key, value in kwargs.items():
            if not isinstance(value, str):
                temp[key] = json.dumps(value, ensure_ascii=False)
            else:
                temp[key] = value
        contents = sorted(temp.items())
        log_item.set_contents(contents)
        self.queue.push((topic, log_item), direct_insertion=True)

    def build_run(self):
        while True:
            start_time = int(time.time())
            while True:
                cost_time = int(time.time()) - start_time
                if self.queue.length >= 1000 or cost_time > 10:
                    break
                time.sleep(0.5)
            try:
                log_item_info = {}
                for _ in range(1000):
                    its = self.queue.pop()
                    if not its:
                        break
                    topic, item = its
                    log_item_info.setdefault(topic, []).append(item)
                for topic, log_items in log_item_info.items():
                    request = PutLogsRequest(
                        project="databee-download-log",
                        logstore="log",
                        topic=topic,
                        logitems=log_items,
                        compress=True
                    )
                    self.client.put_logs(request=request)
            except Exception as e:
                logger.info(str(e))
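`build_run` drains the queue in batches: it waits until 1000 items are queued or roughly 10 seconds have passed, then groups items by topic and ships each group in a single `PutLogsRequest` (note that the Aliyun Log project and logstore names are hard-coded). A minimal usage sketch, assuming `setting.LOGHUB_CONFIG` holds valid `LogClient` keyword arguments (endpoint and credentials):

```python
import threading

from cobweb.utils.dotting import LoghubDot

dot = LoghubDot()  # reads setting.LOGHUB_CONFIG for LogClient(**...)
threading.Thread(target=dot.build_run, daemon=True).start()

# One build() call becomes one LogItem; non-string values are JSON-encoded
# and keys are sorted before upload.
dot.build("seed_state", seed="https://example.com", status=200, cost={"ms": 134})
```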