cobweb-launcher 1.2.49__py3-none-any.whl → 1.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/base/__init__.py +141 -4
- cobweb/base/basic.py +28 -82
- cobweb/base/common_queue.py +13 -0
- cobweb/base/dotting.py +1 -1
- cobweb/base/request.py +14 -2
- cobweb/base/seed.py +10 -6
- cobweb/constant.py +16 -0
- cobweb/crawlers/crawler.py +51 -181
- cobweb/db/redis_db.py +28 -0
- cobweb/launchers/__init__.py +2 -2
- cobweb/launchers/launcher.py +110 -141
- cobweb/launchers/launcher_api.py +66 -114
- cobweb/launchers/launcher_pro.py +76 -194
- cobweb/pipelines/base_pipeline.py +54 -0
- cobweb/pipelines/loghub_pipeline.py +34 -0
- cobweb/pipelines/pipeline.py +25 -49
- cobweb/schedulers/__init__.py +0 -2
- cobweb/schedulers/scheduler_redis.py +5 -8
- cobweb/setting.py +29 -6
- cobweb/utils/dotting.py +10 -42
- cobweb_/__init__.py +2 -0
- cobweb_/base/__init__.py +9 -0
- cobweb_/base/common_queue.py +30 -0
- cobweb_/base/decorators.py +40 -0
- cobweb_/base/item.py +46 -0
- cobweb_/base/log.py +94 -0
- cobweb_/base/request.py +82 -0
- cobweb_/base/response.py +23 -0
- cobweb_/base/seed.py +114 -0
- cobweb_/constant.py +94 -0
- cobweb_/crawlers/__init__.py +1 -0
- cobweb_/crawlers/crawler.py +184 -0
- cobweb_/db/__init__.py +2 -0
- cobweb_/db/api_db.py +82 -0
- cobweb_/db/redis_db.py +130 -0
- cobweb_/exceptions/__init__.py +1 -0
- cobweb_/exceptions/oss_db_exception.py +28 -0
- cobweb_/launchers/__init__.py +3 -0
- cobweb_/launchers/launcher.py +235 -0
- cobweb_/launchers/launcher_air.py +88 -0
- cobweb_/launchers/launcher_api.py +221 -0
- cobweb_/launchers/launcher_pro.py +222 -0
- cobweb_/pipelines/__init__.py +3 -0
- cobweb_/pipelines/pipeline.py +69 -0
- cobweb_/pipelines/pipeline_console.py +22 -0
- cobweb_/pipelines/pipeline_loghub.py +34 -0
- cobweb_/setting.py +74 -0
- cobweb_/utils/__init__.py +5 -0
- cobweb_/utils/bloom.py +58 -0
- cobweb_/utils/dotting.py +32 -0
- cobweb_/utils/oss.py +94 -0
- cobweb_/utils/tools.py +42 -0
- {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/METADATA +1 -1
- cobweb_launcher-1.3.2.dist-info/RECORD +110 -0
- cobweb_launcher-1.3.2.dist-info/top_level.txt +2 -0
- cobweb_new/__init__.py +2 -0
- cobweb_new/base/__init__.py +72 -0
- cobweb_new/base/common_queue.py +53 -0
- cobweb_new/base/decorators.py +72 -0
- cobweb_new/base/item.py +46 -0
- cobweb_new/base/log.py +94 -0
- cobweb_new/base/request.py +82 -0
- cobweb_new/base/response.py +23 -0
- cobweb_new/base/seed.py +118 -0
- cobweb_new/constant.py +105 -0
- cobweb_new/crawlers/__init__.py +1 -0
- cobweb_new/crawlers/crawler-new.py +85 -0
- cobweb_new/crawlers/crawler.py +170 -0
- cobweb_new/db/__init__.py +2 -0
- cobweb_new/db/api_db.py +82 -0
- cobweb_new/db/redis_db.py +158 -0
- cobweb_new/exceptions/__init__.py +1 -0
- cobweb_new/exceptions/oss_db_exception.py +28 -0
- cobweb_new/launchers/__init__.py +3 -0
- cobweb_new/launchers/launcher.py +237 -0
- cobweb_new/launchers/launcher_air.py +88 -0
- cobweb_new/launchers/launcher_api.py +161 -0
- cobweb_new/launchers/launcher_pro.py +96 -0
- cobweb_new/launchers/tesss.py +47 -0
- cobweb_new/pipelines/__init__.py +3 -0
- cobweb_new/pipelines/pipeline.py +68 -0
- cobweb_new/pipelines/pipeline_console.py +22 -0
- cobweb_new/pipelines/pipeline_loghub.py +34 -0
- cobweb_new/setting.py +95 -0
- cobweb_new/utils/__init__.py +5 -0
- cobweb_new/utils/bloom.py +58 -0
- cobweb_new/utils/oss.py +94 -0
- cobweb_new/utils/tools.py +42 -0
- cobweb/schedulers/scheduler_api.py +0 -72
- cobweb_launcher-1.2.49.dist-info/RECORD +0 -44
- cobweb_launcher-1.2.49.dist-info/top_level.txt +0 -1
- {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/WHEEL +0 -0
cobweb/launchers/launcher_api.py
CHANGED
@@ -2,9 +2,9 @@ import time
|
|
2
2
|
import threading
|
3
3
|
|
4
4
|
from cobweb.db import ApiDB
|
5
|
-
from cobweb.base import Seed, logger
|
6
|
-
from cobweb.constant import DealModel
|
7
|
-
from .launcher import Launcher
|
5
|
+
from cobweb.base import Seed, TaskQueue,logger, stop, pause
|
6
|
+
from cobweb.constant import DealModel
|
7
|
+
from .launcher import Launcher
|
8
8
|
|
9
9
|
|
10
10
|
class LauncherApi(Launcher):
|
@@ -24,18 +24,14 @@ class LauncherApi(Launcher):
|
|
24
24
|
|
25
25
|
self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
|
26
26
|
|
27
|
-
# self._bf_key = "bloom_%s_%s" % (project, task)
|
28
|
-
# self._bf = BloomFilter(self._bf_key)
|
29
|
-
|
30
27
|
self._heartbeat_start_event = threading.Event()
|
31
|
-
self._redis_queue_empty_event = threading.Event()
|
32
28
|
|
33
29
|
@property
|
34
30
|
def heartbeat(self):
|
35
31
|
return self._db.exists(self._heartbeat_key)
|
36
32
|
|
37
33
|
def statistics(self, key, count):
|
38
|
-
if not self.
|
34
|
+
if not self.task_model and not self._db.exists(key):
|
39
35
|
self._db.setex(key, 86400 * 30, int(count))
|
40
36
|
else:
|
41
37
|
self._db.incrby(key, count)
|
@@ -46,30 +42,25 @@ class LauncherApi(Launcher):
|
|
46
42
|
设置时间窗口为self._time_window(秒),判断在该窗口内的采集量是否满足阈值(self._spider_max_speed)
|
47
43
|
:return: True -> 种子, False -> None
|
48
44
|
"""
|
49
|
-
if
|
50
|
-
|
45
|
+
if TaskQueue.TODO.length and not self._db.auto_incr(
|
46
|
+
self._speed_control_key,
|
47
|
+
t=self.time_window,
|
48
|
+
limit=self.spider_max_count
|
49
|
+
):
|
51
50
|
expire_time = self._db.ttl(self._speed_control_key)
|
52
51
|
logger.info(f"Too fast! Please wait {expire_time} seconds...")
|
53
52
|
time.sleep(expire_time / 2)
|
54
53
|
return None
|
55
|
-
|
56
|
-
return seed
|
57
|
-
|
58
|
-
@check_pause
|
59
|
-
def _execute_heartbeat(self):
|
60
|
-
if self._heartbeat_start_event.is_set():
|
61
|
-
self._db.setex(self._heartbeat_key, 5)
|
62
|
-
time.sleep(3)
|
54
|
+
return TaskQueue.TODO.pop()
|
63
55
|
|
64
|
-
@
|
56
|
+
@stop
|
65
57
|
def _reset(self):
|
66
58
|
"""
|
67
59
|
检查过期种子,重新添加到redis缓存中
|
68
60
|
"""
|
69
|
-
reset_wait_seconds = 30
|
70
61
|
if self._db.lock(self._reset_lock_key, t=120):
|
71
62
|
|
72
|
-
_min = -int(time.time()) + self.
|
63
|
+
_min = -int(time.time()) + self.seed_reset_seconds \
|
73
64
|
if self.heartbeat else "-inf"
|
74
65
|
|
75
66
|
self._db.members(self._todo_key, 0, _min=_min, _max="(0")
|
@@ -79,131 +70,92 @@ class LauncherApi(Launcher):
|
|
79
70
|
|
80
71
|
self._db.delete(self._reset_lock_key)
|
81
72
|
|
82
|
-
time.sleep(
|
73
|
+
time.sleep(30)
|
74
|
+
|
75
|
+
@stop
|
76
|
+
def _refresh(self):
|
77
|
+
"""
|
78
|
+
刷新doing种子过期时间,防止reset重新消费
|
79
|
+
"""
|
80
|
+
if self.doing_seeds:
|
81
|
+
refresh_time = int(time.time())
|
82
|
+
seeds = {k: -refresh_time - v / 1e3 for k, v in self.doing_seeds.items()}
|
83
|
+
self._db.zadd(self._todo_key, item=seeds, xx=True)
|
84
|
+
time.sleep(3)
|
83
85
|
|
84
|
-
@
|
86
|
+
@stop
|
85
87
|
def _scheduler(self):
|
86
88
|
"""
|
87
89
|
调度任务,获取redis队列种子,同时添加到doing字典中
|
88
90
|
"""
|
89
91
|
if not self._db.zcount(self._todo_key, 0, "(1000"):
|
90
|
-
time.sleep(self.
|
91
|
-
elif
|
92
|
-
time.sleep(self.
|
92
|
+
time.sleep(self.scheduler_wait_seconds)
|
93
|
+
elif TaskQueue.TODO.length >= self.todo_queue_size:
|
94
|
+
time.sleep(self.todo_queue_full_wait_seconds)
|
93
95
|
else:
|
94
96
|
members = self._db.members(
|
95
97
|
self._todo_key, int(time.time()),
|
96
|
-
count=self.
|
98
|
+
count=self.todo_queue_size,
|
97
99
|
_min=0, _max="(1000"
|
98
100
|
)
|
99
101
|
for member, priority in members:
|
100
102
|
seed = Seed(member, priority=priority)
|
101
|
-
|
102
|
-
self.
|
103
|
+
TaskQueue.TODO.push(seed)
|
104
|
+
self.doing_seeds[seed.to_string] = seed.params.priority
|
105
|
+
|
106
|
+
@pause
|
107
|
+
def _heartbeat(self):
|
108
|
+
if self._heartbeat_start_event.is_set():
|
109
|
+
self._db.setex(self._heartbeat_key, t=5)
|
110
|
+
time.sleep(3)
|
103
111
|
|
104
|
-
@
|
112
|
+
@pause
|
105
113
|
def _insert(self):
|
106
114
|
"""
|
107
115
|
添加新种子到redis队列中
|
108
116
|
"""
|
109
117
|
seeds = {}
|
110
|
-
|
111
|
-
|
112
|
-
seed = self.__LAUNCHER_QUEUE__['new'].pop()
|
113
|
-
if seed:
|
118
|
+
for _ in range(self.new_queue_max_size):
|
119
|
+
if seed := TaskQueue.SEED.pop():
|
114
120
|
seeds[seed.to_string] = seed.params.priority
|
115
121
|
if seeds:
|
116
122
|
self._db.zadd(self._todo_key, seeds, nx=True)
|
117
|
-
if
|
118
|
-
time.sleep(self.
|
119
|
-
|
120
|
-
@check_pause
|
121
|
-
def _refresh(self):
|
122
|
-
"""
|
123
|
-
刷新doing种子过期时间,防止reset重新消费
|
124
|
-
"""
|
125
|
-
if self.__DOING__:
|
126
|
-
refresh_time = int(time.time())
|
127
|
-
seeds = {k:-refresh_time - v / 1000 for k, v in self.__DOING__.items()}
|
128
|
-
self._db.zadd(self._todo_key, item=seeds, xx=True)
|
129
|
-
time.sleep(15)
|
123
|
+
if TaskQueue.SEED.length < self.new_queue_max_size:
|
124
|
+
time.sleep(self.new_queue_wait_seconds)
|
130
125
|
|
131
|
-
@
|
126
|
+
@pause
|
132
127
|
def _delete(self):
|
133
128
|
"""
|
134
129
|
删除队列种子,根据状态添加至成功或失败队列,移除doing字典种子索引
|
135
130
|
"""
|
136
|
-
|
131
|
+
seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
|
132
|
+
status = TaskQueue.DONE.length < self.done_queue_max_size
|
137
133
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
for _ in range(self._done_queue_max_size):
|
142
|
-
seed = self.__LAUNCHER_QUEUE__['done'].pop()
|
134
|
+
for _ in range(self.done_queue_max_size):
|
135
|
+
seed = TaskQueue.DONE.pop()
|
143
136
|
if not seed:
|
144
137
|
break
|
145
|
-
|
138
|
+
if seed.params.seed_status == DealModel.fail:
|
139
|
+
seed_info["failed"].append(seed.to_string)
|
140
|
+
elif self.done_model == 1:
|
141
|
+
seed_info["succeed"].append(seed.to_string)
|
142
|
+
else:
|
143
|
+
seed_info["common"].append(seed.to_string)
|
144
|
+
seed_info['count'] += 1
|
146
145
|
|
147
|
-
if
|
146
|
+
if seed_info["count"]:
|
148
147
|
|
149
|
-
self._db.zrem(self._todo_key, *
|
150
|
-
self.
|
148
|
+
succeed_count = int(self._db.zrem(self._todo_key, *seed_info["common"]) or 0)
|
149
|
+
succeed_count += int(self._db.done([self._todo_key, self._done_key], *seed_info["succeed"]) or 0)
|
150
|
+
failed_count = int(self._db.done([self._todo_key, self._fail_key], *seed_info["failed"]) or 0)
|
151
|
+
|
152
|
+
if failed_count:
|
153
|
+
self.statistics(self._statistics_fail_key, failed_count)
|
154
|
+
if succeed_count:
|
155
|
+
self.statistics(self._statistics_done_key, succeed_count)
|
156
|
+
|
157
|
+
self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])
|
151
158
|
|
152
159
|
if status:
|
153
|
-
time.sleep(self.
|
154
|
-
|
155
|
-
def _polling(self):
|
156
|
-
wait_scheduler_execute = True
|
157
|
-
check_emtpy_times = 0
|
158
|
-
while not self._stop.is_set():
|
159
|
-
queue_not_empty_count = 0
|
160
|
-
pooling_wait_seconds = 30
|
161
|
-
|
162
|
-
for q in self.__LAUNCHER_QUEUE__.values():
|
163
|
-
if q.length != 0:
|
164
|
-
queue_not_empty_count += 1
|
165
|
-
wait_scheduler_execute = False
|
166
|
-
|
167
|
-
if queue_not_empty_count == 0:
|
168
|
-
pooling_wait_seconds = 3
|
169
|
-
if self._pause.is_set():
|
170
|
-
check_emtpy_times = 0
|
171
|
-
if not self._task_model and (
|
172
|
-
not wait_scheduler_execute or
|
173
|
-
int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
|
174
|
-
):
|
175
|
-
logger.info("Done! ready to close thread...")
|
176
|
-
self._stop.set()
|
177
|
-
|
178
|
-
elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
|
179
|
-
logger.info(f"Recovery {self.task} task run!")
|
180
|
-
self._pause.clear()
|
181
|
-
self._execute()
|
182
|
-
else:
|
183
|
-
logger.info("pause! waiting for resume...")
|
184
|
-
elif check_emtpy_times > 2:
|
185
|
-
self.__DOING__ = {}
|
186
|
-
if not self._db.zcount(self._todo_key, _min="-inf", _max="(1000"):
|
187
|
-
self._pause.set()
|
188
|
-
else:
|
189
|
-
logger.info(
|
190
|
-
"check whether the task is complete, "
|
191
|
-
f"reset times {3 - check_emtpy_times}"
|
192
|
-
)
|
193
|
-
check_emtpy_times += 1
|
194
|
-
else:
|
195
|
-
logger.info(LogTemplate.launcher_pro_polling.format(
|
196
|
-
task=self.task,
|
197
|
-
doing_len=len(self.__DOING__.keys()),
|
198
|
-
todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
|
199
|
-
done_len=self.__LAUNCHER_QUEUE__['done'].length,
|
200
|
-
redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
|
201
|
-
redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
|
202
|
-
redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
|
203
|
-
upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
|
204
|
-
))
|
205
|
-
|
206
|
-
time.sleep(pooling_wait_seconds)
|
207
|
-
|
208
|
-
logger.info("Done! Ready to close thread...")
|
160
|
+
time.sleep(self.done_queue_wait_seconds)
|
209
161
|
|
cobweb/launchers/launcher_pro.py
CHANGED
@@ -1,208 +1,90 @@
|
|
1
1
|
import time
|
2
|
-
import threading
|
3
2
|
|
4
|
-
from cobweb.
|
5
|
-
from cobweb.
|
6
|
-
from
|
7
|
-
from cobweb.constant import DealModel, LogTemplate
|
8
|
-
from .launcher import Launcher, check_pause
|
3
|
+
from cobweb.base import TaskQueue, Decorators
|
4
|
+
from cobweb.schedulers import RedisScheduler
|
5
|
+
from .launcher import Launcher
|
9
6
|
|
10
7
|
|
11
8
|
class LauncherPro(Launcher):
|
12
9
|
|
13
10
|
def __init__(self, task, project, custom_setting=None, **kwargs):
|
14
11
|
super().__init__(task, project, custom_setting, **kwargs)
|
15
|
-
self.
|
16
|
-
self.
|
17
|
-
self.
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
@
|
36
|
-
def
|
37
|
-
|
38
|
-
|
39
|
-
def statistics(self, key, count):
|
40
|
-
if not self._task_model and not self._db.exists(key):
|
41
|
-
self._db.setex(key, 86400 * 30, int(count))
|
42
|
-
else:
|
43
|
-
self._db._client.incrby(key, count)
|
44
|
-
|
45
|
-
def _get_seed(self) -> Seed:
|
46
|
-
spider_speed = self._db._client.get(self._speed_control_key)
|
47
|
-
if int(spider_speed or 0) > self._spider_max_count:
|
48
|
-
expire_time = self._db.ttl(self._speed_control_key)
|
49
|
-
if expire_time == -1:
|
50
|
-
self._db.delete(self._speed_control_key)
|
51
|
-
else:
|
52
|
-
logger.info(f"Too fast! Please wait {expire_time} seconds...")
|
53
|
-
time.sleep(expire_time / 2)
|
54
|
-
return None
|
55
|
-
seed = self.__LAUNCHER_QUEUE__["todo"].pop()
|
56
|
-
if seed and not self._db.lock(self._speed_control_key, t=self._time_window):
|
57
|
-
self._db._client.incrby(self._speed_control_key, 1)
|
58
|
-
return seed
|
59
|
-
|
60
|
-
@check_pause
|
61
|
-
def _execute_heartbeat(self):
|
62
|
-
if self._heartbeat_start_event.is_set():
|
63
|
-
self._db.setex(self._heartbeat_key, 5)
|
12
|
+
self._redis_download = "{%s:%s}:download" % (project, task)
|
13
|
+
self._redis_todo = "{%s:%s}:todo" % (project, task)
|
14
|
+
self._scheduler = RedisScheduler(task, project)
|
15
|
+
|
16
|
+
@Decorators.stop
|
17
|
+
def _schedule(self):
|
18
|
+
thread_sleep = self.scheduling_wait_time
|
19
|
+
for q, key, size, item_info in [
|
20
|
+
(TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
|
21
|
+
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
|
22
|
+
]:
|
23
|
+
if q.length < size:
|
24
|
+
for member, priority in self._scheduler.schedule(
|
25
|
+
key, self.scheduling_size
|
26
|
+
):
|
27
|
+
q.push((member, priority), direct_insertion=True)
|
28
|
+
self.add_working_item(key.split(":")[-1], member, priority)
|
29
|
+
thread_sleep = 0.1
|
30
|
+
time.sleep(thread_sleep)
|
31
|
+
|
32
|
+
@Decorators.pause
|
33
|
+
def _heartbeat(self):
|
34
|
+
if self._scheduler.working.is_set():
|
35
|
+
self._scheduler.set_heartbeat()
|
64
36
|
time.sleep(3)
|
65
37
|
|
66
|
-
@
|
38
|
+
@Decorators.pause
|
67
39
|
def _reset(self):
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
_min = -int(time.time()) + self._seed_reset_seconds \
|
75
|
-
if self.heartbeat else "-inf"
|
76
|
-
|
77
|
-
self._db.members(self._todo_key, 0, _min=_min, _max="(0")
|
78
|
-
self._db.delete(self._reset_lock_key)
|
79
|
-
|
80
|
-
if not self.heartbeat:
|
81
|
-
self._heartbeat_start_event.set()
|
82
|
-
|
83
|
-
time.sleep(reset_wait_seconds)
|
84
|
-
|
85
|
-
@check_pause
|
86
|
-
def _scheduler(self):
|
87
|
-
"""
|
88
|
-
调度任务,获取redis队列种子,同时添加到doing字典中
|
89
|
-
"""
|
90
|
-
if not self._db.zcount(self._todo_key, 0, "(1000"):
|
91
|
-
time.sleep(self._scheduler_wait_seconds)
|
92
|
-
elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
|
93
|
-
time.sleep(self._todo_queue_full_wait_seconds)
|
94
|
-
else:
|
95
|
-
members = self._db.members(
|
96
|
-
self._todo_key, int(time.time()),
|
97
|
-
count=self._todo_queue_size,
|
98
|
-
_min=0, _max="(1000"
|
99
|
-
)
|
100
|
-
for member, priority in members:
|
101
|
-
seed = Seed(member, priority=priority)
|
102
|
-
self.__LAUNCHER_QUEUE__['todo'].push(seed)
|
103
|
-
self.__DOING__[seed.to_string] = seed.params.priority
|
40
|
+
self._scheduler.reset(
|
41
|
+
keys=[self._redis_todo, self._redis_download],
|
42
|
+
reset_time=self.seed_reset_seconds
|
43
|
+
)
|
44
|
+
time.sleep(30)
|
104
45
|
|
105
|
-
@
|
46
|
+
@Decorators.pause
|
106
47
|
def _insert(self):
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
self.
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
@check_pause
|
48
|
+
thread_sleep = 0.1
|
49
|
+
for q, key, size in [
|
50
|
+
(TaskQueue.SEED, self._redis_todo, self.seed_queue_size),
|
51
|
+
(TaskQueue.REQUEST, self._redis_download, self.request_queue_size),
|
52
|
+
]:
|
53
|
+
item_info = {}
|
54
|
+
while item := q.pop() and len(item_info.keys()) < self.inserting_size:
|
55
|
+
item_info[item.seed] = item.params.priority
|
56
|
+
if q.length >= size:
|
57
|
+
thread_sleep = self.inserting_wait_time
|
58
|
+
self._scheduler.insert(key, item_info)
|
59
|
+
time.sleep(thread_sleep)
|
60
|
+
|
61
|
+
@Decorators.pause
|
122
62
|
def _refresh(self):
|
123
|
-
""
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
if status:
|
152
|
-
time.sleep(self._done_queue_wait_seconds)
|
153
|
-
|
154
|
-
def _polling(self):
|
155
|
-
wait_scheduler_execute = True
|
156
|
-
check_emtpy_times = 0
|
157
|
-
while not self._stop.is_set():
|
158
|
-
queue_not_empty_count = 0
|
159
|
-
pooling_wait_seconds = 30
|
160
|
-
|
161
|
-
for q in self.__LAUNCHER_QUEUE__.values():
|
162
|
-
if q.length != 0:
|
163
|
-
queue_not_empty_count += 1
|
164
|
-
wait_scheduler_execute = False
|
165
|
-
|
166
|
-
if queue_not_empty_count == 0:
|
167
|
-
pooling_wait_seconds = 3
|
168
|
-
if self._pause.is_set():
|
169
|
-
check_emtpy_times = 0
|
170
|
-
if not self._task_model and (
|
171
|
-
not wait_scheduler_execute or
|
172
|
-
int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
|
173
|
-
):
|
174
|
-
logger.info("Done! ready to close thread...")
|
175
|
-
self._stop.set()
|
176
|
-
|
177
|
-
elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
|
178
|
-
logger.info(f"Recovery {self.task} task run!")
|
179
|
-
self._pause.clear()
|
180
|
-
self._execute()
|
181
|
-
else:
|
182
|
-
logger.info("pause! waiting for resume...")
|
183
|
-
elif check_emtpy_times > 2:
|
184
|
-
self.__DOING__ = {}
|
185
|
-
if not self._db.zcount(self._todo_key, _min="-inf", _max="(1000"):
|
186
|
-
self._pause.set()
|
187
|
-
else:
|
188
|
-
logger.info(
|
189
|
-
"check whether the task is complete, "
|
190
|
-
f"reset times {3 - check_emtpy_times}"
|
191
|
-
)
|
192
|
-
check_emtpy_times += 1
|
193
|
-
else:
|
194
|
-
logger.info(LogTemplate.launcher_pro_polling.format(
|
195
|
-
task=self.task,
|
196
|
-
doing_len=len(self.__DOING__.keys()),
|
197
|
-
todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
|
198
|
-
done_len=self.__LAUNCHER_QUEUE__['done'].length,
|
199
|
-
redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
|
200
|
-
redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
|
201
|
-
redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
|
202
|
-
upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
|
203
|
-
))
|
204
|
-
|
205
|
-
time.sleep(pooling_wait_seconds)
|
206
|
-
|
207
|
-
logger.info("Done! Ready to close thread...")
|
208
|
-
|
63
|
+
self._scheduler.refresh(self._redis_todo, self._task_info["todo"])
|
64
|
+
self._scheduler.refresh(self._redis_download, self._task_info["download"])
|
65
|
+
time.sleep(10)
|
66
|
+
|
67
|
+
@Decorators.pause
|
68
|
+
def _remove(self):
|
69
|
+
thread_sleep = self.removing_wait_time
|
70
|
+
for q, key, size in [
|
71
|
+
(TaskQueue.DELETE, self._redis_todo, self.delete_queue_size),
|
72
|
+
(TaskQueue.DONE, self._redis_download, self.done_queue_size),
|
73
|
+
]:
|
74
|
+
items = []
|
75
|
+
while item := q.pop() and len(items) < self.removing_size:
|
76
|
+
items.append(item)
|
77
|
+
self._scheduler.delete(key, *items)
|
78
|
+
self.remove_working_items(key.split(":")[-1], items)
|
79
|
+
if q.length >= size:
|
80
|
+
thread_sleep = 0.1
|
81
|
+
time.sleep(thread_sleep)
|
82
|
+
|
83
|
+
def _init_schedule_thread(self):
|
84
|
+
self._add_thread(func=self._heartbeat)
|
85
|
+
self._add_thread(func=self._reset)
|
86
|
+
self._add_thread(func=self._refresh)
|
87
|
+
self._add_thread(func=self._schedule)
|
88
|
+
self._add_thread(func=self._insert)
|
89
|
+
self._add_thread(func=self._remove)
|
90
|
+
self._add_thread(func=self._polling)
|
cobweb/pipelines/base_pipeline.py
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
import time
|
2
|
+
import threading
|
3
|
+
|
4
|
+
from abc import ABC, abstractmethod
|
5
|
+
from cobweb.base import BaseItem, Queue, logger
|
6
|
+
|
7
|
+
|
8
|
+
class Pipeline(threading.Thread, ABC):
|
9
|
+
|
10
|
+
def __init__(
|
11
|
+
self,
|
12
|
+
done_queue: Queue,
|
13
|
+
upload_queue: Queue,
|
14
|
+
upload_queue_size: int,
|
15
|
+
upload_wait_seconds: int
|
16
|
+
):
|
17
|
+
super().__init__()
|
18
|
+
self.done_queue = done_queue
|
19
|
+
self.upload_queue = upload_queue
|
20
|
+
self.upload_queue_size = upload_queue_size
|
21
|
+
self.upload_wait_seconds = upload_wait_seconds
|
22
|
+
|
23
|
+
@abstractmethod
|
24
|
+
def build(self, item: BaseItem) -> dict:
|
25
|
+
pass
|
26
|
+
|
27
|
+
@abstractmethod
|
28
|
+
def upload(self, table: str, data: list) -> bool:
|
29
|
+
pass
|
30
|
+
|
31
|
+
def run(self):
|
32
|
+
while True:
|
33
|
+
status = self.upload_queue.length < self.upload_queue_size
|
34
|
+
if status:
|
35
|
+
time.sleep(self.upload_wait_seconds)
|
36
|
+
data_info, seeds = {}, []
|
37
|
+
for _ in range(self.upload_queue_size):
|
38
|
+
item = self.upload_queue.pop()
|
39
|
+
if not item:
|
40
|
+
break
|
41
|
+
data = self.build(item)
|
42
|
+
seeds.append(item.seed)
|
43
|
+
data_info.setdefault(item.table, []).append(data)
|
44
|
+
for table, datas in data_info.items():
|
45
|
+
try:
|
46
|
+
self.upload(table, datas)
|
47
|
+
status = True
|
48
|
+
except Exception as e:
|
49
|
+
logger.info(e)
|
50
|
+
status = False
|
51
|
+
if status:
|
52
|
+
self.done_queue.push(seeds)
|
53
|
+
|
54
|
+
|
cobweb/pipelines/loghub_pipeline.py
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
import json
|
2
|
+
|
3
|
+
from cobweb import setting
|
4
|
+
from cobweb.base import BaseItem
|
5
|
+
from cobweb.pipelines import Pipeline
|
6
|
+
from aliyun.log import LogClient, LogItem, PutLogsRequest
|
7
|
+
|
8
|
+
|
9
|
+
class LoghubPipeline(Pipeline):
|
10
|
+
|
11
|
+
def __init__(self, *args, **kwargs):
|
12
|
+
super().__init__(*args, **kwargs)
|
13
|
+
self.client = LogClient(**setting.LOGHUB_CONFIG)
|
14
|
+
|
15
|
+
def build(self, item: BaseItem):
|
16
|
+
log_item = LogItem()
|
17
|
+
temp = item.to_dict
|
18
|
+
for key, value in temp.items():
|
19
|
+
if not isinstance(value, str):
|
20
|
+
temp[key] = json.dumps(value, ensure_ascii=False)
|
21
|
+
contents = sorted(temp.items())
|
22
|
+
log_item.set_contents(contents)
|
23
|
+
return log_item
|
24
|
+
|
25
|
+
def upload(self, table, datas):
|
26
|
+
request = PutLogsRequest(
|
27
|
+
project=setting.LOGHUB_PROJECT,
|
28
|
+
logstore=table,
|
29
|
+
topic=setting.LOGHUB_TOPIC,
|
30
|
+
source=setting.LOGHUB_SOURCE,
|
31
|
+
logitems=datas,
|
32
|
+
compress=True
|
33
|
+
)
|
34
|
+
self.client.put_logs(request=request)
|