cobweb-launcher 1.3.6-py3-none-any.whl → 1.3.7-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {cobweb_launcher-1.3.6.dist-info → cobweb_launcher-1.3.7.dist-info}/METADATA +1 -1
- cobweb_launcher-1.3.7.dist-info/RECORD +40 -0
- cobweb/base/decorators.py +0 -40
- cobweb/crawlers/base_crawler.py +0 -144
- cobweb/crawlers/file_crawler.py +0 -98
- cobweb/pipelines/base_pipeline.py +0 -54
- cobweb/pipelines/loghub_pipeline.py +0 -34
- cobweb/utils/dotting.py +0 -32
- cobweb_/__init__.py +0 -2
- cobweb_/base/__init__.py +0 -9
- cobweb_/base/common_queue.py +0 -30
- cobweb_/base/decorators.py +0 -40
- cobweb_/base/item.py +0 -46
- cobweb_/base/log.py +0 -94
- cobweb_/base/request.py +0 -82
- cobweb_/base/response.py +0 -23
- cobweb_/base/seed.py +0 -114
- cobweb_/constant.py +0 -94
- cobweb_/crawlers/__init__.py +0 -1
- cobweb_/crawlers/crawler.py +0 -184
- cobweb_/db/__init__.py +0 -2
- cobweb_/db/api_db.py +0 -82
- cobweb_/db/redis_db.py +0 -130
- cobweb_/exceptions/__init__.py +0 -1
- cobweb_/exceptions/oss_db_exception.py +0 -28
- cobweb_/launchers/__init__.py +0 -3
- cobweb_/launchers/launcher.py +0 -235
- cobweb_/launchers/launcher_air.py +0 -88
- cobweb_/launchers/launcher_api.py +0 -221
- cobweb_/launchers/launcher_pro.py +0 -222
- cobweb_/pipelines/__init__.py +0 -3
- cobweb_/pipelines/pipeline.py +0 -69
- cobweb_/pipelines/pipeline_console.py +0 -22
- cobweb_/pipelines/pipeline_loghub.py +0 -34
- cobweb_/setting.py +0 -74
- cobweb_/utils/__init__.py +0 -5
- cobweb_/utils/bloom.py +0 -58
- cobweb_/utils/dotting.py +0 -32
- cobweb_/utils/oss.py +0 -94
- cobweb_/utils/tools.py +0 -42
- cobweb_launcher-1.3.6.dist-info/RECORD +0 -111
- cobweb_new/__init__.py +0 -2
- cobweb_new/base/__init__.py +0 -72
- cobweb_new/base/common_queue.py +0 -53
- cobweb_new/base/decorators.py +0 -72
- cobweb_new/base/item.py +0 -46
- cobweb_new/base/log.py +0 -94
- cobweb_new/base/request.py +0 -82
- cobweb_new/base/response.py +0 -23
- cobweb_new/base/seed.py +0 -118
- cobweb_new/constant.py +0 -105
- cobweb_new/crawlers/__init__.py +0 -1
- cobweb_new/crawlers/crawler-new.py +0 -85
- cobweb_new/crawlers/crawler.py +0 -170
- cobweb_new/db/__init__.py +0 -2
- cobweb_new/db/api_db.py +0 -82
- cobweb_new/db/redis_db.py +0 -158
- cobweb_new/exceptions/__init__.py +0 -1
- cobweb_new/exceptions/oss_db_exception.py +0 -28
- cobweb_new/launchers/__init__.py +0 -3
- cobweb_new/launchers/launcher.py +0 -237
- cobweb_new/launchers/launcher_air.py +0 -88
- cobweb_new/launchers/launcher_api.py +0 -161
- cobweb_new/launchers/launcher_pro.py +0 -96
- cobweb_new/launchers/tesss.py +0 -47
- cobweb_new/pipelines/__init__.py +0 -3
- cobweb_new/pipelines/pipeline.py +0 -68
- cobweb_new/pipelines/pipeline_console.py +0 -22
- cobweb_new/pipelines/pipeline_loghub.py +0 -34
- cobweb_new/setting.py +0 -95
- cobweb_new/utils/__init__.py +0 -5
- cobweb_new/utils/bloom.py +0 -58
- cobweb_new/utils/oss.py +0 -94
- cobweb_new/utils/tools.py +0 -42
- {cobweb_launcher-1.3.6.dist-info → cobweb_launcher-1.3.7.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.3.6.dist-info → cobweb_launcher-1.3.7.dist-info}/WHEEL +0 -0
- {cobweb_launcher-1.3.6.dist-info → cobweb_launcher-1.3.7.dist-info}/top_level.txt +0 -0
cobweb_/launchers/launcher_api.py
DELETED
@@ -1,221 +0,0 @@
-import time
-import threading
-
-from cobweb.db import ApiDB
-from cobweb.base import Seed, logger
-from cobweb.constant import DealModel, LogTemplate
-from .launcher import Launcher, check_pause
-
-
-class LauncherApi(Launcher):
-
-    def __init__(self, task, project, custom_setting=None, **kwargs):
-        super().__init__(task, project, custom_setting, **kwargs)
-        self._db = ApiDB()
-
-        self._todo_key = "{%s:%s}:todo" % (project, task)
-        self._done_key = "{%s:%s}:done" % (project, task)
-        self._fail_key = "{%s:%s}:fail" % (project, task)
-        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
-
-        self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
-        self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
-        self._speed_control_key = "speed_control:%s_%s" % (project, task)
-
-        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
-
-        # self._bf_key = "bloom_%s_%s" % (project, task)
-        # self._bf = BloomFilter(self._bf_key)
-
-        self._heartbeat_start_event = threading.Event()
-        self._redis_queue_empty_event = threading.Event()
-
-    @property
-    def heartbeat(self):
-        return self._db.exists(self._heartbeat_key)
-
-    def statistics(self, key, count):
-        if not self._task_model and not self._db.exists(key):
-            self._db.setex(key, 86400 * 30, int(count))
-        else:
-            self._db.incrby(key, count)
-
-    def _get_seed(self) -> Seed:
-        """
-        Fetch a seed from the queue (rate-controlled).
-        Within a fixed window of self._time_window seconds, check whether the crawl count has reached the threshold (self._spider_max_speed).
-        :return: True -> Seed, False -> None
-        """
-        if (self.__LAUNCHER_QUEUE__["todo"].length and
-                not self._db.auto_incr(self._speed_control_key, t=self._time_window, limit=self._spider_max_count)):
-            expire_time = self._db.ttl(self._speed_control_key)
-            logger.info(f"Too fast! Please wait {expire_time} seconds...")
-            time.sleep(expire_time / 2)
-            return None
-        seed = self.__LAUNCHER_QUEUE__["todo"].pop()
-        return seed
-
-    @check_pause
-    def _execute_heartbeat(self):
-        if self._heartbeat_start_event.is_set():
-            self._db.setex(self._heartbeat_key, 5)
-        time.sleep(3)
-
-    @check_pause
-    def _reset(self):
-        """
-        Check for expired seeds and re-add them to the redis cache.
-        """
-        reset_wait_seconds = 30
-        if self._db.lock(self._reset_lock_key, t=120):
-
-            _min = -int(time.time()) + self._seed_reset_seconds \
-                if self.heartbeat else "-inf"
-
-            self._db.members(self._todo_key, 0, _min=_min, _max="(0")
-
-            if not self.heartbeat:
-                self._heartbeat_start_event.set()
-
-            self._db.delete(self._reset_lock_key)
-
-        time.sleep(reset_wait_seconds)
-
-    @check_pause
-    def _scheduler(self):
-        """
-        Schedule: fetch seeds from the redis queue and track them in the doing dict.
-        """
-        if not self._db.zcount(self._todo_key, 0, "(1000"):
-            time.sleep(self._scheduler_wait_seconds)
-        elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
-            time.sleep(self._todo_queue_full_wait_seconds)
-        else:
-            members = self._db.members(
-                self._todo_key, int(time.time()),
-                count=self._todo_queue_size,
-                _min=0, _max="(1000"
-            )
-            for member, priority in members:
-                seed = Seed(member, priority=priority)
-                self.__LAUNCHER_QUEUE__['todo'].push(seed)
-                self.__DOING__[seed.to_string] = seed.params.priority
-
-    @check_pause
-    def _insert(self):
-        """
-        Add new seeds to the redis queue.
-        """
-        seeds = {}
-        status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
-        for _ in range(self._new_queue_max_size):
-            seed = self.__LAUNCHER_QUEUE__['new'].pop()
-            if seed:
-                seeds[seed.to_string] = seed.params.priority
-        if seeds:
-            self._db.zadd(self._todo_key, seeds, nx=True)
-        if status:
-            time.sleep(self._new_queue_wait_seconds)
-
-    @check_pause
-    def _refresh(self):
-        """
-        Refresh the expiry of doing seeds so that reset does not re-consume them.
-        """
-        if self.__DOING__:
-            refresh_time = int(time.time())
-            seeds = {k: -refresh_time - v / 1000 for k, v in self.__DOING__.items()}
-            self._db.zadd(self._todo_key, item=seeds, xx=True)
-        time.sleep(15)
-
-    @check_pause
-    def _delete(self):
-        """
-        Remove seeds from the queue, route them to the done or fail queue by status, and drop their doing-dict entries.
-        """
-        seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
-        status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
-
-        for _ in range(self._done_queue_max_size):
-            seed = self.__LAUNCHER_QUEUE__['done'].pop()
-            if not seed:
-                break
-            if seed.params.seed_status == DealModel.fail:
-                seed_info["failed"].append(seed.to_string)
-            elif self._done_model == 1:
-                seed_info["succeed"].append(seed.to_string)
-            else:
-                seed_info["common"].append(seed.to_string)
-            seed_info['count'] += 1
-
-        if seed_info["count"]:
-
-            succeed_count = int(self._db.zrem(self._todo_key, *seed_info["common"]) or 0)
-            succeed_count += int(self._db.done([self._todo_key, self._done_key], *seed_info["succeed"]) or 0)
-            failed_count = int(self._db.done([self._todo_key, self._fail_key], *seed_info["failed"]) or 0)
-
-            if failed_count:
-                self.statistics(self._statistics_fail_key, failed_count)
-            if succeed_count:
-                self.statistics(self._statistics_done_key, succeed_count)
-
-            self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])
-
-        if status:
-            time.sleep(self._done_queue_wait_seconds)
-
-    def _polling(self):
-        wait_scheduler_execute = True
-        check_emtpy_times = 0
-        while not self._stop.is_set():
-            queue_not_empty_count = 0
-            pooling_wait_seconds = 30
-
-            for q in self.__LAUNCHER_QUEUE__.values():
-                if q.length != 0:
-                    queue_not_empty_count += 1
-                    wait_scheduler_execute = False
-
-            if queue_not_empty_count == 0:
-                pooling_wait_seconds = 3
-                if self._pause.is_set():
-                    check_emtpy_times = 0
-                    if not self._task_model and (
-                        not wait_scheduler_execute or
-                        int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
-                    ):
-                        logger.info("Done! ready to close thread...")
-                        self._stop.set()
-
-                    elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
-                        logger.info(f"Recovery {self.task} task run!")
-                        self._pause.clear()
-                        self._execute()
-                    else:
-                        logger.info("pause! waiting for resume...")
-                elif check_emtpy_times > 2:
-                    self.__DOING__ = {}
-                    if not self._db.zcount(self._todo_key, _min="-inf", _max="(1000"):
-                        self._pause.set()
-                else:
-                    logger.info(
-                        "check whether the task is complete, "
-                        f"reset times {3 - check_emtpy_times}"
-                    )
-                    check_emtpy_times += 1
-            else:
-                logger.info(LogTemplate.launcher_pro_polling.format(
-                    task=self.task,
-                    doing_len=len(self.__DOING__.keys()),
-                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
-                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
-                    redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
-                    redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
-                    redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
-                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
-                ))
-
-            time.sleep(pooling_wait_seconds)
-
-        logger.info("Done! Ready to close thread...")
-
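Note on the rate control above: `ApiDB.auto_incr` is not shown in this diff, but `_get_seed` uses it as a fixed-window rate limiter: pops are allowed until the window's counter reaches `self._spider_max_count`, after which the caller sleeps for part of the key's remaining TTL. A minimal sketch of that pattern with redis-py follows; the helper name, key, and limits are illustrative assumptions, not cobweb's actual implementation.

import redis

client = redis.Redis()  # assumed local instance, for illustration only

def auto_incr(key: str, t: int, limit: int) -> bool:
    """Return True while this window's counter is still under `limit`."""
    count = client.incr(key)   # the first INCR creates the key at 1
    if count == 1:
        client.expire(key, t)  # pin the window length to t seconds
    return count <= limit

# Usage mirroring _get_seed: at most 1000 pops per 60-second window.
if auto_incr("speed_control:project_task", t=60, limit=1000):
    pass  # safe to pop a seed from the todo queue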
cobweb_/launchers/launcher_pro.py
DELETED
@@ -1,222 +0,0 @@
-import time
-import threading
-
-from cobweb.db import RedisDB
-from cobweb.base import Seed, logger
-from cobweb.utils import BloomFilter
-from cobweb.constant import DealModel, LogTemplate
-from .launcher import Launcher, check_pause
-
-
-class LauncherPro(Launcher):
-
-    def __init__(self, task, project, custom_setting=None, **kwargs):
-        super().__init__(task, project, custom_setting, **kwargs)
-        self._todo_key = "{%s:%s}:todo" % (project, task)
-        self._done_key = "{%s:%s}:done" % (project, task)
-        self._fail_key = "{%s:%s}:fail" % (project, task)
-        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
-
-        self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
-        self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
-        self._speed_control_key = "speed_control:%s_%s" % (project, task)
-
-        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
-
-        self._bf_key = "bloom_%s_%s" % (project, task)
-
-        self._db = RedisDB()
-
-        self._bf = BloomFilter(self._bf_key)
-
-        self._heartbeat_start_event = threading.Event()
-        self._redis_queue_empty_event = threading.Event()
-
-    @property
-    def heartbeat(self):
-        return self._db.exists(self._heartbeat_key)
-
-    def statistics(self, key, count):
-        if not self._task_model and not self._db.exists(key):
-            self._db.setex(key, 86400 * 30, int(count))
-        else:
-            self._db._client.incrby(key, count)
-
-    def _get_seed(self) -> Seed:
-        spider_speed = self._db._client.get(self._speed_control_key)
-        if int(spider_speed or 0) > self._spider_max_count:
-            expire_time = self._db.ttl(self._speed_control_key)
-            if expire_time == -1:
-                self._db.delete(self._speed_control_key)
-            else:
-                logger.info(f"Too fast! Please wait {expire_time} seconds...")
-                time.sleep(expire_time / 2)
-            return None
-        seed = self.__LAUNCHER_QUEUE__["todo"].pop()
-        if seed and not self._db.lock(self._speed_control_key, t=self._time_window):
-            self._db._client.incrby(self._speed_control_key, 1)
-        return seed
-
-    @check_pause
-    def _execute_heartbeat(self):
-        if self._heartbeat_start_event.is_set():
-            self._db.setex(self._heartbeat_key, 5)
-        time.sleep(3)
-
-    @check_pause
-    def _reset(self):
-        """
-        Check for expired seeds and re-add them to the redis cache.
-        """
-        reset_wait_seconds = 30
-        if self._db.lock(self._reset_lock_key, t=120):
-
-            _min = -int(time.time()) + self._seed_reset_seconds \
-                if self.heartbeat else "-inf"
-
-            self._db.members(self._todo_key, 0, _min=_min, _max="(0")
-            self._db.delete(self._reset_lock_key)
-
-            if not self.heartbeat:
-                self._heartbeat_start_event.set()
-
-        time.sleep(reset_wait_seconds)
-
-    @check_pause
-    def _scheduler(self):
-        """
-        Schedule: fetch seeds from the redis queue and track them in the doing dict.
-        """
-        if not self._db.zcount(self._todo_key, 0, "(1000"):
-            time.sleep(self._scheduler_wait_seconds)
-        elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
-            time.sleep(self._todo_queue_full_wait_seconds)
-        else:
-            members = self._db.members(
-                self._todo_key, int(time.time()),
-                count=self._todo_queue_size,
-                _min=0, _max="(1000"
-            )
-            for member, priority in members:
-                seed = Seed(member, priority=priority)
-                self.__LAUNCHER_QUEUE__['todo'].push(seed)
-                self.__DOING__[seed.to_string] = seed.params.priority
-
-    @check_pause
-    def _insert(self):
-        """
-        Add new seeds to the redis queue.
-        """
-        seeds = {}
-        status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
-        for _ in range(self._new_queue_max_size):
-            seed = self.__LAUNCHER_QUEUE__['new'].pop()
-            if seed:
-                seeds[seed.to_string] = seed.params.priority
-        if seeds:
-            self._db.zadd(self._todo_key, seeds, nx=True)
-        if status:
-            time.sleep(self._new_queue_wait_seconds)
-
-    @check_pause
-    def _refresh(self):
-        """
-        Refresh the expiry of doing seeds so that reset does not re-consume them.
-        """
-        if self.__DOING__:
-            refresh_time = int(time.time())
-            seeds = {k: -refresh_time - v / 1000 for k, v in self.__DOING__.items()}
-            self._db.zadd(self._todo_key, item=seeds, xx=True)
-        time.sleep(15)
-
-    @check_pause
-    def _delete(self):
-        """
-        Remove seeds from the queue, route them to the done or fail queue by status, and drop their doing-dict entries.
-        """
-        seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
-        status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
-
-        for _ in range(self._done_queue_max_size):
-            seed = self.__LAUNCHER_QUEUE__['done'].pop()
-            if not seed:
-                break
-            if seed.params.seed_status == DealModel.fail:
-                seed_info["failed"].append(seed.to_string)
-            elif self._done_model == 1:
-                seed_info["succeed"].append(seed.to_string)
-            else:
-                seed_info["common"].append(seed.to_string)
-            seed_info['count'] += 1
-
-        if seed_info["count"]:
-
-            succeed_count = int(self._db.zrem(self._todo_key, *seed_info["common"]) or 0)
-            succeed_count += int(self._db.done([self._todo_key, self._done_key], *seed_info["succeed"]) or 0)
-            failed_count = int(self._db.done([self._todo_key, self._fail_key], *seed_info["failed"]) or 0)
-
-            if failed_count:
-                self.statistics(self._statistics_fail_key, failed_count)
-            if succeed_count:
-                self.statistics(self._statistics_done_key, succeed_count)
-
-            self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])
-
-        if status:
-            time.sleep(self._done_queue_wait_seconds)
-
-    def _polling(self):
-        wait_scheduler_execute = True
-        check_emtpy_times = 0
-        while not self._stop.is_set():
-            queue_not_empty_count = 0
-            pooling_wait_seconds = 30
-
-            for q in self.__LAUNCHER_QUEUE__.values():
-                if q.length != 0:
-                    queue_not_empty_count += 1
-                    wait_scheduler_execute = False
-
-            if queue_not_empty_count == 0:
-                pooling_wait_seconds = 3
-                if self._pause.is_set():
-                    check_emtpy_times = 0
-                    if not self._task_model and (
-                        not wait_scheduler_execute or
-                        int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
-                    ):
-                        logger.info("Done! ready to close thread...")
-                        self._stop.set()
-
-                    elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
-                        logger.info(f"Recovery {self.task} task run!")
-                        self._pause.clear()
-                        self._execute()
-                    else:
-                        logger.info("pause! waiting for resume...")
-                elif check_emtpy_times > 2:
-                    self.__DOING__ = {}
-                    if not self._db.zcount(self._todo_key, _min="-inf", _max="(1000"):
-                        self._pause.set()
-                else:
-                    logger.info(
-                        "check whether the task is complete, "
-                        f"reset times {3 - check_emtpy_times}"
-                    )
-                    check_emtpy_times += 1
-            else:
-                logger.info(LogTemplate.launcher_pro_polling.format(
-                    task=self.task,
-                    doing_len=len(self.__DOING__.keys()),
-                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
-                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
-                    redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
-                    redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
-                    redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
-                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
-                ))
-
-            time.sleep(pooling_wait_seconds)
-
-        logger.info("Done! Ready to close thread...")
-
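Both launchers share one sorted-set convention for `{project:task}:todo`, inferred from `_scheduler`, `_refresh`, and `_reset` above (it is not documented in this diff): pending seeds carry their priority as a score in [0, 1000), while in-flight seeds are flipped to a negative score that encodes their last refresh time, with the priority folded into the fractional part. `_reset` then re-queues any member whose negative score is older than the reset window. A small illustrative sketch:

import time

SEED_RESET_SECONDS = 30  # default from setting.py

def todo_score(priority):
    # pending: a score in [0, 1000) is just the seed's priority
    return float(priority)

def doing_score(priority, now=None):
    # in-flight: negative score encoding the last refresh time,
    # priority preserved in the fractional part (priority / 1000)
    now = now or int(time.time())
    return -now - priority / 1000

def is_expired(score, now=None):
    # _reset re-queues members scored in [-(now - SEED_RESET_SECONDS), 0):
    # in-flight seeds not refreshed within the last SEED_RESET_SECONDS
    now = now or int(time.time())
    return -(now - SEED_RESET_SECONDS) <= score < 0

# e.g. a seed claimed 60s ago reads as expired and goes back to todo:
assert is_expired(doing_score(priority=5, now=int(time.time()) - 60))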
cobweb_/pipelines/__init__.py
DELETED
cobweb_/pipelines/pipeline.py
DELETED
@@ -1,69 +0,0 @@
-import time
-import threading
-
-from abc import ABC, abstractmethod
-from cobweb.base import BaseItem, Queue, logger
-
-
-class Pipeline(threading.Thread, ABC):
-
-    def __init__(
-        self,
-        stop: threading.Event,
-        pause: threading.Event,
-        upload: Queue, done: Queue,
-        upload_size: int,
-        wait_seconds: int
-    ):
-        super().__init__()
-        self._stop = stop
-        self._pause = pause
-        self._upload = upload
-        self._done = done
-
-        self.upload_size = upload_size
-        self.wait_seconds = wait_seconds
-
-    @abstractmethod
-    def build(self, item: BaseItem) -> dict:
-        pass
-
-    @abstractmethod
-    def upload(self, table: str, data: list) -> bool:
-        pass
-
-    def run(self):
-        while not self._stop.is_set():
-            if not self._upload.length:
-                time.sleep(self.wait_seconds)
-                continue
-            if self._upload.length < self.upload_size:
-                time.sleep(self.wait_seconds)
-            status = True
-            data_info, seeds = {}, []
-            try:
-                for _ in range(self.upload_size):
-                    item = self._upload.pop()
-                    if not item:
-                        break
-                    seeds.append(item.seed)
-                    data = self.build(item)
-                    data_info.setdefault(item.table, []).append(data)
-                for table, datas in data_info.items():
-                    try:
-                        self.upload(table, datas)
-                    except Exception as e:
-                        logger.info(e)
-                        status = False
-            except Exception as e:
-                logger.info(e)
-                status = False
-            if not status:
-                for seed in seeds:
-                    seed.params.seed_status = "deal model: fail"
-            if seeds:
-                self._done.push(seeds)
-
-        logger.info("upload pipeline close!")
-
-
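For context, the thread wiring this abstract class expects looks roughly like the sketch below. It is an assumption pieced together from the `__init__` signature above and the Console subclass that follows; the diff does not show how the Launcher actually constructs its pipelines, and the import path is assumed.

import threading
from cobweb.base import Queue
from cobweb.pipelines.pipeline_console import Console  # assumed import path

stop, pause = threading.Event(), threading.Event()
upload_q, done_q = Queue(), Queue()

pipeline = Console(
    stop, pause,
    upload_q, done_q,
    upload_size=100,   # flush in batches of up to 100 items (cf. UPLOAD_QUEUE_MAX_SIZE)
    wait_seconds=15,   # idle sleep when the queue is short (cf. UPLOAD_QUEUE_WAIT_SECONDS)
)
pipeline.start()       # threading.Thread entry point runs Pipeline.run
# ... crawl ...
stop.set()             # run() exits its while-loop and logs "upload pipeline close!"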
cobweb_/pipelines/pipeline_console.py
DELETED
@@ -1,22 +0,0 @@
-from cobweb.base import ConsoleItem, logger
-from cobweb.constant import LogTemplate
-from cobweb.pipelines import Pipeline
-
-
-class Console(Pipeline):
-
-    def build(self, item: ConsoleItem):
-        return {
-            "seed": item.seed.to_dict,
-            "data": item.to_dict
-        }
-
-    def upload(self, table, datas):
-        for data in datas:
-            parse_detail = LogTemplate.log_info(data["data"])
-            if len(parse_detail) > 500:
-                parse_detail = parse_detail[:500] + " ...\n" + " " * 12 + "-- Text is too long and details are omitted!"
-            logger.info(LogTemplate.console_item.format(
-                seed_detail=LogTemplate.log_info(data["seed"]),
-                parse_detail=parse_detail
-            ))
cobweb_/pipelines/pipeline_loghub.py
DELETED
@@ -1,34 +0,0 @@
-import json
-
-from cobweb import setting
-from cobweb.base import BaseItem
-from cobweb.pipelines import Pipeline
-from aliyun.log import LogClient, LogItem, PutLogsRequest
-
-
-class Loghub(Pipeline):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.client = LogClient(**setting.LOGHUB_CONFIG)
-
-    def build(self, item: BaseItem):
-        log_item = LogItem()
-        temp = item.to_dict
-        for key, value in temp.items():
-            if not isinstance(value, str):
-                temp[key] = json.dumps(value, ensure_ascii=False)
-        contents = sorted(temp.items())
-        log_item.set_contents(contents)
-        return log_item
-
-    def upload(self, table, datas):
-        request = PutLogsRequest(
-            project=setting.LOGHUB_PROJECT,
-            logstore=table,
-            topic=setting.LOGHUB_TOPIC,
-            source=setting.LOGHUB_SOURCE,
-            logitems=datas,
-            compress=True
-        )
-        self.client.put_logs(request=request)
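Aliyun Log Service stores each log entry as string key/value pairs, which is why `build` JSON-encodes every non-string field before calling `LogItem.set_contents`. A small standalone illustration of that transform (the sample dict is made up):

import json

temp = {"url": "https://example.com", "status": 200, "tags": ["a", "b"]}
contents = sorted(
    (k, v if isinstance(v, str) else json.dumps(v, ensure_ascii=False))
    for k, v in temp.items()
)
# contents == [('status', '200'), ('tags', '["a", "b"]'), ('url', 'https://example.com')]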
cobweb_/setting.py
DELETED
@@ -1,74 +0,0 @@
-import os
-
-# redis db config
-REDIS_CONFIG = {
-    "host": os.getenv("REDIS_HOST"),
-    "password": os.getenv("REDIS_PASSWORD"),
-    "port": int(os.getenv("REDIS_PORT", 6379)),
-    "db": int(os.getenv("REDIS_DB", 0)),
-}
-
-# loghub db config
-LOGHUB_TOPIC = os.getenv("LOGHUB_TOPIC")
-LOGHUB_SOURCE = os.getenv("LOGHUB_SOURCE")
-LOGHUB_PROJECT = os.getenv("LOGHUB_PROJECT")
-LOGHUB_CONFIG = {
-    "endpoint": os.getenv("LOGHUB_ENDPOINT"),
-    "accessKeyId": os.getenv("LOGHUB_ACCESS_KEY"),
-    "accessKey": os.getenv("LOGHUB_SECRET_KEY")
-}
-
-# oss util config
-OSS_BUCKET = os.getenv("OSS_BUCKET")
-OSS_ENDPOINT = os.getenv("OSS_ENDPOINT")
-OSS_ACCESS_KEY = os.getenv("OSS_ACCESS_KEY")
-OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
-OSS_CHUNK_SIZE = 10 * 1024 ** 2
-OSS_MIN_UPLOAD_SIZE = 1024
-
-
-# crawler selection
-CRAWLER = "cobweb.crawlers.Crawler"
-
-# data storage pipeline
-PIPELINE = "cobweb.pipelines.pipeline_console.Console"
-
-
-# Launcher wait times
-
-BEFORE_SCHEDULER_WAIT_SECONDS = 60  # wait before scheduling; only applies to one-off tasks
-SCHEDULER_WAIT_SECONDS = 15  # scheduler wait time
-TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait when the todo queue is full
-NEW_QUEUE_WAIT_SECONDS = 30  # new queue wait time
-DONE_QUEUE_WAIT_SECONDS = 5  # done queue wait time
-UPLOAD_QUEUE_WAIT_SECONDS = 15  # upload queue wait time
-SEED_RESET_SECONDS = 30  # seed reset interval
-
-
-# Launcher queue sizes
-TODO_QUEUE_SIZE = 100  # todo queue length
-NEW_QUEUE_MAX_SIZE = 100  # new queue length
-DONE_QUEUE_MAX_SIZE = 100  # done queue length
-UPLOAD_QUEUE_MAX_SIZE = 100  # upload queue length
-
-# DONE_MODEL IN (0, 1), seed completion mode
-DONE_MODEL = 0  # 0: successful seeds are removed from the queue directly, failures go to the fail queue; 1: successes go to the done queue, failures to the fail queue
-
-# spider
-SPIDER_THREAD_NUM = 10
-SPIDER_MAX_RETRIES = 5
-SPIDER_TIME_SLEEP = 10
-
-SPIDER_MAX_COUNT = 1000  # max crawl count within the time window
-TIME_WINDOW = 60  # fixed rate-control time window (seconds)
-
-# task mode
-TASK_MODEL = 0  # 0: one-off, 1: resident (long-running)
-
-
-# bloom filter
-CAPACITY = 100000000
-ERROR_RATE = 0.001
-FILTER_FIELD = "url"
-# file download response content-type filter
-# FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]