cobweb-launcher 1.3.2__py3-none-any.whl → 1.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/__init__.py +1 -1
- cobweb/base/__init__.py +9 -3
- cobweb/base/basic.py +78 -26
- cobweb/crawlers/crawler.py +4 -4
- cobweb/launchers/launcher.py +8 -6
- cobweb/launchers/launcher_air.py +88 -88
- cobweb/launchers/launcher_api.py +75 -148
- cobweb/launchers/launcher_pro.py +7 -9
- cobweb/pipelines/pipeline.py +2 -1
- cobweb/schedulers/__init__.py +2 -0
- cobweb/schedulers/scheduler_api.py +69 -0
- {cobweb_launcher-1.3.2.dist-info → cobweb_launcher-1.3.4.dist-info}/METADATA +1 -1
- {cobweb_launcher-1.3.2.dist-info → cobweb_launcher-1.3.4.dist-info}/RECORD +16 -15
- {cobweb_launcher-1.3.2.dist-info → cobweb_launcher-1.3.4.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.3.2.dist-info → cobweb_launcher-1.3.4.dist-info}/WHEEL +0 -0
- {cobweb_launcher-1.3.2.dist-info → cobweb_launcher-1.3.4.dist-info}/top_level.txt +0 -0
cobweb/__init__.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
from .launchers import
|
1
|
+
from .launchers import LauncherPro, LauncherApi
|
2
2
|
from .constant import CrawlerModel
|
cobweb/base/__init__.py
CHANGED
@@ -66,11 +66,17 @@ class TaskQueue:
|
|
66
66
|
except Exception as e:
|
67
67
|
it.params.retry += 1
|
68
68
|
if isinstance(it, Request):
|
69
|
-
TaskQueue.
|
69
|
+
TaskQueue.DOWNLOAD.push(it)
|
70
70
|
elif isinstance(it, Response):
|
71
71
|
TaskQueue.RESPONSE.push(it)
|
72
72
|
elif isinstance(it, Seed):
|
73
|
-
TaskQueue.
|
73
|
+
TaskQueue.TODO.push(it)
|
74
|
+
elif isinstance(it, BaseItem):
|
75
|
+
TaskQueue.UPLOAD.push(it)
|
76
|
+
logger.info(
|
77
|
+
f"{crawler_func.__name__} failed: "
|
78
|
+
f"{''.join(traceback.format_exception(type(e), e, e.__traceback__))}"
|
79
|
+
)
|
74
80
|
time.sleep(1)
|
75
81
|
|
76
82
|
|
@@ -95,7 +101,7 @@ class Decorators:
|
|
95
101
|
def wrapper(self, *args, **kwargs):
|
96
102
|
while not self.pause.is_set():
|
97
103
|
try:
|
98
|
-
func(self
|
104
|
+
func(self)
|
99
105
|
except Exception as e:
|
100
106
|
logger.info(f"{func.__name__}: " + str(e))
|
101
107
|
finally:
|
cobweb/base/basic.py
CHANGED
@@ -15,11 +15,21 @@ class Params:
|
|
15
15
|
|
16
16
|
|
17
17
|
class Seed:
|
18
|
+
__SEED_PARAMS__ = [
|
19
|
+
"retry",
|
20
|
+
"priority",
|
21
|
+
"version",
|
22
|
+
"status"
|
23
|
+
]
|
18
24
|
|
19
25
|
def __init__(
|
20
26
|
self,
|
21
27
|
seed,
|
22
|
-
|
28
|
+
sid=None,
|
29
|
+
retry=None,
|
30
|
+
priority=None,
|
31
|
+
version=None,
|
32
|
+
status=None,
|
23
33
|
**kwargs
|
24
34
|
):
|
25
35
|
if any(isinstance(seed, t) for t in (str, bytes)):
|
@@ -37,11 +47,27 @@ class Seed:
|
|
37
47
|
f"seed: {seed}"
|
38
48
|
))
|
39
49
|
|
50
|
+
seed_params = {
|
51
|
+
"retry": retry,
|
52
|
+
"priority": priority,
|
53
|
+
"version": version,
|
54
|
+
"status": status,
|
55
|
+
}
|
56
|
+
|
40
57
|
if kwargs:
|
58
|
+
# for k, v in kwargs.items():
|
59
|
+
# if k in seed_params.keys():
|
60
|
+
# seed_params[k] = v
|
61
|
+
# else:
|
62
|
+
# self.__setattr__(k, v)
|
41
63
|
self._init_seed(kwargs)
|
42
|
-
|
43
|
-
|
44
|
-
|
64
|
+
seed_params.update({
|
65
|
+
k: v for k, v in kwargs.items()
|
66
|
+
if k in self.__SEED_PARAMS__
|
67
|
+
})
|
68
|
+
if sid or not getattr(self, "sid", None):
|
69
|
+
self._init_id(sid)
|
70
|
+
self.params = Params(**seed_params)
|
45
71
|
|
46
72
|
def __getattr__(self, name):
|
47
73
|
return None
|
@@ -59,13 +85,14 @@ class Seed:
|
|
59
85
|
chars = [f"{k}={v}" for k, v in self.__dict__.items()]
|
60
86
|
return f'{self.__class__.__name__}({", ".join(chars)})'
|
61
87
|
|
62
|
-
def _init_seed(self, seed_info:dict):
|
88
|
+
def _init_seed(self, seed_info: dict):
|
63
89
|
for k, v in seed_info.items():
|
64
90
|
if k not in self.__SEED_PARAMS__:
|
65
91
|
self.__setattr__(k, v)
|
66
92
|
|
67
|
-
def _init_id(self):
|
68
|
-
|
93
|
+
def _init_id(self, sid):
|
94
|
+
if not sid:
|
95
|
+
sid = hashlib.md5(self.to_string.encode()).hexdigest()
|
69
96
|
self.__setattr__("sid", sid)
|
70
97
|
|
71
98
|
@property
|
@@ -85,16 +112,15 @@ class Seed:
|
|
85
112
|
|
86
113
|
@property
|
87
114
|
def seed(self):
|
88
|
-
return self
|
115
|
+
return self.to_string
|
89
116
|
|
90
117
|
|
91
118
|
class Request:
|
92
|
-
|
93
119
|
__SEED_PARAMS__ = [
|
94
120
|
"retry",
|
95
121
|
"priority",
|
96
|
-
"
|
97
|
-
"
|
122
|
+
"version",
|
123
|
+
"status"
|
98
124
|
]
|
99
125
|
|
100
126
|
__REQUEST_ATTRS__ = {
|
@@ -122,8 +148,8 @@ class Request:
|
|
122
148
|
check_status_code=True,
|
123
149
|
retry=None,
|
124
150
|
priority=None,
|
125
|
-
|
126
|
-
|
151
|
+
version=None,
|
152
|
+
status=None,
|
127
153
|
**kwargs
|
128
154
|
):
|
129
155
|
self.url = url
|
@@ -133,10 +159,15 @@ class Request:
|
|
133
159
|
seed_params = {
|
134
160
|
"retry": retry,
|
135
161
|
"priority": priority,
|
136
|
-
"
|
137
|
-
"
|
162
|
+
"version": version,
|
163
|
+
"status": status,
|
138
164
|
}
|
139
165
|
|
166
|
+
if isinstance(seed, Seed):
|
167
|
+
kwargs.update(**seed.to_dict)
|
168
|
+
elif isinstance(seed, str):
|
169
|
+
kwargs.update(**json.loads(seed))
|
170
|
+
|
140
171
|
for k, v in kwargs.items():
|
141
172
|
if k in self.__class__.__REQUEST_ATTRS__:
|
142
173
|
self.request_setting[k] = v
|
@@ -152,12 +183,7 @@ class Request:
|
|
152
183
|
self._build_header()
|
153
184
|
|
154
185
|
self.params = Params(**seed_params)
|
155
|
-
|
156
|
-
if isinstance(seed, Seed):
|
157
|
-
kwargs.update(**seed.to_dict)
|
158
|
-
elif isinstance(seed, str):
|
159
|
-
kwargs.update(**json.loads(seed))
|
160
|
-
self.seed = self.to_string
|
186
|
+
# self.seed = self.to_string
|
161
187
|
|
162
188
|
@property
|
163
189
|
def _random_ua(self) -> str:
|
@@ -183,10 +209,19 @@ class Request:
|
|
183
209
|
response.raise_for_status()
|
184
210
|
return response
|
185
211
|
|
212
|
+
def __getattr__(self, name):
|
213
|
+
return None
|
214
|
+
|
215
|
+
def __setitem__(self, key, value):
|
216
|
+
setattr(self, key, value)
|
217
|
+
|
218
|
+
def __getitem__(self, item):
|
219
|
+
return getattr(self, item)
|
220
|
+
|
186
221
|
@property
|
187
222
|
def to_dict(self):
|
188
223
|
_dict = self.__dict__.copy()
|
189
|
-
_dict.pop('seed')
|
224
|
+
# _dict.pop('seed')
|
190
225
|
_dict.pop('params')
|
191
226
|
_dict.pop('check_status_code')
|
192
227
|
# _dict.pop('request_setting')
|
@@ -200,6 +235,10 @@ class Request:
|
|
200
235
|
separators=(",", ":")
|
201
236
|
)
|
202
237
|
|
238
|
+
@property
|
239
|
+
def seed(self):
|
240
|
+
return self.to_string
|
241
|
+
|
203
242
|
|
204
243
|
class Response:
|
205
244
|
|
@@ -209,8 +248,8 @@ class Response:
|
|
209
248
|
response,
|
210
249
|
retry=None,
|
211
250
|
priority=None,
|
212
|
-
|
213
|
-
|
251
|
+
version=None,
|
252
|
+
status=None,
|
214
253
|
**kwargs
|
215
254
|
):
|
216
255
|
self.seed = seed
|
@@ -218,20 +257,24 @@ class Response:
|
|
218
257
|
seed_params = {
|
219
258
|
"retry": retry,
|
220
259
|
"priority": priority,
|
221
|
-
"
|
222
|
-
"
|
260
|
+
"version": version,
|
261
|
+
"status": status,
|
223
262
|
}
|
224
263
|
for k, v in kwargs.items():
|
225
264
|
if k in seed_params.keys():
|
226
265
|
seed_params[k] = v
|
227
266
|
else:
|
228
267
|
self.__setattr__(k, v)
|
268
|
+
self.params = Params(**seed_params)
|
229
269
|
|
230
270
|
@property
|
231
271
|
def to_dict(self):
|
232
272
|
_dict = self.__dict__.copy()
|
233
273
|
_dict.pop('seed')
|
234
274
|
_dict.pop('response')
|
275
|
+
_dict.pop('method')
|
276
|
+
_dict.pop('params')
|
277
|
+
_dict.pop('request_setting')
|
235
278
|
return _dict
|
236
279
|
|
237
280
|
@property
|
@@ -241,3 +284,12 @@ class Response:
|
|
241
284
|
ensure_ascii=False,
|
242
285
|
separators=(",", ":")
|
243
286
|
)
|
287
|
+
|
288
|
+
def __getattr__(self, name):
|
289
|
+
return None
|
290
|
+
|
291
|
+
def __setitem__(self, key, value):
|
292
|
+
setattr(self, key, value)
|
293
|
+
|
294
|
+
def __getitem__(self, item):
|
295
|
+
return getattr(self, item)
|
cobweb/crawlers/crawler.py
CHANGED
@@ -3,7 +3,7 @@ import time
|
|
3
3
|
import threading
|
4
4
|
from typing import Union, Callable, Mapping
|
5
5
|
|
6
|
-
import setting
|
6
|
+
from cobweb import setting
|
7
7
|
from cobweb.base import (
|
8
8
|
Seed,
|
9
9
|
BaseItem,
|
@@ -14,7 +14,7 @@ from cobweb.base import (
|
|
14
14
|
TaskQueue,
|
15
15
|
logger
|
16
16
|
)
|
17
|
-
from constant import DealModel
|
17
|
+
from cobweb.constant import DealModel
|
18
18
|
|
19
19
|
|
20
20
|
class Crawler(threading.Thread):
|
@@ -66,8 +66,8 @@ class Crawler(threading.Thread):
|
|
66
66
|
def build_download_item(self):
|
67
67
|
thread_sleep = 0.1
|
68
68
|
if TaskQueue.RESPONSE.length >= self.download_queue_size:
|
69
|
-
logger.info(f"download queue is full, sleep {thread_sleep}s")
|
70
69
|
thread_sleep = 5
|
70
|
+
# logger.info(f"download queue is full, sleep {thread_sleep}s")
|
71
71
|
elif request_info := TaskQueue.DOWNLOAD.pop():
|
72
72
|
member, priority = request_info
|
73
73
|
request_setting = json.loads(member)
|
@@ -79,7 +79,7 @@ class Crawler(threading.Thread):
|
|
79
79
|
def build_parse_item(self):
|
80
80
|
thread_sleep = 0.1
|
81
81
|
if TaskQueue.UPLOAD.length >= self.upload_queue_size:
|
82
|
-
logger.info(f"upload queue is full, sleep {thread_sleep}s")
|
82
|
+
# logger.info(f"upload queue is full, sleep {thread_sleep}s")
|
83
83
|
thread_sleep = 5
|
84
84
|
if response_item := TaskQueue.RESPONSE.pop():
|
85
85
|
TaskQueue.process_task(response_item, self.parse)
|
cobweb/launchers/launcher.py
CHANGED
@@ -137,10 +137,10 @@ class Launcher(threading.Thread):
|
|
137
137
|
|
138
138
|
def _add_thread(self, func, num=1, obj=None, name=None, args=()):
|
139
139
|
obj = obj or self
|
140
|
-
name = obj.__class__.__name__ + name or func.__name__
|
140
|
+
name = obj.__class__.__name__ + ":" + (name or func.__name__)
|
141
141
|
for i in range(num):
|
142
142
|
func_name = name + "_" + str(i) if num > 1 else name
|
143
|
-
self._threads.append(threading.Thread(name=func_name, target=func, args=(
|
143
|
+
self._threads.append(threading.Thread(name=func_name, target=func, args=()))
|
144
144
|
|
145
145
|
@Decorators.stop
|
146
146
|
def _polling(self):
|
@@ -150,6 +150,10 @@ class Launcher(threading.Thread):
|
|
150
150
|
if not self.task_model and run_time > self.before_scheduler_wait_seconds:
|
151
151
|
logger.info("Done! ready to close thread...")
|
152
152
|
self.stop.set()
|
153
|
+
elif TaskQueue.TODO.length or TaskQueue.DOWNLOAD.length:
|
154
|
+
logger.info(f"Recovery {self.task} task run!")
|
155
|
+
self.check_emtpy_times = 0
|
156
|
+
self.pause.clear()
|
153
157
|
else:
|
154
158
|
logger.info("pause! waiting for resume...")
|
155
159
|
elif self.check_emtpy_times > 2:
|
@@ -164,10 +168,6 @@ class Launcher(threading.Thread):
|
|
164
168
|
f"reset times {3 - self.check_emtpy_times}"
|
165
169
|
)
|
166
170
|
self.check_emtpy_times += 1
|
167
|
-
elif TaskQueue.TODO.length:
|
168
|
-
logger.info(f"Recovery {self.task} task run!")
|
169
|
-
self.check_emtpy_times = 0
|
170
|
-
self.pause.clear()
|
171
171
|
else:
|
172
172
|
logger.info(LogTemplate.launcher_polling.format(
|
173
173
|
task=self.task,
|
@@ -179,6 +179,8 @@ class Launcher(threading.Thread):
|
|
179
179
|
response_queue_len=TaskQueue.RESPONSE.length,
|
180
180
|
done_queue_len=TaskQueue.DONE.length,
|
181
181
|
upload_queue_len=TaskQueue.UPLOAD.length,
|
182
|
+
seed_queue_len=TaskQueue.SEED.length,
|
183
|
+
download_queue_len=TaskQueue.DOWNLOAD.length
|
182
184
|
))
|
183
185
|
time.sleep(10)
|
184
186
|
|
cobweb/launchers/launcher_air.py
CHANGED
@@ -1,88 +1,88 @@
|
|
1
|
-
import time
|
2
|
-
|
3
|
-
from cobweb.base import logger
|
4
|
-
from cobweb.constant import LogTemplate
|
5
|
-
from .launcher import Launcher, check_pause
|
6
|
-
|
7
|
-
|
8
|
-
class LauncherAir(Launcher):
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
1
|
+
# import time
|
2
|
+
#
|
3
|
+
# from cobweb.base import logger
|
4
|
+
# from cobweb.constant import LogTemplate
|
5
|
+
# from .launcher import Launcher, check_pause
|
6
|
+
#
|
7
|
+
#
|
8
|
+
# class LauncherAir(Launcher):
|
9
|
+
#
|
10
|
+
# # def _scheduler(self):
|
11
|
+
# # if self.start_seeds:
|
12
|
+
# # self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
|
13
|
+
#
|
14
|
+
# @check_pause
|
15
|
+
# def _insert(self):
|
16
|
+
# seeds = {}
|
17
|
+
# status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
|
18
|
+
# for _ in range(self._new_queue_max_size):
|
19
|
+
# seed = self.__LAUNCHER_QUEUE__['new'].pop()
|
20
|
+
# if not seed:
|
21
|
+
# break
|
22
|
+
# seeds[seed.to_string] = seed.params.priority
|
23
|
+
# if seeds:
|
24
|
+
# self.__LAUNCHER_QUEUE__['todo'].push(seeds)
|
25
|
+
# if status:
|
26
|
+
# time.sleep(self._new_queue_wait_seconds)
|
27
|
+
#
|
28
|
+
# @check_pause
|
29
|
+
# def _delete(self):
|
30
|
+
# seeds = []
|
31
|
+
# status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
|
32
|
+
#
|
33
|
+
# for _ in range(self._done_queue_max_size):
|
34
|
+
# seed = self.__LAUNCHER_QUEUE__['done'].pop()
|
35
|
+
# if not seed:
|
36
|
+
# break
|
37
|
+
# seeds.append(seed.to_string)
|
38
|
+
#
|
39
|
+
# if seeds:
|
40
|
+
# self._remove_doing_seeds(seeds)
|
41
|
+
#
|
42
|
+
# if status:
|
43
|
+
# time.sleep(self._done_queue_wait_seconds)
|
44
|
+
#
|
45
|
+
# def _polling(self):
|
46
|
+
#
|
47
|
+
# check_emtpy_times = 0
|
48
|
+
#
|
49
|
+
# while not self._stop.is_set():
|
50
|
+
#
|
51
|
+
# queue_not_empty_count = 0
|
52
|
+
# pooling_wait_seconds = 30
|
53
|
+
#
|
54
|
+
# for q in self.__LAUNCHER_QUEUE__.values():
|
55
|
+
# if q.length != 0:
|
56
|
+
# queue_not_empty_count += 1
|
57
|
+
#
|
58
|
+
# if queue_not_empty_count == 0:
|
59
|
+
# pooling_wait_seconds = 3
|
60
|
+
# if self._pause.is_set():
|
61
|
+
# check_emtpy_times = 0
|
62
|
+
# if not self._task_model:
|
63
|
+
# logger.info("Done! Ready to close thread...")
|
64
|
+
# self._stop.set()
|
65
|
+
# elif check_emtpy_times > 2:
|
66
|
+
# self.__DOING__ = {}
|
67
|
+
# self._pause.set()
|
68
|
+
# else:
|
69
|
+
# logger.info(
|
70
|
+
# "check whether the task is complete, "
|
71
|
+
# f"reset times {3 - check_emtpy_times}"
|
72
|
+
# )
|
73
|
+
# check_emtpy_times += 1
|
74
|
+
# elif self._pause.is_set():
|
75
|
+
# self._pause.clear()
|
76
|
+
# self._execute()
|
77
|
+
# else:
|
78
|
+
# logger.info(LogTemplate.launcher_air_polling.format(
|
79
|
+
# task=self.task,
|
80
|
+
# doing_len=len(self.__DOING__.keys()),
|
81
|
+
# todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
|
82
|
+
# done_len=self.__LAUNCHER_QUEUE__['done'].length,
|
83
|
+
# upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
|
84
|
+
# ))
|
85
|
+
#
|
86
|
+
# time.sleep(pooling_wait_seconds)
|
87
|
+
#
|
88
|
+
#
|
cobweb/launchers/launcher_api.py
CHANGED
@@ -1,161 +1,88 @@
|
|
1
1
|
import time
|
2
|
-
import threading
|
3
2
|
|
4
|
-
from cobweb.
|
5
|
-
from cobweb.
|
6
|
-
from cobweb.constant import DealModel
|
3
|
+
from cobweb.base import TaskQueue, Decorators
|
4
|
+
from cobweb.schedulers import ApiScheduler
|
7
5
|
from .launcher import Launcher
|
8
6
|
|
9
7
|
|
10
|
-
class
|
8
|
+
class LauncherPro(Launcher):
|
11
9
|
|
12
10
|
def __init__(self, task, project, custom_setting=None, **kwargs):
|
13
11
|
super().__init__(task, project, custom_setting, **kwargs)
|
14
|
-
self.
|
15
|
-
|
16
|
-
self.
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
def
|
34
|
-
if
|
35
|
-
self.
|
36
|
-
|
37
|
-
self._db.incrby(key, count)
|
38
|
-
|
39
|
-
def _get_seed(self) -> Seed:
|
40
|
-
"""
|
41
|
-
从队列中获取种子(频控)
|
42
|
-
设置时间窗口为self._time_window(秒),判断在该窗口内的采集量是否满足阈值(self._spider_max_speed)
|
43
|
-
:return: True -> 种子, False -> None
|
44
|
-
"""
|
45
|
-
if TaskQueue.TODO.length and not self._db.auto_incr(
|
46
|
-
self._speed_control_key,
|
47
|
-
t=self.time_window,
|
48
|
-
limit=self.spider_max_count
|
49
|
-
):
|
50
|
-
expire_time = self._db.ttl(self._speed_control_key)
|
51
|
-
logger.info(f"Too fast! Please wait {expire_time} seconds...")
|
52
|
-
time.sleep(expire_time / 2)
|
53
|
-
return None
|
54
|
-
return TaskQueue.TODO.pop()
|
12
|
+
self._redis_download = "{%s:%s}:download" % (project, task)
|
13
|
+
self._redis_todo = "{%s:%s}:todo" % (project, task)
|
14
|
+
self._scheduler = ApiScheduler(task, project)
|
15
|
+
|
16
|
+
@Decorators.stop
|
17
|
+
def _schedule(self):
|
18
|
+
thread_sleep = self.scheduling_wait_time
|
19
|
+
for q, key, size, item_info in [
|
20
|
+
(TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
|
21
|
+
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
|
22
|
+
]:
|
23
|
+
if q.length < size:
|
24
|
+
for member, priority in self._scheduler.schedule(key, self.scheduling_size):
|
25
|
+
q.push((member, priority), direct_insertion=True)
|
26
|
+
self.add_working_item(key.split(":")[-1], member, priority)
|
27
|
+
thread_sleep = 0.1
|
28
|
+
time.sleep(thread_sleep)
|
29
|
+
|
30
|
+
@Decorators.stop
|
31
|
+
def _heartbeat(self):
|
32
|
+
if self._scheduler.working.is_set():
|
33
|
+
self._scheduler.set_heartbeat()
|
34
|
+
time.sleep(3)
|
55
35
|
|
56
|
-
@stop
|
36
|
+
@Decorators.stop
|
57
37
|
def _reset(self):
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
_min = -int(time.time()) + self.seed_reset_seconds \
|
64
|
-
if self.heartbeat else "-inf"
|
65
|
-
|
66
|
-
self._db.members(self._todo_key, 0, _min=_min, _max="(0")
|
67
|
-
|
68
|
-
if not self.heartbeat:
|
69
|
-
self._heartbeat_start_event.set()
|
70
|
-
|
71
|
-
self._db.delete(self._reset_lock_key)
|
72
|
-
|
38
|
+
self._scheduler.reset(
|
39
|
+
keys=[self._redis_todo, self._redis_download],
|
40
|
+
reset_time=self.seed_reset_seconds
|
41
|
+
)
|
73
42
|
time.sleep(30)
|
74
43
|
|
75
|
-
@
|
76
|
-
def _refresh(self):
|
77
|
-
"""
|
78
|
-
刷新doing种子过期时间,防止reset重新消费
|
79
|
-
"""
|
80
|
-
if self.doing_seeds:
|
81
|
-
refresh_time = int(time.time())
|
82
|
-
seeds = {k: -refresh_time - v / 1e3 for k, v in self.doing_seeds.items()}
|
83
|
-
self._db.zadd(self._todo_key, item=seeds, xx=True)
|
84
|
-
time.sleep(3)
|
85
|
-
|
86
|
-
@stop
|
87
|
-
def _scheduler(self):
|
88
|
-
"""
|
89
|
-
调度任务,获取redis队列种子,同时添加到doing字典中
|
90
|
-
"""
|
91
|
-
if not self._db.zcount(self._todo_key, 0, "(1000"):
|
92
|
-
time.sleep(self.scheduler_wait_seconds)
|
93
|
-
elif TaskQueue.TODO.length >= self.todo_queue_size:
|
94
|
-
time.sleep(self.todo_queue_full_wait_seconds)
|
95
|
-
else:
|
96
|
-
members = self._db.members(
|
97
|
-
self._todo_key, int(time.time()),
|
98
|
-
count=self.todo_queue_size,
|
99
|
-
_min=0, _max="(1000"
|
100
|
-
)
|
101
|
-
for member, priority in members:
|
102
|
-
seed = Seed(member, priority=priority)
|
103
|
-
TaskQueue.TODO.push(seed)
|
104
|
-
self.doing_seeds[seed.to_string] = seed.params.priority
|
105
|
-
|
106
|
-
@pause
|
107
|
-
def _heartbeat(self):
|
108
|
-
if self._heartbeat_start_event.is_set():
|
109
|
-
self._db.setex(self._heartbeat_key, t=5)
|
110
|
-
time.sleep(3)
|
111
|
-
|
112
|
-
@pause
|
44
|
+
@Decorators.pause
|
113
45
|
def _insert(self):
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
""
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])
|
158
|
-
|
159
|
-
if status:
|
160
|
-
time.sleep(self.done_queue_wait_seconds)
|
161
|
-
|
46
|
+
thread_sleep = 0.1
|
47
|
+
for q, key, size in [
|
48
|
+
(TaskQueue.SEED, self._redis_todo, self.seed_queue_size),
|
49
|
+
(TaskQueue.REQUEST, self._redis_download, self.request_queue_size),
|
50
|
+
]:
|
51
|
+
item_info = {}
|
52
|
+
while (item := q.pop()) and len(item_info.keys()) < self.inserting_size:
|
53
|
+
item_info[item.seed] = item.params.priority
|
54
|
+
if q.length >= size:
|
55
|
+
thread_sleep = self.inserting_wait_time
|
56
|
+
self._scheduler.insert(key, item_info)
|
57
|
+
time.sleep(thread_sleep)
|
58
|
+
|
59
|
+
@Decorators.pause
|
60
|
+
def _refresh(self):
|
61
|
+
self._scheduler.refresh(self._redis_todo, self._task_info["todo"])
|
62
|
+
self._scheduler.refresh(self._redis_download, self._task_info["download"])
|
63
|
+
time.sleep(10)
|
64
|
+
|
65
|
+
@Decorators.pause
|
66
|
+
def _remove(self):
|
67
|
+
thread_sleep = self.removing_wait_time
|
68
|
+
for q, key, size in [
|
69
|
+
(TaskQueue.DELETE, self._redis_todo, self.delete_queue_size),
|
70
|
+
(TaskQueue.DONE, self._redis_download, self.done_queue_size),
|
71
|
+
]:
|
72
|
+
items = []
|
73
|
+
while (item := q.pop()) and len(items) < self.removing_size:
|
74
|
+
items.append(item)
|
75
|
+
self._scheduler.delete(key, items)
|
76
|
+
self.remove_working_items(key.split(":")[-1], items)
|
77
|
+
if q.length >= size:
|
78
|
+
thread_sleep = 0.1
|
79
|
+
time.sleep(thread_sleep)
|
80
|
+
|
81
|
+
def _init_schedule_thread(self):
|
82
|
+
self._add_thread(func=self._heartbeat)
|
83
|
+
self._add_thread(func=self._reset)
|
84
|
+
self._add_thread(func=self._refresh)
|
85
|
+
self._add_thread(func=self._schedule)
|
86
|
+
self._add_thread(func=self._insert)
|
87
|
+
self._add_thread(func=self._remove)
|
88
|
+
# self._add_thread(func=self._polling)
|
cobweb/launchers/launcher_pro.py
CHANGED
@@ -21,21 +21,19 @@ class LauncherPro(Launcher):
|
|
21
21
|
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
|
22
22
|
]:
|
23
23
|
if q.length < size:
|
24
|
-
for member, priority in self._scheduler.schedule(
|
25
|
-
key, self.scheduling_size
|
26
|
-
):
|
24
|
+
for member, priority in self._scheduler.schedule(key, self.scheduling_size):
|
27
25
|
q.push((member, priority), direct_insertion=True)
|
28
26
|
self.add_working_item(key.split(":")[-1], member, priority)
|
29
27
|
thread_sleep = 0.1
|
30
28
|
time.sleep(thread_sleep)
|
31
29
|
|
32
|
-
@Decorators.
|
30
|
+
@Decorators.stop
|
33
31
|
def _heartbeat(self):
|
34
32
|
if self._scheduler.working.is_set():
|
35
33
|
self._scheduler.set_heartbeat()
|
36
34
|
time.sleep(3)
|
37
35
|
|
38
|
-
@Decorators.
|
36
|
+
@Decorators.stop
|
39
37
|
def _reset(self):
|
40
38
|
self._scheduler.reset(
|
41
39
|
keys=[self._redis_todo, self._redis_download],
|
@@ -51,7 +49,7 @@ class LauncherPro(Launcher):
|
|
51
49
|
(TaskQueue.REQUEST, self._redis_download, self.request_queue_size),
|
52
50
|
]:
|
53
51
|
item_info = {}
|
54
|
-
while item := q.pop() and len(item_info.keys()) < self.inserting_size:
|
52
|
+
while (item := q.pop()) and len(item_info.keys()) < self.inserting_size:
|
55
53
|
item_info[item.seed] = item.params.priority
|
56
54
|
if q.length >= size:
|
57
55
|
thread_sleep = self.inserting_wait_time
|
@@ -72,9 +70,9 @@ class LauncherPro(Launcher):
|
|
72
70
|
(TaskQueue.DONE, self._redis_download, self.done_queue_size),
|
73
71
|
]:
|
74
72
|
items = []
|
75
|
-
while item := q.pop() and len(items) < self.removing_size:
|
73
|
+
while (item := q.pop()) and len(items) < self.removing_size:
|
76
74
|
items.append(item)
|
77
|
-
self._scheduler.delete(key,
|
75
|
+
self._scheduler.delete(key, items)
|
78
76
|
self.remove_working_items(key.split(":")[-1], items)
|
79
77
|
if q.length >= size:
|
80
78
|
thread_sleep = 0.1
|
@@ -87,4 +85,4 @@ class LauncherPro(Launcher):
|
|
87
85
|
self._add_thread(func=self._schedule)
|
88
86
|
self._add_thread(func=self._insert)
|
89
87
|
self._add_thread(func=self._remove)
|
90
|
-
self._add_thread(func=self._polling)
|
88
|
+
# self._add_thread(func=self._polling)
|
cobweb/pipelines/pipeline.py
CHANGED
@@ -30,7 +30,7 @@ class Pipeline(ABC):
|
|
30
30
|
data_info, seeds = {}, []
|
31
31
|
thread_sleep = self.upload_wait_time if TaskQueue.UPLOAD.length < self.upload_queue_size else 0.1
|
32
32
|
try:
|
33
|
-
while item := TaskQueue.UPLOAD.pop() and len(seeds) <= self.upload_queue_size:
|
33
|
+
while (item := TaskQueue.UPLOAD.pop()) and len(seeds) <= self.upload_queue_size:
|
34
34
|
data = self.build(item)
|
35
35
|
data_info.setdefault(item.table, []).append(data)
|
36
36
|
seeds.append(item.seed)
|
@@ -39,6 +39,7 @@ class Pipeline(ABC):
|
|
39
39
|
except Exception as e:
|
40
40
|
logger.info(e)
|
41
41
|
seeds = None
|
42
|
+
# todo: retry
|
42
43
|
finally:
|
43
44
|
TaskQueue.DONE.push(seeds)
|
44
45
|
|
cobweb/schedulers/__init__.py
CHANGED
@@ -0,0 +1,69 @@
|
|
1
|
+
import threading
|
2
|
+
import time
|
3
|
+
|
4
|
+
# from cobweb.base import Seed
|
5
|
+
from cobweb.db import ApiDB
|
6
|
+
|
7
|
+
|
8
|
+
class ApiScheduler:
|
9
|
+
|
10
|
+
def __init__(self, task, project, scheduler_wait_seconds=30):
|
11
|
+
self._todo_key = "{%s:%s}:todo" % (project, task)
|
12
|
+
self._download_key = "{%s:%s}:download" % (project, task)
|
13
|
+
self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
|
14
|
+
self._speed_control_key = "speed_control:%s_%s" % (project, task)
|
15
|
+
self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
|
16
|
+
self._db = ApiDB()
|
17
|
+
|
18
|
+
self.scheduler_wait_seconds = scheduler_wait_seconds
|
19
|
+
self.working = threading.Event()
|
20
|
+
|
21
|
+
@property
|
22
|
+
def heartbeat(self):
|
23
|
+
return self._db.exists(self._heartbeat_key)
|
24
|
+
|
25
|
+
def set_heartbeat(self):
|
26
|
+
return self._db.setex(self._heartbeat_key, 5)
|
27
|
+
|
28
|
+
def schedule(self, key, count):
|
29
|
+
if not self._db.zcount(key, 0, "(1000"):
|
30
|
+
time.sleep(self.scheduler_wait_seconds)
|
31
|
+
else:
|
32
|
+
source = int(time.time())
|
33
|
+
members = self._db.members(key, source, count=count, _min=0, _max="(1000")
|
34
|
+
for member, priority in members:
|
35
|
+
# seed = Seed(member, priority=priority)
|
36
|
+
yield member.decode(), priority
|
37
|
+
|
38
|
+
def insert(self, key, items):
|
39
|
+
if items:
|
40
|
+
self._db.zadd(key, items, nx=True)
|
41
|
+
|
42
|
+
def reset(self, keys, reset_time=30):
|
43
|
+
if self._db.lock(self._reset_lock_key, t=120):
|
44
|
+
|
45
|
+
if isinstance(keys, str):
|
46
|
+
keys = [keys]
|
47
|
+
|
48
|
+
_min = reset_time - int(time.time()) if self.heartbeat else "-inf"
|
49
|
+
|
50
|
+
for key in keys:
|
51
|
+
self._db.members(key, 0, _min=_min, _max="(0")
|
52
|
+
|
53
|
+
if not self.heartbeat:
|
54
|
+
self.working.set()
|
55
|
+
time.sleep(10)
|
56
|
+
|
57
|
+
self._db.delete(self._reset_lock_key)
|
58
|
+
|
59
|
+
def refresh(self, key, items: dict[str, int]):
|
60
|
+
refresh_time = int(time.time())
|
61
|
+
its = {k: -refresh_time - v / 1000 for k, v in items}
|
62
|
+
self._db.zadd(key, item=its, xx=True)
|
63
|
+
|
64
|
+
def delete(self, key, values):
|
65
|
+
self._db.zrem(key, *values)
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
|
@@ -1,8 +1,8 @@
|
|
1
|
-
cobweb/__init__.py,sha256=
|
1
|
+
cobweb/__init__.py,sha256=oaEfsGUuGP0s39UbFRwrnsjMUeuB6QvQIAwStKFyUTk,83
|
2
2
|
cobweb/constant.py,sha256=eofONAntk9O6S-cb4KbYGYHL_u7nBlOqqFOw_HzJHAU,3588
|
3
3
|
cobweb/setting.py,sha256=pY6LKsgWI3164GiGA1z_y26LVf5-3mpiEgmm86mKRdY,3135
|
4
|
-
cobweb/base/__init__.py,sha256=
|
5
|
-
cobweb/base/basic.py,sha256=
|
4
|
+
cobweb/base/__init__.py,sha256=Na385Hhl9l2S8aPhcdJVPjmb02wkVM969bWQ84bCSQs,5095
|
5
|
+
cobweb/base/basic.py,sha256=s5G4LBZiLUfoymV-gLSIqeH-OJ7q7-L35sBa6xEH3EI,7666
|
6
6
|
cobweb/base/common_queue.py,sha256=Gor7sR3h1hlZWaI0XcNAbf0S15Ftjr3DFRWNTGL13uU,1137
|
7
7
|
cobweb/base/decorators.py,sha256=wDCaQ94aAZGxks9Ljc0aXq6omDXT1_yzFy83ZW6VbVI,930
|
8
8
|
cobweb/base/dotting.py,sha256=0SH8F2uAGWZjfODpTAXngYHz8JgfCm-RqpmQbfQ3NCY,1233
|
@@ -13,7 +13,7 @@ cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
|
|
13
13
|
cobweb/base/seed.py,sha256=PN5J4gKPEXylwyQeSGOBfauxHktxFr7RJe8nVX1hBw4,2987
|
14
14
|
cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
|
15
15
|
cobweb/crawlers/base_crawler.py,sha256=ee_WSDnPQpPTk6wlFuY2UEx5L3hcsAZFcr6i3GLSry8,5751
|
16
|
-
cobweb/crawlers/crawler.py,sha256=
|
16
|
+
cobweb/crawlers/crawler.py,sha256=ZQ6yVA1EaQRdKJEY3DNqShzp9HPMwlSXapnsRW9E5Wc,2987
|
17
17
|
cobweb/crawlers/file_crawler.py,sha256=2Sjbdgxzqd41WykKUQE3QQlGai3T8k-pmHNmPlTchjQ,4454
|
18
18
|
cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
|
19
19
|
cobweb/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
|
@@ -21,17 +21,18 @@ cobweb/db/redis_db.py,sha256=FvMzckJtmhwKhZqKoS23iXmJti5P2dnMVD5rJ__5LUw,5139
|
|
21
21
|
cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
|
22
22
|
cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
|
23
23
|
cobweb/launchers/__init__.py,sha256=uzfPkLbY2m0wsIR_s93VFxmO0U49GgUG7hXPzMYdye0,118
|
24
|
-
cobweb/launchers/launcher.py,sha256=
|
25
|
-
cobweb/launchers/launcher_air.py,sha256=
|
26
|
-
cobweb/launchers/launcher_api.py,sha256=
|
27
|
-
cobweb/launchers/launcher_pro.py,sha256=
|
24
|
+
cobweb/launchers/launcher.py,sha256=NFwpc_0Um0hbDm1A8glWA4fcW6mNYL1eon4t3JAQUlw,7411
|
25
|
+
cobweb/launchers/launcher_air.py,sha256=yPr395HVIIHAq6lqRcYJu7c0KkfO9V8O-2sn0hC96p0,2990
|
26
|
+
cobweb/launchers/launcher_api.py,sha256=c0bnnZQCqkk_cX-WyFsjc6jpliCwZCuAJeGAvUATODk,3370
|
27
|
+
cobweb/launchers/launcher_pro.py,sha256=2H-TcvQx-ga78GLNTa-GXMLYAj9nEeCJSWf8xl-1ISQ,3374
|
28
28
|
cobweb/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
|
29
29
|
cobweb/pipelines/base_pipeline.py,sha256=fYnWf79GmhufXpcnMa3te18SbmnVeYLwxfyo-zLd9CY,1577
|
30
30
|
cobweb/pipelines/loghub_pipeline.py,sha256=cjPO6w6UJ0jNw2fVvdX0BCdlm58T7dmYXlxzXOBpvfY,1027
|
31
|
-
cobweb/pipelines/pipeline.py,sha256=
|
31
|
+
cobweb/pipelines/pipeline.py,sha256=Pycm22bHId9a3gdP81D5y7SsuMndYooTb5n4zQxP7dM,1321
|
32
32
|
cobweb/pipelines/pipeline_console.py,sha256=NEh-4zhuVAQOqwXLsqeb-rcNZ9_KXFUpL3otUTL5qBs,754
|
33
33
|
cobweb/pipelines/pipeline_loghub.py,sha256=xZ6D55BGdiM71WUv83jyLGbEyUwhBHLJRZoXthBxxTs,1019
|
34
|
-
cobweb/schedulers/__init__.py,sha256=
|
34
|
+
cobweb/schedulers/__init__.py,sha256=y7Lv_7b0zfTl0OhIONb_8u1K1C9gVlBA-xz_XG_kI9g,85
|
35
|
+
cobweb/schedulers/scheduler_api.py,sha256=mC54QOS0PEu4SFvxfD5Qr9239hAxwMrKTg-33rirANE,2112
|
35
36
|
cobweb/schedulers/scheduler_redis.py,sha256=Aw7de0sXigRAxJgqUhHWu30hMBzgEWjkj-3OXXqmldg,2118
|
36
37
|
cobweb/utils/__init__.py,sha256=Ev2LZZ1-S56iQYDqFZrqadizEv4Gk8Of-DraH-_WnKY,109
|
37
38
|
cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
|
@@ -103,8 +104,8 @@ cobweb_new/utils/__init__.py,sha256=c9macpjc15hrCUCdzO5RR_sgK_B9kvJKreSGprZ1ld4,
|
|
103
104
|
cobweb_new/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
|
104
105
|
cobweb_new/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
|
105
106
|
cobweb_new/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
|
106
|
-
cobweb_launcher-1.3.
|
107
|
-
cobweb_launcher-1.3.
|
108
|
-
cobweb_launcher-1.3.
|
109
|
-
cobweb_launcher-1.3.
|
110
|
-
cobweb_launcher-1.3.
|
107
|
+
cobweb_launcher-1.3.4.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
108
|
+
cobweb_launcher-1.3.4.dist-info/METADATA,sha256=AMoOPhP8ILf0uUUBrQUpn3_-S3qF-_-ef-_DSPQgJZA,6509
|
109
|
+
cobweb_launcher-1.3.4.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
|
110
|
+
cobweb_launcher-1.3.4.dist-info/top_level.txt,sha256=A0GPGeX6QtxXg7AJno3SVRTHtVCCqeRIOrpwDoXg9qs,15
|
111
|
+
cobweb_launcher-1.3.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|