cobweb-launcher 1.2.49__py3-none-any.whl → 1.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/base/__init__.py +141 -4
- cobweb/base/basic.py +28 -82
- cobweb/base/common_queue.py +13 -0
- cobweb/base/dotting.py +1 -1
- cobweb/base/request.py +14 -2
- cobweb/base/seed.py +10 -6
- cobweb/constant.py +16 -0
- cobweb/crawlers/crawler.py +51 -181
- cobweb/db/redis_db.py +28 -0
- cobweb/launchers/__init__.py +2 -2
- cobweb/launchers/launcher.py +110 -141
- cobweb/launchers/launcher_api.py +66 -114
- cobweb/launchers/launcher_pro.py +76 -194
- cobweb/pipelines/base_pipeline.py +54 -0
- cobweb/pipelines/loghub_pipeline.py +34 -0
- cobweb/pipelines/pipeline.py +25 -49
- cobweb/schedulers/__init__.py +0 -2
- cobweb/schedulers/scheduler_redis.py +5 -8
- cobweb/setting.py +29 -6
- cobweb/utils/dotting.py +10 -42
- cobweb_/__init__.py +2 -0
- cobweb_/base/__init__.py +9 -0
- cobweb_/base/common_queue.py +30 -0
- cobweb_/base/decorators.py +40 -0
- cobweb_/base/item.py +46 -0
- cobweb_/base/log.py +94 -0
- cobweb_/base/request.py +82 -0
- cobweb_/base/response.py +23 -0
- cobweb_/base/seed.py +114 -0
- cobweb_/constant.py +94 -0
- cobweb_/crawlers/__init__.py +1 -0
- cobweb_/crawlers/crawler.py +184 -0
- cobweb_/db/__init__.py +2 -0
- cobweb_/db/api_db.py +82 -0
- cobweb_/db/redis_db.py +130 -0
- cobweb_/exceptions/__init__.py +1 -0
- cobweb_/exceptions/oss_db_exception.py +28 -0
- cobweb_/launchers/__init__.py +3 -0
- cobweb_/launchers/launcher.py +235 -0
- cobweb_/launchers/launcher_air.py +88 -0
- cobweb_/launchers/launcher_api.py +221 -0
- cobweb_/launchers/launcher_pro.py +222 -0
- cobweb_/pipelines/__init__.py +3 -0
- cobweb_/pipelines/pipeline.py +69 -0
- cobweb_/pipelines/pipeline_console.py +22 -0
- cobweb_/pipelines/pipeline_loghub.py +34 -0
- cobweb_/setting.py +74 -0
- cobweb_/utils/__init__.py +5 -0
- cobweb_/utils/bloom.py +58 -0
- cobweb_/utils/dotting.py +32 -0
- cobweb_/utils/oss.py +94 -0
- cobweb_/utils/tools.py +42 -0
- {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/METADATA +1 -1
- cobweb_launcher-1.3.2.dist-info/RECORD +110 -0
- cobweb_launcher-1.3.2.dist-info/top_level.txt +2 -0
- cobweb_new/__init__.py +2 -0
- cobweb_new/base/__init__.py +72 -0
- cobweb_new/base/common_queue.py +53 -0
- cobweb_new/base/decorators.py +72 -0
- cobweb_new/base/item.py +46 -0
- cobweb_new/base/log.py +94 -0
- cobweb_new/base/request.py +82 -0
- cobweb_new/base/response.py +23 -0
- cobweb_new/base/seed.py +118 -0
- cobweb_new/constant.py +105 -0
- cobweb_new/crawlers/__init__.py +1 -0
- cobweb_new/crawlers/crawler-new.py +85 -0
- cobweb_new/crawlers/crawler.py +170 -0
- cobweb_new/db/__init__.py +2 -0
- cobweb_new/db/api_db.py +82 -0
- cobweb_new/db/redis_db.py +158 -0
- cobweb_new/exceptions/__init__.py +1 -0
- cobweb_new/exceptions/oss_db_exception.py +28 -0
- cobweb_new/launchers/__init__.py +3 -0
- cobweb_new/launchers/launcher.py +237 -0
- cobweb_new/launchers/launcher_air.py +88 -0
- cobweb_new/launchers/launcher_api.py +161 -0
- cobweb_new/launchers/launcher_pro.py +96 -0
- cobweb_new/launchers/tesss.py +47 -0
- cobweb_new/pipelines/__init__.py +3 -0
- cobweb_new/pipelines/pipeline.py +68 -0
- cobweb_new/pipelines/pipeline_console.py +22 -0
- cobweb_new/pipelines/pipeline_loghub.py +34 -0
- cobweb_new/setting.py +95 -0
- cobweb_new/utils/__init__.py +5 -0
- cobweb_new/utils/bloom.py +58 -0
- cobweb_new/utils/oss.py +94 -0
- cobweb_new/utils/tools.py +42 -0
- cobweb/schedulers/scheduler_api.py +0 -72
- cobweb_launcher-1.2.49.dist-info/RECORD +0 -44
- cobweb_launcher-1.2.49.dist-info/top_level.txt +0 -1
- {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/WHEEL +0 -0
cobweb/crawlers/crawler.py
CHANGED
@@ -1,66 +1,35 @@
 import json
-import os
-import threading
 import time
-import
-from inspect import isgenerator
+import threading
 from typing import Union, Callable, Mapping
-from urllib.parse import urlparse
-
-from requests import Response as Res

-
+import setting
 from cobweb.base import (
     Seed,
-    BaseItem,
-    Request,
-    Response,
+    BaseItem,
+    Request,
+    Response,
     ConsoleItem,
+    Decorators,
+    TaskQueue,
     logger
 )
-from
-proxy_type = os.getenv("PROXY_TYPE", "")
+from constant import DealModel


 class Crawler(threading.Thread):

-    def __init__(
-            self,
-            task: str,
-            project: str,
-            stop: threading.Event,
-            pause: threading.Event,
-            # launcher_queue: Union[Mapping[str, Queue]],
-            get_seed: Callable,
-            set_seed: Callable,
-            add_seed: Callable,
-            delete_seed: Callable,
-            upload_data: Callable,
-            custom_func: Union[Mapping[str, Callable]],
-            thread_num: int,
-            max_retries: int,
-            time_sleep: int,
-    ):
+    def __init__(self, pause, custom_func: Union[Mapping[str, Callable]]):
         super().__init__()
-        self.
-        self.project = project
-        self._stop = stop
-        self._pause = pause
-        self._get_seed = get_seed
-        self._set_seed = set_seed
-        self._add_seed = add_seed
-        self._delete_seed = delete_seed
-        self._upload_data = upload_data
-
+        self.pause = pause
         for func_name, _callable in custom_func.items():
             if isinstance(_callable, Callable):
                 self.__setattr__(func_name, _callable)

-        self.
-        self.
-        self.
-
-        self.loghub_dot = LoghubDot()
+        self.spider_max_retries = setting.SPIDER_MAX_RETRIES
+        self.request_queue_size = setting.REQUEST_QUEUE_SIZE
+        self.download_queue_size = setting.DOWNLOAD_QUEUE_SIZE
+        self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE

     @staticmethod
     def request(seed: Seed) -> Union[Request, BaseItem]:
@@ -77,142 +46,43 @@ class Crawler(threading.Thread):
                 upload_item["text"] = item.response.text
             yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))

-        #
-
-
-
-        if
-
-        elif
-
-
-            self.
-        elif isinstance(item, str) and item == DealModel.done:
-            self._delete_seed(seed)
-        elif isinstance(item, str) and item == DealModel.fail:
-            seed.params.seed_status = DealModel.fail
-            self._delete_seed(seed)
-        else:
-            raise TypeError("yield value type error!")
-
-    def spider(self):
-        while not self._stop.is_set():
-
-            seed = self._get_seed()
-
-            if not seed:
-                time.sleep(1)
-                continue
-
-            elif seed.params.retry > self.max_retries:
+    # @decorators.add_thread()
+    @Decorators.pause
+    def build_request_item(self):
+        thread_sleep = 0.1
+        if TaskQueue.REQUEST.length >= self.request_queue_size:
+            thread_sleep = 5
+        elif seed_info := TaskQueue.TODO.pop():
+            member, priority = seed_info
+            seed = Seed(member, priority=priority)
+            if seed.params.retry > self.spider_max_retries:
                 seed.params.seed_status = DealModel.fail
-    [removed lines 109-135 truncated in the source diff]
-                iterator_status = False
-                logger.info(LogTemplate.download_info.format(
-                    detail=seed_detail_log_info,
-                    retry=seed.params.retry,
-                    priority=seed.params.priority,
-                    seed_version=seed.params.seed_version,
-                    identifier=seed.identifier or "",
-                    status=download_item.response,
-                    response=LogTemplate.log_info(download_item.to_dict)
-                ))
-                if isinstance(download_item.response, Res):
-                    end_time = time.time()
-                    self.loghub_dot.build(
-                        topic=urlparse(download_item.response.request.url).netloc,
-                        data_size=int(download_item.response.headers.get("content-length", 0)),
-                        cost_time=end_time - start_time, status = 200,
-                        url=download_item.response.url, proxy_type=proxy_type,
-                        project=self.project, task=self.task,
-                    )
-                parse_iterators = self.parse(download_item)
-                if not isgenerator(parse_iterators):
-                    raise TypeError("parse function isn't a generator")
-                for parse_item in parse_iterators:
-                    iterator_status = True
-                    if isinstance(parse_item, Response):
-                        raise TypeError("upload_item can't be a Response instance")
-                    self.distribute(parse_item, seed)
-                else:
-                    self.distribute(download_item, seed)
-            else:
-                self.distribute(request_item, seed)
-
-            if not iterator_status:
-                raise ValueError("request/download/parse function yield value error!")
-            except Exception as e:
-                exception_msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
-                url = seed.url
-                status = e.__class__.__name__
-                if getattr(e, "response", None) and isinstance(e.response, Res):
-                    url = e.response.request.url
-                    status = e.response.status_code
-                self.loghub_dot.build(
-                    topic=urlparse(url).netloc,
-                    data_size=-1, cost_time=-1,
-                    status=status, url=url,
-                    proxy_type=proxy_type,
-                    project=self.project,
-                    task=self.task,
-                    msg=exception_msg,
-                )
-                logger.info(LogTemplate.download_exception.format(
-                    detail=seed_detail_log_info,
-                    retry=seed.params.retry,
-                    priority=seed.params.priority,
-                    seed_version=seed.params.seed_version,
-                    identifier=seed.identifier or "",
-                    exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
-                ))
-                seed.params.retry += 1
-                self._set_seed(seed)
-                # time.sleep(self.time_sleep * seed.params.retry)
-            # except Exception as e:
-            #     logger.info(LogTemplate.download_exception.format(
-            #         detail=seed_detail_log_info,
-            #         retry=seed.params.retry,
-            #         priority=seed.params.priority,
-            #         seed_version=seed.params.seed_version,
-            #         identifier=seed.identifier or "",
-            #         exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
-            #     ))
-            #     seed.params.retry += 1
-            #     # self._todo.push(seed)
-            #     self._set_seed(seed)
-            #     # time.sleep(self.time_sleep * seed.params.retry)
-            finally:
-                time.sleep(0.1)
-        logger.info("spider thread close")
+            else:
+                TaskQueue.process_task(seed, self.request)
+            TaskQueue.DELETE.push(member)
+        time.sleep(thread_sleep)
+
+    @Decorators.pause
+    def build_download_item(self):
+        thread_sleep = 0.1
+        if TaskQueue.RESPONSE.length >= self.download_queue_size:
+            logger.info(f"download queue is full, sleep {thread_sleep}s")
+            thread_sleep = 5
+        elif request_info := TaskQueue.DOWNLOAD.pop():
+            member, priority = request_info
+            request_setting = json.loads(member)
+            request_item = Request(seed=member, **request_setting)
+            TaskQueue.process_task(request_item, self.download)
+        time.sleep(thread_sleep)
+
+    @Decorators.pause
+    def build_parse_item(self):
+        thread_sleep = 0.1
+        if TaskQueue.UPLOAD.length >= self.upload_queue_size:
+            logger.info(f"upload queue is full, sleep {thread_sleep}s")
+            thread_sleep = 5
+        if response_item := TaskQueue.RESPONSE.pop():
+            TaskQueue.process_task(response_item, self.parse)
+        time.sleep(thread_sleep)

-    def run(self):
-        threading.Thread(name="loghub_dot", target=self.loghub_dot.build_run).start()
-        for index in range(self.thread_num):
-            threading.Thread(name=f"spider_{index}", target=self.spider).start()

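Note: the rewritten Crawler replaces the old monolithic spider() loop with three queue-driven stages (build_request_item, build_download_item, build_parse_item), each guarded by Decorators.pause and throttled when its downstream queue is full. The stdlib-only sketch below illustrates that staged hand-off pattern; the queue names, the make_request callable, and the pause/stop events are illustrative stand-ins, not cobweb's TaskQueue API.

import queue
import threading
import time

# Stand-ins for TaskQueue.TODO and TaskQueue.REQUEST (hypothetical, for illustration only).
TODO: "queue.Queue[str]" = queue.Queue()
REQUEST: "queue.Queue[object]" = queue.Queue(maxsize=100)


def build_request_stage(make_request, pause: threading.Event, stop: threading.Event):
    """Seed -> Request stage, mirroring the shape of Crawler.build_request_item."""
    while not stop.is_set():
        if pause.is_set():
            time.sleep(1)          # launcher paused the pipeline: idle instead of working
            continue
        thread_sleep = 0.1
        if REQUEST.full():         # downstream queue saturated: back off instead of piling up
            thread_sleep = 5
        else:
            try:
                seed = TODO.get_nowait()
                REQUEST.put(make_request(seed))
            except queue.Empty:
                pass               # nothing to do this round
        time.sleep(thread_sleep)

In the diff itself the per-stage work is delegated through TaskQueue.process_task, and a seed whose params.retry exceeds SPIDER_MAX_RETRIES is marked DealModel.fail instead of being re-queued.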
cobweb/db/redis_db.py
CHANGED
@@ -27,6 +27,9 @@ class RedisDB:
     def exists(self, *name) -> bool:
         return self._client.exists(*name)

+    def incrby(self, name, value):
+        return self._client.incrby(name, value)
+
     def sadd(self, name, value):
         return self._client.sadd(name, value)

@@ -72,6 +75,31 @@ class RedisDB:
         status = self.execute_lua(lua_script, [key], t)
         return bool(status)

+    def auto_incr(self, name, t=15, limit=1000):
+        lua_script = """
+            local count = 0
+            local status = false
+            local limit = ARGV[2]
+            local expire = redis.call('ttl', KEYS[1])
+
+            if ( expire == -2 ) then
+                redis.call('setnx', KEYS[1], 1)
+            elseif ( expire == -1) then
+                redis.call('expire', KEYS[1], ARGV[1])
+            else
+                count = redis.call('get', KEYS[1])
+            end
+
+            if ( count + 0 < limit + 0 ) then
+                status = true
+                redis.call('incr', KEYS[1])
+            end
+
+            return status
+        """
+        status = self.execute_lua(lua_script, [name], t, limit)
+        return bool(status)
+
     def members(self, key, score, start=0, count=5000, _min="-inf", _max="+inf") -> list:
         lua_script = """
             local min = ARGV[1]
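Note: the new auto_incr wraps a small Lua script: if the counter key does not exist it is created with setnx, if it exists without a TTL one is set to ARGV[1] seconds, and the increment only happens (returning true) while the current count is below the limit. That makes it usable as a per-window budget check. A hedged usage sketch, assuming the import path from the file layout above and a default RedisDB() constructor; the key name is hypothetical:

import time

from cobweb.db.redis_db import RedisDB  # import path assumed from the package layout

db = RedisDB()  # assumes connection settings come from cobweb's setting/env defaults


def acquire_download_slot(task: str) -> bool:
    # Hypothetical key name; allow at most 1000 increments while the key lives (15 s TTL).
    return db.auto_incr(f"speed_control:{task}", t=15, limit=1000)


if __name__ == "__main__":
    while not acquire_download_slot("demo_task"):
        time.sleep(0.5)  # over budget for this window; wait for the key to expire
    print("slot acquired, safe to download")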
cobweb/launchers/__init__.py
CHANGED
@@ -1,3 +1,3 @@
-from .launcher_air import LauncherAir
+# from .launcher_air import LauncherAir
 from .launcher_pro import LauncherPro
-from .launcher_api import LauncherApi
+# from .launcher_api import LauncherApi
cobweb/launchers/launcher.py
CHANGED
@@ -2,63 +2,25 @@ import time
 import inspect
 import threading
 import importlib
-from functools import wraps
-

+from cobweb.constant import LogTemplate
+from cobweb.utils import dynamic_load_class
+from cobweb.base import TaskQueue, Decorators, logger
 from cobweb import setting
-from cobweb.base import Seed, Queue, logger
-from cobweb.utils.tools import dynamic_load_class
-
-
-def check_pause(func):
-    @wraps(func)
-    def wrapper(self, *args, **kwargs):
-        while not self._pause.is_set():
-            try:
-                func(self, *args, **kwargs)
-            except Exception as e:
-                logger.info(f"{func.__name__}: " + str(e))
-            finally:
-                time.sleep(0.1)
-
-    return wrapper


 class Launcher(threading.Thread):

-
-
-    __DOING__ = {}
-
-    __CUSTOM_FUNC__ = {
-        # "download": None,
-        # "request": None,
-        # "parse": None,
-    }
-
-    __LAUNCHER_QUEUE__ = {
-        "new": Queue(),
-        "todo": Queue(),
-        "done": Queue(),
-        "upload": Queue()
-    }
-
-    __LAUNCHER_FUNC__ = [
-        "_reset",
-        "_scheduler",
-        "_insert",
-        "_refresh",
-        "_delete",
-    ]
+    __CUSTOM_FUNC__ = {}

     def __init__(self, task, project, custom_setting=None, **kwargs):
         super().__init__()
         self.task = task
         self.project = project
+        self.custom_func = dict()
+        self.app_time = int(time.time())

-        self.
-        self._stop = threading.Event()  # stop event
-        self._pause = threading.Event()  # pause event
+        self.check_emtpy_times = 0

         _setting = dict()

@@ -79,32 +41,39 @@ class Launcher(threading.Thread):
         for k, v in _setting.items():
             setattr(setting, k.upper(), v)

-        self.
-
+        self.before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS
+
+        self.scheduling_wait_time = setting.SCHEDULING_WAIT_TIME
+        self.inserting_wait_time = setting.INSERTING_WAIT_TIME
+        self.removing_wait_time = setting.REMOVING_WAIT_TIME
+        self.seed_reset_seconds = setting.SEED_RESET_SECONDS
+
+        self.scheduling_size = setting.SCHEDULING_SIZE
+        self.inserting_size = setting.INSERTING_SIZE
+        self.removing_size = setting.REMOVING_SIZE

-        self.
-        self.
-        self.
-        self.
-        self.
-        self.
-        self.
+        self.todo_queue_size = setting.TODO_QUEUE_SIZE
+        self.seed_queue_size = setting.SEED_QUEUE_SIZE
+        self.request_queue_size = setting.REQUEST_QUEUE_SIZE
+        self.download_queue_size = setting.DOWNLOAD_QUEUE_SIZE
+        self.response_queue_size = setting.RESPONSE_QUEUE_SIZE
+        self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
+        self.delete_queue_size = setting.DELETE_QUEUE_SIZE
+        self.done_queue_size = setting.DONE_QUEUE_SIZE

-        self.
-        self._new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
-        self._done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
-        self._upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE
+        self.spider_thread_num = setting.SPIDER_THREAD_NUM

-        self.
-        self._spider_thread_num = setting.SPIDER_THREAD_NUM
-        self._spider_time_sleep = setting.SPIDER_TIME_SLEEP
-        self._spider_max_count = setting.SPIDER_MAX_COUNT
-        self._time_window = setting.TIME_WINDOW
+        self.task_model = setting.TASK_MODEL

-        self.
-        self.
+        self.stop = threading.Event()  # stop event
+        self.pause = threading.Event()  # pause event

-        self.
+        self.crawler_path = setting.CRAWLER
+        self.pipeline_path = setting.PIPELINE
+
+        self._threads = []
+
+        self._task_info = dict(todo={}, download={})

     @property
     def request(self):
@@ -118,7 +87,7 @@ class Launcher(threading.Thread):
                 yield Request(seed.url, seed)
         """
         def decorator(func):
-            self.
+            self.custom_func['request'] = func
         return decorator

     @property
@@ -133,7 +102,7 @@ class Launcher(threading.Thread):
                 yield Response(item.seed, response)
         """
         def decorator(func):
-            self.
+            self.custom_func['download'] = func
         return decorator

     @property
@@ -148,88 +117,88 @@ class Launcher(threading.Thread):
                 yield xxxItem(seed, **kwargs)
         """
         def decorator(func):
-            self.
+            self.custom_func['parse'] = func
         return decorator

-    def
-
-
-        return seeds
+    def remove_working_items(self, key, items):
+        for item in items:
+            self._task_info[key].pop(item, None)

-    def
-
-        self.__DOING__.pop(seed, None)
-        # logger.info("remove %s seeds from __DOING__" % len(seeds))
+    def add_working_item(self, key, member, priority):
+        self._task_info[key][member] = priority

-    def
-
+    def check_alive(self):
+        while not self.stop.is_set():
+            if not self.pause.is_set():
+                for thread in self._threads:
+                    if not thread.is_alive():
+                        thread.start()
+            time.sleep(1)

-    def
-        self
+    def _add_thread(self, func, num=1, obj=None, name=None, args=()):
+        obj = obj or self
+        name = obj.__class__.__name__ + name or func.__name__
+        for i in range(num):
+            func_name = name + "_" + str(i) if num > 1 else name
+            self._threads.append(threading.Thread(name=func_name, target=func, args=(obj,) + args))
+
+    @Decorators.stop
+    def _polling(self):
+        if TaskQueue.is_empty():
+            if self.pause.is_set():
+                run_time = int(time.time()) - self.app_time
+                if not self.task_model and run_time > self.before_scheduler_wait_seconds:
+                    logger.info("Done! ready to close thread...")
+                    self.stop.set()
+                else:
+                    logger.info("pause! waiting for resume...")
+            elif self.check_emtpy_times > 2:
+                logger.info("pause! waiting for resume...")
+                self.doing_seeds = {}
+                self._task_info['todo'] = {}
+                self._task_info['download'] = {}
+                self.pause.set()
+            else:
+                logger.info(
+                    "check whether the task is complete, "
+                    f"reset times {3 - self.check_emtpy_times}"
+                )
+                self.check_emtpy_times += 1
+        elif TaskQueue.TODO.length:
+            logger.info(f"Recovery {self.task} task run!")
+            self.check_emtpy_times = 0
+            self.pause.clear()
+        else:
+            logger.info(LogTemplate.launcher_polling.format(
+                task=self.task,
+                memory_todo_count=len(self._task_info["todo"]),
+                memory_download_count=len(self._task_info["download"]),
+                todo_queue_len=TaskQueue.TODO.length,
+                delete_queue_len=TaskQueue.DELETE.length,
+                request_queue_len=TaskQueue.REQUEST.length,
+                response_queue_len=TaskQueue.RESPONSE.length,
+                done_queue_len=TaskQueue.DONE.length,
+                upload_queue_len=TaskQueue.UPLOAD.length,
+            ))
+        time.sleep(10)

-    def
-        self.
+    def run(self):
+        Crawler = dynamic_load_class(self.crawler_path)
+        Pipeline = dynamic_load_class(self.pipeline_path)

-
-        self.
+        crawler = Crawler(pause=self.pause, custom_func=self.custom_func)
+        pipeline = Pipeline(pause=self.pause)

-
-        self.
+        self._add_thread(obj=crawler, func=crawler.build_request_item)
+        self._add_thread(obj=crawler, func=crawler.build_download_item, num=self.spider_thread_num)
+        self._add_thread(obj=crawler, func=crawler.build_parse_item)
+        self._add_thread(obj=pipeline, func=pipeline.run)

-
-        for func_name in self.__LAUNCHER_FUNC__:
-            threading.Thread(name=func_name, target=getattr(self, func_name)).start()
-            time.sleep(1)
+        self._add_thread(func=self._polling)

-
-
-
-        self.start_seeds()
-
-        self._Crawler(
-            task=self.task, project=self.project,
-            stop=self._stop, pause=self._pause,
-            # launcher_queue=self.__LAUNCHER_QUEUE__,
-            get_seed=self._get_seed,
-            set_seed=self._set_seed,
-            add_seed=self._add_seed,
-            delete_seed=self._delete_seed,
-            upload_data=self._upload_data,
-            custom_func=self.__CUSTOM_FUNC__,
-            thread_num = self._spider_thread_num,
-            max_retries = self._spider_max_retries,
-            time_sleep=self._spider_time_sleep
-        ).start()
-
-        self._Pipeline(
-            stop=self._stop, pause=self._pause,
-            upload=self.__LAUNCHER_QUEUE__["upload"],
-            done=self.__LAUNCHER_QUEUE__["done"],
-            upload_size=self._upload_queue_max_size,
-            wait_seconds=self._upload_queue_wait_seconds
-        ).start()
-
-        self._execute()
-        self._polling()
-
-    def _execute_heartbeat(self):
-        pass
-
-    def _reset(self):
-        pass
-
-    def _scheduler(self):
-        pass
-
-    def _insert(self):
-        pass
-
-    def _refresh(self):
-        pass
-
-    def _delete(self):
-        pass
+        self._init_schedule_thread()
+        self.check_alive()

-    def
-
+    def _init_schedule_thread(self):
+        ...

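Note: with the Crawler and Pipeline classes now loaded dynamically from setting.CRAWLER / setting.PIPELINE, and the request/download/parse properties writing the decorated function into self.custom_func, a user script registers its hooks directly on the launcher instance. A minimal usage sketch, assuming LauncherPro accepts the same task/project arguments as the base Launcher shown above:

from cobweb.base import Seed, Request
from cobweb.launchers import LauncherPro

app = LauncherPro(task="demo_task", project="demo_project")  # extra kwargs assumed optional


@app.request
def make_request(seed: Seed):
    # Stored as custom_func["request"] and handed to the Crawler by Launcher.run();
    # the Request(seed.url, seed) form follows the docstring in the diff above.
    yield Request(seed.url, seed)


if __name__ == "__main__":
    app.start()  # Launcher subclasses threading.Thread, so start() drives run()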