cobweb-launcher 1.2.25__py3-none-any.whl → 3.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/__init__.py +4 -1
- cobweb/base/__init__.py +3 -3
- cobweb/base/common_queue.py +37 -16
- cobweb/base/item.py +35 -16
- cobweb/base/{log.py → logger.py} +3 -3
- cobweb/base/request.py +741 -54
- cobweb/base/response.py +380 -13
- cobweb/base/seed.py +96 -48
- cobweb/base/task_queue.py +180 -0
- cobweb/base/test.py +257 -0
- cobweb/constant.py +10 -1
- cobweb/crawlers/crawler.py +12 -155
- cobweb/db/api_db.py +3 -2
- cobweb/db/redis_db.py +117 -28
- cobweb/launchers/__init__.py +4 -3
- cobweb/launchers/distributor.py +141 -0
- cobweb/launchers/launcher.py +95 -157
- cobweb/launchers/uploader.py +68 -0
- cobweb/log_dots/__init__.py +2 -0
- cobweb/log_dots/dot.py +258 -0
- cobweb/log_dots/loghub_dot.py +53 -0
- cobweb/pipelines/__init__.py +1 -1
- cobweb/pipelines/pipeline.py +5 -55
- cobweb/pipelines/pipeline_csv.py +25 -0
- cobweb/pipelines/pipeline_loghub.py +32 -12
- cobweb/schedulers/__init__.py +1 -0
- cobweb/schedulers/scheduler.py +66 -0
- cobweb/schedulers/scheduler_with_redis.py +189 -0
- cobweb/setting.py +27 -40
- cobweb/utils/__init__.py +5 -3
- cobweb/utils/bloom.py +58 -58
- cobweb/{base → utils}/decorators.py +14 -12
- cobweb/utils/dotting.py +300 -0
- cobweb/utils/oss.py +113 -94
- cobweb/utils/tools.py +3 -15
- {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.18.dist-info}/METADATA +31 -43
- cobweb_launcher-3.2.18.dist-info/RECORD +44 -0
- {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.18.dist-info}/WHEEL +1 -1
- cobweb/crawlers/base_crawler.py +0 -144
- cobweb/crawlers/file_crawler.py +0 -98
- cobweb/launchers/launcher_air.py +0 -88
- cobweb/launchers/launcher_api.py +0 -221
- cobweb/launchers/launcher_pro.py +0 -222
- cobweb/pipelines/base_pipeline.py +0 -54
- cobweb/pipelines/loghub_pipeline.py +0 -34
- cobweb/pipelines/pipeline_console.py +0 -22
- cobweb_launcher-1.2.25.dist-info/RECORD +0 -40
- {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.18.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.18.dist-info}/top_level.txt +0 -0
cobweb/db/redis_db.py
CHANGED
@@ -1,64 +1,156 @@
+import os
+
+import time
 import redis
-from
+from redis.exceptions import ConnectionError, TimeoutError


 class RedisDB:
-
-
-
-
-
+    def __init__(
+            self,
+            host=None,
+            password=None,
+            port=6379, db=0
+    ):
+        self.host = host or os.getenv("REDIS_HOST", "localhost")
+        self.password = password or os.getenv("REDIS_PASSWORD")
+        self.port = port or os.getenv("REDIS_PORT", 6379)
+        self.db = db or os.getenv("REDIS_DB", 0)
+
+        self.max_retries = 5
+        self.retry_delay = 5
+        self.client = None
+        self.connect()
+
+    def connect(self):
+        retries = 0
+        while retries < self.max_retries:
+            try:
+                self.client = redis.Redis(
+                    host=self.host,
+                    port=self.port,
+                    password=self.password,
+                    db=self.db,
+                    socket_timeout=5,
+                    socket_connect_timeout=5
+                )
+                self.client.ping()
+                return
+            except (ConnectionError, TimeoutError) as e:
+                retries += 1
+                if retries < self.max_retries:
+                    time.sleep(self.retry_delay)
+                else:
+                    raise Exception("Max retries reached; unable to connect to Redis")
+
+    def is_connected(self):
+        try:
+            self.client.ping()
+            return True
+        except (ConnectionError, TimeoutError):
+            return False
+
+    def reconnect(self):
+        self.connect()
+
+    def execute_command(self, command, *args, **kwargs):
+        retries = 0
+        while retries < self.max_retries:
+            try:
+                if not self.is_connected():
+                    self.reconnect()
+                return getattr(self.client, command)(*args, **kwargs)
+            except (ConnectionError, TimeoutError) as e:
+                retries += 1
+                if retries < self.max_retries:
+                    time.sleep(self.retry_delay)
+                else:
+                    raise Exception("Max retries reached; unable to execute command")
+
+    def get(self, name):
+        # with self.get_connection() as client:
+        #     return client.get(name)
+        return self.execute_command("get", name)
+
+    def incrby(self, name, value):
+        # with self.get_connection() as client:
+        #     client.incrby(name, value)
+        self.execute_command("incrby", name, value)

     def setnx(self, name, value=""):
-
+        # with self.get_connection() as client:
+        #     client.setnx(name, value)
+        self.execute_command("setnx", name, value)

     def setex(self, name, t, value=""):
-
+        # with self.get_connection() as client:
+        #     client.setex(name, t, value)
+        self.execute_command("setex", name, t, value)

     def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
-
+        # with self.get_connection() as client:
+        #     client.expire(name, t, nx, xx, gt, lt)
+        self.execute_command("expire", name, t, nx, xx, gt, lt)

     def ttl(self, name):
-
+        # with self.get_connection() as client:
+        #     return client.ttl(name)
+        return self.execute_command("ttl", name)

     def delete(self, name):
-
+        # with self.get_connection() as client:
+        #     return client.delete(name)
+        return self.execute_command("delete", name)

     def exists(self, *name) -> bool:
-
+        # with self.get_connection() as client:
+        #     return client.exists(*name)
+        return self.execute_command("exists", *name)

     def sadd(self, name, value):
-
+        # with self.get_connection() as client:
+        #     return client.sadd(name, value)
+        return self.execute_command("sadd", name, value)

     def zcard(self, name) -> bool:
-
+        # with self.get_connection() as client:
+        #     return client.zcard(name)
+        return self.execute_command("zcard", name)

     def zadd(self, name, item: dict, **kwargs):
-
+        # with self.get_connection() as client:
+        #     return client.zadd(name, item, **kwargs)
+        if item:
+            return self.execute_command("zadd", name, item, **kwargs)

     def zrem(self, name, *value):
-
+        # with self.get_connection() as client:
+        #     return client.zrem(name, *value)
+        return self.execute_command("zrem", name, *value)

     def zcount(self, name, _min, _max):
-
+        # with self.get_connection() as client:
+        #     return client.zcount(name, _min, _max)
+        return self.execute_command("zcount", name, _min, _max)

     # def zrangebyscore(self, name, _min, _max, start, num, withscores: bool = False, *args):
-    #
+    #     with self.get_connection() as client:
+    #         return client.zrangebyscore(name, _min, _max, start, num, withscores, *args)

     def lua(self, script: str, keys: list = None, args: list = None):
         keys = keys or []
         args = args or []
         keys_count = len(keys)
-        return self.
+        return self.execute_command("eval", script, keys_count, *keys, *args)

     def lua_sha(self, sha1: str, keys: list = None, args: list = None):
         keys = keys or []
         args = args or []
         keys_count = len(keys)
-        return self.
+        return self.execute_command("evalsha", sha1, keys_count, *keys, *args)

     def execute_lua(self, lua_script: str, keys: list, *args):
-        execute = self.
+        execute = self.execute_command("register_script", lua_script)
         return execute(keys=keys, args=args)

     def lock(self, key, t=15) -> bool:
@@ -72,7 +164,7 @@ class RedisDB:
         status = self.execute_lua(lua_script, [key], t)
         return bool(status)

-    def members(self, key, score, start=0, count=
+    def members(self, key, score, start=0, count=1000, _min="-inf", _max="+inf") -> list:
         lua_script = """
            local min = ARGV[1]
            local max = ARGV[2]
@@ -86,7 +178,7 @@ class RedisDB:
            else
                members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)
            end
-
+
            local result = {}

            for i = 1, #members, 2 do
@@ -98,7 +190,7 @@ class RedisDB:
                else
                    originPriority = math.floor(members[i+1])
                end
-
+
                if ( score + 0 >= 1000 ) then
                    priority = -score - originPriority / 1000
                elseif ( score + 0 == 0 ) then
@@ -117,7 +209,7 @@ class RedisDB:
         members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
         return [(members[i].decode(), int(members[i + 1])) for i in range(0, len(members), 2)]

-    def done(self, keys: list, *args)
+    def done(self, keys: list, *args):
         lua_script = """
            for i, member in ipairs(ARGV) do
                redis.call("zrem", KEYS[1], member)
@@ -125,6 +217,3 @@ class RedisDB:
            end
         """
         self.execute_lua(lua_script, keys, *args)
-
-
-
cobweb/launchers/__init__.py
CHANGED
@@ -1,3 +1,4 @@
-from .
-from .
-from .
+from .launcher import Launcher
+from .uploader import Uploader
+from .distributor import Distributor
+
cobweb/launchers/distributor.py
ADDED
@@ -0,0 +1,141 @@
+import time
+import threading
+import traceback
+
+from inspect import isgenerator
+from typing import Callable, Type
+from requests import RequestException
+
+from cobweb.crawlers import Crawler
+from cobweb.utils import check_pause
+from cobweb.log_dots import LoghubDot
+from cobweb.constant import DealModel, LogTemplate
+from cobweb.base import Seed, Status, TaskQueue, BaseItem, Request, Response, logger
+
+
+class Distributor(threading.Thread):
+
+    def __init__(
+            self,
+            task: str,
+            project: str,
+            task_queue: TaskQueue,
+            stop: threading.Event,
+            pause: threading.Event,
+            callback_register: Callable,
+            SpiderCrawler: Type[Crawler]
+    ):
+        super().__init__()
+        self.task = task
+        self.project = project
+        self.pause = pause
+
+        self.task_queue = task_queue
+
+        self.callback_register = callback_register
+        self.Crawler = SpiderCrawler
+
+        from cobweb import setting
+        self.time_sleep = setting.SPIDER_TIME_SLEEP
+        self.thread_num = setting.SPIDER_THREAD_NUM
+        self.max_retries = setting.SPIDER_MAX_RETRIES
+        self.loghub_dot = LoghubDot(stop=stop, project=self.project, task=self.task)
+
+        logger.debug(f"Distribute instance attrs: {self.__dict__}")
+
+    def distribute(self, task_id, item, status: Status):
+        if isinstance(item, Request):
+            item.seed.params.request_time = time.time()
+            self.loghub_dot._build_request_log(item)
+            self.process(task_id=task_id, item=item, callback=self.Crawler.download, status=Status.PROCESSING)
+
+        elif isinstance(item, Response):
+            if status == Status.FINISHED:
+                raise TypeError("parse function can't yield a Response instance")
+            item.seed.params.download_time = time.time()
+            logger.debug(LogTemplate.download_info.format(
+                detail=LogTemplate.log_info(item.seed.to_dict),
+                retry=item.seed.params.retry,
+                priority=item.seed.params.priority,
+                seed_version=item.seed.params.seed_version,
+                identifier=item.seed.identifier or "",
+                status=item.response,
+                response=LogTemplate.log_info(item.to_dict)
+            ))
+            self.loghub_dot._build_download_log(item)
+            self.process(task_id=task_id, item=item, callback=self.Crawler.parse, status=Status.FINISHED)
+
+        elif isinstance(item, BaseItem):
+            item.seed.params.parse_time = time.time()
+            self.loghub_dot._build_parse_log(item)
+            self.task_queue.add_task(data=item, status=Status.UPLOAD, parent_id=task_id)
+
+        elif isinstance(item, Seed):
+            # todo: new-seed log
+            item.seed.params.insert_time = time.time()
+            self.task_queue.add_task(
+                task_id=item.sid, data=item, status=Status.INSERT,
+                priority=item.params.priority, parent_id=task_id
+            )
+
+        elif isinstance(item, str) and item != DealModel.done:
+            raise TypeError("yield value type error!")
+
+    def process(self, task_id, item, callback, status: Status):
+        iterators = callback(item)
+        if not isgenerator(iterators):
+            raise TypeError(f"{callback.__name__} function isn't a generator!")
+        for it in iterators:
+            self.distribute(task_id=task_id, item=it, status=status)
+
+    @check_pause
+    def spider(self):
+        if task_item := self.task_queue.get_pending_task():
+            finsh_status = True
+            seed = task_item.data
+            status = Status.FINISHED
+            task_id = task_item.task_id
+            seed.params.start_time = time.time()
+
+            if seed.params.retry and isinstance(seed.params.retry, int):
+                time.sleep(self.time_sleep * seed.params.retry / 100)
+
+            try:
+                self.process(task_id=task_id, item=seed, callback=self.Crawler.request, status=Status.PENDING)
+            except Exception as e:
+                seed.params.retry += 1
+                seed.params.failed_time = time.time()
+                msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                if not seed.params.msg:
+                    seed.params.traceback = [msg]
+                elif isinstance(seed.params.msg, list):
+                    seed.params.traceback.append(msg)
+
+                if isinstance(e, RequestException):
+                    self.loghub_dot._build_http_error_log(seed, e)
+                else:
+                    self.loghub_dot._build_exception_log(seed, e)
+
+                if seed.params.retry < self.max_retries:
+                    status = Status.PENDING
+                    finsh_status = False
+
+                logger.info(LogTemplate.download_exception.format(
+                    detail=LogTemplate.log_info(seed.to_dict),
+                    retry=seed.params.retry,
+                    priority=seed.params.priority,
+                    seed_version=seed.params.seed_version,
+                    identifier=seed.identifier or "",
+                    exception=msg
+                ))
+
+            finally:
+                if finsh_status:
+                    seed.params.finsh_time = time.time()
+                self.loghub_dot._build_finish_log(seed, status=bool(seed.params.retry < self.max_retries))
+                self.task_queue.update_task(task_id, status=status, data=seed)
+
+    def run(self):
+        self.callback_register(self.loghub_dot._build_run, tag="LoghubDot")
+        for _ in range(self.thread_num):
+            self.callback_register(self.spider, tag="Distributor")
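`Distributor.process` requires every crawler callback to be a generator and feeds whatever it yields back into `distribute`, which routes purely by type: `Request` to `download`, `Response` to `parse`, `BaseItem` to the upload queue, `Seed` back to insertion. Here is a runnable sketch of that mutual-recursion loop, with hypothetical stand-in classes in place of cobweb's real types:

from dataclasses import dataclass
from inspect import isgenerator

# Hypothetical stand-ins for cobweb.base.Request / Response / BaseItem.
@dataclass
class Request:
    url: str

@dataclass
class Response:
    body: str

@dataclass
class Item:
    data: str

def download(request):
    # Stage callback: must be a generator, like Crawler.download.
    yield Response(body=f"<html>{request.url}</html>")

def parse(response):
    # Stage callback: must be a generator, like Crawler.parse.
    yield Item(data=response.body.upper())

def process(item, callback):
    # Mirrors Distributor.process: reject non-generator callbacks,
    # then route every yielded object back through distribute().
    iterators = callback(item)
    if not isgenerator(iterators):
        raise TypeError(f"{callback.__name__} function isn't a generator!")
    for it in iterators:
        distribute(it)

def distribute(item):
    # Mirrors Distributor.distribute: dispatch on the yielded type.
    if isinstance(item, Request):
        process(item, download)   # Request  -> download stage
    elif isinstance(item, Response):
        process(item, parse)      # Response -> parse stage
    elif isinstance(item, Item):
        print("upload:", item)    # BaseItem -> upload task queue

distribute(Request(url="https://example.com"))

Because each stage yields rather than returns, a single request can fan out into any number of responses, items, and new seeds without the stages needing to know about one another.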
cobweb/launchers/launcher.py
CHANGED
@@ -1,57 +1,25 @@
 import time
+import uuid
 import inspect
 import threading
 import importlib
-from functools import wraps

 from cobweb import setting
-from cobweb.
+from cobweb.launchers.uploader import Uploader
 from cobweb.utils.tools import dynamic_load_class
+from cobweb.launchers.distributor import Distributor
+from cobweb.base import Seed, logger, TaskQueue, Status
+from typing import Optional, Union, Dict, Any, Callable


-
-    @wraps(func)
-    def wrapper(self, *args, **kwargs):
-        while not self._pause.is_set():
-            try:
-                func(self, *args, **kwargs)
-            except Exception as e:
-                logger.info(f"{func.__name__}: " + str(e))
-            finally:
-                time.sleep(0.1)
+class Launcher:

-
-
-
-class Launcher(threading.Thread):
-
-    SEEDS = []
-
-    __DOING__ = {}
-
-    __CUSTOM_FUNC__ = {
-        # "download": None,
-        # "request": None,
-        # "parse": None,
-    }
-
-    __LAUNCHER_QUEUE__ = {
-        "new": Queue(),
-        "todo": Queue(),
-        "done": Queue(),
-        "upload": Queue()
-    }
-
-    __LAUNCHER_FUNC__ = [
-        "_reset",
-        "_scheduler",
-        "_insert",
-        "_refresh",
-        "_delete",
-    ]
+    __REGISTER_FUNC__: Dict[str, Callable] = {}
+    __WORKER_THREAD__: Dict[str, threading.Thread] = {}

     def __init__(self, task, project, custom_setting=None, **kwargs):
         super().__init__()
+
         self.task = task
         self.project = project

@@ -59,51 +27,37 @@ class Launcher(threading.Thread):
         self._stop = threading.Event()  # stop event
         self._pause = threading.Event()  # pause event

-        _setting =
+        _setting = self._load_custom_settings(custom_setting)

-
-
-
-        else:
-            if isinstance(custom_setting, str):
-                custom_setting = importlib.import_module(custom_setting)
-            if not inspect.ismodule(custom_setting):
-                raise Exception
-            for k, v in custom_setting.__dict__.items():
-                if not k.startswith("__") and not inspect.ismodule(v):
-                    _setting[k] = v
-
-        _setting.update(**kwargs)
-
-        for k, v in _setting.items():
-            setattr(setting, k.upper(), v)
-
-        self._Crawler = dynamic_load_class(setting.CRAWLER)
-        self._Pipeline = dynamic_load_class(setting.PIPELINE)
-
-        self._before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS
-        self._scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
-        self._todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
-        self._new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
-        self._done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
-        self._upload_queue_wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
-        self._seed_reset_seconds = setting.SEED_RESET_SECONDS
-
-        self._todo_queue_size = setting.TODO_QUEUE_SIZE
-        self._new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
-        self._done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
-        self._upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE
-
-        self._spider_max_retries = setting.SPIDER_MAX_RETRIES
-        self._spider_thread_num = setting.SPIDER_THREAD_NUM
-        self._spider_time_sleep = setting.SPIDER_TIME_SLEEP
-        self._spider_max_count = setting.SPIDER_MAX_COUNT
-        self._time_window = setting.TIME_WINDOW
+        _setting.update(kwargs)
+        for key, value in _setting.items():
+            setattr(setting, key.upper(), value)

         self._done_model = setting.DONE_MODEL
         self._task_model = setting.TASK_MODEL

-        self.
+        self._task_queue = TaskQueue()
+
+        self.Scheduler = dynamic_load_class(setting.SCHEDULER)
+        self.SpiderCrawler = dynamic_load_class(setting.CRAWLER)
+        self.SpiderPipeline = dynamic_load_class(setting.PIPELINE)
+
+    @staticmethod
+    def _load_custom_settings(custom_setting: Optional[Union[str, Dict]]) -> Dict[str, Any]:
+        _setting = {}
+        if custom_setting:
+            if isinstance(custom_setting, dict):
+                _setting = custom_setting
+            elif isinstance(custom_setting, str):
+                module = importlib.import_module(custom_setting)
+                _setting = {
+                    k: v
+                    for k, v in module.__dict__.items()
+                    if not k.startswith("__") and not inspect.ismodule(v)
+                }
+            else:
+                raise ValueError("custom_setting must be a dictionary or a module path.")
+        return _setting

     @property
     def request(self):
@@ -117,7 +71,7 @@ class Launcher(threading.Thread):
             yield Request(seed.url, seed)
         """
         def decorator(func):
-            self.
+            self.SpiderCrawler.request = func
         return decorator

     @property
@@ -132,7 +86,7 @@ class Launcher(threading.Thread):
             yield Response(item.seed, response)
         """
         def decorator(func):
-            self.
+            self.SpiderCrawler.download = func
         return decorator

     @property
@@ -147,87 +101,71 @@ class Launcher(threading.Thread):
             yield xxxItem(seed, **kwargs)
         """
         def decorator(func):
-            self.
+            self.SpiderCrawler.parse = func
         return decorator

-    def start_seeds(self):
-
-
-
-
-
-
-
-
-
-
-        return
-
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def start_seeds(self, seeds: list[Union[str, Dict]]) -> list[Seed]:
+        seed_list = [Seed(seed) for seed in seeds]
+        for seed in seed_list:
+            self._task_queue.add_task(
+                task_id=seed.sid,
+                data=seed,
+                status=Status.PENDING,
+                priority=seed.params.priority,
+                parent_id=None,
+                ttl_seconds=None
+            )
+        return seed_list
+
+    def _register(self, func: Callable, tag: str = "launcher"):
+        name = f"{tag}:{func.__name__}_{uuid.uuid4()}"
+        self.__REGISTER_FUNC__[name] = func
+        if not self.__WORKER_THREAD__.get(name):
+            worker_thread = threading.Thread(name=name, target=func)
+            self.__WORKER_THREAD__[name] = worker_thread
+
+    def _monitor(self):
+        while not self._stop.is_set():
+            if not self._pause.is_set():
+                for name, worker_thread in list(self.__WORKER_THREAD__.items()):
+                    if not worker_thread.is_alive():
+                        logger.debug(f"{name} thread is dead. Restarting...")
+                        func = self.__REGISTER_FUNC__[name]
+                        worker_thread = threading.Thread(name=name, target=func)
+                        self.__WORKER_THREAD__[name] = worker_thread
+                        worker_thread.start()
+            time.sleep(15)
+        logger.info("monitor thread close!")
+
+    def start(self):
+        self._pause.is_set()
+
+        self.Scheduler(
+            task=self.task,
+            project=self.project,
+            stop=self._stop,
+            pause=self._pause,
+            task_queue=self._task_queue,
+            callback_register=self._register
+        ).start()

-
+        Distributor(
+            task=self.task,
+            project=self.project,
+            task_queue=self._task_queue,
+            callback_register=self._register,
             stop=self._stop, pause=self._pause,
-
-            get_seed=self._get_seed,
-            set_seed=self._set_seed,
-            add_seed=self._add_seed,
-            delete_seed=self._delete_seed,
-            upload_data=self._upload_data,
-            custom_func=self.__CUSTOM_FUNC__,
-            thread_num = self._spider_thread_num,
-            max_retries = self._spider_max_retries,
-            time_sleep=self._spider_time_sleep
+            SpiderCrawler=self.SpiderCrawler
         ).start()

-
+        Uploader(
+            task=self.task, project=self.project,
             stop=self._stop, pause=self._pause,
-
-
-
-            wait_seconds=self._upload_queue_wait_seconds
+            task_queue=self._task_queue,
+            callback_register=self._register,
+            SpiderPipeline=self.SpiderPipeline
         ).start()

-        self.
-
-
-    def _execute_heartbeat(self):
-        pass
-
-    def _reset(self):
-        pass
-
-    def _scheduler(self):
-        pass
-
-    def _insert(self):
-        pass
-
-    def _refresh(self):
-        pass
-
-    def _delete(self):
-        pass
-
-    def _polling(self):
-        pass
+        self._monitor()
+        logger.info("task done!")

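The new `Launcher` drops the old fixed `__LAUNCHER_FUNC__` list in favor of a registry of named callables plus `_monitor`, a supervision loop that replaces any worker thread that is no longer alive. Note that `_register` only creates the thread object; since a never-started thread reports `is_alive() == False`, the first monitor sweep is what actually starts each worker. A condensed, runnable sketch of that supervision pattern (hypothetical worker, shorter sweep interval, and a stop event added so the demo terminates):

import threading
import time
import uuid

register_func = {}     # name -> callable (mirrors __REGISTER_FUNC__)
worker_thread = {}     # name -> Thread   (mirrors __WORKER_THREAD__)
stop = threading.Event()

def register(func, tag="launcher"):
    # Mirrors Launcher._register: record the callable and create, but do
    # not start, its thread; the monitor sweep below brings it to life.
    name = f"{tag}:{func.__name__}_{uuid.uuid4()}"
    register_func[name] = func
    worker_thread[name] = threading.Thread(name=name, target=func)

def short_lived_worker():
    # Hypothetical worker: returns immediately, so its thread "dies"
    # and the monitor must restart it on the next sweep.
    print("worker tick")

def monitor():
    # Mirrors Launcher._monitor: rebuild and restart any dead thread.
    while not stop.is_set():
        for name, thread in list(worker_thread.items()):
            if not thread.is_alive():
                thread = threading.Thread(name=name, target=register_func[name])
                worker_thread[name] = thread
                thread.start()
        time.sleep(0.2)  # the real Launcher sleeps 15s between sweeps

register(short_lived_worker, tag="demo")
threading.Thread(target=monitor, daemon=True).start()
time.sleep(1)   # observe several restarts
stop.set()

Registering callables rather than long-lived thread objects means a crashed worker costs only one missed sweep: the monitor rebuilds the `Thread` from the registered function instead of the launcher having to know why or where it died.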