cobweb-launcher 1.0.5-py3-none-any.whl → 3.2.18-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/__init__.py +5 -1
- cobweb/base/__init__.py +3 -3
- cobweb/base/common_queue.py +37 -16
- cobweb/base/item.py +40 -14
- cobweb/base/{log.py → logger.py} +3 -3
- cobweb/base/request.py +744 -47
- cobweb/base/response.py +381 -13
- cobweb/base/seed.py +98 -50
- cobweb/base/task_queue.py +180 -0
- cobweb/base/test.py +257 -0
- cobweb/constant.py +39 -2
- cobweb/crawlers/__init__.py +1 -2
- cobweb/crawlers/crawler.py +27 -0
- cobweb/db/__init__.py +1 -0
- cobweb/db/api_db.py +83 -0
- cobweb/db/redis_db.py +118 -27
- cobweb/launchers/__init__.py +3 -1
- cobweb/launchers/distributor.py +141 -0
- cobweb/launchers/launcher.py +103 -130
- cobweb/launchers/uploader.py +68 -0
- cobweb/log_dots/__init__.py +2 -0
- cobweb/log_dots/dot.py +258 -0
- cobweb/log_dots/loghub_dot.py +53 -0
- cobweb/pipelines/__init__.py +3 -2
- cobweb/pipelines/pipeline.py +19 -0
- cobweb/pipelines/pipeline_csv.py +25 -0
- cobweb/pipelines/pipeline_loghub.py +54 -0
- cobweb/schedulers/__init__.py +1 -0
- cobweb/schedulers/scheduler.py +66 -0
- cobweb/schedulers/scheduler_with_redis.py +189 -0
- cobweb/setting.py +37 -38
- cobweb/utils/__init__.py +5 -2
- cobweb/utils/bloom.py +58 -0
- cobweb/{base → utils}/decorators.py +14 -12
- cobweb/utils/dotting.py +300 -0
- cobweb/utils/oss.py +113 -86
- cobweb/utils/tools.py +3 -15
- cobweb_launcher-3.2.18.dist-info/METADATA +193 -0
- cobweb_launcher-3.2.18.dist-info/RECORD +44 -0
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/WHEEL +1 -1
- cobweb/crawlers/base_crawler.py +0 -121
- cobweb/crawlers/file_crawler.py +0 -181
- cobweb/launchers/launcher_pro.py +0 -174
- cobweb/pipelines/base_pipeline.py +0 -54
- cobweb/pipelines/loghub_pipeline.py +0 -34
- cobweb_launcher-1.0.5.dist-info/METADATA +0 -48
- cobweb_launcher-1.0.5.dist-info/RECORD +0 -32
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/top_level.txt +0 -0
cobweb/db/redis_db.py
CHANGED
@@ -1,68 +1,161 @@
+import os
+
+import time
 import redis
-from
+from redis.exceptions import ConnectionError, TimeoutError


 class RedisDB:
-
-
-
-
+    def __init__(
+            self,
+            host=None,
+            password=None,
+            port=6379, db=0
+    ):
+        self.host = host or os.getenv("REDIS_HOST", "localhost")
+        self.password = password or os.getenv("REDIS_PASSWORD")
+        self.port = port or os.getenv("REDIS_PORT", 6379)
+        self.db = db or os.getenv("REDIS_DB", 0)
+
+        self.max_retries = 5
+        self.retry_delay = 5
+        self.client = None
+        self.connect()
+
+    def connect(self):
+        retries = 0
+        while retries < self.max_retries:
+            try:
+                self.client = redis.Redis(
+                    host=self.host,
+                    port=self.port,
+                    password=self.password,
+                    db=self.db,
+                    socket_timeout=5,
+                    socket_connect_timeout=5
+                )
+                self.client.ping()
+                return
+            except (ConnectionError, TimeoutError) as e:
+                retries += 1
+                if retries < self.max_retries:
+                    time.sleep(self.retry_delay)
+                else:
+                    raise Exception("达到最大重试次数,无法连接 Redis")
+
+    def is_connected(self):
+        try:
+            self.client.ping()
+            return True
+        except (ConnectionError, TimeoutError):
+            return False
+
+    def reconnect(self):
+        self.connect()
+
+    def execute_command(self, command, *args, **kwargs):
+        retries = 0
+        while retries < self.max_retries:
+            try:
+                if not self.is_connected():
+                    self.reconnect()
+                return getattr(self.client, command)(*args, **kwargs)
+            except (ConnectionError, TimeoutError) as e:
+                retries += 1
+                if retries < self.max_retries:
+                    time.sleep(self.retry_delay)
+                else:
+                    raise Exception("达到最大重试次数,无法执行命令")
+
+    def get(self, name):
+        # with self.get_connection() as client:
+        #     return client.get(name)
+        return self.execute_command("get", name)
+
+    def incrby(self, name, value):
+        # with self.get_connection() as client:
+        #     client.incrby(name, value)
+        self.execute_command("incrby", name, value)

     def setnx(self, name, value=""):
-
+        # with self.get_connection() as client:
+        #     client.setnx(name, value)
+        self.execute_command("setnx", name, value)

     def setex(self, name, t, value=""):
-
+        # with self.get_connection() as client:
+        #     client.setex(name, t, value)
+        self.execute_command("setex", name, t, value)

     def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
-
+        # with self.get_connection() as client:
+        #     client.expire(name, t, nx, xx, gt, lt)
+        self.execute_command("expire", name, t, nx, xx, gt, lt)

     def ttl(self, name):
-
+        # with self.get_connection() as client:
+        #     return client.ttl(name)
+        return self.execute_command("ttl", name)

     def delete(self, name):
-
+        # with self.get_connection() as client:
+        #     return client.delete(name)
+        return self.execute_command("delete", name)

     def exists(self, *name) -> bool:
-
+        # with self.get_connection() as client:
+        #     return client.exists(*name)
+        return self.execute_command("exists", *name)

     def sadd(self, name, value):
-
+        # with self.get_connection() as client:
+        #     return client.sadd(name, value)
+        return self.execute_command("sadd", name, value)

     def zcard(self, name) -> bool:
-
+        # with self.get_connection() as client:
+        #     return client.zcard(name)
+        return self.execute_command("zcard", name)

     def zadd(self, name, item: dict, **kwargs):
-
+        # with self.get_connection() as client:
+        #     return client.zadd(name, item, **kwargs)
+        if item:
+            return self.execute_command("zadd", name, item, **kwargs)

     def zrem(self, name, *value):
-
+        # with self.get_connection() as client:
+        #     return client.zrem(name, *value)
+        return self.execute_command("zrem", name, *value)

     def zcount(self, name, _min, _max):
-
+        # with self.get_connection() as client:
+        #     return client.zcount(name, _min, _max)
+        return self.execute_command("zcount", name, _min, _max)

     # def zrangebyscore(self, name, _min, _max, start, num, withscores: bool = False, *args):
-    #
+    #     with self.get_connection() as client:
+    #         return client.zrangebyscore(name, _min, _max, start, num, withscores, *args)

     def lua(self, script: str, keys: list = None, args: list = None):
         keys = keys or []
         args = args or []
         keys_count = len(keys)
-        return self.
+        return self.execute_command("eval", script, keys_count, *keys, *args)

     def lua_sha(self, sha1: str, keys: list = None, args: list = None):
         keys = keys or []
         args = args or []
         keys_count = len(keys)
-        return self.
+        return self.execute_command("evalsha", sha1, keys_count, *keys, *args)

     def execute_lua(self, lua_script: str, keys: list, *args):
-        execute = self.
+        execute = self.execute_command("register_script", lua_script)
         return execute(keys=keys, args=args)

     def lock(self, key, t=15) -> bool:
         lua_script = """
-            local status = redis.call('setnx', KEYS[1],
+            local status = redis.call('setnx', KEYS[1], 1)
             if ( status == 1 ) then
                 redis.call('expire', KEYS[1], ARGV[1])
             end
@@ -71,7 +164,7 @@ class RedisDB:
         status = self.execute_lua(lua_script, [key], t)
         return bool(status)

-    def members(self, key, score, start=0, count=
+    def members(self, key, score, start=0, count=1000, _min="-inf", _max="+inf") -> list:
         lua_script = """
             local min = ARGV[1]
             local max = ARGV[2]
@@ -85,7 +178,7 @@ class RedisDB:
             else
                 members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)
             end
-
+
             local result = {}

             for i = 1, #members, 2 do
@@ -97,7 +190,7 @@ class RedisDB:
             else
                 originPriority = math.floor(members[i+1])
             end
-
+
             if ( score + 0 >= 1000 ) then
                 priority = -score - originPriority / 1000
             elseif ( score + 0 == 0 ) then
@@ -116,7 +209,7 @@ class RedisDB:
         members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
         return [(members[i].decode(), int(members[i + 1])) for i in range(0, len(members), 2)]

-    def done(self, keys: list, *args)
+    def done(self, keys: list, *args):
         lua_script = """
             for i, member in ipairs(ARGV) do
                 redis.call("zrem", KEYS[1], member)
@@ -124,5 +217,3 @@ class RedisDB:
             end
         """
         self.execute_lua(lua_script, keys, *args)
-
-
cobweb/launchers/distributor.py
ADDED
@@ -0,0 +1,141 @@
+import time
+import threading
+import traceback
+
+from inspect import isgenerator
+from typing import Callable, Type
+from requests import RequestException
+
+from cobweb.crawlers import Crawler
+from cobweb.utils import check_pause
+from cobweb.log_dots import LoghubDot
+from cobweb.constant import DealModel, LogTemplate
+from cobweb.base import Seed, Status, TaskQueue, BaseItem, Request, Response, logger
+
+
+class Distributor(threading.Thread):
+
+    def __init__(
+            self,
+            task: str,
+            project: str,
+            task_queue: TaskQueue,
+            stop: threading.Event,
+            pause: threading.Event,
+            callback_register: Callable,
+            SpiderCrawler: Type[Crawler]
+    ):
+        super().__init__()
+        self.task = task
+        self.project = project
+        self.pause = pause
+
+        self.task_queue = task_queue
+
+        self.callback_register = callback_register
+        self.Crawler = SpiderCrawler
+
+        from cobweb import setting
+        self.time_sleep = setting.SPIDER_TIME_SLEEP
+        self.thread_num = setting.SPIDER_THREAD_NUM
+        self.max_retries = setting.SPIDER_MAX_RETRIES
+        self.loghub_dot = LoghubDot(stop=stop, project=self.project, task=self.task)
+
+        logger.debug(f"Distribute instance attrs: {self.__dict__}")
+
+    def distribute(self, task_id, item, status: Status):
+        if isinstance(item, Request):
+            item.seed.params.request_time = time.time()
+            self.loghub_dot._build_request_log(item)
+            self.process(task_id=task_id, item=item, callback=self.Crawler.download, status=Status.PROCESSING)
+
+        elif isinstance(item, Response):
+            if status == Status.FINISHED:
+                raise TypeError("parse function can't yield a Response instance")
+            item.seed.params.download_time = time.time()
+            logger.debug(LogTemplate.download_info.format(
+                detail=LogTemplate.log_info(item.seed.to_dict),
+                retry=item.seed.params.retry,
+                priority=item.seed.params.priority,
+                seed_version=item.seed.params.seed_version,
+                identifier=item.seed.identifier or "",
+                status=item.response,
+                response=LogTemplate.log_info(item.to_dict)
+            ))
+            self.loghub_dot._build_download_log(item)
+            self.process(task_id=task_id, item=item, callback=self.Crawler.parse, status=Status.FINISHED)
+
+        elif isinstance(item, BaseItem):
+            item.seed.params.parse_time = time.time()
+            self.loghub_dot._build_parse_log(item)
+            self.task_queue.add_task(data=item, status=Status.UPLOAD, parent_id=task_id)
+
+        elif isinstance(item, Seed):
+            # todo: 新种子日志
+            item.seed.params.insert_time = time.time()
+            self.task_queue.add_task(
+                task_id=item.sid, data=item, status=Status.INSERT,
+                priority=item.params.priority, parent_id=task_id
+            )
+
+        elif isinstance(item, str) and item != DealModel.done:
+            raise TypeError("yield value type error!")
+
+    def process(self, task_id, item, callback, status: Status):
+        iterators = callback(item)
+        if not isgenerator(iterators):
+            raise TypeError(f"{callback.__name__} function isn't a generator!")
+        for it in iterators:
+            self.distribute(task_id=task_id, item=it, status=status)
+
+    @check_pause
+    def spider(self):
+        if task_item := self.task_queue.get_pending_task():
+            finsh_status = True
+            seed = task_item.data
+            status = Status.FINISHED
+            task_id = task_item.task_id
+            seed.params.start_time = time.time()
+
+            if seed.params.retry and isinstance(seed.params.retry, int):
+                time.sleep(self.time_sleep * seed.params.retry / 100)
+
+            try:
+                self.process(task_id=task_id, item=seed, callback=self.Crawler.request, status=Status.PENDING)
+            except Exception as e:
+                seed.params.retry += 1
+                seed.params.failed_time = time.time()
+                msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                if not seed.params.msg:
+                    seed.params.traceback = [msg]
+                elif isinstance(seed.params.msg, list):
+                    seed.params.traceback.append(msg)
+
+                if isinstance(e, RequestException):
+                    self.loghub_dot._build_http_error_log(seed, e)
+                else:
+                    self.loghub_dot._build_exception_log(seed, e)
+
+                if seed.params.retry < self.max_retries:
+                    status = Status.PENDING
+                    finsh_status = False
+
+                logger.info(LogTemplate.download_exception.format(
+                    detail=LogTemplate.log_info(seed.to_dict),
+                    retry=seed.params.retry,
+                    priority=seed.params.priority,
+                    seed_version=seed.params.seed_version,
+                    identifier=seed.identifier or "",
+                    exception=msg
+                ))
+
+            finally:
+                if finsh_status:
+                    seed.params.finsh_time = time.time()
+                self.loghub_dot._build_finish_log(seed, status=bool(seed.params.retry < self.max_retries))
+                self.task_queue.update_task(task_id, status=status, data=seed)
+
+    def run(self):
+        self.callback_register(self.loghub_dot._build_run, tag="LoghubDot")
+        for _ in range(self.thread_num):
+            self.callback_register(self.spider, tag="Distributor")
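distribute() routes purely on the type of each yielded object: a Request is fed back through Crawler.download, a Response through Crawler.parse, a BaseItem is queued with Status.UPLOAD for the uploader, and a new Seed re-enters the task queue as an INSERT task; every callback must be a generator or process() raises TypeError. A minimal sketch of callbacks that satisfy this contract, using the constructor forms shown in the launcher docstrings below (the requests.get call and the DemoItem subclass are illustrative assumptions, not part of the package):

    import requests

    from cobweb.base import Seed, BaseItem, Request, Response


    class DemoItem(BaseItem):
        """Hypothetical BaseItem subclass; real projects declare their own fields."""


    def request(seed: Seed):
        # A yielded Request is re-dispatched to Crawler.download (Status.PROCESSING).
        yield Request(seed.url, seed)


    def download(item: Request):
        # A yielded Response is re-dispatched to Crawler.parse (Status.FINISHED).
        yield Response(item.seed, requests.get(item.seed.url, timeout=10))


    def parse(item: Response):
        # A yielded BaseItem is queued with Status.UPLOAD for the Uploader thread.
        yield DemoItem(item.seed, text=item.response.text)

Such functions end up on the Crawler class either by subclassing Crawler or through the @launcher.request, @launcher.download and @launcher.parse decorators shown in launcher.py below.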
cobweb/launchers/launcher.py
CHANGED
@@ -1,84 +1,63 @@
 import time
+import uuid
 import inspect
 import threading
 import importlib

 from cobweb import setting
-from cobweb.
+from cobweb.launchers.uploader import Uploader
 from cobweb.utils.tools import dynamic_load_class
+from cobweb.launchers.distributor import Distributor
+from cobweb.base import Seed, logger, TaskQueue, Status
+from typing import Optional, Union, Dict, Any, Callable


-class Launcher
+class Launcher:

-
+    __REGISTER_FUNC__: Dict[str, Callable] = {}
+    __WORKER_THREAD__: Dict[str, threading.Thread] = {}

-
-
-    __CUSTOM_FUNC__ = {
-        "download": None,
-        "download_midware": None,
-        "parse": None,
-    }
-
-    __LAUNCHER_QUEUE__ = {
-        "new": Queue(),
-        "todo": Queue(),
-        "done": Queue(),
-    }
-
-    __LAUNCHER_FUNC__ = [
-        "_reset",
-        "_scheduler",
-        "_insert",
-        "_refresh",
-        "_delete",
-    ]
-
-    def __init__(self, task, project, custom_setting=None):
+    def __init__(self, task, project, custom_setting=None, **kwargs):
         super().__init__()
+
         self.task = task
         self.project = project

+        self._app_time = int(time.time())
         self._stop = threading.Event()  # 结束事件
         self._pause = threading.Event()  # 暂停事件

-
-
-
-
-
-        if isinstance(custom_setting, str):
-            custom_setting = importlib.import_module(custom_setting)
-            if not inspect.ismodule(custom_setting):
-                raise Exception
-            for k, v in custom_setting.__dict__.items():
-                if not k.startswith("__") and not inspect.ismodule(v):
-                    setting_[k] = v
-        for k, v in setting_.items():
-            setattr(setting, k, v)
-
-        self._Crawler = dynamic_load_class(setting.CRAWLER)
-        self._Pipeline = dynamic_load_class(setting.PIPELINE)
-
-        self._scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
-        self._todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
-        self._new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
-        self._done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
-        self._upload_queue_wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
-        self._seed_reset_seconds = setting.SEED_RESET_SECONDS
-
-        self._todo_queue_size = setting.TODO_QUEUE_SIZE
-        self._new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
-        self._done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
-        self._upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE
+        _setting = self._load_custom_settings(custom_setting)
+
+        _setting.update(kwargs)
+        for key, value in _setting.items():
+            setattr(setting, key.upper(), value)

         self._done_model = setting.DONE_MODEL
+        self._task_model = setting.TASK_MODEL

-        self.
+        self._task_queue = TaskQueue()

-
-
-
+        self.Scheduler = dynamic_load_class(setting.SCHEDULER)
+        self.SpiderCrawler = dynamic_load_class(setting.CRAWLER)
+        self.SpiderPipeline = dynamic_load_class(setting.PIPELINE)
+
+    @staticmethod
+    def _load_custom_settings(custom_setting: Optional[Union[str, Dict]]) -> Dict[str, Any]:
+        _setting = {}
+        if custom_setting:
+            if isinstance(custom_setting, dict):
+                _setting = custom_setting
+            elif isinstance(custom_setting, str):
+                module = importlib.import_module(custom_setting)
+                _setting = {
+                    k: v
+                    for k, v in module.__dict__.items()
+                    if not k.startswith("__") and not inspect.ismodule(v)
+                }
+            else:
+                raise ValueError("custom_setting must be a dictionary or a module path.")
+        return _setting

     @property
     def request(self):
@@ -89,10 +68,10 @@ class Launcher(threading.Thread):
            @launcher.request
            def request(seed: Seed) -> Union[Request, BaseItem]:
                ...
-
+               yield Request(seed.url, seed)
        """
        def decorator(func):
-           self.
+           self.SpiderCrawler.request = func
        return decorator

    @property
@@ -107,7 +86,7 @@ class Launcher(threading.Thread):
                yield Response(item.seed, response)
        """
        def decorator(func):
-           self.
+           self.SpiderCrawler.download = func
        return decorator

    @property
@@ -116,83 +95,77 @@ class Launcher(threading.Thread):
        自定义parse函数, xxxItem为自定义的存储数据类型
        use case:
            from cobweb.base import Request, Response
-           @launcher.
-           def
+           @launcher.parse
+           def parse(item: Response) -> BaseItem:
                ...
                yield xxxItem(seed, **kwargs)
        """
        def decorator(func):
-           self.
+           self.SpiderCrawler.parse = func
        return decorator

-    def
-        for seed in seeds
-
-
-
-
-
-
-
-
-
-
-    def
-
-
-
-
-
-
-
-    def _execute(self):
-        for func_name in self.__LAUNCHER_FUNC__:
-            threading.Thread(name=func_name, target=getattr(self, func_name)).start()
-            time.sleep(2)
-
-    def _polling(self):
-
-        check_emtpy_times = 0
-
+    def start_seeds(self, seeds: list[Union[str, Dict]]) -> list[Seed]:
+        seed_list = [Seed(seed) for seed in seeds]
+        for seed in seed_list:
+            self._task_queue.add_task(
+                task_id=seed.sid,
+                data=seed,
+                status=Status.PENDING,
+                priority=seed.params.priority,
+                parent_id=None,
+                ttl_seconds=None
+            )
+        return seed_list
+
+    def _register(self, func: Callable, tag: str = "launcher"):
+        name = f"{tag}:{func.__name__}_{uuid.uuid4()}"
+        self.__REGISTER_FUNC__[name] = func
+        if not self.__WORKER_THREAD__.get(name):
+            worker_thread = threading.Thread(name=name, target=func)
+            self.__WORKER_THREAD__[name] = worker_thread
+
+    def _monitor(self):
         while not self._stop.is_set():
+            if not self._pause.is_set():
+                for name, worker_thread in list(self.__WORKER_THREAD__.items()):
+                    if not worker_thread.is_alive():
+                        logger.debug(f"{name} thread is dead. Restarting...")
+                        func = self.__REGISTER_FUNC__[name]
+                        worker_thread = threading.Thread(name=name, target=func)
+                        self.__WORKER_THREAD__[name] = worker_thread
+                        worker_thread.start()
+            time.sleep(15)
+        logger.info("monitor thread close!")
+
+    def start(self):
+        self._pause.is_set()
+
+        self.Scheduler(
+            task=self.task,
+            project=self.project,
+            stop=self._stop,
+            pause=self._pause,
+            task_queue=self._task_queue,
+            callback_register=self._register
+        ).start()

-
-
-
-
-
-
-
-                self._pause.clear()
-                self._execute()
-
-            elif queue_not_empty_count == 0:
-                check_emtpy_times += 1
-            else:
-                check_emtpy_times = 0
-
-            if check_emtpy_times > 2:
-                check_emtpy_times = 0
-                self.__DOING__ = {}
-                self._pause.set()
-
-    def run(self):
-        threading.Thread(target=self._execute_heartbeat).start()
-
-        self._Crawler(
-            upload_queue=self._upload_queue,
-            custom_func=self.__CUSTOM_FUNC__,
-            launcher_queue=self.__LAUNCHER_QUEUE__,
+        Distributor(
+            task=self.task,
+            project=self.project,
+            task_queue=self._task_queue,
+            callback_register=self._register,
+            stop=self._stop, pause=self._pause,
+            SpiderCrawler=self.SpiderCrawler
         ).start()

-
-
-
-
-
+        Uploader(
+            task=self.task, project=self.project,
+            stop=self._stop, pause=self._pause,
+            task_queue=self._task_queue,
+            callback_register=self._register,
+            SpiderPipeline=self.SpiderPipeline
         ).start()

-        self.
-
+        self._monitor()
+        logger.info("task done!")
+
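
Taken together with the docstrings above, the intended wiring looks roughly like the sketch below: a Launcher is built per task/project, custom_setting may be a dict or a module path (extra **kwargs override it, and every key is upper-cased onto cobweb.setting), the decorators install generator callbacks on the loaded Crawler class, start_seeds() enqueues the initial seeds as PENDING tasks, and start() launches the Scheduler, Distributor and Uploader threads before blocking in _monitor(). The import path cobweb.launchers.Launcher and the dict-seed fields are assumptions based on the file layout, not confirmed API:

    from cobweb.launchers import Launcher   # assumed re-export; the class lives in cobweb/launchers/launcher.py

    # custom_setting may be a dict or a "my_project.settings" module path;
    # keys are upper-cased onto cobweb.setting, and extra **kwargs override them.
    launcher = Launcher(
        task="demo_task",
        project="demo_project",
        custom_setting={"spider_time_sleep": 10},   # becomes setting.SPIDER_TIME_SLEEP
        spider_thread_num=4,                        # **kwargs override, becomes setting.SPIDER_THREAD_NUM
    )

    # request/download/parse callbacks are installed here via @launcher.request,
    # @launcher.download and @launcher.parse (see the docstring use cases above),
    # or by pointing setting.CRAWLER at a Crawler subclass.

    launcher.start_seeds([
        "https://example.com/page/1",                             # plain URL seed
        {"url": "https://example.com/page/2", "priority": 100},   # dict seed; field names assumed, not confirmed
    ])
    launcher.start()   # starts Scheduler, Distributor and Uploader, then blocks in _monitor()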