cobweb-launcher 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/__init__.py +2 -0
- cobweb/base/__init__.py +0 -0
- cobweb/base/bbb.py +187 -0
- cobweb/base/config.py +164 -0
- cobweb/base/decorators.py +95 -0
- cobweb/base/hash_table.py +60 -0
- cobweb/base/interface.py +44 -0
- cobweb/base/log.py +96 -0
- cobweb/base/queue_tmp.py +60 -0
- cobweb/base/request.py +62 -0
- cobweb/base/task.py +38 -0
- cobweb/base/utils.py +15 -0
- cobweb/db/__init__.py +0 -0
- cobweb/db/base/__init__.py +0 -0
- cobweb/db/base/client_db.py +1 -0
- cobweb/db/base/oss_db.py +116 -0
- cobweb/db/base/redis_db.py +214 -0
- cobweb/db/base/redis_dbv3.py +231 -0
- cobweb/db/scheduler/__init__.py +0 -0
- cobweb/db/scheduler/default.py +8 -0
- cobweb/db/scheduler/textfile.py +29 -0
- cobweb/db/storer/__init__.py +0 -0
- cobweb/db/storer/console.py +10 -0
- cobweb/db/storer/loghub.py +55 -0
- cobweb/db/storer/redis.py +16 -0
- cobweb/db/storer/textfile.py +16 -0
- cobweb/distributed/__init__.py +0 -0
- cobweb/distributed/launcher.py +194 -0
- cobweb/distributed/models.py +140 -0
- cobweb/single/__init__.py +0 -0
- cobweb/single/models.py +104 -0
- cobweb/single/nest.py +153 -0
- cobweb_launcher-0.0.1.dist-info/LICENSE +21 -0
- cobweb_launcher-0.0.1.dist-info/METADATA +45 -0
- cobweb_launcher-0.0.1.dist-info/RECORD +37 -0
- cobweb_launcher-0.0.1.dist-info/WHEEL +5 -0
- cobweb_launcher-0.0.1.dist-info/top_level.txt +1 -0
cobweb/db/base/redis_dbv3.py
@@ -0,0 +1,231 @@
+import json
+import random
+import time
+import redis
+from datetime import datetime
+from base.bbb import Seed
+
+
+class RedisDB:
+
+    def __init__(
+            self,
+            project: str,
+            task_name: str,
+            # retry_num: int = 3,
+            host=None,
+            port=None,
+            username=None,
+            password=None,
+            db=0
+    ):
+        pool = redis.ConnectionPool(
+            host=host,
+            port=port,
+            username=username,
+            password=password,
+            db=db
+        )
+        self.heartbeat_key = f"{project}:{task_name}:heartbeat"  # redis type string
+        self.ready_key = f"{project}:{task_name}:seed_info:ready"  # redis type zset, .format(priority)
+        self.spider_key = f"{project}:{task_name}:seed_info:spider"  # redis type hash, .format(priority)
+        self.store_key = f"{project}:{task_name}:seed_info:store:%s"  # redis type set
+        self.failed_key = f"{project}:{task_name}:seed_info:failed"  # redis type set, .format(priority)
+        self.succeed_key = f"{project}:{task_name}:seed_info:succeed"  # redis type set, .format(priority)
+        self.update_lock = f"{project}:{task_name}:update_seed_lock"  # redis type string
+        self.check_lock = f"{project}:{task_name}:check_seed_lock"  # redis type string
+        # self.retry_lock = f"{project}:{task_name}:retry_seed_lock"  # redis type string
+        self.scheduler_lock = f"{project}:{task_name}:scheduler_lock"  # redis type string
+        self.client = redis.Redis(connection_pool=pool)
+        # self.retry_num = retry_num
+
+    def set_heartbeat(self, t=3):
+        self.client.expire(self.heartbeat_key, t)
+
+    # @property
+    def heartbeat(self):
+        return self.client.ttl(self.heartbeat_key)
+
+    def iterate_hash(self, key, count=1000, match=None):
+        cursor = "0"
+        while cursor != 0:
+            # iterate over the hash's field/value pairs with HSCAN
+            cursor, data = self.client.hscan(key, cursor=cursor, match=match, count=count)
+            if not data:
+                return None
+            for field, value in data.items():
+                yield field.decode(), value.decode()
+
+    def get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
+        begin_time = int(time.time())
+        while True:
+            if self.client.setnx(key, ""):
+                self.client.expire(key, t)
+                return True
+            if int(time.time()) - begin_time > timeout:
+                break
+            time.sleep(sleep_time)
+
+        if self.client.ttl(key) == -1:
+            delete_status = True
+            for _ in range(3):
+                if self.client.ttl(key) != -1:
+                    delete_status = False
+                    break
+                time.sleep(0.5)
+            if delete_status:
+                self.client.expire(key, t)
+            return False
+        else:
+            ttl = self.client.ttl(key)
+            print("ttl: " + str(ttl))
+            return False
+
+    def execute_update(
+            self,
+            set_info,
+            del_info,
+            status: int = 0
+    ):
+        if status not in [0, 1, 2, 3]:
+            return None
+
+        pipe = self.client.pipeline()
+        pipe.multi()
+
+        if status == 0:
+            pipe.hset(self.spider_key, mapping=set_info)
+            pipe.zrem(self.ready_key, *del_info)
+        elif status == 1:
+            pipe.zadd(self.ready_key, mapping=set_info)
+            pipe.hdel(self.spider_key, *del_info)
+        elif status == 2:
+            pipe.sadd(self.failed_key, *set_info)
+            pipe.hdel(self.spider_key, *del_info)
+        else:
+            pipe.sadd(self.succeed_key, *set_info)
+            pipe.hdel(self.spider_key, *del_info)
+        pipe.execute()
+
+    @property
+    def seed_count(self):
+        return self.client.zcard(self.ready_key)
+
+    def deal_seeds(self, sids, status: bool):
+        if isinstance(sids, str):
+            sids = [sids]
+        # if self.get_lock(key=self.retry_lock, t=15):
+        status = 2 if status else 3
+        del_list, fail_set = [], set()
+        for sid in sids:
+            for field, value in self.iterate_hash(self.spider_key, match=f"*{sid}"):
+                _, priority, _sid = field.split("_")
+                if sid != _sid:
+                    continue
+                seed = Seed(value, priority=priority)
+                del_list.append(field)
+                fail_set.add(seed.format_seed)
+        if del_list:
+            self.execute_update(fail_set, del_list, status=status)
+        # self.client.delete(self.retry_lock)
+        print("retry seeds, sids: {}".format(json.dumps(sids)))
+
+    def set_seeds(self, seeds):
+        item_info = {}
+        if any(isinstance(seeds, t) for t in (list, tuple)):
+            for seed in seeds:
+                item_info[seed.format_seed] = seed.priority
+        elif isinstance(seeds, Seed):
+            item_info[seeds.format_seed] = seeds.priority
+        self.client.zadd(self.ready_key, mapping=item_info)
+
+    def get_seeds(self, length: int = 1000):
+        """
+        Fetch seeds from redis.
+        """
+        cs = time.time()
+
+        if self.get_lock(key=self.update_lock):
+
+            set_dict, del_list, result = {}, [], []
+
+            # version = int(time.time() * 1e3)
+            version = time.time() * 1e6
+
+            items = self.client.zrangebyscore(self.ready_key, min=0, max="+inf", start=0, num=length, withscores=True)
+
+            # for value, priority in items:
+            #     seed = Seed(value, priority=priority, version=version)
+            #     pty = "{:03d}".format(int(priority))
+            #     key = f"{version}_{pty}_{seed.sid}"
+            #     set_dict[key] = value
+            #     del_list.append(value)
+            #     result.append(seed)
+
+            for value, priority in items:
+                v = version + int(priority) / 1000 + random.random() / 1000
+                seed = Seed(value, priority=priority, version=version)
+                pty = "{:03d}".format(int(priority))
+                key = f"{version}_{pty}_{seed.sid}"
+                set_dict[key] = value
+                del_list.append(value)
+                result.append(seed)
+
+            print("\nset seeds into queue time: " + str(time.time() - cs))
+            if result:
+                self.execute_update(set_dict, del_list)
+
+            self.client.delete(self.update_lock)
+            print("push seeds into queue time: " + str(time.time() - cs))
+            return result
+
+    def check_spider_hash(self):
+        cs = time.time()
+        set_dict, del_list, heartbeat = {}, [], False
+        if self.get_lock(key=self.check_lock, t=60, timeout=600, sleep_time=60):
+            count = self.client.hlen(self.spider_key)
+            if self.client.exists(self.heartbeat_key):
+                heartbeat = True
+            now = int(time.time())
+            for field, value in self.iterate_hash(key=self.spider_key, count=count):
+                version, priority, sid = field.split("_")
+                if heartbeat and int(version) + 600 > now:
+                    continue
+                set_dict[value] = priority
+                del_list.append(field)
+
+                if len(del_list) >= 1000:
+                    self.client.expire(self.check_lock, 60)
+                    self.execute_update(set_dict, del_list, status=1)
+                    set_dict, del_list = {}, []
+
+            if set_dict and del_list:
+                self.execute_update(set_dict, del_list, status=1)
+
+            # self.client.delete(self.check_lock)
+            print("init seeds time: " + str(time.time() - cs))
+            if not heartbeat:
+                self.client.setnx(self.heartbeat_key, "")
+                self.set_heartbeat(t=15)
+
+    def add_store_sid(self, key, data):
+        redis_key = self.store_key % key
+        self.client.sadd(redis_key, *data)
+
+
+current_time = datetime.now()
+# format the current datetime as a string
+formatted_time = current_time.strftime("%m%d%H%M%S%f")
+c = int(formatted_time)
+print(c)
+d = 200 + 0.9 * random.random()
+print(d)
+print(time.time())
+print(c + d / 1000)
+# for _ in range(100):
+#     redis_db.get_seeds(1000)
+# redis_db.get_seeds(1000)
+# redis_db.check_spider_hash()
+# redis_db.retry_seeds(["dc895aee47f8fc39c479f7cac6025879"])
+# "1705996980_200_dc895aee47f8fc39c479f7cac6025879"
+
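Note on the seed lifecycle in RedisDB above: seeds wait in the ready zset (scored by priority), get_seeds moves a batch into the spider hash under update_lock, and deal_seeds retires finished seeds into the succeed or failed set. Below is a minimal usage sketch, not part of the package; it assumes a reachable local Redis instance and that Seed accepts a dict payload, as start_seeds in cobweb/distributed/launcher.py suggests.

from db.base.redis_dbv3 import RedisDB
from base.bbb import Seed

db = RedisDB("demo_project", "demo_task", host="localhost", port=6379)

# Enqueue seeds into the ready zset; the score is the seed priority,
# and zrangebyscore returns lower scores first.
db.set_seeds([Seed({"url": "https://example.com/1"}),   # dict payload is an assumption
              Seed({"url": "https://example.com/2"})])

# Move up to 1000 seeds from ready into the spider hash (guarded by update_lock).
seeds = db.get_seeds(length=1000)

# Retire seeds: status=True files them under failed, status=False under succeed.
for seed in seeds or []:
    db.deal_seeds(seed.sid, status=False)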
cobweb/db/scheduler/__init__.py
File without changes
cobweb/db/scheduler/textfile.py
@@ -0,0 +1,29 @@
+from base.log import log
+from base.bbb import Seed
+from base.interface import SchedulerInterface
+
+
+class Textfile(SchedulerInterface):
+
+    index = None
+
+    def schedule(self):
+        try:
+            seeds = []
+            with open(self.table, "r") as fp:
+                fp.seek(self.index or 0, 0)
+                for _ in range(self.length):
+                    data = fp.readline().strip()
+                    if not data:
+                        log.info("scheduler end!")
+                        self.stop = True
+                        break
+                    seeds.append(Seed(data))
+                self.index = fp.tell()
+            return seeds
+        except FileNotFoundError:
+            log.error("task table not found!")
+            return None
+        except TypeError:
+            log.error("task table type error!")
+            return None
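The Textfile scheduler reads self.length lines per schedule() call and remembers the byte offset in self.index, so successive calls resume where the previous batch ended, and self.stop flips once the file is exhausted. A hypothetical driver loop follows; it assumes SchedulerInterface takes the positional arguments (table, sql, length, size, queue, config), as the SchedulerTmp(...) call in launcher.py implies, and that it initialises self.stop to False.

# Constructor signature assumed from launcher.py; not confirmed by this diff.
scheduler = Textfile("seeds.txt", None, 100, None, None, None)
while not scheduler.stop:
    batch = scheduler.schedule()   # next 100 lines, resuming at scheduler.index
    if batch is None:              # missing or unreadable seed file
        break
    print(f"got {len(batch)} seeds, next offset: {scheduler.index}")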
cobweb/db/storer/__init__.py
File without changes
cobweb/db/storer/loghub.py
@@ -0,0 +1,55 @@
+import json
+from base.log import log
+from base.interface import StorerInterface
+from aliyun.log import LogClient, LogItem, PutLogsRequest
+
+
+class Loghub(StorerInterface):
+
+    def __init__(self, table, fields, length, queue, config):
+        super().__init__(table, fields, length, queue, config)
+        self.client = None
+
+    def init_loghub_clint(self):
+        try:
+            self.client = LogClient(
+                self.config['endpoint'],
+                self.config['access_key_id'],
+                self.config['access_key']
+            )
+        except Exception as e:
+            self.client = None
+            return False
+
+    def store(self, data_list):
+        try:
+            if not self.client:
+                self.init_loghub_clint()
+
+            log_items = list()
+            for item in data_list:
+                temp = item._asdict()
+                for key, value in temp.items():
+                    if isinstance(value, str):
+                        temp[key] = value
+                    else:
+                        temp[key] = json.dumps(value, ensure_ascii=False)
+                log_item = LogItem()
+                contents = sorted(temp.items())  # dict to tuple
+                log_item.set_contents(contents)
+                log_items.append(log_item)
+            request = PutLogsRequest(
+                project=self.config["project"],
+                logstore=self.table,
+                topic=self.config["topic"],
+                source=self.config.get("source"),
+                logitems=log_items,
+                compress=True
+            )
+            self.client.put_logs(request=request)
+            log.info(f"save data, data length: {len(data_list)}")
+            return True
+        except Exception as e:
+            log.exception(e)
+            return False
+
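Loghub.store expects each item in data_list to be namedtuple-like (it calls item._asdict()), JSON-encodes non-string values, and ships the batch in a single PutLogsRequest. Below is a sketch of the config dict it reads; the keys come straight from the code above, while the endpoint and credential values are placeholders.

config = {
    "endpoint": "cn-hangzhou.log.aliyuncs.com",  # regional SLS endpoint (placeholder)
    "access_key_id": "<ACCESS_KEY_ID>",
    "access_key": "<ACCESS_KEY_SECRET>",
    "project": "my-log-project",                 # passed to PutLogsRequest(project=...)
    "topic": "cobweb",
    # "source" is optional: read via self.config.get("source")
}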
cobweb/db/storer/redis.py
@@ -0,0 +1,16 @@
+from base.log import log
+from base.interface import StorerInterface
+
+
+class Redis(StorerInterface):
+
+    def store(self, data_list):
+        try:
+            data_str = "\n".join(str(data) for data in data_list)
+            with open(self.table, "a") as fp:
+                fp.write(data_str)
+            log.info(f"save data, data length: {len(data_list)}")
+            return True
+        except Exception as e:
+            return False
+
cobweb/db/storer/textfile.py
@@ -0,0 +1,16 @@
+from base.log import log
+from base.interface import StorerInterface
+
+
+class Textfile(StorerInterface):
+
+    def store(self, data_list):
+        try:
+            data_str = "\n".join(str(data) for data in data_list)
+            with open(self.table, "a") as fp:
+                fp.write(data_str)
+            log.info(f"save data, data length: {len(data_list)}")
+            return True
+        except Exception as e:
+            return False
+
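Both file-backed storers above (Redis and Textfile share the same body in this version) stringify each item and append the newline-joined batch to the file named by self.table; note the join adds no trailing newline, so the last row of one batch and the first row of the next land on the same line. A minimal sketch, assuming StorerInterface.__init__ takes (table, fields, length, queue, config) as Loghub.__init__ does:

# Constructor signature assumed from Loghub.__init__; not confirmed by this diff.
storer = Textfile("output.txt", "a,b", 100, None, None)
storer.store(["row-1", "row-2"])   # appends "row-1\nrow-2" to output.txt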
cobweb/distributed/__init__.py
File without changes
cobweb/distributed/launcher.py
@@ -0,0 +1,194 @@
+import time
+import threading
+from threading import Thread
+from base.log import log
+from db.base.redis_db import RedisDB
+from base.bbb import Queue, Seed, DBItem
+from base.utils import struct_queue_name, restore_table_name
+from models import Scheduler, Spider, Storer
+
+
+def start_seeds(seeds):
+    if not seeds:
+        return None
+    if any(isinstance(seeds, t) for t in (list, tuple)):
+        return [Seed(seed) for seed in seeds]
+    elif any(isinstance(seeds, t) for t in (str, dict)):
+        return Seed(seeds)
+
+
+def parse_storer_info(storer_info):
+    storer_data = {}
+    storer_info_list = []
+    if storer_info.__class__.__name__ == 'StorerInfo':
+        storer_info_list.append(storer_info)
+    elif any(isinstance(storer_info, t) for t in (list, tuple)):
+        storer_info_list = storer_info
+    for info in storer_info_list:
+        db_name = info.DB.__name__
+        storer_data.setdefault(db_name, {"StorerDB": info.DB, "db_args_list": []})
+        storer_data[db_name]["db_args_list"].append(info[1:])
+    return storer_data
+
+
+def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
+    time.sleep(5)
+    while True:
+        if (
+            scheduler.stop and
+            not ready_seed_length() and
+            not scheduler.queue.length and
+            not spider.spider_in_progress.length
+        ):
+            log.info("spider is done?")
+            last.set()
+            time.sleep(5)
+            storer_queue_empty = True
+            for storer in storer_list:
+                if storer.queue.length:
+                    storer_queue_empty = False
+                    break
+            if storer_queue_empty and not spider_queue_length():
+                log.info("spider done!")
+                break
+        last.clear()
+        time.sleep(3)
+    stop.set()
+
+
+def launcher(task):
+    """
+    Task launcher decorator.
+    :param task: task configuration
+    """
+    def decorator(func):
+        """
+        Item:
+            Textfile()
+            Loghub()
+            Console()
+        e.g.
+            task.fields = "a,b"
+            func(item, seed)
+                a = "a"
+                b = "b"
+                data = {"a": "a", "b": "b"}
+                yield item.Loghub(**data)
+                yield item.Loghub(a=a, b=b)
+        """
+        storer_list = []
+
+        # event: the whole program has finished
+        last = threading.Event()
+        # event: stop crawling
+        stop = threading.Event()
+
+        # initialize the redis connection
+        redis_db = RedisDB(task.project, task.task_name, *task.redis_info)
+
+        log.info("initializing cobweb!")
+
+        seed_queue = Queue()
+
+        # dynamically subclass the scheduler
+        SchedulerDB, table, sql, length, size, config = task.scheduler_info
+        SchedulerTmp = type(SchedulerDB.__name__, (Scheduler, SchedulerDB), {})
+
+        # initialize the scheduler
+        scheduler = SchedulerTmp(table, sql, length, size, seed_queue, config)
+
+        # initialize the spider
+        spider = Spider(seed_queue, task.max_retries)
+
+        # parse storer info
+        storer_data = parse_storer_info(task.storer_info)
+
+        # new item
+        item = type("Item", (object,), {"redis_client": redis_db})()
+        for db_name in storer_data.keys():
+            # dynamically subclass the storer
+            StorerDB = storer_data[db_name]["StorerDB"]
+            StorerTmp = type(StorerDB.__name__, (Storer, StorerDB), {})
+            db_args_list = storer_data[db_name]["db_args_list"]
+            for storer_db_args in db_args_list:
+                table, fields, length, config = storer_db_args
+                if not getattr(item, db_name, None):
+                    instance = type(db_name, (DBItem,), {})
+                    setattr(item, db_name, instance)
+                # create the storage item and its storage queue
+                storer_item_instance = getattr(item, db_name)
+                storer_item_instance.init_item(table, fields)
+                #
+                storer_queue = struct_queue_name(db_name, table)
+                queue = getattr(storer_item_instance, storer_queue)
+                # initialize the storer
+                table_name = restore_table_name(table_name=table)
+                storer = StorerTmp(table_name, fields, length, queue, config)
+                storer_list.append(storer)
+
+        Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
+        Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
+
+        # push the initial seeds
+        seeds = start_seeds(task.start_seed)
+        redis_db.add_seed(seeds)
+        # start the scheduler: schedule seeds into the redis queue
+        Thread(
+            # name="xxxx_schedule_seeds",
+            target=scheduler.schedule_seed,
+            args=(
+                redis_db.ready_seed_length,
+                redis_db.get_scheduler_lock,
+                redis_db.add_seed
+            )
+        ).start()
+
+        # start the scheduler: schedule the task queue
+        Thread(
+            # name="xxxx_schedule_task",
+            target=scheduler.schedule_task,
+            args=(
+                stop, redis_db.get_seed,
+                redis_db.ready_seed_length
+            )
+        ).start()
+
+        # start the spiders
+        for index in range(task.spider_num):
+            Thread(
+                # name=f"xxxx_spider_task:{index}",
+                target=spider.spider_task,
+                args=(
+                    stop, func, item,
+                    redis_db.del_seed
+                )
+            ).start()
+
+        # start the storers
+        for storer in storer_list:
+            Thread(
+                # name=f"xxxx_store_task:{storer.table}",
+                target=storer.store_task,
+                args=(
+                    stop, last,
+                    redis_db.reset_seed,
+                    redis_db.set_storer
+                )
+            ).start()
+
+        Thread(
+            # name="check_spider",
+            target=check,
+            args=(
+                stop, last, spider,
+                scheduler, storer_list,
+                redis_db.ready_seed_length,
+                redis_db.spider_queue_length,
+            )
+        ).start()
+
+    return decorator
+
+
+
+
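For context, the launcher decorator above wires everything together: it builds the scheduler and storer classes dynamically, spawns the scheduler/spider/storer threads, and hands the decorated function an item factory plus each seed. A hypothetical entry point follows; it assumes a Task object (defined in cobweb/base/task.py, whose contents are not shown in this diff) carrying the attributes the decorator reads: project, task_name, redis_info, scheduler_info, storer_info, start_seed, spider_num, max_retries.

from base.task import Task                 # attribute names inferred; constructor signature not shown here
from distributed.launcher import launcher

task = Task(
    project="demo",
    task_name="example",
    start_seed="https://example.com",
    spider_num=4,
)

@launcher(task)
def crawl(item, seed):
    # fetch and parse `seed`, then yield rows into a registered storer,
    # e.g. yield item.Console(title=..., url=seed.url)
    ...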