cobweb-launcher 0.1.16__tar.gz → 0.1.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cobweb-launcher might be problematic. Click here for more details.
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/PKG-INFO +1 -1
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/__init__.py +1 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/constant.py +3 -1
- cobweb-launcher-0.1.17/cobweb/db/storer/redis.py +15 -0
- cobweb-launcher-0.1.17/cobweb/distributed/launcher.py +243 -0
- cobweb-launcher-0.1.17/cobweb/distributed/models.py +143 -0
- cobweb-launcher-0.1.17/cobweb/equip/single/__init__.py +0 -0
- cobweb-launcher-0.1.17/cobweb/setting.py +13 -0
- cobweb-launcher-0.1.17/cobweb/single/__init__.py +0 -0
- cobweb-launcher-0.1.17/cobweb/single/launcher.py +231 -0
- cobweb-launcher-0.1.17/cobweb/single/models.py +134 -0
- cobweb-launcher-0.1.17/cobweb/single/nest.py +153 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/task.py +3 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb_launcher.egg-info/PKG-INFO +1 -1
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb_launcher.egg-info/SOURCES.txt +9 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/setup.py +1 -1
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/LICENSE +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/README.md +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/bbb.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/db/oss_db.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/db/redis_db.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/db/scheduler/__init__.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/db/scheduler/default.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/db/scheduler/textfile.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/db/storer/__init__.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/db/storer/console.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/db/storer/loghub.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/db/storer/textfile.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/decorators.py +0 -0
- {cobweb-launcher-0.1.16/cobweb/equip/dev → cobweb-launcher-0.1.17/cobweb/distributed}/__init__.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/equip/__init__.py +0 -0
- {cobweb-launcher-0.1.16/cobweb/equip/distributed → cobweb-launcher-0.1.17/cobweb/equip/dev}/__init__.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/equip/dev/launcher.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/equip/dev/models.py +0 -0
- {cobweb-launcher-0.1.16/cobweb/equip/single → cobweb-launcher-0.1.17/cobweb/equip/distributed}/__init__.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/equip/distributed/launcher.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/equip/distributed/models.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/equip/single/launcher.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/equip/single/models.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/interface.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/log.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb/utils.py +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/setup.cfg +0 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from cobweb import log, StorerInterface
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Redis(StorerInterface):
    """Storer backend that appends serialized records to the file named by ``self.table``.

    NOTE(review): despite the class name, this implementation writes to a local
    file rather than a Redis instance — confirm this is intentional.
    """

    def store(self, data_list):
        """Persist *data_list*; return True on success, False on any failure.

        :param data_list: iterable of records; each record is serialized via ``str``.
        :return: bool — success flag consumed by the surrounding store task.
        """
        try:
            # Join records with newlines and terminate with a trailing newline so
            # that successive append calls cannot fuse the last record of one
            # batch with the first record of the next.
            data_str = "\n".join(str(data) for data in data_list)
            with open(self.table, "a") as fp:
                fp.write(data_str + "\n")
            log.info(f"save data, data length: {len(data_list)}")
            return True
        except Exception as e:
            # Previously the exception was swallowed silently; log it so store
            # failures are diagnosable (callers still only see False).
            log.info(f"store failed: {e}")
            return False
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import threading
|
|
3
|
+
from threading import Thread
|
|
4
|
+
|
|
5
|
+
from .models import Scheduler, Spider, Storer
|
|
6
|
+
from cobweb import log, Queue, DBItem, RedisDB
|
|
7
|
+
from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
|
|
8
|
+
from cobweb.utils import (
|
|
9
|
+
struct_queue_name as sqn,
|
|
10
|
+
restore_table_name as rtn,
|
|
11
|
+
parse_import_model as pim,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
    # Monitor thread for the distributed launcher: periodically logs queue/thread
    # statistics, decides when the whole crawl is finished, and finally sets the
    # `stop` event so all worker threads exit.
    # :param stop: threading.Event — set here when the crawl is declared done.
    # :param last: threading.Event — "final flush" signal for the storers.
    # :param spider: Spider instance (its `spider_in_progress` queue counts busy threads).
    # :param scheduler: Scheduler instance (memory seed queue + `stop` flag).
    # :param storer_list: list of Storer instances to inspect.
    # :param ready_seed_length / spider_queue_length: callables returning redis queue sizes.
    log.info("run check thread after 30 seconds...")
    time.sleep(30)
    spider_info = """
------------------- check: {0} ------------------
running_spider_thread_num: {1}
redis_ready_seed_length: {2}
redis_spider_seed_length: {3}
memory_seed_queue_length: {4}
storer_upload_queue_length_info:
{5}
----------------------- end -----------------------"""
    while True:
        status = "running"
        running_spider_thread_num = spider.spider_in_progress.length
        redis_ready_seed_length = ready_seed_length()
        redis_spider_seed_length = spider_queue_length()
        memory_seed_queue_length = scheduler.queue.length
        storer_upload_queue_list = []
        for storer in storer_list:
            storer_upload_queue_list.append(
                f"{storer.__class__.__name__} storer queue length: {storer.queue.length}"
            )
        if (
            scheduler.stop and
            # not redis_ready_seed_length and
            not memory_seed_queue_length and
            not running_spider_thread_num
        ):
            # No seeds in memory and no spider busy: a shutdown candidate.
            if not MODEL:
                log.info("spider is done?")
                last.set()
                time.sleep(3)
                # Re-sample storer queues after signalling the final flush.
                storer_queue_empty = True
                storer_upload_queue_list = []
                for storer in storer_list:
                    if storer.queue.length:
                        storer_queue_empty = False
                    storer_upload_queue_list.append(
                        f"{storer.__class__.__name__} storer queue length: {storer.queue.length}"
                    )
                if (
                    storer_queue_empty and
                    not redis_ready_seed_length and
                    not redis_spider_seed_length
                ):
                    if MODEL:
                        # Long-running mode: idle and wait for new seeds.
                        log.info("waiting for push seeds...")
                        status = "waiting"
                        time.sleep(30)
                    else:
                        # One-shot mode: everything drained — exit the monitor loop.
                        log.info("spider done!")
                        break

                last.clear()

        storer_upload_queue_length_info = "\n".join(
            storer_upload_queue_list) if storer_upload_queue_list else "None"
        log.info(spider_info.format(
            status,
            running_spider_thread_num,
            redis_ready_seed_length,
            redis_spider_seed_length,
            memory_seed_queue_length,
            storer_upload_queue_length_info
        ))

        time.sleep(3)
    # Declares the crawl finished; all worker threads watch this event.
    stop.set()
|
|
85
|
+
|
|
86
|
+
def launcher(task):
    """
    Task launch decorator (distributed mode).
    :param task: task configuration object (project, task_name, redis_info,
        scheduler_info, storer_info, queue lengths, spider_num, max_retries, seeds)
    """
    def decorator(func):
        """
        Item:
            Textfile()
            Loghub()
            Console()
        e.g.
        task.fields = "a,b"
        func(item, seed)
            a = "a"
            b = "b"
            data = {"a": "a", "b": "b"}
            yield item.Loghub(**data)
            yield item.Loghub(a=a, b=b)
        """
        # NOTE(review): this decorator starts all worker threads as a side effect
        # and returns None (the decorated name becomes None) — confirm intended.
        storer_list = []

        # Program-finished event
        last = threading.Event()
        # Stop-crawling event
        stop = threading.Event()

        # Initialize redis connection/bookkeeping
        redis_db = RedisDB(
            task.project, task.task_name, task.redis_info,
            model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
        )

        log.info("初始化cobweb!")

        seed_queue = Queue()

        if task.scheduler_info is None:
            task.scheduler_info = dict()

        # Scheduler class is composed dynamically: Scheduler mixin + configured DB backend.
        sql = task.scheduler_info.get("sql")
        table = task.scheduler_info.get("table")
        size = task.scheduler_info.get("size")
        scheduler_config = task.scheduler_info.get("config")
        scheduler_db = task.scheduler_info.get("db", "default")
        DB, class_name = pim(scheduler_db, "scheduler")
        # SchedulerDB, table, sql, length, size, config = task.scheduler_info
        SchedulerTmp = type(class_name, (Scheduler, DB), {})

        # Initialize the scheduler
        scheduler = SchedulerTmp(
            table=table, sql=sql, size=size, queue=seed_queue,
            length=task.scheduler_queue_length, config=scheduler_config
        )

        # Initialize the spider
        spider = Spider(seed_queue, task.max_retries)

        # Parse storer info; a single dict is normalized to a one-element list.
        storer_info_list = task.storer_info or []
        if not isinstance(storer_info_list, list):
            storer_info_list = [storer_info_list]

        # new item — container object handed to the user function; exposes one
        # DBItem subclass attribute per storer backend.
        item = type("Item", (object,), {"redis_client": redis_db.client})()

        for storer_info in storer_info_list:
            storer_db = storer_info["db"]
            fields = storer_info["fields"]
            storer_table = storer_info.get("table", "console")
            storer_config = storer_info.get("config")

            # Storer class is composed dynamically, like the scheduler above.
            StorerDB, class_name = pim(storer_db, "storer")
            StorerTmp = type(class_name, (Storer, StorerDB), {})

            db_name = class_name.lower()
            if not getattr(item, db_name, None):
                instance = type(db_name, (DBItem,), {})
                setattr(item, db_name, instance)

            storer_item_instance = getattr(item, db_name)
            storer_item_instance.init_item(storer_table, fields)

            # The upload queue lives on the DBItem subclass under a structured name.
            storer_queue = sqn(db_name, storer_table)
            queue = getattr(storer_item_instance, storer_queue)
            # Initialize the storer
            table_name = rtn(table_name=storer_table)
            storer = StorerTmp(
                table=table_name, fields=fields,
                length=task.storer_queue_length,
                queue=queue, config=storer_config
            )
            storer_list.append(storer)

        Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
        Thread(target=redis_db.set_heartbeat, args=(stop,)).start()

        # Push initial seeds
        # seeds = start_seeds(task.start_seed)
        redis_db.add_seed(task.seeds)
        # Start scheduler thread: schedule seeds into the redis queue
        Thread(
            # name="xxxx_schedule_seeds",
            target=scheduler.schedule_seed,
            args=(
                redis_db.ready_seed_length,
                redis_db.get_scheduler_lock,
                redis_db.add_seed
            )
        ).start()

        # Start scheduler thread: move seeds from redis into the memory queue
        Thread(
            # name="xxxx_schedule_task",
            target=scheduler.schedule_task,
            args=(
                stop, redis_db.get_seed,
                redis_db.ready_seed_length
            )
        ).start()

        # Start spider threads
        for index in range(task.spider_num):
            Thread(
                # name=f"xxxx_spider_task:{index}",
                target=spider.spider_task,
                args=(
                    stop, func, item,
                    redis_db.del_seed
                )
            ).start()

        # Start storer threads
        for storer in storer_list:
            Thread(
                # name=f"xxxx_store_task:{storer.table}",
                target=storer.store_task,
                args=(
                    stop, last,
                    redis_db.reset_seed,
                    redis_db.set_storer
                )
            ).start()

        Thread(
            # name="check_spider",
            target=check,
            args=(
                stop, last, spider,
                scheduler, storer_list,
                redis_db.ready_seed_length,
                redis_db.spider_queue_length,
            )
        ).start()

    return decorator
|
|
243
|
+
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from hashlib import md5
|
|
3
|
+
from cobweb import log, Queue, Seed
|
|
4
|
+
from cobweb.utils import issubclass_cobweb_inf
|
|
5
|
+
|
|
6
|
+
# from pympler import asizeof
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Scheduler:
    # Mixin providing the two scheduling loops; a concrete DB backend class is
    # mixed in at runtime (see launcher) and supplies `schedule`, `table`,
    # `sql`, `size`, `length`, `queue`, and the `stop` flag.

    def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
        # Pull seeds from the backing store and push them into redis, while the
        # ready queue stays below `self.size`.

        inf_name = "SchedulerInterface"
        if not issubclass_cobweb_inf(self.__class__, inf_name):
            raise Exception("not have schedule function!")

        # The "Default" backend has no seed source: mark done and exit at once.
        if self.__class__.__name__ == "Default":
            self.stop = True
            return None

        while not self.stop:
            length = ready_seed_length()
            if length > self.size:
                # Redis already holds enough ready seeds — back off.
                time.sleep(15)

            elif get_scheduler_lock():
                # Only the lock holder schedules, so multiple workers don't
                # double-insert the same seeds.
                seeds = self.schedule()
                add_seed(seeds)

        log.info(f"close thread: schedule_seed")

    def schedule_task(self, stop, get_seed, ready_seed_length):
        # Move seeds from the redis ready queue into the in-memory seed queue
        # consumed by the spider threads.
        time.sleep(3)
        while not stop.is_set():

            if not ready_seed_length():
                # Nothing ready in redis yet.
                time.sleep(5)
                continue

            if self.queue.length >= self.length:
                # Memory queue full — wait for spiders to drain it.
                time.sleep(3)
                continue

            seeds = get_seed(self.length)
            self.queue.push(seeds)
        log.info(f"close thread: schedule_task")
|
|
48
|
+
|
|
49
|
+
class Spider:
    """Worker that pops seeds from the shared memory queue and runs the user
    crawl function over them, dispatching its yielded values."""

    def __init__(self, queue, max_retries=5):
        # Counts currently-busy spider threads (one token pushed per active crawl).
        self.spider_in_progress = Queue()
        self.max_retries = max_retries
        self.queue = queue

    def spider_task(self, stop, func, item, del_seed):
        """Run until *stop* is set. For each seed, iterate ``func(item, seed)``:

        - objects with a ``table_name`` attribute are buffered as store data;
        - Seed instances (or lists/tuples of seed-likes) are re-queued;
        - a bool sets the seed's success/failure status.

        On success the seed is deleted; on failure/exception its retry counter
        is bumped and it is pushed back; seeds over ``max_retries`` are deleted
        with ``spider_status=False``.
        """
        while not stop.is_set():
            seed = self.queue.pop()
            if not seed:
                time.sleep(3)
                continue
            elif seed._retry >= self.max_retries:
                # Exhausted retries: drop the seed, marked as failed.
                del_seed(seed, spider_status=False)
                continue
            try:
                self.spider_in_progress.push(1, direct_insertion=True)
                # log.info("spider seed: " + str(seed))
                ret_count = 0
                status = None
                store_queue = None
                store_data = list()
                for it in func(item, seed):
                    ret_count += 1
                    if getattr(it, "table_name", None):
                        if not store_queue:
                            store_queue = it.queue()
                        store_data.append(it.struct_data)
                    elif isinstance(it, Seed):
                        self.queue.push(it)
                    # idiom: isinstance accepts a tuple of types directly
                    elif isinstance(it, (list, tuple)):
                        self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
                    elif isinstance(it, bool):
                        status = it

                if store_queue and store_data:
                    # The seed rides along with its data so the storer can
                    # acknowledge it after a successful store.
                    store_data.append(seed)
                    store_queue.push(store_data)

                if status:
                    del_seed(seed, spider_status=True)
                elif not ret_count or status is False:
                    # Yielded nothing, or explicitly reported failure: retry.
                    seed._retry += 1
                    self.queue.push(seed)

            except Exception as e:
                seed._retry += 1
                self.queue.push(seed)
                log.info(f"{str(seed)} -> {str(e)}")
            finally:
                self.spider_in_progress.pop()
        log.info(f"close thread: spider")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class Storer:
    # Mixin providing the upload loop; a concrete storer backend (supplying
    # `store`, `table`, `length`, `queue`) is mixed in at runtime.

    def store_task(self, stop, last, reset_seed, set_storer):
        # Drain the upload queue in batches of up to `self.length` and hand
        # each batch to the backend's `store`; on success acknowledge the
        # associated seeds, on failure push them back for a retry.

        inf_name = "StorerInterface"
        if not issubclass_cobweb_inf(self.__class__, inf_name):
            return None

        if not getattr(self, "store", None):
            raise Exception("not have store function!")

        storer_name = self.__class__.__name__ + self.table
        # Stable id used to record which storer acknowledged which seeds.
        store_key_id = md5(storer_name.encode()).hexdigest()

        while not stop.is_set():

            # Flush either on the final-flush signal or when a full batch is queued.
            if last.is_set() or self.queue.length >= self.length:
                seeds, data_list = [], []

                while True:
                    data = self.queue.pop()
                    if not data:
                        break
                    if isinstance(data, Seed):
                        # A Seed marks the end of one spider's batch; stop once
                        # a full data batch has been collected.
                        seeds.append(data)
                        if len(data_list) >= self.length:
                            break
                        continue
                    data_list.append(data)

                if data_list:
                    if self.store(data_list):
                        set_storer(store_key_id, seeds)
                    else:
                        reset_seed(seeds)
                    continue

            time.sleep(3)

        log.info(f"close thread: {storer_name}")
|
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import os


# Run model: 0, 1, 2 (0 = one-shot; non-zero keeps the crawler alive waiting
# for new seeds — see the check() loops in the launchers).
MODEL = int(os.getenv("MODEL", "0"))

# Wait time before seed scores are reset; default 10 minutes (seconds).
RESET_SCORE = int(os.getenv("RESET_SCORE", "600"))

# Lifetime of the spider-queue check lock; default 30 seconds.
# (default passed as a string for consistency with the settings above)
CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", "30"))
|
|
12
|
+
|
|
13
|
+
|
|
File without changes
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import threading
|
|
3
|
+
from threading import Thread
|
|
4
|
+
|
|
5
|
+
from .models import Scheduler, Spider, Storer
|
|
6
|
+
from cobweb import log, Queue, DBItem, RedisDB
|
|
7
|
+
from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
|
|
8
|
+
from cobweb.utils import (
|
|
9
|
+
struct_queue_name as sqn,
|
|
10
|
+
restore_table_name as rtn,
|
|
11
|
+
parse_import_model as pim,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
    # Monitor thread for the single-storer launcher: logs statistics, decides
    # when the crawl is finished, and finally sets `stop`.
    # NOTE(review): the single launcher may pass storer=None (no storer_info),
    # in which case `storer.queue` below raises — confirm against the caller.
    log.info("run check thread after 30 seconds...")
    time.sleep(30)
    spider_info = """
------------------- check: {0} ------------------
redis_spider_seed_length: {1}
redis_ready_seed_length: {2}
running_spider_thread_num: {3}
memory_seed_queue_length: {4}
storer_queue_length_info: {5}
----------------------- end -----------------------"""
    while True:
        status = "running"
        running_spider_thread_num = spider.spider_in_progress.length
        redis_ready_seed_length = ready_seed_length()
        redis_spider_seed_length = spider_queue_length()
        memory_seed_queue_length = scheduler.queue.length
        storer_upload_queue_length = storer.queue.length
        if (
            scheduler.stop and
            # not redis_ready_seed_length and
            not memory_seed_queue_length and
            not running_spider_thread_num
        ):
            # No seeds in memory and no spider busy: a shutdown candidate.
            if not MODEL:
                log.info("spider is done?")
                last.set()
                time.sleep(3)
                # Re-sample the storer queue after signalling the final flush.
                storer_queue_empty = True
                if storer.queue.length:
                    storer_queue_empty = False
                    storer_upload_queue_length = storer.queue.length
                if (
                    storer_queue_empty and
                    not redis_ready_seed_length and
                    not redis_spider_seed_length
                ):
                    if MODEL:
                        # Long-running mode: idle and wait for new seeds.
                        log.info("waiting for push seeds...")
                        status = "waiting"
                        time.sleep(30)
                    else:
                        # One-shot mode: fully drained — exit the monitor loop.
                        log.info("spider done!")
                        break

                last.clear()

        log.info(spider_info.format(
            status,
            redis_spider_seed_length,
            redis_ready_seed_length,
            running_spider_thread_num,
            memory_seed_queue_length,
            storer_upload_queue_length
        ))

        time.sleep(3)
    # Declares the crawl finished; all worker threads watch this event.
    stop.set()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def launcher(task):
    """
    Task launch decorator (single-storer mode).
    :param task: task configuration object (project, task_name, redis_info,
        scheduler_info, storer_info, queue lengths, spider_num, max_retries, seeds)
    """
    def decorator(func):
        """
        Item:
            Textfile()
            Loghub()
            Console()
        e.g.
        task.fields = "a,b"
        func(item, seed)
            a = "a"
            b = "b"
            data = {"a": "a", "b": "b"}
            yield item.Loghub(**data)
            yield item.Loghub(a=a, b=b)
        """
        # NOTE(review): this decorator starts all worker threads as a side effect
        # and returns None (the decorated name becomes None) — confirm intended.
        storer_list = []

        # Program-finished event
        last = threading.Event()
        # Stop-crawling event
        stop = threading.Event()

        # Initialize redis connection/bookkeeping
        redis_db = RedisDB(
            task.project, task.task_name, task.redis_info,
            model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
        )

        # new item — container object handed to the user function.
        item = type("Item", (object,), {"redis_client": redis_db.client})()

        log.info("初始化cobweb!")

        seed_queue = Queue()

        scheduler_info = task.scheduler_info or dict()

        # Scheduler class is composed dynamically: Scheduler mixin + configured DB backend.
        sql = scheduler_info.get("sql")
        table = scheduler_info.get("table")
        size = scheduler_info.get("size")
        scheduler_config = scheduler_info.get("config")
        scheduler_db = scheduler_info.get("db", "default")
        DB, class_name = pim(scheduler_db, "scheduler")
        # SchedulerDB, table, sql, length, size, config = task.scheduler_info
        SchedulerTmp = type(class_name, (Scheduler, DB), {})

        # Initialize the scheduler
        scheduler = SchedulerTmp(
            table=table, sql=sql, size=size, queue=seed_queue,
            length=task.scheduler_queue_length, config=scheduler_config
        )

        # Initialize the spider
        spider = Spider(seed_queue, task.max_retries)

        storer = None

        # Parse storer info — at most one storer in single mode.
        storer_info = task.storer_info or dict()

        # for storer_info in storer_info_list:
        if storer_info:
            storer_db = storer_info["db"]
            fields = storer_info["fields"]
            storer_table = storer_info.get("table", "console")
            storer_config = storer_info.get("config")

            StorerDB, class_name = pim(storer_db, "storer")
            StorerTmp = type(class_name, (Storer, StorerDB), {})

            db_name = class_name.lower()
            if not getattr(item, db_name, None):
                instance = type(db_name, (DBItem,), {})
                setattr(item, db_name, instance)

            storer_item_instance = getattr(item, db_name)
            storer_item_instance.init_item(storer_table, fields)

            # The upload queue lives on the DBItem subclass under a structured name.
            storer_queue = sqn(db_name, storer_table)
            queue = getattr(storer_item_instance, storer_queue)
            # Initialize the storer
            table_name = rtn(table_name=storer_table)
            storer = StorerTmp(
                table=table_name, fields=fields,
                length=task.storer_queue_length,
                queue=queue, config=storer_config
            )

        # NOTE(review): storer_list is never appended to in this single-mode
        # version, so len(storer_list) is always 0 here — verify against
        # check_spider_queue's expectations.
        Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
        Thread(target=redis_db.set_heartbeat, args=(stop,)).start()

        # Push initial seeds
        # seeds = start_seeds(task.start_seed)
        redis_db.add_seed(task.seeds)
        # Start scheduler thread: schedule seeds into the redis queue
        Thread(
            # name="xxxx_schedule_seeds",
            target=scheduler.schedule_seed,
            args=(
                redis_db.ready_seed_length,
                redis_db.get_scheduler_lock,
                redis_db.add_seed
            )
        ).start()

        # Start scheduler thread: move seeds from redis into the memory queue
        Thread(
            # name="xxxx_schedule_task",
            target=scheduler.schedule_task,
            args=(
                stop, redis_db.get_seed,
                redis_db.ready_seed_length
            )
        ).start()

        # Start spider threads
        for index in range(task.spider_num):
            Thread(
                # name=f"xxxx_spider_task:{index}",
                target=spider.spider_task,
                args=(
                    stop, func, item,
                    redis_db.del_seed
                )
            ).start()

        # Start the storer thread (if a storer was configured)
        if storer:
            Thread(
                # name=f"xxxx_store_task:{storer.table}",
                target=storer.store_task,
                args=(
                    stop, last,
                    redis_db.reset_seed,
                    redis_db.del_seed
                )
            ).start()

        Thread(
            # name="check_spider",
            target=check,
            args=(
                stop, last, spider,
                scheduler, storer,
                redis_db.ready_seed_length,
                redis_db.spider_queue_length,
            )
        ).start()

    return decorator
|
|
231
|
+
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from cobweb import log, Queue, Seed, Setting
|
|
3
|
+
from cobweb.utils import issubclass_cobweb_inf
|
|
4
|
+
# from pympler import asizeof
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Scheduler:
    # Mixin providing the two scheduling loops (single-mode variant; wait times
    # come from Setting instead of hard-coded constants).

    def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
        # Pull seeds from the backing store and push them into redis, while the
        # ready queue stays below `self.size`.

        inf_name = "SchedulerInterface"
        if not issubclass_cobweb_inf(self.__class__, inf_name):
            raise Exception("not have schedule function!")

        # The "Default" backend has no seed source: mark done and exit at once.
        if self.__class__.__name__ == "Default":
            self.stop = True
            return None

        while not self.stop:
            length = ready_seed_length()
            if length > self.size:
                # Redis already holds enough ready seeds — back off.
                time.sleep(15)

            elif get_scheduler_lock():
                # Only the lock holder schedules, avoiding duplicate inserts.
                seeds = self.schedule()
                add_seed(seeds)

        log.info(f"close thread: schedule_seed")

    def schedule_task(self, stop, get_seed, ready_seed_length):
        # Move seeds from the redis ready queue into the in-memory seed queue
        # consumed by the spider threads.
        time.sleep(3)
        while not stop.is_set():

            if not ready_seed_length():
                # Nothing ready in redis yet.
                time.sleep(Setting.SCHEDULER_WAIT_TIME)
                continue

            if self.queue.length >= self.length:
                # Memory queue full — wait for spiders to drain it.
                time.sleep(Setting.SCHEDULER_BLOCK_TIME)
                continue

            seeds = get_seed(self.length)
            self.queue.push(seeds)
        log.info(f"close thread: schedule_task")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Spider:
    """Worker that pops seeds from the shared memory queue and runs the user
    crawl function over them, dispatching its yielded values (single mode:
    each store item is pushed immediately as a ``[seed, data]`` pair)."""

    def __init__(self, queue, max_retries=5):
        # Counts currently-busy spider threads (one token pushed per active crawl).
        self.spider_in_progress = Queue()
        self.max_retries = max_retries
        self.queue = queue

    def spider_task(self, stop, func, item, del_seed):
        """Run until *stop* is set. For each seed, iterate ``func(item, seed)``:

        - objects with a ``table_name`` attribute are pushed to their store
          queue together with the originating seed;
        - Seed instances (or lists/tuples of seed-likes) are re-queued;
        - a bool sets the seed's success/failure status.

        On success the seed is deleted; on failure/exception its retry counter
        is bumped and it is pushed back; seeds over ``max_retries`` are deleted
        with ``spider_status=False``.
        """
        while not stop.is_set():
            seed = self.queue.pop()
            if not seed:
                time.sleep(Setting.SPIDER_WAIT_TIME)
                continue
            elif seed._retry >= self.max_retries:
                # Exhausted retries: drop the seed, marked as failed.
                del_seed(seed, spider_status=False)
                continue
            try:
                self.spider_in_progress.push(1, direct_insertion=True)
                # log.info("spider seed: " + str(seed))
                ret_count = 0
                status = None
                for it in func(item, seed):
                    ret_count += 1
                    if getattr(it, "table_name", None):
                        store_queue = it.queue()
                        # Seed travels with its data so the storer can ack it.
                        store_queue.push(
                            [seed, it.struct_data],
                            direct_insertion=True
                        )
                    elif isinstance(it, Seed):
                        self.queue.push(it)
                    # idiom: isinstance accepts a tuple of types directly
                    elif isinstance(it, (list, tuple)):
                        self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
                    elif isinstance(it, bool):
                        status = it

                if status:
                    del_seed(seed, spider_status=True)
                elif not ret_count or status is False:
                    # Yielded nothing, or explicitly reported failure: retry.
                    seed._retry += 1
                    self.queue.push(seed)

            except Exception as e:
                seed._retry += 1
                self.queue.push(seed)
                log.info(f"{str(seed)} -> {str(e)}")
            finally:
                self.spider_in_progress.pop()
        log.info(f"close thread: spider")
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class Storer:
    # Mixin providing the upload loop (single-mode variant: queue items are
    # [seed, data] pairs pushed by Spider.spider_task).

    def store_task(self, stop, last, reset_seed, del_seed):
        # Drain the upload queue in batches of up to `self.length` and hand the
        # data to the backend's `store`; delete the seeds on success, reset them
        # for retry on failure.

        inf_name = "StorerInterface"
        if not issubclass_cobweb_inf(self.__class__, inf_name):
            return None

        if not getattr(self, "store", None):
            raise Exception("not have store function!")

        storer_name = self.__class__.__name__ + self.table

        while not stop.is_set():

            # Flush either on the final-flush signal or when a full batch is queued.
            if last.is_set() or self.queue.length >= self.length:
                seeds, data_list = [], []

                for _ in range(self.length):
                    items = self.queue.pop()
                    if not items:
                        break
                    # Each queue entry is a [seed, data] pair.
                    seed, data = items
                    seeds.append(seed)
                    data_list.append(data)

                if data_list:
                    if self.store(data_list):
                        del_seed(seeds)
                    else:
                        reset_seed(seeds)
                        log.info("reset seeds!")
                    continue

            time.sleep(3)

        log.info(f"close thread: {storer_name}")
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import threading
|
|
3
|
+
|
|
4
|
+
from equip.single import Seed, DBItem
|
|
5
|
+
from equip.single import struct_queue_name, restore_table_name
|
|
6
|
+
from equip.single import Distributor, Scheduler, Spider, Storer
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def init_task_seed(seeds):
    """Yield ``Seed`` objects built from *seeds*.

    :param seeds: a list/tuple of seed-likes, a single str/dict seed, or falsy.
    :return: generator of ``Seed``; yields nothing for falsy or unrecognized input.
    """
    if not seeds:
        return None
    # idiom: isinstance accepts a tuple of types directly
    if isinstance(seeds, (list, tuple)):
        for seed in seeds:
            yield Seed(seed)
    elif isinstance(seeds, (str, dict)):
        yield Seed(seeds)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def parse_storer_info(storer_info):
    """Group storer configuration entries by their DB class name.

    :param storer_info: a single ``StorerInfo`` (duck-typed by class name) or a
        list/tuple of them; each entry's first field is the DB class, the rest
        are per-table arguments.
    :return: dict mapping DB class name -> {"StorerDB": cls, "db_args_list":
        [tuple of the remaining fields, one per entry]}.
    """
    storer_data = {}
    storer_info_list = []
    if storer_info.__class__.__name__ == 'StorerInfo':
        storer_info_list.append(storer_info)
    # idiom: isinstance accepts a tuple of types directly
    elif isinstance(storer_info, (tuple, list)):
        storer_info_list = storer_info
    for info in storer_info_list:
        db_name = info.DB.__name__
        # One bucket per DB class; multiple tables for the same DB share it.
        storer_data.setdefault(db_name, {"StorerDB": info.DB, "db_args_list": []})
        storer_data[db_name]["db_args_list"].append(info[1:])
    return storer_data
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def check(stop_event, last_event, distributor, scheduler, spider, storer_list):
    # Monitor thread (nest variant): waits until the scheduler is done, the
    # shared seed queue is empty and no spider is busy, then signals the final
    # flush and exits once every storer queue has drained.
    while True:
        time.sleep(3)
        if (
            scheduler.stop and
            not distributor.seed_queue.length and
            not spider.spider_in_progress.length
        ):
            # Signal storers to flush, then give them time to drain.
            last_event.set()
            time.sleep(10)
            storer_queue_empty = True
            for storer in storer_list:
                if storer.queue.length:
                    storer_queue_empty = False
                    break
            if storer_queue_empty:
                break
            last_event.clear()
    # All queues drained: stop every worker thread.
    stop_event.set()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def cobweb(task):
    """
    Task launch decorator: wires up and starts the whole single-process crawl.
    :param task: task configuration object (start_seed, spider_num,
                 SchedulerInfo, storer_info, ...)
    """
    def decorator(func):
        """
        Launch the pipeline around the user-supplied crawl function.

        func(Item, seed)
        Item:
            Item.Textfile()
            Item.Console()
        """
        # project task_name start_seed spider_num queue_length scheduler_info storer_info

        storer_list = []

        # Event signalling the whole program should finish.
        last_event = threading.Event()
        # Event signalling spiders should pause collecting.
        stop_event = threading.Event()

        # Create the queue distributor shared by all components.
        distributor = Distributor()

        # Scheduler via dynamic inheritance: mix the configured DB backend
        # into the generic Scheduler at runtime.
        SchedulerDB, table, sql, length, size = task.SchedulerInfo
        SchedulerTmp = type('Scheduler', (Scheduler, SchedulerDB), {})

        # Initialize the scheduler.
        scheduler = SchedulerTmp(table=table, sql=sql, length=length, size=size, queue=distributor.seed_queue)

        # Initialize the spider (collector).
        spider = Spider(queue=distributor.seed_queue)

        # Parse storer configuration, grouped by DB backend.
        storer_data = parse_storer_info(task.storer_info)

        # Dynamic container class exposing one DBItem subclass per backend
        # (e.g. item.Console, item.Textfile) to the user function.
        item = type("item", (object,), {})
        for db_name in storer_data.keys():
            # Storer via dynamic inheritance, same pattern as the scheduler.
            StorerDB = storer_data[db_name]["StorerDB"]
            StorerTmp = type('Storer', (Storer, StorerDB), {})
            db_args_list = storer_data[db_name]["db_args_list"]
            for storer_db_args in db_args_list:
                # NOTE(review): `table` and `length` shadow the SchedulerInfo
                # values unpacked above; safe here because the scheduler is
                # already constructed, but confirm this is intentional.
                table, fields, length = storer_db_args
                if not getattr(item, db_name, None):
                    instance = type(db_name, (DBItem,), {})
                    setattr(item, db_name, instance)
                # Register the (table, fields) layout on the item class.
                getattr(item, db_name).init_item(table, fields)
                # Create the storage queue for this backend/table pair.
                storer_queue = struct_queue_name(db_name, table)
                distributor.create_queue(queue_name=storer_queue)
                queue = distributor.get_queue(queue_name=storer_queue)
                # Initialize the storer against the restored table name.
                table_name = restore_table_name(table_name=table)
                storer = StorerTmp(table=table_name, fields=fields, length=length, queue=queue)
                storer_list.append(storer)

        # Push the initial seeds into the seed queue.
        distributor.distribute(init_task_seed, seeds=task.start_seed)

        # Start the scheduler thread.
        threading.Thread(
            target=scheduler.schedule_task,
            args=(distributor.distribute,),
            name="single_scheduler_task"
        ).start()

        # Start the spider (collector) threads.
        for index in range(task.spider_num):
            threading.Thread(
                target=spider.spider_task,
                args=(stop_event, distributor.distribute, func, item),
                name=f"single_spider_task:{index}"
            ).start()

        # Start one storer thread per configured storer.
        for storer in storer_list:
            threading.Thread(
                target=storer.store_task,
                args=(stop_event, last_event, distributor.distribute),
                name=f"single_store_task:{storer.table}",
            ).start()

        # Watchdog thread that shuts everything down once drained.
        threading.Thread(
            target=check, name="check",
            args=(
                stop_event, last_event, distributor,
                scheduler, spider, storer_list
            )
        ).start()

        # NOTE(review): decorator returns None, so the decorated name is
        # rebound to None after launch -- confirm this is the intended API.
        # return starter(task, func)
    return decorator
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
|
|
@@ -9,6 +9,9 @@ def init_task_env():
|
|
|
9
9
|
Setting.SPIDER_RUN_TIME = float(os.getenv("SPIDER_RUN_TIME", 0.2))
|
|
10
10
|
Setting.DEAL_MODEL = os.getenv("DEAL_MODEL", DealModel.failure)
|
|
11
11
|
Setting.LAUNCHER_MODEL = os.getenv("LAUNCHER_MODEL", LauncherModel.task)
|
|
12
|
+
Setting.SCHEDULER_WAIT_TIME = float(os.getenv("SCHEDULER_WAIT_TIME", 5))
|
|
13
|
+
Setting.SCHEDULER_BLOCK_TIME = float(os.getenv("SCHEDULER_BLOCK_TIME", 3))
|
|
14
|
+
Setting.SPIDER_RUN_TIME = float(os.getenv("SPIDER_RUN_TIME", 3))
|
|
12
15
|
|
|
13
16
|
|
|
14
17
|
class Task:
|
|
@@ -7,6 +7,7 @@ cobweb/constant.py
|
|
|
7
7
|
cobweb/decorators.py
|
|
8
8
|
cobweb/interface.py
|
|
9
9
|
cobweb/log.py
|
|
10
|
+
cobweb/setting.py
|
|
10
11
|
cobweb/task.py
|
|
11
12
|
cobweb/utils.py
|
|
12
13
|
cobweb/db/__init__.py
|
|
@@ -18,7 +19,11 @@ cobweb/db/scheduler/textfile.py
|
|
|
18
19
|
cobweb/db/storer/__init__.py
|
|
19
20
|
cobweb/db/storer/console.py
|
|
20
21
|
cobweb/db/storer/loghub.py
|
|
22
|
+
cobweb/db/storer/redis.py
|
|
21
23
|
cobweb/db/storer/textfile.py
|
|
24
|
+
cobweb/distributed/__init__.py
|
|
25
|
+
cobweb/distributed/launcher.py
|
|
26
|
+
cobweb/distributed/models.py
|
|
22
27
|
cobweb/equip/__init__.py
|
|
23
28
|
cobweb/equip/dev/__init__.py
|
|
24
29
|
cobweb/equip/dev/launcher.py
|
|
@@ -29,6 +34,10 @@ cobweb/equip/distributed/models.py
|
|
|
29
34
|
cobweb/equip/single/__init__.py
|
|
30
35
|
cobweb/equip/single/launcher.py
|
|
31
36
|
cobweb/equip/single/models.py
|
|
37
|
+
cobweb/single/__init__.py
|
|
38
|
+
cobweb/single/launcher.py
|
|
39
|
+
cobweb/single/models.py
|
|
40
|
+
cobweb/single/nest.py
|
|
32
41
|
cobweb_launcher.egg-info/PKG-INFO
|
|
33
42
|
cobweb_launcher.egg-info/SOURCES.txt
|
|
34
43
|
cobweb_launcher.egg-info/dependency_links.txt
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cobweb-launcher-0.1.16/cobweb/equip/dev → cobweb-launcher-0.1.17/cobweb/distributed}/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cobweb-launcher-0.1.16 → cobweb-launcher-0.1.17}/cobweb_launcher.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|