cobweb-launcher 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cobweb-launcher might be problematic. Click here for more details.
- cobweb/launchers/launcher.py +1 -2
- {cobweb_launcher-1.0.1.dist-info → cobweb_launcher-1.0.2.dist-info}/METADATA +1 -1
- cobweb_launcher-1.0.2.dist-info/RECORD +32 -0
- cobweb/bbb.py +0 -191
- cobweb/config.py +0 -164
- cobweb/db/oss_db.py +0 -128
- cobweb/db/scheduler/__init__.py +0 -1
- cobweb/db/scheduler/default.py +0 -8
- cobweb/db/scheduler/textfile.py +0 -27
- cobweb/db/storer/__init__.py +0 -1
- cobweb/db/storer/console.py +0 -9
- cobweb/db/storer/loghub.py +0 -54
- cobweb/db/storer/redis.py +0 -15
- cobweb/db/storer/textfile.py +0 -15
- cobweb/decorators.py +0 -16
- cobweb/distributed/__init__.py +0 -0
- cobweb/distributed/launcher.py +0 -243
- cobweb/distributed/models.py +0 -143
- cobweb/equip/__init__.py +0 -8
- cobweb/equip/dev/__init__.py +0 -0
- cobweb/equip/dev/launcher.py +0 -202
- cobweb/equip/dev/models.py +0 -156
- cobweb/equip/distributed/__init__.py +0 -0
- cobweb/equip/distributed/launcher.py +0 -219
- cobweb/equip/distributed/models.py +0 -158
- cobweb/equip/download/__init__.py +0 -0
- cobweb/equip/download/launcher.py +0 -203
- cobweb/equip/download/models.py +0 -169
- cobweb/equip/single/__init__.py +0 -0
- cobweb/equip/single/launcher.py +0 -203
- cobweb/equip/single/models.py +0 -173
- cobweb/interface.py +0 -34
- cobweb/log.py +0 -96
- cobweb/new.py +0 -20
- cobweb/single/__init__.py +0 -0
- cobweb/single/launcher.py +0 -231
- cobweb/single/models.py +0 -134
- cobweb/single/nest.py +0 -153
- cobweb/task.py +0 -61
- cobweb/utils.py +0 -90
- cobweb_launcher-1.0.1.dist-info/RECORD +0 -69
- {cobweb_launcher-1.0.1.dist-info → cobweb_launcher-1.0.2.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.0.1.dist-info → cobweb_launcher-1.0.2.dist-info}/WHEEL +0 -0
- {cobweb_launcher-1.0.1.dist-info → cobweb_launcher-1.0.2.dist-info}/top_level.txt +0 -0
|
@@ -1,203 +0,0 @@
|
|
|
1
|
-
import time
|
|
2
|
-
import threading
|
|
3
|
-
|
|
4
|
-
from .. import log, sqn, rtn, pim
|
|
5
|
-
from .. import Queue, DBItem, RedisDB, Setting, OssDB
|
|
6
|
-
from .models import Scheduler, Spider, Storer
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
    """
    Monitor loop that logs crawl counters and decides when the run is over.

    After an initial 30 second sleep, each iteration samples the counters
    below, logs them through ``spider_info`` and sleeps 3 seconds. When the
    loop is left, ``stop`` is set.

    :param stop: object exposing ``set()``; set once after the loop exits
    :param last: object exposing ``set()`` / ``clear()``; set while the
        scheduler/spider side looks idle, cleared again on each iteration
    :param spider: object whose ``spider_in_progress.length`` is read
    :param scheduler: object whose ``stop`` flag and ``queue.length`` are read
    :param storer: optional object whose ``queue.length`` is read; may be falsy
    :param ready_seed_length: zero-argument callable returning a count
    :param spider_queue_length: zero-argument callable returning a count
    """
    # NOTE(review): block nesting below was reconstructed from a rendering
    # that dropped all indentation - confirm against the original file.
    log.info("run check thread after 30 seconds...")
    time.sleep(30)
    # NOTE(review): leading whitespace inside this literal is not recoverable
    # from the rendering; the lines are assumed to start at column 0.
    spider_info = """
------------------- check: {0} ------------------
redis_spider_seed_length: {1}
redis_ready_seed_length: {2}
running_spider_thread_num: {3}
memory_seed_queue_length: {4}
storer_queue_length_info: {5}
----------------------- end -----------------------"""
    while True:
        status = "running"
        # Snapshot every counter once per iteration; the same snapshot is
        # used for the decisions below and for the log report.
        running_spider_thread_num = spider.spider_in_progress.length
        redis_ready_seed_length = ready_seed_length()
        redis_spider_seed_length = spider_queue_length()
        memory_seed_queue_length = scheduler.queue.length
        storer_upload_queue_length = storer.queue.length if storer else None
        # Scheduler flagged stop, nothing queued in memory, nothing in progress.
        if (
            scheduler.stop and
            not memory_seed_queue_length and
            not running_spider_thread_num
        ):
            if not Setting.LAUNCHER_MODEL:
                log.info("spider is done?")
            # Store.store_task reads ``last`` to decide whether to flush a
            # partially filled batch.
            last.set()
            time.sleep(3)
            storer_queue_empty = True
            if storer and storer.queue.length:
                storer_queue_empty = False
            # NOTE(review): assumed to sit after (not inside) the ``if`` above,
            # since it repeats the ``if storer else None`` guard - confirm.
            storer_upload_queue_length = storer.queue.length if storer else None
            # These use the redis counters sampled at the top of the
            # iteration, not fresh reads.
            if (
                storer_queue_empty and
                not redis_ready_seed_length and
                not redis_spider_seed_length
            ):
                if Setting.LAUNCHER_MODEL:
                    # Truthy LAUNCHER_MODEL: keep running and wait for seeds.
                    log.info("waiting for push seeds...")
                    status = "waiting"
                    time.sleep(30)
                else:
                    log.info("spider done!")
                    break

        # NOTE(review): assumed to be at loop level (runs every iteration that
        # does not ``break``) - confirm against the original indentation.
        last.clear()

        log.info(spider_info.format(
            status,
            redis_spider_seed_length,
            redis_ready_seed_length,
            running_spider_thread_num,
            memory_seed_queue_length,
            storer_upload_queue_length
        ))

        time.sleep(3)
    # Only reached through the ``break`` above.
    stop.set()
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
def launcher(task):
    """
    Task launch decorator.

    Returns a decorator that, when applied to ``func``, builds the redis
    client, scheduler, optional storer and spider from ``task`` and starts
    their worker threads immediately.

    The inner ``decorator`` has no ``return`` statement, so the decorated
    name is bound to ``None``.

    :param task: task configuration; the attributes read here are
        ``project``, ``task_name``, ``redis_info``, ``oss_config``,
        ``scheduler_info``, ``scheduler_queue_length``, ``storer_info``,
        ``storer_queue_length``, ``max_retries``, ``seeds`` and ``spider_num``
    """
    # NOTE(review): block nesting below was reconstructed from a rendering
    # that dropped all indentation - confirm against the original file.
    def decorator(func):
        # Event marking that the program is finishing
        last = threading.Event()
        # Event marking that collection should stop
        stop = threading.Event()

        # Initialise redis info
        redis_db = RedisDB(task.project, task.task_name, task.redis_info)

        # new item: an instance of an ad-hoc class carrying the redis client;
        # it is passed to ``func`` by the spider threads started below.
        item = type("Item", (object,), {"redis_client": redis_db.client})()
        if task.oss_config:
            item.oss = OssDB(**task.oss_config)

        log.info("初始化cobweb!")

        # Shared between the scheduler (producer) and the spider (consumer).
        seed_queue = Queue()

        scheduler_info = task.scheduler_info or dict()
        # Dynamic inheritance for the scheduler
        sql = scheduler_info.get("sql")
        table = scheduler_info.get("table")
        size = scheduler_info.get("size")
        scheduler_config = scheduler_info.get("config")
        scheduler_db = scheduler_info.get("db", "default")
        # ``pim`` returns a (class, class name) pair for the chosen backend.
        DB, class_name = pim(scheduler_db, "scheduler")
        # SchedulerDB, table, sql, length, size, config = task.scheduler_info
        SchedulerTmp = type(class_name, (Scheduler, DB), {})
        # Initialise the scheduler
        scheduler = SchedulerTmp(
            table=table, sql=sql, size=size, queue=seed_queue,
            length=task.scheduler_queue_length, config=scheduler_config
        )

        storer = None
        storer_info = task.storer_info or dict()

        if storer_info:
            # "db" and "fields" are required keys (KeyError if missing);
            # "table" and "config" are optional.
            storer_db = storer_info["db"]
            fields = storer_info["fields"]
            storer_table = storer_info.get("table", "console")
            storer_config = storer_info.get("config")

            StorerDB, class_name = pim(storer_db, "storer")
            StorerTmp = type(class_name, (Storer, StorerDB), {})

            db_name = class_name.lower()
            if not getattr(item, db_name, None):
                # What gets attached to ``item`` is a DBItem subclass (a
                # class object), despite the local name ``instance``.
                instance = type(db_name, (DBItem,), {})
                setattr(item, db_name, instance)

            storer_item_instance = getattr(item, db_name)
            storer_item_instance.init_item(storer_table, fields)

            # ``sqn`` yields the attribute name under which the queue is
            # looked up on the DBItem subclass.
            storer_queue = sqn(db_name, storer_table)
            queue = getattr(storer_item_instance, storer_queue)
            # Initialise the storer
            table_name = rtn(table_name=storer_table)
            storer = StorerTmp(
                table=table_name, fields=fields,
                length=task.storer_queue_length,
                queue=queue, config=storer_config
            )

        # Initialise the spider; ``storer and True`` is ``None`` when no
        # storer was configured, otherwise ``True`` for a truthy storer.
        spider = Spider(seed_queue, storer and True, task.max_retries)

        threading.Thread(target=redis_db.check_spider_queue, args=(stop, 0)).start()
        threading.Thread(target=redis_db.set_heartbeat, args=(stop,)).start()

        # Push the initial seeds
        # seeds = start_seeds(task.start_seed)
        redis_db.add_seed(task.seeds)
        # Start the scheduler: schedule seeds into the redis queue
        threading.Thread(
            # name="xxxx_schedule_seeds",
            target=scheduler.schedule_seed,
            args=(
                redis_db.ready_seed_length,
                redis_db.get_scheduler_lock,
                redis_db.add_seed
            )
        ).start()

        # Start the scheduler: schedule the task queue
        threading.Thread(
            # name="xxxx_schedule_task",
            target=scheduler.schedule_task,
            args=(
                stop, redis_db.get_seed,
                redis_db.ready_seed_length
            )
        ).start()

        # Start the spiders: ``task.spider_num`` threads sharing one Spider
        for index in range(task.spider_num):
            threading.Thread(
                # name=f"xxxx_spider_task:{index}",
                target=spider.spider_task,
                args=(
                    stop, func, item,
                    redis_db.del_seed,
                    redis_db.add_seed
                )
            ).start()

        # Start the storer
        if storer:
            threading.Thread(
                # name=f"xxxx_store_task:{storer.table}",
                target=storer.store_task,
                args=(
                    stop, last,
                    redis_db.reset_seed,
                    redis_db.del_seed
                )
            ).start()

        # Monitor thread; it is the one that eventually sets ``stop``.
        threading.Thread(
            # name="check_spider",
            target=check,
            args=(
                stop, last, spider,
                scheduler, storer,
                redis_db.ready_seed_length,
                redis_db.spider_queue_length,
            )
        ).start()

    return decorator
|
|
203
|
-
|
cobweb/equip/download/models.py
DELETED
|
@@ -1,169 +0,0 @@
|
|
|
1
|
-
import time
|
|
2
|
-
from inspect import isgenerator
|
|
3
|
-
# from pympler import asizeof
|
|
4
|
-
from .. import log, ici
|
|
5
|
-
from .. import DealModel, Queue, Seed, Setting
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class Scheduler:
    """
    Scheduling loops intended to run in their own threads.

    The methods read ``self.stop``, ``self.size``, ``self.length``,
    ``self.queue`` and call ``self.schedule()``; none of these are defined
    in this class - presumably they come from a class combined with this
    one via multiple inheritance (verify against the caller).
    """
    # NOTE(review): block nesting below was reconstructed from a rendering
    # that dropped all indentation - confirm against the original file.

    def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
        """
        Repeatedly call ``self.schedule()`` and hand the result to ``add_seed``.

        :param ready_seed_length: zero-argument callable returning a count
        :param get_scheduler_lock: zero-argument callable; scheduling only
            happens when it returns a truthy value
        :param add_seed: callable receiving whatever ``self.schedule()`` returns
        :raises Exception: when ``ici(self.__class__, "SchedulerInterface")``
            is falsy
        :return: ``None``
        """

        inf_name = "SchedulerInterface"
        if not ici(self.__class__, inf_name):
            raise Exception("not have schedule function!")

        # A class literally named "Default" does no scheduling: flag stop
        # and leave immediately.
        if self.__class__.__name__ == "Default":
            self.stop = True
            return None

        while not self.stop:
            length = ready_seed_length()
            # Back off while the ready count exceeds ``self.size``.
            if length > self.size:
                time.sleep(15)

            elif get_scheduler_lock():
                seeds = self.schedule()
                add_seed(seeds)
            # NOTE(review): when neither branch fires (count within size and
            # lock not obtained) the loop iterates again without sleeping.

        log.info(f"close thread: schedule_seed")

    def schedule_task(self, stop, get_seed, ready_seed_length):
        """
        Move seeds obtained from ``get_seed`` into ``self.queue`` until ``stop`` is set.

        :param stop: object exposing ``is_set()``; ends the loop when set
        :param get_seed: callable taking ``self.length`` and returning seeds
        :param ready_seed_length: zero-argument callable returning a count
        """
        time.sleep(3)
        while not stop.is_set():

            # Nothing ready: wait and re-check.
            if not ready_seed_length():
                time.sleep(Setting.SCHEDULER_WAIT_TIME)
                continue

            # In-memory queue already holds ``self.length`` or more items.
            if self.queue.length >= self.length:
                time.sleep(Setting.SCHEDULER_BLOCK_TIME)
                continue

            seeds = get_seed(self.length)
            self.queue.push(seeds)
        log.info(f"close thread: schedule_task")
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
class Spider:
    """
    Worker that pops seeds from a queue, runs a user function on each and
    dispatches whatever the function yields.
    """
    # NOTE(review): block nesting below was reconstructed from a rendering
    # that dropped all indentation - confirm against the original file.

    def __init__(self, queue, storage, max_retries=5):
        """
        :param queue: seed queue; ``pop()`` and ``push()`` are called on it
        :param storage: truthy when results are expected to be stored
        :param max_retries: seeds whose ``_retry`` reaches this are dropped
        """
        # One entry is pushed per seed being processed; ``check`` reads its
        # ``length`` as the number of running spiders.
        self.spider_in_progress = Queue()
        self.max_retries = max_retries
        self.storage = storage
        self.queue = queue

    def spider_task(self, stop, func, item, del_seed, add_seed):
        """
        Process seeds until ``stop`` is set.

        :param stop: object exposing ``is_set()``; ends the loop when set
        :param func: called as ``func(item, seed)``; expected to return a generator
        :param item: passed through to ``func`` unchanged
        :param del_seed: called as ``del_seed(seed, spider_status=<bool>)``
        :param add_seed: called with the list of ``Seed`` objects yielded by ``func``
        """
        while not stop.is_set():

            seed = self.queue.pop()

            if not seed:
                time.sleep(Setting.SPIDER_WAIT_TIME)
                continue

            # Retry budget exhausted: report failure and move on.
            elif seed._retry >= self.max_retries:
                del_seed(seed, spider_status=False)
                continue

            try:
                self.spider_in_progress.push(1, direct_insertion=True)
                log.info("spider seed: " + str(seed))

                store_queue = None
                store_data = list()
                add_seed_list = list()
                iterators = func(item, seed)

                if not isgenerator(iterators):
                    # A plain return value counts as success only when no
                    # storage is configured; otherwise it is an error.
                    if not self.storage:
                        del_seed(seed, spider_status=True)
                        continue
                    raise TypeError(f"{func.__name__} isn't a generator")

                # Stays None if the generator yields nothing.
                status = None
                for it in iterators:
                    status = True
                    # Objects with a truthy ``table_name`` are treated as
                    # storable items.
                    if getattr(it, "table_name", None):
                        if not store_queue:
                            store_queue = it.queue()
                        store_data.append(it.struct_data)
                    elif isinstance(it, Seed):
                        add_seed_list.append(it)

                    # String sentinels end the iteration early.
                    elif isinstance(it, str) and it == DealModel.polling:
                        self.queue.push(seed)
                        break
                    elif isinstance(it, str) and it == DealModel.success:
                        del_seed(seed, spider_status=True)
                        break
                    elif isinstance(it, str) and it == DealModel.failure:
                        del_seed(seed, spider_status=False)
                        break
                    else:
                        raise TypeError("yield value type error!")

                if not status:
                    raise Exception("yield value type error!")

                if store_queue and store_data:
                    # The seed is appended after its data, so it rides along
                    # as the last element of the pushed batch.
                    store_data.append(seed)
                    store_queue.push(store_data)

                if add_seed_list:
                    del_seed(seed, spider_status=True)
                    add_seed(add_seed_list)

            except Exception as e:
                # Any failure: count a retry and put the seed back.
                seed._retry += 1
                self.queue.push(seed)
                log.info(f"{str(seed)} -> {str(e)}")
            finally:
                # Runs on every exit from the try body, including ``continue``.
                self.spider_in_progress.pop()
                # NOTE(review): assumed to be inside ``finally``; it could
                # equally sit after the try statement - confirm.
                time.sleep(Setting.SPIDER_SLEEP_TIME)
        log.info(f"close thread: spider")
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
class Storer:
    """
    Storage loop intended to run in its own thread.

    The method reads ``self.table``, ``self.queue``, ``self.length`` and
    calls ``self.store``; none of these are defined in this class -
    presumably they come from a class combined with this one via multiple
    inheritance (verify against the caller).
    """
    # NOTE(review): block nesting below was reconstructed from a rendering
    # that dropped all indentation - confirm against the original file.

    def store_task(self, stop, last, reset_seed, del_seed):
        """
        Drain ``self.queue`` in batches and pass them to ``self.store`` until ``stop`` is set.

        :param stop: object exposing ``is_set()``; ends the loop when set
        :param last: object exposing ``is_set()``; when set, a batch is
            stored even if fewer than ``self.length`` items are queued
        :param reset_seed: called with the batch's seeds when ``self.store``
            returns a falsy value
        :param del_seed: called with the batch's seeds when ``self.store``
            returns a truthy value
        :raises Exception: when the instance has no truthy ``store`` attribute
        :return: ``None``
        """

        inf_name = "StorerInterface"
        # Returns silently (no log, no error) when the check is falsy.
        if not ici(self.__class__, inf_name):
            return None

        if not getattr(self, "store", None):
            raise Exception("not have store function!")

        storer_name = self.__class__.__name__ + self.table

        while not stop.is_set():

            storer_length = self.queue.length
            if not storer_length:
                time.sleep(5)
                continue
            # Wait for a full batch unless ``last`` has been set.
            elif not last.is_set() and storer_length < self.length:
                time.sleep(3)
                continue

            seeds, data_list = [], []

            while True:
                data = self.queue.pop()
                if not data:
                    break
                # Seeds are collected separately from data items; the batch
                # size is only checked when a Seed is popped.
                if isinstance(data, Seed):
                    seeds.append(data)
                    if len(data_list) >= self.length:
                        break
                    continue
                data_list.append(data)

            if self.store(data_list):
                del_seed(seeds)
            else:
                reset_seed(seeds)

        log.info(f"close thread: {storer_name}")
|
cobweb/equip/single/__init__.py
DELETED
|
File without changes
|
cobweb/equip/single/launcher.py
DELETED
|
@@ -1,203 +0,0 @@
|
|
|
1
|
-
import time
|
|
2
|
-
import threading
|
|
3
|
-
|
|
4
|
-
from .. import log, sqn, rtn, pim
|
|
5
|
-
from .. import Queue, DBItem, RedisDB, Setting, OssDB
|
|
6
|
-
from .models import Scheduler, Spider, Storer
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
    """
    Monitor loop that logs crawl counters and decides when the run is over.

    After an initial 30 second sleep, each iteration samples the counters
    below, logs them through ``spider_info`` and sleeps 3 seconds. When the
    loop is left, ``stop`` is set.

    :param stop: object exposing ``set()``; set once after the loop exits
    :param last: object exposing ``set()`` / ``clear()``; set while the
        scheduler/spider side looks idle, cleared again on each iteration
    :param spider: object whose ``spider_in_progress.length`` is read
    :param scheduler: object whose ``stop`` flag and ``queue.length`` are read
    :param storer: optional object whose ``queue.length`` is read; may be falsy
    :param ready_seed_length: zero-argument callable returning a count
    :param spider_queue_length: zero-argument callable returning a count
    """
    # NOTE(review): block nesting below was reconstructed from a rendering
    # that dropped all indentation - confirm against the original file.
    log.info("run check thread after 30 seconds...")
    time.sleep(30)
    # NOTE(review): leading whitespace inside this literal is not recoverable
    # from the rendering; the lines are assumed to start at column 0.
    spider_info = """
------------------- check: {0} ------------------
redis_spider_seed_length: {1}
redis_ready_seed_length: {2}
running_spider_thread_num: {3}
memory_seed_queue_length: {4}
storer_queue_length_info: {5}
----------------------- end -----------------------"""
    while True:
        status = "running"
        # Snapshot every counter once per iteration; the same snapshot is
        # used for the decisions below and for the log report.
        running_spider_thread_num = spider.spider_in_progress.length
        redis_ready_seed_length = ready_seed_length()
        redis_spider_seed_length = spider_queue_length()
        memory_seed_queue_length = scheduler.queue.length
        storer_upload_queue_length = storer.queue.length if storer else None
        # Scheduler flagged stop, nothing queued in memory, nothing in progress.
        if (
            scheduler.stop and
            not memory_seed_queue_length and
            not running_spider_thread_num
        ):
            if not Setting.LAUNCHER_MODEL:
                log.info("spider is done?")
            last.set()
            time.sleep(3)
            storer_queue_empty = True
            if storer and storer.queue.length:
                storer_queue_empty = False
            # NOTE(review): assumed to sit after (not inside) the ``if`` above,
            # since it repeats the ``if storer else None`` guard - confirm.
            storer_upload_queue_length = storer.queue.length if storer else None
            # These use the redis counters sampled at the top of the
            # iteration, not fresh reads.
            if (
                storer_queue_empty and
                not redis_ready_seed_length and
                not redis_spider_seed_length
            ):
                if Setting.LAUNCHER_MODEL:
                    # Truthy LAUNCHER_MODEL: keep running and wait for seeds.
                    log.info("waiting for push seeds...")
                    status = "waiting"
                    time.sleep(30)
                else:
                    log.info("spider done!")
                    break

        # NOTE(review): assumed to be at loop level (runs every iteration that
        # does not ``break``) - confirm against the original indentation.
        last.clear()

        log.info(spider_info.format(
            status,
            redis_spider_seed_length,
            redis_ready_seed_length,
            running_spider_thread_num,
            memory_seed_queue_length,
            storer_upload_queue_length
        ))

        time.sleep(3)
    # Only reached through the ``break`` above.
    stop.set()
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
def launcher(task):
    """
    Task launch decorator.

    Returns a decorator that, when applied to ``func``, builds the redis
    client, scheduler, optional storer and spider from ``task`` and starts
    their worker threads immediately.

    The inner ``decorator`` has no ``return`` statement, so the decorated
    name is bound to ``None``.

    :param task: task configuration; the attributes read here are
        ``project``, ``task_name``, ``redis_info``, ``oss_config``,
        ``scheduler_info``, ``scheduler_queue_length``, ``storer_info``,
        ``storer_queue_length``, ``max_retries``, ``seeds`` and ``spider_num``
    """
    # NOTE(review): block nesting below was reconstructed from a rendering
    # that dropped all indentation - confirm against the original file.
    def decorator(func):
        # Event marking that the program is finishing
        last = threading.Event()
        # Event marking that collection should stop
        stop = threading.Event()

        # Initialise redis info
        redis_db = RedisDB(task.project, task.task_name, task.redis_info)

        # new item: an instance of an ad-hoc class carrying the redis client;
        # it is passed to ``func`` by the spider threads started below.
        item = type("Item", (object,), {"redis_client": redis_db.client})()
        if task.oss_config:
            item.oss = OssDB(**task.oss_config)

        log.info("初始化cobweb!")

        # Shared between the scheduler (producer) and the spider (consumer).
        seed_queue = Queue()

        scheduler_info = task.scheduler_info or dict()
        # Dynamic inheritance for the scheduler
        sql = scheduler_info.get("sql")
        table = scheduler_info.get("table")
        size = scheduler_info.get("size")
        scheduler_config = scheduler_info.get("config")
        scheduler_db = scheduler_info.get("db", "default")
        # ``pim`` returns a (class, class name) pair for the chosen backend.
        DB, class_name = pim(scheduler_db, "scheduler")
        # SchedulerDB, table, sql, length, size, config = task.scheduler_info
        SchedulerTmp = type(class_name, (Scheduler, DB), {})
        # Initialise the scheduler
        scheduler = SchedulerTmp(
            table=table, sql=sql, size=size, queue=seed_queue,
            length=task.scheduler_queue_length, config=scheduler_config
        )

        storer = None
        storer_info = task.storer_info or dict()

        if storer_info:
            # "db" and "fields" are required keys (KeyError if missing);
            # "table" and "config" are optional.
            storer_db = storer_info["db"]
            fields = storer_info["fields"]
            storer_table = storer_info.get("table", "console")
            storer_config = storer_info.get("config")

            StorerDB, class_name = pim(storer_db, "storer")
            StorerTmp = type(class_name, (Storer, StorerDB), {})

            db_name = class_name.lower()
            if not getattr(item, db_name, None):
                # What gets attached to ``item`` is a DBItem subclass (a
                # class object), despite the local name ``instance``.
                instance = type(db_name, (DBItem,), {})
                setattr(item, db_name, instance)

            storer_item_instance = getattr(item, db_name)
            storer_item_instance.init_item(storer_table, fields)

            # ``sqn`` yields the attribute name under which the queue is
            # looked up on the DBItem subclass.
            storer_queue = sqn(db_name, storer_table)
            queue = getattr(storer_item_instance, storer_queue)
            # Initialise the storer
            table_name = rtn(table_name=storer_table)
            storer = StorerTmp(
                table=table_name, fields=fields,
                length=task.storer_queue_length,
                queue=queue, config=storer_config
            )

        # Initialise the spider; ``storer and True`` is ``None`` when no
        # storer was configured, otherwise ``True`` for a truthy storer.
        spider = Spider(seed_queue, storer and True, task.max_retries)

        threading.Thread(target=redis_db.check_spider_queue, args=(stop, 0)).start()
        threading.Thread(target=redis_db.set_heartbeat, args=(stop,)).start()

        # Push the initial seeds
        # seeds = start_seeds(task.start_seed)
        redis_db.add_seed(task.seeds)
        # Start the scheduler: schedule seeds into the redis queue
        threading.Thread(
            # name="xxxx_schedule_seeds",
            target=scheduler.schedule_seed,
            args=(
                redis_db.ready_seed_length,
                redis_db.get_scheduler_lock,
                redis_db.add_seed
            )
        ).start()

        # Start the scheduler: schedule the task queue
        threading.Thread(
            # name="xxxx_schedule_task",
            target=scheduler.schedule_task,
            args=(
                stop, redis_db.get_seed,
                redis_db.ready_seed_length
            )
        ).start()

        # Start the spiders: ``task.spider_num`` threads sharing one Spider
        for index in range(task.spider_num):
            threading.Thread(
                # name=f"xxxx_spider_task:{index}",
                target=spider.spider_task,
                args=(
                    stop, func, item,
                    redis_db.del_seed,
                    redis_db.add_seed
                )
            ).start()

        # Start the storer
        if storer:
            threading.Thread(
                # name=f"xxxx_store_task:{storer.table}",
                target=storer.store_task,
                args=(
                    stop, last,
                    redis_db.reset_seed,
                    redis_db.del_seed
                )
            ).start()

        # Monitor thread; it is the one that eventually sets ``stop``.
        threading.Thread(
            # name="check_spider",
            target=check,
            args=(
                stop, last, spider,
                scheduler, storer,
                redis_db.ready_seed_length,
                redis_db.spider_queue_length,
            )
        ).start()

    return decorator
|
|
203
|
-
|