cobweb-launcher 0.1.7__py3-none-any.whl → 1.2.41__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- cobweb/__init__.py +2 -11
- cobweb/base/__init__.py +9 -0
- cobweb/base/basic.py +297 -0
- cobweb/base/common_queue.py +30 -0
- cobweb/base/decorators.py +40 -0
- cobweb/base/dotting.py +35 -0
- cobweb/base/item.py +46 -0
- cobweb/{log.py → base/log.py} +4 -6
- cobweb/base/request.py +82 -0
- cobweb/base/response.py +23 -0
- cobweb/base/seed.py +114 -0
- cobweb/constant.py +94 -0
- cobweb/crawlers/__init__.py +1 -0
- cobweb/crawlers/base_crawler.py +144 -0
- cobweb/crawlers/crawler.py +209 -0
- cobweb/crawlers/file_crawler.py +98 -0
- cobweb/db/__init__.py +2 -2
- cobweb/db/api_db.py +82 -0
- cobweb/db/redis_db.py +125 -218
- cobweb/exceptions/__init__.py +1 -0
- cobweb/exceptions/oss_db_exception.py +28 -0
- cobweb/launchers/__init__.py +3 -0
- cobweb/launchers/launcher.py +235 -0
- cobweb/launchers/launcher_air.py +88 -0
- cobweb/launchers/launcher_api.py +209 -0
- cobweb/launchers/launcher_pro.py +208 -0
- cobweb/pipelines/__init__.py +3 -0
- cobweb/pipelines/pipeline.py +69 -0
- cobweb/pipelines/pipeline_console.py +22 -0
- cobweb/pipelines/pipeline_loghub.py +34 -0
- cobweb/schedulers/__init__.py +3 -0
- cobweb/schedulers/scheduler_api.py +72 -0
- cobweb/schedulers/scheduler_redis.py +72 -0
- cobweb/setting.py +67 -6
- cobweb/utils/__init__.py +5 -0
- cobweb/utils/bloom.py +58 -0
- cobweb/utils/dotting.py +32 -0
- cobweb/utils/oss.py +94 -0
- cobweb/utils/tools.py +42 -0
- cobweb_launcher-1.2.41.dist-info/METADATA +205 -0
- cobweb_launcher-1.2.41.dist-info/RECORD +44 -0
- {cobweb_launcher-0.1.7.dist-info → cobweb_launcher-1.2.41.dist-info}/WHEEL +1 -1
- cobweb/bbb.py +0 -191
- cobweb/db/oss_db.py +0 -127
- cobweb/db/scheduler/__init__.py +0 -0
- cobweb/db/scheduler/default.py +0 -8
- cobweb/db/scheduler/textfile.py +0 -27
- cobweb/db/storer/__init__.py +0 -0
- cobweb/db/storer/console.py +0 -9
- cobweb/db/storer/loghub.py +0 -54
- cobweb/db/storer/redis.py +0 -15
- cobweb/db/storer/textfile.py +0 -15
- cobweb/decorators.py +0 -16
- cobweb/distributed/__init__.py +0 -0
- cobweb/distributed/launcher.py +0 -243
- cobweb/distributed/models.py +0 -143
- cobweb/interface.py +0 -34
- cobweb/single/__init__.py +0 -0
- cobweb/single/launcher.py +0 -231
- cobweb/single/models.py +0 -134
- cobweb/single/nest.py +0 -153
- cobweb/task.py +0 -50
- cobweb/utils.py +0 -90
- cobweb_launcher-0.1.7.dist-info/METADATA +0 -45
- cobweb_launcher-0.1.7.dist-info/RECORD +0 -31
- {cobweb_launcher-0.1.7.dist-info → cobweb_launcher-1.2.41.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.1.7.dist-info → cobweb_launcher-1.2.41.dist-info}/top_level.txt +0 -0
cobweb/interface.py
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
from abc import ABC, abstractmethod
|
2
|
-
from .utils import parse_info
|
3
|
-
|
4
|
-
|
5
|
-
class SchedulerInterface(ABC):
    """Abstract base for scheduler back-ends.

    Concrete schedulers must implement :meth:`schedule`. The constructor only
    stores its arguments on the instance; ``config`` is passed through
    ``parse_info`` first, and ``stop`` starts out as ``False``.
    """

    def __init__(self, table, sql, length, size, queue, config=None):
        """Store the scheduler settings on the instance.

        :param table: table identifier kept as ``self.table``
        :param sql: query kept as ``self.sql``
        :param length: kept as ``self.length``
        :param size: kept as ``self.size``
        :param queue: queue object kept as ``self.queue``
        :param config: optional configuration, normalised by ``parse_info``
        """
        self.table, self.sql = table, sql
        self.length, self.size = length, size
        self.queue = queue
        self.config = parse_info(config)
        # Flag polled by the scheduling loops; flipped to True to end them.
        self.stop = False

    @abstractmethod
    def schedule(self, *args, **kwargs):
        """Produce the next batch of seeds (implemented by subclasses)."""
        pass
|
19
|
-
|
20
|
-
|
21
|
-
class StorerInterface(ABC):
    """Abstract base for storer back-ends.

    Concrete storers must implement :meth:`store`. The constructor only stores
    its arguments on the instance; ``config`` is passed through ``parse_info``.
    """

    def __init__(self, table, fields, length, queue, config=None):
        """Store the storer settings on the instance.

        :param table: table identifier kept as ``self.table``
        :param fields: field specification kept as ``self.fields``
        :param length: kept as ``self.length``
        :param queue: queue object kept as ``self.queue``
        :param config: optional configuration, normalised by ``parse_info``
        """
        self.table, self.fields = table, fields
        self.length, self.queue = length, queue
        self.config = parse_info(config)

    @abstractmethod
    def store(self, *args, **kwargs):
        """Persist a batch of data (implemented by subclasses)."""
        pass
|
34
|
-
|
cobweb/single/__init__.py
DELETED
File without changes
|
cobweb/single/launcher.py
DELETED
@@ -1,231 +0,0 @@
|
|
1
|
-
import time
|
2
|
-
import threading
|
3
|
-
from threading import Thread
|
4
|
-
|
5
|
-
from .models import Scheduler, Spider, Storer
|
6
|
-
from cobweb import log, Queue, DBItem, RedisDB
|
7
|
-
from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
|
8
|
-
from cobweb.utils import (
|
9
|
-
struct_queue_name as sqn,
|
10
|
-
restore_table_name as rtn,
|
11
|
-
parse_import_model as pim,
|
12
|
-
)
|
13
|
-
|
14
|
-
|
15
|
-
def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
    """Monitor loop that decides when the crawl is finished and sets ``stop``.

    After an initial 30 second delay the loop samples the queue lengths every
    3 seconds and logs a status report. Once the scheduler has stopped and
    both the in-memory seed queue and the in-progress spider counter are
    empty, ``last`` is set and the storer queue is re-checked after 3 seconds.
    If the storer queue and both redis lengths are empty the loop either
    waits 30 seconds and keeps going (``MODEL`` truthy) or exits
    (``MODEL`` falsy). On exit ``stop`` is set.

    :param stop: ``threading.Event`` set when the loop exits
    :param last: ``threading.Event`` set while the final drain is attempted
    :param spider: object exposing ``spider_in_progress.length``
    :param scheduler: object exposing ``stop`` and ``queue.length``
    :param storer: object exposing ``queue.length``, or ``None`` when the
        task has no storer configured
    :param ready_seed_length: callable returning the redis ready-seed count
    :param spider_queue_length: callable returning the redis spider-seed count
    """

    def storer_backlog():
        # launcher() hands over storer=None when the task defines no
        # storer_info; treat that as an always-empty storer queue instead of
        # crashing the monitor thread with an AttributeError.
        return storer.queue.length if storer is not None else 0

    log.info("run check thread after 30 seconds...")
    time.sleep(30)
    spider_info = (
        "\n"
        "------------------- check: {0} ------------------\n"
        "redis_spider_seed_length: {1}\n"
        "redis_ready_seed_length: {2}\n"
        "running_spider_thread_num: {3}\n"
        "memory_seed_queue_length: {4}\n"
        "storer_queue_length_info: {5}\n"
        "----------------------- end -----------------------"
    )
    while True:
        status = "running"
        running_spider_thread_num = spider.spider_in_progress.length
        redis_ready_seed_length = ready_seed_length()
        redis_spider_seed_length = spider_queue_length()
        memory_seed_queue_length = scheduler.queue.length
        storer_upload_queue_length = storer_backlog()
        if (
            scheduler.stop and
            not memory_seed_queue_length and
            not running_spider_thread_num
        ):
            if not MODEL:
                log.info("spider is done?")
            # Ask the storer to flush whatever it holds, then look again.
            last.set()
            time.sleep(3)
            storer_upload_queue_length = storer_backlog()
            storer_queue_empty = not storer_upload_queue_length
            if (
                storer_queue_empty and
                not redis_ready_seed_length and
                not redis_spider_seed_length
            ):
                if MODEL:
                    log.info("waiting for push seeds...")
                    status = "waiting"
                    time.sleep(30)
                else:
                    log.info("spider done!")
                    break

            last.clear()

        log.info(spider_info.format(
            status,
            redis_spider_seed_length,
            redis_ready_seed_length,
            running_spider_thread_num,
            memory_seed_queue_length,
            storer_upload_queue_length
        ))

        time.sleep(3)
    stop.set()
|
73
|
-
|
74
|
-
|
75
|
-
def launcher(task):
    """
    Task-launch decorator factory.

    Decorating a function starts all worker threads immediately (scheduler,
    spiders, optional storer, redis housekeeping and the ``check`` monitor).

    :param task: task configuration; the attributes read here are
        ``project``, ``task_name``, ``redis_info``, ``scheduler_info``,
        ``storer_info``, ``seeds``, ``spider_num``, ``max_retries``,
        ``scheduler_queue_length`` and ``storer_queue_length``
    :return: the decorator to apply to the spider function
    """
    def decorator(func):
        """
        Start the crawl for ``func`` and return ``func`` unchanged.

        Item:
            Textfile()
            Loghub()
            Console()
        e.g.
            task.fields = "a,b"
            func(item, seed)
                a = "a"
                b = "b"
                data = {"a": "a", "b": "b"}
                yield item.Loghub(**data)
                yield item.Loghub(a=a, b=b)
        """
        # NOTE(review): nothing is ever appended to this list, so the storer
        # count handed to check_spider_queue below is always 0 - confirm
        # whether that is intended before changing it.
        storer_list = []

        # Event: program is finishing (final drain)
        last = threading.Event()
        # Event: stop crawling
        stop = threading.Event()

        # Initialise the redis connection/info
        redis_db = RedisDB(
            task.project, task.task_name, task.redis_info,
            model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
        )

        # Namespace object handed to func; storer item classes hang off it
        item = type("Item", (object,), {"redis_client": redis_db.client})()

        log.info("初始化cobweb!")

        seed_queue = Queue()

        scheduler_info = task.scheduler_info or dict()

        # Build the scheduler class dynamically from Scheduler + DB back-end
        sql = scheduler_info.get("sql")
        table = scheduler_info.get("table")
        size = scheduler_info.get("size")
        scheduler_config = scheduler_info.get("config")
        scheduler_db = scheduler_info.get("db", "default")
        DB, class_name = pim(scheduler_db, "scheduler")
        SchedulerTmp = type(class_name, (Scheduler, DB), {})

        # Initialise the scheduler
        scheduler = SchedulerTmp(
            table=table, sql=sql, size=size, queue=seed_queue,
            length=task.scheduler_queue_length, config=scheduler_config
        )

        # Initialise the spider
        spider = Spider(seed_queue, task.max_retries)

        storer = None

        # Parse the storer configuration
        storer_info = task.storer_info or dict()

        if storer_info:
            storer_db = storer_info["db"]
            fields = storer_info["fields"]
            storer_table = storer_info.get("table", "console")
            storer_config = storer_info.get("config")

            StorerDB, class_name = pim(storer_db, "storer")
            StorerTmp = type(class_name, (Storer, StorerDB), {})

            db_name = class_name.lower()
            if not getattr(item, db_name, None):
                instance = type(db_name, (DBItem,), {})
                setattr(item, db_name, instance)

            storer_item_instance = getattr(item, db_name)
            storer_item_instance.init_item(storer_table, fields)

            storer_queue = sqn(db_name, storer_table)
            queue = getattr(storer_item_instance, storer_queue)
            # Initialise the storer
            table_name = rtn(table_name=storer_table)
            storer = StorerTmp(
                table=table_name, fields=fields,
                length=task.storer_queue_length,
                queue=queue, config=storer_config
            )

        Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
        Thread(target=redis_db.set_heartbeat, args=(stop,)).start()

        # Push the initial seeds
        redis_db.add_seed(task.seeds)
        # Start the scheduler thread that feeds seeds into the redis queue
        Thread(
            target=scheduler.schedule_seed,
            args=(
                redis_db.ready_seed_length,
                redis_db.get_scheduler_lock,
                redis_db.add_seed
            )
        ).start()

        # Start the scheduler thread that feeds the in-memory task queue
        Thread(
            target=scheduler.schedule_task,
            args=(
                stop, redis_db.get_seed,
                redis_db.ready_seed_length
            )
        ).start()

        # Start the spider threads
        for index in range(task.spider_num):
            Thread(
                target=spider.spider_task,
                args=(
                    stop, func, item,
                    redis_db.del_seed
                )
            ).start()

        # Start the storer thread
        if storer:
            Thread(
                target=storer.store_task,
                args=(
                    stop, last,
                    redis_db.reset_seed,
                    redis_db.del_seed
                )
            ).start()

        Thread(
            target=check,
            args=(
                stop, last, spider,
                scheduler, storer,
                redis_db.ready_seed_length,
                redis_db.spider_queue_length,
            )
        ).start()

        # Hand the function back so the decorated name is not rebound to None.
        return func

    return decorator
|
231
|
-
|
cobweb/single/models.py
DELETED
@@ -1,134 +0,0 @@
|
|
1
|
-
import time
|
2
|
-
from cobweb import log, Queue, Seed
|
3
|
-
from cobweb.utils import issubclass_cobweb_inf
|
4
|
-
# from pympler import asizeof
|
5
|
-
|
6
|
-
|
7
|
-
class Scheduler:
    """Mixin holding the two scheduling loops.

    It is combined with a scheduler back-end class at runtime, so the
    attributes used here (``stop``, ``size``, ``length``, ``queue``,
    ``schedule``) are expected to come from that back-end.
    """

    def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
        """Keep producing seeds via ``self.schedule()`` until ``self.stop``.

        :param ready_seed_length: callable returning the ready-seed count
        :param get_scheduler_lock: callable; truthy result allows scheduling
        :param add_seed: callable receiving the seeds from ``schedule()``
        :raises Exception: if no base class is named ``SchedulerInterface``
        """
        cls = self.__class__
        if not issubclass_cobweb_inf(cls, "SchedulerInterface"):
            raise Exception("not have schedule function!")

        # The "Default" back-end schedules nothing: mark as stopped and leave.
        if cls.__name__ == "Default":
            self.stop = True
            return None

        while not self.stop:
            if ready_seed_length() > self.size:
                # Enough seeds are waiting already; back off.
                time.sleep(15)
                continue
            if get_scheduler_lock():
                add_seed(self.schedule())

        log.info("close thread: schedule_seed")

    def schedule_task(self, stop, get_seed, ready_seed_length):
        """Move seeds into ``self.queue`` until ``stop`` is set.

        :param stop: event that ends the loop once set
        :param get_seed: callable taking a count and returning seeds
        :param ready_seed_length: callable returning the ready-seed count
        """
        time.sleep(3)
        while not stop.is_set():
            if not ready_seed_length():
                pause = 15      # nothing ready yet
            elif self.queue.length >= self.length:
                pause = 3       # local queue is full
            else:
                self.queue.push(get_seed(self.length))
                continue
            time.sleep(pause)
        log.info("close thread: schedule_task")
|
45
|
-
|
46
|
-
|
47
|
-
class Spider:
    """Worker that pops seeds from a queue and runs the user function on them."""

    def __init__(self, queue, max_retries=5):
        """
        :param queue: seed queue shared with the scheduler
        :param max_retries: retry count at which a seed is given up on
        """
        self.queue = queue
        self.max_retries = max_retries
        # One entry is held here for every seed currently being processed.
        self.spider_in_progress = Queue()

    def spider_task(self, stop, func, item, del_seed):
        """Process seeds until ``stop`` is set.

        Every value yielded by ``func(item, seed)`` is dispatched by kind:
        objects with a truthy ``table_name`` go to their own store queue,
        ``Seed`` objects and lists/tuples are pushed back as new seeds, and a
        ``bool`` records the final status of the seed.

        :param stop: event that ends the loop once set
        :param func: user spider function called as ``func(item, seed)``
        :param item: item namespace forwarded to ``func``
        :param del_seed: callable used to discard finished/failed seeds
        """
        while not stop.is_set():
            seed = self.queue.pop()
            if not seed:
                time.sleep(3)
                continue
            if seed._retry >= self.max_retries:
                del_seed(seed, spider_status=False)
                continue

            try:
                self.spider_in_progress.push(1, direct_insertion=True)
                yielded = 0
                verdict = None
                for produced in func(item, seed):
                    yielded += 1
                    if getattr(produced, "table_name", None):
                        produced.queue().push(
                            [seed, produced.struct_data],
                            direct_insertion=True
                        )
                    elif isinstance(produced, Seed):
                        self.queue.push(produced)
                    elif isinstance(produced, (list, tuple)):
                        self.queue.push([
                            entry if isinstance(entry, Seed) else Seed(entry)
                            for entry in produced
                        ])
                    elif isinstance(produced, bool):
                        verdict = produced

                if verdict:
                    del_seed(seed, spider_status=True)
                elif yielded == 0 or verdict is False:
                    # Nothing yielded, or an explicit False: retry the seed.
                    seed._retry += 1
                    self.queue.push(seed)

            except Exception as e:
                seed._retry += 1
                self.queue.push(seed)
                log.info(f"{str(seed)} -> {str(e)}")
            finally:
                self.spider_in_progress.pop()
        log.info("close thread: spider")
|
96
|
-
|
97
|
-
|
98
|
-
class Storer:
    """Mixin holding the storing loop.

    It is combined with a storer back-end class at runtime, so the attributes
    used here (``table``, ``length``, ``queue``, ``store``) are expected to
    come from that back-end.
    """

    def store_task(self, stop, last, reset_seed, del_seed):
        """Flush queued ``[seed, data]`` pairs through ``self.store``.

        A batch of up to ``self.length`` entries is taken whenever ``last`` is
        set or the queue holds at least ``self.length`` entries. Seeds of a
        stored batch go to ``del_seed``; seeds of a failed batch go to
        ``reset_seed``.

        :param stop: event that ends the loop once set
        :param last: event requesting a flush of partial batches
        :param reset_seed: callable receiving seeds whose batch failed
        :param del_seed: callable receiving seeds whose batch was stored
        :raises Exception: if the instance has no usable ``store`` attribute
        """
        cls = self.__class__
        if not issubclass_cobweb_inf(cls, "StorerInterface"):
            return None

        if not getattr(self, "store", None):
            raise Exception("not have store function!")

        storer_name = cls.__name__ + self.table

        while not stop.is_set():
            if not (last.is_set() or self.queue.length >= self.length):
                time.sleep(3)
                continue

            seeds, data_list = [], []
            for _ in range(self.length):
                items = self.queue.pop()
                if not items:
                    break
                seed, data = items
                seeds.append(seed)
                data_list.append(data)

            if not data_list:
                time.sleep(3)
                continue

            # A batch was handled: loop again right away without sleeping.
            if self.store(data_list):
                del_seed(seeds)
            else:
                reset_seed(seeds)
                log.info("reset seeds!")

        log.info(f"close thread: {storer_name}")
|
cobweb/single/nest.py
DELETED
@@ -1,153 +0,0 @@
|
|
1
|
-
import time
|
2
|
-
import threading
|
3
|
-
|
4
|
-
from single.nest import Seed, DBItem
|
5
|
-
from single.nest import struct_queue_name, restore_table_name
|
6
|
-
from single.nest import Distributor, Scheduler, Spider, Storer
|
7
|
-
|
8
|
-
|
9
|
-
def init_task_seed(seeds):
    """Yield ``Seed`` objects built from the start-seed specification.

    A list/tuple yields one ``Seed`` per element, a str/dict yields a single
    ``Seed``; falsy input or any other type yields nothing.

    :param seeds: start seeds (list, tuple, str, dict or falsy)
    """
    if not seeds:
        return None
    if isinstance(seeds, (list, tuple)):
        yield from (Seed(entry) for entry in seeds)
    elif isinstance(seeds, (str, dict)):
        yield Seed(seeds)
|
17
|
-
|
18
|
-
|
19
|
-
def parse_storer_info(storer_info):
    """Group storer descriptions by the name of their DB class.

    :param storer_info: a single object whose class is named ``StorerInfo``,
        or a list/tuple of such objects; anything else yields an empty result
    :return: ``{db_name: {"StorerDB": DB, "db_args_list": [info[1:], ...]}}``
    """
    if storer_info.__class__.__name__ == 'StorerInfo':
        infos = [storer_info]
    elif isinstance(storer_info, (tuple, list)):
        infos = storer_info
    else:
        infos = []

    grouped = {}
    for info in infos:
        db_name = info.DB.__name__
        entry = grouped.setdefault(db_name, {"StorerDB": info.DB, "db_args_list": []})
        # Everything after the DB class is the per-table argument tuple.
        entry["db_args_list"].append(info[1:])
    return grouped
|
31
|
-
|
32
|
-
|
33
|
-
def check(stop_event, last_event, distributor, scheduler, spider, storer_list):
    """Monitor loop that sets ``stop_event`` once all work has drained.

    Every 3 seconds it tests whether the scheduler has stopped and both the
    seed queue and the in-progress counter are empty. If so, ``last_event``
    is set, and after 10 seconds the loop ends when no storer has queued
    data; otherwise ``last_event`` is cleared and polling continues.

    :param stop_event: event set when the loop exits
    :param last_event: event set while the final drain is attempted
    :param distributor: object exposing ``seed_queue.length``
    :param scheduler: object exposing ``stop``
    :param spider: object exposing ``spider_in_progress.length``
    :param storer_list: iterable of storers exposing ``queue.length``
    """
    while True:
        time.sleep(3)
        drained = (
            scheduler.stop
            and not distributor.seed_queue.length
            and not spider.spider_in_progress.length
        )
        if not drained:
            continue

        last_event.set()
        time.sleep(10)
        if not any(storer.queue.length for storer in storer_list):
            break
        last_event.clear()
    stop_event.set()
|
52
|
-
|
53
|
-
|
54
|
-
def cobweb(task):
    """
    Task-launch decorator factory.

    Decorating a function starts all worker threads immediately (scheduler,
    spiders, storers and the ``check`` monitor).

    :param task: task configuration; the attributes read here are
        ``SchedulerInfo``, ``storer_info``, ``start_seed`` and ``spider_num``
    :return: the decorator to apply to the spider function
    """
    def decorator(func):
        """
        Start the crawl for ``func`` and return ``func`` unchanged.

        func(Item, seed)
            Item:
                Item.Textfile()
                Item.Console()
        """
        storer_list = []

        # Event: program is finishing (final drain)
        last_event = threading.Event()
        # Event: pause/stop crawling
        stop_event = threading.Event()

        # Create the distributor
        distributor = Distributor()

        # Build the scheduler class dynamically from Scheduler + DB back-end
        SchedulerDB, table, sql, length, size = task.SchedulerInfo
        SchedulerTmp = type('Scheduler', (Scheduler, SchedulerDB), {})

        # Initialise the scheduler
        scheduler = SchedulerTmp(table=table, sql=sql, length=length, size=size, queue=distributor.seed_queue)

        # Initialise the spider
        spider = Spider(queue=distributor.seed_queue)

        # Parse the storer configuration
        storer_data = parse_storer_info(task.storer_info)

        # Namespace class handed to func; storer item classes hang off it
        item = type("item", (object,), {})
        for db_name, db_entry in storer_data.items():
            # Build the storer class dynamically from Storer + DB back-end
            StorerDB = db_entry["StorerDB"]
            StorerTmp = type('Storer', (Storer, StorerDB), {})
            for storer_db_args in db_entry["db_args_list"]:
                table, fields, length = storer_db_args
                if not getattr(item, db_name, None):
                    instance = type(db_name, (DBItem,), {})
                    setattr(item, db_name, instance)
                # Register the table on the item class
                getattr(item, db_name).init_item(table, fields)
                # Create the storer queue
                storer_queue = struct_queue_name(db_name, table)
                distributor.create_queue(queue_name=storer_queue)
                queue = distributor.get_queue(queue_name=storer_queue)
                # Initialise the storer
                table_name = restore_table_name(table_name=table)
                storer = StorerTmp(table=table_name, fields=fields, length=length, queue=queue)
                storer_list.append(storer)

        # Push the initial seeds
        distributor.distribute(init_task_seed, seeds=task.start_seed)

        # Start the scheduler thread
        threading.Thread(
            target=scheduler.schedule_task,
            args=(distributor.distribute,),
            name="single_scheduler_task"
        ).start()

        # Start the spider threads
        for index in range(task.spider_num):
            threading.Thread(
                target=spider.spider_task,
                args=(stop_event, distributor.distribute, func, item),
                name=f"single_spider_task:{index}"
            ).start()

        # Start the storer threads
        for storer in storer_list:
            threading.Thread(
                target=storer.store_task,
                args=(stop_event, last_event, distributor.distribute),
                name=f"single_store_task:{storer.table}",
            ).start()

        threading.Thread(
            target=check, name="check",
            args=(
                stop_event, last_event, distributor,
                scheduler, spider, storer_list
            )
        ).start()

        # Hand the function back so the decorated name is not rebound to None.
        return func

    return decorator
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
cobweb/task.py
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
from .utils import parse_info, struct_start_seeds
|
2
|
-
|
3
|
-
|
4
|
-
class Task:
    """Plain container for the configuration of one crawl task."""

    def __init__(
            self,
            seeds=None,
            project=None,
            task_name=None,
            oss_config=None,
            redis_info=None,
            storer_info=None,
            scheduler_info=None,
            spider_num=None,
            max_retries=None,
            storer_queue_length=None,
            scheduler_queue_length=None,
    ):
        """
        Falsy values fall back to the defaults noted below.

        :param seeds: start seeds, converted by ``struct_start_seeds``
        :param project: project name (default ``"test"``)
        :param task_name: task name (default ``"spider"``)
        :param oss_config: stored unchanged
        :param redis_info: normalised by ``parse_info``
        :param storer_info: normalised by ``parse_info``
        :param scheduler_info: dict(DB="", table="", size="", config=""),
            normalised by ``parse_info``
        :param spider_num: number of spider threads (default 1)
        :param max_retries: retries per seed (default 5)
        :param storer_queue_length: default 100
        :param scheduler_queue_length: default 100
        """
        self.seeds = struct_start_seeds(seeds)

        self.project = project if project else "test"
        self.task_name = task_name if task_name else "spider"
        self.oss_config = oss_config

        self.redis_info = parse_info(redis_info)
        self.storer_info = parse_info(storer_info)
        self.scheduler_info = parse_info(scheduler_info)

        self.spider_num = spider_num if spider_num else 1
        self.max_retries = max_retries if max_retries else 5
        self.storer_queue_length = storer_queue_length if storer_queue_length else 100
        self.scheduler_queue_length = scheduler_queue_length if scheduler_queue_length else 100
|
50
|
-
|
cobweb/utils.py
DELETED
@@ -1,90 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
import re
|
3
|
-
import sys
|
4
|
-
from abc import ABC
|
5
|
-
from typing import Iterable
|
6
|
-
from importlib import import_module
|
7
|
-
|
8
|
-
|
9
|
-
def struct_table_name(table_name):
    """Encode a table name: ``.`` becomes ``__p__`` and ``:`` becomes ``__c__``."""
    encoded = table_name
    for char, token in ((".", "__p__"), (":", "__c__")):
        encoded = encoded.replace(char, token)
    return encoded
|
11
|
-
|
12
|
-
|
13
|
-
def restore_table_name(table_name):
    """Decode a table name: ``__p__`` becomes ``.`` and ``__c__`` becomes ``:``."""
    decoded = table_name
    for token, char in (("__p__", "."), ("__c__", ":")):
        decoded = decoded.replace(token, char)
    return decoded
|
15
|
-
|
16
|
-
|
17
|
-
def struct_queue_name(db_name, table_name):
|
18
|
-
return sys.intern(f"__{db_name}_{table_name}_queue__")
|
19
|
-
|
20
|
-
|
21
|
-
def parse_info(info):
    """Normalise a configuration value into dict/list form.

    * falsy values and dicts are returned unchanged;
    * a str is decoded with ``json.loads``;
    * any other iterable becomes a list whose str elements are JSON-decoded
      and whose dict elements are kept as they are;
    * anything else returns ``None``.

    :raises TypeError: if an iterable holds an element that is neither str
        nor dict
    """
    if not info or isinstance(info, dict):
        return info
    if isinstance(info, str):
        return json.loads(info)
    if not isinstance(info, Iterable):
        return None

    def _normalise(entry):
        if isinstance(entry, str):
            return json.loads(entry)
        if isinstance(entry, dict):
            return entry
        raise TypeError("must be in [str, dict]")

    return [_normalise(entry) for entry in info]
|
42
|
-
|
43
|
-
|
44
|
-
def struct_start_seeds(seeds):
    """Convert the start-seed specification into ``Seed`` objects.

    :param seeds: list/tuple of seed values, a single str/dict, or falsy
    :return: a list of ``Seed`` for list/tuple input, a single ``Seed`` for
        str/dict input, otherwise ``None``
    """
    # Imported inside the function, as in the original code.
    from .bbb import Seed

    if not seeds:
        return None
    if isinstance(seeds, (list, tuple)):
        return list(map(Seed, seeds))
    if isinstance(seeds, (str, dict)):
        return Seed(seeds)
    return None
|
52
|
-
|
53
|
-
|
54
|
-
def issubclass_cobweb_inf(_class, inf_name):
    """Tell whether any base class of ``_class`` is named ``inf_name``.

    Only the ancestors in the MRO are inspected; ``_class`` itself is skipped.

    :param _class: class to inspect
    :param inf_name: class name to look for
    :return: ``True`` if an ancestor carries that name, else ``False``
    """
    ancestors = _class.__mro__[1:]
    return any(base.__name__ == inf_name for base in ancestors)
|
59
|
-
|
60
|
-
|
61
|
-
def parse_import_model(model_info, model_type=None):
    """Resolve a scheduler/storer back-end class.

    ``model_info`` may be:

    * a ``"from <module> import <name>"`` string;
    * a dotted path ``"<module>.<name>"``;
    * a bare name, looked up as
      ``cobweb.db.<model_type>.<name.lower()>.<Name.capitalize()>``;
    * an ``ABC`` subclass having a base class named
      ``SchedulerInterface`` / ``StorerInterface`` (matching ``model_type``).

    :param model_info: import description or class, see above
    :param model_type: ``"scheduler"`` or ``"storer"``
    :return: ``(class_object, class_name)``
    :raises TypeError: for an unknown ``model_type`` or unsupported
        ``model_info``
    :raises ImportError: if a class is given that lacks the expected
        interface base, or if the module cannot be imported
    """
    if model_type not in ["scheduler", "storer"]:
        raise TypeError("model_type must be in scheduler, storer")

    if isinstance(model_info, str):
        # Only treat the string as an import statement when it really has the
        # "from X import Y" shape; a dotted path that merely contains the
        # word "import" (e.g. "importlib.import_module") must not crash on a
        # failed regex match.
        statement = re.search(r"from (.*?) import (.*?)$", model_info)
        if statement:
            model_path, class_name = statement.groups()
        elif "." in model_info:
            model_path, _, class_name = model_info.rpartition(".")
        else:
            model_path = f"cobweb.db.{model_type}.{model_info.lower()}"
            class_name = model_info.capitalize()
        class_object = getattr(import_module(model_path), class_name)
        return class_object, class_name

    elif issubclass(model_info, ABC):
        inf_name = model_type.capitalize() + "Interface"
        if issubclass_cobweb_inf(model_info, inf_name):
            return model_info, model_info.__name__
        raise ImportError()

    raise TypeError()
|
89
|
-
|
90
|
-
|