cobweb-launcher 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
- cobweb/db/redis_db.py +5 -3
- cobweb/distributed/launcher.py +14 -51
- cobweb/distributed/models.py +7 -3
- cobweb/single/launcher.py +231 -0
- cobweb/single/models.py +95 -63
- cobweb/utils.py +38 -36
- {cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/METADATA +1 -1
- {cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/RECORD +11 -10
- {cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/WHEEL +0 -0
- {cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/top_level.txt +0 -0
cobweb/db/redis_db.py
CHANGED
@@ -151,17 +151,19 @@ class RedisDB:
     @check_redis_status
     def check_spider_queue(self, stop, storer_num):
         while not stop.is_set():
-            # acquire the check lock every 15s, wait up to 600s
+            # acquire the check lock every 15s; if it is still not acquired after waiting 600s, retry; once acquired, set the lock TTL to ${cs_lct}s
             if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
                 heartbeat = True if self.client.exists(self.heartbeat_key) else False
-                # reset the score on restart
+                # reset the score on restart; otherwise fetch scores from ${rs_time} minutes ago
                 score = -int(time.time()) + self.rs_time if heartbeat else "-inf"

                 keys = self.client.keys(self.storer_key % "*")
+
                 if keys and len(keys) >= storer_num:
                     intersection_key = self.storer_key % "intersection"
                     self.client.delete(intersection_key)
                     self.client.zinterstore(intersection_key, keys)
+
                     while True:
                         members = self.client.zrange(intersection_key, 0, 1999)
                         if not members:
@@ -192,7 +194,7 @@ class RedisDB:
                 self.client.setex(self.heartbeat_key, 15, "")

             # self.client.delete(self.check_lock)
-            time.sleep(3)
+            # time.sleep(3)

     @check_redis_status
     def set_heartbeat(self, stop):
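The expanded comment spells out the lock semantics: the checker keeps retrying for up to 600 seconds and, once it wins the check lock, keys it to a TTL of cs_lct seconds. Below is a minimal sketch of that acquire-with-TTL pattern written directly against redis-py; the client, key name, and timings are illustrative and this is not cobweb's internal _get_lock implementation.

import time
import redis

client = redis.Redis()  # assumes a local Redis instance, for illustration only

def get_lock(key, ttl=60, timeout=600, sleep_time=3):
    """Try to take `key` as a lock with a TTL; give up after `timeout` seconds."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        # SET key 1 NX EX ttl -> True only for the caller that created the key
        if client.set(key, "1", nx=True, ex=ttl):
            return True
        time.sleep(sleep_time)
    return False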
cobweb/distributed/launcher.py
CHANGED
@@ -1,50 +1,15 @@
 import time
 import threading
 from threading import Thread
-from importlib import import_module

-from cobweb import log, Queue, DBItem, RedisDB, OssDB, StorerInterface
-from cobweb.utils import struct_queue_name, restore_table_name
-from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
 from .models import Scheduler, Spider, Storer
-
-
-
-
-
-
-
-            obj = getattr(model, db)
-        else:
-            model = import_module(f"cobweb.db.scheduler.{db.lower()}")
-            obj = getattr(model, db.capitalize())
-        return obj
-        # if db.lower() in dir(StorerDB):
-        #     return getattr(StorerDB, db)
-        # else:
-        #     pass
-    elif issubclass(db, StorerInterface):
-        return db
-    raise TypeError()
-
-
-def get_storer_db(db):
-    if isinstance(db, str):
-        if "." in db:
-            model_path = db.split(".")
-            model = import_module(db)
-            obj = getattr(model, db)
-        else:
-            model = import_module(f"cobweb.db.storer.{db.lower()}")
-            obj = getattr(model, db.capitalize())
-        return obj, db.lower()
-        # if db.lower() in dir(StorerDB):
-        #     return getattr(StorerDB, db)
-        # else:
-        #     pass
-    elif issubclass(db, StorerInterface):
-        return db, db.__name__.lower()
-    raise TypeError()
+from cobweb import log, Queue, DBItem, RedisDB
+from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
+from cobweb.utils import (
+    struct_queue_name as sqn,
+    restore_table_name as rtn,
+    parse_import_model as pim,
+)


 def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
@@ -164,9 +129,9 @@ def launcher(task):
         size = task.scheduler_info.get("size")
         scheduler_config = task.scheduler_info.get("config")
         scheduler_db = task.scheduler_info.get("db", "default")
-        DB =
+        DB, class_name = pim(scheduler_db, "scheduler")
         # SchedulerDB, table, sql, length, size, config = task.scheduler_info
-        SchedulerTmp = type(
+        SchedulerTmp = type(class_name, (Scheduler, DB), {})

         # initialize the scheduler
         scheduler = SchedulerTmp(
@@ -185,18 +150,16 @@ def launcher(task):
         # new item
         item = type("Item", (object,), {"redis_client": redis_db.client})()

-        if task.oss_config:
-            item.oss = OssDB(**task.oss_config)
-
         for storer_info in storer_info_list:
             storer_db = storer_info["db"]
             fields = storer_info["fields"]
             storer_table = storer_info.get("table", "console")
             storer_config = storer_info.get("config")

-            StorerDB,
-            StorerTmp = type(
+            StorerDB, class_name = pim(storer_db, "storer")
+            StorerTmp = type(class_name, (Storer, StorerDB), {})

+            db_name = class_name.lower()
             if not getattr(item, db_name, None):
                 instance = type(db_name, (DBItem,), {})
                 setattr(item, db_name, instance)
@@ -204,10 +167,10 @@
             storer_item_instance = getattr(item, db_name)
             storer_item_instance.init_item(storer_table, fields)

-            storer_queue =
+            storer_queue = sqn(db_name, storer_table)
             queue = getattr(storer_item_instance, storer_queue)
             # initialize the storer
-            table_name =
+            table_name = rtn(table_name=storer_table)
             storer = StorerTmp(
                 table=table_name, fields=fields,
                 length=task.storer_queue_length,
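The old get_scheduler_db/get_storer_db helpers are folded into a single parse_import_model (aliased pim) that returns a (class, name) pair, and the launcher then mixes that class into its own Scheduler/Storer logic with type(). A stand-alone sketch of that composition pattern follows; Scheduler and Console here are local stand-ins, not cobweb's real classes.

# Stand-in classes: in cobweb these would be the Scheduler mixin and the backend class
# returned by pim(scheduler_db, "scheduler").
class Scheduler:
    def run(self):
        return f"scheduling via {self.__class__.__name__}"

class Console:
    pass

DB, class_name = Console, "Console"            # what pim() would hand back
SchedulerTmp = type(class_name, (Scheduler, DB), {})  # dynamic subclass, mixin first in the MRO
print(SchedulerTmp().run())                    # -> "scheduling via Console"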
cobweb/distributed/models.py
CHANGED
@@ -1,6 +1,8 @@
 import time
 from hashlib import md5
-from cobweb import log, Queue, Seed
+from cobweb import log, Queue, Seed
+from utils import issubclass_cobweb_interface
+
 # from pympler import asizeof


@@ -8,7 +10,8 @@ class Scheduler:

     def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):

-
+        inf_name = "SchedulerInterface"
+        if not issubclass_cobweb_interface(self.__class__, inf_name):
             raise Exception("not have schedule function!")

         if self.__class__.__name__ == "Default":
@@ -103,7 +106,8 @@ class Storer:

     def store_task(self, stop, last, reset_seed, set_storer):

-
+        inf_name = "StorerInterface"
+        if not issubclass_cobweb_interface(self.__class__, inf_name):
             return None

         if not getattr(self, "store", None):
cobweb/single/launcher.py
ADDED
@@ -0,0 +1,231 @@
+import time
+import threading
+from threading import Thread
+
+from .models import Scheduler, Spider, Storer
+from cobweb import log, Queue, DBItem, RedisDB
+from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
+from cobweb.utils import (
+    struct_queue_name as sqn,
+    restore_table_name as rtn,
+    parse_import_model as pim,
+)
+
+
+def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
+    log.info("run check thread after 30 seconds...")
+    time.sleep(30)
+    spider_info = """
+------------------- check: {0} ------------------
+redis_spider_seed_length: {1}
+redis_ready_seed_length: {2}
+running_spider_thread_num: {3}
+memory_seed_queue_length: {4}
+storer_queue_length_info: {5}
+----------------------- end -----------------------"""
+    while True:
+        status = "running"
+        running_spider_thread_num = spider.spider_in_progress.length
+        redis_ready_seed_length = ready_seed_length()
+        redis_spider_seed_length = spider_queue_length()
+        memory_seed_queue_length = scheduler.queue.length
+        storer_upload_queue_length = storer.queue.length
+        if (
+            scheduler.stop and
+            # not redis_ready_seed_length and
+            not memory_seed_queue_length and
+            not running_spider_thread_num
+        ):
+            if not MODEL:
+                log.info("spider is done?")
+            last.set()
+            time.sleep(3)
+            storer_queue_empty = True
+            if storer.queue.length:
+                storer_queue_empty = False
+                storer_upload_queue_length = storer.queue.length
+            if (
+                storer_queue_empty and
+                not redis_ready_seed_length and
+                not redis_spider_seed_length
+            ):
+                if MODEL:
+                    log.info("waiting for push seeds...")
+                    status = "waiting"
+                    time.sleep(30)
+                else:
+                    log.info("spider done!")
+                    break
+
+            last.clear()
+
+        log.info(spider_info.format(
+            status,
+            redis_spider_seed_length,
+            redis_ready_seed_length,
+            running_spider_thread_num,
+            memory_seed_queue_length,
+            storer_upload_queue_length
+        ))
+
+        time.sleep(3)
+    stop.set()
+
+
+def launcher(task):
+    """
+    Task launch decorator
+    :param task: task configuration info
+    """
+    def decorator(func):
+        """
+        Item:
+            Textfile()
+            Loghub()
+            Console()
+        e.g.
+            task.fields = "a,b"
+            func(item, seed)
+                a = "a"
+                b = "b"
+                data = {"a": "a", "b": "b"}
+                yield item.Loghub(**data)
+                yield item.Loghub(a=a, b=b)
+        """
+        storer_list = []
+
+        # program-finished event
+        last = threading.Event()
+        # stop-crawling event
+        stop = threading.Event()
+
+        # initialize redis info
+        redis_db = RedisDB(
+            task.project, task.task_name, task.redis_info,
+            model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
+        )
+
+        # new item
+        item = type("Item", (object,), {"redis_client": redis_db.client})()
+
+        log.info("initialize cobweb!")
+
+        seed_queue = Queue()
+
+        scheduler_info = task.scheduler_info or dict()
+
+        # dynamic inheritance for the scheduler
+        sql = scheduler_info.get("sql")
+        table = scheduler_info.get("table")
+        size = scheduler_info.get("size")
+        scheduler_config = scheduler_info.get("config")
+        scheduler_db = scheduler_info.get("db", "default")
+        DB, class_name = pim(scheduler_db, "scheduler")
+        # SchedulerDB, table, sql, length, size, config = task.scheduler_info
+        SchedulerTmp = type(class_name, (Scheduler, DB), {})
+
+        # initialize the scheduler
+        scheduler = SchedulerTmp(
+            table=table, sql=sql, size=size, queue=seed_queue,
+            length=task.scheduler_queue_length, config=scheduler_config
+        )
+
+        # initialize the spider
+        spider = Spider(seed_queue, task.max_retries)
+
+        storer = None
+
+        # parse storer info
+        storer_info = task.storer_info or dict()
+
+        # for storer_info in storer_info_list:
+        if storer_info:
+            storer_db = storer_info["db"]
+            fields = storer_info["fields"]
+            storer_table = storer_info.get("table", "console")
+            storer_config = storer_info.get("config")
+
+            StorerDB, class_name = pim(storer_db, "storer")
+            StorerTmp = type(class_name, (Storer, StorerDB), {})
+
+            db_name = class_name.lower()
+            if not getattr(item, db_name, None):
+                instance = type(db_name, (DBItem,), {})
+                setattr(item, db_name, instance)
+
+            storer_item_instance = getattr(item, db_name)
+            storer_item_instance.init_item(storer_table, fields)
+
+            storer_queue = sqn(db_name, storer_table)
+            queue = getattr(storer_item_instance, storer_queue)
+            # initialize the storer
+            table_name = rtn(table_name=storer_table)
+            storer = StorerTmp(
+                table=table_name, fields=fields,
+                length=task.storer_queue_length,
+                queue=queue, config=storer_config
+            )
+
+        Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
+        Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
+
+        # push the initial seeds
+        # seeds = start_seeds(task.start_seed)
+        redis_db.add_seed(task.seeds)
+        # start the scheduler: push seeds to the redis queue
+        Thread(
+            # name="xxxx_schedule_seeds",
+            target=scheduler.schedule_seed,
+            args=(
+                redis_db.ready_seed_length,
+                redis_db.get_scheduler_lock,
+                redis_db.add_seed
+            )
+        ).start()
+
+        # start the scheduler: schedule the task queue
+        Thread(
+            # name="xxxx_schedule_task",
+            target=scheduler.schedule_task,
+            args=(
+                stop, redis_db.get_seed,
+                redis_db.ready_seed_length
+            )
+        ).start()
+
+        # start the spiders
+        for index in range(task.spider_num):
+            Thread(
+                # name=f"xxxx_spider_task:{index}",
+                target=spider.spider_task,
+                args=(
+                    stop, func, item,
+                    redis_db.del_seed
+                )
+            ).start()
+
+        # start the storer
+        if storer:
+            Thread(
+                # name=f"xxxx_store_task:{storer.table}",
+                target=storer.store_task,
+                args=(
+                    stop, last,
+                    redis_db.reset_seed,
+                    redis_db.set_storer
+                )
+            ).start()
+
+        Thread(
+            # name="check_spider",
+            target=check,
+            args=(
+                stop, last, spider,
+                scheduler, storer,
+                redis_db.ready_seed_length,
+                redis_db.spider_queue_length,
+            )
+        ).start()
+
+    return decorator
+
cobweb/single/models.py
CHANGED
@@ -1,104 +1,136 @@
 import time
-
-from
-from single.nest import struct_queue_name
-from single.nest import SchedulerInterface, StorerInterface
-
-
-# class Transceiver:
-class Distributor:
-
-    def __init__(self):
-        self.seed_queue = Queue()
+from cobweb import log, Queue, Seed
+from utils import issubclass_cobweb_interface

-
-    def queue_names(self):
-        return tuple(self.__dict__.keys())
+# from pympler import asizeof

-    @property
-    def used_memory(self):
-        return asizeof.asizeof(self)

-
-        self.__setattr__(queue_name, Queue())
+class Scheduler:

-    def
-        return self.__getattribute__(queue_name)
+    def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):

-
-
-
-            self.seed_queue.push(item)
-        elif getattr(item, "table_name", None):
-            queue_name = struct_queue_name(icn, item.table_name)
-            getattr(self, queue_name).push(item.serialization)
+        inf_name = "SchedulerInterface"
+        if not issubclass_cobweb_interface(self.__class__, inf_name):
+            raise Exception("not have schedule function!")

-
-
-        if not iterable:
+        if self.__class__.__name__ == "Default":
+            self.stop = True
             return None
-        for result in iterable:
-            self.deal_item(result)
-        return True
-

-
+        while not self.stop:
+            length = ready_seed_length()
+            if length > self.size:
+                time.sleep(15)

-
+            elif get_scheduler_lock():
+                seeds = self.schedule()
+                add_seed(seeds)

-
-        return None
+        log.info(f"close thread: schedule_seed")

-
-
+    def schedule_task(self, stop, get_seed, ready_seed_length):
+        time.sleep(3)
+        while not stop.is_set():

-
+            if not ready_seed_length():
+                time.sleep(15)
+                continue

-        if self.queue.length
-
+            if self.queue.length >= self.length:
+                time.sleep(3)
+                continue

-
-
-
+            seeds = get_seed(self.length)
+            self.queue.push(seeds)
+        log.info(f"close thread: schedule_task")


 class Spider:

-    def __init__(self, queue):
-        self.queue = queue
+    def __init__(self, queue, max_retries=5):
         self.spider_in_progress = Queue()
+        self.max_retries = max_retries
+        self.queue = queue

-    def spider_task(self,
-        while not
+    def spider_task(self, stop, func, item, del_seed):
+        while not stop.is_set():
             seed = self.queue.pop()
             if not seed:
                 time.sleep(3)
                 continue
+            elif seed._retry >= self.max_retries:
+                del_seed(seed, spider_status=False)
+                continue
             try:
-                self.spider_in_progress.push(1)
-
+                self.spider_in_progress.push(1, direct_insertion=True)
+                # log.info("spider seed: " + str(seed))
+                status = None
+                for it in func(item, seed):
+                    if getattr(it, "table_name", None):
+                        store_queue = it.queue()
+                        store_queue.push(
+                            [seed, it.struct_data],
+                            direct_insertion=True
+                        )
+                    elif isinstance(it, Seed):
+                        self.queue.push(it)
+                    elif any(isinstance(it, t) for t in (list, tuple)):
+                        self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
+                    elif isinstance(it, bool):
+                        status = it
+                    elif it is None:
+                        status = False
+
+                if status is not None:
+                    if status:
+                        del_seed(seed, spider_status=True)
+                    else:
+                        seed._retry += 1
+                        self.queue.push(seed)
+
             except Exception as e:
-
+                seed._retry += 1
+                self.queue.push(seed)
+                log.info(f"{str(seed)} -> {str(e)}")
             finally:
                 self.spider_in_progress.pop()
+        log.info(f"close thread: spider")


 class Storer:

-    def store_task(self,
+    def store_task(self, stop, last, reset_seed, del_seed):

-
+        inf_name = "StorerInterface"
+        if not issubclass_cobweb_interface(self.__class__, inf_name):
             return None

         if not getattr(self, "store", None):
             raise Exception("not have store function!")

-
-
-
-
-
-
+        storer_name = self.__class__.__name__ + self.table
+
+        while not stop.is_set():
+
+            if last.is_set() or self.queue.length >= self.length:
+                seeds, data_list = [], []
+
+                for _ in range(self.length):
+                    items = self.queue.pop()
+                    if not items:
+                        break
+                    seed, data = items
+                    seeds.append(seed)
                     data_list.append(data)
+
                 if data_list:
-
+                    if self.store(data_list):
+                        del_seed(seeds)
+                    else:
+                        reset_seed(seeds)
+                        log.info("reset seeds!")
+                    continue
+
+            time.sleep(3)
+
+        log.info(f"close thread: {storer_name}")
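The rewritten spider_task interprets each value the user callback yields: anything with a table_name is pushed to its storer queue, a Seed (or a list/tuple of seeds) goes back onto the crawl queue, a bool records success or failure, and None counts as a failure that bumps seed._retry up to max_retries. Below is a hypothetical callback exercising that protocol; the URLs are placeholders.

from cobweb import Seed  # exported by the package, per the imports shown above

def parse(item, seed):
    # pretend the page at seed.url linked to two follow-up pages
    for link in ("https://example.com/a", "https://example.com/b"):
        yield Seed(link)   # Seed -> re-queued for crawling
    yield True             # bool -> success; spider_task calls del_seed(seed, spider_status=True)
    # yielding False or None would instead bump seed._retry and push the seed back for a retry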
cobweb/utils.py
CHANGED
@@ -1,11 +1,9 @@
 import json
+import re
 import sys
+from abc import ABC
 from typing import Iterable
-
-import requests
-
-
-# from cobweb import Seed
+from importlib import import_module


 def struct_table_name(table_name):
@@ -20,27 +18,6 @@ def struct_queue_name(db_name, table_name):
     return sys.intern(f"__{db_name}_{table_name}_queue__")


-# class StorerDB:
-#
-#     @staticmethod
-#     def console(self):
-#         from db.storer.console import Console
-#         table = struct_table_name(table)
-#         return StorerInfo(DB=Console, table=table, length=length, config=None)
-#
-#     @staticmethod
-#     def textfile(table, length=200):
-#         from db.storer.textfile import Textfile
-#         table = struct_table_name(table)
-#         return StorerInfo(DB=Textfile, table=table, length=length, config=None)
-#
-#     @staticmethod
-#     def loghub(table, length=200, config=None):
-#         from db.storer.loghub import Loghub
-#         table = struct_table_name(table)
-#         return StorerInfo(DB=Loghub, table=table, length=length, config=config)
-
-
 def parse_info(info):
     if not info:
         return info
@@ -74,15 +51,40 @@ def struct_start_seeds(seeds):
     return Seed(seeds)


-
-
-
-
-
-
-
-
-
-
+def issubclass_cobweb_interface(_class, inf_name):
+    for _c in _class.__mro__[1:]:
+        if _c.__name__ == inf_name:
+            return True
+    return False
+
+
+def parse_import_model(model_info, model_type=None):
+    if model_type not in ["scheduler", "storer"]:
+        raise TypeError("model_type must be in scheduler, storer")
+    if isinstance(model_info, str):
+        if "import" in model_info:
+            model_path, class_name = re.search(
+                r"from (.*?) import (.*?)$", model_info
+            ).groups()
+            model = import_module(model_path)
+            class_object = getattr(model, class_name)
+        elif "." in model_info:
+            info_list = model_info.split(".")
+            class_name = info_list[-1]
+            model_path = ".".join(info_list[:-1])
+            model = import_module(model_path)
+            class_object = getattr(model, class_name)
+        else:
+            model_path = f"cobweb.db.{model_type}.{model_info.lower()}"
+            class_name = model_info.capitalize()
+            model = import_module(model_path)
+            class_object = getattr(model, class_name)
+        return class_object, class_name
+    elif issubclass(model_info, ABC):
+        inf_name = model_type.capitalize() + "Interface"
+        if issubclass_cobweb_interface(model_info, inf_name):
+            return model_info, model_info.__name__
+        raise ImportError()
+    raise TypeError()


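The new parse_import_model accepts three string spellings as well as an interface subclass, and resolves each to a (class_object, class_name) pair. Assuming the bundled Loghub storer follows the package's capitalized-class naming convention, the three string forms resolve like this:

from cobweb.utils import parse_import_model

# 1. a full import statement
parse_import_model("from cobweb.db.storer.loghub import Loghub", "storer")
# 2. a dotted path ending in the class name
parse_import_model("cobweb.db.storer.loghub.Loghub", "storer")
# 3. a bare backend name, expanded to cobweb.db.storer.loghub and capitalized
parse_import_model("loghub", "storer")
# each call returns (Loghub, "Loghub")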
{cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/RECORD
CHANGED
@@ -5,10 +5,10 @@ cobweb/interface.py,sha256=um_k2AAQl1HTOvfUlq914DjkpfZVwt2m1B65EpPKrmE,802
 cobweb/log.py,sha256=Gb3_y4IzTo5pJohTggBCU9rK6-ZN3hgTOHkoXHyN6CU,2384
 cobweb/setting.py,sha256=UAu_dLuIFYO98MxtlZ5sZqJcwKAUKq4Bu4KoKlV50Mc,288
 cobweb/task.py,sha256=77F5EaopSVlSX2TANv1lhuPHFI8ER8Jh4tSGrwDWAc0,1405
-cobweb/utils.py,sha256=
+cobweb/utils.py,sha256=5ZcSfT3VCWF_Kl1APGYJYHtPhZPnr45nFqyyZA9LlOg,2663
 cobweb/db/__init__.py,sha256=4m9lqmxZCRbaih3Z3rl_BT0GugMd0dkOIgu_P9aeC84,63
 cobweb/db/oss_db.py,sha256=lFGNuH3tdIMsohVXQ_fTZPyBfS2oxYNmFNuQ-ZBQgm0,4221
-cobweb/db/redis_db.py,sha256=
+cobweb/db/redis_db.py,sha256=yoWy-GI0rjVmT-68Che-pypfqNwNti5JGkc9bYvJH2o,8202
 cobweb/db/scheduler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cobweb/db/scheduler/default.py,sha256=OxmFX7OvMEhKEq-NF7A8I9cA4V4qWw5vayS-yIbng0A,114
 cobweb/db/scheduler/textfile.py,sha256=atRDeNT-e5toNvyGsCXAxL1FJi77uSYktdCzH_hXGo8,821
@@ -18,13 +18,14 @@ cobweb/db/storer/loghub.py,sha256=4VqZacXWhidzINHXQu2_-E0HOBRCcc86f6LkKfnXD5I,17
 cobweb/db/storer/redis.py,sha256=7Q2XEQwBL6X_M1uvxzzuSBt6iw9piKw-_FWKm2INZDQ,412
 cobweb/db/storer/textfile.py,sha256=3mDHMvF6Sh5fn3IHzWQxyTUd45V-zUoH8vY3EoRlMx0,415
 cobweb/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cobweb/distributed/launcher.py,sha256=
-cobweb/distributed/models.py,sha256=
+cobweb/distributed/launcher.py,sha256=jTtBXBmna_6yFdj6gyGQiiEtg8I0g5uI5h8kbHWt454,7998
+cobweb/distributed/models.py,sha256=Z46-Dwv--VLUr9t7wbNHaLIuEK0l-copJEbc-abfjb8,4617
 cobweb/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cobweb/single/
+cobweb/single/launcher.py,sha256=Z-PaX1f_nphdBwbIW1Ki6D_4xsrFmx94SO3BXaexYAg,7304
+cobweb/single/models.py,sha256=xIn0mefT_oMVynn3V6S4wkOCx6PGOemT4_fcuU7CSZs,4297
 cobweb/single/nest.py,sha256=mL8q9a5BjtoeUyzXCIVw_vyUsNY8ltbvQpYIIpZEDFU,5012
-cobweb_launcher-0.1.
-cobweb_launcher-0.1.
-cobweb_launcher-0.1.
-cobweb_launcher-0.1.
-cobweb_launcher-0.1.
+cobweb_launcher-0.1.3.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
+cobweb_launcher-0.1.3.dist-info/METADATA,sha256=vr4nnPQWibda20Nnn2KB2k7U81pWZLvVh8402NIhEgU,1225
+cobweb_launcher-0.1.3.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+cobweb_launcher-0.1.3.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
+cobweb_launcher-0.1.3.dist-info/RECORD,,
{cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/LICENSE
File without changes
{cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/WHEEL
File without changes
{cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/top_level.txt
File without changes