cobweb-launcher 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/db/redis_db.py +5 -3
- cobweb/distributed/launcher.py +14 -51
- cobweb/distributed/models.py +7 -3
- cobweb/single/launcher.py +231 -0
- cobweb/single/models.py +95 -63
- cobweb/utils.py +38 -36
- {cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/METADATA +1 -1
- {cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/RECORD +11 -10
- {cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/WHEEL +0 -0
- {cobweb_launcher-0.1.2.dist-info → cobweb_launcher-0.1.3.dist-info}/top_level.txt +0 -0
cobweb/db/redis_db.py
CHANGED
@@ -151,17 +151,19 @@ class RedisDB:
|
|
151
151
|
@check_redis_status
|
152
152
|
def check_spider_queue(self, stop, storer_num):
|
153
153
|
while not stop.is_set():
|
154
|
-
# 每15s获取check锁,等待600s
|
154
|
+
# 每15s获取check锁,等待600s后仍获取不到锁则重试;获取到锁后,设置锁的存活时间为${cs_lct}s
|
155
155
|
if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
|
156
156
|
heartbeat = True if self.client.exists(self.heartbeat_key) else False
|
157
|
-
# 重启重制score
|
157
|
+
# 重启重制score值,否则获取${rs_time}分钟前的分数值
|
158
158
|
score = -int(time.time()) + self.rs_time if heartbeat else "-inf"
|
159
159
|
|
160
160
|
keys = self.client.keys(self.storer_key % "*")
|
161
|
+
|
161
162
|
if keys and len(keys) >= storer_num:
|
162
163
|
intersection_key = self.storer_key % "intersection"
|
163
164
|
self.client.delete(intersection_key)
|
164
165
|
self.client.zinterstore(intersection_key, keys)
|
166
|
+
|
165
167
|
while True:
|
166
168
|
members = self.client.zrange(intersection_key, 0, 1999)
|
167
169
|
if not members:
|
@@ -192,7 +194,7 @@ class RedisDB:
|
|
192
194
|
self.client.setex(self.heartbeat_key, 15, "")
|
193
195
|
|
194
196
|
# self.client.delete(self.check_lock)
|
195
|
-
time.sleep(3)
|
197
|
+
# time.sleep(3)
|
196
198
|
|
197
199
|
@check_redis_status
|
198
200
|
def set_heartbeat(self, stop):
|
cobweb/distributed/launcher.py
CHANGED
@@ -1,50 +1,15 @@
|
|
1
1
|
import time
|
2
2
|
import threading
|
3
3
|
from threading import Thread
|
4
|
-
from importlib import import_module
|
5
4
|
|
6
|
-
from cobweb import log, Queue, DBItem, RedisDB, OssDB, StorerInterface
|
7
|
-
from cobweb.utils import struct_queue_name, restore_table_name
|
8
|
-
from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
|
9
5
|
from .models import Scheduler, Spider, Storer
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
obj = getattr(model, db)
|
18
|
-
else:
|
19
|
-
model = import_module(f"cobweb.db.scheduler.{db.lower()}")
|
20
|
-
obj = getattr(model, db.capitalize())
|
21
|
-
return obj
|
22
|
-
# if db.lower() in dir(StorerDB):
|
23
|
-
# return getattr(StorerDB, db)
|
24
|
-
# else:
|
25
|
-
# pass
|
26
|
-
elif issubclass(db, StorerInterface):
|
27
|
-
return db
|
28
|
-
raise TypeError()
|
29
|
-
|
30
|
-
|
31
|
-
def get_storer_db(db):
|
32
|
-
if isinstance(db, str):
|
33
|
-
if "." in db:
|
34
|
-
model_path = db.split(".")
|
35
|
-
model = import_module(db)
|
36
|
-
obj = getattr(model, db)
|
37
|
-
else:
|
38
|
-
model = import_module(f"cobweb.db.storer.{db.lower()}")
|
39
|
-
obj = getattr(model, db.capitalize())
|
40
|
-
return obj, db.lower()
|
41
|
-
# if db.lower() in dir(StorerDB):
|
42
|
-
# return getattr(StorerDB, db)
|
43
|
-
# else:
|
44
|
-
# pass
|
45
|
-
elif issubclass(db, StorerInterface):
|
46
|
-
return db, db.__name__.lower()
|
47
|
-
raise TypeError()
|
6
|
+
from cobweb import log, Queue, DBItem, RedisDB
|
7
|
+
from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
|
8
|
+
from cobweb.utils import (
|
9
|
+
struct_queue_name as sqn,
|
10
|
+
restore_table_name as rtn,
|
11
|
+
parse_import_model as pim,
|
12
|
+
)
|
48
13
|
|
49
14
|
|
50
15
|
def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
|
@@ -164,9 +129,9 @@ def launcher(task):
|
|
164
129
|
size = task.scheduler_info.get("size")
|
165
130
|
scheduler_config = task.scheduler_info.get("config")
|
166
131
|
scheduler_db = task.scheduler_info.get("db", "default")
|
167
|
-
DB =
|
132
|
+
DB, class_name = pim(scheduler_db, "scheduler")
|
168
133
|
# SchedulerDB, table, sql, length, size, config = task.scheduler_info
|
169
|
-
SchedulerTmp = type(
|
134
|
+
SchedulerTmp = type(class_name, (Scheduler, DB), {})
|
170
135
|
|
171
136
|
# 初始化调度器
|
172
137
|
scheduler = SchedulerTmp(
|
@@ -185,18 +150,16 @@ def launcher(task):
|
|
185
150
|
# new item
|
186
151
|
item = type("Item", (object,), {"redis_client": redis_db.client})()
|
187
152
|
|
188
|
-
if task.oss_config:
|
189
|
-
item.oss = OssDB(**task.oss_config)
|
190
|
-
|
191
153
|
for storer_info in storer_info_list:
|
192
154
|
storer_db = storer_info["db"]
|
193
155
|
fields = storer_info["fields"]
|
194
156
|
storer_table = storer_info.get("table", "console")
|
195
157
|
storer_config = storer_info.get("config")
|
196
158
|
|
197
|
-
StorerDB,
|
198
|
-
StorerTmp = type(
|
159
|
+
StorerDB, class_name = pim(storer_db, "storer")
|
160
|
+
StorerTmp = type(class_name, (Storer, StorerDB), {})
|
199
161
|
|
162
|
+
db_name = class_name.lower()
|
200
163
|
if not getattr(item, db_name, None):
|
201
164
|
instance = type(db_name, (DBItem,), {})
|
202
165
|
setattr(item, db_name, instance)
|
@@ -204,10 +167,10 @@ def launcher(task):
|
|
204
167
|
storer_item_instance = getattr(item, db_name)
|
205
168
|
storer_item_instance.init_item(storer_table, fields)
|
206
169
|
|
207
|
-
storer_queue =
|
170
|
+
storer_queue = sqn(db_name, storer_table)
|
208
171
|
queue = getattr(storer_item_instance, storer_queue)
|
209
172
|
# 初始话存储器
|
210
|
-
table_name =
|
173
|
+
table_name = rtn(table_name=storer_table)
|
211
174
|
storer = StorerTmp(
|
212
175
|
table=table_name, fields=fields,
|
213
176
|
length=task.storer_queue_length,
|
cobweb/distributed/models.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
import time
|
2
2
|
from hashlib import md5
|
3
|
-
from cobweb import log, Queue, Seed
|
3
|
+
from cobweb import log, Queue, Seed
|
4
|
+
from utils import issubclass_cobweb_interface
|
5
|
+
|
4
6
|
# from pympler import asizeof
|
5
7
|
|
6
8
|
|
@@ -8,7 +10,8 @@ class Scheduler:
|
|
8
10
|
|
9
11
|
def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
|
10
12
|
|
11
|
-
|
13
|
+
inf_name = "SchedulerInterface"
|
14
|
+
if not issubclass_cobweb_interface(self.__class__, inf_name):
|
12
15
|
raise Exception("not have schedule function!")
|
13
16
|
|
14
17
|
if self.__class__.__name__ == "Default":
|
@@ -103,7 +106,8 @@ class Storer:
|
|
103
106
|
|
104
107
|
def store_task(self, stop, last, reset_seed, set_storer):
|
105
108
|
|
106
|
-
|
109
|
+
inf_name = "StorerInterface"
|
110
|
+
if not issubclass_cobweb_interface(self.__class__, inf_name):
|
107
111
|
return None
|
108
112
|
|
109
113
|
if not getattr(self, "store", None):
|
@@ -0,0 +1,231 @@
|
|
1
|
+
import time
|
2
|
+
import threading
|
3
|
+
from threading import Thread
|
4
|
+
|
5
|
+
from .models import Scheduler, Spider, Storer
|
6
|
+
from cobweb import log, Queue, DBItem, RedisDB
|
7
|
+
from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
|
8
|
+
from cobweb.utils import (
|
9
|
+
struct_queue_name as sqn,
|
10
|
+
restore_table_name as rtn,
|
11
|
+
parse_import_model as pim,
|
12
|
+
)
|
13
|
+
|
14
|
+
|
15
|
+
def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
|
16
|
+
log.info("run check thread after 30 seconds...")
|
17
|
+
time.sleep(30)
|
18
|
+
spider_info = """
|
19
|
+
------------------- check: {0} ------------------
|
20
|
+
redis_spider_seed_length: {1}
|
21
|
+
redis_ready_seed_length: {2}
|
22
|
+
running_spider_thread_num: {3}
|
23
|
+
memory_seed_queue_length: {4}
|
24
|
+
storer_queue_length_info: {5}
|
25
|
+
----------------------- end -----------------------"""
|
26
|
+
while True:
|
27
|
+
status = "running"
|
28
|
+
running_spider_thread_num = spider.spider_in_progress.length
|
29
|
+
redis_ready_seed_length = ready_seed_length()
|
30
|
+
redis_spider_seed_length = spider_queue_length()
|
31
|
+
memory_seed_queue_length = scheduler.queue.length
|
32
|
+
storer_upload_queue_length = storer.queue.length
|
33
|
+
if (
|
34
|
+
scheduler.stop and
|
35
|
+
# not redis_ready_seed_length and
|
36
|
+
not memory_seed_queue_length and
|
37
|
+
not running_spider_thread_num
|
38
|
+
):
|
39
|
+
if not MODEL:
|
40
|
+
log.info("spider is done?")
|
41
|
+
last.set()
|
42
|
+
time.sleep(3)
|
43
|
+
storer_queue_empty = True
|
44
|
+
if storer.queue.length:
|
45
|
+
storer_queue_empty = False
|
46
|
+
storer_upload_queue_length = storer.queue.length
|
47
|
+
if (
|
48
|
+
storer_queue_empty and
|
49
|
+
not redis_ready_seed_length and
|
50
|
+
not redis_spider_seed_length
|
51
|
+
):
|
52
|
+
if MODEL:
|
53
|
+
log.info("waiting for push seeds...")
|
54
|
+
status = "waiting"
|
55
|
+
time.sleep(30)
|
56
|
+
else:
|
57
|
+
log.info("spider done!")
|
58
|
+
break
|
59
|
+
|
60
|
+
last.clear()
|
61
|
+
|
62
|
+
log.info(spider_info.format(
|
63
|
+
status,
|
64
|
+
redis_spider_seed_length,
|
65
|
+
redis_ready_seed_length,
|
66
|
+
running_spider_thread_num,
|
67
|
+
memory_seed_queue_length,
|
68
|
+
storer_upload_queue_length
|
69
|
+
))
|
70
|
+
|
71
|
+
time.sleep(3)
|
72
|
+
stop.set()
|
73
|
+
|
74
|
+
|
75
|
+
def launcher(task):
|
76
|
+
"""
|
77
|
+
任务启动装饰器
|
78
|
+
:param task: 任务配置信息
|
79
|
+
"""
|
80
|
+
def decorator(func):
|
81
|
+
"""
|
82
|
+
Item:
|
83
|
+
Textfile()
|
84
|
+
Loghub()
|
85
|
+
Console()
|
86
|
+
e.g.
|
87
|
+
task.fields = "a,b"
|
88
|
+
func(item, seed)
|
89
|
+
a = "a"
|
90
|
+
b = "b"
|
91
|
+
data = {"a": "a", "b": "b"}
|
92
|
+
yield item.Loghub(**data)
|
93
|
+
yield item.Loghub(a=a, b=b)
|
94
|
+
"""
|
95
|
+
storer_list = []
|
96
|
+
|
97
|
+
# 程序结束事件
|
98
|
+
last = threading.Event()
|
99
|
+
# 停止采集事件
|
100
|
+
stop = threading.Event()
|
101
|
+
|
102
|
+
# 初始化redis信息
|
103
|
+
redis_db = RedisDB(
|
104
|
+
task.project, task.task_name, task.redis_info,
|
105
|
+
model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
|
106
|
+
)
|
107
|
+
|
108
|
+
# new item
|
109
|
+
item = type("Item", (object,), {"redis_client": redis_db.client})()
|
110
|
+
|
111
|
+
log.info("初始化cobweb!")
|
112
|
+
|
113
|
+
seed_queue = Queue()
|
114
|
+
|
115
|
+
scheduler_info = task.scheduler_info or dict()
|
116
|
+
|
117
|
+
# 调度器动态继承
|
118
|
+
sql = scheduler_info.get("sql")
|
119
|
+
table = scheduler_info.get("table")
|
120
|
+
size = scheduler_info.get("size")
|
121
|
+
scheduler_config = scheduler_info.get("config")
|
122
|
+
scheduler_db = scheduler_info.get("db", "default")
|
123
|
+
DB, class_name = pim(scheduler_db, "scheduler")
|
124
|
+
# SchedulerDB, table, sql, length, size, config = task.scheduler_info
|
125
|
+
SchedulerTmp = type(class_name, (Scheduler, DB), {})
|
126
|
+
|
127
|
+
# 初始化调度器
|
128
|
+
scheduler = SchedulerTmp(
|
129
|
+
table=table, sql=sql, size=size, queue=seed_queue,
|
130
|
+
length=task.scheduler_queue_length, config=scheduler_config
|
131
|
+
)
|
132
|
+
|
133
|
+
# 初始化采集器
|
134
|
+
spider = Spider(seed_queue, task.max_retries)
|
135
|
+
|
136
|
+
storer = None
|
137
|
+
|
138
|
+
# 解析存储器信息
|
139
|
+
storer_info = task.storer_info or dict()
|
140
|
+
|
141
|
+
# for storer_info in storer_info_list:
|
142
|
+
if storer_info:
|
143
|
+
storer_db = storer_info["db"]
|
144
|
+
fields = storer_info["fields"]
|
145
|
+
storer_table = storer_info.get("table", "console")
|
146
|
+
storer_config = storer_info.get("config")
|
147
|
+
|
148
|
+
StorerDB, class_name = pim(storer_db, "storer")
|
149
|
+
StorerTmp = type(class_name, (Storer, StorerDB), {})
|
150
|
+
|
151
|
+
db_name = class_name.lower()
|
152
|
+
if not getattr(item, db_name, None):
|
153
|
+
instance = type(db_name, (DBItem,), {})
|
154
|
+
setattr(item, db_name, instance)
|
155
|
+
|
156
|
+
storer_item_instance = getattr(item, db_name)
|
157
|
+
storer_item_instance.init_item(storer_table, fields)
|
158
|
+
|
159
|
+
storer_queue = sqn(db_name, storer_table)
|
160
|
+
queue = getattr(storer_item_instance, storer_queue)
|
161
|
+
# 初始话存储器
|
162
|
+
table_name = rtn(table_name=storer_table)
|
163
|
+
storer = StorerTmp(
|
164
|
+
table=table_name, fields=fields,
|
165
|
+
length=task.storer_queue_length,
|
166
|
+
queue=queue, config=storer_config
|
167
|
+
)
|
168
|
+
|
169
|
+
Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
|
170
|
+
Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
|
171
|
+
|
172
|
+
# 推送初始种子
|
173
|
+
# seeds = start_seeds(task.start_seed)
|
174
|
+
redis_db.add_seed(task.seeds)
|
175
|
+
# 启动调度器, 调度至redis队列
|
176
|
+
Thread(
|
177
|
+
# name="xxxx_schedule_seeds",
|
178
|
+
target=scheduler.schedule_seed,
|
179
|
+
args=(
|
180
|
+
redis_db.ready_seed_length,
|
181
|
+
redis_db.get_scheduler_lock,
|
182
|
+
redis_db.add_seed
|
183
|
+
)
|
184
|
+
).start()
|
185
|
+
|
186
|
+
# 启动调度器, 调度任务队列
|
187
|
+
Thread(
|
188
|
+
# name="xxxx_schedule_task",
|
189
|
+
target=scheduler.schedule_task,
|
190
|
+
args=(
|
191
|
+
stop, redis_db.get_seed,
|
192
|
+
redis_db.ready_seed_length
|
193
|
+
)
|
194
|
+
).start()
|
195
|
+
|
196
|
+
# 启动采集器
|
197
|
+
for index in range(task.spider_num):
|
198
|
+
Thread(
|
199
|
+
# name=f"xxxx_spider_task:{index}",
|
200
|
+
target=spider.spider_task,
|
201
|
+
args=(
|
202
|
+
stop, func, item,
|
203
|
+
redis_db.del_seed
|
204
|
+
)
|
205
|
+
).start()
|
206
|
+
|
207
|
+
# 启动存储器
|
208
|
+
if storer:
|
209
|
+
Thread(
|
210
|
+
# name=f"xxxx_store_task:{storer.table}",
|
211
|
+
target=storer.store_task,
|
212
|
+
args=(
|
213
|
+
stop, last,
|
214
|
+
redis_db.reset_seed,
|
215
|
+
redis_db.set_storer
|
216
|
+
)
|
217
|
+
).start()
|
218
|
+
|
219
|
+
Thread(
|
220
|
+
# name="check_spider",
|
221
|
+
target=check,
|
222
|
+
args=(
|
223
|
+
stop, last, spider,
|
224
|
+
scheduler, storer,
|
225
|
+
redis_db.ready_seed_length,
|
226
|
+
redis_db.spider_queue_length,
|
227
|
+
)
|
228
|
+
).start()
|
229
|
+
|
230
|
+
return decorator
|
231
|
+
|
cobweb/single/models.py
CHANGED
@@ -1,104 +1,136 @@
|
|
1
1
|
import time
|
2
|
-
|
3
|
-
from
|
4
|
-
from single.nest import struct_queue_name
|
5
|
-
from single.nest import SchedulerInterface, StorerInterface
|
6
|
-
|
7
|
-
|
8
|
-
# class Transceiver:
|
9
|
-
class Distributor:
|
10
|
-
|
11
|
-
def __init__(self):
|
12
|
-
self.seed_queue = Queue()
|
2
|
+
from cobweb import log, Queue, Seed
|
3
|
+
from utils import issubclass_cobweb_interface
|
13
4
|
|
14
|
-
|
15
|
-
def queue_names(self):
|
16
|
-
return tuple(self.__dict__.keys())
|
5
|
+
# from pympler import asizeof
|
17
6
|
|
18
|
-
@property
|
19
|
-
def used_memory(self):
|
20
|
-
return asizeof.asizeof(self)
|
21
7
|
|
22
|
-
|
23
|
-
self.__setattr__(queue_name, Queue())
|
8
|
+
class Scheduler:
|
24
9
|
|
25
|
-
def
|
26
|
-
return self.__getattribute__(queue_name)
|
10
|
+
def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
|
27
11
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
self.seed_queue.push(item)
|
32
|
-
elif getattr(item, "table_name", None):
|
33
|
-
queue_name = struct_queue_name(icn, item.table_name)
|
34
|
-
getattr(self, queue_name).push(item.serialization)
|
12
|
+
inf_name = "SchedulerInterface"
|
13
|
+
if not issubclass_cobweb_interface(self.__class__, inf_name):
|
14
|
+
raise Exception("not have schedule function!")
|
35
15
|
|
36
|
-
|
37
|
-
|
38
|
-
if not iterable:
|
16
|
+
if self.__class__.__name__ == "Default":
|
17
|
+
self.stop = True
|
39
18
|
return None
|
40
|
-
for result in iterable:
|
41
|
-
self.deal_item(result)
|
42
|
-
return True
|
43
|
-
|
44
19
|
|
45
|
-
|
20
|
+
while not self.stop:
|
21
|
+
length = ready_seed_length()
|
22
|
+
if length > self.size:
|
23
|
+
time.sleep(15)
|
46
24
|
|
47
|
-
|
25
|
+
elif get_scheduler_lock():
|
26
|
+
seeds = self.schedule()
|
27
|
+
add_seed(seeds)
|
48
28
|
|
49
|
-
|
50
|
-
return None
|
29
|
+
log.info(f"close thread: schedule_seed")
|
51
30
|
|
52
|
-
|
53
|
-
|
31
|
+
def schedule_task(self, stop, get_seed, ready_seed_length):
|
32
|
+
time.sleep(3)
|
33
|
+
while not stop.is_set():
|
54
34
|
|
55
|
-
|
35
|
+
if not ready_seed_length():
|
36
|
+
time.sleep(15)
|
37
|
+
continue
|
56
38
|
|
57
|
-
if self.queue.length
|
58
|
-
|
39
|
+
if self.queue.length >= self.length:
|
40
|
+
time.sleep(3)
|
41
|
+
continue
|
59
42
|
|
60
|
-
|
61
|
-
|
62
|
-
|
43
|
+
seeds = get_seed(self.length)
|
44
|
+
self.queue.push(seeds)
|
45
|
+
log.info(f"close thread: schedule_task")
|
63
46
|
|
64
47
|
|
65
48
|
class Spider:
|
66
49
|
|
67
|
-
def __init__(self, queue):
|
68
|
-
self.queue = queue
|
50
|
+
def __init__(self, queue, max_retries=5):
|
69
51
|
self.spider_in_progress = Queue()
|
52
|
+
self.max_retries = max_retries
|
53
|
+
self.queue = queue
|
70
54
|
|
71
|
-
def spider_task(self,
|
72
|
-
while not
|
55
|
+
def spider_task(self, stop, func, item, del_seed):
|
56
|
+
while not stop.is_set():
|
73
57
|
seed = self.queue.pop()
|
74
58
|
if not seed:
|
75
59
|
time.sleep(3)
|
76
60
|
continue
|
61
|
+
elif seed._retry >= self.max_retries:
|
62
|
+
del_seed(seed, spider_status=False)
|
63
|
+
continue
|
77
64
|
try:
|
78
|
-
self.spider_in_progress.push(1)
|
79
|
-
|
65
|
+
self.spider_in_progress.push(1, direct_insertion=True)
|
66
|
+
# log.info("spider seed: " + str(seed))
|
67
|
+
status = None
|
68
|
+
for it in func(item, seed):
|
69
|
+
if getattr(it, "table_name", None):
|
70
|
+
store_queue = it.queue()
|
71
|
+
store_queue.push(
|
72
|
+
[seed, it.struct_data],
|
73
|
+
direct_insertion=True
|
74
|
+
)
|
75
|
+
elif isinstance(it, Seed):
|
76
|
+
self.queue.push(it)
|
77
|
+
elif any(isinstance(it, t) for t in (list, tuple)):
|
78
|
+
self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
|
79
|
+
elif isinstance(it, bool):
|
80
|
+
status = it
|
81
|
+
elif it is None:
|
82
|
+
status = False
|
83
|
+
|
84
|
+
if status is not None:
|
85
|
+
if status:
|
86
|
+
del_seed(seed, spider_status=True)
|
87
|
+
else:
|
88
|
+
seed._retry += 1
|
89
|
+
self.queue.push(seed)
|
90
|
+
|
80
91
|
except Exception as e:
|
81
|
-
|
92
|
+
seed._retry += 1
|
93
|
+
self.queue.push(seed)
|
94
|
+
log.info(f"{str(seed)} -> {str(e)}")
|
82
95
|
finally:
|
83
96
|
self.spider_in_progress.pop()
|
97
|
+
log.info(f"close thread: spider")
|
84
98
|
|
85
99
|
|
86
100
|
class Storer:
|
87
101
|
|
88
|
-
def store_task(self,
|
102
|
+
def store_task(self, stop, last, reset_seed, del_seed):
|
89
103
|
|
90
|
-
|
104
|
+
inf_name = "StorerInterface"
|
105
|
+
if not issubclass_cobweb_interface(self.__class__, inf_name):
|
91
106
|
return None
|
92
107
|
|
93
108
|
if not getattr(self, "store", None):
|
94
109
|
raise Exception("not have store function!")
|
95
110
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
111
|
+
storer_name = self.__class__.__name__ + self.table
|
112
|
+
|
113
|
+
while not stop.is_set():
|
114
|
+
|
115
|
+
if last.is_set() or self.queue.length >= self.length:
|
116
|
+
seeds, data_list = [], []
|
117
|
+
|
118
|
+
for _ in range(self.length):
|
119
|
+
items = self.queue.pop()
|
120
|
+
if not items:
|
121
|
+
break
|
122
|
+
seed, data = items
|
123
|
+
seeds.append(seed)
|
102
124
|
data_list.append(data)
|
125
|
+
|
103
126
|
if data_list:
|
104
|
-
|
127
|
+
if self.store(data_list):
|
128
|
+
del_seed(seeds)
|
129
|
+
else:
|
130
|
+
reset_seed(seeds)
|
131
|
+
log.info("reset seeds!")
|
132
|
+
continue
|
133
|
+
|
134
|
+
time.sleep(3)
|
135
|
+
|
136
|
+
log.info(f"close thread: {storer_name}")
|
cobweb/utils.py
CHANGED
@@ -1,11 +1,9 @@
|
|
1
1
|
import json
|
2
|
+
import re
|
2
3
|
import sys
|
4
|
+
from abc import ABC
|
3
5
|
from typing import Iterable
|
4
|
-
|
5
|
-
import requests
|
6
|
-
|
7
|
-
|
8
|
-
# from cobweb import Seed
|
6
|
+
from importlib import import_module
|
9
7
|
|
10
8
|
|
11
9
|
def struct_table_name(table_name):
|
@@ -20,27 +18,6 @@ def struct_queue_name(db_name, table_name):
|
|
20
18
|
return sys.intern(f"__{db_name}_{table_name}_queue__")
|
21
19
|
|
22
20
|
|
23
|
-
# class StorerDB:
|
24
|
-
#
|
25
|
-
# @staticmethod
|
26
|
-
# def console(self):
|
27
|
-
# from db.storer.console import Console
|
28
|
-
# table = struct_table_name(table)
|
29
|
-
# return StorerInfo(DB=Console, table=table, length=length, config=None)
|
30
|
-
#
|
31
|
-
# @staticmethod
|
32
|
-
# def textfile(table, length=200):
|
33
|
-
# from db.storer.textfile import Textfile
|
34
|
-
# table = struct_table_name(table)
|
35
|
-
# return StorerInfo(DB=Textfile, table=table, length=length, config=None)
|
36
|
-
#
|
37
|
-
# @staticmethod
|
38
|
-
# def loghub(table, length=200, config=None):
|
39
|
-
# from db.storer.loghub import Loghub
|
40
|
-
# table = struct_table_name(table)
|
41
|
-
# return StorerInfo(DB=Loghub, table=table, length=length, config=config)
|
42
|
-
|
43
|
-
|
44
21
|
def parse_info(info):
|
45
22
|
if not info:
|
46
23
|
return info
|
@@ -74,15 +51,40 @@ def struct_start_seeds(seeds):
|
|
74
51
|
return Seed(seeds)
|
75
52
|
|
76
53
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
54
|
+
def issubclass_cobweb_interface(_class, inf_name):
|
55
|
+
for _c in _class.__mro__[1:]:
|
56
|
+
if _c.__name__ == inf_name:
|
57
|
+
return True
|
58
|
+
return False
|
59
|
+
|
60
|
+
|
61
|
+
def parse_import_model(model_info, model_type=None):
|
62
|
+
if model_type not in ["scheduler", "storer"]:
|
63
|
+
raise TypeError("model_type must be in scheduler, storer")
|
64
|
+
if isinstance(model_info, str):
|
65
|
+
if "import" in model_info:
|
66
|
+
model_path, class_name = re.search(
|
67
|
+
r"from (.*?) import (.*?)$", model_info
|
68
|
+
).groups()
|
69
|
+
model = import_module(model_path)
|
70
|
+
class_object = getattr(model, class_name)
|
71
|
+
elif "." in model_info:
|
72
|
+
info_list = model_info.split(".")
|
73
|
+
class_name = info_list[-1]
|
74
|
+
model_path = ".".join(info_list[:-1])
|
75
|
+
model = import_module(model_path)
|
76
|
+
class_object = getattr(model, class_name)
|
77
|
+
else:
|
78
|
+
model_path = f"cobweb.db.{model_type}.{model_info.lower()}"
|
79
|
+
class_name = model_info.capitalize()
|
80
|
+
model = import_module(model_path)
|
81
|
+
class_object = getattr(model, class_name)
|
82
|
+
return class_object, class_name
|
83
|
+
elif issubclass(model_info, ABC):
|
84
|
+
inf_name = model_type.capitalize() + "Interface"
|
85
|
+
if issubclass_cobweb_interface(model_info, inf_name):
|
86
|
+
return model_info, model_info.__name__
|
87
|
+
raise ImportError()
|
88
|
+
raise TypeError()
|
87
89
|
|
88
90
|
|
@@ -5,10 +5,10 @@ cobweb/interface.py,sha256=um_k2AAQl1HTOvfUlq914DjkpfZVwt2m1B65EpPKrmE,802
|
|
5
5
|
cobweb/log.py,sha256=Gb3_y4IzTo5pJohTggBCU9rK6-ZN3hgTOHkoXHyN6CU,2384
|
6
6
|
cobweb/setting.py,sha256=UAu_dLuIFYO98MxtlZ5sZqJcwKAUKq4Bu4KoKlV50Mc,288
|
7
7
|
cobweb/task.py,sha256=77F5EaopSVlSX2TANv1lhuPHFI8ER8Jh4tSGrwDWAc0,1405
|
8
|
-
cobweb/utils.py,sha256=
|
8
|
+
cobweb/utils.py,sha256=5ZcSfT3VCWF_Kl1APGYJYHtPhZPnr45nFqyyZA9LlOg,2663
|
9
9
|
cobweb/db/__init__.py,sha256=4m9lqmxZCRbaih3Z3rl_BT0GugMd0dkOIgu_P9aeC84,63
|
10
10
|
cobweb/db/oss_db.py,sha256=lFGNuH3tdIMsohVXQ_fTZPyBfS2oxYNmFNuQ-ZBQgm0,4221
|
11
|
-
cobweb/db/redis_db.py,sha256=
|
11
|
+
cobweb/db/redis_db.py,sha256=yoWy-GI0rjVmT-68Che-pypfqNwNti5JGkc9bYvJH2o,8202
|
12
12
|
cobweb/db/scheduler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
13
|
cobweb/db/scheduler/default.py,sha256=OxmFX7OvMEhKEq-NF7A8I9cA4V4qWw5vayS-yIbng0A,114
|
14
14
|
cobweb/db/scheduler/textfile.py,sha256=atRDeNT-e5toNvyGsCXAxL1FJi77uSYktdCzH_hXGo8,821
|
@@ -18,13 +18,14 @@ cobweb/db/storer/loghub.py,sha256=4VqZacXWhidzINHXQu2_-E0HOBRCcc86f6LkKfnXD5I,17
|
|
18
18
|
cobweb/db/storer/redis.py,sha256=7Q2XEQwBL6X_M1uvxzzuSBt6iw9piKw-_FWKm2INZDQ,412
|
19
19
|
cobweb/db/storer/textfile.py,sha256=3mDHMvF6Sh5fn3IHzWQxyTUd45V-zUoH8vY3EoRlMx0,415
|
20
20
|
cobweb/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
-
cobweb/distributed/launcher.py,sha256=
|
22
|
-
cobweb/distributed/models.py,sha256=
|
21
|
+
cobweb/distributed/launcher.py,sha256=jTtBXBmna_6yFdj6gyGQiiEtg8I0g5uI5h8kbHWt454,7998
|
22
|
+
cobweb/distributed/models.py,sha256=Z46-Dwv--VLUr9t7wbNHaLIuEK0l-copJEbc-abfjb8,4617
|
23
23
|
cobweb/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
24
|
-
cobweb/single/
|
24
|
+
cobweb/single/launcher.py,sha256=Z-PaX1f_nphdBwbIW1Ki6D_4xsrFmx94SO3BXaexYAg,7304
|
25
|
+
cobweb/single/models.py,sha256=xIn0mefT_oMVynn3V6S4wkOCx6PGOemT4_fcuU7CSZs,4297
|
25
26
|
cobweb/single/nest.py,sha256=mL8q9a5BjtoeUyzXCIVw_vyUsNY8ltbvQpYIIpZEDFU,5012
|
26
|
-
cobweb_launcher-0.1.
|
27
|
-
cobweb_launcher-0.1.
|
28
|
-
cobweb_launcher-0.1.
|
29
|
-
cobweb_launcher-0.1.
|
30
|
-
cobweb_launcher-0.1.
|
27
|
+
cobweb_launcher-0.1.3.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
28
|
+
cobweb_launcher-0.1.3.dist-info/METADATA,sha256=vr4nnPQWibda20Nnn2KB2k7U81pWZLvVh8402NIhEgU,1225
|
29
|
+
cobweb_launcher-0.1.3.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
|
30
|
+
cobweb_launcher-0.1.3.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
|
31
|
+
cobweb_launcher-0.1.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|