cobweb-launcher 0.1.2__tar.gz → 0.1.4__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/PKG-INFO +1 -1
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/__init__.py +1 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/redis_db.py +5 -3
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/distributed/launcher.py +14 -51
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/distributed/models.py +7 -3
- cobweb-launcher-0.1.4/cobweb/single/launcher.py +231 -0
- cobweb-launcher-0.1.4/cobweb/single/models.py +136 -0
- cobweb-launcher-0.1.4/cobweb/utils.py +90 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/PKG-INFO +1 -1
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/SOURCES.txt +1 -1
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/setup.py +1 -1
- cobweb-launcher-0.1.2/cobweb/single/models.py +0 -104
- cobweb-launcher-0.1.2/cobweb/single/nest.py +0 -153
- cobweb-launcher-0.1.2/cobweb/utils.py +0 -88
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/LICENSE +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/README.md +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/bbb.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/oss_db.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/scheduler/__init__.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/scheduler/default.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/scheduler/textfile.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/__init__.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/console.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/loghub.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/redis.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/textfile.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/decorators.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/distributed/__init__.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/interface.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/log.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/setting.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/single/__init__.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/task.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/setup.cfg +0 -0
@@ -151,17 +151,19 @@ class RedisDB:
|
|
151
151
|
@check_redis_status
|
152
152
|
def check_spider_queue(self, stop, storer_num):
|
153
153
|
while not stop.is_set():
|
154
|
-
# 每15s获取check锁,等待600s
|
154
|
+
# 每15s获取check锁,等待600s后仍获取不到锁则重试;获取到锁后,设置锁的存活时间为${cs_lct}s
|
155
155
|
if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
|
156
156
|
heartbeat = True if self.client.exists(self.heartbeat_key) else False
|
157
|
-
# 重启重制score
|
157
|
+
# 重启重制score值,否则获取${rs_time}分钟前的分数值
|
158
158
|
score = -int(time.time()) + self.rs_time if heartbeat else "-inf"
|
159
159
|
|
160
160
|
keys = self.client.keys(self.storer_key % "*")
|
161
|
+
|
161
162
|
if keys and len(keys) >= storer_num:
|
162
163
|
intersection_key = self.storer_key % "intersection"
|
163
164
|
self.client.delete(intersection_key)
|
164
165
|
self.client.zinterstore(intersection_key, keys)
|
166
|
+
|
165
167
|
while True:
|
166
168
|
members = self.client.zrange(intersection_key, 0, 1999)
|
167
169
|
if not members:
|
@@ -192,7 +194,7 @@ class RedisDB:
|
|
192
194
|
self.client.setex(self.heartbeat_key, 15, "")
|
193
195
|
|
194
196
|
# self.client.delete(self.check_lock)
|
195
|
-
time.sleep(3)
|
197
|
+
# time.sleep(3)
|
196
198
|
|
197
199
|
@check_redis_status
|
198
200
|
def set_heartbeat(self, stop):
|
@@ -1,50 +1,15 @@
|
|
1
1
|
import time
|
2
2
|
import threading
|
3
3
|
from threading import Thread
|
4
|
-
from importlib import import_module
|
5
4
|
|
6
|
-
from cobweb import log, Queue, DBItem, RedisDB, OssDB, StorerInterface
|
7
|
-
from cobweb.utils import struct_queue_name, restore_table_name
|
8
|
-
from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
|
9
5
|
from .models import Scheduler, Spider, Storer
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
obj = getattr(model, db)
|
18
|
-
else:
|
19
|
-
model = import_module(f"cobweb.db.scheduler.{db.lower()}")
|
20
|
-
obj = getattr(model, db.capitalize())
|
21
|
-
return obj
|
22
|
-
# if db.lower() in dir(StorerDB):
|
23
|
-
# return getattr(StorerDB, db)
|
24
|
-
# else:
|
25
|
-
# pass
|
26
|
-
elif issubclass(db, StorerInterface):
|
27
|
-
return db
|
28
|
-
raise TypeError()
|
29
|
-
|
30
|
-
|
31
|
-
def get_storer_db(db):
|
32
|
-
if isinstance(db, str):
|
33
|
-
if "." in db:
|
34
|
-
model_path = db.split(".")
|
35
|
-
model = import_module(db)
|
36
|
-
obj = getattr(model, db)
|
37
|
-
else:
|
38
|
-
model = import_module(f"cobweb.db.storer.{db.lower()}")
|
39
|
-
obj = getattr(model, db.capitalize())
|
40
|
-
return obj, db.lower()
|
41
|
-
# if db.lower() in dir(StorerDB):
|
42
|
-
# return getattr(StorerDB, db)
|
43
|
-
# else:
|
44
|
-
# pass
|
45
|
-
elif issubclass(db, StorerInterface):
|
46
|
-
return db, db.__name__.lower()
|
47
|
-
raise TypeError()
|
6
|
+
from cobweb import log, Queue, DBItem, RedisDB
|
7
|
+
from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
|
8
|
+
from cobweb.utils import (
|
9
|
+
struct_queue_name as sqn,
|
10
|
+
restore_table_name as rtn,
|
11
|
+
parse_import_model as pim,
|
12
|
+
)
|
48
13
|
|
49
14
|
|
50
15
|
def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
|
@@ -164,9 +129,9 @@ def launcher(task):
|
|
164
129
|
size = task.scheduler_info.get("size")
|
165
130
|
scheduler_config = task.scheduler_info.get("config")
|
166
131
|
scheduler_db = task.scheduler_info.get("db", "default")
|
167
|
-
DB =
|
132
|
+
DB, class_name = pim(scheduler_db, "scheduler")
|
168
133
|
# SchedulerDB, table, sql, length, size, config = task.scheduler_info
|
169
|
-
SchedulerTmp = type(
|
134
|
+
SchedulerTmp = type(class_name, (Scheduler, DB), {})
|
170
135
|
|
171
136
|
# 初始化调度器
|
172
137
|
scheduler = SchedulerTmp(
|
@@ -185,18 +150,16 @@ def launcher(task):
|
|
185
150
|
# new item
|
186
151
|
item = type("Item", (object,), {"redis_client": redis_db.client})()
|
187
152
|
|
188
|
-
if task.oss_config:
|
189
|
-
item.oss = OssDB(**task.oss_config)
|
190
|
-
|
191
153
|
for storer_info in storer_info_list:
|
192
154
|
storer_db = storer_info["db"]
|
193
155
|
fields = storer_info["fields"]
|
194
156
|
storer_table = storer_info.get("table", "console")
|
195
157
|
storer_config = storer_info.get("config")
|
196
158
|
|
197
|
-
StorerDB,
|
198
|
-
StorerTmp = type(
|
159
|
+
StorerDB, class_name = pim(storer_db, "storer")
|
160
|
+
StorerTmp = type(class_name, (Storer, StorerDB), {})
|
199
161
|
|
162
|
+
db_name = class_name.lower()
|
200
163
|
if not getattr(item, db_name, None):
|
201
164
|
instance = type(db_name, (DBItem,), {})
|
202
165
|
setattr(item, db_name, instance)
|
@@ -204,10 +167,10 @@ def launcher(task):
|
|
204
167
|
storer_item_instance = getattr(item, db_name)
|
205
168
|
storer_item_instance.init_item(storer_table, fields)
|
206
169
|
|
207
|
-
storer_queue =
|
170
|
+
storer_queue = sqn(db_name, storer_table)
|
208
171
|
queue = getattr(storer_item_instance, storer_queue)
|
209
172
|
# 初始话存储器
|
210
|
-
table_name =
|
173
|
+
table_name = rtn(table_name=storer_table)
|
211
174
|
storer = StorerTmp(
|
212
175
|
table=table_name, fields=fields,
|
213
176
|
length=task.storer_queue_length,
|
@@ -1,6 +1,8 @@
|
|
1
1
|
import time
|
2
2
|
from hashlib import md5
|
3
|
-
from cobweb import log, Queue, Seed
|
3
|
+
from cobweb import log, Queue, Seed
|
4
|
+
from utils import issubclass_cobweb_interface
|
5
|
+
|
4
6
|
# from pympler import asizeof
|
5
7
|
|
6
8
|
|
@@ -8,7 +10,8 @@ class Scheduler:
|
|
8
10
|
|
9
11
|
def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
|
10
12
|
|
11
|
-
|
13
|
+
inf_name = "SchedulerInterface"
|
14
|
+
if not issubclass_cobweb_interface(self.__class__, inf_name):
|
12
15
|
raise Exception("not have schedule function!")
|
13
16
|
|
14
17
|
if self.__class__.__name__ == "Default":
|
@@ -103,7 +106,8 @@ class Storer:
|
|
103
106
|
|
104
107
|
def store_task(self, stop, last, reset_seed, set_storer):
|
105
108
|
|
106
|
-
|
109
|
+
inf_name = "StorerInterface"
|
110
|
+
if not issubclass_cobweb_interface(self.__class__, inf_name):
|
107
111
|
return None
|
108
112
|
|
109
113
|
if not getattr(self, "store", None):
|
@@ -0,0 +1,231 @@
|
|
1
|
+
import time
|
2
|
+
import threading
|
3
|
+
from threading import Thread
|
4
|
+
|
5
|
+
from .models import Scheduler, Spider, Storer
|
6
|
+
from cobweb import log, Queue, DBItem, RedisDB
|
7
|
+
from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
|
8
|
+
from cobweb.utils import (
|
9
|
+
struct_queue_name as sqn,
|
10
|
+
restore_table_name as rtn,
|
11
|
+
parse_import_model as pim,
|
12
|
+
)
|
13
|
+
|
14
|
+
|
15
|
+
def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
    """Watch the crawl and set ``stop`` once all work has drained.

    Runs as a long-lived thread target. After an initial 30 second delay it
    polls every 3 seconds and logs a status report on each pass.

    :param stop: event set when this function returns; the worker loops in
        this package poll it to exit.
    :param last: event set while a final flush is being attempted and cleared
        again if the crawl is not finished.
    :param spider: object exposing ``spider_in_progress.length``.
    :param scheduler: object exposing ``stop`` and ``queue.length``.
    :param storer: object exposing ``queue.length``, or ``None`` when no
        storer is configured.
    :param ready_seed_length: callable returning the number of ready seeds.
    :param spider_queue_length: callable returning the spider queue length.
    """
    log.info("run check thread after 30 seconds...")
    time.sleep(30)
    # Report template, reproduced exactly as rendered in this file.
    spider_info = (
        "\n"
        "------------------- check: {0} ------------------\n"
        "redis_spider_seed_length: {1}\n"
        "redis_ready_seed_length: {2}\n"
        "running_spider_thread_num: {3}\n"
        "memory_seed_queue_length: {4}\n"
        "storer_queue_length_info: {5}\n"
        "----------------------- end -----------------------"
    )

    def storer_queue_length():
        # Without this guard a missing storer raised AttributeError here,
        # ending the thread before ``stop`` could ever be set.
        return storer.queue.length if storer is not None else 0

    while True:
        status = "running"
        running_spider_thread_num = spider.spider_in_progress.length
        redis_ready_seed_length = ready_seed_length()
        redis_spider_seed_length = spider_queue_length()
        memory_seed_queue_length = scheduler.queue.length
        storer_upload_queue_length = storer_queue_length()

        if (
            scheduler.stop
            and not memory_seed_queue_length
            and not running_spider_thread_num
        ):
            if not MODEL:
                log.info("spider is done?")
            # Signal the storer to flush whatever it holds, then re-measure.
            last.set()
            time.sleep(3)
            storer_upload_queue_length = storer_queue_length()
            if (
                not storer_upload_queue_length
                and not redis_ready_seed_length
                and not redis_spider_seed_length
            ):
                if MODEL:
                    # Truthy MODEL: keep running and wait for new seeds.
                    log.info("waiting for push seeds...")
                    status = "waiting"
                    time.sleep(30)
                else:
                    log.info("spider done!")
                    break

        last.clear()

        log.info(spider_info.format(
            status,
            redis_spider_seed_length,
            redis_ready_seed_length,
            running_spider_thread_num,
            memory_seed_queue_length,
            storer_upload_queue_length,
        ))

        time.sleep(3)
    stop.set()
|
73
|
+
|
74
|
+
|
75
|
+
def launcher(task):
    """Build a decorator that starts the single-node crawl for ``task``.

    :param task: task configuration object. The attributes read here are
        ``project``, ``task_name``, ``redis_info``, ``scheduler_info``,
        ``storer_info``, ``scheduler_queue_length``, ``storer_queue_length``,
        ``max_retries``, ``spider_num`` and ``seeds``.
    :return: a decorator taking the user's crawl function.
    """
    def decorator(func):
        """Wire up scheduler, spider and storer and start their threads.

        ``func(item, seed)`` is the crawl callback handed to every spider
        thread. ``item`` carries ``redis_client`` and, when a storer is
        configured, one item class per storer backend, e.g.::

            task.fields = "a,b"
            func(item, seed)
                a = "a"
                b = "b"
                data = {"a": "a", "b": "b"}
                yield item.Loghub(**data)
                yield item.Loghub(a=a, b=b)

        Everything starts as a side effect of decoration. There is no return
        statement, so the decorated name ends up bound to ``None``.
        """
        storer_list = []

        # Event signalling that the run is finishing (final flush).
        last = threading.Event()
        # Event telling every worker thread to stop.
        stop = threading.Event()

        # Redis connection and bookkeeping.
        redis_db = RedisDB(
            task.project, task.task_name, task.redis_info,
            model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
        )

        # Item namespace handed to the crawl callback.
        item = type("Item", (object,), {"redis_client": redis_db.client})()

        log.info("初始化cobweb!")

        seed_queue = Queue()

        scheduler_info = task.scheduler_info or dict()

        # Scheduler class is assembled dynamically: generic loop + backend.
        sql = scheduler_info.get("sql")
        table = scheduler_info.get("table")
        size = scheduler_info.get("size")
        scheduler_config = scheduler_info.get("config")
        scheduler_db = scheduler_info.get("db", "default")
        DB, class_name = pim(scheduler_db, "scheduler")
        SchedulerTmp = type(class_name, (Scheduler, DB), {})

        scheduler = SchedulerTmp(
            table=table, sql=sql, size=size, queue=seed_queue,
            length=task.scheduler_queue_length, config=scheduler_config
        )

        spider = Spider(seed_queue, task.max_retries)

        storer = None

        # At most one storer is supported by this launcher.
        storer_info = task.storer_info or dict()

        if storer_info:
            storer_db = storer_info["db"]
            fields = storer_info["fields"]
            storer_table = storer_info.get("table", "console")
            storer_config = storer_info.get("config")

            StorerDB, class_name = pim(storer_db, "storer")
            StorerTmp = type(class_name, (Storer, StorerDB), {})

            # One DBItem subclass per backend, created on first use.
            db_name = class_name.lower()
            if not getattr(item, db_name, None):
                instance = type(db_name, (DBItem,), {})
                setattr(item, db_name, instance)

            storer_item_instance = getattr(item, db_name)
            storer_item_instance.init_item(storer_table, fields)

            storer_queue = sqn(db_name, storer_table)
            queue = getattr(storer_item_instance, storer_queue)

            table_name = rtn(table_name=storer_table)
            storer = StorerTmp(
                table=table_name, fields=fields,
                length=task.storer_queue_length,
                queue=queue, config=storer_config
            )
            # Previously never filled, so the count passed below was always 0.
            storer_list.append(storer)

        Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
        Thread(target=redis_db.set_heartbeat, args=(stop,)).start()

        # Push the initial seeds.
        redis_db.add_seed(task.seeds)

        # Scheduler thread: feeds seeds into the redis queue.
        Thread(
            target=scheduler.schedule_seed,
            args=(
                redis_db.ready_seed_length,
                redis_db.get_scheduler_lock,
                redis_db.add_seed
            )
        ).start()

        # Scheduler thread: moves seeds from redis into the in-memory queue.
        Thread(
            target=scheduler.schedule_task,
            args=(
                stop, redis_db.get_seed,
                redis_db.ready_seed_length
            )
        ).start()

        # Spider threads.
        for _ in range(task.spider_num):
            Thread(
                target=spider.spider_task,
                args=(
                    stop, func, item,
                    redis_db.del_seed
                )
            ).start()

        # Storer thread. Compare against None so a storer object that happens
        # to be falsy still gets its thread.
        # NOTE(review): Storer.store_task calls its fourth argument with a
        # single ``seeds`` argument; confirm redis_db.set_storer accepts that.
        if storer is not None:
            Thread(
                target=storer.store_task,
                args=(
                    stop, last,
                    redis_db.reset_seed,
                    redis_db.set_storer
                )
            ).start()

        # Watchdog thread that eventually sets ``stop``.
        Thread(
            target=check,
            args=(
                stop, last, spider,
                scheduler, storer,
                redis_db.ready_seed_length,
                redis_db.spider_queue_length,
            )
        ).start()

    return decorator
|
231
|
+
|
@@ -0,0 +1,136 @@
|
|
1
|
+
import time
|
2
|
+
from cobweb import log, Queue, Seed
|
3
|
+
from utils import issubclass_cobweb_interface
|
4
|
+
|
5
|
+
# from pympler import asizeof
|
6
|
+
|
7
|
+
|
8
|
+
class Scheduler:
    """Thread targets that move seeds through the scheduling pipeline.

    The class defines no ``__init__``. The methods read ``self.stop``,
    ``self.size``, ``self.length``, ``self.queue`` and ``self.schedule``,
    which must be supplied by whatever class it is combined with.
    """

    def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
        """Produce seeds with ``self.schedule()`` until ``self.stop`` is truthy.

        :param ready_seed_length: callable returning the ready-seed count.
        :param get_scheduler_lock: callable; a truthy result allows one
            scheduling round.
        :param add_seed: callable receiving the seeds from ``self.schedule()``.
        :raises Exception: if no base class is named ``SchedulerInterface``.
        """
        inf_name = "SchedulerInterface"
        if not issubclass_cobweb_interface(self.__class__, inf_name):
            raise Exception("not have schedule function!")

        # A class named "Default" has nothing to schedule: mark done and exit.
        if self.__class__.__name__ == "Default":
            self.stop = True
            return None

        while not self.stop:
            length = ready_seed_length()
            if length > self.size:
                # Enough seeds are already waiting; back off.
                time.sleep(15)

            # NOTE(review): when the lock is not obtained the loop retries
            # immediately; confirm get_scheduler_lock itself blocks or sleeps.
            elif get_scheduler_lock():
                seeds = self.schedule()
                add_seed(seeds)

        log.info("close thread: schedule_seed")

    def schedule_task(self, stop, get_seed, ready_seed_length):
        """Refill ``self.queue`` from ``get_seed`` until ``stop`` is set.

        :param stop: event ending the loop.
        :param get_seed: callable taking a count and returning seeds.
        :param ready_seed_length: callable returning the ready-seed count.
        """
        time.sleep(3)
        while not stop.is_set():

            # Nothing ready upstream: wait longer before asking again.
            if not ready_seed_length():
                time.sleep(15)
                continue

            # In-memory queue is full: give the spiders time to drain it.
            if self.queue.length >= self.length:
                time.sleep(3)
                continue

            seeds = get_seed(self.length)
            self.queue.push(seeds)
        log.info("close thread: schedule_task")
|
46
|
+
|
47
|
+
|
48
|
+
class Spider:
    """Worker that pops seeds, runs the crawl callback and routes its output."""

    def __init__(self, queue, max_retries=5):
        """
        :param queue: seed queue to consume; new seeds are pushed back to it.
        :param max_retries: retry count at which a seed is given up on.
        """
        # Holds one entry per seed currently being crawled.
        self.spider_in_progress = Queue()
        self.max_retries = max_retries
        self.queue = queue

    def spider_task(self, stop, func, item, del_seed):
        """Crawl seeds from ``self.queue`` until ``stop`` is set.

        Each value yielded by ``func(item, seed)`` is handled by kind:

        * has a truthy ``table_name``: ``[seed, it.struct_data]`` is pushed
          to ``it.queue()``;
        * a ``Seed``: pushed to ``self.queue``;
        * a list or tuple: each element is wrapped in ``Seed`` if it is not
          one already, then the list is pushed to ``self.queue``;
        * a ``bool``: recorded as the seed's status;
        * ``None``: recorded as a ``False`` status.

        The last status recorded wins. ``True`` deletes the seed, ``False``
        re-queues it with ``_retry`` incremented, and no status leaves it
        untouched. An exception from the callback also re-queues the seed.

        :param stop: event ending the loop.
        :param func: crawl callback, called as ``func(item, seed)``.
        :param item: object passed through to ``func``.
        :param del_seed: callable ``del_seed(seed, spider_status=...)``.
        """
        while not stop.is_set():
            seed = self.queue.pop()
            if not seed:
                time.sleep(3)
                continue
            elif seed._retry >= self.max_retries:
                # Retries exhausted: drop the seed as failed.
                del_seed(seed, spider_status=False)
                continue
            try:
                self.spider_in_progress.push(1, direct_insertion=True)
                status = None
                for it in func(item, seed):
                    if getattr(it, "table_name", None):
                        store_queue = it.queue()
                        store_queue.push(
                            [seed, it.struct_data],
                            direct_insertion=True
                        )
                    elif isinstance(it, Seed):
                        self.queue.push(it)
                    elif isinstance(it, (list, tuple)):
                        self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
                    elif isinstance(it, bool):
                        status = it
                    elif it is None:
                        status = False

                if status is not None:
                    if status:
                        del_seed(seed, spider_status=True)
                    else:
                        seed._retry += 1
                        self.queue.push(seed)

            except Exception as e:
                # Any callback failure counts as one retry for this seed.
                seed._retry += 1
                self.queue.push(seed)
                log.info(f"{str(seed)} -> {str(e)}")
            finally:
                self.spider_in_progress.pop()
        log.info("close thread: spider")
|
98
|
+
|
99
|
+
|
100
|
+
class Storer:
    """Thread target that batches queued results and hands them to ``store``.

    The class defines no ``__init__``. ``store_task`` reads ``self.table``,
    ``self.queue``, ``self.length`` and ``self.store``, which must be
    supplied by whatever class it is combined with.
    """

    def store_task(self, stop, last, reset_seed, del_seed):
        """Flush ``self.queue`` in batches until ``stop`` is set.

        A batch is taken when ``last`` is set or the queue holds at least
        ``self.length`` entries. Each entry is a ``(seed, data)`` pair.

        :param stop: event ending the loop.
        :param last: event forcing a flush of a partial batch.
        :param reset_seed: called with the batch's seeds when ``store``
            returns a falsy value.
        :param del_seed: called with the batch's seeds when ``store``
            returns a truthy value.
        :return: ``None``; returns immediately if no base class is named
            ``StorerInterface``.
        :raises Exception: if the instance has no usable ``store`` attribute.
        """
        inf_name = "StorerInterface"
        if not issubclass_cobweb_interface(self.__class__, inf_name):
            return None

        if not getattr(self, "store", None):
            raise Exception("not have store function!")

        storer_name = self.__class__.__name__ + self.table

        while not stop.is_set():

            if last.is_set() or self.queue.length >= self.length:
                seeds, data_list = [], []

                # Take up to one batch; stop early if the queue runs dry.
                for _ in range(self.length):
                    items = self.queue.pop()
                    if not items:
                        break
                    seed, data = items
                    seeds.append(seed)
                    data_list.append(data)

                if data_list:
                    if self.store(data_list):
                        del_seed(seeds)
                    else:
                        reset_seed(seeds)
                        log.info("reset seeds!")
                    # Work was done: check the queue again without sleeping.
                    continue

            time.sleep(3)

        log.info(f"close thread: {storer_name}")
|
@@ -0,0 +1,90 @@
|
|
1
|
+
import json
|
2
|
+
import re
|
3
|
+
import sys
|
4
|
+
from abc import ABC
|
5
|
+
from typing import Iterable
|
6
|
+
from importlib import import_module
|
7
|
+
|
8
|
+
|
9
|
+
def struct_table_name(table_name):
    """Encode a table name by replacing ``.`` and ``:`` with placeholders.

    ``.`` becomes ``__p__`` and ``:`` becomes ``__c__``.

    :param table_name: table name string.
    :return: the encoded name.
    """
    return table_name.replace(".", "__p__").replace(":", "__c__")
|
11
|
+
|
12
|
+
|
13
|
+
def restore_table_name(table_name):
    """Decode a table name by turning the placeholders back into punctuation.

    ``__p__`` becomes ``.`` and ``__c__`` becomes ``:``. The replacement is
    unconditional, so a name that contained those placeholder strings to
    begin with is altered as well.

    :param table_name: encoded table name string.
    :return: the decoded name.
    """
    return table_name.replace("__p__", ".").replace("__c__", ":")
|
15
|
+
|
16
|
+
|
17
|
+
def struct_queue_name(db_name, table_name):
    """Build the interned attribute name of a storer queue.

    :param db_name: storer backend name.
    :param table_name: table name.
    :return: ``"__{db_name}_{table_name}_queue__"``, passed through
        ``sys.intern``.
    """
    return sys.intern(f"__{db_name}_{table_name}_queue__")
|
19
|
+
|
20
|
+
|
21
|
+
def parse_info(info):
    """Normalise a configuration value into a dict or a list of dicts.

    :param info: a falsy value (returned unchanged), a dict (returned
        unchanged), a JSON string (decoded with ``json.loads``), or an
        iterable whose elements are JSON strings and/or dicts.
    :return: the normalised value; for an iterable, a new list with string
        elements decoded and dict elements kept as they are.
    :raises TypeError: if an element is neither ``str`` nor ``dict``, or if
        ``info`` is of an unsupported type.
    """
    if not info:
        return info

    if isinstance(info, dict):
        return info

    if isinstance(info, str):
        return json.loads(info)

    if isinstance(info, Iterable):
        result = []
        for ii in info:
            if isinstance(ii, str):
                result.append(json.loads(ii))
            elif isinstance(ii, dict):
                result.append(ii)
            else:
                raise TypeError("must be in [str, dict]")
        return result

    # Previously fell off the end and returned None for unsupported input;
    # fail the same way as for a bad element instead.
    raise TypeError("must be in [str, dict]")
|
42
|
+
|
43
|
+
|
44
|
+
def struct_start_seeds(seeds):
    """Wrap raw start seeds in ``Seed`` objects.

    :param seeds: a list or tuple of raw seeds, or one raw seed given as a
        ``str`` or ``dict``.
    :return: a list of ``Seed`` for a list/tuple, a single ``Seed`` for a
        ``str``/``dict``, and ``None`` for empty or unsupported input.
    """
    if not seeds:
        return None

    # Imported only once there is something to wrap.
    from .bbb import Seed

    if isinstance(seeds, (list, tuple)):
        return [Seed(seed) for seed in seeds]
    if isinstance(seeds, (str, dict)):
        return Seed(seeds)
    return None
|
52
|
+
|
53
|
+
|
54
|
+
def issubclass_cobweb_interface(_class, inf_name):
    """Tell whether any base class of ``_class`` is named ``inf_name``.

    The comparison is by class ``__name__`` only, over the MRO with
    ``_class`` itself excluded.

    :param _class: class to inspect.
    :param inf_name: class name to look for, e.g. ``"StorerInterface"``.
    :return: ``True`` if a base with that name exists, else ``False``.
    """
    return any(base.__name__ == inf_name for base in _class.__mro__[1:])
|
59
|
+
|
60
|
+
|
61
|
+
def parse_import_model(model_info, model_type=None):
    """Resolve a scheduler/storer backend to ``(class_object, class_name)``.

    ``model_info`` may be:

    * ``"from pkg.module import ClassName"``: an import statement;
    * ``"pkg.module.ClassName"``: a dotted path whose last part is the class;
    * ``"name"``: a built-in backend, resolved to class ``Name`` in module
      ``cobweb.db.<model_type>.<name>``;
    * a class deriving from ``ABC`` that has a base class named
      ``SchedulerInterface`` or ``StorerInterface`` (matching ``model_type``).

    :param model_info: backend specification, as described above.
    :param model_type: ``"scheduler"`` or ``"storer"``.
    :return: tuple ``(class_object, class_name)``.
    :raises TypeError: if ``model_type`` is invalid or ``model_info`` is
        neither a string nor an ``ABC`` subclass.
    :raises ImportError: if a module cannot be imported, or a class is given
        that lacks the expected interface base.
    :raises AttributeError: if the module has no attribute ``class_name``.
    """
    if model_type not in ("scheduler", "storer"):
        raise TypeError("model_type must be in scheduler, storer")

    if isinstance(model_info, str):
        spec = model_info.strip()
        # Match a real import statement. The former substring test for
        # "import" also caught dotted paths such as "pkg.importer.Cls" and
        # then failed with AttributeError on a None match.
        statement = re.fullmatch(r"from\s+(\S+)\s+import\s+(\w+)", spec)
        if statement:
            model_path, class_name = statement.groups()
        elif "." in spec:
            model_path, _, class_name = spec.rpartition(".")
        else:
            model_path = f"cobweb.db.{model_type}.{spec.lower()}"
            class_name = spec.capitalize()
        model = import_module(model_path)
        return getattr(model, class_name), class_name

    if isinstance(model_info, type) and issubclass(model_info, ABC):
        inf_name = model_type.capitalize() + "Interface"
        if issubclass_cobweb_interface(model_info, inf_name):
            return model_info, model_info.__name__
        raise ImportError(
            f"{model_info.__name__} has no base class named {inf_name}"
        )

    raise TypeError(
        "model_info must be an import string or an ABC subclass, "
        f"got {model_info!r}"
    )
|
89
|
+
|
90
|
+
|
@@ -24,8 +24,8 @@ cobweb/distributed/__init__.py
|
|
24
24
|
cobweb/distributed/launcher.py
|
25
25
|
cobweb/distributed/models.py
|
26
26
|
cobweb/single/__init__.py
|
27
|
+
cobweb/single/launcher.py
|
27
28
|
cobweb/single/models.py
|
28
|
-
cobweb/single/nest.py
|
29
29
|
cobweb_launcher.egg-info/PKG-INFO
|
30
30
|
cobweb_launcher.egg-info/SOURCES.txt
|
31
31
|
cobweb_launcher.egg-info/dependency_links.txt
|
@@ -1,104 +0,0 @@
|
|
1
|
-
import time
|
2
|
-
# from pympler import asizeof
|
3
|
-
from single.nest import Queue
|
4
|
-
from single.nest import struct_queue_name
|
5
|
-
from single.nest import SchedulerInterface, StorerInterface
|
6
|
-
|
7
|
-
|
8
|
-
# class Transceiver:
|
9
|
-
class Distributor:
|
10
|
-
|
11
|
-
def __init__(self):
|
12
|
-
self.seed_queue = Queue()
|
13
|
-
|
14
|
-
@property
|
15
|
-
def queue_names(self):
|
16
|
-
return tuple(self.__dict__.keys())
|
17
|
-
|
18
|
-
@property
|
19
|
-
def used_memory(self):
|
20
|
-
return asizeof.asizeof(self)
|
21
|
-
|
22
|
-
def create_queue(self, queue_name: str):
|
23
|
-
self.__setattr__(queue_name, Queue())
|
24
|
-
|
25
|
-
def get_queue(self, queue_name: str):
|
26
|
-
return self.__getattribute__(queue_name)
|
27
|
-
|
28
|
-
def deal_item(self, item):
|
29
|
-
icn = item.__class__.__name__
|
30
|
-
if icn == "Seed":
|
31
|
-
self.seed_queue.push(item)
|
32
|
-
elif getattr(item, "table_name", None):
|
33
|
-
queue_name = struct_queue_name(icn, item.table_name)
|
34
|
-
getattr(self, queue_name).push(item.serialization)
|
35
|
-
|
36
|
-
def distribute(self, callback, *args, **kwargs):
|
37
|
-
iterable = callback(*args, **kwargs)
|
38
|
-
if not iterable:
|
39
|
-
return None
|
40
|
-
for result in iterable:
|
41
|
-
self.deal_item(result)
|
42
|
-
return True
|
43
|
-
|
44
|
-
|
45
|
-
class Scheduler:
|
46
|
-
|
47
|
-
def schedule_task(self, distribute):
|
48
|
-
|
49
|
-
if not issubclass(self.__class__, SchedulerInterface):
|
50
|
-
return None
|
51
|
-
|
52
|
-
if not getattr(self, "schedule", None):
|
53
|
-
raise Exception("not have schedule function!")
|
54
|
-
|
55
|
-
while not self.stop:
|
56
|
-
|
57
|
-
if self.queue.length < self.length:
|
58
|
-
distribute(self.schedule)
|
59
|
-
|
60
|
-
else:
|
61
|
-
print("------------")
|
62
|
-
time.sleep(15)
|
63
|
-
|
64
|
-
|
65
|
-
class Spider:
|
66
|
-
|
67
|
-
def __init__(self, queue):
|
68
|
-
self.queue = queue
|
69
|
-
self.spider_in_progress = Queue()
|
70
|
-
|
71
|
-
def spider_task(self, stop_event, distribute, func, item):
|
72
|
-
while not stop_event.is_set():
|
73
|
-
seed = self.queue.pop()
|
74
|
-
if not seed:
|
75
|
-
time.sleep(3)
|
76
|
-
continue
|
77
|
-
try:
|
78
|
-
self.spider_in_progress.push(1)
|
79
|
-
distribute(func, item, seed)
|
80
|
-
except Exception as e:
|
81
|
-
print(e)
|
82
|
-
finally:
|
83
|
-
self.spider_in_progress.pop()
|
84
|
-
|
85
|
-
|
86
|
-
class Storer:
|
87
|
-
|
88
|
-
def store_task(self, stop_event, last_event, distribute):
|
89
|
-
|
90
|
-
if not issubclass(self.__class__, StorerInterface):
|
91
|
-
return None
|
92
|
-
|
93
|
-
if not getattr(self, "store", None):
|
94
|
-
raise Exception("not have store function!")
|
95
|
-
|
96
|
-
while not stop_event.is_set():
|
97
|
-
if last_event.is_set() or self.queue.length > self.length:
|
98
|
-
data_list = []
|
99
|
-
data_length = min(self.queue.length, self.length)
|
100
|
-
for _ in range(data_length):
|
101
|
-
data = self.queue.pop()
|
102
|
-
data_list.append(data)
|
103
|
-
if data_list:
|
104
|
-
distribute(self.store, data_list)
|
@@ -1,153 +0,0 @@
|
|
1
|
-
import time
|
2
|
-
import threading
|
3
|
-
|
4
|
-
from single.nest import Seed, DBItem
|
5
|
-
from single.nest import struct_queue_name, restore_table_name
|
6
|
-
from single.nest import Distributor, Scheduler, Spider, Storer
|
7
|
-
|
8
|
-
|
9
|
-
def init_task_seed(seeds):
|
10
|
-
if not seeds:
|
11
|
-
return None
|
12
|
-
if isinstance(seeds, list) or isinstance(seeds, tuple):
|
13
|
-
for seed in seeds:
|
14
|
-
yield Seed(seed)
|
15
|
-
elif isinstance(seeds, str) or isinstance(seeds, dict):
|
16
|
-
yield Seed(seeds)
|
17
|
-
|
18
|
-
|
19
|
-
def parse_storer_info(storer_info):
|
20
|
-
storer_data = {}
|
21
|
-
storer_info_list = []
|
22
|
-
if storer_info.__class__.__name__ == 'StorerInfo':
|
23
|
-
storer_info_list.append(storer_info)
|
24
|
-
elif isinstance(storer_info, tuple) or isinstance(storer_info, list):
|
25
|
-
storer_info_list = storer_info
|
26
|
-
for info in storer_info_list:
|
27
|
-
db_name = info.DB.__name__
|
28
|
-
storer_data.setdefault(db_name, {"StorerDB": info.DB, "db_args_list": []})
|
29
|
-
storer_data[db_name]["db_args_list"].append(info[1:])
|
30
|
-
return storer_data
|
31
|
-
|
32
|
-
|
33
|
-
def check(stop_event, last_event, distributor, scheduler, spider, storer_list):
|
34
|
-
while True:
|
35
|
-
time.sleep(3)
|
36
|
-
if (
|
37
|
-
scheduler.stop and
|
38
|
-
not distributor.seed_queue.length and
|
39
|
-
not spider.spider_in_progress.length
|
40
|
-
):
|
41
|
-
last_event.set()
|
42
|
-
time.sleep(10)
|
43
|
-
storer_queue_empty = True
|
44
|
-
for storer in storer_list:
|
45
|
-
if storer.queue.length:
|
46
|
-
storer_queue_empty = False
|
47
|
-
break
|
48
|
-
if storer_queue_empty:
|
49
|
-
break
|
50
|
-
last_event.clear()
|
51
|
-
stop_event.set()
|
52
|
-
|
53
|
-
|
54
|
-
def cobweb(task):
|
55
|
-
"""
|
56
|
-
任务启动装饰器
|
57
|
-
:param task: 任务配置信息
|
58
|
-
"""
|
59
|
-
def decorator(func):
|
60
|
-
"""
|
61
|
-
func(Item, seed)
|
62
|
-
Item:
|
63
|
-
Item.Textfile()
|
64
|
-
Item.Console()
|
65
|
-
"""
|
66
|
-
# project task_name start_seed spider_num queue_length scheduler_info storer_info
|
67
|
-
|
68
|
-
storer_list = []
|
69
|
-
|
70
|
-
# 程序结束事件
|
71
|
-
last_event = threading.Event()
|
72
|
-
# 暂停采集事件
|
73
|
-
stop_event = threading.Event()
|
74
|
-
|
75
|
-
# 创建分发器
|
76
|
-
distributor = Distributor()
|
77
|
-
|
78
|
-
# 调度器动态继承
|
79
|
-
SchedulerDB, table, sql, length, size = task.SchedulerInfo
|
80
|
-
SchedulerTmp = type('Scheduler', (Scheduler, SchedulerDB), {})
|
81
|
-
|
82
|
-
# 初始化调度器
|
83
|
-
scheduler = SchedulerTmp(table=table, sql=sql, length=length, size=size, queue=distributor.seed_queue)
|
84
|
-
|
85
|
-
# 初始化采集器
|
86
|
-
spider = Spider(queue=distributor.seed_queue)
|
87
|
-
|
88
|
-
# 解析存储器信息
|
89
|
-
storer_data = parse_storer_info(task.storer_info)
|
90
|
-
|
91
|
-
# sds
|
92
|
-
item = type("item", (object,), {})
|
93
|
-
for db_name in storer_data.keys():
|
94
|
-
# 存储器动态继承
|
95
|
-
StorerDB = storer_data[db_name]["StorerDB"]
|
96
|
-
StorerTmp = type('Storer', (Storer, StorerDB), {})
|
97
|
-
db_args_list = storer_data[db_name]["db_args_list"]
|
98
|
-
for storer_db_args in db_args_list:
|
99
|
-
table, fields, length = storer_db_args
|
100
|
-
if not getattr(item, db_name, None):
|
101
|
-
instance = type(db_name, (DBItem,), {})
|
102
|
-
setattr(item, db_name, instance)
|
103
|
-
# 创建存储xxx
|
104
|
-
getattr(item, db_name).init_item(table, fields)
|
105
|
-
# 创建存储队列
|
106
|
-
storer_queue = struct_queue_name(db_name, table)
|
107
|
-
distributor.create_queue(queue_name=storer_queue)
|
108
|
-
queue = distributor.get_queue(queue_name=storer_queue)
|
109
|
-
# 初始话存储器
|
110
|
-
table_name = restore_table_name(table_name=table)
|
111
|
-
storer = StorerTmp(table=table_name, fields=fields, length=length, queue=queue)
|
112
|
-
storer_list.append(storer)
|
113
|
-
|
114
|
-
# 推送初始种子
|
115
|
-
distributor.distribute(init_task_seed, seeds=task.start_seed)
|
116
|
-
|
117
|
-
# 启动调度器
|
118
|
-
threading.Thread(
|
119
|
-
target=scheduler.schedule_task,
|
120
|
-
args=(distributor.distribute,),
|
121
|
-
name="single_scheduler_task"
|
122
|
-
).start()
|
123
|
-
|
124
|
-
# 启动采集器
|
125
|
-
for index in range(task.spider_num):
|
126
|
-
threading.Thread(
|
127
|
-
target=spider.spider_task,
|
128
|
-
args=(stop_event, distributor.distribute, func, item),
|
129
|
-
name=f"single_spider_task:{index}"
|
130
|
-
).start()
|
131
|
-
|
132
|
-
# 启动存储器
|
133
|
-
for storer in storer_list:
|
134
|
-
threading.Thread(
|
135
|
-
target=storer.store_task,
|
136
|
-
args=(stop_event, last_event, distributor.distribute),
|
137
|
-
name=f"single_store_task:{storer.table}",
|
138
|
-
).start()
|
139
|
-
|
140
|
-
threading.Thread(
|
141
|
-
target=check, name="check",
|
142
|
-
args=(
|
143
|
-
stop_event, last_event, distributor,
|
144
|
-
scheduler, spider, storer_list
|
145
|
-
)
|
146
|
-
).start()
|
147
|
-
|
148
|
-
# return starter(task, func)
|
149
|
-
return decorator
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
@@ -1,88 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
import sys
|
3
|
-
from typing import Iterable
|
4
|
-
|
5
|
-
import requests
|
6
|
-
|
7
|
-
|
8
|
-
# from cobweb import Seed
|
9
|
-
|
10
|
-
|
11
|
-
def struct_table_name(table_name):
|
12
|
-
return table_name.replace(".", "__p__").replace(":", "__c__")
|
13
|
-
|
14
|
-
|
15
|
-
def restore_table_name(table_name):
|
16
|
-
return table_name.replace("__p__", ".").replace("__c__", ":")
|
17
|
-
|
18
|
-
|
19
|
-
def struct_queue_name(db_name, table_name):
|
20
|
-
return sys.intern(f"__{db_name}_{table_name}_queue__")
|
21
|
-
|
22
|
-
|
23
|
-
# class StorerDB:
|
24
|
-
#
|
25
|
-
# @staticmethod
|
26
|
-
# def console(self):
|
27
|
-
# from db.storer.console import Console
|
28
|
-
# table = struct_table_name(table)
|
29
|
-
# return StorerInfo(DB=Console, table=table, length=length, config=None)
|
30
|
-
#
|
31
|
-
# @staticmethod
|
32
|
-
# def textfile(table, length=200):
|
33
|
-
# from db.storer.textfile import Textfile
|
34
|
-
# table = struct_table_name(table)
|
35
|
-
# return StorerInfo(DB=Textfile, table=table, length=length, config=None)
|
36
|
-
#
|
37
|
-
# @staticmethod
|
38
|
-
# def loghub(table, length=200, config=None):
|
39
|
-
# from db.storer.loghub import Loghub
|
40
|
-
# table = struct_table_name(table)
|
41
|
-
# return StorerInfo(DB=Loghub, table=table, length=length, config=config)
|
42
|
-
|
43
|
-
|
44
|
-
def parse_info(info):
|
45
|
-
if not info:
|
46
|
-
return info
|
47
|
-
|
48
|
-
if isinstance(info, dict):
|
49
|
-
return info
|
50
|
-
|
51
|
-
if isinstance(info, str):
|
52
|
-
return json.loads(info)
|
53
|
-
|
54
|
-
if isinstance(info, Iterable):
|
55
|
-
result = list()
|
56
|
-
for ii in info:
|
57
|
-
if isinstance(ii, str):
|
58
|
-
result.append(json.loads(ii))
|
59
|
-
elif isinstance(ii, dict):
|
60
|
-
result.append(ii)
|
61
|
-
else:
|
62
|
-
raise TypeError("must be in [str, dict]")
|
63
|
-
|
64
|
-
return result
|
65
|
-
|
66
|
-
|
67
|
-
def struct_start_seeds(seeds):
|
68
|
-
from .bbb import Seed
|
69
|
-
if not seeds:
|
70
|
-
return None
|
71
|
-
if any(isinstance(seeds, t) for t in (list, tuple)):
|
72
|
-
return [Seed(seed) for seed in seeds]
|
73
|
-
elif any(isinstance(seeds, t) for t in (str, dict)):
|
74
|
-
return Seed(seeds)
|
75
|
-
|
76
|
-
|
77
|
-
# def get_storer_db(db):
|
78
|
-
#
|
79
|
-
# if isinstance(db, str):
|
80
|
-
# model = import_module(f" db.storer.{db.lower()}")
|
81
|
-
#
|
82
|
-
# # if db.lower() in dir(StorerDB):
|
83
|
-
# # return getattr(StorerDB, db)
|
84
|
-
# # else:
|
85
|
-
# # pass
|
86
|
-
|
87
|
-
|
88
|
-
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/dependency_links.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|