cobweb-launcher 0.1.7__tar.gz → 0.1.9__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/PKG-INFO +5 -1
- cobweb-launcher-0.1.9/cobweb/__init__.py +7 -0
- cobweb-launcher-0.1.9/cobweb/constant.py +24 -0
- cobweb-launcher-0.1.9/cobweb/db/__init__.py +3 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb/db/redis_db.py +22 -32
- cobweb-launcher-0.1.9/cobweb/db/scheduler/__init__.py +1 -0
- cobweb-launcher-0.1.9/cobweb/db/scheduler/default.py +8 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb/db/scheduler/textfile.py +2 -2
- cobweb-launcher-0.1.9/cobweb/db/storer/__init__.py +1 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb/db/storer/console.py +2 -2
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb/db/storer/loghub.py +2 -2
- cobweb-launcher-0.1.7/cobweb/db/storer/redis.py → cobweb-launcher-0.1.9/cobweb/db/storer/textfile.py +2 -2
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb/decorators.py +1 -1
- cobweb-launcher-0.1.9/cobweb/equip/__init__.py +8 -0
- {cobweb-launcher-0.1.7/cobweb → cobweb-launcher-0.1.9/cobweb/equip}/distributed/launcher.py +15 -39
- {cobweb-launcher-0.1.7/cobweb → cobweb-launcher-0.1.9/cobweb/equip}/distributed/models.py +35 -26
- {cobweb-launcher-0.1.7/cobweb → cobweb-launcher-0.1.9/cobweb/equip}/single/launcher.py +18 -49
- {cobweb-launcher-0.1.7/cobweb → cobweb-launcher-0.1.9/cobweb/equip}/single/models.py +36 -26
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb/task.py +10 -3
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb_launcher.egg-info/PKG-INFO +5 -1
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb_launcher.egg-info/SOURCES.txt +8 -8
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/setup.py +1 -1
- cobweb-launcher-0.1.7/cobweb/__init__.py +0 -11
- cobweb-launcher-0.1.7/cobweb/db/__init__.py +0 -2
- cobweb-launcher-0.1.7/cobweb/db/scheduler/default.py +0 -8
- cobweb-launcher-0.1.7/cobweb/db/storer/textfile.py +0 -15
- cobweb-launcher-0.1.7/cobweb/distributed/__init__.py +0 -0
- cobweb-launcher-0.1.7/cobweb/setting.py +0 -13
- cobweb-launcher-0.1.7/cobweb/single/__init__.py +0 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/LICENSE +0 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/README.md +0 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb/bbb.py +0 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb/db/oss_db.py +0 -0
- {cobweb-launcher-0.1.7/cobweb/db/scheduler → cobweb-launcher-0.1.9/cobweb/equip/distributed}/__init__.py +0 -0
- {cobweb-launcher-0.1.7/cobweb/db/storer → cobweb-launcher-0.1.9/cobweb/equip/single}/__init__.py +0 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb/interface.py +0 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb/log.py +0 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb/utils.py +0 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cobweb-launcher
|
3
|
-
Version: 0.1.7
|
3
|
+
Version: 0.1.9
|
4
4
|
Summary: spider_hole
|
5
5
|
Home-page: https://github.com/Juannie-PP/cobweb
|
6
6
|
Author: Juannie-PP
|
@@ -11,6 +11,10 @@ Classifier: Programming Language :: Python :: 3
|
|
11
11
|
Requires-Python: >=3.7
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
License-File: LICENSE
|
14
|
+
Requires-Dist: requests>=2.19.1
|
15
|
+
Requires-Dist: oss2>=2.18.1
|
16
|
+
Requires-Dist: redis>=4.4.4
|
17
|
+
Requires-Dist: aliyun-log-python-sdk
|
14
18
|
|
15
19
|
# cobweb
|
16
20
|
|
@@ -0,0 +1,24 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
class LauncherModel:
|
4
|
+
task = "launcher model: task"
|
5
|
+
resident = "launcher model: resident"
|
6
|
+
|
7
|
+
|
8
|
+
class LogModel:
|
9
|
+
simple = "log model: simple"
|
10
|
+
common = "log model: common"
|
11
|
+
detailed = "log model: detailed"
|
12
|
+
|
13
|
+
|
14
|
+
class DealModel:
|
15
|
+
failure = "deal model: failure"
|
16
|
+
success = "deal model: success"
|
17
|
+
polling = "deal model: polling"
|
18
|
+
|
19
|
+
|
20
|
+
class Setting:
|
21
|
+
RESET_SCORE = None
|
22
|
+
CHECK_LOCK_TIME = None
|
23
|
+
DEAL_MODEL = None
|
24
|
+
LAUNCHER_MODEL = None
|
@@ -1,7 +1,8 @@
|
|
1
1
|
import time
|
2
2
|
import redis
|
3
|
-
from
|
4
|
-
from cobweb.decorators import check_redis_status
|
3
|
+
from . import log, decorators, Seed, Setting, DealModel
|
4
|
+
# from cobweb.decorators import decorators.check_redis_status
|
5
|
+
# from cobweb.constant import Setting, DealModel
|
5
6
|
|
6
7
|
|
7
8
|
class RedisDB:
|
@@ -11,9 +12,6 @@ class RedisDB:
|
|
11
12
|
project: str,
|
12
13
|
task_name: str,
|
13
14
|
config: dict,
|
14
|
-
model: int,
|
15
|
-
cs_lct: int,
|
16
|
-
rs_time: int,
|
17
15
|
):
|
18
16
|
pool = redis.ConnectionPool(**config)
|
19
17
|
self.heartbeat_key = f"{project}:{task_name}:heartbeat" # redis type string
|
@@ -25,11 +23,8 @@ class RedisDB:
|
|
25
23
|
self.check_lock = f"{project}:{task_name}:check_seed_lock" # redis type string
|
26
24
|
self.scheduler_lock = f"{project}:{task_name}:scheduler_lock" # redis type string
|
27
25
|
self.client = redis.Redis(connection_pool=pool)
|
28
|
-
self.model = model
|
29
|
-
self.cs_lct = cs_lct
|
30
|
-
self.rs_time = rs_time
|
31
26
|
|
32
|
-
@check_redis_status
|
27
|
+
@decorators.check_redis_status
|
33
28
|
def _get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
|
34
29
|
begin_time = int(time.time())
|
35
30
|
while True:
|
@@ -55,7 +50,7 @@ class RedisDB:
|
|
55
50
|
log.info("ttl: " + str(ttl))
|
56
51
|
return False
|
57
52
|
|
58
|
-
@check_redis_status
|
53
|
+
@decorators.check_redis_status
|
59
54
|
def _deal_seed(self, seeds, is_add: bool):
|
60
55
|
if not seeds:
|
61
56
|
return None
|
@@ -73,15 +68,15 @@ class RedisDB:
|
|
73
68
|
if item_info:
|
74
69
|
self.client.zadd(self.spider_key, mapping=item_info, nx=is_add, xx=not is_add)
|
75
70
|
|
76
|
-
@check_redis_status
|
71
|
+
@decorators.check_redis_status
|
77
72
|
def add_seed(self, seeds):
|
78
73
|
self._deal_seed(seeds, is_add=True)
|
79
74
|
|
80
|
-
@check_redis_status
|
75
|
+
@decorators.check_redis_status
|
81
76
|
def reset_seed(self, seeds):
|
82
77
|
self._deal_seed(seeds, is_add=False)
|
83
78
|
|
84
|
-
@check_redis_status
|
79
|
+
@decorators.check_redis_status
|
85
80
|
def del_seed(self, seeds, spider_status: bool = True):
|
86
81
|
if not seeds:
|
87
82
|
return None
|
@@ -92,18 +87,16 @@ class RedisDB:
|
|
92
87
|
seeds = [seed if isinstance(seed, Seed) else Seed(seed) for seed in seeds]
|
93
88
|
|
94
89
|
if seeds:
|
95
|
-
# redis_key = self.succeed_key if spider_status else self.failed_key
|
96
90
|
redis_key = None
|
97
|
-
if spider_status:
|
98
|
-
|
99
|
-
|
100
|
-
else:
|
91
|
+
if spider_status and Setting.DEAL_MODEL in [DealModel.success, DealModel.polling]:
|
92
|
+
redis_key = self.succeed_key
|
93
|
+
elif not spider_status:
|
101
94
|
redis_key = self.failed_key
|
102
95
|
if redis_key:
|
103
96
|
self.client.sadd(redis_key, *(str(seed) for seed in seeds))
|
104
97
|
self.client.zrem(self.spider_key, *(seed.format_seed for seed in seeds))
|
105
98
|
|
106
|
-
@check_redis_status
|
99
|
+
@decorators.check_redis_status
|
107
100
|
def set_storer(self, key, seeds):
|
108
101
|
if not seeds:
|
109
102
|
return None
|
@@ -122,7 +115,7 @@ class RedisDB:
|
|
122
115
|
self.client.zadd(self.storer_key % key, mapping=item_info)
|
123
116
|
log.info(f"zadd storer key: length {len(item_info.keys())}")
|
124
117
|
|
125
|
-
@check_redis_status
|
118
|
+
@decorators.check_redis_status
|
126
119
|
def get_seed(self, length: int = 200):
|
127
120
|
cs = time.time()
|
128
121
|
|
@@ -148,14 +141,14 @@ class RedisDB:
|
|
148
141
|
log.info("push seeds into queue time: " + str(time.time() - cs))
|
149
142
|
return result
|
150
143
|
|
151
|
-
@check_redis_status
|
144
|
+
@decorators.check_redis_status
|
152
145
|
def check_spider_queue(self, stop, storer_num):
|
153
146
|
while not stop.is_set():
|
154
147
|
# 每15s获取check锁,等待600s后仍获取不到锁则重试;获取到锁后,设置锁的存活时间为${cs_lct}s
|
155
|
-
if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
|
148
|
+
if self._get_lock(key=self.check_lock, t=Setting.CHECK_LOCK_TIME, timeout=600, sleep_time=3):
|
156
149
|
heartbeat = True if self.client.exists(self.heartbeat_key) else False
|
157
150
|
# 重启重制score值,否则获取${rs_time}分钟前的分数值
|
158
|
-
score = -int(time.time()) + self.rs_time if heartbeat else "-inf"
|
151
|
+
score = -int(time.time()) + Setting.RESET_SCORE if heartbeat else "-inf"
|
159
152
|
|
160
153
|
keys = self.client.keys(self.storer_key % "*")
|
161
154
|
|
@@ -170,7 +163,7 @@ class RedisDB:
|
|
170
163
|
break
|
171
164
|
for key in keys:
|
172
165
|
self.client.zrem(key, *members)
|
173
|
-
if
|
166
|
+
if Setting.DEAL_MODEL in [DealModel.success, DealModel.polling]:
|
174
167
|
self.client.sadd(self.succeed_key, *members)
|
175
168
|
self.client.zrem(self.spider_key, *members)
|
176
169
|
self.client.zrem(intersection_key, *members)
|
@@ -193,31 +186,28 @@ class RedisDB:
|
|
193
186
|
if not heartbeat:
|
194
187
|
self.client.setex(self.heartbeat_key, 15, "")
|
195
188
|
|
196
|
-
|
197
|
-
# time.sleep(3)
|
198
|
-
|
199
|
-
@check_redis_status
|
189
|
+
@decorators.check_redis_status
|
200
190
|
def set_heartbeat(self, stop):
|
201
191
|
time.sleep(5)
|
202
192
|
while not stop.is_set():
|
203
193
|
self.client.setex(self.heartbeat_key, 5, "")
|
204
194
|
time.sleep(3)
|
205
195
|
|
206
|
-
# @check_redis_status
|
196
|
+
# @decorators.check_redis_status
|
207
197
|
# def heartbeat(self):
|
208
198
|
# """
|
209
199
|
# 返回心跳key剩余存活时间
|
210
200
|
# """
|
211
201
|
# return self.client.ttl(self.heartbeat_key)
|
212
202
|
|
213
|
-
@check_redis_status
|
203
|
+
@decorators.check_redis_status
|
214
204
|
def spider_queue_length(self):
|
215
205
|
return self.client.zcard(self.spider_key)
|
216
206
|
|
217
|
-
@check_redis_status
|
207
|
+
@decorators.check_redis_status
|
218
208
|
def ready_seed_length(self):
|
219
209
|
return self.client.zcount(self.spider_key, min=0, max="+inf")
|
220
210
|
|
221
|
-
@check_redis_status
|
211
|
+
@decorators.check_redis_status
|
222
212
|
def get_scheduler_lock(self):
|
223
213
|
return self._get_lock(self.scheduler_lock)
|
@@ -0,0 +1 @@
|
|
1
|
+
from .. import log, Seed, SchedulerInterface as Inf
|
@@ -0,0 +1 @@
|
|
1
|
+
from .. import log, Seed, StorerInterface as Inf
|
@@ -1,15 +1,9 @@
|
|
1
1
|
import time
|
2
2
|
import threading
|
3
|
-
from threading import Thread
|
4
3
|
|
4
|
+
from .. import log, sqn, rtn, pim
|
5
|
+
from .. import Queue, DBItem, RedisDB, Setting
|
5
6
|
from .models import Scheduler, Spider, Storer
|
6
|
-
from cobweb import log, Queue, DBItem, RedisDB
|
7
|
-
from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
|
8
|
-
from cobweb.utils import (
|
9
|
-
struct_queue_name as sqn,
|
10
|
-
restore_table_name as rtn,
|
11
|
-
parse_import_model as pim,
|
12
|
-
)
|
13
7
|
|
14
8
|
|
15
9
|
def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
|
@@ -37,11 +31,10 @@ def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_
|
|
37
31
|
)
|
38
32
|
if (
|
39
33
|
scheduler.stop and
|
40
|
-
# not redis_ready_seed_length and
|
41
34
|
not memory_seed_queue_length and
|
42
35
|
not running_spider_thread_num
|
43
36
|
):
|
44
|
-
if not
|
37
|
+
if not Setting.LAUNCHER_MODEL:
|
45
38
|
log.info("spider is done?")
|
46
39
|
last.set()
|
47
40
|
time.sleep(3)
|
@@ -58,7 +51,7 @@ def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_
|
|
58
51
|
not redis_ready_seed_length and
|
59
52
|
not redis_spider_seed_length
|
60
53
|
):
|
61
|
-
if
|
54
|
+
if Setting.LAUNCHER_MODEL:
|
62
55
|
log.info("waiting for push seeds...")
|
63
56
|
status = "waiting"
|
64
57
|
time.sleep(30)
|
@@ -89,20 +82,6 @@ def launcher(task):
|
|
89
82
|
:param task: 任务配置信息
|
90
83
|
"""
|
91
84
|
def decorator(func):
|
92
|
-
"""
|
93
|
-
Item:
|
94
|
-
Textfile()
|
95
|
-
Loghub()
|
96
|
-
Console()
|
97
|
-
e.g.
|
98
|
-
task.fields = "a,b"
|
99
|
-
func(item, seed)
|
100
|
-
a = "a"
|
101
|
-
b = "b"
|
102
|
-
data = {"a": "a", "b": "b"}
|
103
|
-
yield item.Loghub(**data)
|
104
|
-
yield item.Loghub(a=a, b=b)
|
105
|
-
"""
|
106
85
|
storer_list = []
|
107
86
|
|
108
87
|
# 程序结束事件
|
@@ -111,10 +90,7 @@ def launcher(task):
|
|
111
90
|
stop = threading.Event()
|
112
91
|
|
113
92
|
# 初始化redis信息
|
114
|
-
redis_db = RedisDB(
|
115
|
-
task.project, task.task_name, task.redis_info,
|
116
|
-
model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
|
117
|
-
)
|
93
|
+
redis_db = RedisDB(task.project, task.task_name, task.redis_info)
|
118
94
|
|
119
95
|
log.info("初始化cobweb!")
|
120
96
|
|
@@ -139,9 +115,6 @@ def launcher(task):
|
|
139
115
|
length=task.scheduler_queue_length, config=scheduler_config
|
140
116
|
)
|
141
117
|
|
142
|
-
# 初始化采集器
|
143
|
-
spider = Spider(seed_queue, task.max_retries)
|
144
|
-
|
145
118
|
# 解析存储器信息
|
146
119
|
storer_info_list = task.storer_info or []
|
147
120
|
if not isinstance(storer_info_list, list):
|
@@ -178,14 +151,17 @@ def launcher(task):
|
|
178
151
|
)
|
179
152
|
storer_list.append(storer)
|
180
153
|
|
181
|
-
|
182
|
-
|
154
|
+
# 初始化采集器
|
155
|
+
spider = Spider(seed_queue, storer_list and True, task.max_retries)
|
156
|
+
|
157
|
+
threading.Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
|
158
|
+
threading.Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
|
183
159
|
|
184
160
|
# 推送初始种子
|
185
161
|
# seeds = start_seeds(task.start_seed)
|
186
162
|
redis_db.add_seed(task.seeds)
|
187
163
|
# 启动调度器, 调度至redis队列
|
188
|
-
Thread(
|
164
|
+
threading.Thread(
|
189
165
|
# name="xxxx_schedule_seeds",
|
190
166
|
target=scheduler.schedule_seed,
|
191
167
|
args=(
|
@@ -196,7 +172,7 @@ def launcher(task):
|
|
196
172
|
).start()
|
197
173
|
|
198
174
|
# 启动调度器, 调度任务队列
|
199
|
-
Thread(
|
175
|
+
threading.Thread(
|
200
176
|
# name="xxxx_schedule_task",
|
201
177
|
target=scheduler.schedule_task,
|
202
178
|
args=(
|
@@ -207,7 +183,7 @@ def launcher(task):
|
|
207
183
|
|
208
184
|
# 启动采集器
|
209
185
|
for index in range(task.spider_num):
|
210
|
-
Thread(
|
186
|
+
threading.Thread(
|
211
187
|
# name=f"xxxx_spider_task:{index}",
|
212
188
|
target=spider.spider_task,
|
213
189
|
args=(
|
@@ -218,7 +194,7 @@ def launcher(task):
|
|
218
194
|
|
219
195
|
# 启动存储器
|
220
196
|
for storer in storer_list:
|
221
|
-
Thread(
|
197
|
+
threading.Thread(
|
222
198
|
# name=f"xxxx_store_task:{storer.table}",
|
223
199
|
target=storer.store_task,
|
224
200
|
args=(
|
@@ -228,7 +204,7 @@ def launcher(task):
|
|
228
204
|
)
|
229
205
|
).start()
|
230
206
|
|
231
|
-
Thread(
|
207
|
+
threading.Thread(
|
232
208
|
# name="check_spider",
|
233
209
|
target=check,
|
234
210
|
args=(
|
@@ -1,8 +1,9 @@
|
|
1
1
|
import time
|
2
2
|
from hashlib import md5
|
3
|
-
from
|
4
|
-
from cobweb.utils import issubclass_cobweb_inf
|
3
|
+
from inspect import isgenerator
|
5
4
|
|
5
|
+
from .. import log, ici
|
6
|
+
from .. import DealModel, Queue, Seed
|
6
7
|
# from pympler import asizeof
|
7
8
|
|
8
9
|
|
@@ -11,7 +12,7 @@ class Scheduler:
|
|
11
12
|
def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
|
12
13
|
|
13
14
|
inf_name = "SchedulerInterface"
|
14
|
-
if not issubclass_cobweb_inf(self.__class__, inf_name):
|
15
|
+
if not ici(self.__class__, inf_name):
|
15
16
|
raise Exception("not have schedule function!")
|
16
17
|
|
17
18
|
if self.__class__.__name__ == "Default":
|
@@ -34,7 +35,7 @@ class Scheduler:
|
|
34
35
|
while not stop.is_set():
|
35
36
|
|
36
37
|
if not ready_seed_length():
|
37
|
-
time.sleep(
|
38
|
+
time.sleep(5)
|
38
39
|
continue
|
39
40
|
|
40
41
|
if self.queue.length >= self.length:
|
@@ -48,9 +49,10 @@ class Scheduler:
|
|
48
49
|
|
49
50
|
class Spider:
|
50
51
|
|
51
|
-
def __init__(self, queue, max_retries=5):
|
52
|
+
def __init__(self, queue, storage, max_retries=5):
|
52
53
|
self.spider_in_progress = Queue()
|
53
54
|
self.max_retries = max_retries
|
55
|
+
self.storage = storage
|
54
56
|
self.queue = queue
|
55
57
|
|
56
58
|
def spider_task(self, stop, func, item, del_seed):
|
@@ -65,33 +67,42 @@ class Spider:
|
|
65
67
|
try:
|
66
68
|
self.spider_in_progress.push(1, direct_insertion=True)
|
67
69
|
# log.info("spider seed: " + str(seed))
|
68
|
-
|
69
|
-
status = None
|
70
|
+
|
70
71
|
store_queue = None
|
71
72
|
store_data = list()
|
72
|
-
|
73
|
-
|
73
|
+
|
74
|
+
iterators = func(item, seed)
|
75
|
+
|
76
|
+
if not isgenerator(iterators):
|
77
|
+
if not self.storage:
|
78
|
+
del_seed(seed, spider_status=True)
|
79
|
+
continue
|
80
|
+
raise TypeError(f"{func.__name__} isn't a generator")
|
81
|
+
|
82
|
+
for it in iterators:
|
74
83
|
if getattr(it, "table_name", None):
|
75
84
|
if not store_queue:
|
76
85
|
store_queue = it.queue()
|
77
86
|
store_data.append(it.struct_data)
|
78
87
|
elif isinstance(it, Seed):
|
79
88
|
self.queue.push(it)
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
89
|
+
|
90
|
+
elif isinstance(it, str) and it == DealModel.polling:
|
91
|
+
self.queue.push(seed)
|
92
|
+
break
|
93
|
+
elif isinstance(it, str) and it == DealModel.success:
|
94
|
+
del_seed(seed, spider_status=True)
|
95
|
+
break
|
96
|
+
elif isinstance(it, str) and it == DealModel.failure:
|
97
|
+
del_seed(seed, spider_status=False)
|
98
|
+
break
|
99
|
+
else:
|
100
|
+
raise TypeError("yield value type error!")
|
84
101
|
|
85
102
|
if store_queue and store_data:
|
86
103
|
store_data.append(seed)
|
87
104
|
store_queue.push(store_data)
|
88
105
|
|
89
|
-
if status:
|
90
|
-
del_seed(seed, spider_status=True)
|
91
|
-
elif not ret_count or status is False:
|
92
|
-
seed._retry += 1
|
93
|
-
self.queue.push(seed)
|
94
|
-
|
95
106
|
except Exception as e:
|
96
107
|
seed._retry += 1
|
97
108
|
self.queue.push(seed)
|
@@ -106,7 +117,7 @@ class Storer:
|
|
106
117
|
def store_task(self, stop, last, reset_seed, set_storer):
|
107
118
|
|
108
119
|
inf_name = "StorerInterface"
|
109
|
-
if not issubclass_cobweb_inf(self.__class__, inf_name):
|
120
|
+
if not ici(self.__class__, inf_name):
|
110
121
|
return None
|
111
122
|
|
112
123
|
if not getattr(self, "store", None):
|
@@ -131,12 +142,10 @@ class Storer:
|
|
131
142
|
continue
|
132
143
|
data_list.append(data)
|
133
144
|
|
134
|
-
if data_list:
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
reset_seed(seeds)
|
139
|
-
continue
|
145
|
+
if self.store(data_list):
|
146
|
+
set_storer(store_key_id, seeds)
|
147
|
+
else:
|
148
|
+
reset_seed(seeds)
|
140
149
|
|
141
150
|
time.sleep(3)
|
142
151
|
|
@@ -1,15 +1,9 @@
|
|
1
1
|
import time
|
2
2
|
import threading
|
3
|
-
from threading import Thread
|
4
3
|
|
4
|
+
from .. import log, sqn, rtn, pim
|
5
|
+
from .. import Queue, DBItem, RedisDB, Setting
|
5
6
|
from .models import Scheduler, Spider, Storer
|
6
|
-
from cobweb import log, Queue, DBItem, RedisDB
|
7
|
-
from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
|
8
|
-
from cobweb.utils import (
|
9
|
-
struct_queue_name as sqn,
|
10
|
-
restore_table_name as rtn,
|
11
|
-
parse_import_model as pim,
|
12
|
-
)
|
13
7
|
|
14
8
|
|
15
9
|
def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
|
@@ -29,27 +23,26 @@ def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue
|
|
29
23
|
redis_ready_seed_length = ready_seed_length()
|
30
24
|
redis_spider_seed_length = spider_queue_length()
|
31
25
|
memory_seed_queue_length = scheduler.queue.length
|
32
|
-
storer_upload_queue_length = storer.queue.length
|
26
|
+
storer_upload_queue_length = storer.queue.length if storer else None
|
33
27
|
if (
|
34
28
|
scheduler.stop and
|
35
|
-
# not redis_ready_seed_length and
|
36
29
|
not memory_seed_queue_length and
|
37
30
|
not running_spider_thread_num
|
38
31
|
):
|
39
|
-
if not
|
32
|
+
if not Setting.LAUNCHER_MODEL:
|
40
33
|
log.info("spider is done?")
|
41
34
|
last.set()
|
42
35
|
time.sleep(3)
|
43
36
|
storer_queue_empty = True
|
44
|
-
if storer.queue.length:
|
37
|
+
if storer and storer.queue.length:
|
45
38
|
storer_queue_empty = False
|
46
|
-
storer_upload_queue_length = storer.queue.length
|
39
|
+
storer_upload_queue_length = storer.queue.length if storer else None
|
47
40
|
if (
|
48
41
|
storer_queue_empty and
|
49
42
|
not redis_ready_seed_length and
|
50
43
|
not redis_spider_seed_length
|
51
44
|
):
|
52
|
-
if
|
45
|
+
if Setting.LAUNCHER_MODEL:
|
53
46
|
log.info("waiting for push seeds...")
|
54
47
|
status = "waiting"
|
55
48
|
time.sleep(30)
|
@@ -78,32 +71,13 @@ def launcher(task):
|
|
78
71
|
:param task: 任务配置信息
|
79
72
|
"""
|
80
73
|
def decorator(func):
|
81
|
-
"""
|
82
|
-
Item:
|
83
|
-
Textfile()
|
84
|
-
Loghub()
|
85
|
-
Console()
|
86
|
-
e.g.
|
87
|
-
task.fields = "a,b"
|
88
|
-
func(item, seed)
|
89
|
-
a = "a"
|
90
|
-
b = "b"
|
91
|
-
data = {"a": "a", "b": "b"}
|
92
|
-
yield item.Loghub(**data)
|
93
|
-
yield item.Loghub(a=a, b=b)
|
94
|
-
"""
|
95
|
-
storer_list = []
|
96
|
-
|
97
74
|
# 程序结束事件
|
98
75
|
last = threading.Event()
|
99
76
|
# 停止采集事件
|
100
77
|
stop = threading.Event()
|
101
78
|
|
102
79
|
# 初始化redis信息
|
103
|
-
redis_db = RedisDB(
|
104
|
-
task.project, task.task_name, task.redis_info,
|
105
|
-
model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
|
106
|
-
)
|
80
|
+
redis_db = RedisDB(task.project, task.task_name, task.redis_info)
|
107
81
|
|
108
82
|
# new item
|
109
83
|
item = type("Item", (object,), {"redis_client": redis_db.client})()
|
@@ -113,7 +87,6 @@ def launcher(task):
|
|
113
87
|
seed_queue = Queue()
|
114
88
|
|
115
89
|
scheduler_info = task.scheduler_info or dict()
|
116
|
-
|
117
90
|
# 调度器动态继承
|
118
91
|
sql = scheduler_info.get("sql")
|
119
92
|
table = scheduler_info.get("table")
|
@@ -123,22 +96,15 @@ def launcher(task):
|
|
123
96
|
DB, class_name = pim(scheduler_db, "scheduler")
|
124
97
|
# SchedulerDB, table, sql, length, size, config = task.scheduler_info
|
125
98
|
SchedulerTmp = type(class_name, (Scheduler, DB), {})
|
126
|
-
|
127
99
|
# 初始化调度器
|
128
100
|
scheduler = SchedulerTmp(
|
129
101
|
table=table, sql=sql, size=size, queue=seed_queue,
|
130
102
|
length=task.scheduler_queue_length, config=scheduler_config
|
131
103
|
)
|
132
104
|
|
133
|
-
# 初始化采集器
|
134
|
-
spider = Spider(seed_queue, task.max_retries)
|
135
|
-
|
136
105
|
storer = None
|
137
|
-
|
138
|
-
# 解析存储器信息
|
139
106
|
storer_info = task.storer_info or dict()
|
140
107
|
|
141
|
-
# for storer_info in storer_info_list:
|
142
108
|
if storer_info:
|
143
109
|
storer_db = storer_info["db"]
|
144
110
|
fields = storer_info["fields"]
|
@@ -166,14 +132,17 @@ def launcher(task):
|
|
166
132
|
queue=queue, config=storer_config
|
167
133
|
)
|
168
134
|
|
169
|
-
|
170
|
-
|
135
|
+
# 初始化采集器
|
136
|
+
spider = Spider(seed_queue, storer and True, task.max_retries)
|
137
|
+
|
138
|
+
threading.Thread(target=redis_db.check_spider_queue, args=(stop, 0)).start()
|
139
|
+
threading.Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
|
171
140
|
|
172
141
|
# 推送初始种子
|
173
142
|
# seeds = start_seeds(task.start_seed)
|
174
143
|
redis_db.add_seed(task.seeds)
|
175
144
|
# 启动调度器, 调度至redis队列
|
176
|
-
Thread(
|
145
|
+
threading.Thread(
|
177
146
|
# name="xxxx_schedule_seeds",
|
178
147
|
target=scheduler.schedule_seed,
|
179
148
|
args=(
|
@@ -184,7 +153,7 @@ def launcher(task):
|
|
184
153
|
).start()
|
185
154
|
|
186
155
|
# 启动调度器, 调度任务队列
|
187
|
-
Thread(
|
156
|
+
threading.Thread(
|
188
157
|
# name="xxxx_schedule_task",
|
189
158
|
target=scheduler.schedule_task,
|
190
159
|
args=(
|
@@ -195,7 +164,7 @@ def launcher(task):
|
|
195
164
|
|
196
165
|
# 启动采集器
|
197
166
|
for index in range(task.spider_num):
|
198
|
-
Thread(
|
167
|
+
threading.Thread(
|
199
168
|
# name=f"xxxx_spider_task:{index}",
|
200
169
|
target=spider.spider_task,
|
201
170
|
args=(
|
@@ -206,7 +175,7 @@ def launcher(task):
|
|
206
175
|
|
207
176
|
# 启动存储器
|
208
177
|
if storer:
|
209
|
-
Thread(
|
178
|
+
threading.Thread(
|
210
179
|
# name=f"xxxx_store_task:{storer.table}",
|
211
180
|
target=storer.store_task,
|
212
181
|
args=(
|
@@ -216,7 +185,7 @@ def launcher(task):
|
|
216
185
|
)
|
217
186
|
).start()
|
218
187
|
|
219
|
-
Thread(
|
188
|
+
threading.Thread(
|
220
189
|
# name="check_spider",
|
221
190
|
target=check,
|
222
191
|
args=(
|
@@ -1,7 +1,8 @@
|
|
1
1
|
import time
|
2
|
-
from
|
3
|
-
from cobweb.utils import issubclass_cobweb_inf
|
2
|
+
from inspect import isgenerator
|
4
3
|
# from pympler import asizeof
|
4
|
+
from .. import log, ici
|
5
|
+
from .. import DealModel, Queue, Seed
|
5
6
|
|
6
7
|
|
7
8
|
class Scheduler:
|
@@ -9,7 +10,7 @@ class Scheduler:
|
|
9
10
|
def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
|
10
11
|
|
11
12
|
inf_name = "SchedulerInterface"
|
12
|
-
if not issubclass_cobweb_inf(self.__class__, inf_name):
|
13
|
+
if not ici(self.__class__, inf_name):
|
13
14
|
raise Exception("not have schedule function!")
|
14
15
|
|
15
16
|
if self.__class__.__name__ == "Default":
|
@@ -32,7 +33,7 @@ class Scheduler:
|
|
32
33
|
while not stop.is_set():
|
33
34
|
|
34
35
|
if not ready_seed_length():
|
35
|
-
time.sleep(
|
36
|
+
time.sleep(5)
|
36
37
|
continue
|
37
38
|
|
38
39
|
if self.queue.length >= self.length:
|
@@ -46,27 +47,37 @@ class Scheduler:
|
|
46
47
|
|
47
48
|
class Spider:
|
48
49
|
|
49
|
-
def __init__(self, queue, max_retries=5):
|
50
|
+
def __init__(self, queue, storage, max_retries=5):
|
50
51
|
self.spider_in_progress = Queue()
|
51
52
|
self.max_retries = max_retries
|
53
|
+
self.storage = storage
|
52
54
|
self.queue = queue
|
53
55
|
|
54
56
|
def spider_task(self, stop, func, item, del_seed):
|
55
57
|
while not stop.is_set():
|
58
|
+
|
56
59
|
seed = self.queue.pop()
|
60
|
+
|
57
61
|
if not seed:
|
58
62
|
time.sleep(3)
|
59
63
|
continue
|
64
|
+
|
60
65
|
elif seed._retry >= self.max_retries:
|
61
66
|
del_seed(seed, spider_status=False)
|
62
67
|
continue
|
68
|
+
|
63
69
|
try:
|
64
70
|
self.spider_in_progress.push(1, direct_insertion=True)
|
65
71
|
# log.info("spider seed: " + str(seed))
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
72
|
+
iterators = func(item, seed)
|
73
|
+
|
74
|
+
if not isgenerator(iterators):
|
75
|
+
if not self.storage:
|
76
|
+
del_seed(seed, spider_status=True)
|
77
|
+
continue
|
78
|
+
raise TypeError(f"{func.__name__} isn't a generator")
|
79
|
+
|
80
|
+
for it in iterators:
|
70
81
|
if getattr(it, "table_name", None):
|
71
82
|
store_queue = it.queue()
|
72
83
|
store_queue.push(
|
@@ -75,16 +86,18 @@ class Spider:
|
|
75
86
|
)
|
76
87
|
elif isinstance(it, Seed):
|
77
88
|
self.queue.push(it)
|
78
|
-
elif any(isinstance(it, t) for t in (list, tuple)):
|
79
|
-
self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
|
80
|
-
elif isinstance(it, bool):
|
81
|
-
status = it
|
82
89
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
90
|
+
elif isinstance(it, str) and it == DealModel.polling:
|
91
|
+
self.queue.push(seed)
|
92
|
+
break
|
93
|
+
elif isinstance(it, str) and it == DealModel.success:
|
94
|
+
del_seed(seed, spider_status=True)
|
95
|
+
break
|
96
|
+
elif isinstance(it, str) and it == DealModel.failure:
|
97
|
+
del_seed(seed, spider_status=False)
|
98
|
+
break
|
99
|
+
else:
|
100
|
+
raise TypeError("yield value type error!")
|
88
101
|
|
89
102
|
except Exception as e:
|
90
103
|
seed._retry += 1
|
@@ -100,7 +113,7 @@ class Storer:
|
|
100
113
|
def store_task(self, stop, last, reset_seed, del_seed):
|
101
114
|
|
102
115
|
inf_name = "StorerInterface"
|
103
|
-
if not issubclass_cobweb_inf(self.__class__, inf_name):
|
116
|
+
if not ici(self.__class__, inf_name):
|
104
117
|
return None
|
105
118
|
|
106
119
|
if not getattr(self, "store", None):
|
@@ -121,13 +134,10 @@ class Storer:
|
|
121
134
|
seeds.append(seed)
|
122
135
|
data_list.append(data)
|
123
136
|
|
124
|
-
if data_list:
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
reset_seed(seeds)
|
129
|
-
log.info("reset seeds!")
|
130
|
-
continue
|
137
|
+
if self.store(data_list):
|
138
|
+
del_seed(seeds)
|
139
|
+
else:
|
140
|
+
reset_seed(seeds)
|
131
141
|
|
132
142
|
time.sleep(3)
|
133
143
|
|
@@ -1,11 +1,19 @@
|
|
1
|
+
import os
|
2
|
+
from .constant import *
|
1
3
|
from .utils import parse_info, struct_start_seeds
|
2
4
|
|
3
5
|
|
6
|
+
def init_task_env():
|
7
|
+
Setting.RESET_SCORE = int(os.getenv("RESET_SCORE", 600))
|
8
|
+
Setting.CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", 30))
|
9
|
+
Setting.DEAL_MODEL = os.getenv("DEAL_MODEL", DealModel.failure)
|
10
|
+
Setting.LAUNCHER_MODEL = os.getenv("LAUNCHER_MODEL", LauncherModel.task)
|
11
|
+
|
12
|
+
|
4
13
|
class Task:
|
5
14
|
|
6
15
|
def __init__(
|
7
16
|
self,
|
8
|
-
# model=None,
|
9
17
|
seeds=None,
|
10
18
|
project=None,
|
11
19
|
task_name=None,
|
@@ -31,8 +39,7 @@ class Task:
|
|
31
39
|
:param storer_queue_length:
|
32
40
|
:param scheduler_queue_length:
|
33
41
|
"""
|
34
|
-
|
35
|
-
|
42
|
+
init_task_env()
|
36
43
|
self.seeds = struct_start_seeds(seeds)
|
37
44
|
self.project = project or "test"
|
38
45
|
self.task_name = task_name or "spider"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cobweb-launcher
|
3
|
-
Version: 0.1.7
|
3
|
+
Version: 0.1.9
|
4
4
|
Summary: spider_hole
|
5
5
|
Home-page: https://github.com/Juannie-PP/cobweb
|
6
6
|
Author: Juannie-PP
|
@@ -11,6 +11,10 @@ Classifier: Programming Language :: Python :: 3
|
|
11
11
|
Requires-Python: >=3.7
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
License-File: LICENSE
|
14
|
+
Requires-Dist: requests>=2.19.1
|
15
|
+
Requires-Dist: oss2>=2.18.1
|
16
|
+
Requires-Dist: redis>=4.4.4
|
17
|
+
Requires-Dist: aliyun-log-python-sdk
|
14
18
|
|
15
19
|
# cobweb
|
16
20
|
|
@@ -3,10 +3,10 @@ README.md
|
|
3
3
|
setup.py
|
4
4
|
cobweb/__init__.py
|
5
5
|
cobweb/bbb.py
|
6
|
+
cobweb/constant.py
|
6
7
|
cobweb/decorators.py
|
7
8
|
cobweb/interface.py
|
8
9
|
cobweb/log.py
|
9
|
-
cobweb/setting.py
|
10
10
|
cobweb/task.py
|
11
11
|
cobweb/utils.py
|
12
12
|
cobweb/db/__init__.py
|
@@ -18,14 +18,14 @@ cobweb/db/scheduler/textfile.py
|
|
18
18
|
cobweb/db/storer/__init__.py
|
19
19
|
cobweb/db/storer/console.py
|
20
20
|
cobweb/db/storer/loghub.py
|
21
|
-
cobweb/db/storer/redis.py
|
22
21
|
cobweb/db/storer/textfile.py
|
23
|
-
cobweb/distributed/__init__.py
|
24
|
-
cobweb/distributed/launcher.py
|
25
|
-
cobweb/distributed/models.py
|
26
|
-
cobweb/single/__init__.py
|
27
|
-
cobweb/single/launcher.py
|
28
|
-
cobweb/single/models.py
|
22
|
+
cobweb/equip/__init__.py
|
23
|
+
cobweb/equip/distributed/__init__.py
|
24
|
+
cobweb/equip/distributed/launcher.py
|
25
|
+
cobweb/equip/distributed/models.py
|
26
|
+
cobweb/equip/single/__init__.py
|
27
|
+
cobweb/equip/single/launcher.py
|
28
|
+
cobweb/equip/single/models.py
|
29
29
|
cobweb_launcher.egg-info/PKG-INFO
|
30
30
|
cobweb_launcher.egg-info/SOURCES.txt
|
31
31
|
cobweb_launcher.egg-info/dependency_links.txt
|
@@ -1,11 +0,0 @@
|
|
1
|
-
from .bbb import Seed, Queue, DBItem
|
2
|
-
from .task import Task
|
3
|
-
from .log import log
|
4
|
-
from .interface import SchedulerInterface, StorerInterface
|
5
|
-
from .db.redis_db import RedisDB
|
6
|
-
from .db.oss_db import OssDB
|
7
|
-
from .distributed.launcher import launcher
|
8
|
-
from .single.launcher import launcher as single_launcher
|
9
|
-
from . import setting
|
10
|
-
|
11
|
-
|
@@ -1,15 +0,0 @@
|
|
1
|
-
from cobweb import log, StorerInterface
|
2
|
-
|
3
|
-
|
4
|
-
class Textfile(StorerInterface):
|
5
|
-
|
6
|
-
def store(self, data_list):
|
7
|
-
try:
|
8
|
-
data_str = "\n".join(str(data) for data in data_list)
|
9
|
-
with open(self.table, "a") as fp:
|
10
|
-
fp.write(data_str)
|
11
|
-
log.info(f"save data, data length: {len(data_list)}")
|
12
|
-
return True
|
13
|
-
except Exception as e:
|
14
|
-
return False
|
15
|
-
|
File without changes
|
@@ -1,13 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
|
3
|
-
|
4
|
-
# model: 0, 1, 2
|
5
|
-
MODEL = int(os.getenv("MODEL", "0"))
|
6
|
-
|
7
|
-
# 重制score值的等待时间, 默认10分钟
|
8
|
-
RESET_SCORE = int(os.getenv("RESET_SCORE", "600"))
|
9
|
-
|
10
|
-
# 默认设置检查spider queue队列锁的存活时间为30s
|
11
|
-
CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", 30))
|
12
|
-
|
13
|
-
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{cobweb-launcher-0.1.7/cobweb/db/storer → cobweb-launcher-0.1.9/cobweb/equip/single}/__init__.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{cobweb-launcher-0.1.7 → cobweb-launcher-0.1.9}/cobweb_launcher.egg-info/dependency_links.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|