cobweb-launcher 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cobweb-launcher might be problematic. Click here for more details.
- cobweb/__init__.py +2 -6
- cobweb/constant.py +24 -0
- cobweb/db/__init__.py +3 -2
- cobweb/db/redis_db.py +22 -32
- cobweb/db/scheduler/__init__.py +1 -0
- cobweb/db/scheduler/default.py +2 -2
- cobweb/db/scheduler/textfile.py +2 -2
- cobweb/db/storer/__init__.py +1 -0
- cobweb/db/storer/console.py +2 -2
- cobweb/db/storer/loghub.py +2 -2
- cobweb/db/storer/textfile.py +2 -2
- cobweb/decorators.py +1 -1
- cobweb/equip/__init__.py +8 -0
- cobweb/equip/distributed/__init__.py +0 -0
- cobweb/equip/distributed/launcher.py +219 -0
- cobweb/equip/distributed/models.py +152 -0
- cobweb/equip/single/__init__.py +0 -0
- cobweb/equip/single/launcher.py +200 -0
- cobweb/equip/single/models.py +144 -0
- cobweb/single/nest.py +3 -3
- cobweb/task.py +10 -3
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-0.1.10.dist-info}/METADATA +4 -4
- cobweb_launcher-0.1.10.dist-info/RECORD +39 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-0.1.10.dist-info}/WHEEL +1 -1
- cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-0.1.10.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-0.1.10.dist-info}/top_level.txt +0 -0
cobweb/__init__.py
CHANGED
|
@@ -1,11 +1,7 @@
|
|
|
1
1
|
from .bbb import Seed, Queue, DBItem
|
|
2
2
|
from .task import Task
|
|
3
3
|
from .log import log
|
|
4
|
-
from .interface import SchedulerInterface, StorerInterface
|
|
5
4
|
from .db.redis_db import RedisDB
|
|
6
|
-
from .db.oss_db import OssDB
|
|
7
|
-
from .distributed.launcher import launcher
|
|
8
|
-
from .single.launcher import launcher as single_launcher
|
|
9
|
-
from . import setting
|
|
10
|
-
|
|
11
5
|
|
|
6
|
+
from .equip.distributed.launcher import launcher
|
|
7
|
+
from .equip.single.launcher import launcher as single_launcher
|
cobweb/constant.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
class LauncherModel:
|
|
4
|
+
task = "launcher model: task"
|
|
5
|
+
resident = "launcher model: resident"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class LogModel:
|
|
9
|
+
simple = "log model: simple"
|
|
10
|
+
common = "log model: common"
|
|
11
|
+
detailed = "log model: detailed"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DealModel:
|
|
15
|
+
failure = "deal model: failure"
|
|
16
|
+
success = "deal model: success"
|
|
17
|
+
polling = "deal model: polling"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Setting:
|
|
21
|
+
RESET_SCORE = None
|
|
22
|
+
CHECK_LOCK_TIME = None
|
|
23
|
+
DEAL_MODEL = None
|
|
24
|
+
LAUNCHER_MODEL = None
|
cobweb/db/__init__.py
CHANGED
|
@@ -1,2 +1,3 @@
|
|
|
1
|
-
from
|
|
2
|
-
from
|
|
1
|
+
from .. import log, Seed, decorators
|
|
2
|
+
from ..constant import Setting, DealModel
|
|
3
|
+
from ..interface import SchedulerInterface, StorerInterface
|
cobweb/db/redis_db.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import time
|
|
2
2
|
import redis
|
|
3
|
-
from
|
|
4
|
-
from cobweb.decorators import check_redis_status
|
|
3
|
+
from . import log, decorators, Seed, Setting, DealModel
|
|
4
|
+
# from cobweb.decorators import decorators.check_redis_status
|
|
5
|
+
# from cobweb.constant import Setting, DealModel
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class RedisDB:
|
|
@@ -11,9 +12,6 @@ class RedisDB:
|
|
|
11
12
|
project: str,
|
|
12
13
|
task_name: str,
|
|
13
14
|
config: dict,
|
|
14
|
-
model: int,
|
|
15
|
-
cs_lct: int,
|
|
16
|
-
rs_time: int,
|
|
17
15
|
):
|
|
18
16
|
pool = redis.ConnectionPool(**config)
|
|
19
17
|
self.heartbeat_key = f"{project}:{task_name}:heartbeat" # redis type string
|
|
@@ -25,11 +23,8 @@ class RedisDB:
|
|
|
25
23
|
self.check_lock = f"{project}:{task_name}:check_seed_lock" # redis type string
|
|
26
24
|
self.scheduler_lock = f"{project}:{task_name}:scheduler_lock" # redis type string
|
|
27
25
|
self.client = redis.Redis(connection_pool=pool)
|
|
28
|
-
self.model = model
|
|
29
|
-
self.cs_lct = cs_lct
|
|
30
|
-
self.rs_time = rs_time
|
|
31
26
|
|
|
32
|
-
@check_redis_status
|
|
27
|
+
@decorators.check_redis_status
|
|
33
28
|
def _get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
|
|
34
29
|
begin_time = int(time.time())
|
|
35
30
|
while True:
|
|
@@ -55,7 +50,7 @@ class RedisDB:
|
|
|
55
50
|
log.info("ttl: " + str(ttl))
|
|
56
51
|
return False
|
|
57
52
|
|
|
58
|
-
@check_redis_status
|
|
53
|
+
@decorators.check_redis_status
|
|
59
54
|
def _deal_seed(self, seeds, is_add: bool):
|
|
60
55
|
if not seeds:
|
|
61
56
|
return None
|
|
@@ -73,15 +68,15 @@ class RedisDB:
|
|
|
73
68
|
if item_info:
|
|
74
69
|
self.client.zadd(self.spider_key, mapping=item_info, nx=is_add, xx=not is_add)
|
|
75
70
|
|
|
76
|
-
@check_redis_status
|
|
71
|
+
@decorators.check_redis_status
|
|
77
72
|
def add_seed(self, seeds):
|
|
78
73
|
self._deal_seed(seeds, is_add=True)
|
|
79
74
|
|
|
80
|
-
@check_redis_status
|
|
75
|
+
@decorators.check_redis_status
|
|
81
76
|
def reset_seed(self, seeds):
|
|
82
77
|
self._deal_seed(seeds, is_add=False)
|
|
83
78
|
|
|
84
|
-
@check_redis_status
|
|
79
|
+
@decorators.check_redis_status
|
|
85
80
|
def del_seed(self, seeds, spider_status: bool = True):
|
|
86
81
|
if not seeds:
|
|
87
82
|
return None
|
|
@@ -92,18 +87,16 @@ class RedisDB:
|
|
|
92
87
|
seeds = [seed if isinstance(seed, Seed) else Seed(seed) for seed in seeds]
|
|
93
88
|
|
|
94
89
|
if seeds:
|
|
95
|
-
# redis_key = self.succeed_key if spider_status else self.failed_key
|
|
96
90
|
redis_key = None
|
|
97
|
-
if spider_status:
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
else:
|
|
91
|
+
if spider_status and Setting.DEAL_MODEL in [DealModel.success, DealModel.polling]:
|
|
92
|
+
redis_key = self.succeed_key
|
|
93
|
+
elif not spider_status:
|
|
101
94
|
redis_key = self.failed_key
|
|
102
95
|
if redis_key:
|
|
103
96
|
self.client.sadd(redis_key, *(str(seed) for seed in seeds))
|
|
104
97
|
self.client.zrem(self.spider_key, *(seed.format_seed for seed in seeds))
|
|
105
98
|
|
|
106
|
-
@check_redis_status
|
|
99
|
+
@decorators.check_redis_status
|
|
107
100
|
def set_storer(self, key, seeds):
|
|
108
101
|
if not seeds:
|
|
109
102
|
return None
|
|
@@ -122,7 +115,7 @@ class RedisDB:
|
|
|
122
115
|
self.client.zadd(self.storer_key % key, mapping=item_info)
|
|
123
116
|
log.info(f"zadd storer key: length {len(item_info.keys())}")
|
|
124
117
|
|
|
125
|
-
@check_redis_status
|
|
118
|
+
@decorators.check_redis_status
|
|
126
119
|
def get_seed(self, length: int = 200):
|
|
127
120
|
cs = time.time()
|
|
128
121
|
|
|
@@ -148,14 +141,14 @@ class RedisDB:
|
|
|
148
141
|
log.info("push seeds into queue time: " + str(time.time() - cs))
|
|
149
142
|
return result
|
|
150
143
|
|
|
151
|
-
@check_redis_status
|
|
144
|
+
@decorators.check_redis_status
|
|
152
145
|
def check_spider_queue(self, stop, storer_num):
|
|
153
146
|
while not stop.is_set():
|
|
154
147
|
# 每15s获取check锁,等待600s后仍获取不到锁则重试;获取到锁后,设置锁的存活时间为${cs_lct}s
|
|
155
|
-
if self._get_lock(key=self.check_lock, t=
|
|
148
|
+
if self._get_lock(key=self.check_lock, t=Setting.CHECK_LOCK_TIME, timeout=600, sleep_time=3):
|
|
156
149
|
heartbeat = True if self.client.exists(self.heartbeat_key) else False
|
|
157
150
|
# 重启重制score值,否则获取${rs_time}分钟前的分数值
|
|
158
|
-
score = -int(time.time()) +
|
|
151
|
+
score = -int(time.time()) + Setting.RESET_SCORE if heartbeat else "-inf"
|
|
159
152
|
|
|
160
153
|
keys = self.client.keys(self.storer_key % "*")
|
|
161
154
|
|
|
@@ -170,7 +163,7 @@ class RedisDB:
|
|
|
170
163
|
break
|
|
171
164
|
for key in keys:
|
|
172
165
|
self.client.zrem(key, *members)
|
|
173
|
-
if
|
|
166
|
+
if Setting.DEAL_MODEL in [DealModel.success, DealModel.polling]:
|
|
174
167
|
self.client.sadd(self.succeed_key, *members)
|
|
175
168
|
self.client.zrem(self.spider_key, *members)
|
|
176
169
|
self.client.zrem(intersection_key, *members)
|
|
@@ -193,31 +186,28 @@ class RedisDB:
|
|
|
193
186
|
if not heartbeat:
|
|
194
187
|
self.client.setex(self.heartbeat_key, 15, "")
|
|
195
188
|
|
|
196
|
-
|
|
197
|
-
# time.sleep(3)
|
|
198
|
-
|
|
199
|
-
@check_redis_status
|
|
189
|
+
@decorators.check_redis_status
|
|
200
190
|
def set_heartbeat(self, stop):
|
|
201
191
|
time.sleep(5)
|
|
202
192
|
while not stop.is_set():
|
|
203
193
|
self.client.setex(self.heartbeat_key, 5, "")
|
|
204
194
|
time.sleep(3)
|
|
205
195
|
|
|
206
|
-
# @check_redis_status
|
|
196
|
+
# @decorators.check_redis_status
|
|
207
197
|
# def heartbeat(self):
|
|
208
198
|
# """
|
|
209
199
|
# 返回心跳key剩余存活时间
|
|
210
200
|
# """
|
|
211
201
|
# return self.client.ttl(self.heartbeat_key)
|
|
212
202
|
|
|
213
|
-
@check_redis_status
|
|
203
|
+
@decorators.check_redis_status
|
|
214
204
|
def spider_queue_length(self):
|
|
215
205
|
return self.client.zcard(self.spider_key)
|
|
216
206
|
|
|
217
|
-
@check_redis_status
|
|
207
|
+
@decorators.check_redis_status
|
|
218
208
|
def ready_seed_length(self):
|
|
219
209
|
return self.client.zcount(self.spider_key, min=0, max="+inf")
|
|
220
210
|
|
|
221
|
-
@check_redis_status
|
|
211
|
+
@decorators.check_redis_status
|
|
222
212
|
def get_scheduler_lock(self):
|
|
223
213
|
return self._get_lock(self.scheduler_lock)
|
cobweb/db/scheduler/__init__.py
CHANGED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .. import log, Seed, SchedulerInterface as Inf
|
cobweb/db/scheduler/default.py
CHANGED
cobweb/db/scheduler/textfile.py
CHANGED
cobweb/db/storer/__init__.py
CHANGED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .. import log, Seed, StorerInterface as Inf
|
cobweb/db/storer/console.py
CHANGED
cobweb/db/storer/loghub.py
CHANGED
cobweb/db/storer/textfile.py
CHANGED
cobweb/decorators.py
CHANGED
cobweb/equip/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import threading
|
|
3
|
+
|
|
4
|
+
from .. import log, sqn, rtn, pim
|
|
5
|
+
from .. import Queue, DBItem, RedisDB, Setting
|
|
6
|
+
from .models import Scheduler, Spider, Storer
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
|
|
10
|
+
log.info("run check thread after 30 seconds...")
|
|
11
|
+
time.sleep(30)
|
|
12
|
+
spider_info = """
|
|
13
|
+
------------------- check: {0} ------------------
|
|
14
|
+
running_spider_thread_num: {1}
|
|
15
|
+
redis_ready_seed_length: {2}
|
|
16
|
+
redis_spider_seed_length: {3}
|
|
17
|
+
memory_seed_queue_length: {4}
|
|
18
|
+
storer_upload_queue_length_info:
|
|
19
|
+
{5}
|
|
20
|
+
----------------------- end -----------------------"""
|
|
21
|
+
while True:
|
|
22
|
+
status = "running"
|
|
23
|
+
running_spider_thread_num = spider.spider_in_progress.length
|
|
24
|
+
redis_ready_seed_length = ready_seed_length()
|
|
25
|
+
redis_spider_seed_length = spider_queue_length()
|
|
26
|
+
memory_seed_queue_length = scheduler.queue.length
|
|
27
|
+
storer_upload_queue_list = []
|
|
28
|
+
for storer in storer_list:
|
|
29
|
+
storer_upload_queue_list.append(
|
|
30
|
+
f"{storer.__class__.__name__} storer queue length: {storer.queue.length}"
|
|
31
|
+
)
|
|
32
|
+
if (
|
|
33
|
+
scheduler.stop and
|
|
34
|
+
not memory_seed_queue_length and
|
|
35
|
+
not running_spider_thread_num
|
|
36
|
+
):
|
|
37
|
+
if not Setting.LAUNCHER_MODEL:
|
|
38
|
+
log.info("spider is done?")
|
|
39
|
+
last.set()
|
|
40
|
+
time.sleep(3)
|
|
41
|
+
storer_queue_empty = True
|
|
42
|
+
storer_upload_queue_list = []
|
|
43
|
+
for storer in storer_list:
|
|
44
|
+
if storer.queue.length:
|
|
45
|
+
storer_queue_empty = False
|
|
46
|
+
storer_upload_queue_list.append(
|
|
47
|
+
f"{storer.__class__.__name__} storer queue length: {storer.queue.length}"
|
|
48
|
+
)
|
|
49
|
+
if (
|
|
50
|
+
storer_queue_empty and
|
|
51
|
+
not redis_ready_seed_length and
|
|
52
|
+
not redis_spider_seed_length
|
|
53
|
+
):
|
|
54
|
+
if Setting.LAUNCHER_MODEL:
|
|
55
|
+
log.info("waiting for push seeds...")
|
|
56
|
+
status = "waiting"
|
|
57
|
+
time.sleep(30)
|
|
58
|
+
else:
|
|
59
|
+
log.info("spider done!")
|
|
60
|
+
break
|
|
61
|
+
|
|
62
|
+
last.clear()
|
|
63
|
+
|
|
64
|
+
storer_upload_queue_length_info = "\n ".join(
|
|
65
|
+
storer_upload_queue_list) if storer_upload_queue_list else "None"
|
|
66
|
+
log.info(spider_info.format(
|
|
67
|
+
status,
|
|
68
|
+
running_spider_thread_num,
|
|
69
|
+
redis_ready_seed_length,
|
|
70
|
+
redis_spider_seed_length,
|
|
71
|
+
memory_seed_queue_length,
|
|
72
|
+
storer_upload_queue_length_info
|
|
73
|
+
))
|
|
74
|
+
|
|
75
|
+
time.sleep(3)
|
|
76
|
+
stop.set()
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def launcher(task):
|
|
80
|
+
"""
|
|
81
|
+
任务启动装饰器
|
|
82
|
+
:param task: 任务配置信息
|
|
83
|
+
"""
|
|
84
|
+
def decorator(func):
|
|
85
|
+
storer_list = []
|
|
86
|
+
|
|
87
|
+
# 程序结束事件
|
|
88
|
+
last = threading.Event()
|
|
89
|
+
# 停止采集事件
|
|
90
|
+
stop = threading.Event()
|
|
91
|
+
|
|
92
|
+
# 初始化redis信息
|
|
93
|
+
redis_db = RedisDB(task.project, task.task_name, task.redis_info)
|
|
94
|
+
|
|
95
|
+
log.info("初始化cobweb!")
|
|
96
|
+
|
|
97
|
+
seed_queue = Queue()
|
|
98
|
+
|
|
99
|
+
if task.scheduler_info is None:
|
|
100
|
+
task.scheduler_info = dict()
|
|
101
|
+
|
|
102
|
+
# 调度器动态继承
|
|
103
|
+
sql = task.scheduler_info.get("sql")
|
|
104
|
+
table = task.scheduler_info.get("table")
|
|
105
|
+
size = task.scheduler_info.get("size")
|
|
106
|
+
scheduler_config = task.scheduler_info.get("config")
|
|
107
|
+
scheduler_db = task.scheduler_info.get("db", "default")
|
|
108
|
+
DB, class_name = pim(scheduler_db, "scheduler")
|
|
109
|
+
# SchedulerDB, table, sql, length, size, config = task.scheduler_info
|
|
110
|
+
SchedulerTmp = type(class_name, (Scheduler, DB), {})
|
|
111
|
+
|
|
112
|
+
# 初始化调度器
|
|
113
|
+
scheduler = SchedulerTmp(
|
|
114
|
+
table=table, sql=sql, size=size, queue=seed_queue,
|
|
115
|
+
length=task.scheduler_queue_length, config=scheduler_config
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
# 解析存储器信息
|
|
119
|
+
storer_info_list = task.storer_info or []
|
|
120
|
+
if not isinstance(storer_info_list, list):
|
|
121
|
+
storer_info_list = [storer_info_list]
|
|
122
|
+
|
|
123
|
+
# new item
|
|
124
|
+
item = type("Item", (object,), {"redis_client": redis_db.client})()
|
|
125
|
+
|
|
126
|
+
for storer_info in storer_info_list:
|
|
127
|
+
storer_db = storer_info["db"]
|
|
128
|
+
fields = storer_info["fields"]
|
|
129
|
+
storer_table = storer_info.get("table", "console")
|
|
130
|
+
storer_config = storer_info.get("config")
|
|
131
|
+
|
|
132
|
+
StorerDB, class_name = pim(storer_db, "storer")
|
|
133
|
+
StorerTmp = type(class_name, (Storer, StorerDB), {})
|
|
134
|
+
|
|
135
|
+
db_name = class_name.lower()
|
|
136
|
+
if not getattr(item, db_name, None):
|
|
137
|
+
instance = type(db_name, (DBItem,), {})
|
|
138
|
+
setattr(item, db_name, instance)
|
|
139
|
+
|
|
140
|
+
storer_item_instance = getattr(item, db_name)
|
|
141
|
+
storer_item_instance.init_item(storer_table, fields)
|
|
142
|
+
|
|
143
|
+
storer_queue = sqn(db_name, storer_table)
|
|
144
|
+
queue = getattr(storer_item_instance, storer_queue)
|
|
145
|
+
# 初始话存储器
|
|
146
|
+
table_name = rtn(table_name=storer_table)
|
|
147
|
+
storer = StorerTmp(
|
|
148
|
+
table=table_name, fields=fields,
|
|
149
|
+
length=task.storer_queue_length,
|
|
150
|
+
queue=queue, config=storer_config
|
|
151
|
+
)
|
|
152
|
+
storer_list.append(storer)
|
|
153
|
+
|
|
154
|
+
# 初始化采集器
|
|
155
|
+
spider = Spider(seed_queue, storer_list and True, task.max_retries)
|
|
156
|
+
|
|
157
|
+
threading.Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
|
|
158
|
+
threading.Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
|
|
159
|
+
|
|
160
|
+
# 推送初始种子
|
|
161
|
+
# seeds = start_seeds(task.start_seed)
|
|
162
|
+
redis_db.add_seed(task.seeds)
|
|
163
|
+
# 启动调度器, 调度至redis队列
|
|
164
|
+
threading.Thread(
|
|
165
|
+
# name="xxxx_schedule_seeds",
|
|
166
|
+
target=scheduler.schedule_seed,
|
|
167
|
+
args=(
|
|
168
|
+
redis_db.ready_seed_length,
|
|
169
|
+
redis_db.get_scheduler_lock,
|
|
170
|
+
redis_db.add_seed
|
|
171
|
+
)
|
|
172
|
+
).start()
|
|
173
|
+
|
|
174
|
+
# 启动调度器, 调度任务队列
|
|
175
|
+
threading.Thread(
|
|
176
|
+
# name="xxxx_schedule_task",
|
|
177
|
+
target=scheduler.schedule_task,
|
|
178
|
+
args=(
|
|
179
|
+
stop, redis_db.get_seed,
|
|
180
|
+
redis_db.ready_seed_length
|
|
181
|
+
)
|
|
182
|
+
).start()
|
|
183
|
+
|
|
184
|
+
# 启动采集器
|
|
185
|
+
for index in range(task.spider_num):
|
|
186
|
+
threading.Thread(
|
|
187
|
+
# name=f"xxxx_spider_task:{index}",
|
|
188
|
+
target=spider.spider_task,
|
|
189
|
+
args=(
|
|
190
|
+
stop, func, item,
|
|
191
|
+
redis_db.del_seed
|
|
192
|
+
)
|
|
193
|
+
).start()
|
|
194
|
+
|
|
195
|
+
# 启动存储器
|
|
196
|
+
for storer in storer_list:
|
|
197
|
+
threading.Thread(
|
|
198
|
+
# name=f"xxxx_store_task:{storer.table}",
|
|
199
|
+
target=storer.store_task,
|
|
200
|
+
args=(
|
|
201
|
+
stop, last,
|
|
202
|
+
redis_db.reset_seed,
|
|
203
|
+
redis_db.set_storer
|
|
204
|
+
)
|
|
205
|
+
).start()
|
|
206
|
+
|
|
207
|
+
threading.Thread(
|
|
208
|
+
# name="check_spider",
|
|
209
|
+
target=check,
|
|
210
|
+
args=(
|
|
211
|
+
stop, last, spider,
|
|
212
|
+
scheduler, storer_list,
|
|
213
|
+
redis_db.ready_seed_length,
|
|
214
|
+
redis_db.spider_queue_length,
|
|
215
|
+
)
|
|
216
|
+
).start()
|
|
217
|
+
|
|
218
|
+
return decorator
|
|
219
|
+
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from hashlib import md5
|
|
3
|
+
from inspect import isgenerator
|
|
4
|
+
|
|
5
|
+
from .. import log, ici
|
|
6
|
+
from .. import DealModel, Queue, Seed
|
|
7
|
+
# from pympler import asizeof
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Scheduler:
|
|
11
|
+
|
|
12
|
+
def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
|
|
13
|
+
|
|
14
|
+
inf_name = "SchedulerInterface"
|
|
15
|
+
if not ici(self.__class__, inf_name):
|
|
16
|
+
raise Exception("not have schedule function!")
|
|
17
|
+
|
|
18
|
+
if self.__class__.__name__ == "Default":
|
|
19
|
+
self.stop = True
|
|
20
|
+
return None
|
|
21
|
+
|
|
22
|
+
while not self.stop:
|
|
23
|
+
length = ready_seed_length()
|
|
24
|
+
if length > self.size:
|
|
25
|
+
time.sleep(15)
|
|
26
|
+
|
|
27
|
+
elif get_scheduler_lock():
|
|
28
|
+
seeds = self.schedule()
|
|
29
|
+
add_seed(seeds)
|
|
30
|
+
|
|
31
|
+
log.info(f"close thread: schedule_seed")
|
|
32
|
+
|
|
33
|
+
def schedule_task(self, stop, get_seed, ready_seed_length):
|
|
34
|
+
time.sleep(3)
|
|
35
|
+
while not stop.is_set():
|
|
36
|
+
|
|
37
|
+
if not ready_seed_length():
|
|
38
|
+
time.sleep(5)
|
|
39
|
+
continue
|
|
40
|
+
|
|
41
|
+
if self.queue.length >= self.length:
|
|
42
|
+
time.sleep(3)
|
|
43
|
+
continue
|
|
44
|
+
|
|
45
|
+
seeds = get_seed(self.length)
|
|
46
|
+
self.queue.push(seeds)
|
|
47
|
+
log.info(f"close thread: schedule_task")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class Spider:
|
|
51
|
+
|
|
52
|
+
def __init__(self, queue, storage, max_retries=5):
|
|
53
|
+
self.spider_in_progress = Queue()
|
|
54
|
+
self.max_retries = max_retries
|
|
55
|
+
self.storage = storage
|
|
56
|
+
self.queue = queue
|
|
57
|
+
|
|
58
|
+
def spider_task(self, stop, func, item, del_seed):
|
|
59
|
+
while not stop.is_set():
|
|
60
|
+
seed = self.queue.pop()
|
|
61
|
+
if not seed:
|
|
62
|
+
time.sleep(3)
|
|
63
|
+
continue
|
|
64
|
+
elif seed._retry >= self.max_retries:
|
|
65
|
+
del_seed(seed, spider_status=False)
|
|
66
|
+
continue
|
|
67
|
+
try:
|
|
68
|
+
self.spider_in_progress.push(1, direct_insertion=True)
|
|
69
|
+
# log.info("spider seed: " + str(seed))
|
|
70
|
+
|
|
71
|
+
store_queue = None
|
|
72
|
+
store_data = list()
|
|
73
|
+
|
|
74
|
+
iterators = func(item, seed)
|
|
75
|
+
|
|
76
|
+
if not isgenerator(iterators):
|
|
77
|
+
if not self.storage:
|
|
78
|
+
del_seed(seed, spider_status=True)
|
|
79
|
+
continue
|
|
80
|
+
raise TypeError(f"{func.__name__} isn't a generator")
|
|
81
|
+
|
|
82
|
+
for it in iterators:
|
|
83
|
+
if getattr(it, "table_name", None):
|
|
84
|
+
if not store_queue:
|
|
85
|
+
store_queue = it.queue()
|
|
86
|
+
store_data.append(it.struct_data)
|
|
87
|
+
elif isinstance(it, Seed):
|
|
88
|
+
self.queue.push(it)
|
|
89
|
+
|
|
90
|
+
elif isinstance(it, str) and it == DealModel.polling:
|
|
91
|
+
self.queue.push(seed)
|
|
92
|
+
break
|
|
93
|
+
elif isinstance(it, str) and it == DealModel.success:
|
|
94
|
+
del_seed(seed, spider_status=True)
|
|
95
|
+
break
|
|
96
|
+
elif isinstance(it, str) and it == DealModel.failure:
|
|
97
|
+
del_seed(seed, spider_status=False)
|
|
98
|
+
break
|
|
99
|
+
else:
|
|
100
|
+
raise TypeError("yield value type error!")
|
|
101
|
+
|
|
102
|
+
if store_queue and store_data:
|
|
103
|
+
store_data.append(seed)
|
|
104
|
+
store_queue.push(store_data)
|
|
105
|
+
|
|
106
|
+
except Exception as e:
|
|
107
|
+
seed._retry += 1
|
|
108
|
+
self.queue.push(seed)
|
|
109
|
+
log.info(f"{str(seed)} -> {str(e)}")
|
|
110
|
+
finally:
|
|
111
|
+
self.spider_in_progress.pop()
|
|
112
|
+
log.info(f"close thread: spider")
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class Storer:
|
|
116
|
+
|
|
117
|
+
def store_task(self, stop, last, reset_seed, set_storer):
|
|
118
|
+
|
|
119
|
+
inf_name = "StorerInterface"
|
|
120
|
+
if not ici(self.__class__, inf_name):
|
|
121
|
+
return None
|
|
122
|
+
|
|
123
|
+
if not getattr(self, "store", None):
|
|
124
|
+
raise Exception("not have store function!")
|
|
125
|
+
|
|
126
|
+
storer_name = self.__class__.__name__ + self.table
|
|
127
|
+
store_key_id = md5(storer_name.encode()).hexdigest()
|
|
128
|
+
|
|
129
|
+
while not stop.is_set():
|
|
130
|
+
|
|
131
|
+
if last.is_set() or self.queue.length >= self.length:
|
|
132
|
+
seeds, data_list = [], []
|
|
133
|
+
|
|
134
|
+
while True:
|
|
135
|
+
data = self.queue.pop()
|
|
136
|
+
if not data:
|
|
137
|
+
break
|
|
138
|
+
if isinstance(data, Seed):
|
|
139
|
+
seeds.append(data)
|
|
140
|
+
if len(data_list) >= self.length:
|
|
141
|
+
break
|
|
142
|
+
continue
|
|
143
|
+
data_list.append(data)
|
|
144
|
+
|
|
145
|
+
if self.store(data_list):
|
|
146
|
+
set_storer(store_key_id, seeds)
|
|
147
|
+
else:
|
|
148
|
+
reset_seed(seeds)
|
|
149
|
+
|
|
150
|
+
time.sleep(3)
|
|
151
|
+
|
|
152
|
+
log.info(f"close thread: {storer_name}")
|
|
File without changes
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import threading
|
|
3
|
+
|
|
4
|
+
from .. import log, sqn, rtn, pim
|
|
5
|
+
from .. import Queue, DBItem, RedisDB, Setting
|
|
6
|
+
from .models import Scheduler, Spider, Storer
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
|
|
10
|
+
log.info("run check thread after 30 seconds...")
|
|
11
|
+
time.sleep(30)
|
|
12
|
+
spider_info = """
|
|
13
|
+
------------------- check: {0} ------------------
|
|
14
|
+
redis_spider_seed_length: {1}
|
|
15
|
+
redis_ready_seed_length: {2}
|
|
16
|
+
running_spider_thread_num: {3}
|
|
17
|
+
memory_seed_queue_length: {4}
|
|
18
|
+
storer_queue_length_info: {5}
|
|
19
|
+
----------------------- end -----------------------"""
|
|
20
|
+
while True:
|
|
21
|
+
status = "running"
|
|
22
|
+
running_spider_thread_num = spider.spider_in_progress.length
|
|
23
|
+
redis_ready_seed_length = ready_seed_length()
|
|
24
|
+
redis_spider_seed_length = spider_queue_length()
|
|
25
|
+
memory_seed_queue_length = scheduler.queue.length
|
|
26
|
+
storer_upload_queue_length = storer.queue.length if storer else None
|
|
27
|
+
if (
|
|
28
|
+
scheduler.stop and
|
|
29
|
+
not memory_seed_queue_length and
|
|
30
|
+
not running_spider_thread_num
|
|
31
|
+
):
|
|
32
|
+
if not Setting.LAUNCHER_MODEL:
|
|
33
|
+
log.info("spider is done?")
|
|
34
|
+
last.set()
|
|
35
|
+
time.sleep(3)
|
|
36
|
+
storer_queue_empty = True
|
|
37
|
+
if storer and storer.queue.length:
|
|
38
|
+
storer_queue_empty = False
|
|
39
|
+
storer_upload_queue_length = storer.queue.length if storer else None
|
|
40
|
+
if (
|
|
41
|
+
storer_queue_empty and
|
|
42
|
+
not redis_ready_seed_length and
|
|
43
|
+
not redis_spider_seed_length
|
|
44
|
+
):
|
|
45
|
+
if Setting.LAUNCHER_MODEL:
|
|
46
|
+
log.info("waiting for push seeds...")
|
|
47
|
+
status = "waiting"
|
|
48
|
+
time.sleep(30)
|
|
49
|
+
else:
|
|
50
|
+
log.info("spider done!")
|
|
51
|
+
break
|
|
52
|
+
|
|
53
|
+
last.clear()
|
|
54
|
+
|
|
55
|
+
log.info(spider_info.format(
|
|
56
|
+
status,
|
|
57
|
+
redis_spider_seed_length,
|
|
58
|
+
redis_ready_seed_length,
|
|
59
|
+
running_spider_thread_num,
|
|
60
|
+
memory_seed_queue_length,
|
|
61
|
+
storer_upload_queue_length
|
|
62
|
+
))
|
|
63
|
+
|
|
64
|
+
time.sleep(3)
|
|
65
|
+
stop.set()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def launcher(task):
|
|
69
|
+
"""
|
|
70
|
+
任务启动装饰器
|
|
71
|
+
:param task: 任务配置信息
|
|
72
|
+
"""
|
|
73
|
+
def decorator(func):
|
|
74
|
+
# 程序结束事件
|
|
75
|
+
last = threading.Event()
|
|
76
|
+
# 停止采集事件
|
|
77
|
+
stop = threading.Event()
|
|
78
|
+
|
|
79
|
+
# 初始化redis信息
|
|
80
|
+
redis_db = RedisDB(task.project, task.task_name, task.redis_info)
|
|
81
|
+
|
|
82
|
+
# new item
|
|
83
|
+
item = type("Item", (object,), {"redis_client": redis_db.client})()
|
|
84
|
+
|
|
85
|
+
log.info("初始化cobweb!")
|
|
86
|
+
|
|
87
|
+
seed_queue = Queue()
|
|
88
|
+
|
|
89
|
+
scheduler_info = task.scheduler_info or dict()
|
|
90
|
+
# 调度器动态继承
|
|
91
|
+
sql = scheduler_info.get("sql")
|
|
92
|
+
table = scheduler_info.get("table")
|
|
93
|
+
size = scheduler_info.get("size")
|
|
94
|
+
scheduler_config = scheduler_info.get("config")
|
|
95
|
+
scheduler_db = scheduler_info.get("db", "default")
|
|
96
|
+
DB, class_name = pim(scheduler_db, "scheduler")
|
|
97
|
+
# SchedulerDB, table, sql, length, size, config = task.scheduler_info
|
|
98
|
+
SchedulerTmp = type(class_name, (Scheduler, DB), {})
|
|
99
|
+
# 初始化调度器
|
|
100
|
+
scheduler = SchedulerTmp(
|
|
101
|
+
table=table, sql=sql, size=size, queue=seed_queue,
|
|
102
|
+
length=task.scheduler_queue_length, config=scheduler_config
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
storer = None
|
|
106
|
+
storer_info = task.storer_info or dict()
|
|
107
|
+
|
|
108
|
+
if storer_info:
|
|
109
|
+
storer_db = storer_info["db"]
|
|
110
|
+
fields = storer_info["fields"]
|
|
111
|
+
storer_table = storer_info.get("table", "console")
|
|
112
|
+
storer_config = storer_info.get("config")
|
|
113
|
+
|
|
114
|
+
StorerDB, class_name = pim(storer_db, "storer")
|
|
115
|
+
StorerTmp = type(class_name, (Storer, StorerDB), {})
|
|
116
|
+
|
|
117
|
+
db_name = class_name.lower()
|
|
118
|
+
if not getattr(item, db_name, None):
|
|
119
|
+
instance = type(db_name, (DBItem,), {})
|
|
120
|
+
setattr(item, db_name, instance)
|
|
121
|
+
|
|
122
|
+
storer_item_instance = getattr(item, db_name)
|
|
123
|
+
storer_item_instance.init_item(storer_table, fields)
|
|
124
|
+
|
|
125
|
+
storer_queue = sqn(db_name, storer_table)
|
|
126
|
+
queue = getattr(storer_item_instance, storer_queue)
|
|
127
|
+
# 初始话存储器
|
|
128
|
+
table_name = rtn(table_name=storer_table)
|
|
129
|
+
storer = StorerTmp(
|
|
130
|
+
table=table_name, fields=fields,
|
|
131
|
+
length=task.storer_queue_length,
|
|
132
|
+
queue=queue, config=storer_config
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
# 初始化采集器
|
|
136
|
+
spider = Spider(seed_queue, storer and True, task.max_retries)
|
|
137
|
+
|
|
138
|
+
threading.Thread(target=redis_db.check_spider_queue, args=(stop, 0)).start()
|
|
139
|
+
threading.Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
|
|
140
|
+
|
|
141
|
+
# 推送初始种子
|
|
142
|
+
# seeds = start_seeds(task.start_seed)
|
|
143
|
+
redis_db.add_seed(task.seeds)
|
|
144
|
+
# 启动调度器, 调度至redis队列
|
|
145
|
+
threading.Thread(
|
|
146
|
+
# name="xxxx_schedule_seeds",
|
|
147
|
+
target=scheduler.schedule_seed,
|
|
148
|
+
args=(
|
|
149
|
+
redis_db.ready_seed_length,
|
|
150
|
+
redis_db.get_scheduler_lock,
|
|
151
|
+
redis_db.add_seed
|
|
152
|
+
)
|
|
153
|
+
).start()
|
|
154
|
+
|
|
155
|
+
# 启动调度器, 调度任务队列
|
|
156
|
+
threading.Thread(
|
|
157
|
+
# name="xxxx_schedule_task",
|
|
158
|
+
target=scheduler.schedule_task,
|
|
159
|
+
args=(
|
|
160
|
+
stop, redis_db.get_seed,
|
|
161
|
+
redis_db.ready_seed_length
|
|
162
|
+
)
|
|
163
|
+
).start()
|
|
164
|
+
|
|
165
|
+
# 启动采集器
|
|
166
|
+
for index in range(task.spider_num):
|
|
167
|
+
threading.Thread(
|
|
168
|
+
# name=f"xxxx_spider_task:{index}",
|
|
169
|
+
target=spider.spider_task,
|
|
170
|
+
args=(
|
|
171
|
+
stop, func, item,
|
|
172
|
+
redis_db.del_seed
|
|
173
|
+
)
|
|
174
|
+
).start()
|
|
175
|
+
|
|
176
|
+
# 启动存储器
|
|
177
|
+
if storer:
|
|
178
|
+
threading.Thread(
|
|
179
|
+
# name=f"xxxx_store_task:{storer.table}",
|
|
180
|
+
target=storer.store_task,
|
|
181
|
+
args=(
|
|
182
|
+
stop, last,
|
|
183
|
+
redis_db.reset_seed,
|
|
184
|
+
redis_db.del_seed
|
|
185
|
+
)
|
|
186
|
+
).start()
|
|
187
|
+
|
|
188
|
+
threading.Thread(
|
|
189
|
+
# name="check_spider",
|
|
190
|
+
target=check,
|
|
191
|
+
args=(
|
|
192
|
+
stop, last, spider,
|
|
193
|
+
scheduler, storer,
|
|
194
|
+
redis_db.ready_seed_length,
|
|
195
|
+
redis_db.spider_queue_length,
|
|
196
|
+
)
|
|
197
|
+
).start()
|
|
198
|
+
|
|
199
|
+
return decorator
|
|
200
|
+
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from inspect import isgenerator
|
|
3
|
+
# from pympler import asizeof
|
|
4
|
+
from .. import log, ici
|
|
5
|
+
from .. import DealModel, Queue, Seed
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Scheduler:
|
|
9
|
+
|
|
10
|
+
def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
|
|
11
|
+
|
|
12
|
+
inf_name = "SchedulerInterface"
|
|
13
|
+
if not ici(self.__class__, inf_name):
|
|
14
|
+
raise Exception("not have schedule function!")
|
|
15
|
+
|
|
16
|
+
if self.__class__.__name__ == "Default":
|
|
17
|
+
self.stop = True
|
|
18
|
+
return None
|
|
19
|
+
|
|
20
|
+
while not self.stop:
|
|
21
|
+
length = ready_seed_length()
|
|
22
|
+
if length > self.size:
|
|
23
|
+
time.sleep(15)
|
|
24
|
+
|
|
25
|
+
elif get_scheduler_lock():
|
|
26
|
+
seeds = self.schedule()
|
|
27
|
+
add_seed(seeds)
|
|
28
|
+
|
|
29
|
+
log.info(f"close thread: schedule_seed")
|
|
30
|
+
|
|
31
|
+
def schedule_task(self, stop, get_seed, ready_seed_length):
|
|
32
|
+
time.sleep(3)
|
|
33
|
+
while not stop.is_set():
|
|
34
|
+
|
|
35
|
+
if not ready_seed_length():
|
|
36
|
+
time.sleep(5)
|
|
37
|
+
continue
|
|
38
|
+
|
|
39
|
+
if self.queue.length >= self.length:
|
|
40
|
+
time.sleep(3)
|
|
41
|
+
continue
|
|
42
|
+
|
|
43
|
+
seeds = get_seed(self.length)
|
|
44
|
+
self.queue.push(seeds)
|
|
45
|
+
log.info(f"close thread: schedule_task")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Spider:
    """Worker that pops seeds from the task queue and runs the crawl function."""

    def __init__(self, queue, storage, max_retries=5):
        # Counter queue tracking how many spider threads are mid-crawl.
        self.spider_in_progress = Queue()
        self.max_retries = max_retries
        self.storage = storage
        self.queue = queue

    def spider_task(self, stop, func, item, del_seed):
        """Consume seeds until *stop* is set, dispatching each one to *func*.

        *func* is expected to be a generator yielding storage items, new
        ``Seed`` objects, or ``DealModel`` status strings.
        """
        while not stop.is_set():

            seed = self.queue.pop()
            if not seed:
                time.sleep(3)
                continue

            if seed._retry >= self.max_retries:
                # Exhausted the retry budget: drop the seed as failed.
                del_seed(seed, spider_status=False)
                continue

            try:
                self.spider_in_progress.push(1, direct_insertion=True)
                # log.info("spider seed: " + str(seed))
                iterators = func(item, seed)

                if not isgenerator(iterators):
                    # Non-generator crawl functions are tolerated only when
                    # no storage backend is configured; the seed is done.
                    if not self.storage:
                        del_seed(seed, spider_status=True)
                        continue
                    raise TypeError(f"{func.__name__} isn't a generator")

                for yielded in iterators:
                    if getattr(yielded, "table_name", None):
                        # A DB item: route it onto its table's store queue.
                        yielded.queue().push(
                            [seed, yielded.struct_data],
                            direct_insertion=True
                        )
                    elif isinstance(yielded, Seed):
                        # Follow-up seed discovered during the crawl.
                        self.queue.push(yielded)
                    elif isinstance(yielded, str) and yielded == DealModel.polling:
                        self.queue.push(seed)
                        break
                    elif isinstance(yielded, str) and yielded == DealModel.success:
                        del_seed(seed, spider_status=True)
                        break
                    elif isinstance(yielded, str) and yielded == DealModel.failure:
                        del_seed(seed, spider_status=False)
                        break
                    else:
                        raise TypeError("yield value type error!")

            except Exception as e:
                # Any failure (including the TypeErrors above) re-queues the
                # seed with its retry counter bumped.
                seed._retry += 1
                self.queue.push(seed)
                log.info(f"{str(seed)} -> {str(e)}")
            finally:
                self.spider_in_progress.pop()
        log.info(f"close thread: spider")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class Storer:
    """Mixin draining (seed, data) pairs from its queue into a store backend.

    The concrete subclass (a StorerInterface implementation) is expected to
    provide ``self.queue``, ``self.length``, ``self.table`` and a
    ``store(data_list)`` method — TODO confirm against the interface module.
    """

    def store_task(self, stop, last, reset_seed, del_seed):
        """Batch-store crawled data until *stop* is set.

        :param stop: threading.Event signalling shutdown.
        :param last: threading.Event set when upstream work has finished,
            forcing partial batches to be flushed.
        :param reset_seed: callable returning failed seeds to the ready pool.
        :param del_seed: callable marking successfully stored seeds as done.
        """
        inf_name = "StorerInterface"
        if not ici(self.__class__, inf_name):
            return None

        if not getattr(self, "store", None):
            raise Exception("not have store function!")

        storer_name = self.__class__.__name__ + self.table

        while not stop.is_set():

            # Flush either when a full batch is ready or when the final-flush
            # flag says no more data is coming.
            if last.is_set() or self.queue.length >= self.length:
                seeds, data_list = [], []

                for _ in range(self.length):
                    items = self.queue.pop()
                    if not items:
                        break
                    seed, data = items
                    seeds.append(seed)
                    data_list.append(data)

                # BUGFIX: when the flush flag is set but the queue is already
                # empty, the old code still called store([]) every cycle (and
                # reset_seed([]) on a falsy return). Skip empty batches.
                if data_list:
                    if self.store(data_list):
                        del_seed(seeds)
                    else:
                        reset_seed(seeds)

            time.sleep(3)

        log.info(f"close thread: {storer_name}")
|
cobweb/single/nest.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import time
|
|
2
2
|
import threading
|
|
3
3
|
|
|
4
|
-
from single
|
|
5
|
-
from single
|
|
6
|
-
from single
|
|
4
|
+
from equip.single import Seed, DBItem
|
|
5
|
+
from equip.single import struct_queue_name, restore_table_name
|
|
6
|
+
from equip.single import Distributor, Scheduler, Spider, Storer
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
def init_task_seed(seeds):
|
cobweb/task.py
CHANGED
|
@@ -1,11 +1,19 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from .constant import *
|
|
1
3
|
from .utils import parse_info, struct_start_seeds
|
|
2
4
|
|
|
3
5
|
|
|
6
|
+
def init_task_env():
    """Populate the global Setting object from environment variables.

    Each value falls back to a built-in default when the variable is unset.
    """
    env = os.getenv
    Setting.RESET_SCORE = int(env("RESET_SCORE", 600))
    Setting.CHECK_LOCK_TIME = int(env("CHECK_LOCK_TIME", 30))
    Setting.DEAL_MODEL = env("DEAL_MODEL", DealModel.failure)
    Setting.LAUNCHER_MODEL = env("LAUNCHER_MODEL", LauncherModel.task)
|
|
11
|
+
|
|
12
|
+
|
|
4
13
|
class Task:
|
|
5
14
|
|
|
6
15
|
def __init__(
|
|
7
16
|
self,
|
|
8
|
-
# model=None,
|
|
9
17
|
seeds=None,
|
|
10
18
|
project=None,
|
|
11
19
|
task_name=None,
|
|
@@ -31,8 +39,7 @@ class Task:
|
|
|
31
39
|
:param storer_queue_length:
|
|
32
40
|
:param scheduler_queue_length:
|
|
33
41
|
"""
|
|
34
|
-
|
|
35
|
-
|
|
42
|
+
init_task_env()
|
|
36
43
|
self.seeds = struct_start_seeds(seeds)
|
|
37
44
|
self.project = project or "test"
|
|
38
45
|
self.task_name = task_name or "spider"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: cobweb-launcher
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.10
|
|
4
4
|
Summary: spider_hole
|
|
5
5
|
Home-page: https://github.com/Juannie-PP/cobweb
|
|
6
6
|
Author: Juannie-PP
|
|
@@ -11,9 +11,9 @@ Classifier: Programming Language :: Python :: 3
|
|
|
11
11
|
Requires-Python: >=3.7
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
|
-
Requires-Dist: requests
|
|
15
|
-
Requires-Dist: oss2
|
|
16
|
-
Requires-Dist: redis
|
|
14
|
+
Requires-Dist: requests >=2.19.1
|
|
15
|
+
Requires-Dist: oss2 >=2.18.1
|
|
16
|
+
Requires-Dist: redis >=4.4.4
|
|
17
17
|
Requires-Dist: aliyun-log-python-sdk
|
|
18
18
|
|
|
19
19
|
# cobweb
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
cobweb/__init__.py,sha256=fIg4v2yl3NHiCSli9EHU1WnMTLqEfDAvOgRtmULwu5A,227
|
|
2
|
+
cobweb/bbb.py,sha256=Sckof5zgzWEz2tIjs6xxoLkrL7wUdogPw3AetlXIDUo,5684
|
|
3
|
+
cobweb/constant.py,sha256=WApaB3mn9cTDzvoCd7UcejO5T5wsCrxzn7D-uBTcIpg,462
|
|
4
|
+
cobweb/decorators.py,sha256=eYQI9rddPVJihAlomLTmbtQhIOzPw8dCrOFpxAq2pLY,318
|
|
5
|
+
cobweb/interface.py,sha256=um_k2AAQl1HTOvfUlq914DjkpfZVwt2m1B65EpPKrmE,802
|
|
6
|
+
cobweb/log.py,sha256=Gb3_y4IzTo5pJohTggBCU9rK6-ZN3hgTOHkoXHyN6CU,2384
|
|
7
|
+
cobweb/setting.py,sha256=UAu_dLuIFYO98MxtlZ5sZqJcwKAUKq4Bu4KoKlV50Mc,288
|
|
8
|
+
cobweb/task.py,sha256=H9hDK72fSqD7k16exVIXIvCYr1Kvq-pl8PCXtKh8JWA,1704
|
|
9
|
+
cobweb/utils.py,sha256=ivmRqJJNtwdOKYT4G7qQCWnL8ar9c-shxeDZzGB2E9c,2651
|
|
10
|
+
cobweb/db/__init__.py,sha256=jC-uOThYLtiDUG6cTJRkDITgOzR4nIOeaZQeZhuk-v0,139
|
|
11
|
+
cobweb/db/oss_db.py,sha256=lFGNuH3tdIMsohVXQ_fTZPyBfS2oxYNmFNuQ-ZBQgm0,4221
|
|
12
|
+
cobweb/db/redis_db.py,sha256=go9IPQQZAl_jXiHDTruWF7N8svs9QEpQPOr7JhuLCQs,8179
|
|
13
|
+
cobweb/db/scheduler/__init__.py,sha256=w5uIGEB1wLJ-H9RqGpzRwOEWW-BBVSk6Cc7FxZIlWCs,51
|
|
14
|
+
cobweb/db/scheduler/default.py,sha256=XDtxNyu5KTpVAbfCOW8mR1zNFNHiMuaQ4sAhZuIYBoM,79
|
|
15
|
+
cobweb/db/scheduler/textfile.py,sha256=P5pk75DUnbXbLNPOaMIbHh2lbwBGBlv0mitX58yK-MU,786
|
|
16
|
+
cobweb/db/storer/__init__.py,sha256=yWUVyq8JLpuUDPnUC0igw3P8Kkw_FqNi0aAoxkMkRmc,49
|
|
17
|
+
cobweb/db/storer/console.py,sha256=096JTALYuB_I3Qy5TjN40yEPeugO_pmqHN9VJu7wD7Y,153
|
|
18
|
+
cobweb/db/storer/loghub.py,sha256=4ImSIpHPNU7Djp72HlUGOd2h5c9gIxGzBKL1jJ3KPkM,1702
|
|
19
|
+
cobweb/db/storer/redis.py,sha256=7Q2XEQwBL6X_M1uvxzzuSBt6iw9piKw-_FWKm2INZDQ,412
|
|
20
|
+
cobweb/db/storer/textfile.py,sha256=auoXGXLbIbEhMoeYIhy58qw22N2r0fQTtzVjHCjqVGA,386
|
|
21
|
+
cobweb/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
|
+
cobweb/distributed/launcher.py,sha256=jTtBXBmna_6yFdj6gyGQiiEtg8I0g5uI5h8kbHWt454,7998
|
|
23
|
+
cobweb/distributed/models.py,sha256=PUQokXMGD-H4A99nX7qYA395Ul6IsWGruMTVa05nswY,4568
|
|
24
|
+
cobweb/equip/__init__.py,sha256=UVhm9xl9kj1Ez_9Sf5ElRvkeI5pTpXatWfIHJbXAFx4,240
|
|
25
|
+
cobweb/equip/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
+
cobweb/equip/distributed/launcher.py,sha256=1LzxibGXWR20XpXawakiRpEMaa9yfaj2rFSKnmEwjFc,7475
|
|
27
|
+
cobweb/equip/distributed/models.py,sha256=pIQSac-WuaKsFGrT1ImOMqYoD2zySTkOK1ZLtHw33d8,4815
|
|
28
|
+
cobweb/equip/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
|
+
cobweb/equip/single/launcher.py,sha256=dMd1IJg__yiFnxy3Q6EueebPllN8zyTDwfC0OjEhRf4,6702
|
|
30
|
+
cobweb/equip/single/models.py,sha256=kf6ZSlcY9XVdDx6WqGahO4uuGCAdHK9ayeJ6Kuu3JiE,4447
|
|
31
|
+
cobweb/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
32
|
+
cobweb/single/launcher.py,sha256=IoJbn87j7t7Pib_FxoWZmmX8asXOqNGb-9ospw6EYJI,7302
|
|
33
|
+
cobweb/single/models.py,sha256=UXcxr_Quok91k82plaqbj4deB-UBCWo14WCo6SS5L_o,4247
|
|
34
|
+
cobweb/single/nest.py,sha256=49K6KQ934INfPrWQsrq9rIFpQauLbLGOFbDaHvoQzOk,5015
|
|
35
|
+
cobweb_launcher-0.1.10.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
|
36
|
+
cobweb_launcher-0.1.10.dist-info/METADATA,sha256=qGzb9Ib-SW1Tnh4m5LreEwqnSoyO0DxhP7DujQHRsIE,1220
|
|
37
|
+
cobweb_launcher-0.1.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
38
|
+
cobweb_launcher-0.1.10.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
|
|
39
|
+
cobweb_launcher-0.1.10.dist-info/RECORD,,
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
cobweb/__init__.py,sha256=hEucL3DxmJNXKLwwcSlJrMlwMarlDV4VbILjJVk1s64,326
|
|
2
|
-
cobweb/bbb.py,sha256=Sckof5zgzWEz2tIjs6xxoLkrL7wUdogPw3AetlXIDUo,5684
|
|
3
|
-
cobweb/decorators.py,sha256=8KPSKL8xsiXOLv-kckkaDtK8LXM8d5gaRriGpuEgOQk,320
|
|
4
|
-
cobweb/interface.py,sha256=um_k2AAQl1HTOvfUlq914DjkpfZVwt2m1B65EpPKrmE,802
|
|
5
|
-
cobweb/log.py,sha256=Gb3_y4IzTo5pJohTggBCU9rK6-ZN3hgTOHkoXHyN6CU,2384
|
|
6
|
-
cobweb/setting.py,sha256=UAu_dLuIFYO98MxtlZ5sZqJcwKAUKq4Bu4KoKlV50Mc,288
|
|
7
|
-
cobweb/task.py,sha256=77F5EaopSVlSX2TANv1lhuPHFI8ER8Jh4tSGrwDWAc0,1405
|
|
8
|
-
cobweb/utils.py,sha256=ivmRqJJNtwdOKYT4G7qQCWnL8ar9c-shxeDZzGB2E9c,2651
|
|
9
|
-
cobweb/db/__init__.py,sha256=4m9lqmxZCRbaih3Z3rl_BT0GugMd0dkOIgu_P9aeC84,63
|
|
10
|
-
cobweb/db/oss_db.py,sha256=lFGNuH3tdIMsohVXQ_fTZPyBfS2oxYNmFNuQ-ZBQgm0,4221
|
|
11
|
-
cobweb/db/redis_db.py,sha256=yoWy-GI0rjVmT-68Che-pypfqNwNti5JGkc9bYvJH2o,8202
|
|
12
|
-
cobweb/db/scheduler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
|
-
cobweb/db/scheduler/default.py,sha256=OxmFX7OvMEhKEq-NF7A8I9cA4V4qWw5vayS-yIbng0A,114
|
|
14
|
-
cobweb/db/scheduler/textfile.py,sha256=atRDeNT-e5toNvyGsCXAxL1FJi77uSYktdCzH_hXGo8,821
|
|
15
|
-
cobweb/db/storer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
-
cobweb/db/storer/console.py,sha256=f7yZFo4qTieaB9JxbGfrVAclAb2H_wji82dWoZp7HUw,182
|
|
17
|
-
cobweb/db/storer/loghub.py,sha256=4VqZacXWhidzINHXQu2_-E0HOBRCcc86f6LkKfnXD5I,1731
|
|
18
|
-
cobweb/db/storer/redis.py,sha256=7Q2XEQwBL6X_M1uvxzzuSBt6iw9piKw-_FWKm2INZDQ,412
|
|
19
|
-
cobweb/db/storer/textfile.py,sha256=3mDHMvF6Sh5fn3IHzWQxyTUd45V-zUoH8vY3EoRlMx0,415
|
|
20
|
-
cobweb/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
-
cobweb/distributed/launcher.py,sha256=jTtBXBmna_6yFdj6gyGQiiEtg8I0g5uI5h8kbHWt454,7998
|
|
22
|
-
cobweb/distributed/models.py,sha256=PUQokXMGD-H4A99nX7qYA395Ul6IsWGruMTVa05nswY,4568
|
|
23
|
-
cobweb/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
-
cobweb/single/launcher.py,sha256=IoJbn87j7t7Pib_FxoWZmmX8asXOqNGb-9ospw6EYJI,7302
|
|
25
|
-
cobweb/single/models.py,sha256=UXcxr_Quok91k82plaqbj4deB-UBCWo14WCo6SS5L_o,4247
|
|
26
|
-
cobweb/single/nest.py,sha256=mL8q9a5BjtoeUyzXCIVw_vyUsNY8ltbvQpYIIpZEDFU,5012
|
|
27
|
-
cobweb_launcher-0.1.8.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
|
28
|
-
cobweb_launcher-0.1.8.dist-info/METADATA,sha256=9NTzvHe-pPNFJdbyGuE9SRqvtKQd6Kar1WG_GwUS4Ss,1225
|
|
29
|
-
cobweb_launcher-0.1.8.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
|
|
30
|
-
cobweb_launcher-0.1.8.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
|
|
31
|
-
cobweb_launcher-0.1.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|