cobweb-launcher 1.2.6__tar.gz → 1.2.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cobweb-launcher might be problematic. Click here for more details.
- {cobweb-launcher-1.2.6/cobweb_launcher.egg-info → cobweb-launcher-1.2.8}/PKG-INFO +1 -1
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/launchers/launcher.py +22 -6
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/launchers/launcher_air.py +30 -30
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/launchers/launcher_pro.py +66 -70
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/setting.py +1 -4
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8/cobweb_launcher.egg-info}/PKG-INFO +1 -1
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/setup.py +1 -1
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/LICENSE +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/README.md +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/__init__.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/base/__init__.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/base/common_queue.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/base/decorators.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/base/item.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/base/log.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/base/request.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/base/response.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/base/seed.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/constant.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/crawlers/__init__.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/crawlers/crawler.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/db/redis_db.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/launchers/__init__.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/pipelines/__init__.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/pipelines/pipeline.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/pipelines/pipeline_console.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/pipelines/pipeline_loghub.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/utils/__init__.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb/utils/tools.py +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb_launcher.egg-info/SOURCES.txt +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/setup.cfg +0 -0
|
@@ -2,12 +2,25 @@ import time
|
|
|
2
2
|
import inspect
|
|
3
3
|
import threading
|
|
4
4
|
import importlib
|
|
5
|
+
from functools import wraps
|
|
5
6
|
|
|
6
7
|
from cobweb import setting
|
|
7
|
-
from cobweb.base import Seed, Queue
|
|
8
|
+
from cobweb.base import Seed, Queue, logger
|
|
8
9
|
from cobweb.utils.tools import dynamic_load_class
|
|
9
10
|
|
|
10
11
|
|
|
12
|
+
def check_pause(func):
    """Turn a launcher method into a loop that runs until the pause event is set.

    The decorated method is invoked repeatedly for as long as
    ``self._pause.is_set()`` is false. An exception raised by one iteration
    is logged via ``logger.info`` and the loop carries on, so a single
    failing pass does not end the loop.

    :param func: the method to wrap; it is called as ``func(self, *args, **kwargs)``.
    :return: the wrapping function. It returns ``None`` once the pause event
        is observed to be set.
    """
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        while not self._pause.is_set():
            try:
                # ``func`` is the plain (unbound) function captured at
                # decoration time, so the instance has to be forwarded
                # explicitly. Without it every call fails with
                # "missing 1 required positional argument: 'self'".
                func(self, *args, **kwargs)
            except Exception as e:
                logger.info(f"{func.__name__}: " + str(e))

    return wrapper
|
|
22
|
+
|
|
23
|
+
|
|
11
24
|
class Launcher(threading.Thread):
|
|
12
25
|
|
|
13
26
|
SEEDS = []
|
|
@@ -85,11 +98,6 @@ class Launcher(threading.Thread):
|
|
|
85
98
|
self._done_model = setting.DONE_MODEL
|
|
86
99
|
self._task_model = setting.TASK_MODEL
|
|
87
100
|
|
|
88
|
-
|
|
89
|
-
@property
|
|
90
|
-
def start_seeds(self):
|
|
91
|
-
return [Seed(seed) for seed in self.SEEDS]
|
|
92
|
-
|
|
93
101
|
@property
|
|
94
102
|
def request(self):
|
|
95
103
|
"""
|
|
@@ -135,9 +143,15 @@ class Launcher(threading.Thread):
|
|
|
135
143
|
self.__CUSTOM_FUNC__["parse"] = func
|
|
136
144
|
return decorator
|
|
137
145
|
|
|
146
|
+
def start_seeds(self):
|
|
147
|
+
seeds = [Seed(seed) for seed in self.SEEDS]
|
|
148
|
+
self.__LAUNCHER_QUEUE__['todo'].push(seeds)
|
|
149
|
+
return seeds
|
|
150
|
+
|
|
138
151
|
def _remove_doing_seeds(self, seeds):
|
|
139
152
|
for seed in seeds:
|
|
140
153
|
self.__DOING__.pop(seed, None)
|
|
154
|
+
logger.info("remove %s seeds from __DOING__" % len(seeds))
|
|
141
155
|
|
|
142
156
|
def _execute(self):
|
|
143
157
|
for func_name in self.__LAUNCHER_FUNC__:
|
|
@@ -147,6 +161,8 @@ class Launcher(threading.Thread):
|
|
|
147
161
|
def run(self):
|
|
148
162
|
threading.Thread(target=self._execute_heartbeat).start()
|
|
149
163
|
|
|
164
|
+
self.start_seeds()
|
|
165
|
+
|
|
150
166
|
self._Crawler(
|
|
151
167
|
stop=self._stop, pause=self._pause,
|
|
152
168
|
launcher_queue=self.__LAUNCHER_QUEUE__,
|
|
@@ -1,46 +1,46 @@
|
|
|
1
1
|
import time
|
|
2
2
|
|
|
3
|
-
from cobweb.constant import LogTemplate
|
|
4
3
|
from cobweb.base import logger
|
|
5
|
-
from .
|
|
4
|
+
from cobweb.constant import LogTemplate
|
|
5
|
+
from .launcher import Launcher, check_pause
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class LauncherAir(Launcher):
|
|
9
9
|
|
|
10
|
-
def _scheduler(self):
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
# def _scheduler(self):
|
|
11
|
+
# if self.start_seeds:
|
|
12
|
+
# self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
|
|
13
13
|
|
|
14
|
+
@check_pause
|
|
14
15
|
def _insert(self):
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
16
|
+
seeds = {}
|
|
17
|
+
status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
|
|
18
|
+
for _ in range(self._new_queue_max_size):
|
|
19
|
+
seed = self.__LAUNCHER_QUEUE__['new'].pop()
|
|
20
|
+
if not seed:
|
|
21
|
+
break
|
|
22
|
+
seeds[seed.to_string] = seed.params.priority
|
|
23
|
+
if seeds:
|
|
24
|
+
self.__LAUNCHER_QUEUE__['todo'].push(seeds)
|
|
25
|
+
if status:
|
|
26
|
+
time.sleep(self._new_queue_wait_seconds)
|
|
27
|
+
|
|
28
|
+
@check_pause
|
|
28
29
|
def _delete(self):
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
|
|
30
|
+
seeds = []
|
|
31
|
+
status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
|
|
32
32
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
33
|
+
for _ in range(self._done_queue_max_size):
|
|
34
|
+
seed = self.__LAUNCHER_QUEUE__['done'].pop()
|
|
35
|
+
if not seed:
|
|
36
|
+
break
|
|
37
|
+
seeds.append(seed.to_string)
|
|
38
38
|
|
|
39
|
-
|
|
40
|
-
|
|
39
|
+
if seeds:
|
|
40
|
+
self._remove_doing_seeds(seeds)
|
|
41
41
|
|
|
42
|
-
|
|
43
|
-
|
|
42
|
+
if status:
|
|
43
|
+
time.sleep(self._done_queue_wait_seconds)
|
|
44
44
|
|
|
45
45
|
def _polling(self):
|
|
46
46
|
|
|
@@ -4,7 +4,7 @@ import threading
|
|
|
4
4
|
from cobweb.db import RedisDB
|
|
5
5
|
from cobweb.base import Seed, logger
|
|
6
6
|
from cobweb.constant import DealModel, LogTemplate
|
|
7
|
-
from .launcher import Launcher
|
|
7
|
+
from .launcher import Launcher, check_pause
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class LauncherPro(Launcher):
|
|
@@ -33,44 +33,41 @@ class LauncherPro(Launcher):
|
|
|
33
33
|
else:
|
|
34
34
|
self._db._client.incrby(key, count)
|
|
35
35
|
|
|
36
|
+
@check_pause
|
|
36
37
|
def _execute_heartbeat(self):
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
time.sleep(3)
|
|
38
|
+
if self._heartbeat_start_event.is_set():
|
|
39
|
+
self._db.setex(self._heartbeat_key, 5)
|
|
40
|
+
time.sleep(3)
|
|
41
41
|
|
|
42
|
+
@check_pause
|
|
42
43
|
def _reset(self):
|
|
43
44
|
"""
|
|
44
45
|
检查过期种子,重新添加到redis缓存中
|
|
45
46
|
"""
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
if self._db.lock(self._reset_lock_key, t=120):
|
|
47
|
+
reset_wait_seconds = 30
|
|
48
|
+
if self._db.lock(self._reset_lock_key, t=120):
|
|
49
49
|
|
|
50
|
-
|
|
51
|
-
|
|
50
|
+
_min = -int(time.time()) + self._seed_reset_seconds \
|
|
51
|
+
if self.heartbeat else "-inf"
|
|
52
52
|
|
|
53
|
-
|
|
54
|
-
|
|
53
|
+
self._db.members(self._todo_key, 0, _min=_min, _max="(0")
|
|
54
|
+
self._db.delete(self._reset_lock_key)
|
|
55
55
|
|
|
56
|
-
|
|
57
|
-
|
|
56
|
+
if not self.heartbeat:
|
|
57
|
+
self._heartbeat_start_event.set()
|
|
58
58
|
|
|
59
|
-
|
|
59
|
+
time.sleep(reset_wait_seconds)
|
|
60
60
|
|
|
61
|
+
@check_pause
|
|
61
62
|
def _scheduler(self):
|
|
62
63
|
"""
|
|
63
64
|
调度任务,获取redis队列种子,同时添加到doing字典中
|
|
64
65
|
"""
|
|
65
|
-
if self.
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
continue
|
|
71
|
-
if self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
|
|
72
|
-
time.sleep(self._todo_queue_full_wait_seconds)
|
|
73
|
-
continue
|
|
66
|
+
if not self._db.zcount(self._todo_key, 0, "(1000"):
|
|
67
|
+
time.sleep(self._scheduler_wait_seconds)
|
|
68
|
+
elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
|
|
69
|
+
time.sleep(self._todo_queue_full_wait_seconds)
|
|
70
|
+
else:
|
|
74
71
|
members = self._db.members(
|
|
75
72
|
self._todo_key, int(time.time()),
|
|
76
73
|
count=self._todo_queue_size,
|
|
@@ -81,67 +78,66 @@ class LauncherPro(Launcher):
|
|
|
81
78
|
self.__LAUNCHER_QUEUE__['todo'].push(seed)
|
|
82
79
|
self.__DOING__[seed.to_string] = seed.params.priority
|
|
83
80
|
|
|
81
|
+
@check_pause
|
|
84
82
|
def _insert(self):
|
|
85
83
|
"""
|
|
86
84
|
添加新种子到redis队列中
|
|
87
85
|
"""
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
if not seed:
|
|
94
|
-
break
|
|
86
|
+
seeds = {}
|
|
87
|
+
status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
|
|
88
|
+
for _ in range(self._new_queue_max_size):
|
|
89
|
+
seed = self.__LAUNCHER_QUEUE__['new'].pop()
|
|
90
|
+
if seed:
|
|
95
91
|
seeds[seed.to_string] = seed.params.priority
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
92
|
+
if seeds:
|
|
93
|
+
self._db.zadd(self._todo_key, seeds, nx=True)
|
|
94
|
+
if status:
|
|
95
|
+
time.sleep(self._new_queue_wait_seconds)
|
|
100
96
|
|
|
97
|
+
@check_pause
|
|
101
98
|
def _refresh(self):
|
|
102
99
|
"""
|
|
103
100
|
刷新doing种子过期时间,防止reset重新消费
|
|
104
101
|
"""
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
time.sleep(15)
|
|
102
|
+
if self.__DOING__:
|
|
103
|
+
refresh_time = int(time.time())
|
|
104
|
+
seeds = {k:-refresh_time - v / 1000 for k, v in self.__DOING__.items()}
|
|
105
|
+
self._db.zadd(self._todo_key, item=seeds, xx=True)
|
|
106
|
+
time.sleep(15)
|
|
111
107
|
|
|
108
|
+
@check_pause
|
|
112
109
|
def _delete(self):
|
|
113
110
|
"""
|
|
114
111
|
删除队列种子,根据状态添加至成功或失败队列,移除doing字典种子索引
|
|
115
112
|
"""
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
time.sleep(self._done_queue_wait_seconds)
|
|
113
|
+
seeds, s_seeds, f_seeds = [], [], []
|
|
114
|
+
status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
|
|
115
|
+
|
|
116
|
+
for _ in range(self._done_queue_max_size):
|
|
117
|
+
seed = self.__LAUNCHER_QUEUE__['done'].pop()
|
|
118
|
+
if not seed:
|
|
119
|
+
break
|
|
120
|
+
if seed.params.seed_status == DealModel.fail:
|
|
121
|
+
f_seeds.append(seed.to_string)
|
|
122
|
+
elif self._done_model == 1:
|
|
123
|
+
s_seeds.append(seed.to_string)
|
|
124
|
+
else:
|
|
125
|
+
seeds.append(seed.to_string)
|
|
126
|
+
if seeds:
|
|
127
|
+
count = self._db.zrem(self._todo_key, *seeds)
|
|
128
|
+
self.statistics(self._statistics_done_key, count)
|
|
129
|
+
self._remove_doing_seeds(seeds)
|
|
130
|
+
if s_seeds:
|
|
131
|
+
count = self._db.done([self._todo_key, self._done_key], *s_seeds)
|
|
132
|
+
self.statistics(self._statistics_done_key, count)
|
|
133
|
+
self._remove_doing_seeds(s_seeds)
|
|
134
|
+
if f_seeds:
|
|
135
|
+
count = self._db.done([self._todo_key, self._fail_key], *f_seeds)
|
|
136
|
+
self.statistics(self._statistics_fail_key, count)
|
|
137
|
+
self._remove_doing_seeds(f_seeds)
|
|
138
|
+
|
|
139
|
+
if status:
|
|
140
|
+
time.sleep(self._done_queue_wait_seconds)
|
|
145
141
|
|
|
146
142
|
def _polling(self):
|
|
147
143
|
wait_scheduler_execute = True
|
|
@@ -26,9 +26,6 @@ OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
|
|
|
26
26
|
OSS_CHUNK_SIZE = 10 * 1024 ** 2
|
|
27
27
|
OSS_MIN_UPLOAD_SIZE = 1024
|
|
28
28
|
|
|
29
|
-
# message
|
|
30
|
-
MESSAGE = ""
|
|
31
|
-
|
|
32
29
|
|
|
33
30
|
# 采集器选择
|
|
34
31
|
CRAWLER = "cobweb.crawlers.Crawler"
|
|
@@ -43,7 +40,7 @@ BEFORE_SCHEDULER_WAIT_SECONDS = 60 # 调度前等待时间,只作用于单次
|
|
|
43
40
|
SCHEDULER_WAIT_SECONDS = 15 # 调度等待时间
|
|
44
41
|
TODO_QUEUE_FULL_WAIT_SECONDS = 5 # todo队列已满时等待时间
|
|
45
42
|
NEW_QUEUE_WAIT_SECONDS = 30 # new队列等待时间
|
|
46
|
-
DONE_QUEUE_WAIT_SECONDS =
|
|
43
|
+
DONE_QUEUE_WAIT_SECONDS = 5 # done队列等待时间
|
|
47
44
|
UPLOAD_QUEUE_WAIT_SECONDS = 15 # upload队列等待时间
|
|
48
45
|
SEED_RESET_SECONDS = 30 # 种子重制时间
|
|
49
46
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cobweb-launcher-1.2.6 → cobweb-launcher-1.2.8}/cobweb_launcher.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|