cobweb-launcher 1.3.14__py3-none-any.whl → 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/__init__.py +1 -1
- cobweb/base/__init__.py +4 -149
- cobweb/base/common_queue.py +0 -13
- cobweb/base/request.py +2 -14
- cobweb/base/seed.py +16 -12
- cobweb/constant.py +0 -16
- cobweb/crawlers/crawler.py +3 -85
- cobweb/db/redis_db.py +109 -52
- cobweb/launchers/__init__.py +8 -2
- cobweb/launchers/distributor.py +171 -0
- cobweb/launchers/launcher.py +87 -131
- cobweb/launchers/uploader.py +65 -0
- cobweb/pipelines/pipeline.py +3 -36
- cobweb/schedulers/__init__.py +1 -3
- cobweb/schedulers/launcher_air.py +93 -0
- cobweb/schedulers/launcher_api.py +225 -0
- cobweb/schedulers/scheduler.py +85 -0
- cobweb/schedulers/scheduler_with_redis.py +177 -0
- cobweb/setting.py +15 -32
- cobweb/utils/__init__.py +2 -1
- cobweb/utils/decorators.py +43 -0
- cobweb/utils/dotting.py +55 -0
- cobweb/utils/oss.py +28 -9
- {cobweb_launcher-1.3.14.dist-info → cobweb_launcher-3.1.0.dist-info}/METADATA +1 -1
- cobweb_launcher-3.1.0.dist-info/RECORD +41 -0
- cobweb/base/basic.py +0 -295
- cobweb/base/dotting.py +0 -35
- cobweb/launchers/launcher_air.py +0 -88
- cobweb/launchers/launcher_api.py +0 -88
- cobweb/launchers/launcher_pro.py +0 -88
- cobweb/schedulers/scheduler_api.py +0 -72
- cobweb/schedulers/scheduler_redis.py +0 -72
- cobweb_launcher-1.3.14.dist-info/RECORD +0 -40
- {cobweb_launcher-1.3.14.dist-info → cobweb_launcher-3.1.0.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.3.14.dist-info → cobweb_launcher-3.1.0.dist-info}/WHEEL +0 -0
- {cobweb_launcher-1.3.14.dist-info → cobweb_launcher-3.1.0.dist-info}/top_level.txt +0 -0
cobweb/launchers/distributor.py
ADDED

```diff
@@ -0,0 +1,171 @@
+import time
+import threading
+import traceback
+from inspect import isgenerator
+from typing import Callable
+from urllib.parse import urlparse
+from requests import Response as Res
+
+from cobweb import setting
+from cobweb.constant import DealModel, LogTemplate
+from cobweb.base import (
+    Seed,
+    Queue,
+    BaseItem,
+    Request,
+    Response,
+    logger
+)
+from cobweb.utils import LoghubDot, check_pause
+
+
+class Distributor(threading.Thread):
+
+    def __init__(
+        self,
+        task: str,
+        project: str,
+        new: Queue,
+        todo: Queue,
+        done: Queue,
+        upload: Queue,
+        register: Callable,
+        stop: threading.Event,
+        pause: threading.Event,
+        SpiderCrawler
+    ):
+        super().__init__()
+        self.task = task
+        self.project = project
+        self.stop = stop
+        self.pause = pause
+
+        self.new = new
+        self.todo = todo
+        self.done = done
+        self.upload = upload
+        self.register = register
+
+        self.time_sleep = setting.SPIDER_TIME_SLEEP
+        self.thread_num = setting.SPIDER_THREAD_NUM
+        self.max_retries = setting.SPIDER_MAX_RETRIES
+        self.record_failed = setting.RECORD_FAILED_SPIDER
+        self.loghub_dot = LoghubDot()  # todo: decouple
+
+        self.Crawler = SpiderCrawler
+
+        logger.debug(f"Distribute instance attrs: {self.__dict__}")
+
+    def distribute(self, item, seed, _id: int):
+        if isinstance(item, Request):
+            seed.params.start_time = time.time()
+            self.process(item=seed, seed=seed, callback=self.Crawler.download, _id=1)
+        elif isinstance(item, Response):
+            if _id == 2:
+                raise TypeError("parse function can't yield a Response instance")
+            dot = isinstance(item.response, Res)
+            self.spider_logging(seed, item, dot=dot)
+            self.process(item=seed, seed=seed, callback=self.Crawler.parse, _id=2)
+        elif isinstance(item, BaseItem):
+            self.upload.push(item)
+        elif isinstance(item, Seed):
+            self.new.push((seed, item), direct_insertion=True)
+        elif isinstance(item, str) and item == DealModel.poll:
+            self.todo.push(seed)
+        elif isinstance(item, str) and item == DealModel.done:
+            self.done.push(seed)
+        elif isinstance(item, str) and item == DealModel.fail:
+            seed.params.retry += 1
+            if seed.params.retry < self.max_retries:
+                self.todo.push(seed)
+            else:
+                if record_failed := self.record_failed:
+                    try:
+                        response = Response(seed, "failed", max_retries=True)
+                        self.process(response, seed, self.Crawler.parse, _id=2)
+                    except:
+                        record_failed = False
+                if not record_failed:
+                    self.done.push(seed)
+        else:
+            raise TypeError("yield value type error!")
+
+    def process(self, item, seed, callback, _id: int):
+        result_iterators = callback(item)
+        if not isgenerator(result_iterators):
+            raise TypeError(f"{callback.__name__} function isn't a generator!")
+        for result_item in result_iterators:
+            self.distribute(result_item, seed, _id)
+
+    @check_pause
+    def spider(self):
+        if seed := self.todo.pop():
+            try:
+                self.process(item=seed, seed=seed, callback=self.Crawler.request, _id=0)
+            except Exception as e:
+                url, status = seed.url, e.__class__.__name__
+                msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                if getattr(e, "response", None) and isinstance(e.response, Res):
+                    url = e.response.request.url
+                    status = e.response.status_code
+                self.spider_logging(seed, None, error=True, url=url, status=status, msg=msg)
+                self.distribute(DealModel.fail, seed, _id=-1)
+
+    def spider_logging(
+        self, seed,
+        item: Response = None,
+        error: bool = False,
+        dot: bool = True,
+        **kwargs
+    ):
+        detail_log_info = LogTemplate.log_info(seed.to_dict)
+        if error:
+            url = kwargs.get("url")
+            msg = kwargs.get("msg")
+            status = kwargs.get("status")
+            if dot:
+                self.loghub_dot.build(
+                    topic=urlparse(url).netloc,
+                    data_size=-1, cost_time=-1,
+                    status=status, url=url,
+                    seed=seed.to_string,
+                    proxy_type=seed.params.proxy_type,
+                    proxy=seed.params.proxy,
+                    project=self.project,
+                    task=self.task, msg=msg,
+                )
+            logger.info(LogTemplate.download_exception.format(
+                detail=detail_log_info,
+                retry=seed.params.retry,
+                priority=seed.params.priority,
+                seed_version=seed.params.seed_version,
+                identifier=seed.identifier or "",
+                exception=msg
+            ))
+        else:
+            logger.info(LogTemplate.download_info.format(
+                detail=detail_log_info,
+                retry=seed.params.retry,
+                priority=seed.params.priority,
+                seed_version=seed.params.seed_version,
+                identifier=seed.identifier or "",
+                status=item.response,
+                response=LogTemplate.log_info(item.to_dict)
+            ))
+            if dot:
+                end_time = time.time()
+                stime = seed.params.start_time
+                cost_time = end_time - stime if stime else -1
+                topic = urlparse(item.response.request.url).netloc
+                data_size = int(item.response.headers.get("content-length", 0))
+                self.loghub_dot.build(
+                    topic=topic, data_size=data_size, cost_time=cost_time,
+                    status=200, seed=seed.to_string, url=item.response.url,
+                    proxy=seed.params.proxy, proxy_type=seed.params.proxy_type,
+                    project=self.project, task=self.task,
+                )
+
+    def run(self):
+        self.register(self.loghub_dot.build_run, tag="LoghubDot")
+        for _ in range(self.thread_num):
+            self.register(self.spider, tag="Distributor")
```
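Taken together, `process` and `distribute` form the dispatch loop: every crawler stage must be a generator, and each yielded value is routed by type. A `Request` re-enters through `download`, a `Response` through `parse`, a `BaseItem` goes to the upload queue, a fresh `Seed` to the new queue, and the `DealModel` string sentinels mark a seed as polled, done, or failed (with retry bookkeeping). A minimal sketch of a crawler that exercises those paths — `DemoCrawler` is hypothetical; only the stage names and the yielded types come from this diff:

```python
# Hypothetical crawler, for illustration only. Distributor invokes the stages
# on the class it was given (self.Crawler.request, ...), so staticmethods fit.
from cobweb.base import Seed, Request, Response
from cobweb.constant import DealModel


class DemoCrawler:

    @staticmethod
    def request(seed: Seed):
        # _id=0 stage: yielding a Request sends it back through download (_id=1)
        yield Request(seed.url, seed)

    @staticmethod
    def download(item: Request):
        # _id=1 stage: yielding a Response routes it through parse (_id=2);
        # the second argument stands in for whatever the fetch returned
        yield Response(item.seed, "<html>...</html>")

    @staticmethod
    def parse(item: Response):
        # _id=2 stage: may yield BaseItems (queued for upload), new Seeds,
        # or a DealModel sentinel; yielding a Response here raises TypeError
        yield DealModel.done
```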
cobweb/launchers/launcher.py
CHANGED

```diff
@@ -1,26 +1,34 @@
 import time
+import uuid
 import inspect
 import threading
 import importlib
 
-from cobweb.constant import LogTemplate
-from cobweb.utils import dynamic_load_class
-from cobweb.base import TaskQueue, Decorators, logger
 from cobweb import setting
+from cobweb.base import Seed, Queue, logger
+from cobweb.utils.tools import dynamic_load_class
+from cobweb.launchers import Distributor, Uploader
 
 
-class Launcher(threading.Thread):
+class Launcher:
 
-
+    _NEW_QUEUE_ = Queue()
+    _TODO_QUEUE_ = Queue()
+    _DONE_QUEUE_ = Queue()
+    _UPLOAD_QUEUE_ = Queue()
+
+    __WORKER_THREAD__ = dict()
+    __REGISTER_FUNC__ = dict()
 
     def __init__(self, task, project, custom_setting=None, **kwargs):
         super().__init__()
+
         self.task = task
         self.project = project
-        self.custom_func = dict()
-        self.app_time = int(time.time())
 
-        self.
+        self._app_time = int(time.time())
+        self._stop = threading.Event()  # stop event
+        self._pause = threading.Event()  # pause event
 
         _setting = dict()
 
@@ -41,40 +49,12 @@ class Launcher(threading.Thread):
         for k, v in _setting.items():
             setattr(setting, k.upper(), v)
 
-        self.
-
-        self.scheduling_wait_time = setting.SCHEDULING_WAIT_TIME
-        self.inserting_wait_time = setting.INSERTING_WAIT_TIME
-        self.removing_wait_time = setting.REMOVING_WAIT_TIME
-        self.seed_reset_seconds = setting.SEED_RESET_SECONDS
-
-        self.scheduling_size = setting.SCHEDULING_SIZE
-        self.inserting_size = setting.INSERTING_SIZE
-        self.removing_size = setting.REMOVING_SIZE
-
-        self.todo_queue_size = setting.TODO_QUEUE_SIZE
-        self.seed_queue_size = setting.SEED_QUEUE_SIZE
-        self.request_queue_size = setting.REQUEST_QUEUE_SIZE
-        self.download_queue_size = setting.DOWNLOAD_QUEUE_SIZE
-        self.response_queue_size = setting.RESPONSE_QUEUE_SIZE
-        self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
-        self.delete_queue_size = setting.DELETE_QUEUE_SIZE
-        self.done_queue_size = setting.DONE_QUEUE_SIZE
-        self.spider_max_retries = setting.SPIDER_MAX_RETRIES
-
-        self.spider_thread_num = setting.SPIDER_THREAD_NUM
+        self._done_model = setting.DONE_MODEL
+        self._task_model = setting.TASK_MODEL
 
-        self.
-
-        self.
-        self.pause = threading.Event()  # pause event
-
-        self.crawler_path = setting.CRAWLER
-        self.pipeline_path = setting.PIPELINE
-
-        self._thread_info = {}
-
-        self._task_info = dict(todo={}, download={})
+        self.Scheduler = dynamic_load_class(setting.SCHEDULER)
+        self.SpiderCrawler = dynamic_load_class(setting.CRAWLER)
+        self.SpiderPipeline = dynamic_load_class(setting.PIPELINE)
 
     @property
     def request(self):
@@ -88,7 +68,7 @@ class Launcher(threading.Thread):
             yield Request(seed.url, seed)
         """
         def decorator(func):
-            self.
+            self.SpiderCrawler.request = func
         return decorator
 
     @property
@@ -103,7 +83,7 @@ class Launcher(threading.Thread):
             yield Response(item.seed, response)
         """
         def decorator(func):
-            self.
+            self.SpiderCrawler.download = func
         return decorator
 
     @property
@@ -118,94 +98,70 @@ class Launcher(threading.Thread):
             yield xxxItem(seed, **kwargs)
         """
         def decorator(func):
-            self.
+            self.SpiderCrawler.parse = func
         return decorator
 
-    def
-        for
 […removed lines truncated in the source diff view…]
-        self.
-        self.
-        self.
-        self.
 […removed lines truncated in the source diff view…]
-            upload_queue_len=TaskQueue.UPLOAD.length,
-            seed_queue_len=TaskQueue.SEED.length,
-            download_queue_len=TaskQueue.DOWNLOAD.length
-        ))
-
-    def run(self):
-        Crawler = dynamic_load_class(self.crawler_path)
-        Pipeline = dynamic_load_class(self.pipeline_path)
-
-        crawler = Crawler(stop=self.stop, pause=self.pause, custom_func=self.custom_func)
-        pipeline = Pipeline(stop=self.stop, pause=self.pause)
-
-        self._add_thread(obj=crawler, func=crawler.build_request_item)
-        self._add_thread(obj=crawler, func=crawler.build_download_item, num=self.spider_thread_num)
-        self._add_thread(obj=crawler, func=crawler.build_parse_item)
-        self._add_thread(obj=pipeline, func=pipeline.run)
-
-        self._add_thread(func=self._polling)
-
-        self._init_schedule_thread()
-        self.check_alive()
-
-    def _init_schedule_thread(self):
-        ...
+    def start_seeds(self, seeds: list):
+        seed_list = [Seed(seed) for seed in seeds]
+        self._TODO_QUEUE_.push(seed_list)
+        return seed_list
+
+    def _register(self, func, tag: str = "launcher"):
+        name = tag + ":" + func.__name__ + "_" + str(uuid.uuid4())
+        self.__REGISTER_FUNC__[name] = func
+        if not self.__WORKER_THREAD__.get(name):
+            worker_thread = threading.Thread(name=name, target=func)
+            self.__WORKER_THREAD__[name] = worker_thread
+            worker_thread.start()
+
+    def _monitor(self):
+        while not self._stop.is_set():
+            if self._pause.is_set():
+                time.sleep(15)
+                continue
+            for name, worker_thread in self.__WORKER_THREAD__.items():
+                if not worker_thread.is_alive():
+                    logger.info(f"{name} thread is dead. Restarting...")
+                    func = self.__REGISTER_FUNC__[name]
+                    worker_thread = threading.Thread(name=name, target=func)
+                    self.__WORKER_THREAD__[name] = worker_thread
+                    worker_thread.start()
+            time.sleep(3)
+
+    def start(self):
+        self._pause.is_set()
+
+        self.Scheduler(
+            task=self.task,
+            project=self.project,
+            stop=self._stop,
+            pause=self._pause,
+            new=self._NEW_QUEUE_,
+            todo=self._TODO_QUEUE_,
+            done=self._DONE_QUEUE_,
+            upload=self._UPLOAD_QUEUE_,
+            register=self._register
+        ).start()
+
+        Distributor(
+            task=self.task,
+            project=self.project,
+            new=self._NEW_QUEUE_,
+            todo=self._TODO_QUEUE_,
+            done=self._DONE_QUEUE_,
+            upload=self._UPLOAD_QUEUE_,
+            register=self._register,
+            stop=self._stop, pause=self._pause,
+            SpiderCrawler=self.SpiderCrawler
+        ).start()
+
+        Uploader(
+            stop=self._stop, pause=self._pause,
+            done=self._DONE_QUEUE_,
+            upload=self._UPLOAD_QUEUE_,
+            register=self._register,
+            SpiderPipeline=self.SpiderPipeline
+        ).start()
+
+        self._monitor()
 
```
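Net effect of the rewrite: `Launcher` is no longer a `threading.Thread` subclass but a coordinator that owns four shared queues, wires them into the dynamically loaded `Scheduler` plus the new `Distributor` and `Uploader`, and then blocks in `_monitor()`, restarting any registered worker thread that dies. A plausible end-to-end usage sketch assembled from the decorator docstrings above; the `cobweb.launchers` import path is taken from this diff, while everything else in the user functions is illustrative:

```python
from cobweb.base import Seed, Request, Response
from cobweb.constant import DealModel
from cobweb.launchers import Launcher  # assumed public entry point in 3.x

app = Launcher(task="demo", project="demo")


@app.request
def make_request(seed: Seed):
    # matches the request-property docstring: yield Request(seed.url, seed)
    yield Request(seed.url, seed)


@app.download
def fetch(item: Request):
    # stand-in for a real fetch; matches yield Response(item.seed, response)
    yield Response(item.seed, "stub response")


@app.parse
def extract(item: Response):
    # yield BaseItem subclasses here to feed the upload queue
    yield DealModel.done


app.start_seeds(["https://example.com"])
app.start()  # starts Scheduler/Distributor/Uploader, then blocks in _monitor()
```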
cobweb/launchers/uploader.py
ADDED

```diff
@@ -0,0 +1,65 @@
+import time
+import threading
+from typing import Callable
+from cobweb import setting
+from cobweb.base import Queue, logger
+from cobweb.utils import check_pause
+
+
+class Uploader(threading.Thread):
+
+    def __init__(
+        self,
+        stop: threading.Event,
+        pause: threading.Event,
+        upload: Queue, done: Queue,
+        register: Callable,
+        SpiderPipeline
+    ):
+        super().__init__()
+        self.stop = stop
+        self.pause = pause
+
+        self.done = done
+        self.upload = upload
+        self.register = register
+
+        self.upload_size = setting.UPLOAD_QUEUE_MAX_SIZE
+        self.wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
+
+        self.Pipeline = SpiderPipeline
+
+        logger.debug(f"Uploader instance attrs: {self.__dict__}")
+
+    @check_pause
+    def upload(self):
+        if not self.upload.length:
+            time.sleep(self.wait_seconds)
+            return
+        if self.upload.length < self.upload_size:
+            time.sleep(self.wait_seconds)
+        data_info, seeds = {}, []
+        try:
+            for _ in range(self.upload_size):
+                item = self.upload.pop()
+                if not item:
+                    break
+                seeds.append(item.seed)
+                data = self.Pipeline.build(item)
+                data_info.setdefault(item.table, []).append(data)
+            for table, datas in data_info.items():
+                try:
+                    self.Pipeline.upload(table, datas)
+                except Exception as e:
+                    logger.info(e)
+        except Exception as e:
+            logger.info(e)
+        if seeds:
+            self.done.push(seeds)
+
+        logger.info("upload pipeline close!")
+
+    def run(self):
+        self.register(self.upload, tag="Uploader")
+
+
```
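Both `Distributor.spider` and `Uploader.upload` are wrapped with `check_pause`, which arrives in the new `cobweb/utils/decorators.py` (+43 lines) but whose body this extract does not show. Judging from how decorated methods are handed to `register` as long-lived thread targets, a sketch of its likely shape — this is an assumption about the implementation, not the packaged code:

```python
import time
from functools import wraps


def check_pause(func):
    """Assumed semantics: keep calling func until the owner's stop event is
    set, idling while its pause event is set. Matches how Distributor and
    Uploader register decorated methods as thread targets."""
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        while not self.stop.is_set():
            if self.pause.is_set():
                time.sleep(1)  # assumed back-off interval while paused
                continue
            func(self, *args, **kwargs)
    return wrapper
```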
cobweb/pipelines/pipeline.py
CHANGED

```diff
@@ -1,48 +1,15 @@
-import time
-import threading
-
 from abc import ABC, abstractmethod
-from cobweb.base import BaseItem
-from cobweb import setting
+from cobweb.base import BaseItem
 
 
 class Pipeline(ABC):
 
-    def __init__(
-        self,
-        stop: threading.Event,
-        pause: threading.Event,
-    ):
-        super().__init__()
-        self.stop = stop
-        self.pause = pause
-        self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
-        self.upload_wait_time = setting.UPLOAD_WAIT_TIME
-
     @abstractmethod
     def build(self, item: BaseItem) -> dict:
-
+        pass
 
     @abstractmethod
     def upload(self, table: str, data: list) -> bool:
-
+        pass
 
-    @Decorators.pause
-    def run(self):
-        data_info, seeds = {}, []
-        thread_sleep = self.upload_wait_time if TaskQueue.UPLOAD.length < self.upload_queue_size else 0.1
-        try:
-            while (item := TaskQueue.UPLOAD.pop()) and len(seeds) <= self.upload_queue_size:
-                data = self.build(item)
-                data_info.setdefault(item.table, []).append(data)
-                seeds.append(item.seed)
-            for table, datas in data_info.items():
-                self.upload(table, datas)
-        except Exception as e:
-            logger.info(e)
-            seeds = None
-            # todo: retry
-        finally:
-            TaskQueue.DONE.push(seeds)
 
-        time.sleep(thread_sleep)
```
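With the threading, batching, and retry plumbing moved into `Uploader`, `Pipeline` shrinks to a pure strategy interface: `build` maps one `BaseItem` to a row dict and `upload` persists one table's batch. A minimal illustrative subclass — `ConsolePipeline` is hypothetical, and any item fields beyond `seed` and `table` (the two `Uploader` relies on) are assumptions:

```python
from cobweb.base import BaseItem
from cobweb.pipelines.pipeline import Pipeline


class ConsolePipeline(Pipeline):
    """Hypothetical pipeline that prints batches instead of persisting them."""

    def build(self, item: BaseItem) -> dict:
        # item.seed and item.table are what Uploader passes through;
        # seed.to_string appears elsewhere in this diff
        return {"seed": item.seed.to_string, "table": item.table}

    def upload(self, table: str, data: list) -> bool:
        print(f"[{table}] uploading {len(data)} rows")
        return True
```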
cobweb/schedulers/launcher_air.py
ADDED

```diff
@@ -0,0 +1,93 @@
+import time
+
+from cobweb.base import logger
+from cobweb.constant import LogTemplate
+from .launcher import Launcher, check_pause
+
+
+class LauncherAir(Launcher):
+
+    # def _scheduler(self):
+    #     if self.start_seeds:
+    #         self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
+
+    @check_pause
+    def _insert(self):
+        new_seeds = {}
+        del_seeds = set()
+        status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
+        for _ in range(self._new_queue_max_size):
+            seed_tuple = self.__LAUNCHER_QUEUE__['new'].pop()
+            if not seed_tuple:
+                break
+            seed, new_seed = seed_tuple
+            new_seeds[new_seed.to_string] = new_seed.params.priority
+            del_seeds.add(seed.to_string)
+        if new_seeds:
+            self.__LAUNCHER_QUEUE__['todo'].push(new_seeds)
+        if del_seeds:
+            self.__LAUNCHER_QUEUE__['done'].push(del_seeds)
+        if status:
+            time.sleep(self._new_queue_wait_seconds)
+
+    @check_pause
+    def _delete(self):
+        seeds = []
+        status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
+
+        for _ in range(self._done_queue_max_size):
+            seed = self.__LAUNCHER_QUEUE__['done'].pop()
+            if not seed:
+                break
+            seeds.append(seed.to_string)
+
+        if seeds:
+            self._remove_doing_seeds(seeds)
+
+        if status:
+            time.sleep(self._done_queue_wait_seconds)
+
+    def _polling(self):
+
+        check_emtpy_times = 0
+
+        while not self._stop.is_set():
+
+            queue_not_empty_count = 0
+            pooling_wait_seconds = 30
+
+            for q in self.__LAUNCHER_QUEUE__.values():
+                if q.length != 0:
+                    queue_not_empty_count += 1
+
+            if queue_not_empty_count == 0:
+                pooling_wait_seconds = 3
+                if self._pause.is_set():
+                    check_emtpy_times = 0
+                    if not self._task_model:
+                        logger.info("Done! Ready to close thread...")
+                        self._stop.set()
+                elif check_emtpy_times > 2:
+                    self.__DOING__ = {}
+                    self._pause.set()
+                else:
+                    logger.info(
+                        "check whether the task is complete, "
+                        f"reset times {3 - check_emtpy_times}"
+                    )
+                    check_emtpy_times += 1
+            elif self._pause.is_set():
+                self._pause.clear()
+                self._execute()
+            else:
+                logger.info(LogTemplate.launcher_air_polling.format(
+                    task=self.task,
+                    doing_len=len(self.__DOING__.keys()),
+                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
+                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
+                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
+                ))
+
+            time.sleep(pooling_wait_seconds)
+
+
```