cobweb-launcher 1.2.49__py3-none-any.whl → 1.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. cobweb/base/__init__.py +141 -4
  2. cobweb/base/basic.py +28 -82
  3. cobweb/base/common_queue.py +13 -0
  4. cobweb/base/dotting.py +1 -1
  5. cobweb/base/request.py +14 -2
  6. cobweb/base/seed.py +10 -6
  7. cobweb/constant.py +16 -0
  8. cobweb/crawlers/crawler.py +51 -181
  9. cobweb/db/redis_db.py +28 -0
  10. cobweb/launchers/__init__.py +2 -2
  11. cobweb/launchers/launcher.py +110 -141
  12. cobweb/launchers/launcher_api.py +66 -114
  13. cobweb/launchers/launcher_pro.py +76 -194
  14. cobweb/pipelines/base_pipeline.py +54 -0
  15. cobweb/pipelines/loghub_pipeline.py +34 -0
  16. cobweb/pipelines/pipeline.py +25 -49
  17. cobweb/schedulers/__init__.py +0 -2
  18. cobweb/schedulers/scheduler_redis.py +5 -8
  19. cobweb/setting.py +29 -6
  20. cobweb/utils/dotting.py +10 -42
  21. cobweb_/__init__.py +2 -0
  22. cobweb_/base/__init__.py +9 -0
  23. cobweb_/base/common_queue.py +30 -0
  24. cobweb_/base/decorators.py +40 -0
  25. cobweb_/base/item.py +46 -0
  26. cobweb_/base/log.py +94 -0
  27. cobweb_/base/request.py +82 -0
  28. cobweb_/base/response.py +23 -0
  29. cobweb_/base/seed.py +114 -0
  30. cobweb_/constant.py +94 -0
  31. cobweb_/crawlers/__init__.py +1 -0
  32. cobweb_/crawlers/crawler.py +184 -0
  33. cobweb_/db/__init__.py +2 -0
  34. cobweb_/db/api_db.py +82 -0
  35. cobweb_/db/redis_db.py +130 -0
  36. cobweb_/exceptions/__init__.py +1 -0
  37. cobweb_/exceptions/oss_db_exception.py +28 -0
  38. cobweb_/launchers/__init__.py +3 -0
  39. cobweb_/launchers/launcher.py +235 -0
  40. cobweb_/launchers/launcher_air.py +88 -0
  41. cobweb_/launchers/launcher_api.py +221 -0
  42. cobweb_/launchers/launcher_pro.py +222 -0
  43. cobweb_/pipelines/__init__.py +3 -0
  44. cobweb_/pipelines/pipeline.py +69 -0
  45. cobweb_/pipelines/pipeline_console.py +22 -0
  46. cobweb_/pipelines/pipeline_loghub.py +34 -0
  47. cobweb_/setting.py +74 -0
  48. cobweb_/utils/__init__.py +5 -0
  49. cobweb_/utils/bloom.py +58 -0
  50. cobweb_/utils/dotting.py +32 -0
  51. cobweb_/utils/oss.py +94 -0
  52. cobweb_/utils/tools.py +42 -0
  53. {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/METADATA +1 -1
  54. cobweb_launcher-1.3.2.dist-info/RECORD +110 -0
  55. cobweb_launcher-1.3.2.dist-info/top_level.txt +2 -0
  56. cobweb_new/__init__.py +2 -0
  57. cobweb_new/base/__init__.py +72 -0
  58. cobweb_new/base/common_queue.py +53 -0
  59. cobweb_new/base/decorators.py +72 -0
  60. cobweb_new/base/item.py +46 -0
  61. cobweb_new/base/log.py +94 -0
  62. cobweb_new/base/request.py +82 -0
  63. cobweb_new/base/response.py +23 -0
  64. cobweb_new/base/seed.py +118 -0
  65. cobweb_new/constant.py +105 -0
  66. cobweb_new/crawlers/__init__.py +1 -0
  67. cobweb_new/crawlers/crawler-new.py +85 -0
  68. cobweb_new/crawlers/crawler.py +170 -0
  69. cobweb_new/db/__init__.py +2 -0
  70. cobweb_new/db/api_db.py +82 -0
  71. cobweb_new/db/redis_db.py +158 -0
  72. cobweb_new/exceptions/__init__.py +1 -0
  73. cobweb_new/exceptions/oss_db_exception.py +28 -0
  74. cobweb_new/launchers/__init__.py +3 -0
  75. cobweb_new/launchers/launcher.py +237 -0
  76. cobweb_new/launchers/launcher_air.py +88 -0
  77. cobweb_new/launchers/launcher_api.py +161 -0
  78. cobweb_new/launchers/launcher_pro.py +96 -0
  79. cobweb_new/launchers/tesss.py +47 -0
  80. cobweb_new/pipelines/__init__.py +3 -0
  81. cobweb_new/pipelines/pipeline.py +68 -0
  82. cobweb_new/pipelines/pipeline_console.py +22 -0
  83. cobweb_new/pipelines/pipeline_loghub.py +34 -0
  84. cobweb_new/setting.py +95 -0
  85. cobweb_new/utils/__init__.py +5 -0
  86. cobweb_new/utils/bloom.py +58 -0
  87. cobweb_new/utils/oss.py +94 -0
  88. cobweb_new/utils/tools.py +42 -0
  89. cobweb/schedulers/scheduler_api.py +0 -72
  90. cobweb_launcher-1.2.49.dist-info/RECORD +0 -44
  91. cobweb_launcher-1.2.49.dist-info/top_level.txt +0 -1
  92. {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/LICENSE +0 -0
  93. {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,237 @@
1
+ import time
2
+ import inspect
3
+ import threading
4
+ import importlib
5
+
6
+ from inspect import isgenerator
7
+ from typing import Union, Callable
8
+
9
+ from constant import DealModel, LogTemplate
10
+ from cobweb.utils import dynamic_load_class
11
+ from cobweb.base import Seed, Queue, logger, TaskQueue
12
+ from cobweb import setting
13
+
14
+
15
class Launcher(threading.Thread):
    """Base launcher thread.

    Loads settings (global defaults overridden by ``custom_setting`` and
    then ``kwargs``), exposes decorator hooks for custom
    request/download/parse functions, and wires up the crawler and
    pipeline worker threads. Subclasses register their scheduling
    threads in ``_init_schedule_thread``.
    """

    __CUSTOM_FUNC__ = {}

    def __init__(self, task, project, custom_setting=None, **kwargs):
        """
        :param task: task name (used by subclasses for key/queue naming).
        :param project: project name (used by subclasses for key/queue naming).
        :param custom_setting: dict, module, or importable module path whose
            attributes override ``cobweb.setting`` (applied upper-cased).
        :param kwargs: highest-precedence setting overrides.
        """
        super().__init__()
        self.task = task
        self.project = project
        self.custom_func = dict()
        self.app_time = int(time.time())

        # Seeds currently in flight. Bug fix: _polling (and subclasses)
        # read this attribute, but the original never initialised it in
        # __init__, raising AttributeError on the first polling log line.
        self.doing_seeds = {}

        _setting = dict()

        if custom_setting:
            if isinstance(custom_setting, dict):
                _setting = custom_setting
            else:
                # Accept a module path string as well as a module object.
                if isinstance(custom_setting, str):
                    custom_setting = importlib.import_module(custom_setting)
                if not inspect.ismodule(custom_setting):
                    # Same exception type as before, now with a message.
                    raise Exception(
                        "custom_setting must be a dict, module or module path"
                    )
                for k, v in custom_setting.__dict__.items():
                    if not k.startswith("__") and not inspect.ismodule(v):
                        _setting[k] = v

        _setting.update(**kwargs)

        # Settings are applied globally onto the setting module, so they
        # affect every component that reads cobweb.setting afterwards.
        for k, v in _setting.items():
            setattr(setting, k.upper(), v)

        self.scheduling_wait_time = setting.SCHEDULING_WAIT_TIME
        self.inserting_wait_time = setting.INSERTING_WAIT_TIME
        self.removing_wait_time = setting.REMOVING_WAIT_TIME

        self.scheduling_size = setting.SCHEDULING_SIZE
        self.inserting_size = setting.INSERTING_SIZE
        self.removing_size = setting.REMOVING_SIZE

        self.todo_queue_size = setting.TODO_QUEUE_SIZE
        self.seed_queue_size = setting.SEED_QUEUE_SIZE
        self.request_queue_size = setting.REQUEST_QUEUE_SIZE
        self.download_queue_size = setting.DOWNLOAD_QUEUE_SIZE
        self.response_queue_size = setting.RESPONSE_QUEUE_SIZE
        self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
        self.delete_queue_size = setting.DELETE_QUEUE_SIZE
        self.done_queue_size = setting.DONE_QUEUE_SIZE

        self.stop = threading.Event()   # set -> all worker loops shut down
        self.pause = threading.Event()  # set -> workers idle, waiting for work

        self.crawler_path = setting.CRAWLER
        self.pipeline_path = setting.PIPELINE

        self._threads = []

        # Working maps keyed by queue bucket name; see remove_working_items.
        self._task_info = dict(todo={}, download={})

        self.before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS

        self.todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
        self.new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
        self.done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
        self.upload_queue_wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
        self.seed_reset_seconds = setting.SEED_RESET_SECONDS

        self.spider_max_retries = setting.SPIDER_MAX_RETRIES
        self.spider_thread_num = setting.SPIDER_THREAD_NUM
        self.spider_time_sleep = setting.SPIDER_TIME_SLEEP
        self.spider_max_count = setting.SPIDER_MAX_COUNT
        self.time_window = setting.TIME_WINDOW

        self.done_model = setting.DONE_MODEL
        self.task_model = setting.TASK_MODEL

        self.filter_field = setting.FILTER_FIELD

    @staticmethod
    def insert_seed(seed: "Union[Seed, dict]"):
        """Push a seed (or a dict convertible to one) onto the SEED queue."""
        if isinstance(seed, dict):
            seed = Seed(seed)
        TaskQueue.SEED.push(seed)

    @property
    def request(self):
        """
        Register a custom request function.
        use case:
            from cobweb.base import Request, BaseItem
            @launcher.request
            def request(seed: Seed) -> Union[Request, BaseItem]:
                ...
                yield Request(seed.url, seed)
        """
        def decorator(func):
            self.custom_func['request'] = func
            # Bug fix: the original returned None, so the decorated name
            # was rebound to None at the call site.
            return func
        return decorator

    @property
    def download(self):
        """
        Register a custom download function.
        use case:
            from cobweb.base import Request, Response, Seed, BaseItem
            @launcher.download
            def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
                ...
                yield Response(item.seed, response)
        """
        def decorator(func):
            self.custom_func['download'] = func
            return func  # bug fix: keep the decorated name usable
        return decorator

    @property
    def parse(self):
        """
        Register a custom parse function; xxxItem is the user-defined
        storage item type.
        use case:
            from cobweb.base import Request, Response
            @launcher.parse
            def parse(item: Response) -> BaseItem:
                ...
                yield xxxItem(seed, **kwargs)
        """
        def decorator(func):
            self.custom_func['parse'] = func
            return func  # bug fix: keep the decorated name usable
        return decorator

    def remove_working_items(self, key, items):
        """Drop *items* from the working map bucket *key* ('todo'/'download')."""
        for item in items:
            self._task_info[key].pop(item, None)

    def check_alive(self):
        """Supervise worker threads until the stop event is set.

        Bug fix: ``Thread.start()`` raises RuntimeError when called on a
        thread that has already run; only threads that were never started
        (``ident is None``) are started here.
        """
        while not self.stop.is_set():
            if not self.pause.is_set():
                for thread in self._threads:
                    if not thread.is_alive() and thread.ident is None:
                        thread.start()
            time.sleep(1)

    def _add_thread(self, func, num=1, obj=None, name=None, args=()):
        """Queue *num* threads that run ``func(obj, *args)``.

        Bug fix: the original ``obj.__class__.__name__ + name or
        func.__name__`` evaluates ``+`` before ``or`` and raised
        TypeError whenever *name* was None (the common case).

        NOTE(review): the target is invoked as ``func(obj, *args)``;
        bound-method callables therefore receive an extra leading
        argument — confirm the crawler methods / decorator wrappers
        tolerate it.
        """
        obj = obj or self
        name = obj.__class__.__name__ + (name or func.__name__)
        for i in range(num):
            func_name = name + "_" + str(i) if num > 1 else name
            self._threads.append(threading.Thread(name=func_name, target=func, args=(obj,) + args))

    def _init_schedule_thread(self):
        """Hook: subclasses register their scheduling threads here."""
        ...

    def _polling(self):
        """Monitor queues: pause when idle, stop when the task is done."""
        check_emtpy_times = 0  # (sic) consecutive empty checks
        while not self.stop.is_set():
            if TaskQueue.is_empty():
                if self.pause.is_set():
                    run_time = int(time.time()) - self.app_time
                    if not self.task_model and run_time > self.before_scheduler_wait_seconds:
                        logger.info("Done! ready to close thread...")
                        self.stop.set()
                    else:
                        logger.info("pause! waiting for resume...")
                elif check_emtpy_times > 2:
                    # Empty on three consecutive checks: pause the workers.
                    logger.info("pause! waiting for resume...")
                    self.doing_seeds = {}
                    self.pause.set()
                else:
                    logger.info(
                        "check whether the task is complete, "
                        f"reset times {3 - check_emtpy_times}"
                    )
                    check_emtpy_times += 1
            elif TaskQueue.TODO.length:
                # Work reappeared: resume.
                logger.info(f"Recovery {self.task} task run!")
                check_emtpy_times = 0
                self.pause.clear()
            else:
                logger.info(LogTemplate.launcher_polling.format(
                    task=self.task,
                    doing_len=len(self.doing_seeds.keys()),
                    todo_len=TaskQueue.TODO.length,
                    done_len=TaskQueue.DONE.length,
                    upload_len=TaskQueue.UPLOAD.length,
                ))

            time.sleep(10)

        logger.info("Done! Ready to close thread...")

    def run(self):
        """Build crawler and pipeline, register worker threads, supervise."""
        Crawler = dynamic_load_class(self.crawler_path)
        Pipeline = dynamic_load_class(self.pipeline_path)

        crawler = Crawler(
            stop=self.stop, pause=self.pause,
            thread_num=self.spider_thread_num,
            time_sleep=self.spider_time_sleep,
            custom_func=self.custom_func
        )

        pipeline = Pipeline(
            stop=self.stop, pause=self.pause,
            # Bug fix: the original read self.upload_queue_max_size, which
            # is never assigned (its __init__ line is commented out); use
            # the configured upload queue size instead.
            upload_size=self.upload_queue_size,
            wait_seconds=self.upload_queue_wait_seconds
        )

        # Bug fix: the original created the pipeline thread and then
        # discarded it without ever starting it, so uploads never ran.
        pipeline.start()

        self._add_thread(obj=crawler, func=crawler.build_request_item)
        self._add_thread(obj=crawler, func=crawler.build_download_item, num=self.spider_thread_num)
        self._add_thread(obj=crawler, func=crawler.build_parse_item)

        self._init_schedule_thread()
        self.check_alive()
235
+
236
+
237
+
@@ -0,0 +1,88 @@
1
+ import time
2
+
3
+ from cobweb.base import logger
4
+ from cobweb.constant import LogTemplate
5
+ from .launcher import Launcher, check_pause
6
+
7
+
8
class LauncherAir(Launcher):
    """In-memory ("air") launcher variant: shuttles seeds between the
    local __LAUNCHER_QUEUE__ queues without an external store.

    NOTE(review): this class references members
    (__LAUNCHER_QUEUE__, _new_queue_max_size, _done_queue_max_size,
    _new_queue_wait_seconds, _done_queue_wait_seconds, _stop, _pause,
    _task_model, __DOING__, _remove_doing_seeds, _execute) and imports
    ``check_pause`` from .launcher that the Launcher class shipped in
    this same package does not define — confirm which base-class
    revision this file is meant to pair with.
    """

    # def _scheduler(self):
    #     if self.start_seeds:
    #         self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)

    @check_pause
    def _insert(self):
        # Drain the 'new' queue into 'todo', keyed by seed string with the
        # seed's priority as the value.
        seeds = {}
        # status: True when 'new' was not full before draining; used below
        # to decide whether this worker should back off.
        status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
        for _ in range(self._new_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['new'].pop()
            if not seed:
                break
            seeds[seed.to_string] = seed.params.priority
        if seeds:
            self.__LAUNCHER_QUEUE__['todo'].push(seeds)
        if status:
            time.sleep(self._new_queue_wait_seconds)

    @check_pause
    def _delete(self):
        # Collect finished seeds from 'done' and drop them from the
        # doing/working set.
        seeds = []
        status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size

        for _ in range(self._done_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['done'].pop()
            if not seed:
                break
            seeds.append(seed.to_string)

        if seeds:
            self._remove_doing_seeds(seeds)

        if status:
            time.sleep(self._done_queue_wait_seconds)

    def _polling(self):
        # Watchdog loop: decides whether the task is finished, paused or
        # still running, and logs progress. (Local name 'check_emtpy_times'
        # is a typo for 'check_empty_times'; kept as-is.)

        check_emtpy_times = 0

        while not self._stop.is_set():

            queue_not_empty_count = 0
            pooling_wait_seconds = 30

            for q in self.__LAUNCHER_QUEUE__.values():
                if q.length != 0:
                    queue_not_empty_count += 1

            if queue_not_empty_count == 0:
                # All queues drained: poll faster and count down to pause/stop.
                pooling_wait_seconds = 3
                if self._pause.is_set():
                    check_emtpy_times = 0
                    if not self._task_model:
                        # One-shot task mode: nothing left to do, shut down.
                        logger.info("Done! Ready to close thread...")
                        self._stop.set()
                elif check_emtpy_times > 2:
                    # Empty on three consecutive checks: clear the doing set
                    # and pause the workers.
                    self.__DOING__ = {}
                    self._pause.set()
                else:
                    logger.info(
                        "check whether the task is complete, "
                        f"reset times {3 - check_emtpy_times}"
                    )
                    check_emtpy_times += 1
            elif self._pause.is_set():
                # Work reappeared while paused: resume execution.
                self._pause.clear()
                self._execute()
            else:
                logger.info(LogTemplate.launcher_air_polling.format(
                    task=self.task,
                    doing_len=len(self.__DOING__.keys()),
                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
                ))

            time.sleep(pooling_wait_seconds)
87
+
88
+
@@ -0,0 +1,161 @@
1
+ import time
2
+ import threading
3
+
4
+ from cobweb.db import ApiDB
5
+ from cobweb.base import Seed, TaskQueue,logger, stop, pause
6
+ from cobweb.constant import DealModel
7
+ from .launcher import Launcher
8
+
9
+
10
class LauncherApi(Launcher):
    """Launcher variant that persists task state through an ApiDB-backed
    store: sorted-set style todo/done/fail keys plus heartbeat,
    speed-control and statistics keys.

    NOTE(review): this class reads ``self.scheduler_wait_seconds``,
    ``self.new_queue_max_size`` and ``self.done_queue_max_size`` and
    calls ``self._remove_doing_seeds``, none of which the Launcher class
    shipped in this package defines (the *_max_size assignments there
    are commented out) — confirm they are provided elsewhere.
    """

    def __init__(self, task, project, custom_setting=None, **kwargs):
        super().__init__(task, project, custom_setting, **kwargs)
        self._db = ApiDB()

        # {project:task} hash tag keeps a task's keys in one cluster slot.
        self._todo_key = "{%s:%s}:todo" % (project, task)
        self._done_key = "{%s:%s}:done" % (project, task)
        self._fail_key = "{%s:%s}:fail" % (project, task)
        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)

        self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
        self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
        self._speed_control_key = "speed_control:%s_%s" % (project, task)

        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)

        # Set once this instance is elected heartbeat owner (see _reset).
        self._heartbeat_start_event = threading.Event()

    @property
    def heartbeat(self):
        # True while any live instance of this task is heartbeating.
        return self._db.exists(self._heartbeat_key)

    def statistics(self, key, count):
        # First write in non-task mode creates the counter with a 30-day
        # TTL; afterwards it is a plain increment.
        if not self.task_model and not self._db.exists(key):
            self._db.setex(key, 86400 * 30, int(count))
        else:
            self._db.incrby(key, count)

    def _get_seed(self) -> Seed:
        """Pop a seed from the TODO queue, subject to rate control.

        Within a time window of ``self.time_window`` seconds at most
        ``self.spider_max_count`` seeds may be taken; when the budget is
        exhausted, sleep for half of the window's remaining TTL.

        :return: a seed on success, None when rate-limited.
        """
        if TaskQueue.TODO.length and not self._db.auto_incr(
            self._speed_control_key,
            t=self.time_window,
            limit=self.spider_max_count
        ):
            expire_time = self._db.ttl(self._speed_control_key)
            logger.info(f"Too fast! Please wait {expire_time} seconds...")
            time.sleep(expire_time / 2)
            return None
        return TaskQueue.TODO.pop()

    @stop
    def _reset(self):
        """Re-queue expired seeds (whose lease ran out) into the todo set."""
        # Cross-instance lock so only one worker resets at a time.
        if self._db.lock(self._reset_lock_key, t=120):

            # With a live heartbeat, only seeds older than
            # seed_reset_seconds are reclaimed; with no heartbeat,
            # everything in flight is reclaimed ("-inf").
            _min = -int(time.time()) + self.seed_reset_seconds \
                if self.heartbeat else "-inf"

            self._db.members(self._todo_key, 0, _min=_min, _max="(0")

            if not self.heartbeat:
                # No heartbeat yet: this instance becomes the owner.
                self._heartbeat_start_event.set()

            self._db.delete(self._reset_lock_key)

        time.sleep(30)

    @stop
    def _refresh(self):
        """Refresh the lease on in-flight (doing) seeds so _reset does not
        re-queue them while they are still being processed."""
        if self.doing_seeds:
            refresh_time = int(time.time())
            # Score encodes the negated timestamp plus a priority fraction;
            # xx=True only updates seeds already present in the set.
            seeds = {k: -refresh_time - v / 1e3 for k, v in self.doing_seeds.items()}
            self._db.zadd(self._todo_key, item=seeds, xx=True)
        time.sleep(3)

    @stop
    def _scheduler(self):
        """Pull schedulable seeds from the store into the local TODO queue,
        recording each one in the doing dict."""
        if not self._db.zcount(self._todo_key, 0, "(1000"):
            time.sleep(self.scheduler_wait_seconds)
        elif TaskQueue.TODO.length >= self.todo_queue_size:
            time.sleep(self.todo_queue_full_wait_seconds)
        else:
            members = self._db.members(
                self._todo_key, int(time.time()),
                count=self.todo_queue_size,
                _min=0, _max="(1000"
            )
            for member, priority in members:
                seed = Seed(member, priority=priority)
                TaskQueue.TODO.push(seed)
                self.doing_seeds[seed.to_string] = seed.params.priority

    @pause
    def _heartbeat(self):
        # Only the elected heartbeat owner keeps the key alive.
        if self._heartbeat_start_event.is_set():
            self._db.setex(self._heartbeat_key, t=5)
        time.sleep(3)

    @pause
    def _insert(self):
        """Push locally generated seeds into the stored todo set."""
        seeds = {}
        for _ in range(self.new_queue_max_size):
            if seed := TaskQueue.SEED.pop():
                seeds[seed.to_string] = seed.params.priority
        if seeds:
            # nx: never overwrite the score of an already-queued seed.
            self._db.zadd(self._todo_key, seeds, nx=True)
        if TaskQueue.SEED.length < self.new_queue_max_size:
            time.sleep(self.new_queue_wait_seconds)

    @pause
    def _delete(self):
        """Move finished seeds to the done/fail sets according to their
        status and drop their entries from the doing dict."""
        seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
        status = TaskQueue.DONE.length < self.done_queue_max_size

        for _ in range(self.done_queue_max_size):
            seed = TaskQueue.DONE.pop()
            if not seed:
                break
            if seed.params.seed_status == DealModel.fail:
                seed_info["failed"].append(seed.to_string)
            elif self.done_model == 1:
                # done_model 1: keep a record in the done set.
                seed_info["succeed"].append(seed.to_string)
            else:
                # Otherwise just remove the seed from the todo set.
                seed_info["common"].append(seed.to_string)
            seed_info['count'] += 1

        if seed_info["count"]:

            succeed_count = int(self._db.zrem(self._todo_key, *seed_info["common"]) or 0)
            succeed_count += int(self._db.done([self._todo_key, self._done_key], *seed_info["succeed"]) or 0)
            failed_count = int(self._db.done([self._todo_key, self._fail_key], *seed_info["failed"]) or 0)

            if failed_count:
                self.statistics(self._statistics_fail_key, failed_count)
            if succeed_count:
                self.statistics(self._statistics_done_key, succeed_count)

            self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])

        if status:
            time.sleep(self.done_queue_wait_seconds)
161
+
@@ -0,0 +1,96 @@
1
+ import time
2
+
3
+ from base import TaskQueue
4
+ from cobweb.base import decorators
5
+ from schedulers.scheduler_redis import RedisScheduler
6
+ from .launcher import Launcher
7
+
8
+
9
class LauncherPro(Launcher):
    """Redis-backed launcher: synchronises the in-process TaskQueue
    buffers with redis sorted sets through a RedisScheduler.
    """

    def __init__(self, task, project, custom_setting=None, **kwargs):
        super().__init__(task, project, custom_setting, **kwargs)
        # {project:task} hash tag keeps a task's keys in one cluster slot.
        self._redis_download = "{%s:%s}:download" % (project, task)
        self._redis_todo = "{%s:%s}:todo" % (project, task)
        self._scheduler = RedisScheduler(task, project)

    @decorators.stop
    def _schedule(self):
        """Top up the local TODO/DOWNLOAD queues from redis."""
        thread_sleep = self.scheduling_wait_time
        for q, key, size in [
            (TaskQueue.TODO, self._redis_todo, self.todo_queue_size),
            (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size),
        ]:
            if q.length < size:
                for item in self._scheduler.schedule(
                    key, self.scheduling_size
                ):
                    q.push(item)
                # Fetched a batch: poll again quickly.
                thread_sleep = 0.1
        time.sleep(thread_sleep)

    @decorators.pause
    def _heartbeat(self):
        """Keep the scheduler heartbeat alive while work is in progress."""
        if self._scheduler.working.is_set():
            self._scheduler.set_heartbeat()
        time.sleep(3)

    @decorators.pause
    def _reset(self):
        """Reclaim seeds whose in-flight lease has expired."""
        self._scheduler.reset(
            keys=[self._redis_todo, self._redis_download],
            reset_time=self.seed_reset_seconds
        )
        time.sleep(15)

    @decorators.pause
    def _insert(self):
        """Flush locally produced SEED/REQUEST items back to redis.

        Bug fix: the original ``while item := q.pop() and len(...) < n:``
        bound the *boolean* result of the ``and`` expression to ``item``
        (the walrus operator has the lowest precedence), so the next line
        crashed on ``item.to_string``. The size check now also runs
        before the pop, so a popped item is never silently discarded.
        """
        thread_sleep = 0.1
        for q, key, size in [
            (TaskQueue.SEED, self._redis_todo, self.seed_queue_size),
            (TaskQueue.REQUEST, self._redis_download, self.request_queue_size),
        ]:
            items = {}
            while len(items) < self.inserting_size and (item := q.pop()):
                items[item.to_string] = item.params.priority
            if q.length >= size:
                thread_sleep = self.inserting_wait_time
            if items:  # skip the store round-trip for an empty batch
                self._scheduler.insert(key, items)
        time.sleep(thread_sleep)

    @decorators.pause
    def _refresh(self):
        """Extend the lease on items this process is still working on."""
        self._scheduler.refresh(self._redis_todo, self._task_info["todo"])
        self._scheduler.refresh(self._redis_download, self._task_info["download"])
        time.sleep(3)

    @decorators.pause
    def _remove(self):
        """Delete finished items from redis and the local working maps.

        Same walrus-precedence fix as in ``_insert``.
        """
        thread_sleep = self.removing_wait_time
        for q, key, size in [
            (TaskQueue.DELETE, self._redis_todo, self.delete_queue_size),
            (TaskQueue.DONE, self._redis_download, self.done_queue_size),
        ]:
            items = []
            while len(items) < self.removing_size and (item := q.pop()):
                items.append(item)
            if items:  # skip the store round-trip for an empty batch
                self._scheduler.delete(key, *items)
                # key is "{project:task}:todo" / ":download"; its suffix
                # matches the _task_info bucket name.
                self.remove_working_items(key.split(":")[-1], items)
            if q.length >= size:
                thread_sleep = 0.1
        time.sleep(thread_sleep)

    def _init_schedule_thread(self):
        """Register all scheduling threads with the base launcher."""
        self._add_thread(func=self._heartbeat)
        self._add_thread(func=self._reset)
        self._add_thread(func=self._refresh)
        self._add_thread(func=self._schedule)
        self._add_thread(func=self._insert)
        self._add_thread(func=self._remove)
        self._add_thread(func=self._polling)
@@ -0,0 +1,47 @@
1
+ import threading
2
+ import time
3
+ from functools import wraps
4
+
5
+
6
def add_thread(num=1):
    """Decorator factory: calling the decorated method queues *num*
    threads (each targeting the undecorated function with the same
    arguments) onto ``self._threads`` instead of executing the body.

    The wrapper returns None and does not start the queued threads.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(self, *args):
            base = func.__name__
            for idx in range(num):
                thread_name = f"{base}_{idx}" if num > 1 else base
                worker = threading.Thread(
                    name=thread_name, target=func, args=(self, *args)
                )
                self._threads.append(worker)
        return wrapper

    return decorator
16
+
17
+
18
def pause(func):
    """Run *func* in an endless loop, printing any exception it raises
    and sleeping 100 ms between iterations.

    The returned wrapper never returns to its caller.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        while True:
            try:
                func(*args, **kwargs)
            except Exception as exc:
                # Best-effort: report and keep looping.
                print(str(exc))
            finally:
                time.sleep(0.1)

    return wrapper
30
+
31
+
32
class TTT:
    """Scratch/demo class exercising the add_thread + pause decorators.

    NOTE(review): this is test scaffolding with module-level side
    effects — importing the module queues and then starts non-daemon
    threads whose pause-wrapped target loops forever, so the process
    never exits. It should not ship in a release wheel.
    """

    # Shared across instances: decorated calls append Thread objects here.
    _threads = []

    @add_thread()
    @pause
    def tt(self):
        print("hello")
        time.sleep(1)

# Calling tt() only queues a thread onto TTT._threads; the body does not run.
tttt = TTT()
tttt.tt()
print(TTT._threads)


# Each started thread runs the pause-wrapped tt forever ("hello" once a second).
for _ in TTT._threads:
    _.start()
@@ -0,0 +1,3 @@
1
+ from .pipeline import Pipeline
2
+ from .pipeline_console import Console
3
+ from .pipeline_loghub import Loghub
@@ -0,0 +1,68 @@
1
+ import time
2
+ import threading
3
+
4
+ from abc import ABC, abstractmethod
5
+
6
+ from cobweb.utils import TaskQueue
7
+ from cobweb.base import BaseItem, logger
8
+
9
+
10
class Pipeline(threading.Thread, ABC):
    """Abstract upload pipeline thread.

    Drains TaskQueue.UPLOAD in batches of at most ``upload_size`` items,
    converts each item via ``build`` and ships the per-table batches via
    ``upload``. If anything in a batch fails, every seed of the batch is
    marked failed before the seeds are pushed to TaskQueue.DONE.
    """

    def __init__(
        self,
        stop: threading.Event,
        pause: threading.Event,
        upload_size: int,
        wait_seconds: int
    ):
        super().__init__()
        # Shared lifecycle events, owned by the launcher.
        self._stop = stop
        self._pause = pause

        self.upload_size = upload_size    # max items per upload batch
        self.wait_seconds = wait_seconds  # back-off when the queue is light

    @abstractmethod
    def build(self, item: BaseItem) -> dict:
        """Convert one queue item into an uploadable row dict."""
        pass

    @abstractmethod
    def upload(self, table: str, data: list) -> bool:
        """Ship one table's batch of rows to the storage backend."""
        pass

    def run(self):
        """Batch-consume the UPLOAD queue until the stop event is set."""
        while not self._stop.is_set():
            if not TaskQueue.UPLOAD.length:
                time.sleep(self.wait_seconds)
                continue
            if TaskQueue.UPLOAD.length < self.upload_size:
                # Partial batch: give producers a moment to top it up.
                time.sleep(self.wait_seconds)

            batch_ok = True
            table_rows, batch_seeds = {}, []
            try:
                for _ in range(self.upload_size):
                    entry = TaskQueue.UPLOAD.pop()
                    if not entry:
                        break
                    batch_seeds.append(entry.seed)
                    row = self.build(entry)
                    table_rows.setdefault(entry.table, []).append(row)
                for table_name, rows in table_rows.items():
                    try:
                        self.upload(table_name, rows)
                    except Exception as e:
                        logger.info(e)
                        batch_ok = False
            except Exception as e:
                logger.info(e)
                batch_ok = False

            if not batch_ok:
                # Any failure poisons the whole batch: flag every seed.
                for failed_seed in batch_seeds:
                    failed_seed.params.seed_status = "deal model: fail"
            if batch_seeds:
                TaskQueue.DONE.push(batch_seeds)

        logger.info("upload pipeline close!")
67
+
68
+