cobweb-launcher 1.3.14__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. cobweb/__init__.py +1 -1
  2. cobweb/base/__init__.py +4 -149
  3. cobweb/base/common_queue.py +0 -13
  4. cobweb/base/request.py +2 -14
  5. cobweb/base/seed.py +16 -12
  6. cobweb/constant.py +0 -16
  7. cobweb/crawlers/crawler.py +3 -85
  8. cobweb/db/redis_db.py +109 -52
  9. cobweb/launchers/__init__.py +8 -2
  10. cobweb/launchers/distributor.py +171 -0
  11. cobweb/launchers/launcher.py +87 -131
  12. cobweb/launchers/uploader.py +65 -0
  13. cobweb/pipelines/pipeline.py +3 -36
  14. cobweb/schedulers/__init__.py +1 -3
  15. cobweb/schedulers/launcher_air.py +93 -0
  16. cobweb/schedulers/launcher_api.py +225 -0
  17. cobweb/schedulers/scheduler.py +85 -0
  18. cobweb/schedulers/scheduler_with_redis.py +177 -0
  19. cobweb/setting.py +15 -32
  20. cobweb/utils/__init__.py +2 -1
  21. cobweb/utils/decorators.py +43 -0
  22. cobweb/utils/dotting.py +55 -0
  23. cobweb/utils/oss.py +28 -9
  24. {cobweb_launcher-1.3.14.dist-info → cobweb_launcher-3.1.0.dist-info}/METADATA +1 -1
  25. cobweb_launcher-3.1.0.dist-info/RECORD +41 -0
  26. cobweb/base/basic.py +0 -295
  27. cobweb/base/dotting.py +0 -35
  28. cobweb/launchers/launcher_air.py +0 -88
  29. cobweb/launchers/launcher_api.py +0 -88
  30. cobweb/launchers/launcher_pro.py +0 -88
  31. cobweb/schedulers/scheduler_api.py +0 -72
  32. cobweb/schedulers/scheduler_redis.py +0 -72
  33. cobweb_launcher-1.3.14.dist-info/RECORD +0 -40
  34. {cobweb_launcher-1.3.14.dist-info → cobweb_launcher-3.1.0.dist-info}/LICENSE +0 -0
  35. {cobweb_launcher-1.3.14.dist-info → cobweb_launcher-3.1.0.dist-info}/WHEEL +0 -0
  36. {cobweb_launcher-1.3.14.dist-info → cobweb_launcher-3.1.0.dist-info}/top_level.txt +0 -0
cobweb/schedulers/launcher_api.py ADDED
@@ -0,0 +1,225 @@
+ import time
+ import threading
+
+ from cobweb.db import ApiDB
+ from cobweb.base import Seed, logger
+ from cobweb.constant import DealModel, LogTemplate
+ from .launcher import Launcher, check_pause
+
+
+ class LauncherApi(Launcher):
+
+     def __init__(self, task, project, custom_setting=None, **kwargs):
+         super().__init__(task, project, custom_setting, **kwargs)
+         self._db = ApiDB()
+
+         self._todo_key = "{%s:%s}:todo" % (project, task)
+         self._done_key = "{%s:%s}:done" % (project, task)
+         self._fail_key = "{%s:%s}:fail" % (project, task)
+         self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
+
+         self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
+         self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
+         self._speed_control_key = "speed_control:%s_%s" % (project, task)
+
+         self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
+
+         # self._bf_key = "bloom_%s_%s" % (project, task)
+         # self._bf = BloomFilter(self._bf_key)
+
+         self._heartbeat_start_event = threading.Event()
+         self._redis_queue_empty_event = threading.Event()
+
+     @property
+     def heartbeat(self):
+         return self._db.exists(self._heartbeat_key)
+
+     def statistics(self, key, count):
+         if not self._task_model and not self._db.exists(key):
+             self._db.setex(key, 86400 * 30, int(count))
+         else:
+             self._db.incrby(key, count)
+
+     def _get_seed(self) -> Seed:
+         """
+         Fetch a seed from the todo queue, subject to rate control.
+         Uses a time window of self._time_window seconds and checks whether the
+         crawl count within that window has reached the threshold
+         (self._spider_max_count).
+         :return: a Seed when allowed, otherwise None
+         """
+         if (self._speed_control and self.__LAUNCHER_QUEUE__["todo"].length and
+                 not self._db.auto_incr(self._speed_control_key, t=self._time_window, limit=self._spider_max_count)):
+             expire_time = self._db.ttl(self._speed_control_key)
+             if isinstance(expire_time, int) and expire_time <= -1:
+                 self._db.delete(self._speed_control_key)
+             elif isinstance(expire_time, int):
+                 logger.info(f"Too fast! Please wait {expire_time} seconds...")
+                 time.sleep(expire_time / 2)
+             return None
+         seed = self.__LAUNCHER_QUEUE__["todo"].pop()
+         return seed
+
+     @check_pause
+     def _execute_heartbeat(self):
+         if self._heartbeat_start_event.is_set():
+             self._db.setex(self._heartbeat_key, 5)
+         time.sleep(3)
+
+     @check_pause
+     def _reset(self):
+         """
+         Check for expired seeds and re-add them to the redis cache.
+         """
+         reset_wait_seconds = 30
+         if self._db.lock(self._reset_lock_key, t=120):
+
+             _min = -int(time.time()) + self._seed_reset_seconds \
+                 if self.heartbeat else "-inf"
+
+             self._db.members(self._todo_key, 0, _min=_min, _max="(0")
+
+             if not self.heartbeat:
+                 self._heartbeat_start_event.set()
+
+             self._db.delete(self._reset_lock_key)
+
+         time.sleep(reset_wait_seconds)
+
+     @check_pause
+     def _scheduler(self):
+         """
+         Scheduling: fetch seeds from the redis queue and record them in the doing dict.
+         """
+         if not self._db.zcount(self._todo_key, 0, "(1000"):
+             time.sleep(self._scheduler_wait_seconds)
+         elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
+             time.sleep(self._todo_queue_full_wait_seconds)
+         else:
+             members = self._db.members(
+                 self._todo_key, int(time.time()),
+                 count=self._todo_queue_size,
+                 _min=0, _max="(1000"
+             )
+             for member, priority in members:
+                 seed = Seed(member, priority=priority)
+                 self.__LAUNCHER_QUEUE__['todo'].push(seed)
+                 self.__DOING__[seed.to_string] = seed.params.priority
+
+     @check_pause
+     def _insert(self):
+         """
+         Push new seeds into the redis queue.
+         """
+         new_seeds = {}
+         del_seeds = set()
+         status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
+         for _ in range(self._new_queue_max_size):
+             seed_tuple = self.__LAUNCHER_QUEUE__['new'].pop()
+             if not seed_tuple:
+                 break
+             seed, new_seed = seed_tuple
+             new_seeds[new_seed.to_string] = new_seed.params.priority
+             del_seeds.add(seed)
+         if new_seeds:
+             self._db.zadd(self._todo_key, new_seeds, nx=True)
+         if del_seeds:
+             self.__LAUNCHER_QUEUE__['done'].push(list(del_seeds))
+         if status:
+             time.sleep(self._new_queue_wait_seconds)
+
+     @check_pause
+     def _refresh(self):
+         """
+         Refresh the lease on doing seeds so _reset does not re-consume them.
+         """
+         if self.__DOING__:
+             refresh_time = int(time.time())
+             seeds = {k: -refresh_time - v / 1000 for k, v in self.__DOING__.items()}
+             self._db.zadd(self._todo_key, item=seeds, xx=True)
+         time.sleep(15)
+
+     @check_pause
+     def _delete(self):
+         """
+         Remove seeds from the queue, route them to the success or failure queue
+         by status, and drop their entries from the doing dict.
+         """
+         # seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
+
+         seed_list = []
+         status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
+
+         for _ in range(self._done_queue_max_size):
+             seed = self.__LAUNCHER_QUEUE__['done'].pop()
+             if not seed:
+                 break
+             seed_list.append(seed.to_string)
+
+         if seed_list:
+             self._db.zrem(self._todo_key, *seed_list)
+             self._remove_doing_seeds(seed_list)
+
+         if status:
+             time.sleep(self._done_queue_wait_seconds)
+
+     def _polling(self):
+         wait_scheduler_execute = True
+         check_empty_times = 0
+         while not self._stop.is_set():
+             queue_not_empty_count = 0
+             polling_wait_seconds = 30
+
+             for q in self.__LAUNCHER_QUEUE__.values():
+                 if q.length != 0:
+                     queue_not_empty_count += 1
+                     wait_scheduler_execute = False
+
+             if queue_not_empty_count == 0:
+                 polling_wait_seconds = 3
+                 if self._pause.is_set():
+                     check_empty_times = 0
+                     if not self._task_model and (
+                         not wait_scheduler_execute or
+                         int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
+                     ):
+                         logger.info("Done! ready to close thread...")
+                         self._stop.set()
+                     elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
+                         logger.info(f"Recovery {self.task} task run!")
+                         self._pause.clear()
+                         self._execute()
+                     else:
+                         logger.info("pause! waiting for resume...")
+                 elif check_empty_times > 2:
+                     self.__DOING__ = {}
+                     seed_count = self._db.zcard(self._todo_key)
+                     logger.info(f"Seeds left in queue: {seed_count}")
+                     if not seed_count:
+                         logger.info("Done! pause set...")
+                         self._pause.set()
+                     else:
+                         self._pause.clear()
+                 else:
+                     logger.info(
+                         "check whether the task is complete, "
+                         f"reset times {3 - check_empty_times}"
+                     )
+                     check_empty_times += 1
+             else:
+                 if self._pause.is_set():
+                     self._pause.clear()
+                 logger.info(LogTemplate.launcher_pro_polling.format(
+                     task=self.task,
+                     doing_len=len(self.__DOING__.keys()),
+                     todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
+                     done_len=self.__LAUNCHER_QUEUE__['done'].length,
+                     redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
+                     redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
+                     redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
+                     upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
+                 ))
+
+             time.sleep(polling_wait_seconds)
+
+         logger.info("Done! Ready to close thread...")
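The rate control in `_get_seed` above leans on `ApiDB.auto_incr`, whose implementation is not part of this diff. A minimal sketch of the same fixed-window idea, assuming a plain `redis` client instead of cobweb's HTTP-backed `ApiDB` (key name and limits are illustrative):

import time
import redis  # assumed stand-in for cobweb's ApiDB

r = redis.Redis()

def auto_incr(key: str, t: int = 60, limit: int = 1000) -> bool:
    """Fixed-window counter: allow at most `limit` hits per `t`-second window."""
    count, ttl = r.pipeline().incr(key).ttl(key).execute()
    if ttl == -1:        # first hit of a fresh window: start the expiry clock
        r.expire(key, t)
    return count <= limit

# Mirrors LauncherApi._get_seed: when the window is full, back off for half
# the window's remaining TTL before trying again.
if not auto_incr("speed_control:project_task"):
    wait = max(r.ttl("speed_control:project_task"), 0)
    print(f"Too fast! Please wait {wait} seconds...")
    time.sleep(wait / 2)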
cobweb/schedulers/scheduler.py ADDED
@@ -0,0 +1,85 @@
+ import threading
+
+ from cobweb import setting
+ from typing import Callable
+ from cobweb.base import Queue
+ from abc import ABC, abstractmethod
+
+
+ class Scheduler(ABC, threading.Thread):
+
+     __LAUNCHER_FUNC__ = ["_reset", "_scheduler", "_insert", "_refresh", "_delete"]
+
+     def __init__(
+         self,
+         task,
+         project,
+         stop: threading.Event,
+         pause: threading.Event,
+         new: Queue,
+         todo: Queue,
+         done: Queue,
+         upload: Queue,
+         register: Callable
+     ):
+         super().__init__()
+         self.task = task
+         self.project = project
+
+         self.task_model = setting.TASK_MODEL
+         self.seed_reset_seconds = setting.SEED_RESET_SECONDS
+         self.scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
+         self.new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
+         self.done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
+         self.todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
+         self.before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS
+
+         self.todo_queue_size = setting.TODO_QUEUE_SIZE
+         self.new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
+         self.done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
+         self.upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE
+
+         self.stop = stop
+         self.pause = pause
+
+         self.new = new
+         self.todo = todo
+         self.done = done
+         self.upload = upload
+
+         self.register = register
+
+         self.working_seeds = dict()
+
+     def is_empty(self):
+         return (self.new.length == 0 and self.todo.length == 0
+                 and self.done.length == 0 and self.upload.length == 0)
+
+     def remove_working_seeds(self, seeds: list = None):
+         for seed in seeds or []:
+             if seed in self.working_seeds:
+                 self.working_seeds.pop(seed)
+
+     @abstractmethod
+     def reset(self):
+         ...
+
+     @abstractmethod
+     def schedule(self):
+         ...
+
+     @abstractmethod
+     def insert(self):
+         ...
+
+     @abstractmethod
+     def refresh(self):
+         ...
+
+     @abstractmethod
+     def delete(self):
+         ...
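`Scheduler` pairs `ABC` with `threading.Thread`: a concrete scheduler implements the five lifecycle methods and, from `run()`, hands each one to the `register` callback so the launcher can drive them on worker threads (as `RedisScheduler` does below). A hypothetical stub subclass, only to illustrate the contract:

from cobweb.schedulers.scheduler import Scheduler

class MemoryScheduler(Scheduler):
    """Hypothetical subclass; the storage calls are stubbed out."""

    def reset(self): ...      # re-queue seeds whose lease expired
    def schedule(self): ...   # move stored seeds into self.todo
    def insert(self): ...     # persist seeds arriving on self.new
    def refresh(self): ...    # extend leases for self.working_seeds
    def delete(self): ...     # drop finished seeds from storage

    def run(self):
        for func in (self.reset, self.schedule, self.insert, self.refresh, self.delete):
            self.register(func, tag="scheduler")
        self.stop.wait()      # park until the launcher signals shutdown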
cobweb/schedulers/scheduler_with_redis.py ADDED
@@ -0,0 +1,177 @@
+ import os
+ import time
+ import threading
+ from typing import Callable
+ from cobweb.db import RedisDB, ApiDB
+ from cobweb.utils import check_pause
+ from cobweb.base import Queue, Seed, logger
+ from cobweb.constant import LogTemplate
+ from .scheduler import Scheduler
+
+ use_api = bool(int(os.getenv("REDIS_API", "0")))
+
+
+ class RedisScheduler(Scheduler):
+
+     def __init__(
+         self,
+         task,
+         project,
+         stop: threading.Event,
+         pause: threading.Event,
+         new: Queue,
+         todo: Queue,
+         done: Queue,
+         upload: Queue,
+         register: Callable
+     ):
+         super().__init__(task, project, stop, pause, new, todo, done, upload, register)
+         self.todo_key = "{%s:%s}:todo" % (project, task)
+         self.done_key = "{%s:%s}:done" % (project, task)
+         self.fail_key = "{%s:%s}:fail" % (project, task)
+         self.heartbeat_key = "heartbeat:%s_%s" % (project, task)
+         self.speed_control_key = "speed_control:%s_%s" % (project, task)
+         self.reset_lock_key = "lock:reset:%s_%s" % (project, task)
+         self.redis_queue_empty_event = threading.Event()
+         self.db = ApiDB() if use_api else RedisDB()
+
+     @check_pause
+     def reset(self):
+         """
+         Check for expired seeds and re-add them to the redis cache.
+         """
+         reset_wait_seconds = 30
+         if self.db.lock(self.reset_lock_key, t=60):
+             _min = -int(time.time()) + self.seed_reset_seconds
+             self.db.members(self.todo_key, 0, _min=_min, _max="(0")
+             self.db.delete(self.reset_lock_key)
+         time.sleep(reset_wait_seconds)
+
+     @check_pause
+     def schedule(self):
+         """
+         Scheduling: fetch seeds from the redis queue and record them as working seeds.
+         """
+         if not self.db.zcount(self.todo_key, 0, "(1000"):
+             time.sleep(self.scheduler_wait_seconds)
+         elif self.todo.length >= self.todo_queue_size:
+             time.sleep(self.todo_queue_full_wait_seconds)
+         else:
+             members = self.db.members(
+                 self.todo_key, int(time.time()),
+                 count=self.todo_queue_size,
+                 _min=0, _max="(1000"
+             )
+             for member, priority in members:
+                 seed = Seed(member, priority=priority)
+                 self.working_seeds[seed.to_string] = seed.params.priority
+                 self.todo.push(seed)
+
+     @check_pause
+     def insert(self):
+         """
+         Push new seeds into the redis queue.
+         """
+         new_seeds = {}
+         del_seeds = set()
+         status = self.new.length < self.new_queue_max_size
+         for _ in range(self.new_queue_max_size):
+             seed_tuple = self.new.pop()
+             if not seed_tuple:
+                 break
+             seed, new_seed = seed_tuple
+             new_seeds[new_seed.to_string] = new_seed.params.priority
+             del_seeds.add(seed)
+         if new_seeds:
+             self.db.zadd(self.todo_key, new_seeds, nx=True)
+         if del_seeds:
+             self.done.push(list(del_seeds))
+         if status:
+             time.sleep(self.new_queue_wait_seconds)
+
+     @check_pause
+     def refresh(self):
+         """
+         Refresh the lease on working seeds so reset does not re-consume them.
+         """
+         if self.working_seeds:
+             refresh_time = int(time.time())
+             seeds = {k: -refresh_time - v / 1000 for k, v in self.working_seeds.items()}
+             self.db.zadd(self.todo_key, item=seeds, xx=True)
+         time.sleep(3)
+
+     @check_pause
+     def delete(self):
+         """
+         Remove finished seeds from the redis queue and drop their entries from
+         the working-seeds dict.
+         """
+         seed_list = []
+         status = self.done.length < self.done_queue_max_size
+
+         for _ in range(self.done_queue_max_size):
+             seed = self.done.pop()
+             if not seed:
+                 break
+             seed_list.append(seed.to_string)
+
+         if seed_list:
+             self.db.zrem(self.todo_key, *seed_list)
+             self.remove_working_seeds(seed_list)
+
+         if status:
+             time.sleep(self.done_queue_wait_seconds)
+
+     def run(self):
+         start_time = int(time.time())
+
+         self.register(self.reset, tag="scheduler")
+         self.register(self.insert, tag="scheduler")
+         self.register(self.delete, tag="scheduler")
+         self.register(self.refresh, tag="scheduler")
+         self.register(self.schedule, tag="scheduler")
+
+         while not self.stop.is_set():
+             working_count = len(self.working_seeds.keys())
+             memory_count = self.db.zcount(self.todo_key, "-inf", "(0")
+             todo_count = self.db.zcount(self.todo_key, 0, "(1000")
+             all_count = self.db.zcard(self.todo_key)
+
+             if self.is_empty():
+                 if self.pause.is_set():
+                     execute_time = int(time.time()) - start_time
+                     if not self.task_model and execute_time > self.before_scheduler_wait_seconds:
+                         logger.info("Done! ready to close thread...")
+                         self.stop.set()
+                     elif todo_count:
+                         logger.info(f"Recovery {self.task} task run! todo seeds count: {todo_count}, queue length: {all_count}")
+                         self.pause.clear()
+                         # self.execute()
+                     else:
+                         logger.info("pause! waiting for resume...")
+                 else:
+                     if all_count:
+                         logger.info(f"todo seeds count: {todo_count}, queue length: {all_count}")
+                     else:
+                         logger.info("Done! pause set...")
+                         self.pause.set()
+             else:
+                 if self.pause.is_set():
+                     self.pause.clear()
+                 logger.info(LogTemplate.launcher_pro_polling.format(
+                     task=self.task,
+                     doing_len=working_count,
+                     todo_len=self.todo.length,
+                     done_len=self.done.length,
+                     redis_seed_count=all_count,
+                     redis_todo_len=todo_count,
+                     redis_doing_len=memory_count,
+                     upload_len=self.upload.length,
+                 ))
+
+             time.sleep(30)
+
+         logger.info("Done! Ready to close thread...")
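The score ranges queried above (0 to "(1000" for pending seeds, "-inf" to "(0" for leased ones) imply how seed state is packed into the todo zset: a waiting seed's score is its priority, and refresh() flips it to -timestamp - priority / 1000 so reset() can find stale leases with a single range query. A sketch of that encoding as inferred from this diff (not a public cobweb API):

import time

def pending_score(priority: int) -> float:
    # waiting seeds: score == priority, matched by zcount(key, 0, "(1000")
    return float(priority)

def doing_score(priority: int, leased_at: int = None) -> float:
    # leased seeds: negative score packing lease time and priority,
    # matched by zcount(key, "-inf", "(0") -- this is refresh()'s formula
    return -(leased_at or int(time.time())) - priority / 1000

def lease_expired(score: float, reset_seconds: int = 60) -> bool:
    # reset() re-queues members scored in [-now + reset_seconds, 0),
    # i.e. seeds last refreshed more than reset_seconds ago
    return score >= -int(time.time()) + reset_seconds

score = doing_score(priority=500)
print(lease_expired(score))          # False immediately after a refresh
print(lease_expired(score + 120))    # True for a lease ~2 minutes stale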
cobweb/setting.py CHANGED
@@ -30,49 +30,29 @@ OSS_MIN_UPLOAD_SIZE = 1024
  # Crawler selection
  CRAWLER = "cobweb.crawlers.Crawler"

- # Data storage pipeline
- PIPELINE = "cobweb.pipelines.pipeline_console.Console"
+ # Data pipeline
+ PIPELINE = "cobweb.pipelines.Console"
+
+ # Scheduler
+ SCHEDULER = "cobweb.schedulers.RedisScheduler"


  # Launcher wait times

  BEFORE_SCHEDULER_WAIT_SECONDS = 60  # wait time before scheduling; only applies to one-off tasks
-
- SCHEDULING_WAIT_TIME = 15  # SCHEDULER ITEM scheduling wait time
- INSERTING_WAIT_TIME = 30  # INSERT ITEM wait time
- REMOVING_WAIT_TIME = 5  # REMOVE ITEM wait time
- RESET_WAIT_TIME = 30  # RESET ITEM wait time
- UPLOAD_WAIT_TIME = 15  # upload wait time
-
- TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait time when the queue is full
+ SCHEDULER_WAIT_SECONDS = 15  # scheduling wait time
+ TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait time when the todo queue is full
  NEW_QUEUE_WAIT_SECONDS = 30  # new queue wait time
  DONE_QUEUE_WAIT_SECONDS = 5  # done queue wait time
  UPLOAD_QUEUE_WAIT_SECONDS = 15  # upload queue wait time
- SEED_RESET_SECONDS = 30  # seed reset interval
+ SEED_RESET_SECONDS = 60  # seed reset interval


  # Launcher queue sizes
- SCHEDULING_SIZE = 100  # scheduling queue size
- INSERTING_SIZE = 100  # INSERT size
- REMOVING_SIZE = 100  # REMOVE size
-
- # SEED = Queue()  # queue for newly added task seeds
- # TODO = Queue()  # task seed queue
- # REQUEST = Queue()  # request queue
- # DOWNLOAD = Queue()  # download task queue
- # RESPONSE = Queue()  # response queue
- # DONE = Queue()  # download-finished queue
- # UPLOAD = Queue()  # task upload queue
- # DELETE = Queue()  # task delete queue
-
- SEED_QUEUE_SIZE = 100  # seed queue size
- TODO_QUEUE_SIZE = 100  # todo queue size
- REQUEST_QUEUE_SIZE = 100  # request queue size
- DOWNLOAD_QUEUE_SIZE = 100  # download queue size
- RESPONSE_QUEUE_SIZE = 100  # response queue size
- DONE_QUEUE_SIZE = 100  # done queue size
- UPLOAD_QUEUE_SIZE = 100  # upload queue size
- DELETE_QUEUE_SIZE = 100  # delete queue size
+ TODO_QUEUE_SIZE = 100  # todo queue size
+ NEW_QUEUE_MAX_SIZE = 100  # new queue size
+ DONE_QUEUE_MAX_SIZE = 100  # done queue size
+ UPLOAD_QUEUE_MAX_SIZE = 100  # upload queue size

  # DONE_MODEL in (0, 1): seed completion mode
  DONE_MODEL = 0  # 0: seeds consumed successfully are removed from the queue directly, failures go to the fail queue; 1: successes go to the done queue, failures to the fail queue
@@ -81,6 +61,7 @@ DONE_MODEL = 0
  SPIDER_THREAD_NUM = 10
  SPIDER_MAX_RETRIES = 5
  SPIDER_TIME_SLEEP = 10
+ RECORD_FAILED_SPIDER = False

  SPIDER_MAX_COUNT = 1000  # max crawl count within the time window
  TIME_WINDOW = 60  # fixed rate-control time window (seconds)
@@ -88,6 +69,8 @@ TIME_WINDOW = 60
  # Task mode
  TASK_MODEL = 0  # 0: one-off, 1: resident

+ # Rate control
+ SPEED_CONTROL = 1  # 0: off, 1: on

  # Bloom filter
  CAPACITY = 100000000
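These are module-level defaults; each launcher also accepts a `custom_setting` argument (see `LauncherApi.__init__` earlier in this diff). A hypothetical per-task override using the renamed 3.x keys, assuming `custom_setting` entries map one-to-one onto the names above (the merge logic lives in launcher.py, which is not shown here):

# Illustrative only: keys come from cobweb/setting.py, values are made up.
custom_setting = {
    "SCHEDULER": "cobweb.schedulers.RedisScheduler",  # new 3.x setting
    "SEED_RESET_SECONDS": 120,   # default moved from 30 to 60 in this release
    "SPEED_CONTROL": 1,          # new flag gating the rate limiter
    "SPIDER_MAX_COUNT": 500,     # max fetches per TIME_WINDOW
    "TIME_WINDOW": 60,
    "TODO_QUEUE_SIZE": 200,
}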
cobweb/utils/__init__.py CHANGED
@@ -1,5 +1,6 @@
  from .oss import OssUtil
  from .tools import *
  from .bloom import BloomFilter
- # from .task_queue import TaskQueue
+ from .dotting import LoghubDot
+ from .decorators import check_pause

cobweb/utils/decorators.py ADDED
@@ -0,0 +1,43 @@
+ import time
+ from functools import wraps
+
+ from cobweb.base import logger
+
+
+ def decorator_oss_db(exception, retries=3):
+     def decorator(func):
+         @wraps(func)
+         def wrapper(callback_func, *args, **kwargs):
+             result = None
+             for i in range(retries):
+                 msg = None
+                 try:
+                     return func(callback_func, *args, **kwargs)
+                 except Exception as e:
+                     result = None
+                     msg = e
+                 finally:
+                     if result:
+                         return result
+
+                     if i >= retries - 1 and msg:
+                         raise exception(msg)
+
+         return wrapper
+
+     return decorator
+
+
+ def check_pause(func):
+     @wraps(func)
+     def wrapper(self, *args, **kwargs):
+         while not self.pause.is_set():
+             try:
+                 func(self, *args, **kwargs)
+             except Exception as e:
+                 logger.info(f"{func.__name__}: " + str(e))
+             finally:
+                 time.sleep(0.1)
+         logger.info(f"pause: {func.__name__} thread close ...")
+
+     return wrapper
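`check_pause` turns a method into a worker loop: the body re-runs (with per-iteration exception logging and a 0.1s breather) until the owning object's `pause` event is set. A minimal usage sketch with a hypothetical `Worker` class; the only contract is a `self.pause` threading.Event:

import threading
import time
from cobweb.utils import check_pause

class Worker:
    def __init__(self):
        self.pause = threading.Event()  # required by the decorator

    @check_pause
    def tick(self):
        print("working...")
        time.sleep(1)

w = Worker()
threading.Thread(target=w.tick, daemon=True).start()
time.sleep(3)
w.pause.set()  # loop exits; "pause: tick thread close ..." is logged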
cobweb/utils/dotting.py ADDED
@@ -0,0 +1,55 @@
+ import json
+ import time
+
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
+
+ from cobweb.base import Queue, logger
+ from cobweb import setting
+
+
+ class LoghubDot:
+
+     def __init__(self):
+         self.client = LogClient(**setting.LOGHUB_CONFIG)
+         self.queue = Queue()
+
+     def build(self, topic, **kwargs):
+
+         temp = {}
+         log_item = LogItem()
+         for key, value in kwargs.items():
+             if not isinstance(value, str):
+                 temp[key] = json.dumps(value, ensure_ascii=False)
+             else:
+                 temp[key] = value
+         contents = sorted(temp.items())
+         log_item.set_contents(contents)
+         self.queue.push((topic, log_item), direct_insertion=True)
+
+     def build_run(self):
+         while True:
+             start_time = int(time.time())
+             while True:
+                 cost_time = int(time.time()) - start_time
+                 if self.queue.length >= 1000 or cost_time > 10:
+                     break
+                 time.sleep(0.5)
+             try:
+                 log_item_info = {}
+                 for _ in range(1000):
+                     its = self.queue.pop()
+                     if not its:
+                         break
+                     topic, item = its
+                     log_item_info.setdefault(topic, []).append(item)
+                 for topic, log_items in log_item_info.items():
+                     request = PutLogsRequest(
+                         project="databee-download-log",
+                         logstore="log",
+                         topic=topic,
+                         logitems=log_items,
+                         compress=True
+                     )
+                     self.client.put_logs(request=request)
+             except Exception as e:
+                 logger.info(str(e))
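`build()` only enqueues log items, so `build_run()` (which flushes batches of up to 1000 items, or whatever has accumulated after a 10-second wait) has to run on its own thread. A wiring sketch, assuming `setting.LOGHUB_CONFIG` holds valid aliyun-log credentials; note the Loghub project name is hardcoded above:

import threading
from cobweb.utils import LoghubDot

dot = LoghubDot()
threading.Thread(target=dot.build_run, daemon=True).start()

# Non-string values are JSON-encoded by build() before upload.
dot.build("download", url="https://example.com", status=200, cost={"ms": 132})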