cobweb-launcher 1.2.49__py3-none-any.whl → 1.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. cobweb/base/__init__.py +141 -4
  2. cobweb/base/basic.py +28 -82
  3. cobweb/base/common_queue.py +13 -0
  4. cobweb/base/dotting.py +1 -1
  5. cobweb/base/request.py +14 -2
  6. cobweb/base/seed.py +10 -6
  7. cobweb/constant.py +16 -0
  8. cobweb/crawlers/crawler.py +51 -181
  9. cobweb/db/redis_db.py +28 -0
  10. cobweb/launchers/__init__.py +2 -2
  11. cobweb/launchers/launcher.py +110 -141
  12. cobweb/launchers/launcher_api.py +66 -114
  13. cobweb/launchers/launcher_pro.py +76 -194
  14. cobweb/pipelines/base_pipeline.py +54 -0
  15. cobweb/pipelines/loghub_pipeline.py +34 -0
  16. cobweb/pipelines/pipeline.py +25 -49
  17. cobweb/schedulers/__init__.py +0 -2
  18. cobweb/schedulers/scheduler_redis.py +5 -8
  19. cobweb/setting.py +29 -6
  20. cobweb/utils/dotting.py +10 -42
  21. cobweb_/__init__.py +2 -0
  22. cobweb_/base/__init__.py +9 -0
  23. cobweb_/base/common_queue.py +30 -0
  24. cobweb_/base/decorators.py +40 -0
  25. cobweb_/base/item.py +46 -0
  26. cobweb_/base/log.py +94 -0
  27. cobweb_/base/request.py +82 -0
  28. cobweb_/base/response.py +23 -0
  29. cobweb_/base/seed.py +114 -0
  30. cobweb_/constant.py +94 -0
  31. cobweb_/crawlers/__init__.py +1 -0
  32. cobweb_/crawlers/crawler.py +184 -0
  33. cobweb_/db/__init__.py +2 -0
  34. cobweb_/db/api_db.py +82 -0
  35. cobweb_/db/redis_db.py +130 -0
  36. cobweb_/exceptions/__init__.py +1 -0
  37. cobweb_/exceptions/oss_db_exception.py +28 -0
  38. cobweb_/launchers/__init__.py +3 -0
  39. cobweb_/launchers/launcher.py +235 -0
  40. cobweb_/launchers/launcher_air.py +88 -0
  41. cobweb_/launchers/launcher_api.py +221 -0
  42. cobweb_/launchers/launcher_pro.py +222 -0
  43. cobweb_/pipelines/__init__.py +3 -0
  44. cobweb_/pipelines/pipeline.py +69 -0
  45. cobweb_/pipelines/pipeline_console.py +22 -0
  46. cobweb_/pipelines/pipeline_loghub.py +34 -0
  47. cobweb_/setting.py +74 -0
  48. cobweb_/utils/__init__.py +5 -0
  49. cobweb_/utils/bloom.py +58 -0
  50. cobweb_/utils/dotting.py +32 -0
  51. cobweb_/utils/oss.py +94 -0
  52. cobweb_/utils/tools.py +42 -0
  53. {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/METADATA +1 -1
  54. cobweb_launcher-1.3.2.dist-info/RECORD +110 -0
  55. cobweb_launcher-1.3.2.dist-info/top_level.txt +2 -0
  56. cobweb_new/__init__.py +2 -0
  57. cobweb_new/base/__init__.py +72 -0
  58. cobweb_new/base/common_queue.py +53 -0
  59. cobweb_new/base/decorators.py +72 -0
  60. cobweb_new/base/item.py +46 -0
  61. cobweb_new/base/log.py +94 -0
  62. cobweb_new/base/request.py +82 -0
  63. cobweb_new/base/response.py +23 -0
  64. cobweb_new/base/seed.py +118 -0
  65. cobweb_new/constant.py +105 -0
  66. cobweb_new/crawlers/__init__.py +1 -0
  67. cobweb_new/crawlers/crawler-new.py +85 -0
  68. cobweb_new/crawlers/crawler.py +170 -0
  69. cobweb_new/db/__init__.py +2 -0
  70. cobweb_new/db/api_db.py +82 -0
  71. cobweb_new/db/redis_db.py +158 -0
  72. cobweb_new/exceptions/__init__.py +1 -0
  73. cobweb_new/exceptions/oss_db_exception.py +28 -0
  74. cobweb_new/launchers/__init__.py +3 -0
  75. cobweb_new/launchers/launcher.py +237 -0
  76. cobweb_new/launchers/launcher_air.py +88 -0
  77. cobweb_new/launchers/launcher_api.py +161 -0
  78. cobweb_new/launchers/launcher_pro.py +96 -0
  79. cobweb_new/launchers/tesss.py +47 -0
  80. cobweb_new/pipelines/__init__.py +3 -0
  81. cobweb_new/pipelines/pipeline.py +68 -0
  82. cobweb_new/pipelines/pipeline_console.py +22 -0
  83. cobweb_new/pipelines/pipeline_loghub.py +34 -0
  84. cobweb_new/setting.py +95 -0
  85. cobweb_new/utils/__init__.py +5 -0
  86. cobweb_new/utils/bloom.py +58 -0
  87. cobweb_new/utils/oss.py +94 -0
  88. cobweb_new/utils/tools.py +42 -0
  89. cobweb/schedulers/scheduler_api.py +0 -72
  90. cobweb_launcher-1.2.49.dist-info/RECORD +0 -44
  91. cobweb_launcher-1.2.49.dist-info/top_level.txt +0 -1
  92. {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/LICENSE +0 -0
  93. {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/WHEEL +0 -0
cobweb/launchers/launcher_api.py

@@ -2,9 +2,9 @@ import time
 import threading
 
 from cobweb.db import ApiDB
-from cobweb.base import Seed, logger
-from cobweb.constant import DealModel, LogTemplate
-from .launcher import Launcher, check_pause
+from cobweb.base import Seed, TaskQueue, logger, stop, pause
+from cobweb.constant import DealModel
+from .launcher import Launcher
 
 
 class LauncherApi(Launcher):
@@ -24,18 +24,14 @@ class LauncherApi(Launcher):
 
         self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
 
-        # self._bf_key = "bloom_%s_%s" % (project, task)
-        # self._bf = BloomFilter(self._bf_key)
-
         self._heartbeat_start_event = threading.Event()
-        self._redis_queue_empty_event = threading.Event()
 
     @property
     def heartbeat(self):
         return self._db.exists(self._heartbeat_key)
 
     def statistics(self, key, count):
-        if not self._task_model and not self._db.exists(key):
+        if not self.task_model and not self._db.exists(key):
             self._db.setex(key, 86400 * 30, int(count))
         else:
             self._db.incrby(key, count)
@@ -46,30 +42,25 @@
         Set the time window to self._time_window (seconds) and check whether the crawl volume inside that window is still under the threshold (self._spider_max_speed).
         :return: True -> seed, False -> None
         """
-        if (self.__LAUNCHER_QUEUE__["todo"].length and
-                not self._db.auto_incr(self._speed_control_key, t=self._time_window, limit=self._spider_max_count)):
+        if TaskQueue.TODO.length and not self._db.auto_incr(
+            self._speed_control_key,
+            t=self.time_window,
+            limit=self.spider_max_count
+        ):
             expire_time = self._db.ttl(self._speed_control_key)
             logger.info(f"Too fast! Please wait {expire_time} seconds...")
             time.sleep(expire_time / 2)
             return None
-        seed = self.__LAUNCHER_QUEUE__["todo"].pop()
-        return seed
-
-    @check_pause
-    def _execute_heartbeat(self):
-        if self._heartbeat_start_event.is_set():
-            self._db.setex(self._heartbeat_key, 5)
-        time.sleep(3)
+        return TaskQueue.TODO.pop()
 
-    @check_pause
+    @stop
     def _reset(self):
         """
         Check expired seeds and push them back into the redis cache.
         """
-        reset_wait_seconds = 30
         if self._db.lock(self._reset_lock_key, t=120):
 
-            _min = -int(time.time()) + self._seed_reset_seconds \
+            _min = -int(time.time()) + self.seed_reset_seconds \
                 if self.heartbeat else "-inf"
 
             self._db.members(self._todo_key, 0, _min=_min, _max="(0")
@@ -79,131 +70,92 @@
 
             self._db.delete(self._reset_lock_key)
 
-        time.sleep(reset_wait_seconds)
+        time.sleep(30)
+
+    @stop
+    def _refresh(self):
+        """
+        Refresh the expiry time of in-progress (doing) seeds so that reset does not re-consume them.
+        """
+        if self.doing_seeds:
+            refresh_time = int(time.time())
+            seeds = {k: -refresh_time - v / 1e3 for k, v in self.doing_seeds.items()}
+            self._db.zadd(self._todo_key, item=seeds, xx=True)
+        time.sleep(3)
 
-    @check_pause
+    @stop
     def _scheduler(self):
         """
         Schedule tasks: pull seeds from the redis queue and record them in the doing dict.
         """
         if not self._db.zcount(self._todo_key, 0, "(1000"):
-            time.sleep(self._scheduler_wait_seconds)
-        elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
-            time.sleep(self._todo_queue_full_wait_seconds)
+            time.sleep(self.scheduler_wait_seconds)
+        elif TaskQueue.TODO.length >= self.todo_queue_size:
+            time.sleep(self.todo_queue_full_wait_seconds)
         else:
             members = self._db.members(
                 self._todo_key, int(time.time()),
-                count=self._todo_queue_size,
+                count=self.todo_queue_size,
                 _min=0, _max="(1000"
             )
             for member, priority in members:
                 seed = Seed(member, priority=priority)
-                self.__LAUNCHER_QUEUE__['todo'].push(seed)
-                self.__DOING__[seed.to_string] = seed.params.priority
+                TaskQueue.TODO.push(seed)
+                self.doing_seeds[seed.to_string] = seed.params.priority
+
+    @pause
+    def _heartbeat(self):
+        if self._heartbeat_start_event.is_set():
+            self._db.setex(self._heartbeat_key, t=5)
+        time.sleep(3)
 
-    @check_pause
+    @pause
     def _insert(self):
         """
         Push new seeds into the redis queue.
         """
         seeds = {}
-        status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
-        for _ in range(self._new_queue_max_size):
-            seed = self.__LAUNCHER_QUEUE__['new'].pop()
-            if seed:
+        for _ in range(self.new_queue_max_size):
+            if seed := TaskQueue.SEED.pop():
                 seeds[seed.to_string] = seed.params.priority
         if seeds:
             self._db.zadd(self._todo_key, seeds, nx=True)
-        if status:
-            time.sleep(self._new_queue_wait_seconds)
-
-    @check_pause
-    def _refresh(self):
-        """
-        Refresh the expiry time of in-progress (doing) seeds so that reset does not re-consume them.
-        """
-        if self.__DOING__:
-            refresh_time = int(time.time())
-            seeds = {k:-refresh_time - v / 1000 for k, v in self.__DOING__.items()}
-            self._db.zadd(self._todo_key, item=seeds, xx=True)
-        time.sleep(15)
+        if TaskQueue.SEED.length < self.new_queue_max_size:
+            time.sleep(self.new_queue_wait_seconds)
 
-    @check_pause
+    @pause
     def _delete(self):
         """
         Remove seeds from the queue, route them to the success or failure queue by status, and drop their index from the doing dict.
         """
-        # seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
+        seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
+        status = TaskQueue.DONE.length < self.done_queue_max_size
 
-        seed_list = []
-        status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
-
-        for _ in range(self._done_queue_max_size):
-            seed = self.__LAUNCHER_QUEUE__['done'].pop()
+        for _ in range(self.done_queue_max_size):
+            seed = TaskQueue.DONE.pop()
             if not seed:
                 break
-            seed_list.append(seed.to_string)
+            if seed.params.seed_status == DealModel.fail:
+                seed_info["failed"].append(seed.to_string)
+            elif self.done_model == 1:
+                seed_info["succeed"].append(seed.to_string)
+            else:
+                seed_info["common"].append(seed.to_string)
+            seed_info['count'] += 1
 
-        if seed_list:
+        if seed_info["count"]:
 
-            self._db.zrem(self._todo_key, *seed_list)
-            self._remove_doing_seeds(seed_list)
+            succeed_count = int(self._db.zrem(self._todo_key, *seed_info["common"]) or 0)
+            succeed_count += int(self._db.done([self._todo_key, self._done_key], *seed_info["succeed"]) or 0)
+            failed_count = int(self._db.done([self._todo_key, self._fail_key], *seed_info["failed"]) or 0)
+
+            if failed_count:
+                self.statistics(self._statistics_fail_key, failed_count)
+            if succeed_count:
+                self.statistics(self._statistics_done_key, succeed_count)
+
+            self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])
 
         if status:
-            time.sleep(self._done_queue_wait_seconds)
-
-    def _polling(self):
-        wait_scheduler_execute = True
-        check_emtpy_times = 0
-        while not self._stop.is_set():
-            queue_not_empty_count = 0
-            pooling_wait_seconds = 30
-
-            for q in self.__LAUNCHER_QUEUE__.values():
-                if q.length != 0:
-                    queue_not_empty_count += 1
-                    wait_scheduler_execute = False
-
-            if queue_not_empty_count == 0:
-                pooling_wait_seconds = 3
-                if self._pause.is_set():
-                    check_emtpy_times = 0
-                    if not self._task_model and (
-                        not wait_scheduler_execute or
-                        int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
-                    ):
-                        logger.info("Done! ready to close thread...")
-                        self._stop.set()
-
-                    elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
-                        logger.info(f"Recovery {self.task} task run!")
-                        self._pause.clear()
-                        self._execute()
-                    else:
-                        logger.info("pause! waiting for resume...")
-                elif check_emtpy_times > 2:
-                    self.__DOING__ = {}
-                    if not self._db.zcount(self._todo_key, _min="-inf", _max="(1000"):
-                        self._pause.set()
-                else:
-                    logger.info(
-                        "check whether the task is complete, "
-                        f"reset times {3 - check_emtpy_times}"
-                    )
-                    check_emtpy_times += 1
-            else:
-                logger.info(LogTemplate.launcher_pro_polling.format(
-                    task=self.task,
-                    doing_len=len(self.__DOING__.keys()),
-                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
-                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
-                    redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
-                    redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
-                    redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
-                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
-                ))
-
-            time.sleep(pooling_wait_seconds)
-
-        logger.info("Done! Ready to close thread...")
+            time.sleep(self.done_queue_wait_seconds)
 
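Note: the rewritten `_get_seed` above leans on `ApiDB.auto_incr` for speed control. That helper is not part of this diff, but its call site implies a fixed-window rate limiter: at most `spider_max_count` fetches per `time_window` seconds, with the caller sleeping out roughly half of the window's remaining TTL when the budget is exhausted. A minimal sketch of that behavior using redis-py directly (the key name and numbers are illustrative, not the package's API):

import time

import redis

r = redis.Redis()

def auto_incr(key: str, t: int, limit: int) -> bool:
    # Fixed-window limiter: count calls in a Redis key that expires after
    # t seconds; the call is allowed while the count stays within limit.
    count = r.incr(key)
    if count == 1:
        r.expire(key, t)  # first hit opens the window (incr+expire is not atomic here)
    return count <= limit

# Caller-side back-off, mirroring LauncherApi._get_seed: when over budget,
# wait out about half of the window's remaining TTL before retrying.
if not auto_incr("speed_control:demo_project_demo_task", t=60, limit=1000):
    time.sleep(max(r.ttl("speed_control:demo_project_demo_task"), 0) / 2)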
cobweb/launchers/launcher_pro.py

@@ -1,208 +1,90 @@
 import time
-import threading
 
-from cobweb.db import RedisDB
-from cobweb.base import Seed, logger
-from cobweb.utils import BloomFilter
-from cobweb.constant import DealModel, LogTemplate
-from .launcher import Launcher, check_pause
+from cobweb.base import TaskQueue, Decorators
+from cobweb.schedulers import RedisScheduler
+from .launcher import Launcher
 
 
 class LauncherPro(Launcher):
 
     def __init__(self, task, project, custom_setting=None, **kwargs):
         super().__init__(task, project, custom_setting, **kwargs)
-        self._todo_key = "{%s:%s}:todo" % (project, task)
-        self._done_key = "{%s:%s}:done" % (project, task)
-        self._fail_key = "{%s:%s}:fail" % (project, task)
-        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
-
-        self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
-        self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
-        self._speed_control_key = "speed_control:%s_%s" % (project, task)
-
-        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
-
-        # self._bf_key = "bloom_%s_%s" % (project, task)
-        #
-        self._db = RedisDB()
-        #
-        # self._bf = BloomFilter(self._bf_key)
-
-        self._heartbeat_start_event = threading.Event()
-        self._redis_queue_empty_event = threading.Event()
-
-    @property
-    def heartbeat(self):
-        return self._db.exists(self._heartbeat_key)
-
-    def statistics(self, key, count):
-        if not self._task_model and not self._db.exists(key):
-            self._db.setex(key, 86400 * 30, int(count))
-        else:
-            self._db._client.incrby(key, count)
-
-    def _get_seed(self) -> Seed:
-        spider_speed = self._db._client.get(self._speed_control_key)
-        if int(spider_speed or 0) > self._spider_max_count:
-            expire_time = self._db.ttl(self._speed_control_key)
-            if expire_time == -1:
-                self._db.delete(self._speed_control_key)
-            else:
-                logger.info(f"Too fast! Please wait {expire_time} seconds...")
-                time.sleep(expire_time / 2)
-                return None
-        seed = self.__LAUNCHER_QUEUE__["todo"].pop()
-        if seed and not self._db.lock(self._speed_control_key, t=self._time_window):
-            self._db._client.incrby(self._speed_control_key, 1)
-        return seed
-
-    @check_pause
-    def _execute_heartbeat(self):
-        if self._heartbeat_start_event.is_set():
-            self._db.setex(self._heartbeat_key, 5)
+        self._redis_download = "{%s:%s}:download" % (project, task)
+        self._redis_todo = "{%s:%s}:todo" % (project, task)
+        self._scheduler = RedisScheduler(task, project)
+
+    @Decorators.stop
+    def _schedule(self):
+        thread_sleep = self.scheduling_wait_time
+        for q, key, size, item_info in [
+            (TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
+            (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
+        ]:
+            if q.length < size:
+                for member, priority in self._scheduler.schedule(
+                    key, self.scheduling_size
+                ):
+                    q.push((member, priority), direct_insertion=True)
+                    self.add_working_item(key.split(":")[-1], member, priority)
+                thread_sleep = 0.1
+        time.sleep(thread_sleep)
+
+    @Decorators.pause
+    def _heartbeat(self):
+        if self._scheduler.working.is_set():
+            self._scheduler.set_heartbeat()
         time.sleep(3)
 
-    @check_pause
+    @Decorators.pause
     def _reset(self):
-        """
-        Check expired seeds and push them back into the redis cache.
-        """
-        reset_wait_seconds = 30
-        if self._db.lock(self._reset_lock_key, t=120):
-
-            _min = -int(time.time()) + self._seed_reset_seconds \
-                if self.heartbeat else "-inf"
-
-            self._db.members(self._todo_key, 0, _min=_min, _max="(0")
-            self._db.delete(self._reset_lock_key)
-
-            if not self.heartbeat:
-                self._heartbeat_start_event.set()
-
-        time.sleep(reset_wait_seconds)
-
-    @check_pause
-    def _scheduler(self):
-        """
-        Schedule tasks: pull seeds from the redis queue and record them in the doing dict.
-        """
-        if not self._db.zcount(self._todo_key, 0, "(1000"):
-            time.sleep(self._scheduler_wait_seconds)
-        elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
-            time.sleep(self._todo_queue_full_wait_seconds)
-        else:
-            members = self._db.members(
-                self._todo_key, int(time.time()),
-                count=self._todo_queue_size,
-                _min=0, _max="(1000"
-            )
-            for member, priority in members:
-                seed = Seed(member, priority=priority)
-                self.__LAUNCHER_QUEUE__['todo'].push(seed)
-                self.__DOING__[seed.to_string] = seed.params.priority
+        self._scheduler.reset(
+            keys=[self._redis_todo, self._redis_download],
+            reset_time=self.seed_reset_seconds
+        )
+        time.sleep(30)
 
-    @check_pause
+    @Decorators.pause
     def _insert(self):
-        """
-        Push new seeds into the redis queue.
-        """
-        seeds = {}
-        status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
-        for _ in range(self._new_queue_max_size):
-            seed = self.__LAUNCHER_QUEUE__['new'].pop()
-            if seed:
-                seeds[seed.to_string] = seed.params.priority
-        if seeds:
-            self._db.zadd(self._todo_key, seeds, nx=True)
-        if status:
-            time.sleep(self._new_queue_wait_seconds)
-
-    @check_pause
+        thread_sleep = 0.1
+        for q, key, size in [
+            (TaskQueue.SEED, self._redis_todo, self.seed_queue_size),
+            (TaskQueue.REQUEST, self._redis_download, self.request_queue_size),
+        ]:
+            item_info = {}
+            while item := q.pop() and len(item_info.keys()) < self.inserting_size:
+                item_info[item.seed] = item.params.priority
+            if q.length >= size:
+                thread_sleep = self.inserting_wait_time
+            self._scheduler.insert(key, item_info)
+        time.sleep(thread_sleep)
+
+    @Decorators.pause
     def _refresh(self):
-        """
-        Refresh the expiry time of in-progress (doing) seeds so that reset does not re-consume them.
-        """
-        if self.__DOING__:
-            refresh_time = int(time.time())
-            seeds = {k:-refresh_time - v / 1000 for k, v in self.__DOING__.items()}
-            self._db.zadd(self._todo_key, item=seeds, xx=True)
-        time.sleep(15)
-
-    @check_pause
-    def _delete(self):
-        """
-        Remove seeds from the queue, route them to the success or failure queue by status, and drop their index from the doing dict.
-        """
-        seed_list = []
-        status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
-
-        for _ in range(self._done_queue_max_size):
-            seed = self.__LAUNCHER_QUEUE__['done'].pop()
-            if not seed:
-                break
-            seed_list.append(seed.to_string)
-
-        if seed_list:
-
-            self._db.zrem(self._todo_key, *seed_list)
-            self._remove_doing_seeds(seed_list)
-
-        if status:
-            time.sleep(self._done_queue_wait_seconds)
-
-    def _polling(self):
-        wait_scheduler_execute = True
-        check_emtpy_times = 0
-        while not self._stop.is_set():
-            queue_not_empty_count = 0
-            pooling_wait_seconds = 30
-
-            for q in self.__LAUNCHER_QUEUE__.values():
-                if q.length != 0:
-                    queue_not_empty_count += 1
-                    wait_scheduler_execute = False
-
-            if queue_not_empty_count == 0:
-                pooling_wait_seconds = 3
-                if self._pause.is_set():
-                    check_emtpy_times = 0
-                    if not self._task_model and (
-                        not wait_scheduler_execute or
-                        int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
-                    ):
-                        logger.info("Done! ready to close thread...")
-                        self._stop.set()
-
-                    elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
-                        logger.info(f"Recovery {self.task} task run!")
-                        self._pause.clear()
-                        self._execute()
-                    else:
-                        logger.info("pause! waiting for resume...")
-                elif check_emtpy_times > 2:
-                    self.__DOING__ = {}
-                    if not self._db.zcount(self._todo_key, _min="-inf", _max="(1000"):
-                        self._pause.set()
-                else:
-                    logger.info(
-                        "check whether the task is complete, "
-                        f"reset times {3 - check_emtpy_times}"
-                    )
-                    check_emtpy_times += 1
-            else:
-                logger.info(LogTemplate.launcher_pro_polling.format(
-                    task=self.task,
-                    doing_len=len(self.__DOING__.keys()),
-                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
-                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
-                    redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
-                    redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
-                    redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
-                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
-                ))
-
-            time.sleep(pooling_wait_seconds)
-
-        logger.info("Done! Ready to close thread...")
-
+        self._scheduler.refresh(self._redis_todo, self._task_info["todo"])
+        self._scheduler.refresh(self._redis_download, self._task_info["download"])
+        time.sleep(10)
+
+    @Decorators.pause
+    def _remove(self):
+        thread_sleep = self.removing_wait_time
+        for q, key, size in [
+            (TaskQueue.DELETE, self._redis_todo, self.delete_queue_size),
+            (TaskQueue.DONE, self._redis_download, self.done_queue_size),
+        ]:
+            items = []
+            while item := q.pop() and len(items) < self.removing_size:
+                items.append(item)
+            self._scheduler.delete(key, *items)
+            self.remove_working_items(key.split(":")[-1], items)
+            if q.length >= size:
+                thread_sleep = 0.1
+        time.sleep(thread_sleep)
+
+    def _init_schedule_thread(self):
+        self._add_thread(func=self._heartbeat)
+        self._add_thread(func=self._reset)
+        self._add_thread(func=self._refresh)
+        self._add_thread(func=self._schedule)
+        self._add_thread(func=self._insert)
+        self._add_thread(func=self._remove)
+        self._add_thread(func=self._polling)
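Note: the `@Decorators.stop` / `@Decorators.pause` wrappers come from cobweb.base, which this hunk does not show. Judging by how every worker body performs one unit of work and then sleeps, they presumably loop the method against the launcher's stop/pause events. A plausible sketch; the `_stop` and `_pause` attribute names are assumptions carried over from the removed 1.2.49 code:

import time
from functools import wraps

class Decorators:

    @staticmethod
    def stop(func):
        # Re-run the worker until the launcher's stop event fires.
        @wraps(func)
        def wrapper(self):
            while not self._stop.is_set():
                func(self)  # one unit of work; the body sleeps as needed
        return wrapper

    @staticmethod
    def pause(func):
        # Like stop, but idle while the pause event is set.
        @wraps(func)
        def wrapper(self):
            while not self._stop.is_set():
                if self._pause.is_set():
                    time.sleep(3)
                    continue
                func(self)
        return wrapper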
cobweb/pipelines/base_pipeline.py (new file)

@@ -0,0 +1,54 @@
+import time
+import threading
+
+from abc import ABC, abstractmethod
+from cobweb.base import BaseItem, Queue, logger
+
+
+class Pipeline(threading.Thread, ABC):
+
+    def __init__(
+        self,
+        done_queue: Queue,
+        upload_queue: Queue,
+        upload_queue_size: int,
+        upload_wait_seconds: int
+    ):
+        super().__init__()
+        self.done_queue = done_queue
+        self.upload_queue = upload_queue
+        self.upload_queue_size = upload_queue_size
+        self.upload_wait_seconds = upload_wait_seconds
+
+    @abstractmethod
+    def build(self, item: BaseItem) -> dict:
+        pass
+
+    @abstractmethod
+    def upload(self, table: str, data: list) -> bool:
+        pass
+
+    def run(self):
+        while True:
+            status = self.upload_queue.length < self.upload_queue_size
+            if status:
+                time.sleep(self.upload_wait_seconds)
+            data_info, seeds = {}, []
+            for _ in range(self.upload_queue_size):
+                item = self.upload_queue.pop()
+                if not item:
+                    break
+                data = self.build(item)
+                seeds.append(item.seed)
+                data_info.setdefault(item.table, []).append(data)
+            for table, datas in data_info.items():
+                try:
+                    self.upload(table, datas)
+                    status = True
+                except Exception as e:
+                    logger.info(e)
+                    status = False
+            if status:
+                self.done_queue.push(seeds)
+
+
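Note: concrete pipelines only implement `build` and `upload`; the `run` loop above handles batching and acking seeds back to `done_queue`. The file list includes a console pipeline, but that file is not part of this diff; a minimal subclass written against this abstract interface might look like the following (`item.to_dict` mirrors its use in LoghubPipeline below, the rest is illustrative):

from cobweb.base import BaseItem
from cobweb.pipelines import Pipeline

class ConsolePipeline(Pipeline):

    def build(self, item: BaseItem) -> dict:
        # Render the item as a plain dict, as LoghubPipeline does before serializing.
        return item.to_dict

    def upload(self, table: str, data: list) -> bool:
        # "Upload" by printing; returning without raising lets run() ack the seeds.
        print(f"[{table}] {len(data)} item(s)")
        for row in data:
            print(row)
        return True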
cobweb/pipelines/loghub_pipeline.py (new file)

@@ -0,0 +1,34 @@
+import json
+
+from cobweb import setting
+from cobweb.base import BaseItem
+from cobweb.pipelines import Pipeline
+from aliyun.log import LogClient, LogItem, PutLogsRequest
+
+
+class LoghubPipeline(Pipeline):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.client = LogClient(**setting.LOGHUB_CONFIG)
+
+    def build(self, item: BaseItem):
+        log_item = LogItem()
+        temp = item.to_dict
+        for key, value in temp.items():
+            if not isinstance(value, str):
+                temp[key] = json.dumps(value, ensure_ascii=False)
+        contents = sorted(temp.items())
+        log_item.set_contents(contents)
+        return log_item
+
+    def upload(self, table, datas):
+        request = PutLogsRequest(
+            project=setting.LOGHUB_PROJECT,
+            logstore=table,
+            topic=setting.LOGHUB_TOPIC,
+            source=setting.LOGHUB_SOURCE,
+            logitems=datas,
+            compress=True
+        )
+        self.client.put_logs(request=request)
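Note: LoghubPipeline reads its connection and routing parameters from cobweb.setting, whose new values are not shown in this diff. Since LOGHUB_CONFIG is splatted into aliyun-log's LogClient, the settings block presumably resembles the sketch below; all values are placeholders, and the exact key names in setting.py are an assumption beyond endpoint/accessKeyId/accessKey, which LogClient itself takes:

# Hypothetical cobweb/setting.py values consumed by LoghubPipeline.
LOGHUB_CONFIG = {
    "endpoint": "cn-hangzhou.log.aliyuncs.com",  # LogClient(endpoint, accessKeyId, accessKey)
    "accessKeyId": "<access-key-id>",
    "accessKey": "<access-key-secret>",
}
LOGHUB_PROJECT = "my-log-project"  # PutLogsRequest(project=...)
LOGHUB_TOPIC = "cobweb"            # PutLogsRequest(topic=...)
LOGHUB_SOURCE = "spider"           # PutLogsRequest(source=...)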