cobweb-launcher 1.3.2__tar.gz → 1.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {cobweb-launcher-1.3.2/cobweb_launcher.egg-info → cobweb-launcher-1.3.4}/PKG-INFO +1 -1
  2. cobweb-launcher-1.3.4/cobweb/__init__.py +2 -0
  3. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/base/__init__.py +9 -3
  4. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/base/basic.py +78 -26
  5. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/crawlers/crawler.py +4 -4
  6. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/launchers/launcher.py +8 -6
  7. cobweb-launcher-1.3.4/cobweb/launchers/launcher_air.py +88 -0
  8. cobweb-launcher-1.3.4/cobweb/launchers/launcher_api.py +88 -0
  9. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/launchers/launcher_pro.py +7 -9
  10. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/pipelines/pipeline.py +2 -1
  11. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/schedulers/__init__.py +2 -0
  12. cobweb-launcher-1.3.4/cobweb/schedulers/scheduler_api.py +69 -0
  13. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4/cobweb_launcher.egg-info}/PKG-INFO +1 -1
  14. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_launcher.egg-info/SOURCES.txt +1 -0
  15. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/setup.py +1 -1
  16. cobweb-launcher-1.3.2/cobweb/launchers/launcher_api.py +0 -161
  17. cobweb-launcher-1.3.2/cobweb_/__init__.py +0 -2
  18. cobweb-launcher-1.3.2/cobweb_/launchers/launcher_air.py +0 -88
  19. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/LICENSE +0 -0
  20. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/README.md +0 -0
  21. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/base/common_queue.py +0 -0
  22. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/base/dotting.py +0 -0
  23. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/base/item.py +0 -0
  24. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/base/log.py +0 -0
  25. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/base/request.py +0 -0
  26. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/base/response.py +0 -0
  27. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/base/seed.py +0 -0
  28. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/constant.py +0 -0
  29. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/crawlers/__init__.py +0 -0
  30. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/db/__init__.py +0 -0
  31. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/db/api_db.py +0 -0
  32. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/db/redis_db.py +0 -0
  33. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/exceptions/__init__.py +0 -0
  34. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/exceptions/oss_db_exception.py +0 -0
  35. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/launchers/__init__.py +0 -0
  36. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/pipelines/__init__.py +0 -0
  37. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/pipelines/pipeline_console.py +0 -0
  38. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/pipelines/pipeline_loghub.py +0 -0
  39. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/schedulers/scheduler_redis.py +0 -0
  40. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/setting.py +0 -0
  41. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/utils/__init__.py +0 -0
  42. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/utils/bloom.py +0 -0
  43. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/utils/oss.py +0 -0
  44. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb/utils/tools.py +0 -0
  45. {cobweb-launcher-1.3.2/cobweb → cobweb-launcher-1.3.4/cobweb_}/__init__.py +0 -0
  46. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/base/__init__.py +0 -0
  47. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/base/common_queue.py +0 -0
  48. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/base/decorators.py +0 -0
  49. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/base/item.py +0 -0
  50. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/base/log.py +0 -0
  51. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/base/request.py +0 -0
  52. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/base/response.py +0 -0
  53. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/base/seed.py +0 -0
  54. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/constant.py +0 -0
  55. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/crawlers/__init__.py +0 -0
  56. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/crawlers/crawler.py +0 -0
  57. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/db/__init__.py +0 -0
  58. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/db/api_db.py +0 -0
  59. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/db/redis_db.py +0 -0
  60. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/exceptions/__init__.py +0 -0
  61. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/exceptions/oss_db_exception.py +0 -0
  62. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/launchers/__init__.py +0 -0
  63. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/launchers/launcher.py +0 -0
  64. {cobweb-launcher-1.3.2/cobweb → cobweb-launcher-1.3.4/cobweb_}/launchers/launcher_air.py +0 -0
  65. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/launchers/launcher_api.py +0 -0
  66. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/launchers/launcher_pro.py +0 -0
  67. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/pipelines/__init__.py +0 -0
  68. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/pipelines/pipeline.py +0 -0
  69. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/pipelines/pipeline_console.py +0 -0
  70. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/pipelines/pipeline_loghub.py +0 -0
  71. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/setting.py +0 -0
  72. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/utils/__init__.py +0 -0
  73. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/utils/bloom.py +0 -0
  74. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/utils/dotting.py +0 -0
  75. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/utils/oss.py +0 -0
  76. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_/utils/tools.py +0 -0
  77. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  78. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_launcher.egg-info/requires.txt +0 -0
  79. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/cobweb_launcher.egg-info/top_level.txt +0 -0
  80. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/setup.cfg +0 -0
  81. {cobweb-launcher-1.3.2 → cobweb-launcher-1.3.4}/test/test.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.3.2
3
+ Version: 1.3.4
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -0,0 +1,2 @@
1
+ from .launchers import LauncherPro, LauncherApi
2
+ from .constant import CrawlerModel
@@ -66,11 +66,17 @@ class TaskQueue:
66
66
  except Exception as e:
67
67
  it.params.retry += 1
68
68
  if isinstance(it, Request):
69
- TaskQueue.REQUEST.push(it)
69
+ TaskQueue.DOWNLOAD.push(it)
70
70
  elif isinstance(it, Response):
71
71
  TaskQueue.RESPONSE.push(it)
72
72
  elif isinstance(it, Seed):
73
- TaskQueue.SEED.push(it)
73
+ TaskQueue.TODO.push(it)
74
+ elif isinstance(it, BaseItem):
75
+ TaskQueue.UPLOAD.push(it)
76
+ logger.info(
77
+ f"{crawler_func.__name__} failed: "
78
+ f"{''.join(traceback.format_exception(type(e), e, e.__traceback__))}"
79
+ )
74
80
  time.sleep(1)
75
81
 
76
82
 
@@ -95,7 +101,7 @@ class Decorators:
95
101
  def wrapper(self, *args, **kwargs):
96
102
  while not self.pause.is_set():
97
103
  try:
98
- func(self, *args, **kwargs)
104
+ func(self)
99
105
  except Exception as e:
100
106
  logger.info(f"{func.__name__}: " + str(e))
101
107
  finally:
@@ -15,11 +15,21 @@ class Params:
15
15
 
16
16
 
17
17
  class Seed:
18
+ __SEED_PARAMS__ = [
19
+ "retry",
20
+ "priority",
21
+ "version",
22
+ "status"
23
+ ]
18
24
 
19
25
  def __init__(
20
26
  self,
21
27
  seed,
22
- params = Params(),
28
+ sid=None,
29
+ retry=None,
30
+ priority=None,
31
+ version=None,
32
+ status=None,
23
33
  **kwargs
24
34
  ):
25
35
  if any(isinstance(seed, t) for t in (str, bytes)):
@@ -37,11 +47,27 @@ class Seed:
37
47
  f"seed: {seed}"
38
48
  ))
39
49
 
50
+ seed_params = {
51
+ "retry": retry,
52
+ "priority": priority,
53
+ "version": version,
54
+ "status": status,
55
+ }
56
+
40
57
  if kwargs:
58
+ # for k, v in kwargs.items():
59
+ # if k in seed_params.keys():
60
+ # seed_params[k] = v
61
+ # else:
62
+ # self.__setattr__(k, v)
41
63
  self._init_seed(kwargs)
42
- if not getattr(self, "sid", None):
43
- self._init_id()
44
- self.params = params or Params()
64
+ seed_params.update({
65
+ k: v for k, v in kwargs.items()
66
+ if k in self.__SEED_PARAMS__
67
+ })
68
+ if sid or not getattr(self, "sid", None):
69
+ self._init_id(sid)
70
+ self.params = Params(**seed_params)
45
71
 
46
72
  def __getattr__(self, name):
47
73
  return None
@@ -59,13 +85,14 @@ class Seed:
59
85
  chars = [f"{k}={v}" for k, v in self.__dict__.items()]
60
86
  return f'{self.__class__.__name__}({", ".join(chars)})'
61
87
 
62
- def _init_seed(self, seed_info:dict):
88
+ def _init_seed(self, seed_info: dict):
63
89
  for k, v in seed_info.items():
64
90
  if k not in self.__SEED_PARAMS__:
65
91
  self.__setattr__(k, v)
66
92
 
67
- def _init_id(self):
68
- sid = hashlib.md5(self.to_string.encode()).hexdigest()
93
+ def _init_id(self, sid):
94
+ if not sid:
95
+ sid = hashlib.md5(self.to_string.encode()).hexdigest()
69
96
  self.__setattr__("sid", sid)
70
97
 
71
98
  @property
@@ -85,16 +112,15 @@ class Seed:
85
112
 
86
113
  @property
87
114
  def seed(self):
88
- return self
115
+ return self.to_string
89
116
 
90
117
 
91
118
  class Request:
92
-
93
119
  __SEED_PARAMS__ = [
94
120
  "retry",
95
121
  "priority",
96
- "seed_version",
97
- "seed_status"
122
+ "version",
123
+ "status"
98
124
  ]
99
125
 
100
126
  __REQUEST_ATTRS__ = {
@@ -122,8 +148,8 @@ class Request:
122
148
  check_status_code=True,
123
149
  retry=None,
124
150
  priority=None,
125
- seed_version=None,
126
- seed_status=None,
151
+ version=None,
152
+ status=None,
127
153
  **kwargs
128
154
  ):
129
155
  self.url = url
@@ -133,10 +159,15 @@ class Request:
133
159
  seed_params = {
134
160
  "retry": retry,
135
161
  "priority": priority,
136
- "seed_version": seed_version,
137
- "seed_status": seed_status,
162
+ "version": version,
163
+ "status": status,
138
164
  }
139
165
 
166
+ if isinstance(seed, Seed):
167
+ kwargs.update(**seed.to_dict)
168
+ elif isinstance(seed, str):
169
+ kwargs.update(**json.loads(seed))
170
+
140
171
  for k, v in kwargs.items():
141
172
  if k in self.__class__.__REQUEST_ATTRS__:
142
173
  self.request_setting[k] = v
@@ -152,12 +183,7 @@ class Request:
152
183
  self._build_header()
153
184
 
154
185
  self.params = Params(**seed_params)
155
-
156
- if isinstance(seed, Seed):
157
- kwargs.update(**seed.to_dict)
158
- elif isinstance(seed, str):
159
- kwargs.update(**json.loads(seed))
160
- self.seed = self.to_string
186
+ # self.seed = self.to_string
161
187
 
162
188
  @property
163
189
  def _random_ua(self) -> str:
@@ -183,10 +209,19 @@ class Request:
183
209
  response.raise_for_status()
184
210
  return response
185
211
 
212
+ def __getattr__(self, name):
213
+ return None
214
+
215
+ def __setitem__(self, key, value):
216
+ setattr(self, key, value)
217
+
218
+ def __getitem__(self, item):
219
+ return getattr(self, item)
220
+
186
221
  @property
187
222
  def to_dict(self):
188
223
  _dict = self.__dict__.copy()
189
- _dict.pop('seed')
224
+ # _dict.pop('seed')
190
225
  _dict.pop('params')
191
226
  _dict.pop('check_status_code')
192
227
  # _dict.pop('request_setting')
@@ -200,6 +235,10 @@ class Request:
200
235
  separators=(",", ":")
201
236
  )
202
237
 
238
+ @property
239
+ def seed(self):
240
+ return self.to_string
241
+
203
242
 
204
243
  class Response:
205
244
 
@@ -209,8 +248,8 @@ class Response:
209
248
  response,
210
249
  retry=None,
211
250
  priority=None,
212
- seed_version=None,
213
- seed_status=None,
251
+ version=None,
252
+ status=None,
214
253
  **kwargs
215
254
  ):
216
255
  self.seed = seed
@@ -218,20 +257,24 @@ class Response:
218
257
  seed_params = {
219
258
  "retry": retry,
220
259
  "priority": priority,
221
- "seed_version": seed_version,
222
- "seed_status": seed_status,
260
+ "version": version,
261
+ "status": status,
223
262
  }
224
263
  for k, v in kwargs.items():
225
264
  if k in seed_params.keys():
226
265
  seed_params[k] = v
227
266
  else:
228
267
  self.__setattr__(k, v)
268
+ self.params = Params(**seed_params)
229
269
 
230
270
  @property
231
271
  def to_dict(self):
232
272
  _dict = self.__dict__.copy()
233
273
  _dict.pop('seed')
234
274
  _dict.pop('response')
275
+ _dict.pop('method')
276
+ _dict.pop('params')
277
+ _dict.pop('request_setting')
235
278
  return _dict
236
279
 
237
280
  @property
@@ -241,3 +284,12 @@ class Response:
241
284
  ensure_ascii=False,
242
285
  separators=(",", ":")
243
286
  )
287
+
288
+ def __getattr__(self, name):
289
+ return None
290
+
291
+ def __setitem__(self, key, value):
292
+ setattr(self, key, value)
293
+
294
+ def __getitem__(self, item):
295
+ return getattr(self, item)
@@ -3,7 +3,7 @@ import time
3
3
  import threading
4
4
  from typing import Union, Callable, Mapping
5
5
 
6
- import setting
6
+ from cobweb import setting
7
7
  from cobweb.base import (
8
8
  Seed,
9
9
  BaseItem,
@@ -14,7 +14,7 @@ from cobweb.base import (
14
14
  TaskQueue,
15
15
  logger
16
16
  )
17
- from constant import DealModel
17
+ from cobweb.constant import DealModel
18
18
 
19
19
 
20
20
  class Crawler(threading.Thread):
@@ -66,8 +66,8 @@ class Crawler(threading.Thread):
66
66
  def build_download_item(self):
67
67
  thread_sleep = 0.1
68
68
  if TaskQueue.RESPONSE.length >= self.download_queue_size:
69
- logger.info(f"download queue is full, sleep {thread_sleep}s")
70
69
  thread_sleep = 5
70
+ # logger.info(f"download queue is full, sleep {thread_sleep}s")
71
71
  elif request_info := TaskQueue.DOWNLOAD.pop():
72
72
  member, priority = request_info
73
73
  request_setting = json.loads(member)
@@ -79,7 +79,7 @@ class Crawler(threading.Thread):
79
79
  def build_parse_item(self):
80
80
  thread_sleep = 0.1
81
81
  if TaskQueue.UPLOAD.length >= self.upload_queue_size:
82
- logger.info(f"upload queue is full, sleep {thread_sleep}s")
82
+ # logger.info(f"upload queue is full, sleep {thread_sleep}s")
83
83
  thread_sleep = 5
84
84
  if response_item := TaskQueue.RESPONSE.pop():
85
85
  TaskQueue.process_task(response_item, self.parse)
@@ -137,10 +137,10 @@ class Launcher(threading.Thread):
137
137
 
138
138
  def _add_thread(self, func, num=1, obj=None, name=None, args=()):
139
139
  obj = obj or self
140
- name = obj.__class__.__name__ + name or func.__name__
140
+ name = obj.__class__.__name__ + ":" + (name or func.__name__)
141
141
  for i in range(num):
142
142
  func_name = name + "_" + str(i) if num > 1 else name
143
- self._threads.append(threading.Thread(name=func_name, target=func, args=(obj,) + args))
143
+ self._threads.append(threading.Thread(name=func_name, target=func, args=()))
144
144
 
145
145
  @Decorators.stop
146
146
  def _polling(self):
@@ -150,6 +150,10 @@ class Launcher(threading.Thread):
150
150
  if not self.task_model and run_time > self.before_scheduler_wait_seconds:
151
151
  logger.info("Done! ready to close thread...")
152
152
  self.stop.set()
153
+ elif TaskQueue.TODO.length or TaskQueue.DOWNLOAD.length:
154
+ logger.info(f"Recovery {self.task} task run!")
155
+ self.check_emtpy_times = 0
156
+ self.pause.clear()
153
157
  else:
154
158
  logger.info("pause! waiting for resume...")
155
159
  elif self.check_emtpy_times > 2:
@@ -164,10 +168,6 @@ class Launcher(threading.Thread):
164
168
  f"reset times {3 - self.check_emtpy_times}"
165
169
  )
166
170
  self.check_emtpy_times += 1
167
- elif TaskQueue.TODO.length:
168
- logger.info(f"Recovery {self.task} task run!")
169
- self.check_emtpy_times = 0
170
- self.pause.clear()
171
171
  else:
172
172
  logger.info(LogTemplate.launcher_polling.format(
173
173
  task=self.task,
@@ -179,6 +179,8 @@ class Launcher(threading.Thread):
179
179
  response_queue_len=TaskQueue.RESPONSE.length,
180
180
  done_queue_len=TaskQueue.DONE.length,
181
181
  upload_queue_len=TaskQueue.UPLOAD.length,
182
+ seed_queue_len=TaskQueue.SEED.length,
183
+ download_queue_len=TaskQueue.DOWNLOAD.length
182
184
  ))
183
185
  time.sleep(10)
184
186
 
@@ -0,0 +1,88 @@
1
+ # import time
2
+ #
3
+ # from cobweb.base import logger
4
+ # from cobweb.constant import LogTemplate
5
+ # from .launcher import Launcher, check_pause
6
+ #
7
+ #
8
+ # class LauncherAir(Launcher):
9
+ #
10
+ # # def _scheduler(self):
11
+ # # if self.start_seeds:
12
+ # # self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
13
+ #
14
+ # @check_pause
15
+ # def _insert(self):
16
+ # seeds = {}
17
+ # status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
18
+ # for _ in range(self._new_queue_max_size):
19
+ # seed = self.__LAUNCHER_QUEUE__['new'].pop()
20
+ # if not seed:
21
+ # break
22
+ # seeds[seed.to_string] = seed.params.priority
23
+ # if seeds:
24
+ # self.__LAUNCHER_QUEUE__['todo'].push(seeds)
25
+ # if status:
26
+ # time.sleep(self._new_queue_wait_seconds)
27
+ #
28
+ # @check_pause
29
+ # def _delete(self):
30
+ # seeds = []
31
+ # status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
32
+ #
33
+ # for _ in range(self._done_queue_max_size):
34
+ # seed = self.__LAUNCHER_QUEUE__['done'].pop()
35
+ # if not seed:
36
+ # break
37
+ # seeds.append(seed.to_string)
38
+ #
39
+ # if seeds:
40
+ # self._remove_doing_seeds(seeds)
41
+ #
42
+ # if status:
43
+ # time.sleep(self._done_queue_wait_seconds)
44
+ #
45
+ # def _polling(self):
46
+ #
47
+ # check_emtpy_times = 0
48
+ #
49
+ # while not self._stop.is_set():
50
+ #
51
+ # queue_not_empty_count = 0
52
+ # pooling_wait_seconds = 30
53
+ #
54
+ # for q in self.__LAUNCHER_QUEUE__.values():
55
+ # if q.length != 0:
56
+ # queue_not_empty_count += 1
57
+ #
58
+ # if queue_not_empty_count == 0:
59
+ # pooling_wait_seconds = 3
60
+ # if self._pause.is_set():
61
+ # check_emtpy_times = 0
62
+ # if not self._task_model:
63
+ # logger.info("Done! Ready to close thread...")
64
+ # self._stop.set()
65
+ # elif check_emtpy_times > 2:
66
+ # self.__DOING__ = {}
67
+ # self._pause.set()
68
+ # else:
69
+ # logger.info(
70
+ # "check whether the task is complete, "
71
+ # f"reset times {3 - check_emtpy_times}"
72
+ # )
73
+ # check_emtpy_times += 1
74
+ # elif self._pause.is_set():
75
+ # self._pause.clear()
76
+ # self._execute()
77
+ # else:
78
+ # logger.info(LogTemplate.launcher_air_polling.format(
79
+ # task=self.task,
80
+ # doing_len=len(self.__DOING__.keys()),
81
+ # todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
82
+ # done_len=self.__LAUNCHER_QUEUE__['done'].length,
83
+ # upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
84
+ # ))
85
+ #
86
+ # time.sleep(pooling_wait_seconds)
87
+ #
88
+ #
@@ -0,0 +1,88 @@
1
+ import time
2
+
3
+ from cobweb.base import TaskQueue, Decorators
4
+ from cobweb.schedulers import ApiScheduler
5
+ from .launcher import Launcher
6
+
7
+
8
+ class LauncherPro(Launcher):
9
+
10
+ def __init__(self, task, project, custom_setting=None, **kwargs):
11
+ super().__init__(task, project, custom_setting, **kwargs)
12
+ self._redis_download = "{%s:%s}:download" % (project, task)
13
+ self._redis_todo = "{%s:%s}:todo" % (project, task)
14
+ self._scheduler = ApiScheduler(task, project)
15
+
16
+ @Decorators.stop
17
+ def _schedule(self):
18
+ thread_sleep = self.scheduling_wait_time
19
+ for q, key, size, item_info in [
20
+ (TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
21
+ (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
22
+ ]:
23
+ if q.length < size:
24
+ for member, priority in self._scheduler.schedule(key, self.scheduling_size):
25
+ q.push((member, priority), direct_insertion=True)
26
+ self.add_working_item(key.split(":")[-1], member, priority)
27
+ thread_sleep = 0.1
28
+ time.sleep(thread_sleep)
29
+
30
+ @Decorators.stop
31
+ def _heartbeat(self):
32
+ if self._scheduler.working.is_set():
33
+ self._scheduler.set_heartbeat()
34
+ time.sleep(3)
35
+
36
+ @Decorators.stop
37
+ def _reset(self):
38
+ self._scheduler.reset(
39
+ keys=[self._redis_todo, self._redis_download],
40
+ reset_time=self.seed_reset_seconds
41
+ )
42
+ time.sleep(30)
43
+
44
+ @Decorators.pause
45
+ def _insert(self):
46
+ thread_sleep = 0.1
47
+ for q, key, size in [
48
+ (TaskQueue.SEED, self._redis_todo, self.seed_queue_size),
49
+ (TaskQueue.REQUEST, self._redis_download, self.request_queue_size),
50
+ ]:
51
+ item_info = {}
52
+ while (item := q.pop()) and len(item_info.keys()) < self.inserting_size:
53
+ item_info[item.seed] = item.params.priority
54
+ if q.length >= size:
55
+ thread_sleep = self.inserting_wait_time
56
+ self._scheduler.insert(key, item_info)
57
+ time.sleep(thread_sleep)
58
+
59
+ @Decorators.pause
60
+ def _refresh(self):
61
+ self._scheduler.refresh(self._redis_todo, self._task_info["todo"])
62
+ self._scheduler.refresh(self._redis_download, self._task_info["download"])
63
+ time.sleep(10)
64
+
65
+ @Decorators.pause
66
+ def _remove(self):
67
+ thread_sleep = self.removing_wait_time
68
+ for q, key, size in [
69
+ (TaskQueue.DELETE, self._redis_todo, self.delete_queue_size),
70
+ (TaskQueue.DONE, self._redis_download, self.done_queue_size),
71
+ ]:
72
+ items = []
73
+ while (item := q.pop()) and len(items) < self.removing_size:
74
+ items.append(item)
75
+ self._scheduler.delete(key, items)
76
+ self.remove_working_items(key.split(":")[-1], items)
77
+ if q.length >= size:
78
+ thread_sleep = 0.1
79
+ time.sleep(thread_sleep)
80
+
81
+ def _init_schedule_thread(self):
82
+ self._add_thread(func=self._heartbeat)
83
+ self._add_thread(func=self._reset)
84
+ self._add_thread(func=self._refresh)
85
+ self._add_thread(func=self._schedule)
86
+ self._add_thread(func=self._insert)
87
+ self._add_thread(func=self._remove)
88
+ # self._add_thread(func=self._polling)
@@ -21,21 +21,19 @@ class LauncherPro(Launcher):
21
21
  (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
22
22
  ]:
23
23
  if q.length < size:
24
- for member, priority in self._scheduler.schedule(
25
- key, self.scheduling_size
26
- ):
24
+ for member, priority in self._scheduler.schedule(key, self.scheduling_size):
27
25
  q.push((member, priority), direct_insertion=True)
28
26
  self.add_working_item(key.split(":")[-1], member, priority)
29
27
  thread_sleep = 0.1
30
28
  time.sleep(thread_sleep)
31
29
 
32
- @Decorators.pause
30
+ @Decorators.stop
33
31
  def _heartbeat(self):
34
32
  if self._scheduler.working.is_set():
35
33
  self._scheduler.set_heartbeat()
36
34
  time.sleep(3)
37
35
 
38
- @Decorators.pause
36
+ @Decorators.stop
39
37
  def _reset(self):
40
38
  self._scheduler.reset(
41
39
  keys=[self._redis_todo, self._redis_download],
@@ -51,7 +49,7 @@ class LauncherPro(Launcher):
51
49
  (TaskQueue.REQUEST, self._redis_download, self.request_queue_size),
52
50
  ]:
53
51
  item_info = {}
54
- while item := q.pop() and len(item_info.keys()) < self.inserting_size:
52
+ while (item := q.pop()) and len(item_info.keys()) < self.inserting_size:
55
53
  item_info[item.seed] = item.params.priority
56
54
  if q.length >= size:
57
55
  thread_sleep = self.inserting_wait_time
@@ -72,9 +70,9 @@ class LauncherPro(Launcher):
72
70
  (TaskQueue.DONE, self._redis_download, self.done_queue_size),
73
71
  ]:
74
72
  items = []
75
- while item := q.pop() and len(items) < self.removing_size:
73
+ while (item := q.pop()) and len(items) < self.removing_size:
76
74
  items.append(item)
77
- self._scheduler.delete(key, *items)
75
+ self._scheduler.delete(key, items)
78
76
  self.remove_working_items(key.split(":")[-1], items)
79
77
  if q.length >= size:
80
78
  thread_sleep = 0.1
@@ -87,4 +85,4 @@ class LauncherPro(Launcher):
87
85
  self._add_thread(func=self._schedule)
88
86
  self._add_thread(func=self._insert)
89
87
  self._add_thread(func=self._remove)
90
- self._add_thread(func=self._polling)
88
+ # self._add_thread(func=self._polling)
@@ -30,7 +30,7 @@ class Pipeline(ABC):
30
30
  data_info, seeds = {}, []
31
31
  thread_sleep = self.upload_wait_time if TaskQueue.UPLOAD.length < self.upload_queue_size else 0.1
32
32
  try:
33
- while item := TaskQueue.UPLOAD.pop() and len(seeds) <= self.upload_queue_size:
33
+ while (item := TaskQueue.UPLOAD.pop()) and len(seeds) <= self.upload_queue_size:
34
34
  data = self.build(item)
35
35
  data_info.setdefault(item.table, []).append(data)
36
36
  seeds.append(item.seed)
@@ -39,6 +39,7 @@ class Pipeline(ABC):
39
39
  except Exception as e:
40
40
  logger.info(e)
41
41
  seeds = None
42
+ # todo: retry
42
43
  finally:
43
44
  TaskQueue.DONE.push(seeds)
44
45
 
@@ -1 +1,3 @@
1
1
  from .scheduler_redis import RedisScheduler
2
+ from .scheduler_api import ApiScheduler
3
+
@@ -0,0 +1,69 @@
1
+ import threading
2
+ import time
3
+
4
+ # from cobweb.base import Seed
5
+ from cobweb.db import ApiDB
6
+
7
+
8
+ class ApiScheduler:
9
+
10
+ def __init__(self, task, project, scheduler_wait_seconds=30):
11
+ self._todo_key = "{%s:%s}:todo" % (project, task)
12
+ self._download_key = "{%s:%s}:download" % (project, task)
13
+ self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
14
+ self._speed_control_key = "speed_control:%s_%s" % (project, task)
15
+ self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
16
+ self._db = ApiDB()
17
+
18
+ self.scheduler_wait_seconds = scheduler_wait_seconds
19
+ self.working = threading.Event()
20
+
21
+ @property
22
+ def heartbeat(self):
23
+ return self._db.exists(self._heartbeat_key)
24
+
25
+ def set_heartbeat(self):
26
+ return self._db.setex(self._heartbeat_key, 5)
27
+
28
+ def schedule(self, key, count):
29
+ if not self._db.zcount(key, 0, "(1000"):
30
+ time.sleep(self.scheduler_wait_seconds)
31
+ else:
32
+ source = int(time.time())
33
+ members = self._db.members(key, source, count=count, _min=0, _max="(1000")
34
+ for member, priority in members:
35
+ # seed = Seed(member, priority=priority)
36
+ yield member.decode(), priority
37
+
38
+ def insert(self, key, items):
39
+ if items:
40
+ self._db.zadd(key, items, nx=True)
41
+
42
+ def reset(self, keys, reset_time=30):
43
+ if self._db.lock(self._reset_lock_key, t=120):
44
+
45
+ if isinstance(keys, str):
46
+ keys = [keys]
47
+
48
+ _min = reset_time - int(time.time()) if self.heartbeat else "-inf"
49
+
50
+ for key in keys:
51
+ self._db.members(key, 0, _min=_min, _max="(0")
52
+
53
+ if not self.heartbeat:
54
+ self.working.set()
55
+ time.sleep(10)
56
+
57
+ self._db.delete(self._reset_lock_key)
58
+
59
+ def refresh(self, key, items: dict[str, int]):
60
+ refresh_time = int(time.time())
61
+ its = {k: -refresh_time - v / 1000 for k, v in items}
62
+ self._db.zadd(key, item=its, xx=True)
63
+
64
+ def delete(self, key, values):
65
+ self._db.zrem(key, *values)
66
+
67
+
68
+
69
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.3.2
3
+ Version: 1.3.4
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -30,6 +30,7 @@ cobweb/pipelines/pipeline.py
30
30
  cobweb/pipelines/pipeline_console.py
31
31
  cobweb/pipelines/pipeline_loghub.py
32
32
  cobweb/schedulers/__init__.py
33
+ cobweb/schedulers/scheduler_api.py
33
34
  cobweb/schedulers/scheduler_redis.py
34
35
  cobweb/utils/__init__.py
35
36
  cobweb/utils/bloom.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
5
5
 
6
6
  setup(
7
7
  name="cobweb-launcher",
8
- version="1.3.2",
8
+ version="1.3.4",
9
9
  packages=find_packages(),
10
10
  url="https://github.com/Juannie-PP/cobweb",
11
11
  license="MIT",
@@ -1,161 +0,0 @@
1
- import time
2
- import threading
3
-
4
- from cobweb.db import ApiDB
5
- from cobweb.base import Seed, TaskQueue,logger, stop, pause
6
- from cobweb.constant import DealModel
7
- from .launcher import Launcher
8
-
9
-
10
- class LauncherApi(Launcher):
11
-
12
- def __init__(self, task, project, custom_setting=None, **kwargs):
13
- super().__init__(task, project, custom_setting, **kwargs)
14
- self._db = ApiDB()
15
-
16
- self._todo_key = "{%s:%s}:todo" % (project, task)
17
- self._done_key = "{%s:%s}:done" % (project, task)
18
- self._fail_key = "{%s:%s}:fail" % (project, task)
19
- self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
20
-
21
- self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
22
- self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
23
- self._speed_control_key = "speed_control:%s_%s" % (project, task)
24
-
25
- self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
26
-
27
- self._heartbeat_start_event = threading.Event()
28
-
29
- @property
30
- def heartbeat(self):
31
- return self._db.exists(self._heartbeat_key)
32
-
33
- def statistics(self, key, count):
34
- if not self.task_model and not self._db.exists(key):
35
- self._db.setex(key, 86400 * 30, int(count))
36
- else:
37
- self._db.incrby(key, count)
38
-
39
- def _get_seed(self) -> Seed:
40
- """
41
- 从队列中获取种子(频控)
42
- 设置时间窗口为self._time_window(秒),判断在该窗口内的采集量是否满足阈值(self._spider_max_speed)
43
- :return: True -> 种子, False -> None
44
- """
45
- if TaskQueue.TODO.length and not self._db.auto_incr(
46
- self._speed_control_key,
47
- t=self.time_window,
48
- limit=self.spider_max_count
49
- ):
50
- expire_time = self._db.ttl(self._speed_control_key)
51
- logger.info(f"Too fast! Please wait {expire_time} seconds...")
52
- time.sleep(expire_time / 2)
53
- return None
54
- return TaskQueue.TODO.pop()
55
-
56
- @stop
57
- def _reset(self):
58
- """
59
- 检查过期种子,重新添加到redis缓存中
60
- """
61
- if self._db.lock(self._reset_lock_key, t=120):
62
-
63
- _min = -int(time.time()) + self.seed_reset_seconds \
64
- if self.heartbeat else "-inf"
65
-
66
- self._db.members(self._todo_key, 0, _min=_min, _max="(0")
67
-
68
- if not self.heartbeat:
69
- self._heartbeat_start_event.set()
70
-
71
- self._db.delete(self._reset_lock_key)
72
-
73
- time.sleep(30)
74
-
75
- @stop
76
- def _refresh(self):
77
- """
78
- 刷新doing种子过期时间,防止reset重新消费
79
- """
80
- if self.doing_seeds:
81
- refresh_time = int(time.time())
82
- seeds = {k: -refresh_time - v / 1e3 for k, v in self.doing_seeds.items()}
83
- self._db.zadd(self._todo_key, item=seeds, xx=True)
84
- time.sleep(3)
85
-
86
- @stop
87
- def _scheduler(self):
88
- """
89
- 调度任务,获取redis队列种子,同时添加到doing字典中
90
- """
91
- if not self._db.zcount(self._todo_key, 0, "(1000"):
92
- time.sleep(self.scheduler_wait_seconds)
93
- elif TaskQueue.TODO.length >= self.todo_queue_size:
94
- time.sleep(self.todo_queue_full_wait_seconds)
95
- else:
96
- members = self._db.members(
97
- self._todo_key, int(time.time()),
98
- count=self.todo_queue_size,
99
- _min=0, _max="(1000"
100
- )
101
- for member, priority in members:
102
- seed = Seed(member, priority=priority)
103
- TaskQueue.TODO.push(seed)
104
- self.doing_seeds[seed.to_string] = seed.params.priority
105
-
106
- @pause
107
- def _heartbeat(self):
108
- if self._heartbeat_start_event.is_set():
109
- self._db.setex(self._heartbeat_key, t=5)
110
- time.sleep(3)
111
-
112
- @pause
113
- def _insert(self):
114
- """
115
- 添加新种子到redis队列中
116
- """
117
- seeds = {}
118
- for _ in range(self.new_queue_max_size):
119
- if seed := TaskQueue.SEED.pop():
120
- seeds[seed.to_string] = seed.params.priority
121
- if seeds:
122
- self._db.zadd(self._todo_key, seeds, nx=True)
123
- if TaskQueue.SEED.length < self.new_queue_max_size:
124
- time.sleep(self.new_queue_wait_seconds)
125
-
126
- @pause
127
- def _delete(self):
128
- """
129
- 删除队列种子,根据状态添加至成功或失败队列,移除doing字典种子索引
130
- """
131
- seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
132
- status = TaskQueue.DONE.length < self.done_queue_max_size
133
-
134
- for _ in range(self.done_queue_max_size):
135
- seed = TaskQueue.DONE.pop()
136
- if not seed:
137
- break
138
- if seed.params.seed_status == DealModel.fail:
139
- seed_info["failed"].append(seed.to_string)
140
- elif self.done_model == 1:
141
- seed_info["succeed"].append(seed.to_string)
142
- else:
143
- seed_info["common"].append(seed.to_string)
144
- seed_info['count'] += 1
145
-
146
- if seed_info["count"]:
147
-
148
- succeed_count = int(self._db.zrem(self._todo_key, *seed_info["common"]) or 0)
149
- succeed_count += int(self._db.done([self._todo_key, self._done_key], *seed_info["succeed"]) or 0)
150
- failed_count = int(self._db.done([self._todo_key, self._fail_key], *seed_info["failed"]) or 0)
151
-
152
- if failed_count:
153
- self.statistics(self._statistics_fail_key, failed_count)
154
- if succeed_count:
155
- self.statistics(self._statistics_done_key, succeed_count)
156
-
157
- self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])
158
-
159
- if status:
160
- time.sleep(self.done_queue_wait_seconds)
161
-
@@ -1,2 +0,0 @@
1
- from .launchers import LauncherAir, LauncherPro, LauncherApi
2
- from .constant import CrawlerModel
@@ -1,88 +0,0 @@
1
- import time
2
-
3
- from cobweb.base import logger
4
- from cobweb.constant import LogTemplate
5
- from .launcher import Launcher, check_pause
6
-
7
-
8
- class LauncherAir(Launcher):
9
-
10
- # def _scheduler(self):
11
- # if self.start_seeds:
12
- # self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
13
-
14
- @check_pause
15
- def _insert(self):
16
- seeds = {}
17
- status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
18
- for _ in range(self._new_queue_max_size):
19
- seed = self.__LAUNCHER_QUEUE__['new'].pop()
20
- if not seed:
21
- break
22
- seeds[seed.to_string] = seed.params.priority
23
- if seeds:
24
- self.__LAUNCHER_QUEUE__['todo'].push(seeds)
25
- if status:
26
- time.sleep(self._new_queue_wait_seconds)
27
-
28
- @check_pause
29
- def _delete(self):
30
- seeds = []
31
- status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
32
-
33
- for _ in range(self._done_queue_max_size):
34
- seed = self.__LAUNCHER_QUEUE__['done'].pop()
35
- if not seed:
36
- break
37
- seeds.append(seed.to_string)
38
-
39
- if seeds:
40
- self._remove_doing_seeds(seeds)
41
-
42
- if status:
43
- time.sleep(self._done_queue_wait_seconds)
44
-
45
- def _polling(self):
46
-
47
- check_emtpy_times = 0
48
-
49
- while not self._stop.is_set():
50
-
51
- queue_not_empty_count = 0
52
- pooling_wait_seconds = 30
53
-
54
- for q in self.__LAUNCHER_QUEUE__.values():
55
- if q.length != 0:
56
- queue_not_empty_count += 1
57
-
58
- if queue_not_empty_count == 0:
59
- pooling_wait_seconds = 3
60
- if self._pause.is_set():
61
- check_emtpy_times = 0
62
- if not self._task_model:
63
- logger.info("Done! Ready to close thread...")
64
- self._stop.set()
65
- elif check_emtpy_times > 2:
66
- self.__DOING__ = {}
67
- self._pause.set()
68
- else:
69
- logger.info(
70
- "check whether the task is complete, "
71
- f"reset times {3 - check_emtpy_times}"
72
- )
73
- check_emtpy_times += 1
74
- elif self._pause.is_set():
75
- self._pause.clear()
76
- self._execute()
77
- else:
78
- logger.info(LogTemplate.launcher_air_polling.format(
79
- task=self.task,
80
- doing_len=len(self.__DOING__.keys()),
81
- todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
82
- done_len=self.__LAUNCHER_QUEUE__['done'].length,
83
- upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
84
- ))
85
-
86
- time.sleep(pooling_wait_seconds)
87
-
88
-
File without changes