cobweb-launcher 1.2.49__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. cobweb/base/__init__.py +141 -4
  2. cobweb/base/basic.py +28 -82
  3. cobweb/base/common_queue.py +13 -0
  4. cobweb/base/dotting.py +1 -1
  5. cobweb/base/request.py +14 -2
  6. cobweb/base/seed.py +10 -6
  7. cobweb/constant.py +16 -0
  8. cobweb/crawlers/crawler.py +51 -181
  9. cobweb/db/redis_db.py +28 -0
  10. cobweb/launchers/__init__.py +2 -2
  11. cobweb/launchers/launcher.py +110 -141
  12. cobweb/launchers/launcher_api.py +66 -114
  13. cobweb/launchers/launcher_pro.py +76 -194
  14. cobweb/pipelines/base_pipeline.py +54 -0
  15. cobweb/pipelines/loghub_pipeline.py +34 -0
  16. cobweb/pipelines/pipeline.py +25 -49
  17. cobweb/setting.py +29 -6
  18. cobweb/utils/dotting.py +10 -42
  19. cobweb_/__init__.py +2 -0
  20. cobweb_/base/__init__.py +9 -0
  21. cobweb_/base/common_queue.py +30 -0
  22. cobweb_/base/decorators.py +40 -0
  23. cobweb_/base/item.py +46 -0
  24. cobweb_/base/log.py +94 -0
  25. cobweb_/base/request.py +82 -0
  26. cobweb_/base/response.py +23 -0
  27. cobweb_/base/seed.py +114 -0
  28. cobweb_/constant.py +94 -0
  29. cobweb_/crawlers/__init__.py +1 -0
  30. cobweb_/crawlers/crawler.py +184 -0
  31. cobweb_/db/__init__.py +2 -0
  32. cobweb_/db/api_db.py +82 -0
  33. cobweb_/db/redis_db.py +130 -0
  34. cobweb_/exceptions/__init__.py +1 -0
  35. cobweb_/exceptions/oss_db_exception.py +28 -0
  36. cobweb_/launchers/__init__.py +3 -0
  37. cobweb_/launchers/launcher.py +235 -0
  38. cobweb_/launchers/launcher_air.py +88 -0
  39. cobweb_/launchers/launcher_api.py +221 -0
  40. cobweb_/launchers/launcher_pro.py +222 -0
  41. cobweb_/pipelines/__init__.py +3 -0
  42. cobweb_/pipelines/pipeline.py +69 -0
  43. cobweb_/pipelines/pipeline_console.py +22 -0
  44. cobweb_/pipelines/pipeline_loghub.py +34 -0
  45. cobweb_/setting.py +74 -0
  46. cobweb_/utils/__init__.py +5 -0
  47. cobweb_/utils/bloom.py +58 -0
  48. cobweb_/utils/dotting.py +32 -0
  49. cobweb_/utils/oss.py +94 -0
  50. cobweb_/utils/tools.py +42 -0
  51. {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.1.dist-info}/METADATA +1 -1
  52. cobweb_launcher-1.3.1.dist-info/RECORD +108 -0
  53. cobweb_launcher-1.3.1.dist-info/top_level.txt +2 -0
  54. cobweb_new/__init__.py +2 -0
  55. cobweb_new/base/__init__.py +72 -0
  56. cobweb_new/base/common_queue.py +53 -0
  57. cobweb_new/base/decorators.py +72 -0
  58. cobweb_new/base/item.py +46 -0
  59. cobweb_new/base/log.py +94 -0
  60. cobweb_new/base/request.py +82 -0
  61. cobweb_new/base/response.py +23 -0
  62. cobweb_new/base/seed.py +118 -0
  63. cobweb_new/constant.py +105 -0
  64. cobweb_new/crawlers/__init__.py +1 -0
  65. cobweb_new/crawlers/crawler-new.py +85 -0
  66. cobweb_new/crawlers/crawler.py +170 -0
  67. cobweb_new/db/__init__.py +2 -0
  68. cobweb_new/db/api_db.py +82 -0
  69. cobweb_new/db/redis_db.py +158 -0
  70. cobweb_new/exceptions/__init__.py +1 -0
  71. cobweb_new/exceptions/oss_db_exception.py +28 -0
  72. cobweb_new/launchers/__init__.py +3 -0
  73. cobweb_new/launchers/launcher.py +237 -0
  74. cobweb_new/launchers/launcher_air.py +88 -0
  75. cobweb_new/launchers/launcher_api.py +161 -0
  76. cobweb_new/launchers/launcher_pro.py +96 -0
  77. cobweb_new/launchers/tesss.py +47 -0
  78. cobweb_new/pipelines/__init__.py +3 -0
  79. cobweb_new/pipelines/pipeline.py +68 -0
  80. cobweb_new/pipelines/pipeline_console.py +22 -0
  81. cobweb_new/pipelines/pipeline_loghub.py +34 -0
  82. cobweb_new/setting.py +95 -0
  83. cobweb_new/utils/__init__.py +5 -0
  84. cobweb_new/utils/bloom.py +58 -0
  85. cobweb_new/utils/oss.py +94 -0
  86. cobweb_new/utils/tools.py +42 -0
  87. cobweb/schedulers/__init__.py +0 -3
  88. cobweb/schedulers/scheduler_api.py +0 -72
  89. cobweb/schedulers/scheduler_redis.py +0 -72
  90. cobweb_launcher-1.2.49.dist-info/RECORD +0 -44
  91. cobweb_launcher-1.2.49.dist-info/top_level.txt +0 -1
  92. {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.1.dist-info}/LICENSE +0 -0
  93. {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.1.dist-info}/WHEEL +0 -0
cobweb/base/__init__.py CHANGED
@@ -1,9 +1,146 @@
1
+ import time
2
+ import traceback
3
+ import threading
4
+
5
+ from functools import wraps
6
+ from inspect import isgenerator
7
+ from typing import Callable, Union
8
+
1
9
  from .common_queue import Queue
2
10
  from .response import Response
3
- from .request import Request
11
+ from .basic import Seed, Request, Response
4
12
  from .item import BaseItem, ConsoleItem
5
- from .seed import Seed
6
-
13
+ # from .seed import Seed
7
14
  from .log import logger
8
- from .decorators import decorator_oss_db
15
+ # from .dotting import LoghubDot
16
+
17
+
18
+ class TaskQueue:
19
+ TODO = Queue() # 任务种子队列
20
+ DOWNLOAD = Queue() # 下载任务队列
21
+
22
+ SEED = Queue() # 添加任务种子队列
23
+ REQUEST = Queue() # 请求队列
24
+ RESPONSE = Queue() # 响应队列
25
+ DONE = Queue() # 下载完成队列
26
+ UPLOAD = Queue() # 任务上传队列
27
+ DELETE = Queue() # 任务删除队列
28
+
29
+ # DOT = LoghubDot()
30
+
31
+ @staticmethod
32
+ def is_empty():
33
+ total_length = TaskQueue.SEED.length
34
+ total_length += TaskQueue.TODO.length
35
+ total_length += TaskQueue.REQUEST.length
36
+ total_length += TaskQueue.DOWNLOAD.length
37
+ total_length += TaskQueue.RESPONSE.length
38
+ total_length += TaskQueue.UPLOAD.length
39
+ total_length += TaskQueue.DONE.length
40
+ total_length += TaskQueue.DELETE.length
41
+ return not bool(total_length)
42
+
43
+ @staticmethod
44
+ def process_task(it: Union[Seed, Request, Response, BaseItem], crawler_func: Callable):
45
+ try:
46
+ iterators = crawler_func(it)
47
+ if not isgenerator(iterators):
48
+ raise TypeError(f"{crawler_func.__name__} function isn't a generator")
49
+ for tk in iterators:
50
+ if isinstance(tk, Request):
51
+ TaskQueue.REQUEST.push(tk)
52
+ elif isinstance(tk, Response):
53
+ TaskQueue.RESPONSE.push(tk)
54
+ elif isinstance(tk, BaseItem):
55
+ TaskQueue.UPLOAD.push(tk)
56
+ elif isinstance(tk, Seed):
57
+ TaskQueue.SEED.push(tk)
58
+ else:
59
+ raise TypeError(f"{crawler_func.__name__} function return type isn't supported")
60
+ # TaskQueue.DOT.build(
61
+ # topic=f"{self.project}:{self.task}",
62
+ # cost_time=end_time - start_time,
63
+ # **download_item.to_dict
64
+ # )
65
+ # todo: 数据打点
66
+ except Exception as e:
67
+ it.params.retry += 1
68
+ if isinstance(it, Request):
69
+ TaskQueue.REQUEST.push(it)
70
+ elif isinstance(it, Response):
71
+ TaskQueue.RESPONSE.push(it)
72
+ elif isinstance(it, Seed):
73
+ TaskQueue.SEED.push(it)
74
+ time.sleep(1)
75
+
76
+
77
+ class Decorators:
78
+
79
+ @staticmethod
80
+ def add_thread(num=1):
81
+ def decorator(func):
82
+ @wraps(func)
83
+ def wrapper(self, *args):
84
+ for i in range(num):
85
+ name = func.__name__ + "_" + str(i) if num > 1 else func.__name__
86
+ self._threads.append(threading.Thread(name=name, target=func, args=(self,) + args))
87
+
88
+ return wrapper
89
+
90
+ return decorator
91
+
92
+ @staticmethod
93
+ def pause(func):
94
+ @wraps(func)
95
+ def wrapper(self, *args, **kwargs):
96
+ while not self.pause.is_set():
97
+ try:
98
+ func(self, *args, **kwargs)
99
+ except Exception as e:
100
+ logger.info(f"{func.__name__}: " + str(e))
101
+ finally:
102
+ time.sleep(0.1)
103
+ logger.info(f"{func.__name__}: close!")
104
+
105
+ return wrapper
106
+
107
+ @staticmethod
108
+ def stop(func):
109
+ @wraps(func)
110
+ def wrapper(self, *args, **kwargs):
111
+ while not self.stop.is_set():
112
+ try:
113
+ func(self, *args, **kwargs)
114
+ except Exception as e:
115
+ logger.info(
116
+ f"{func.__name__} exception: \n" +
117
+ ''.join(traceback.format_exception(type(e), e, e.__traceback__))
118
+ )
119
+ finally:
120
+ time.sleep(0.1)
121
+
122
+ return wrapper
123
+
124
+ @staticmethod
125
+ def decorator_oss_db(exception, retries=3):
126
+ def decorator(func):
127
+ @wraps(func)
128
+ def wrapper(callback_func, *args, **kwargs):
129
+ result = None
130
+ for i in range(retries):
131
+ msg = None
132
+ try:
133
+ return func(callback_func, *args, **kwargs)
134
+ except Exception as e:
135
+ result = None
136
+ msg = e
137
+ finally:
138
+ if result:
139
+ return result
140
+
141
+ if i >= 2 and msg:
142
+ raise exception(msg)
143
+
144
+ return wrapper
9
145
 
146
+ return decorator
cobweb/base/basic.py CHANGED
@@ -15,21 +15,11 @@ class Params:
15
15
 
16
16
 
17
17
  class Seed:
18
- __SEED_PARAMS__ = [
19
- "retry",
20
- "priority",
21
- "version",
22
- "status"
23
- ]
24
18
 
25
19
  def __init__(
26
20
  self,
27
21
  seed,
28
- sid=None,
29
- retry=None,
30
- priority=None,
31
- version=None,
32
- status=None,
22
+ params = Params(),
33
23
  **kwargs
34
24
  ):
35
25
  if any(isinstance(seed, t) for t in (str, bytes)):
@@ -47,27 +37,11 @@ class Seed:
47
37
  f"seed: {seed}"
48
38
  ))
49
39
 
50
- seed_params = {
51
- "retry": retry,
52
- "priority": priority,
53
- "version": version,
54
- "status": status,
55
- }
56
-
57
40
  if kwargs:
58
- # for k, v in kwargs.items():
59
- # if k in seed_params.keys():
60
- # seed_params[k] = v
61
- # else:
62
- # self.__setattr__(k, v)
63
41
  self._init_seed(kwargs)
64
- seed_params.update({
65
- k: v for k, v in kwargs.items()
66
- if k in self.__SEED_PARAMS__
67
- })
68
- if sid or not getattr(self, "sid", None):
69
- self._init_id(sid)
70
- self.params = Params(**seed_params)
42
+ if not getattr(self, "sid", None):
43
+ self._init_id()
44
+ self.params = params or Params()
71
45
 
72
46
  def __getattr__(self, name):
73
47
  return None
@@ -85,14 +59,13 @@ class Seed:
85
59
  chars = [f"{k}={v}" for k, v in self.__dict__.items()]
86
60
  return f'{self.__class__.__name__}({", ".join(chars)})'
87
61
 
88
- def _init_seed(self, seed_info: dict):
62
+ def _init_seed(self, seed_info:dict):
89
63
  for k, v in seed_info.items():
90
64
  if k not in self.__SEED_PARAMS__:
91
65
  self.__setattr__(k, v)
92
66
 
93
- def _init_id(self, sid):
94
- if not sid:
95
- sid = hashlib.md5(self.to_string.encode()).hexdigest()
67
+ def _init_id(self):
68
+ sid = hashlib.md5(self.to_string.encode()).hexdigest()
96
69
  self.__setattr__("sid", sid)
97
70
 
98
71
  @property
@@ -112,15 +85,16 @@ class Seed:
112
85
 
113
86
  @property
114
87
  def seed(self):
115
- return self.to_string
88
+ return self
116
89
 
117
90
 
118
91
  class Request:
92
+
119
93
  __SEED_PARAMS__ = [
120
94
  "retry",
121
95
  "priority",
122
- "version",
123
- "status"
96
+ "seed_version",
97
+ "seed_status"
124
98
  ]
125
99
 
126
100
  __REQUEST_ATTRS__ = {
@@ -142,34 +116,27 @@ class Request:
142
116
 
143
117
  def __init__(
144
118
  self,
145
- # url,
119
+ url,
146
120
  seed,
147
121
  random_ua=True,
148
122
  check_status_code=True,
149
123
  retry=None,
150
124
  priority=None,
151
- version=None,
152
- status=None,
125
+ seed_version=None,
126
+ seed_status=None,
153
127
  **kwargs
154
128
  ):
155
- # self.url = url
129
+ self.url = url
156
130
  self.check_status_code = check_status_code
157
131
  self.request_setting = {}
158
132
 
159
133
  seed_params = {
160
134
  "retry": retry,
161
135
  "priority": priority,
162
- "version": version,
163
- "status": status,
136
+ "seed_version": seed_version,
137
+ "seed_status": seed_status,
164
138
  }
165
139
 
166
- if isinstance(seed, Seed):
167
- kwargs.update(**seed.to_dict)
168
- elif isinstance(seed, str):
169
- kwargs.update(**json.loads(seed))
170
- elif isinstance(seed, dict):
171
- kwargs.update(**seed)
172
-
173
140
  for k, v in kwargs.items():
174
141
  if k in self.__class__.__REQUEST_ATTRS__:
175
142
  self.request_setting[k] = v
@@ -185,7 +152,12 @@ class Request:
185
152
  self._build_header()
186
153
 
187
154
  self.params = Params(**seed_params)
188
- # self.seed = self.to_string
155
+
156
+ if isinstance(seed, Seed):
157
+ kwargs.update(**seed.to_dict)
158
+ elif isinstance(seed, str):
159
+ kwargs.update(**json.loads(seed))
160
+ self.seed = self.to_string
189
161
 
190
162
  @property
191
163
  def _random_ua(self) -> str:
@@ -211,19 +183,10 @@ class Request:
211
183
  response.raise_for_status()
212
184
  return response
213
185
 
214
- def __getattr__(self, name):
215
- return None
216
-
217
- def __setitem__(self, key, value):
218
- setattr(self, key, value)
219
-
220
- def __getitem__(self, item):
221
- return getattr(self, item)
222
-
223
186
  @property
224
187
  def to_dict(self):
225
188
  _dict = self.__dict__.copy()
226
- # _dict.pop('seed')
189
+ _dict.pop('seed')
227
190
  _dict.pop('params')
228
191
  _dict.pop('check_status_code')
229
192
  # _dict.pop('request_setting')
@@ -237,10 +200,6 @@ class Request:
237
200
  separators=(",", ":")
238
201
  )
239
202
 
240
- @property
241
- def seed(self):
242
- return self.to_string
243
-
244
203
 
245
204
  class Response:
246
205
 
@@ -250,8 +209,8 @@ class Response:
250
209
  response,
251
210
  retry=None,
252
211
  priority=None,
253
- version=None,
254
- status=None,
212
+ seed_version=None,
213
+ seed_status=None,
255
214
  **kwargs
256
215
  ):
257
216
  self.seed = seed
@@ -259,24 +218,20 @@ class Response:
259
218
  seed_params = {
260
219
  "retry": retry,
261
220
  "priority": priority,
262
- "version": version,
263
- "status": status,
221
+ "seed_version": seed_version,
222
+ "seed_status": seed_status,
264
223
  }
265
224
  for k, v in kwargs.items():
266
225
  if k in seed_params.keys():
267
226
  seed_params[k] = v
268
227
  else:
269
228
  self.__setattr__(k, v)
270
- self.params = Params(**seed_params)
271
229
 
272
230
  @property
273
231
  def to_dict(self):
274
232
  _dict = self.__dict__.copy()
275
233
  _dict.pop('seed')
276
234
  _dict.pop('response')
277
- _dict.pop('method')
278
- _dict.pop('params')
279
- _dict.pop('request_setting')
280
235
  return _dict
281
236
 
282
237
  @property
@@ -286,12 +241,3 @@ class Response:
286
241
  ensure_ascii=False,
287
242
  separators=(",", ":")
288
243
  )
289
-
290
- def __getattr__(self, name):
291
- return None
292
-
293
- def __setitem__(self, key, value):
294
- setattr(self, key, value)
295
-
296
- def __getitem__(self, item):
297
- return getattr(self, item)
cobweb/base/common_queue.py CHANGED
@@ -1,3 +1,4 @@
1
+ import time
1
2
  from collections import deque
2
3
 
3
4
 
@@ -28,3 +29,15 @@ class Queue:
28
29
  return None
29
30
  except AttributeError:
30
31
  return None
32
+
33
+ def clear(self):
34
+ self._queue.clear()
35
+
36
+ def get(self):
37
+ try:
38
+ yield self._queue.popleft()
39
+ except IndexError:
40
+ time.sleep(1)
41
+ yield None
42
+ except AttributeError:
43
+ yield None
cobweb/base/dotting.py CHANGED
@@ -27,7 +27,7 @@ class LoghubDot:
27
27
  log_items.append(log_item)
28
28
  request = PutLogsRequest(
29
29
  project="databee-download-log",
30
- logstore="download-logging",
30
+ logstore="cobweb_log",
31
31
  topic=topic,
32
32
  logitems=log_items,
33
33
  compress=True
cobweb/base/request.py CHANGED
@@ -1,3 +1,4 @@
1
+ import json
1
2
  import random
2
3
  import requests
3
4
 
@@ -30,7 +31,6 @@ class Request:
30
31
  **kwargs
31
32
  ):
32
33
  self.url = url
33
- self.seed = seed
34
34
  self.check_status_code = check_status_code
35
35
  self.request_setting = {}
36
36
 
@@ -46,6 +46,12 @@ class Request:
46
46
  if random_ua:
47
47
  self._build_header()
48
48
 
49
+ if isinstance(seed, Seed):
50
+ self.seed = seed.to_string
51
+ else:
52
+ kwargs.update(**seed.to_dict)
53
+ self.seed = self.to_string
54
+
49
55
  @property
50
56
  def _random_ua(self) -> str:
51
57
  v1 = random.randint(4, 15)
@@ -73,10 +79,16 @@ class Request:
73
79
  @property
74
80
  def to_dict(self):
75
81
  _dict = self.__dict__.copy()
76
- _dict.pop('url')
77
82
  _dict.pop('seed')
78
83
  _dict.pop('check_status_code')
79
84
  _dict.pop('request_setting')
80
85
  return _dict
81
86
 
87
+ @property
88
+ def to_string(self) -> str:
89
+ return json.dumps(
90
+ self.to_dict,
91
+ ensure_ascii=False,
92
+ separators=(",", ":")
93
+ )
82
94
 
cobweb/base/seed.py CHANGED
@@ -104,11 +104,15 @@ class Seed:
104
104
  separators=(",", ":")
105
105
  )
106
106
 
107
+ # @property
108
+ # def get_all(self):
109
+ # return json.dumps(
110
+ # self.__dict__,
111
+ # ensure_ascii=False,
112
+ # separators=(",", ":")
113
+ # )
114
+
107
115
  @property
108
- def get_all(self):
109
- return json.dumps(
110
- self.__dict__,
111
- ensure_ascii=False,
112
- separators=(",", ":")
113
- )
116
+ def seed(self):
117
+ return self.to_string
114
118
 
cobweb/constant.py CHANGED
@@ -37,6 +37,22 @@ class LogTemplate:
37
37
  ----------------------- end - console pipeline ------------------
38
38
  """
39
39
 
40
+ launcher_polling = """
41
+ ----------------------- start - 轮训日志: {task} -----------------
42
+ 正在运行任务
43
+ 构造请求任务数: {memory_todo_count}
44
+ 正在下载任务数: {memory_download_count}
45
+ 任务内存队列
46
+ 待构造请求队列: {todo_queue_len}
47
+ 待删除请求队列: {delete_queue_len}
48
+ 待进行下载队列: {request_queue_len}
49
+ 待解析响应队列: {response_queue_len}
50
+ 待删除下载队列: {done_queue_len}
51
+ 存储队列
52
+ 待上传数据队列: {upload_queue_len}
53
+ ----------------------- end - 轮训日志: {task} ------------------
54
+ """
55
+
40
56
  launcher_air_polling = """
41
57
  ----------------------- start - 轮训日志: {task} -----------------
42
58
  内存队列