cobweb-launcher 1.2.25__py3-none-any.whl → 3.2.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. cobweb/__init__.py +4 -1
  2. cobweb/base/__init__.py +3 -3
  3. cobweb/base/common_queue.py +37 -16
  4. cobweb/base/item.py +35 -16
  5. cobweb/base/{log.py → logger.py} +3 -3
  6. cobweb/base/request.py +741 -54
  7. cobweb/base/response.py +380 -13
  8. cobweb/base/seed.py +96 -48
  9. cobweb/base/task_queue.py +180 -0
  10. cobweb/base/test.py +257 -0
  11. cobweb/constant.py +10 -1
  12. cobweb/crawlers/crawler.py +12 -155
  13. cobweb/db/api_db.py +3 -2
  14. cobweb/db/redis_db.py +117 -28
  15. cobweb/launchers/__init__.py +4 -3
  16. cobweb/launchers/distributor.py +141 -0
  17. cobweb/launchers/launcher.py +95 -157
  18. cobweb/launchers/uploader.py +68 -0
  19. cobweb/log_dots/__init__.py +2 -0
  20. cobweb/log_dots/dot.py +258 -0
  21. cobweb/log_dots/loghub_dot.py +53 -0
  22. cobweb/pipelines/__init__.py +1 -1
  23. cobweb/pipelines/pipeline.py +5 -55
  24. cobweb/pipelines/pipeline_csv.py +25 -0
  25. cobweb/pipelines/pipeline_loghub.py +32 -12
  26. cobweb/schedulers/__init__.py +1 -0
  27. cobweb/schedulers/scheduler.py +66 -0
  28. cobweb/schedulers/scheduler_with_redis.py +189 -0
  29. cobweb/setting.py +27 -40
  30. cobweb/utils/__init__.py +5 -3
  31. cobweb/utils/bloom.py +58 -58
  32. cobweb/{base → utils}/decorators.py +14 -12
  33. cobweb/utils/dotting.py +300 -0
  34. cobweb/utils/oss.py +113 -94
  35. cobweb/utils/tools.py +3 -15
  36. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.18.dist-info}/METADATA +31 -43
  37. cobweb_launcher-3.2.18.dist-info/RECORD +44 -0
  38. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.18.dist-info}/WHEEL +1 -1
  39. cobweb/crawlers/base_crawler.py +0 -144
  40. cobweb/crawlers/file_crawler.py +0 -98
  41. cobweb/launchers/launcher_air.py +0 -88
  42. cobweb/launchers/launcher_api.py +0 -221
  43. cobweb/launchers/launcher_pro.py +0 -222
  44. cobweb/pipelines/base_pipeline.py +0 -54
  45. cobweb/pipelines/loghub_pipeline.py +0 -34
  46. cobweb/pipelines/pipeline_console.py +0 -22
  47. cobweb_launcher-1.2.25.dist-info/RECORD +0 -40
  48. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.18.dist-info}/LICENSE +0 -0
  49. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.18.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,300 @@
1
+ import os
2
+ import json
3
+ import time
4
+ from threading import Event
5
+ from requests import RequestException, Response as requests_Response
6
+
7
+ from cobweb.base import Queue, Request, Seed, Response, BaseItem, logger
8
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
9
+
10
+
11
class LoghubDot:
    """Buffers structured crawl-stage log records and ships them to Aliyun
    Loghub in batches from a background loop (see ``_build_run``)."""

    def __init__(self, stop: Event, project: str, task: str) -> None:
        """
        Args:
            stop: event that signals the flush loop (``_build_run``) to exit.
            project: project name stamped onto every log record.
            task: task name stamped onto every log record.
        """
        self._stop = stop
        self._queue = Queue()
        # Credentials are read from the environment; LogClient does not
        # validate them until the first request is actually issued.
        self._client = LogClient(
            endpoint=os.getenv("LOGHUB_ENDPOINT"),
            accessKeyId=os.getenv("LOGHUB_ACCESS_KEY"),
            accessKey=os.getenv("LOGHUB_SECRET_KEY")
        )
        self.project = project
        self.task = task

    def _push_log(self, log_data: dict) -> None:
        """JSON-encode every non-string value, sort the fields by key, and
        enqueue the resulting LogItem for the background flush loop.

        Centralizes the serialize/sort/push tail that each ``_build_*``
        method previously duplicated verbatim.
        """
        log_item = LogItem()
        contents = sorted(
            (key, value if isinstance(value, str)
             else json.dumps(value, ensure_ascii=False))
            for key, value in log_data.items()
        )
        log_item.set_contents(contents)
        self._queue.push(log_item)

    def logging(self, topic, msg):
        """Enqueue a free-form record; *topic* becomes the ``stage`` field."""
        self._push_log({
            "stage": topic,
            "message": msg,
            "project": self.project,
            "task": self.task,
        })

    def _build_request_log(self, request_item: Request):
        """Enqueue a log record for the request-building stage."""
        seed: Seed = request_item.seed
        get_time = seed.params.get_time
        start_time = seed.params.start_time
        request_time = seed.params.request_time
        # The request stage is the first timed stage, so stage_cost and the
        # cumulative cost are the same value here.
        cost = request_time - start_time

        request_settings = json.dumps(
            request_item.request_settings,
            ensure_ascii=False, separators=(',', ':')
        )

        self._push_log({
            "stage": "request",
            "project": self.project,
            "task": self.task,
            "seed": seed.to_string,
            "request": repr(request_item),
            "request_settings": request_settings,
            "get_time": get_time,
            "start_time": start_time,
            "stage_cost": cost,
            "cost": cost,
            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(request_time)),
        })

    def _build_download_log(self, response_item: Response):
        """Enqueue a log record for the download stage.

        When the wrapped object is a ``requests.Response``, raw request and
        response details are attached as ``request_info``/``response_info``.

        Args:
            response_item: the response object produced by the downloader.
        """
        seed: Seed = response_item.seed
        get_time = seed.params.get_time
        start_time = seed.params.start_time
        request_time = seed.params.request_time
        download_time = seed.params.download_time
        stage_cost = download_time - request_time
        cost = download_time - start_time

        log_data = {
            "stage": "download",
            "project": self.project,
            "task": self.task,
            "seed": seed.to_string,
            "response": repr(response_item),
            "get_time": get_time,
            "start_time": start_time,
            "request_time": request_time,
            "download_time": download_time,
            "stage_cost": stage_cost,
            "cost": cost,
            "proxy": seed.params.proxy or '-',
            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(download_time)),
        }

        response = response_item.response
        if isinstance(response, requests_Response):
            log_data['request_info'] = {
                'method': response.request.method,
                'url': response.request.url,
                'headers': dict(response.request.headers),
                'body': response.request.body or "-",
            }
            log_data['response_info'] = {
                "status_code": response.status_code,
                "reason": response.reason,
                "headers": dict(response.headers),
                "content": response.text[:500],  # body truncated to keep records small
                "content_type": response.headers.get('content-type', '-'),
                "content_length": response.headers.get('content-length', '-'),
                "server": response.headers.get('server', '-'),
                "date": response.headers.get('date', '-'),
            }

        self._push_log(log_data)

    def _build_parse_log(self, parse_item: BaseItem):
        """Enqueue a log record for the parse stage."""
        seed: Seed = parse_item.seed
        get_time = seed.params.get_time
        start_time = seed.params.start_time
        request_time = seed.params.request_time
        response_time = seed.params.response_time
        parse_time = seed.params.parse_time

        # NOTE(review): request_time takes precedence here, so response_time
        # is only used when request_time is falsy — confirm this is intended,
        # since the parse stage normally follows the response/download stage.
        pre_time = request_time or response_time
        stage_cost = parse_time - pre_time
        cost = parse_time - start_time

        self._push_log({
            "stage": "parse",
            "project": self.project,
            "task": self.task,
            "seed": seed.to_string,
            "parse": repr(parse_item),
            "get_time": get_time,
            "start_time": start_time,
            "parse_time": parse_time,
            "stage_cost": stage_cost,
            "cost": cost,
            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(parse_time)),
        })

    def _build_http_error_log(self, seed: Seed, e: RequestException):
        """Enqueue a log record for an HTTP-level failure.

        ``e.request`` / ``e.response`` may be ``None`` on a RequestException,
        hence the defensive ``getattr`` calls and ``if e.response`` guards.
        """
        status_code = getattr(e.response, 'status_code', '-')

        request_info = {
            'method': getattr(e.request, 'method', '-'),
            'url': getattr(e.request, 'url', '-'),
            'headers': dict(getattr(e.request, 'headers', {})),
            'body': getattr(e.request, 'body', '-'),
        }

        response_info = {
            'status_code': getattr(e.response, 'status_code', '-'),
            'reason': getattr(e.response, 'reason', '-'),
            'headers': dict(getattr(e.response, 'headers', {})),
            'content': getattr(e.response, 'text', '')[:500],
            'content_type': e.response.headers.get('content-type', '-') if e.response else '-',
            'content_length': e.response.headers.get('content-length', '-') if e.response else '-',
            'server': e.response.headers.get('server', '-') if e.response else '-',
            'date': e.response.headers.get('date', '-') if e.response else '-',
        }

        retry = seed.params.retry
        get_time = seed.params.get_time
        start_time = seed.params.start_time
        failed_time = seed.params.failed_time
        cost = failed_time - start_time

        self._push_log({
            "stage": "http_error",
            "project": self.project,
            "task": self.task,
            "seed": seed.to_string,
            "status_code": status_code,
            "request_info": request_info,
            "response_info": response_info,
            "retry": retry,
            "proxy": seed.params.proxy or '-',
            "exception_type": type(e).__name__,
            "exception_message": str(e),
            "traceback": seed.params.traceback or '-',
            "get_time": get_time,
            "start_time": start_time,
            "error_time": failed_time,
            "stage_cost": cost,
            "cost": cost,
            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(failed_time)),
        })

    def _build_exception_log(self, seed: Seed, e: Exception):
        """Enqueue a log record for a non-HTTP failure."""
        retry = seed.params.retry
        get_time = seed.params.get_time
        start_time = seed.params.start_time
        failed_time = seed.params.failed_time
        cost = failed_time - start_time

        self._push_log({
            "stage": "exception",
            "project": self.project,
            "task": self.task,
            "seed": seed.to_string,
            "retry": retry,
            "exception_type": type(e).__name__,
            "exception_message": str(e),
            "traceback": seed.params.traceback or '-',
            "proxy": seed.params.proxy or '-',
            "get_time": get_time,
            "start_time": start_time,
            "error_time": failed_time,
            "stage_cost": cost,
            "cost": cost,
            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(failed_time)),
        })

    def _build_run(self):
        """Background flush loop: collect batches of at most 1000 queued
        items (or ~10 seconds of collecting, whichever comes first) and push
        each batch to Loghub. Runs until the stop event is set; errors are
        logged and the loop continues."""
        while not self._stop.is_set():
            try:
                items = []
                start_time = int(time.time())

                while len(items) < 1000:
                    log_item = self._queue.pop()
                    if not log_item:
                        break
                    # Append before the time check so a popped item is never
                    # silently dropped when the 10s batching window expires
                    # (the original combined check discarded it).
                    items.append(log_item)
                    if int(time.time()) - start_time > 10:
                        break

                if items:
                    request = PutLogsRequest(
                        project="databee-download-log",
                        logstore="log",
                        topic="cobweb",
                        logitems=items,
                        compress=True
                    )
                    self._client.put_logs(request=request)
            except Exception as e:
                logger.info(str(e))
cobweb/utils/oss.py CHANGED
@@ -1,94 +1,113 @@
1
- from typing import List
2
- from cobweb import setting
3
- from requests import Response
4
- from oss2 import Auth, Bucket, models, PartIterator
5
- from cobweb.exceptions import oss_db_exception
6
- from cobweb.base.decorators import decorator_oss_db
7
-
8
-
9
- class OssUtil:
10
-
11
- def __init__(
12
- self,
13
- bucket=None,
14
- endpoint=None,
15
- access_key=None,
16
- secret_key=None,
17
- chunk_size=None,
18
- min_upload_size=None,
19
- **kwargs
20
- ):
21
- self.bucket = bucket or setting.OSS_BUCKET
22
- self.endpoint = endpoint or setting.OSS_ENDPOINT
23
- self.chunk_size = int(chunk_size or setting.OSS_CHUNK_SIZE)
24
- self.min_upload_size = int(min_upload_size or setting.OSS_MIN_UPLOAD_SIZE)
25
-
26
- self._auth = Auth(
27
- access_key_id=access_key or setting.OSS_ACCESS_KEY,
28
- access_key_secret=secret_key or setting.OSS_SECRET_KEY
29
- )
30
- self._client = Bucket(
31
- auth=self._auth,
32
- endpoint=self.endpoint,
33
- bucket_name=self.bucket,
34
- **kwargs
35
- )
36
-
37
- def exists(self, key: str) -> bool:
38
- return self._client.object_exists(key)
39
-
40
- def head(self, key: str) -> models.HeadObjectResult:
41
- return self._client.head_object(key)
42
-
43
- @decorator_oss_db(exception=oss_db_exception.OssDBInitPartError)
44
- def init_part(self, key) -> models.InitMultipartUploadResult:
45
- """初始化分片上传"""
46
- return self._client.init_multipart_upload(key)
47
-
48
- @decorator_oss_db(exception=oss_db_exception.OssDBPutObjError)
49
- def put(self, key, data) -> models.PutObjectResult:
50
- """文件上传"""
51
- return self._client.put_object(key, data)
52
-
53
- @decorator_oss_db(exception=oss_db_exception.OssDBPutPartError)
54
- def put_part(self, key, upload_id, position, data) -> models.PutObjectResult:
55
- """分片上传"""
56
- return self._client.upload_part(key, upload_id, position, data)
57
-
58
- def list_part(self, key, upload_id): # -> List[models.ListPartsResult]:
59
- """获取分片列表"""
60
- return [part_info for part_info in PartIterator(self._client, key, upload_id)]
61
-
62
- @decorator_oss_db(exception=oss_db_exception.OssDBMergeError)
63
- def merge(self, key, upload_id, parts=None) -> models.PutObjectResult:
64
- """合并分片"""
65
- headers = None if parts else {"x-oss-complete-all": "yes"}
66
- return self._client.complete_multipart_upload(key, upload_id, parts, headers=headers)
67
-
68
- @decorator_oss_db(exception=oss_db_exception.OssDBAppendObjError)
69
- def append(self, key, position, data) -> models.AppendObjectResult:
70
- """追加上传"""
71
- return self._client.append_object(key, position, data)
72
-
73
- def iter_data(self, data, chunk_size=None):
74
- chunk_size = chunk_size or self.chunk_size
75
- if isinstance(data, Response):
76
- for part_data in data.iter_content(chunk_size):
77
- yield part_data
78
- if isinstance(data, bytes):
79
- for i in range(0, len(data), chunk_size):
80
- yield data[i:i + chunk_size]
81
-
82
- def assemble(self, ready_data, data, chunk_size=None):
83
- upload_data = b""
84
- ready_data = ready_data + data
85
- chunk_size = chunk_size or self.chunk_size
86
- if len(ready_data) >= chunk_size:
87
- upload_data = ready_data[:chunk_size]
88
- ready_data = ready_data[chunk_size:]
89
- return ready_data, upload_data
90
-
91
- def content_length(self, key: str) -> int:
92
- head = self.head(key)
93
- return head.content_length
94
-
1
+ #
2
+ # from cobweb import setting
3
+ # from requests import Response
4
+ # from oss2 import Auth, Bucket, models, PartIterator
5
+ # from cobweb.exceptions import oss_db_exception
6
+ # from cobweb.utils.decorators import decorator_oss_db
7
+ #
8
+ #
9
+ # class OssUtil:
10
+ #
11
+ # def __init__(
12
+ # self,
13
+ # bucket=None,
14
+ # endpoint=None,
15
+ # access_key=None,
16
+ # secret_key=None,
17
+ # chunk_size=None,
18
+ # min_upload_size=None,
19
+ # **kwargs
20
+ # ):
21
+ # self.bucket = bucket or setting.OSS_BUCKET
22
+ # self.endpoint = endpoint or setting.OSS_ENDPOINT
23
+ # self.chunk_size = int(chunk_size or setting.OSS_CHUNK_SIZE)
24
+ # self.min_upload_size = int(min_upload_size or setting.OSS_MIN_UPLOAD_SIZE)
25
+ #
26
+ # self.failed_count = 0
27
+ # self._kw = kwargs
28
+ #
29
+ # self._auth = Auth(
30
+ # access_key_id=access_key or setting.OSS_ACCESS_KEY,
31
+ # access_key_secret=secret_key or setting.OSS_SECRET_KEY
32
+ # )
33
+ # self._client = Bucket(
34
+ # auth=self._auth,
35
+ # endpoint=self.endpoint,
36
+ # bucket_name=self.bucket,
37
+ # **self._kw
38
+ # )
39
+ #
40
+ # def failed(self):
41
+ # self.failed_count += 1
42
+ # if self.failed_count >= 5:
43
+ # self._client = Bucket(
44
+ # auth=self._auth,
45
+ # endpoint=self.endpoint,
46
+ # bucket_name=self.bucket,
47
+ # **self._kw
48
+ # )
49
+ #
50
+ # def exists(self, key: str) -> bool:
51
+ # try:
52
+ # result = self._client.object_exists(key)
53
+ # self.failed_count = 0
54
+ # return result
55
+ # except Exception as e:
56
+ # self.failed()
57
+ # raise e
58
+ #
59
+ # def head(self, key: str) -> models.HeadObjectResult:
60
+ # return self._client.head_object(key)
61
+ #
62
+ # @decorator_oss_db(exception=oss_db_exception.OssDBInitPartError)
63
+ # def init_part(self, key) -> models.InitMultipartUploadResult:
64
+ # """初始化分片上传"""
65
+ # return self._client.init_multipart_upload(key)
66
+ #
67
+ # @decorator_oss_db(exception=oss_db_exception.OssDBPutObjError)
68
+ # def put(self, key, data) -> models.PutObjectResult:
69
+ # """文件上传"""
70
+ # return self._client.put_object(key, data)
71
+ #
72
+ # @decorator_oss_db(exception=oss_db_exception.OssDBPutPartError)
73
+ # def put_part(self, key, upload_id, position, data) -> models.PutObjectResult:
74
+ # """分片上传"""
75
+ # return self._client.upload_part(key, upload_id, position, data)
76
+ #
77
+ # def list_part(self, key, upload_id): # -> List[models.ListPartsResult]:
78
+ # """获取分片列表"""
79
+ # return [part_info for part_info in PartIterator(self._client, key, upload_id)]
80
+ #
81
+ # @decorator_oss_db(exception=oss_db_exception.OssDBMergeError)
82
+ # def merge(self, key, upload_id, parts=None) -> models.PutObjectResult:
83
+ # """合并分片"""
84
+ # headers = None if parts else {"x-oss-complete-all": "yes"}
85
+ # return self._client.complete_multipart_upload(key, upload_id, parts, headers=headers)
86
+ #
87
+ # @decorator_oss_db(exception=oss_db_exception.OssDBAppendObjError)
88
+ # def append(self, key, position, data) -> models.AppendObjectResult:
89
+ # """追加上传"""
90
+ # return self._client.append_object(key, position, data)
91
+ #
92
+ # def iter_data(self, data, chunk_size=None):
93
+ # chunk_size = chunk_size or self.chunk_size
94
+ # if isinstance(data, Response):
95
+ # for part_data in data.iter_content(chunk_size):
96
+ # yield part_data
97
+ # if isinstance(data, bytes):
98
+ # for i in range(0, len(data), chunk_size):
99
+ # yield data[i:i + chunk_size]
100
+ #
101
+ # def assemble(self, ready_data, data, chunk_size=None):
102
+ # upload_data = b""
103
+ # ready_data = ready_data + data
104
+ # chunk_size = chunk_size or self.chunk_size
105
+ # if len(ready_data) >= chunk_size:
106
+ # upload_data = ready_data[:chunk_size]
107
+ # ready_data = ready_data[chunk_size:]
108
+ # return ready_data, upload_data
109
+ #
110
+ # def content_length(self, key: str) -> int:
111
+ # head = self.head(key)
112
+ # return head.content_length
113
+ #
cobweb/utils/tools.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import re
2
2
  import hashlib
3
+ import inspect
3
4
  from typing import Union
4
5
  from importlib import import_module
5
6
 
@@ -10,18 +11,6 @@ def md5(text: Union[str, bytes]) -> str:
10
11
  return hashlib.md5(text).hexdigest()
11
12
 
12
13
 
13
- def build_path(site, url, file_type):
14
- return f"{site}/{md5(url)}.{file_type}"
15
-
16
-
17
- def format_size(content_length: int) -> str:
18
- units = ["KB", "MB", "GB", "TB"]
19
- for i in range(4):
20
- num = content_length / (1024 ** (i + 1))
21
- if num < 1024:
22
- return f"{round(num, 2)} {units[i]}"
23
-
24
-
25
14
  def dynamic_load_class(model_info):
26
15
  if isinstance(model_info, str):
27
16
  if "import" in model_info:
@@ -35,8 +24,7 @@ def dynamic_load_class(model_info):
35
24
  model = import_module(model_path)
36
25
  class_object = getattr(model, class_name)
37
26
  return class_object
27
+ elif inspect.isclass(model_info):
28
+ return model_info
38
29
  raise TypeError()
39
30
 
40
-
41
- # def download_log_info(item:dict) -> str:
42
- # return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])