cobweb-launcher 1.2.49__tar.gz → 1.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. {cobweb-launcher-1.2.49/cobweb_launcher.egg-info → cobweb-launcher-1.3.1}/PKG-INFO +1 -1
  2. cobweb-launcher-1.3.1/cobweb/base/__init__.py +146 -0
  3. cobweb-launcher-1.3.1/cobweb/base/basic.py +243 -0
  4. cobweb-launcher-1.3.1/cobweb/base/common_queue.py +43 -0
  5. cobweb-launcher-1.3.1/cobweb/base/dotting.py +35 -0
  6. cobweb-launcher-1.3.1/cobweb/base/request.py +94 -0
  7. cobweb-launcher-1.3.1/cobweb/base/seed.py +118 -0
  8. cobweb-launcher-1.3.1/cobweb/constant.py +110 -0
  9. cobweb-launcher-1.3.1/cobweb/crawlers/crawler.py +88 -0
  10. cobweb-launcher-1.3.1/cobweb/db/redis_db.py +158 -0
  11. cobweb-launcher-1.3.1/cobweb/launchers/__init__.py +3 -0
  12. cobweb-launcher-1.3.1/cobweb/launchers/launcher.py +204 -0
  13. cobweb-launcher-1.3.1/cobweb/launchers/launcher_api.py +161 -0
  14. cobweb-launcher-1.3.1/cobweb/launchers/launcher_pro.py +90 -0
  15. cobweb-launcher-1.3.1/cobweb/pipelines/pipeline.py +45 -0
  16. cobweb-launcher-1.3.1/cobweb/setting.py +97 -0
  17. cobweb-launcher-1.3.1/cobweb/utils/__init__.py +5 -0
  18. cobweb-launcher-1.3.1/cobweb_/__init__.py +2 -0
  19. cobweb-launcher-1.3.1/cobweb_/base/item.py +46 -0
  20. cobweb-launcher-1.3.1/cobweb_/base/log.py +94 -0
  21. cobweb-launcher-1.3.1/cobweb_/base/response.py +23 -0
  22. cobweb-launcher-1.3.1/cobweb_/crawlers/__init__.py +1 -0
  23. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/crawlers/crawler.py +8 -42
  24. cobweb-launcher-1.3.1/cobweb_/db/__init__.py +2 -0
  25. cobweb-launcher-1.3.1/cobweb_/db/api_db.py +82 -0
  26. cobweb-launcher-1.3.1/cobweb_/exceptions/__init__.py +1 -0
  27. cobweb-launcher-1.3.1/cobweb_/exceptions/oss_db_exception.py +28 -0
  28. cobweb-launcher-1.3.1/cobweb_/launchers/launcher_air.py +88 -0
  29. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/launchers/launcher_api.py +19 -7
  30. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/launchers/launcher_pro.py +23 -9
  31. cobweb-launcher-1.3.1/cobweb_/pipelines/__init__.py +3 -0
  32. cobweb-launcher-1.3.1/cobweb_/pipelines/pipeline_console.py +22 -0
  33. cobweb-launcher-1.3.1/cobweb_/pipelines/pipeline_loghub.py +34 -0
  34. cobweb-launcher-1.3.1/cobweb_/utils/bloom.py +58 -0
  35. cobweb-launcher-1.3.1/cobweb_/utils/dotting.py +32 -0
  36. cobweb-launcher-1.3.1/cobweb_/utils/oss.py +94 -0
  37. cobweb-launcher-1.3.1/cobweb_/utils/tools.py +42 -0
  38. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1/cobweb_launcher.egg-info}/PKG-INFO +1 -1
  39. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb_launcher.egg-info/SOURCES.txt +34 -4
  40. cobweb-launcher-1.3.1/cobweb_launcher.egg-info/top_level.txt +2 -0
  41. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/setup.py +1 -1
  42. cobweb-launcher-1.2.49/cobweb/crawlers/base_crawler.py +0 -144
  43. cobweb-launcher-1.2.49/cobweb/crawlers/file_crawler.py +0 -98
  44. cobweb-launcher-1.2.49/cobweb/utils/dotting.py +0 -64
  45. cobweb-launcher-1.2.49/cobweb_launcher.egg-info/top_level.txt +0 -1
  46. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/LICENSE +0 -0
  47. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/README.md +0 -0
  48. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/__init__.py +0 -0
  49. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/base/item.py +0 -0
  50. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/base/log.py +0 -0
  51. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/base/response.py +0 -0
  52. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/crawlers/__init__.py +0 -0
  53. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/db/__init__.py +0 -0
  54. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/db/api_db.py +0 -0
  55. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/exceptions/__init__.py +0 -0
  56. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/exceptions/oss_db_exception.py +0 -0
  57. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/launchers/launcher_air.py +0 -0
  58. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/pipelines/__init__.py +0 -0
  59. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/pipelines/pipeline_console.py +0 -0
  60. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/pipelines/pipeline_loghub.py +0 -0
  61. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/utils/bloom.py +0 -0
  62. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/utils/oss.py +0 -0
  63. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/utils/tools.py +0 -0
  64. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/base/__init__.py +0 -0
  65. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/base/common_queue.py +0 -0
  66. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/base/decorators.py +0 -0
  67. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/base/request.py +0 -0
  68. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/base/seed.py +0 -0
  69. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/constant.py +0 -0
  70. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/db/redis_db.py +0 -0
  71. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/launchers/__init__.py +0 -0
  72. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/launchers/launcher.py +0 -0
  73. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/pipelines/pipeline.py +0 -0
  74. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/setting.py +0 -0
  75. {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/utils/__init__.py +0 -0
  76. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  77. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb_launcher.egg-info/requires.txt +0 -0
  78. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/setup.cfg +0 -0
  79. {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/test/test.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.2.49
3
+ Version: 1.3.1
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -0,0 +1,146 @@
1
+ import time
2
+ import traceback
3
+ import threading
4
+
5
+ from functools import wraps
6
+ from inspect import isgenerator
7
+ from typing import Callable, Union
8
+
9
+ from .common_queue import Queue
10
+ from .response import Response
11
+ from .basic import Seed, Request, Response
12
+ from .item import BaseItem, ConsoleItem
13
+ # from .seed import Seed
14
+ from .log import logger
15
+ # from .dotting import LoghubDot
16
+
17
+
18
class TaskQueue:
    """Shared registry of the in-memory queues used by the crawl stages.

    All queues are class attributes, so every user of ``TaskQueue`` operates
    on the same ``Queue`` instances.
    """

    TODO = Queue()  # task seed queue
    DOWNLOAD = Queue()  # download task queue

    SEED = Queue()  # queue of newly added task seeds
    REQUEST = Queue()  # request queue
    RESPONSE = Queue()  # response queue
    DONE = Queue()  # finished-download queue
    UPLOAD = Queue()  # task upload queue
    DELETE = Queue()  # task delete queue

    # DOT = LoghubDot()

    @staticmethod
    def is_empty():
        """Return True when none of the queues holds any element."""
        queues = (
            TaskQueue.SEED,
            TaskQueue.TODO,
            TaskQueue.REQUEST,
            TaskQueue.DOWNLOAD,
            TaskQueue.RESPONSE,
            TaskQueue.UPLOAD,
            TaskQueue.DONE,
            TaskQueue.DELETE,
        )
        return not any(q.length for q in queues)

    @staticmethod
    def process_task(it: Union[Seed, Request, Response, BaseItem], crawler_func: Callable):
        """Run ``crawler_func(it)`` and route everything it yields to a queue.

        ``crawler_func`` must return a generator. Each yielded object is pushed
        to REQUEST, RESPONSE, UPLOAD or SEED depending on its type. On any
        exception the failure is logged, the task's retry counter (when it has
        one) is incremented, and the task is pushed back to the queue matching
        its own type before sleeping one second.
        """
        try:
            produced = crawler_func(it)
            if not isgenerator(produced):
                raise TypeError(f"{crawler_func.__name__} function isn't a generator")
            for tk in produced:
                if isinstance(tk, Request):
                    TaskQueue.REQUEST.push(tk)
                elif isinstance(tk, Response):
                    TaskQueue.RESPONSE.push(tk)
                elif isinstance(tk, BaseItem):
                    TaskQueue.UPLOAD.push(tk)
                elif isinstance(tk, Seed):
                    TaskQueue.SEED.push(tk)
                else:
                    raise TypeError(f"{crawler_func.__name__} function return type isn't supported")
                # TaskQueue.DOT.build(
                #     topic=f"{self.project}:{self.task}",
                #     cost_time=end_time - start_time,
                #     **download_item.to_dict
                # )
                # TODO: emit dotting/metrics data here
        except Exception as e:
            # Previously the error was dropped without a trace; log it so
            # retried tasks can be diagnosed.
            func_name = getattr(crawler_func, "__name__", repr(crawler_func))
            logger.info(
                f"{func_name} exception: \n" +
                ''.join(traceback.format_exception(type(e), e, e.__traceback__))
            )
            # Not every task type is guaranteed to carry ``params``; avoid
            # raising a second exception from inside the handler.
            params = getattr(it, "params", None)
            if params is not None:
                params.retry += 1
            if isinstance(it, Request):
                TaskQueue.REQUEST.push(it)
            elif isinstance(it, Response):
                TaskQueue.RESPONSE.push(it)
            elif isinstance(it, Seed):
                TaskQueue.SEED.push(it)
            time.sleep(1)
75
+
76
+
77
class Decorators:
    """Namespace of decorators used by launcher/worker methods."""

    @staticmethod
    def add_thread(num=1):
        """Register ``num`` threads targeting the decorated method.

        Calling the decorated method does not run it; it appends ``num``
        ``threading.Thread`` objects (not started) to ``self._threads``.
        Thread names are ``<func>_<i>`` when ``num > 1``, else ``<func>``.
        """
        def decorator(func):
            @wraps(func)
            def wrapper(self, *args):
                for i in range(num):
                    name = func.__name__ + "_" + str(i) if num > 1 else func.__name__
                    self._threads.append(threading.Thread(name=name, target=func, args=(self,) + args))

            return wrapper

        return decorator

    @staticmethod
    def pause(func):
        """Call the decorated method repeatedly until ``self.pause`` is set.

        Exceptions are logged and the loop continues; each iteration is
        followed by a 0.1 second sleep.
        """
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            while not self.pause.is_set():
                try:
                    func(self, *args, **kwargs)
                except Exception as e:
                    logger.info(f"{func.__name__}: " + str(e))
                finally:
                    time.sleep(0.1)
            logger.info(f"{func.__name__}: close!")

        return wrapper

    @staticmethod
    def stop(func):
        """Call the decorated method repeatedly until ``self.stop`` is set.

        Exceptions are logged with their full traceback and the loop
        continues; each iteration is followed by a 0.1 second sleep.
        """
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            while not self.stop.is_set():
                try:
                    func(self, *args, **kwargs)
                except Exception as e:
                    logger.info(
                        f"{func.__name__} exception: \n" +
                        ''.join(traceback.format_exception(type(e), e, e.__traceback__))
                    )
                finally:
                    time.sleep(0.1)

        return wrapper

    @staticmethod
    def decorator_oss_db(exception, retries=3):
        """Retry the decorated callable up to ``retries`` times.

        The first successful return value is returned immediately. If every
        attempt raises, ``exception`` is raised wrapping the last error.
        With ``retries <= 0`` the callable is never invoked and ``None`` is
        returned.

        The previous implementation compared the attempt index against a
        hard-coded 2, so any ``retries`` other than 3 either gave up early or
        silently returned ``None`` after failing.
        """
        def decorator(func):
            @wraps(func)
            def wrapper(callback_func, *args, **kwargs):
                last_error = None
                for _ in range(retries):
                    try:
                        return func(callback_func, *args, **kwargs)
                    except Exception as e:
                        # Keep a reference: ``e`` is unbound after the handler.
                        last_error = e
                if last_error is not None:
                    raise exception(last_error) from last_error
                return None

            return wrapper

        return decorator
@@ -0,0 +1,243 @@
1
+ import json
2
+ import random
3
+ import time
4
+ import hashlib
5
+ import requests
6
+
7
+
8
class Params:
    """Scheduling metadata attached to a task.

    Falsy ``retry``, ``priority`` and ``version`` values are replaced by
    ``0``, ``300`` and the current Unix timestamp (in seconds) respectively;
    ``status`` is stored as given.
    """

    def __init__(self, retry=None, priority=None, version=None, status=None):
        self.retry = retry if retry else 0
        self.priority = priority if priority else 300
        # The timestamp is only computed when no truthy version was supplied.
        self.version = version if version else int(time.time())
        self.status = status
+
16
+
17
class Seed:
    """A crawl task described by arbitrary attributes plus a ``params`` object.

    ``seed`` may be a dict, a JSON object encoded as ``str``/``bytes``, or any
    other string, which is stored as the ``url`` attribute. Extra keyword
    arguments become attributes as well. Unknown attributes read as ``None``.
    """

    # Keys that describe scheduling state rather than seed content; they are
    # never copied onto the seed as attributes. This class previously looked
    # the name up without defining it, which resolved to ``None`` through
    # ``__getattr__`` and made every dict/JSON seed fail with ``TypeError``.
    __SEED_PARAMS__ = [
        "retry",
        "priority",
        "seed_version",
        "seed_status"
    ]

    def __init__(
            self,
            seed,
            params=None,
            **kwargs
    ):
        """Build the seed.

        :param seed: dict, JSON object string/bytes, or a plain URL string.
        :param params: scheduling parameters; a fresh ``Params()`` is created
            when omitted (the former default was one instance shared by every
            seed, so retry counters leaked between tasks).
        :param kwargs: additional seed attributes.
        :raises TypeError: if ``seed`` is not ``str``, ``bytes`` or ``dict``.
        """
        if isinstance(seed, (str, bytes)):
            try:
                item = json.loads(seed)
            except json.JSONDecodeError:
                item = None
            if isinstance(item, dict):
                self._init_seed(item)
            else:
                # Not a JSON object (plain text, or JSON scalar/array):
                # treat the raw value as the URL.
                self.__setattr__("url", seed)
        elif isinstance(seed, dict):
            self._init_seed(seed)
        else:
            raise TypeError(
                f"seed type error, "
                f"must be str or dict! "
                f"seed: {seed}"
            )

        if kwargs:
            self._init_seed(kwargs)
        if not getattr(self, "sid", None):
            self._init_id()
        self.params = params or Params()

    def __getattr__(self, name):
        # Only called for attributes that are missing: they read as None.
        return None

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)

    def __str__(self):
        """Return all attributes, including params, as a JSON string."""
        data = self.__dict__.copy()
        params = data.get("params")
        if hasattr(params, "__dict__"):
            # The params object itself is not JSON-serialisable.
            data["params"] = vars(params)
        return json.dumps(data, ensure_ascii=False)

    def __repr__(self):
        chars = [f"{k}={v}" for k, v in self.__dict__.items()]
        return f'{self.__class__.__name__}({", ".join(chars)})'

    def _init_seed(self, seed_info: dict):
        """Copy every non-scheduling key of ``seed_info`` onto the seed."""
        for k, v in seed_info.items():
            if k not in self.__SEED_PARAMS__:
                self.__setattr__(k, v)

    def _init_id(self):
        """Set ``sid`` to the MD5 hex digest of the compact JSON seed."""
        sid = hashlib.md5(self.to_string.encode()).hexdigest()
        self.__setattr__("sid", sid)

    @property
    def to_dict(self) -> dict:
        """Seed attributes as a dict, without a truthy ``params`` entry."""
        seed = self.__dict__.copy()
        if seed.get("params"):
            del seed["params"]
        return seed

    @property
    def to_string(self) -> str:
        """Compact JSON encoding of ``to_dict``."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )

    @property
    def seed(self):
        return self
+
90
+
91
class Request:
    """An HTTP request description that can be executed with ``requests``.

    Keyword arguments named in ``__REQUEST_ATTRS__`` are collected in
    ``request_setting`` and forwarded to ``requests.request``; all other
    keyword arguments become plain attributes of the instance.
    """

    __SEED_PARAMS__ = [
        "retry",
        "priority",
        "seed_version",
        "seed_status"
    ]

    __REQUEST_ATTRS__ = {
        "params",
        "headers",
        "cookies",
        "data",
        "json",
        "files",
        "auth",
        "timeout",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "allow_redirects",
    }

    def __init__(
            self,
            url,
            seed,
            random_ua=True,
            check_status_code=True,
            retry=None,
            priority=None,
            seed_version=None,
            seed_status=None,
            **kwargs
    ):
        """Build the request.

        :param url: target URL.
        :param seed: originating seed (``Seed`` instance or JSON string).
        :param random_ua: add a random user-agent header when none is given.
        :param check_status_code: make ``download`` raise on HTTP errors.
        :param retry, priority, seed_version, seed_status: scheduling values
            stored on ``self.params``.
        :param kwargs: ``requests`` options and/or extra attributes.
        """
        self.url = url
        self.check_status_code = check_status_code
        self.request_setting = {}

        # The scheduling names are explicit parameters and therefore can never
        # appear in ``kwargs``; only two cases remain.
        for k, v in kwargs.items():
            if k in self.__class__.__REQUEST_ATTRS__:
                self.request_setting[k] = v
            else:
                self.__setattr__(k, v)

        if not getattr(self, "method", None):
            has_body = self.request_setting.get("data") or self.request_setting.get("json")
            self.method = "POST" if has_body else "GET"

        if random_ua:
            self._build_header()

        # ``Params`` names these fields ``version``/``status``; passing the
        # ``seed_*`` spellings as keywords raised TypeError.
        self.params = Params(
            retry=retry,
            priority=priority,
            version=seed_version,
            status=seed_status,
        )

        # NOTE(review): the merged seed fields are not read again after this
        # point, so they currently have no effect — confirm the intent.
        if isinstance(seed, Seed):
            kwargs.update(**seed.to_dict)
        elif isinstance(seed, str):
            kwargs.update(**json.loads(seed))
        self.seed = self.to_string

    @property
    def _random_ua(self) -> str:
        """Return a randomised macOS/Edge-style user-agent string."""
        v1 = random.randint(4, 15)
        v2 = random.randint(3, 11)
        v3 = random.randint(1, 16)
        v4 = random.randint(533, 605)
        v5 = random.randint(1000, 6000)
        v6 = random.randint(10, 80)
        user_agent = (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) AppleWebKit/{v4}.{v3} "
                      f"(KHTML, like Gecko) Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
        return user_agent

    def _build_header(self) -> None:
        """Ensure ``request_setting['headers']`` carries a user-agent."""
        if not self.request_setting.get("headers"):
            self.request_setting["headers"] = {"accept": "*/*", "user-agent": self._random_ua}
        elif "user-agent" not in [key.lower() for key in self.request_setting["headers"].keys()]:
            self.request_setting["headers"]["user-agent"] = self._random_ua

    def download(self) -> requests.Response:
        """Send the request; raise for HTTP errors if ``check_status_code``."""
        response = requests.request(self.method, self.url, **self.request_setting)
        if self.check_status_code:
            response.raise_for_status()
        return response

    @property
    def to_dict(self):
        """Instance attributes without seed, params and check_status_code."""
        _dict = self.__dict__.copy()
        # Tolerate missing keys: ``to_string`` is used in ``__init__`` before
        # ``seed`` has been assigned.
        for key in ('seed', 'params', 'check_status_code'):
            _dict.pop(key, None)
        # _dict.pop('request_setting')
        return _dict

    @property
    def to_string(self) -> str:
        """Compact JSON encoding of ``to_dict``."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )
+
203
+
204
class Response:
    """Pairs a downloaded response with the seed that produced it."""

    def __init__(
            self,
            seed,
            response,
            retry=None,
            priority=None,
            seed_version=None,
            seed_status=None,
            **kwargs
    ):
        """Build the response wrapper.

        :param seed: the originating seed.
        :param response: the downloaded response object.
        :param retry, priority, seed_version, seed_status: scheduling values
            stored on ``self.params``. They were previously collected and then
            discarded, leaving the instance without a ``params`` attribute.
        :param kwargs: extra attributes set on the instance.
        """
        self.seed = seed
        self.response = response
        self.params = Params(
            retry=retry,
            priority=priority,
            version=seed_version,
            status=seed_status,
        )
        # The scheduling names are explicit parameters, so ``kwargs`` can only
        # hold extra attributes.
        for k, v in kwargs.items():
            self.__setattr__(k, v)

    @property
    def to_dict(self):
        """Extra attributes only: seed, response and params are excluded."""
        _dict = self.__dict__.copy()
        for key in ('seed', 'response', 'params'):
            _dict.pop(key, None)
        return _dict

    @property
    def to_string(self) -> str:
        """Compact JSON encoding of ``to_dict``."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )
@@ -0,0 +1,43 @@
1
+ import time
2
+ from collections import deque
3
+
4
+
5
class Queue:
    """Thin wrapper around ``collections.deque`` with forgiving accessors."""

    def __init__(self):
        self._queue = deque()

    @property
    def length(self) -> int:
        """Number of elements currently stored."""
        return len(self._queue)

    def push(self, data, left: bool = False, direct_insertion: bool = False):
        """Add ``data`` to the queue; falsy ``data`` is ignored.

        Lists and tuples are unpacked element by element unless
        ``direct_insertion`` is true, in which case the container itself is
        stored as one element. ``left`` selects the left end of the deque.
        ``AttributeError`` is suppressed.
        """
        try:
            if not data:
                return None
            unpack = isinstance(data, (list, tuple)) and not direct_insertion
            if unpack and left:
                self._queue.extendleft(data)
            elif unpack:
                self._queue.extend(data)
            elif left:
                self._queue.appendleft(data)
            else:
                self._queue.append(data)
        except AttributeError:
            pass

    def pop(self, left: bool = True):
        """Remove and return one element, or ``None`` if none is available."""
        try:
            if left:
                return self._queue.popleft()
            return self._queue.pop()
        except (IndexError, AttributeError):
            return None

    def clear(self):
        """Drop every element."""
        self._queue.clear()

    def get(self):
        """Generator yielding a single value: the leftmost element.

        When the queue is empty it sleeps one second and yields ``None``.
        """
        try:
            yield self._queue.popleft()
        except IndexError:
            time.sleep(1)
            yield None
        except AttributeError:
            yield None
@@ -0,0 +1,35 @@
1
+ import os
2
+ import json
3
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
4
+
5
+
6
class LoghubDot:
    """Sends key/value records to an Aliyun log store, if configured.

    The client is created only when the ``DOTTING_ENDPOINT``,
    ``DOTTING_ACCESS_KEY`` and ``DOTTING_SECRET_KEY`` environment variables
    are all non-empty; otherwise ``client`` is ``None`` and ``build`` does
    nothing.
    """

    def __init__(self):
        endpoint = os.getenv("DOTTING_ENDPOINT", "")
        key_id = os.getenv("DOTTING_ACCESS_KEY", "")
        secret = os.getenv("DOTTING_SECRET_KEY", "")
        if endpoint and key_id and secret:
            self.client = LogClient(endpoint=endpoint, accessKeyId=key_id, accessKey=secret)
        else:
            self.client = None

    def build(self, topic, **kwargs):
        """Upload one log item built from ``kwargs`` under ``topic``.

        Non-string values are JSON-encoded; the fields are sent sorted by key.
        """
        if not self.client:
            return

        entry = LogItem()
        fields = {}
        for name, raw in kwargs.items():
            fields[name] = raw if isinstance(raw, str) else json.dumps(raw, ensure_ascii=False)
        entry.set_contents(sorted(fields.items()))

        request = PutLogsRequest(
            project="databee-download-log",
            logstore="cobweb_log",
            topic=topic,
            logitems=[entry],
            compress=True
        )
        self.client.put_logs(request=request)
@@ -0,0 +1,94 @@
1
+ import json
2
+ import random
3
+ import requests
4
+
5
+
6
class Request:
    """An HTTP request description that can be executed with ``requests``.

    Keyword arguments named in ``__REQUEST_ATTRS__`` are collected in
    ``request_setting`` and forwarded to ``requests.request``; all other
    keyword arguments become plain attributes of the instance.
    """

    __REQUEST_ATTRS__ = {
        "params",
        "headers",
        "cookies",
        "data",
        "json",
        "files",
        "auth",
        "timeout",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "allow_redirects",
    }

    def __init__(
            self,
            url,
            seed,
            random_ua=True,
            check_status_code=True,
            **kwargs
    ):
        """Build the request.

        :param url: target URL.
        :param seed: originating seed; an object exposing ``to_string`` is
            stored in that serialised form, anything else is stored as given.
        :param random_ua: add a random user-agent header when none is given.
        :param check_status_code: make ``download`` raise on HTTP errors.
        :param kwargs: ``requests`` options and/or extra attributes.
        """
        self.url = url
        self.check_status_code = check_status_code
        self.request_setting = {}

        for k, v in kwargs.items():
            if k in self.__class__.__REQUEST_ATTRS__:
                self.request_setting[k] = v
            else:
                self.__setattr__(k, v)

        if not getattr(self, "method", None):
            has_body = self.request_setting.get("data") or self.request_setting.get("json")
            self.method = "POST" if has_body else "GET"

        if random_ua:
            self._build_header()

        # The original compared against ``Seed``, a name this module never
        # imports, and its fallback branch called ``.to_dict`` on a non-Seed
        # value; both paths raised. Duck typing avoids the missing name.
        # NOTE(review): storing a non-Seed value unchanged is an assumption —
        # confirm against callers.
        if hasattr(seed, "to_string"):
            self.seed = seed.to_string
        else:
            self.seed = seed

    @property
    def _random_ua(self) -> str:
        """Return a randomised macOS/Edge-style user-agent string."""
        v1 = random.randint(4, 15)
        v2 = random.randint(3, 11)
        v3 = random.randint(1, 16)
        v4 = random.randint(533, 605)
        v5 = random.randint(1000, 6000)
        v6 = random.randint(10, 80)
        user_agent = (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) AppleWebKit/{v4}.{v3} "
                      f"(KHTML, like Gecko) Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
        return user_agent

    def _build_header(self) -> None:
        """Ensure ``request_setting['headers']`` carries a user-agent."""
        if not self.request_setting.get("headers"):
            self.request_setting["headers"] = {"accept": "*/*", "user-agent": self._random_ua}
        elif "user-agent" not in [key.lower() for key in self.request_setting["headers"].keys()]:
            self.request_setting["headers"]["user-agent"] = self._random_ua

    def download(self) -> "requests.Response":
        """Send the request; raise for HTTP errors if ``check_status_code``."""
        response = requests.request(self.method, self.url, **self.request_setting)
        if self.check_status_code:
            response.raise_for_status()
        return response

    @property
    def to_dict(self):
        """Attributes without seed, check_status_code and request_setting."""
        _dict = self.__dict__.copy()
        # Tolerate missing keys so the property is usable at any point of the
        # object's lifetime.
        for key in ('seed', 'check_status_code', 'request_setting'):
            _dict.pop(key, None)
        return _dict

    @property
    def to_string(self) -> str:
        """Compact JSON encoding of ``to_dict``."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )
94
+
@@ -0,0 +1,118 @@
1
+ import json
2
+ import time
3
+ import hashlib
4
+
5
+
6
class SeedParams:
    """Scheduling metadata of a seed.

    Falsy ``retry``, ``priority`` and ``seed_version`` values are replaced by
    ``0``, ``300`` and the current Unix timestamp (in seconds) respectively;
    ``seed_status`` is stored as given.
    """

    def __init__(self, retry, priority, seed_version, seed_status=None):
        self.retry = retry if retry else 0
        self.priority = priority if priority else 300
        # The timestamp is only computed when no truthy version was supplied.
        self.seed_version = seed_version if seed_version else int(time.time())
        self.seed_status = seed_status
+
14
+
15
class Seed:
    """A crawl task described by arbitrary attributes plus ``params``.

    ``seed`` may be a dict, a JSON object encoded as ``str``/``bytes``, or any
    other string, which is stored as the ``url`` attribute. Extra keyword
    arguments become attributes as well. Unknown attributes read as ``None``.
    """

    # Scheduling keys: never copied onto the seed as attributes.
    __SEED_PARAMS__ = [
        "retry",
        "priority",
        "seed_version",
        "seed_status"
    ]

    def __init__(
            self,
            seed,
            sid=None,
            retry=None,
            priority=None,
            seed_version=None,
            seed_status=None,
            **kwargs
    ):
        """Build the seed.

        :param seed: dict, JSON object string/bytes, or a plain URL string.
        :param sid: explicit seed id; when falsy and the seed has no ``sid``
            yet, an MD5 of the serialised seed is used.
        :param retry, priority, seed_version, seed_status: scheduling values
            stored on ``self.params``.
        :param kwargs: additional seed attributes.
        :raises TypeError: if ``seed`` is not ``str``, ``bytes`` or ``dict``.
        """
        if isinstance(seed, (str, bytes)):
            try:
                item = json.loads(seed)
            except json.JSONDecodeError:
                item = None
            if isinstance(item, dict):
                self._init_seed(item)
            else:
                # Not a JSON object (plain text, or JSON scalar/array):
                # treat the raw value as the URL.
                self.__setattr__("url", seed)
        elif isinstance(seed, dict):
            self._init_seed(seed)
        else:
            raise TypeError(
                f"seed type error, "
                f"must be str or dict! "
                f"seed: {seed}"
            )

        if kwargs:
            self._init_seed(kwargs)
        if sid or not getattr(self, "sid", None):
            self._init_id(sid)
        # The scheduling names are explicit parameters, so they can never
        # arrive through ``kwargs``; pass them straight through.
        self.params = SeedParams(
            retry=retry,
            priority=priority,
            seed_version=seed_version,
            seed_status=seed_status,
        )

    def __getattr__(self, name):
        # Only called for attributes that are missing: they read as None.
        return None

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)

    def __str__(self):
        """Return all attributes, including params, as a JSON string."""
        data = self.__dict__.copy()
        params = data.get("params")
        if hasattr(params, "__dict__"):
            # The params object itself is not JSON-serialisable.
            data["params"] = vars(params)
        return json.dumps(data, ensure_ascii=False)

    def __repr__(self):
        chars = [f"{k}={v}" for k, v in self.__dict__.items()]
        return f'{self.__class__.__name__}({", ".join(chars)})'

    def _init_seed(self, seed_info: dict):
        """Copy every non-scheduling key of ``seed_info`` onto the seed."""
        for k, v in seed_info.items():
            if k not in self.__SEED_PARAMS__:
                self.__setattr__(k, v)

    def _init_id(self, sid):
        """Set ``sid``; derive it from the serialised seed when falsy."""
        if not sid:
            sid = hashlib.md5(self.to_string.encode()).hexdigest()
        self.__setattr__("sid", sid)

    @property
    def to_dict(self) -> dict:
        """Seed attributes as a dict, without a truthy ``params`` entry."""
        seed = self.__dict__.copy()
        if seed.get("params"):
            del seed["params"]
        return seed

    @property
    def to_string(self) -> str:
        """Compact JSON encoding of ``to_dict``."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )

    # @property
    # def get_all(self):
    #     return json.dumps(
    #         self.__dict__,
    #         ensure_ascii=False,
    #         separators=(",", ":")
    #     )

    @property
    def seed(self):
        """The serialised seed (same value as ``to_string``)."""
        return self.to_string
+