cobweb-launcher 1.2.49__tar.gz → 1.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cobweb-launcher-1.2.49/cobweb_launcher.egg-info → cobweb-launcher-1.3.1}/PKG-INFO +1 -1
- cobweb-launcher-1.3.1/cobweb/base/__init__.py +146 -0
- cobweb-launcher-1.3.1/cobweb/base/basic.py +243 -0
- cobweb-launcher-1.3.1/cobweb/base/common_queue.py +43 -0
- cobweb-launcher-1.3.1/cobweb/base/dotting.py +35 -0
- cobweb-launcher-1.3.1/cobweb/base/request.py +94 -0
- cobweb-launcher-1.3.1/cobweb/base/seed.py +118 -0
- cobweb-launcher-1.3.1/cobweb/constant.py +110 -0
- cobweb-launcher-1.3.1/cobweb/crawlers/crawler.py +88 -0
- cobweb-launcher-1.3.1/cobweb/db/redis_db.py +158 -0
- cobweb-launcher-1.3.1/cobweb/launchers/__init__.py +3 -0
- cobweb-launcher-1.3.1/cobweb/launchers/launcher.py +204 -0
- cobweb-launcher-1.3.1/cobweb/launchers/launcher_api.py +161 -0
- cobweb-launcher-1.3.1/cobweb/launchers/launcher_pro.py +90 -0
- cobweb-launcher-1.3.1/cobweb/pipelines/pipeline.py +45 -0
- cobweb-launcher-1.3.1/cobweb/setting.py +97 -0
- cobweb-launcher-1.3.1/cobweb/utils/__init__.py +5 -0
- cobweb-launcher-1.3.1/cobweb_/__init__.py +2 -0
- cobweb-launcher-1.3.1/cobweb_/base/item.py +46 -0
- cobweb-launcher-1.3.1/cobweb_/base/log.py +94 -0
- cobweb-launcher-1.3.1/cobweb_/base/response.py +23 -0
- cobweb-launcher-1.3.1/cobweb_/crawlers/__init__.py +1 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/crawlers/crawler.py +8 -42
- cobweb-launcher-1.3.1/cobweb_/db/__init__.py +2 -0
- cobweb-launcher-1.3.1/cobweb_/db/api_db.py +82 -0
- cobweb-launcher-1.3.1/cobweb_/exceptions/__init__.py +1 -0
- cobweb-launcher-1.3.1/cobweb_/exceptions/oss_db_exception.py +28 -0
- cobweb-launcher-1.3.1/cobweb_/launchers/launcher_air.py +88 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/launchers/launcher_api.py +19 -7
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/launchers/launcher_pro.py +23 -9
- cobweb-launcher-1.3.1/cobweb_/pipelines/__init__.py +3 -0
- cobweb-launcher-1.3.1/cobweb_/pipelines/pipeline_console.py +22 -0
- cobweb-launcher-1.3.1/cobweb_/pipelines/pipeline_loghub.py +34 -0
- cobweb-launcher-1.3.1/cobweb_/utils/bloom.py +58 -0
- cobweb-launcher-1.3.1/cobweb_/utils/dotting.py +32 -0
- cobweb-launcher-1.3.1/cobweb_/utils/oss.py +94 -0
- cobweb-launcher-1.3.1/cobweb_/utils/tools.py +42 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1/cobweb_launcher.egg-info}/PKG-INFO +1 -1
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb_launcher.egg-info/SOURCES.txt +34 -4
- cobweb-launcher-1.3.1/cobweb_launcher.egg-info/top_level.txt +2 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/setup.py +1 -1
- cobweb-launcher-1.2.49/cobweb/crawlers/base_crawler.py +0 -144
- cobweb-launcher-1.2.49/cobweb/crawlers/file_crawler.py +0 -98
- cobweb-launcher-1.2.49/cobweb/utils/dotting.py +0 -64
- cobweb-launcher-1.2.49/cobweb_launcher.egg-info/top_level.txt +0 -1
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/LICENSE +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/README.md +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/__init__.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/base/item.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/base/log.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/base/response.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/crawlers/__init__.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/db/api_db.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/launchers/launcher_air.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/pipelines/__init__.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/pipelines/pipeline_console.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/pipelines/pipeline_loghub.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/utils/bloom.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb/utils/tools.py +0 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/base/__init__.py +0 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/base/common_queue.py +0 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/base/decorators.py +0 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/base/request.py +0 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/base/seed.py +0 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/constant.py +0 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/db/redis_db.py +0 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/launchers/__init__.py +0 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/launchers/launcher.py +0 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/pipelines/pipeline.py +0 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/setting.py +0 -0
- {cobweb-launcher-1.2.49/cobweb → cobweb-launcher-1.3.1/cobweb_}/utils/__init__.py +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/setup.cfg +0 -0
- {cobweb-launcher-1.2.49 → cobweb-launcher-1.3.1}/test/test.py +0 -0
@@ -0,0 +1,146 @@
|
|
1
|
+
import time
|
2
|
+
import traceback
|
3
|
+
import threading
|
4
|
+
|
5
|
+
from functools import wraps
|
6
|
+
from inspect import isgenerator
|
7
|
+
from typing import Callable, Union
|
8
|
+
|
9
|
+
from .common_queue import Queue
|
10
|
+
from .response import Response
|
11
|
+
from .basic import Seed, Request, Response
|
12
|
+
from .item import BaseItem, ConsoleItem
|
13
|
+
# from .seed import Seed
|
14
|
+
from .log import logger
|
15
|
+
# from .dotting import LoghubDot
|
16
|
+
|
17
|
+
|
18
|
+
class TaskQueue:
    """Registry of the in-memory queues shared by the crawler stages.

    All queues are class attributes, so every user of ``TaskQueue`` works on
    the same ``Queue`` instances.
    """

    TODO = Queue()  # task seed queue
    DOWNLOAD = Queue()  # download task queue

    SEED = Queue()  # queue of newly added task seeds
    REQUEST = Queue()  # request queue
    RESPONSE = Queue()  # response queue
    DONE = Queue()  # download-finished queue
    UPLOAD = Queue()  # task upload queue
    DELETE = Queue()  # task deletion queue

    # DOT = LoghubDot()

    @staticmethod
    def is_empty():
        """Return True when none of the registered queues holds an item."""
        queues = (
            TaskQueue.SEED,
            TaskQueue.TODO,
            TaskQueue.REQUEST,
            TaskQueue.DOWNLOAD,
            TaskQueue.RESPONSE,
            TaskQueue.UPLOAD,
            TaskQueue.DONE,
            TaskQueue.DELETE,
        )
        return not any(q.length for q in queues)

    @staticmethod
    def process_task(it: Union[Seed, Request, Response, BaseItem], crawler_func: Callable):
        """Run ``crawler_func(it)`` and route everything it yields to a queue.

        ``crawler_func`` must be a generator function. Each yielded object is
        pushed to REQUEST, RESPONSE, UPLOAD or SEED depending on its type;
        any other type raises ``TypeError`` inside the handler below.

        On any exception the failure is logged, ``it.params.retry`` is
        incremented, ``it`` is pushed back to the queue matching its own type
        (Request, Response or Seed) and the caller is delayed for one second.
        """
        try:
            produced = crawler_func(it)
            if not isgenerator(produced):
                raise TypeError(f"{crawler_func.__name__} function isn't a generator")
            for tk in produced:
                if isinstance(tk, Request):
                    TaskQueue.REQUEST.push(tk)
                elif isinstance(tk, Response):
                    TaskQueue.RESPONSE.push(tk)
                elif isinstance(tk, BaseItem):
                    TaskQueue.UPLOAD.push(tk)
                elif isinstance(tk, Seed):
                    TaskQueue.SEED.push(tk)
                else:
                    raise TypeError(f"{crawler_func.__name__} function return type isn't supported")
                # TaskQueue.DOT.build(
                #     topic=f"{self.project}:{self.task}",
                #     cost_time=end_time - start_time,
                #     **download_item.to_dict
                # )
                # TODO: data dotting (metrics reporting)
        except Exception as e:
            # Record why the task failed instead of dropping the error silently.
            func_name = getattr(crawler_func, "__name__", repr(crawler_func))
            logger.info(
                f"{func_name} exception: \n" +
                ''.join(traceback.format_exception(type(e), e, e.__traceback__))
            )
            it.params.retry += 1
            if isinstance(it, Request):
                TaskQueue.REQUEST.push(it)
            elif isinstance(it, Response):
                TaskQueue.RESPONSE.push(it)
            elif isinstance(it, Seed):
                TaskQueue.SEED.push(it)
            # Brief pause before the caller picks up the next task.
            time.sleep(1)
|
75
|
+
|
76
|
+
|
77
|
+
class Decorators:
    """Namespace of decorators used by the launcher components."""

    @staticmethod
    def add_thread(num=1):
        """Register ``num`` threads running the decorated method.

        Calling the decorated method does not run it; it appends ``num``
        unstarted ``threading.Thread`` objects targeting the original method
        to ``self._threads``. Threads are named after the method, with an
        ``_<index>`` suffix when ``num > 1``.
        """
        def decorator(func):
            @wraps(func)
            def wrapper(self, *args):
                for i in range(num):
                    name = func.__name__ + "_" + str(i) if num > 1 else func.__name__
                    self._threads.append(threading.Thread(name=name, target=func, args=(self,) + args))

            return wrapper

        return decorator

    @staticmethod
    def pause(func):
        """Call the decorated method repeatedly until ``self.pause`` is set.

        Exceptions raised by the method are logged (message only) and the
        loop continues; every iteration is followed by a 0.1 s sleep.
        """
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            while not self.pause.is_set():
                try:
                    func(self, *args, **kwargs)
                except Exception as e:
                    logger.info(f"{func.__name__}: " + str(e))
                finally:
                    time.sleep(0.1)
            logger.info(f"{func.__name__}: close!")

        return wrapper

    @staticmethod
    def stop(func):
        """Call the decorated method repeatedly until ``self.stop`` is set.

        Exceptions raised by the method are logged with their full traceback
        and the loop continues; every iteration is followed by a 0.1 s sleep.
        """
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            while not self.stop.is_set():
                try:
                    func(self, *args, **kwargs)
                except Exception as e:
                    logger.info(
                        f"{func.__name__} exception: \n" +
                        ''.join(traceback.format_exception(type(e), e, e.__traceback__))
                    )
                finally:
                    time.sleep(0.1)

        return wrapper

    @staticmethod
    def decorator_oss_db(exception, retries=3):
        """Retry the decorated callable up to ``retries`` times.

        The first successful call's return value is returned as-is. When
        every attempt raises, ``exception`` is raised, wrapping the last
        error. With ``retries <= 0`` the callable is never invoked and None
        is returned.

        Args:
            exception: Exception class raised after the final failed attempt.
            retries: Maximum number of attempts.
        """
        def decorator(func):
            @wraps(func)
            def wrapper(callback_func, *args, **kwargs):
                last_error = None
                for _ in range(retries):
                    try:
                        return func(callback_func, *args, **kwargs)
                    except Exception as e:
                        last_error = e

                # Give up only once all configured attempts are exhausted
                # (previously hard-coded to the third attempt).
                if last_error is not None:
                    raise exception(last_error) from last_error
                return None

            return wrapper

        return decorator
|
@@ -0,0 +1,243 @@
|
|
1
|
+
import json
|
2
|
+
import random
|
3
|
+
import time
|
4
|
+
import hashlib
|
5
|
+
import requests
|
6
|
+
|
7
|
+
|
8
|
+
class Params:
    """Scheduling metadata attached to a task object.

    Falsy ``retry``, ``priority`` and ``version`` arguments are replaced by
    their defaults: 0, 300 and the current Unix time in whole seconds.
    ``status`` is stored unchanged.
    """

    def __init__(self, retry=None, priority=None, version=None, status=None):
        self.retry = retry if retry else 0
        self.priority = priority if priority else 300
        # time.time() is only consulted when no usable version was supplied.
        self.version = version if version else int(time.time())
        self.status = status
|
15
|
+
|
16
|
+
|
17
|
+
class Seed:
    """Attribute bag describing one crawl task.

    A seed is built from a JSON string, a plain string (stored as ``url``
    when it is not valid JSON) or a dict; extra keyword arguments become
    attributes too. Reading an attribute that was never set yields None.
    """

    # Keys describing scheduling state rather than seed content; they are
    # skipped when a mapping is copied onto the instance.
    __SEED_PARAMS__ = [
        "retry",
        "priority",
        "seed_version",
        "seed_status"
    ]

    def __init__(
            self,
            seed,
            params=None,
            **kwargs
    ):
        """Populate the seed.

        Args:
            seed: JSON text, plain text (kept as ``url``), or a dict.
            params: Scheduling metadata; a fresh ``Params()`` is created per
                instance when omitted, so seeds never share retry counters.
            **kwargs: Additional attributes to set on the seed.

        Raises:
            TypeError: If ``seed`` is not a str, bytes or dict.
        """
        if isinstance(seed, (str, bytes)):
            try:
                item = json.loads(seed)
                self._init_seed(item)
            except json.JSONDecodeError:
                self.__setattr__("url", seed)
        elif isinstance(seed, dict):
            self._init_seed(seed)
        else:
            raise TypeError(
                f"seed type error, "
                f"must be str or dict! "
                f"seed: {seed}"
            )

        if kwargs:
            self._init_seed(kwargs)
        # Derive an id from the content unless the input already carried one.
        if not getattr(self, "sid", None):
            self._init_id()
        self.params = params or Params()

    def __getattr__(self, name):
        # Only invoked when normal lookup fails: unknown attributes read as None.
        return None

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)

    def __str__(self):
        # ``params`` is a plain object, so serialize it through its __dict__.
        return json.dumps(self.__dict__, ensure_ascii=False, default=vars)

    def __repr__(self):
        chars = [f"{k}={v}" for k, v in self.__dict__.items()]
        return f'{self.__class__.__name__}({", ".join(chars)})'

    def _init_seed(self, seed_info: dict):
        """Copy every non-scheduling key of ``seed_info`` onto the instance."""
        for k, v in seed_info.items():
            if k not in self.__SEED_PARAMS__:
                self.__setattr__(k, v)

    def _init_id(self):
        """Set ``sid`` to the MD5 hex digest of the compact JSON form."""
        sid = hashlib.md5(self.to_string.encode()).hexdigest()
        self.__setattr__("sid", sid)

    @property
    def to_dict(self) -> dict:
        """Instance attributes as a dict, without a truthy ``params`` entry."""
        seed = self.__dict__.copy()
        if seed.get("params"):
            del seed["params"]
        return seed

    @property
    def to_string(self) -> str:
        """Compact JSON encoding of ``to_dict``."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )

    @property
    def seed(self):
        """The seed object itself."""
        return self
|
89
|
+
|
90
|
+
|
91
|
+
class Request:
    """HTTP request description built for a seed and executed with ``requests``."""

    __SEED_PARAMS__ = [
        "retry",
        "priority",
        "seed_version",
        "seed_status"
    ]

    # Keyword arguments forwarded verbatim to ``requests.request``.
    __REQUEST_ATTRS__ = {
        "params",
        "headers",
        "cookies",
        "data",
        "json",
        "files",
        "auth",
        "timeout",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "allow_redirects",
    }

    def __init__(
            self,
            url,
            seed,
            random_ua=True,
            check_status_code=True,
            retry=None,
            priority=None,
            seed_version=None,
            seed_status=None,
            **kwargs
    ):
        """Build the request.

        Args:
            url: Target URL.
            seed: The originating ``Seed`` or its JSON string.
            random_ua: When true, add a random user-agent header if none is set.
            check_status_code: When true, ``download`` raises on HTTP errors.
            retry, priority, seed_version, seed_status: Scheduling metadata
                stored in ``self.params`` (the last two as ``version`` and
                ``status``).
            **kwargs: Keys in ``__REQUEST_ATTRS__`` go to ``request_setting``;
                all other keys become instance attributes (e.g. ``method``).
        """
        self.url = url
        self.check_status_code = check_status_code
        self.request_setting = {}

        for k, v in kwargs.items():
            if k in self.__class__.__REQUEST_ATTRS__:
                self.request_setting[k] = v
            else:
                # The scheduling keys are named parameters and can never show
                # up in kwargs, so everything else is a plain attribute.
                self.__setattr__(k, v)

        if not getattr(self, "method", None):
            self.method = "POST" if self.request_setting.get("data") or self.request_setting.get("json") else "GET"

        if random_ua:
            self._build_header()

        # Params names these fields ``version`` / ``status``.
        self.params = Params(
            retry=retry,
            priority=priority,
            version=seed_version,
            status=seed_status,
        )

        # NOTE(review): the seed's fields are merged into the local kwargs
        # only; they do not currently influence the stored ``self.seed`` value.
        # Presumably they were meant to be part of it -- confirm intent.
        if isinstance(seed, Seed):
            kwargs.update(**seed.to_dict)
        elif isinstance(seed, str):
            kwargs.update(**json.loads(seed))
        self.seed = self.to_string

    @property
    def _random_ua(self) -> str:
        """Return a randomized macOS Edge/Chrome-style user-agent string."""
        v1 = random.randint(4, 15)
        v2 = random.randint(3, 11)
        v3 = random.randint(1, 16)
        v4 = random.randint(533, 605)
        v5 = random.randint(1000, 6000)
        v6 = random.randint(10, 80)
        user_agent = (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) AppleWebKit/{v4}.{v3} "
                      f"(KHTML, like Gecko) Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
        return user_agent

    def _build_header(self) -> None:
        """Ensure ``request_setting['headers']`` carries a user-agent.

        Headers supplied by the caller are kept; a user-agent is only added
        when no header key matches ``user-agent`` case-insensitively.
        """
        if not self.request_setting.get("headers"):
            self.request_setting["headers"] = {"accept": "*/*", "user-agent": self._random_ua}
        elif "user-agent" not in [key.lower() for key in self.request_setting["headers"].keys()]:
            self.request_setting["headers"]["user-agent"] = self._random_ua

    def download(self) -> "requests.Response":
        """Send the request; raise for HTTP error statuses when enabled."""
        response = requests.request(self.method, self.url, **self.request_setting)
        if self.check_status_code:
            response.raise_for_status()
        return response

    @property
    def to_dict(self):
        """Instance attributes without ``seed``, ``params`` and ``check_status_code``.

        Missing keys are tolerated because this is also evaluated while the
        constructor is still computing ``self.seed``.
        """
        _dict = self.__dict__.copy()
        _dict.pop('seed', None)
        _dict.pop('params', None)
        _dict.pop('check_status_code', None)
        # _dict.pop('request_setting')
        return _dict

    @property
    def to_string(self) -> str:
        """Compact JSON encoding of ``to_dict``."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )
|
202
|
+
|
203
|
+
|
204
|
+
class Response:
    """Result of processing a seed, paired with its scheduling metadata."""

    def __init__(
            self,
            seed,
            response,
            retry=None,
            priority=None,
            seed_version=None,
            seed_status=None,
            **kwargs
    ):
        """Store the seed, the response payload and scheduling metadata.

        Args:
            seed: The seed this response belongs to.
            response: The response payload.
            retry, priority, seed_version, seed_status: Scheduling metadata
                stored in ``self.params`` (the last two as ``version`` and
                ``status``).
            **kwargs: Additional attributes to set on the instance.
        """
        self.seed = seed
        self.response = response
        # The metadata was previously collected but never stored, leaving
        # instances without the ``params`` attribute the retry path relies on.
        self.params = Params(
            retry=retry,
            priority=priority,
            version=seed_version,
            status=seed_status,
        )
        for k, v in kwargs.items():
            self.__setattr__(k, v)

    @property
    def to_dict(self):
        """Instance attributes without ``seed``, ``response`` and ``params``."""
        _dict = self.__dict__.copy()
        _dict.pop('seed')
        _dict.pop('response')
        # params is a plain object and would break JSON serialization.
        _dict.pop('params', None)
        return _dict

    @property
    def to_string(self) -> str:
        """Compact JSON encoding of ``to_dict``."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )
|
@@ -0,0 +1,43 @@
|
|
1
|
+
import time
|
2
|
+
from collections import deque
|
3
|
+
|
4
|
+
|
5
|
+
class Queue:
    """Thin wrapper around ``collections.deque`` with forgiving accessors."""

    def __init__(self):
        self._queue = deque()

    @property
    def length(self) -> int:
        """Number of items currently stored."""
        return len(self._queue)

    def push(self, data, left: bool = False, direct_insertion: bool = False):
        """Add ``data`` to the queue; falsy ``data`` is ignored.

        Lists and tuples are spread into individual items unless
        ``direct_insertion`` is true. ``left`` selects the left end (a spread
        to the left ends up in reversed order, as ``deque.extendleft`` does).
        ``AttributeError`` is suppressed.
        """
        try:
            if not data:
                return None
            spread = not direct_insertion and isinstance(data, (list, tuple))
            if spread:
                add = self._queue.extendleft if left else self._queue.extend
            else:
                add = self._queue.appendleft if left else self._queue.append
            add(data)
        except AttributeError:
            pass

    def pop(self, left: bool = True):
        """Remove and return one item (left end by default), or None if unavailable."""
        try:
            if left:
                return self._queue.popleft()
            return self._queue.pop()
        except (IndexError, AttributeError):
            return None

    def clear(self):
        """Remove every item."""
        self._queue.clear()

    def get(self):
        """Generator yielding the leftmost item once.

        When the queue is empty it sleeps for one second and yields None.
        """
        try:
            yield self._queue.popleft()
        except IndexError:
            time.sleep(1)
            yield None
        except AttributeError:
            yield None
|
@@ -0,0 +1,35 @@
|
|
1
|
+
import os
|
2
|
+
import json
|
3
|
+
from aliyun.log import LogClient, LogItem, PutLogsRequest
|
4
|
+
|
5
|
+
|
6
|
+
class LoghubDot:
    """Sends key/value records to an Aliyun log store when credentials are configured."""

    def __init__(self):
        # Credentials are read from the environment; all three must be
        # non-empty for a client to be created.
        endpoint = os.getenv("DOTTING_ENDPOINT", "")
        accessKeyId = os.getenv("DOTTING_ACCESS_KEY", "")
        accessKey = os.getenv("DOTTING_SECRET_KEY", "")
        if endpoint and accessKeyId and accessKey:
            self.client = LogClient(endpoint=endpoint, accessKeyId=accessKeyId, accessKey=accessKey)
        else:
            self.client = None

    def build(self, topic, **kwargs):
        """Send ``kwargs`` as one log item under ``topic``.

        Does nothing when no client was configured. Non-string values are
        JSON-encoded; fields are sent sorted by key.
        """
        if not self.client:
            return

        log_item = LogItem()
        fields = {}
        for key, value in kwargs.items():
            if isinstance(value, str):
                fields[key] = value
            else:
                fields[key] = json.dumps(value, ensure_ascii=False)
        log_item.set_contents(sorted(fields.items()))

        request = PutLogsRequest(
            project="databee-download-log",
            logstore="cobweb_log",
            topic=topic,
            logitems=[log_item],
            compress=True
        )
        self.client.put_logs(request=request)
|
@@ -0,0 +1,94 @@
|
|
1
|
+
import json
|
2
|
+
import random
|
3
|
+
import requests
|
4
|
+
|
5
|
+
|
6
|
+
class Request:
    """HTTP request description built for a seed and executed with ``requests``."""

    # Keyword arguments forwarded verbatim to ``requests.request``.
    __REQUEST_ATTRS__ = {
        "params",
        "headers",
        "cookies",
        "data",
        "json",
        "files",
        "auth",
        "timeout",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "allow_redirects",
    }

    def __init__(
            self,
            url,
            seed,
            random_ua=True,
            check_status_code=True,
            **kwargs
    ):
        """Build the request.

        Args:
            url: Target URL.
            seed: A ``Seed`` (its JSON form is stored), or another object
                exposing ``to_dict``.
            random_ua: When true, add a random user-agent header if none is set.
            check_status_code: When true, ``download`` raises on HTTP errors.
            **kwargs: Keys in ``__REQUEST_ATTRS__`` go to ``request_setting``;
                all other keys become instance attributes (e.g. ``method``).
        """
        self.url = url
        self.check_status_code = check_status_code
        self.request_setting = {}

        for k, v in kwargs.items():
            if k in self.__class__.__REQUEST_ATTRS__:
                self.request_setting[k] = v
                continue
            self.__setattr__(k, v)

        if not getattr(self, "method", None):
            self.method = "POST" if self.request_setting.get("data") or self.request_setting.get("json") else "GET"

        if random_ua:
            self._build_header()

        if isinstance(seed, Seed):
            self.seed = seed.to_string
        else:
            # NOTE(review): the merged kwargs are not used afterwards, so the
            # seed's fields do not reach ``self.seed`` -- confirm intent.
            kwargs.update(**seed.to_dict)
            self.seed = self.to_string

    @property
    def _random_ua(self) -> str:
        """Return a randomized macOS Edge/Chrome-style user-agent string."""
        v1 = random.randint(4, 15)
        v2 = random.randint(3, 11)
        v3 = random.randint(1, 16)
        v4 = random.randint(533, 605)
        v5 = random.randint(1000, 6000)
        v6 = random.randint(10, 80)
        user_agent = (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) AppleWebKit/{v4}.{v3} "
                      f"(KHTML, like Gecko) Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
        return user_agent

    def _build_header(self) -> None:
        """Ensure ``request_setting['headers']`` carries a user-agent.

        Headers supplied by the caller are kept; a user-agent is only added
        when no header key matches ``user-agent`` case-insensitively.
        """
        if not self.request_setting.get("headers"):
            self.request_setting["headers"] = {"accept": "*/*", "user-agent": self._random_ua}
        elif "user-agent" not in [key.lower() for key in self.request_setting["headers"].keys()]:
            self.request_setting["headers"]["user-agent"] = self._random_ua

    def download(self) -> "requests.Response":
        """Send the request; raise for HTTP error statuses when enabled."""
        response = requests.request(self.method, self.url, **self.request_setting)
        if self.check_status_code:
            response.raise_for_status()
        return response

    @property
    def to_dict(self):
        """Instance attributes without ``seed``, ``check_status_code`` and ``request_setting``.

        A missing ``seed`` is tolerated because this is also evaluated while
        the constructor is still computing ``self.seed``.
        """
        _dict = self.__dict__.copy()
        _dict.pop('seed', None)
        _dict.pop('check_status_code')
        _dict.pop('request_setting')
        return _dict

    @property
    def to_string(self) -> str:
        """Compact JSON encoding of ``to_dict``."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )
|
94
|
+
|
@@ -0,0 +1,118 @@
|
|
1
|
+
import json
|
2
|
+
import time
|
3
|
+
import hashlib
|
4
|
+
|
5
|
+
|
6
|
+
class SeedParams:
    """Scheduling metadata attached to a seed.

    Falsy ``retry``, ``priority`` and ``seed_version`` arguments are replaced
    by their defaults: 0, 300 and the current Unix time in whole seconds.
    ``seed_status`` is stored unchanged.
    """

    def __init__(self, retry, priority, seed_version, seed_status=None):
        self.retry = retry if retry else 0
        self.priority = priority if priority else 300
        # time.time() is only consulted when no usable version was supplied.
        self.seed_version = seed_version if seed_version else int(time.time())
        self.seed_status = seed_status
|
13
|
+
|
14
|
+
|
15
|
+
class Seed:
    """Attribute bag describing one crawl task.

    A seed is built from a JSON string, a plain string (stored as ``url``
    when it is not valid JSON) or a dict; extra keyword arguments become
    attributes too. Reading an attribute that was never set yields None.
    """

    # Keys describing scheduling state rather than seed content; they are
    # skipped when a mapping is copied onto the instance.
    __SEED_PARAMS__ = [
        "retry",
        "priority",
        "seed_version",
        "seed_status"
    ]

    def __init__(
            self,
            seed,
            sid=None,
            retry=None,
            priority=None,
            seed_version=None,
            seed_status=None,
            **kwargs
    ):
        """Populate the seed.

        Args:
            seed: JSON text, plain text (kept as ``url``), or a dict.
            sid: Explicit seed id; when omitted and the input carries none,
                an MD5 of the seed content is used.
            retry, priority, seed_version, seed_status: Scheduling metadata
                stored in ``self.params``.
            **kwargs: Additional attributes to set on the seed.

        Raises:
            TypeError: If ``seed`` is not a str, bytes or dict.
        """
        if isinstance(seed, (str, bytes)):
            try:
                item = json.loads(seed)
                self._init_seed(item)
            except json.JSONDecodeError:
                self.__setattr__("url", seed)
        elif isinstance(seed, dict):
            self._init_seed(seed)
        else:
            raise TypeError(
                f"seed type error, "
                f"must be str or dict! "
                f"seed: {seed}"
            )

        # The scheduling keys are named parameters, so kwargs can only hold
        # plain seed attributes.
        if kwargs:
            self._init_seed(kwargs)
        if sid or not getattr(self, "sid", None):
            self._init_id(sid)
        self.params = SeedParams(
            retry=retry,
            priority=priority,
            seed_version=seed_version,
            seed_status=seed_status,
        )

    def __getattr__(self, name):
        # Only invoked when normal lookup fails: unknown attributes read as None.
        return None

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)

    def __str__(self):
        # ``params`` is a plain object, so serialize it through its __dict__.
        return json.dumps(self.__dict__, ensure_ascii=False, default=vars)

    def __repr__(self):
        chars = [f"{k}={v}" for k, v in self.__dict__.items()]
        return f'{self.__class__.__name__}({", ".join(chars)})'

    def _init_seed(self, seed_info: dict):
        """Copy every non-scheduling key of ``seed_info`` onto the instance."""
        for k, v in seed_info.items():
            if k not in self.__SEED_PARAMS__:
                self.__setattr__(k, v)

    def _init_id(self, sid):
        """Set ``sid``; a falsy value is replaced by the MD5 of the compact JSON form."""
        if not sid:
            sid = hashlib.md5(self.to_string.encode()).hexdigest()
        self.__setattr__("sid", sid)

    @property
    def to_dict(self) -> dict:
        """Instance attributes as a dict, without a truthy ``params`` entry."""
        seed = self.__dict__.copy()
        if seed.get("params"):
            del seed["params"]
        return seed

    @property
    def to_string(self) -> str:
        """Compact JSON encoding of ``to_dict``."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )

    # @property
    # def get_all(self):
    #     return json.dumps(
    #         self.__dict__,
    #         ensure_ascii=False,
    #         separators=(",", ":")
    #     )

    @property
    def seed(self):
        """Compact JSON form of the seed (same as ``to_string``)."""
        return self.to_string
|
118
|
+
|