cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. cobweb/__init__.py +2 -11
  2. cobweb/base/__init__.py +9 -0
  3. cobweb/base/basic.py +297 -0
  4. cobweb/base/common_queue.py +30 -0
  5. cobweb/base/decorators.py +40 -0
  6. cobweb/base/dotting.py +35 -0
  7. cobweb/base/item.py +46 -0
  8. cobweb/{log.py → base/log.py} +4 -6
  9. cobweb/base/request.py +82 -0
  10. cobweb/base/response.py +23 -0
  11. cobweb/base/seed.py +114 -0
  12. cobweb/constant.py +94 -0
  13. cobweb/crawlers/__init__.py +1 -0
  14. cobweb/crawlers/base_crawler.py +144 -0
  15. cobweb/crawlers/crawler.py +209 -0
  16. cobweb/crawlers/file_crawler.py +98 -0
  17. cobweb/db/__init__.py +2 -2
  18. cobweb/db/api_db.py +82 -0
  19. cobweb/db/redis_db.py +125 -218
  20. cobweb/exceptions/__init__.py +1 -0
  21. cobweb/exceptions/oss_db_exception.py +28 -0
  22. cobweb/launchers/__init__.py +3 -0
  23. cobweb/launchers/launcher.py +235 -0
  24. cobweb/launchers/launcher_air.py +88 -0
  25. cobweb/launchers/launcher_api.py +209 -0
  26. cobweb/launchers/launcher_pro.py +208 -0
  27. cobweb/pipelines/__init__.py +3 -0
  28. cobweb/pipelines/pipeline.py +69 -0
  29. cobweb/pipelines/pipeline_console.py +22 -0
  30. cobweb/pipelines/pipeline_loghub.py +34 -0
  31. cobweb/schedulers/__init__.py +3 -0
  32. cobweb/schedulers/scheduler_api.py +72 -0
  33. cobweb/schedulers/scheduler_redis.py +72 -0
  34. cobweb/setting.py +67 -6
  35. cobweb/utils/__init__.py +5 -0
  36. cobweb/utils/bloom.py +58 -0
  37. cobweb/utils/dotting.py +32 -0
  38. cobweb/utils/oss.py +94 -0
  39. cobweb/utils/tools.py +42 -0
  40. cobweb_launcher-1.2.41.dist-info/METADATA +205 -0
  41. cobweb_launcher-1.2.41.dist-info/RECORD +44 -0
  42. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/WHEEL +1 -1
  43. cobweb/bbb.py +0 -191
  44. cobweb/db/oss_db.py +0 -127
  45. cobweb/db/scheduler/__init__.py +0 -0
  46. cobweb/db/scheduler/default.py +0 -8
  47. cobweb/db/scheduler/textfile.py +0 -27
  48. cobweb/db/storer/__init__.py +0 -0
  49. cobweb/db/storer/console.py +0 -9
  50. cobweb/db/storer/loghub.py +0 -54
  51. cobweb/db/storer/redis.py +0 -15
  52. cobweb/db/storer/textfile.py +0 -15
  53. cobweb/decorators.py +0 -16
  54. cobweb/distributed/__init__.py +0 -0
  55. cobweb/distributed/launcher.py +0 -243
  56. cobweb/distributed/models.py +0 -143
  57. cobweb/interface.py +0 -34
  58. cobweb/single/__init__.py +0 -0
  59. cobweb/single/launcher.py +0 -231
  60. cobweb/single/models.py +0 -134
  61. cobweb/single/nest.py +0 -153
  62. cobweb/task.py +0 -50
  63. cobweb/utils.py +0 -90
  64. cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
  65. cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
  66. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/LICENSE +0 -0
  67. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/top_level.txt +0 -0
cobweb/__init__.py CHANGED
@@ -1,11 +1,2 @@
1
- from .bbb import Seed, Queue, DBItem
2
- from .task import Task
3
- from .log import log
4
- from .interface import SchedulerInterface, StorerInterface
5
- from .db.redis_db import RedisDB
6
- from .db.oss_db import OssDB
7
- from .distributed.launcher import launcher
8
- from .single.launcher import launcher as single_launcher
9
- from . import setting
10
-
11
-
1
+ from .launchers import LauncherAir, LauncherPro, LauncherApi
2
+ from .constant import CrawlerModel
@@ -0,0 +1,9 @@
1
+ from .common_queue import Queue
2
+ from .response import Response
3
+ from .request import Request
4
+ from .item import BaseItem, ConsoleItem
5
+ from .seed import Seed
6
+
7
+ from .log import logger
8
+ from .decorators import decorator_oss_db
9
+
cobweb/base/basic.py ADDED
@@ -0,0 +1,297 @@
1
+ import json
2
+ import random
3
+ import time
4
+ import hashlib
5
+ import requests
6
+
7
+
8
class Params:
    """Scheduling metadata attached to a seed, request or response.

    Falsy ``retry``, ``priority`` and ``version`` values are replaced by
    ``0``, ``300`` and the current Unix time in whole seconds respectively.
    ``status`` is stored exactly as passed.
    """

    def __init__(self, retry=None, priority=None, version=None, status=None):
        self.retry = retry if retry else 0
        self.priority = priority if priority else 300
        # The clock is only consulted when no usable version was supplied.
        self.version = version if version else int(time.time())
        self.status = status
15
+
16
+
17
class Seed:
    """Crawl seed: free-form attributes plus scheduling metadata.

    Every non-scheduling key of the input becomes an instance attribute.
    Reading an attribute that was never set yields ``None`` instead of
    raising, and attributes can also be accessed as ``seed["key"]``.
    """

    # Keys describing scheduling state; ``_init_seed`` never stores them as
    # plain attributes.
    __SEED_PARAMS__ = [
        "retry",
        "priority",
        "version",
        "status"
    ]

    def __init__(
            self,
            seed,
            sid=None,
            retry=None,
            priority=None,
            version=None,
            status=None,
            **kwargs
    ):
        """Build a seed from a JSON document, a plain URL or a dict.

        Args:
            seed: ``str``/``bytes`` holding either a JSON object or a plain
                value stored as ``url``, or a ``dict`` of attributes.
            sid: Explicit seed id. When omitted and the seed carries no
                ``sid`` of its own, the MD5 hex digest of the serialized
                seed is used.
            retry, priority, version, status: Forwarded to ``Params``.
            **kwargs: Extra attributes to set on the seed.

        Raises:
            TypeError: If ``seed`` is not ``str``, ``bytes`` or ``dict``.
        """
        if isinstance(seed, (str, bytes)):
            self._load_raw(seed)
        elif isinstance(seed, dict):
            self._init_seed(seed)
        else:
            raise TypeError(
                f"seed type error, "
                f"must be str or dict! "
                f"seed: {seed}"
            )

        seed_params = {
            "retry": retry,
            "priority": priority,
            "version": version,
            "status": status,
        }

        # Scheduling keys are bound to the named parameters above, so
        # ``kwargs`` can only ever contain ordinary seed attributes.
        if kwargs:
            self._init_seed(kwargs)

        if sid or not getattr(self, "sid", None):
            self._init_id(sid)
        self.params = Params(**seed_params)

    def __getattr__(self, name):
        # Only reached when normal lookup fails. Dunder probes made by
        # copy/pickle (e.g. ``__setstate__``) must fail properly; answering
        # ``None`` would make those protocols try to call ``None``.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        return None

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)

    def __str__(self):
        # Serialize ``to_dict`` rather than ``__dict__``: the latter holds
        # the ``params`` object, which json cannot encode.
        return json.dumps(self.to_dict, ensure_ascii=False)

    def __repr__(self):
        chars = [f"{k}={v}" for k, v in self.__dict__.items()]
        return f'{self.__class__.__name__}({", ".join(chars)})'

    def _load_raw(self, seed):
        """Populate the seed from ``str``/``bytes`` input.

        A JSON object is unpacked into attributes. Anything else (text that
        is not JSON, or JSON that is not an object) is stored as ``url``,
        decoded to ``str`` so the seed stays JSON-serializable.
        """
        try:
            item = json.loads(seed)
        except json.JSONDecodeError:
            item = None
        if isinstance(item, dict):
            self._init_seed(item)
            return
        if isinstance(seed, bytes):
            seed = seed.decode("utf-8")
        self.url = seed

    def _init_seed(self, seed_info: dict):
        """Copy every non-scheduling key of ``seed_info`` onto the instance."""
        for k, v in seed_info.items():
            if k not in self.__SEED_PARAMS__:
                self.__setattr__(k, v)

    def _init_id(self, sid):
        """Set ``sid``, deriving it from the serialized seed when falsy."""
        if not sid:
            sid = hashlib.md5(self.to_string.encode()).hexdigest()
        self.__setattr__("sid", sid)

    @property
    def to_dict(self) -> dict:
        """Shallow copy of the seed attributes without a truthy ``params``."""
        seed = self.__dict__.copy()
        if seed.get("params"):
            del seed["params"]
        return seed

    @property
    def to_string(self) -> str:
        """Compact JSON form of ``to_dict`` (no spaces, non-ASCII kept)."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )

    @property
    def seed(self):
        """Alias of ``to_string``."""
        return self.to_string
116
+
117
+
118
class Request:
    """HTTP request description built from a seed plus keyword overrides.

    Keyword names listed in ``__REQUEST_ATTRS__`` are collected into
    ``request_setting`` and handed to ``requests.request``; every other
    key becomes an instance attribute. Reading an attribute that was never
    set yields ``None``.
    """

    # Keys describing scheduling state; they are recorded for ``Params``.
    __SEED_PARAMS__ = [
        "retry",
        "priority",
        "version",
        "status"
    ]

    # Keyword arguments that are passed through to ``requests.request``.
    __REQUEST_ATTRS__ = {
        "params",
        "headers",
        "cookies",
        "data",
        "json",
        "files",
        "auth",
        "timeout",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "allow_redirects",
    }

    def __init__(
            self,
            seed,
            random_ua=True,
            check_status_code=True,
            retry=None,
            priority=None,
            version=None,
            status=None,
            **kwargs
    ):
        """Create a request.

        Args:
            seed: ``Seed`` instance, JSON object string or ``dict`` whose
                items are merged over ``kwargs``; other types are ignored.
            random_ua: When true, add a random ``user-agent`` header unless
                the headers already contain one.
            check_status_code: When true, ``download`` raises on HTTP error
                status codes.
            retry, priority, version, status: Defaults for ``Params``;
                same-named keys found in the seed take precedence.
            **kwargs: Request settings and extra attributes.
        """
        self.check_status_code = check_status_code
        self.request_setting = {}

        seed_params = {
            "retry": retry,
            "priority": priority,
            "version": version,
            "status": status,
        }

        if isinstance(seed, Seed):
            kwargs.update(**seed.to_dict)
        elif isinstance(seed, str):
            kwargs.update(**json.loads(seed))
        elif isinstance(seed, dict):
            kwargs.update(**seed)

        for k, v in kwargs.items():
            if k in self.__class__.__REQUEST_ATTRS__:
                self.request_setting[k] = v
                continue
            elif k in self.__SEED_PARAMS__:
                # Scheduling keys feed ``Params`` and are additionally kept
                # as plain attributes (no ``continue`` here).
                seed_params[k] = v
            self.__setattr__(k, v)

        if not getattr(self, "method", None):
            has_body = self.request_setting.get("data") or self.request_setting.get("json")
            self.method = "POST" if has_body else "GET"

        if random_ua:
            self._build_header()

        self.params = Params(**seed_params)

    @property
    def _random_ua(self) -> str:
        """Return a randomized macOS Edge/Chrome user-agent string."""
        v1 = random.randint(4, 15)
        v2 = random.randint(3, 11)
        v3 = random.randint(1, 16)
        v4 = random.randint(533, 605)
        v5 = random.randint(1000, 6000)
        v6 = random.randint(10, 80)
        user_agent = (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) AppleWebKit/{v4}.{v3} "
                      f"(KHTML, like Gecko) Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
        return user_agent

    def _build_header(self) -> None:
        """Ensure the outgoing headers carry a ``user-agent`` entry.

        The caller's headers mapping is copied before the entry is added,
        so a dict shared between requests (or owned by a seed) is never
        modified behind the caller's back.
        """
        headers = self.request_setting.get("headers")
        if not headers:
            self.request_setting["headers"] = {"accept": "*/*", "user-agent": self._random_ua}
        elif not any(key.lower() == "user-agent" for key in headers):
            patched = dict(headers)
            patched["user-agent"] = self._random_ua
            self.request_setting["headers"] = patched

    def download(self) -> "requests.Response":
        """Send the request and return the ``requests`` response.

        No timeout is applied unless one was supplied as the ``timeout``
        request setting.

        Raises:
            requests.HTTPError: If ``check_status_code`` is true and the
                response has an error status.
        """
        response = requests.request(self.method, self.url, **self.request_setting)
        if self.check_status_code:
            response.raise_for_status()
        return response

    def __getattr__(self, name):
        # Only reached when normal lookup fails. Dunder probes made by
        # copy/pickle (e.g. ``__setstate__``) must fail properly; answering
        # ``None`` would make those protocols try to call ``None``.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        return None

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)

    @property
    def to_dict(self):
        """Shallow copy of the attributes minus ``params`` and ``check_status_code``."""
        _dict = self.__dict__.copy()
        _dict.pop('params')
        _dict.pop('check_status_code')
        return _dict

    @property
    def to_string(self) -> str:
        """Compact JSON form of ``to_dict`` (no spaces, non-ASCII kept)."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )

    @property
    def seed(self):
        """Alias of ``to_string``."""
        return self.to_string
243
+
244
+
245
class Response:
    """Download result paired with the seed that produced it.

    Extra keyword arguments become instance attributes. Reading an
    attribute that was never set yields ``None``, and attributes can also
    be accessed as ``response["key"]``.
    """

    # Attributes that are bookkeeping rather than payload; ``to_dict``
    # leaves them out.
    _HIDDEN_KEYS = ("seed", "response", "method", "params", "request_setting")

    def __init__(
            self,
            seed,
            response,
            retry=None,
            priority=None,
            version=None,
            status=None,
            **kwargs
    ):
        """Create a response wrapper.

        Args:
            seed: The seed this response belongs to.
            response: The downloaded payload / response object.
            retry, priority, version, status: Forwarded to ``Params``.
            **kwargs: Extra attributes to set on the instance.
        """
        self.seed = seed
        self.response = response
        # Scheduling keys are bound to the named parameters above, so
        # ``kwargs`` only ever holds ordinary attributes.
        for k, v in kwargs.items():
            setattr(self, k, v)
        self.params = Params(
            retry=retry,
            priority=priority,
            version=version,
            status=status,
        )

    @property
    def to_dict(self):
        """Shallow copy of the attributes without the bookkeeping keys.

        ``method`` and ``request_setting`` are only present when a caller
        passed them in, so missing keys are skipped instead of raising.
        """
        _dict = self.__dict__.copy()
        for key in self._HIDDEN_KEYS:
            _dict.pop(key, None)
        return _dict

    @property
    def to_string(self) -> str:
        """Compact JSON form of ``to_dict`` (no spaces, non-ASCII kept)."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )

    def __getattr__(self, name):
        # Only reached when normal lookup fails. Dunder probes made by
        # copy/pickle (e.g. ``__setstate__``) must fail properly; answering
        # ``None`` would make those protocols try to call ``None``.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        return None

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)
@@ -0,0 +1,30 @@
1
+ from collections import deque
2
+
3
+
4
class Queue:
    """Small double-ended queue wrapper that never raises on push or pop."""

    def __init__(self):
        self._queue = deque()

    @property
    def length(self) -> int:
        """Number of items currently queued."""
        return len(self._queue)

    def push(self, data, left: bool = False, direct_insertion: bool = False):
        """Add ``data`` to the queue; falsy ``data`` is ignored.

        A list or tuple is unpacked item by item unless ``direct_insertion``
        is true, in which case it is stored as a single element. ``left``
        selects the left end; unpacking on the left uses ``extendleft``, so
        the items end up in reverse order.
        """
        try:
            if not data:
                return None
            unpack = isinstance(data, (list, tuple)) and not direct_insertion
            if unpack and left:
                self._queue.extendleft(data)
            elif unpack:
                self._queue.extend(data)
            elif left:
                self._queue.appendleft(data)
            else:
                self._queue.append(data)
        except AttributeError:
            pass

    def pop(self, left: bool = True):
        """Remove and return one item, or ``None`` when nothing is available."""
        try:
            if left:
                return self._queue.popleft()
            return self._queue.pop()
        except (IndexError, AttributeError):
            return None
@@ -0,0 +1,40 @@
1
+ from functools import wraps
2
+
3
+
4
+ # def check_redis_status(func):
5
+ # @wraps(func)
6
+ # def wrapper(*args, **kwargs):
7
+ # try:
8
+ # result = func(*args, **kwargs)
9
+ # except Exception:
10
+ # result = False
11
+ # return result
12
+ #
13
+ # return wrapper
14
+
15
+
16
def decorator_oss_db(exception, retries=3):
    """Retry the decorated callable and re-raise failures as ``exception``.

    The wrapped function is called up to ``retries`` times. The first
    successful call returns its result immediately. If every attempt
    raises, the last error is wrapped in ``exception`` and raised, with the
    original error attached as its cause.

    Args:
        exception: Exception class used to wrap the final failure; it is
            called with the caught exception as its only argument.
        retries: Maximum number of attempts. With ``retries`` below one the
            wrapped function is never called and ``None`` is returned.

    Returns:
        A decorator for callables whose first positional argument is
        ``callback_func``.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(callback_func, *args, **kwargs):
            for attempt in range(retries):
                try:
                    return func(callback_func, *args, **kwargs)
                except Exception as e:
                    # Give up only on the last allowed attempt, whatever
                    # ``retries`` is set to.
                    if attempt >= retries - 1:
                        raise exception(e) from e
            return None

        return wrapper

    return decorator
38
+
39
+
40
+
cobweb/base/dotting.py ADDED
@@ -0,0 +1,35 @@
1
+ import os
2
+ import json
3
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
4
+
5
+
6
class LoghubDot:
    """Sends tracking ("dotting") records to an Aliyun log store.

    The client is configured from the ``DOTTING_ENDPOINT``,
    ``DOTTING_ACCESS_KEY`` and ``DOTTING_SECRET_KEY`` environment
    variables. When any of them is empty or unset, no client is created
    and ``build`` does nothing.
    """

    def __init__(self):
        credentials = {
            "endpoint": os.getenv("DOTTING_ENDPOINT", ""),
            "accessKeyId": os.getenv("DOTTING_ACCESS_KEY", ""),
            "accessKey": os.getenv("DOTTING_SECRET_KEY", ""),
        }
        if all(credentials.values()):
            self.client = LogClient(**credentials)
        else:
            self.client = None

    def build(self, topic, **kwargs):
        """Upload one log item made of ``kwargs`` under ``topic``.

        Non-string values are JSON-encoded (non-ASCII kept); the resulting
        key/value pairs are sorted by key before being sent. The target
        project and log store names are fixed in this method.
        """
        if not self.client:
            return None

        log_item = LogItem()
        encoded = {
            key: value if isinstance(value, str) else json.dumps(value, ensure_ascii=False)
            for key, value in kwargs.items()
        }
        log_item.set_contents(sorted(encoded.items()))

        request = PutLogsRequest(
            project="databee-download-log",
            logstore="download-logging",
            topic=topic,
            logitems=[log_item],
            compress=True
        )
        self.client.put_logs(request=request)
cobweb/base/item.py ADDED
@@ -0,0 +1,46 @@
1
+ from .seed import Seed
2
+ from collections import namedtuple
3
+
4
+
5
class Item(type):
    """Metaclass that gives every concrete item class a ``Data`` record type.

    For each class other than the one named ``BaseItem``, a namedtuple
    type is built from the class attributes ``__TABLE__`` (type name) and
    ``__FIELDS__`` (field names) and stored as ``Data``.
    """

    def __new__(cls, name, bases, dct):
        new_class_instance = type.__new__(cls, name, bases, dct)
        if name != "BaseItem":
            table = getattr(new_class_instance, "__TABLE__")
            fields = getattr(new_class_instance, "__FIELDS__")
            new_class_instance.Data = namedtuple(table, fields)
        return new_class_instance


class BaseItem(metaclass=Item):
    """Base class for stored items; subclasses declare table and fields.

    Subclasses set ``__TABLE__`` to the table name and ``__FIELDS__`` to
    the field names in any form ``collections.namedtuple`` accepts.
    """

    __TABLE__ = ""
    __FIELDS__ = ""

    def __init__(self, seed: "Seed", **kwargs):
        """Split ``kwargs`` into record fields and extra attributes.

        Args:
            seed: The seed the item originates from.
            **kwargs: Values whose names match a declared field populate
                ``self.data``; all others are set as instance attributes.

        Raises:
            TypeError: If a declared field is missing from ``kwargs``.
        """
        self.seed = seed

        # Compare against the parsed field names. ``__FIELDS__`` may be a
        # plain string, where ``in`` would be a substring test and treat
        # e.g. "al" as a field of "alpha beta".
        field_names = self.Data._fields

        data = {}
        for key, value in kwargs.items():
            if key in field_names:
                data[key] = value
            else:
                setattr(self, key, value)

        self.data = self.Data(**data)

    @property
    def to_dict(self):
        """The record fields as a mapping."""
        return self.data._asdict()

    @property
    def table(self):
        """The table name, i.e. the name of the ``Data`` record type."""
        return self.Data.__name__


class ConsoleItem(BaseItem):
    """Item with a single ``data`` field in the ``console`` table."""

    __TABLE__ = "console"
    __FIELDS__ = "data"
46
+
@@ -52,6 +52,7 @@ class ColorCodes:
52
52
 
53
53
 
54
54
  class Log:
55
+ logging.getLogger('oss2.api').setLevel(logging.WARNING)
55
56
  logging.basicConfig(
56
57
  level=logging.INFO,
57
58
  format=f'%(asctime)s %(name)s [%(filename)s:%(lineno)d %(funcName)s]'
@@ -87,10 +88,7 @@ class Log:
87
88
  return self.__class__.log.critical
88
89
 
89
90
 
90
- log = Log()
91
- # log.info("This text will be bold!")
92
- # print(ColorCodes.BOLD + "This text will be bold!" + ColorCodes.RESET)
93
- # print(ColorCodes.UNDERLINE + ColorCodes.BLUE + "This text will be underlined and blue!" + ColorCodes.RESET)
94
- # print(ColorCodes.BG_YELLOW + ColorCodes.RED + "This text will have a yellow background and red text!" + ColorCodes.RESET)
95
- # print(ColorCodes.BLINK + "This text will blink (if supported by the terminal)!" + ColorCodes.RESET)
91
+ logger = Log()
92
+
93
+
96
94
 
cobweb/base/request.py ADDED
@@ -0,0 +1,82 @@
1
+ import random
2
+ import requests
3
+
4
+
5
class Request:
    """HTTP request description for a URL and the seed it came from.

    Keyword names listed in ``__REQUEST_ATTRS__`` are collected into
    ``request_setting`` and handed to ``requests.request``; every other
    keyword becomes an instance attribute.
    """

    # Keyword arguments that are passed through to ``requests.request``.
    __REQUEST_ATTRS__ = {
        "params",
        "headers",
        "cookies",
        "data",
        "json",
        "files",
        "auth",
        "timeout",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "allow_redirects",
    }

    def __init__(
            self,
            url,
            seed,
            random_ua=True,
            check_status_code=True,
            **kwargs
    ):
        """Create a request.

        Args:
            url: Target URL.
            seed: The seed this request was built from.
            random_ua: When true, add a random ``user-agent`` header unless
                the headers already contain one.
            check_status_code: When true, ``download`` raises on HTTP error
                status codes.
            **kwargs: Request settings and extra attributes. ``method``
                defaults to ``"POST"`` when a truthy ``data`` or ``json``
                setting is given, otherwise ``"GET"``.
        """
        self.url = url
        self.seed = seed
        self.check_status_code = check_status_code
        self.request_setting = {}

        for k, v in kwargs.items():
            if k in self.__class__.__REQUEST_ATTRS__:
                self.request_setting[k] = v
                continue
            self.__setattr__(k, v)

        if not getattr(self, "method", None):
            has_body = self.request_setting.get("data") or self.request_setting.get("json")
            self.method = "POST" if has_body else "GET"

        if random_ua:
            self._build_header()

    @property
    def _random_ua(self) -> str:
        """Return a randomized macOS Edge/Chrome user-agent string."""
        v1 = random.randint(4, 15)
        v2 = random.randint(3, 11)
        v3 = random.randint(1, 16)
        v4 = random.randint(533, 605)
        v5 = random.randint(1000, 6000)
        v6 = random.randint(10, 80)
        user_agent = (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) AppleWebKit/{v4}.{v3} "
                      f"(KHTML, like Gecko) Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
        return user_agent

    def _build_header(self) -> None:
        """Ensure the outgoing headers carry a ``user-agent`` entry.

        The caller's headers mapping is copied before the entry is added,
        so a dict shared between several requests is never modified and
        each request gets its own random user agent.
        """
        headers = self.request_setting.get("headers")
        if not headers:
            self.request_setting["headers"] = {"accept": "*/*", "user-agent": self._random_ua}
        elif not any(key.lower() == "user-agent" for key in headers):
            patched = dict(headers)
            patched["user-agent"] = self._random_ua
            self.request_setting["headers"] = patched

    def download(self) -> "requests.Response":
        """Send the request and return the ``requests`` response.

        No timeout is applied unless one was supplied as the ``timeout``
        request setting.

        Raises:
            requests.HTTPError: If ``check_status_code`` is true and the
                response has an error status.
        """
        response = requests.request(self.method, self.url, **self.request_setting)
        if self.check_status_code:
            response.raise_for_status()
        return response

    @property
    def to_dict(self):
        """Extra attributes only: a copy of the instance state without
        ``url``, ``seed``, ``check_status_code`` and ``request_setting``."""
        _dict = self.__dict__.copy()
        _dict.pop('url')
        _dict.pop('seed')
        _dict.pop('check_status_code')
        _dict.pop('request_setting')
        return _dict
81
+
82
+
@@ -0,0 +1,23 @@
1
+
2
+
3
class Response:
    """Pairs a downloaded response with its seed and any extra attributes."""

    def __init__(
            self,
            seed,
            response,
            **kwargs
    ):
        """Store ``seed`` and ``response``; each keyword becomes an attribute."""
        self.seed = seed
        self.response = response

        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)

    @property
    def to_dict(self):
        """Copy of the instance attributes without ``seed`` and ``response``."""
        extras = dict(self.__dict__)
        for hidden in ('seed', 'response'):
            del extras[hidden]
        return extras
23
+