cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.42__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. cobweb/__init__.py +2 -11
  2. cobweb/base/__init__.py +9 -0
  3. cobweb/base/basic.py +297 -0
  4. cobweb/base/common_queue.py +30 -0
  5. cobweb/base/decorators.py +40 -0
  6. cobweb/base/dotting.py +35 -0
  7. cobweb/base/item.py +46 -0
  8. cobweb/{log.py → base/log.py} +4 -6
  9. cobweb/base/request.py +82 -0
  10. cobweb/base/response.py +23 -0
  11. cobweb/base/seed.py +114 -0
  12. cobweb/constant.py +94 -0
  13. cobweb/crawlers/__init__.py +1 -0
  14. cobweb/crawlers/base_crawler.py +144 -0
  15. cobweb/crawlers/crawler.py +212 -0
  16. cobweb/crawlers/file_crawler.py +98 -0
  17. cobweb/db/__init__.py +2 -2
  18. cobweb/db/api_db.py +82 -0
  19. cobweb/db/redis_db.py +125 -218
  20. cobweb/exceptions/__init__.py +1 -0
  21. cobweb/exceptions/oss_db_exception.py +28 -0
  22. cobweb/launchers/__init__.py +3 -0
  23. cobweb/launchers/launcher.py +235 -0
  24. cobweb/launchers/launcher_air.py +88 -0
  25. cobweb/launchers/launcher_api.py +209 -0
  26. cobweb/launchers/launcher_pro.py +208 -0
  27. cobweb/pipelines/__init__.py +3 -0
  28. cobweb/pipelines/pipeline.py +69 -0
  29. cobweb/pipelines/pipeline_console.py +22 -0
  30. cobweb/pipelines/pipeline_loghub.py +34 -0
  31. cobweb/schedulers/__init__.py +3 -0
  32. cobweb/schedulers/scheduler_api.py +72 -0
  33. cobweb/schedulers/scheduler_redis.py +72 -0
  34. cobweb/setting.py +67 -6
  35. cobweb/utils/__init__.py +5 -0
  36. cobweb/utils/bloom.py +58 -0
  37. cobweb/utils/dotting.py +32 -0
  38. cobweb/utils/oss.py +94 -0
  39. cobweb/utils/tools.py +42 -0
  40. cobweb_launcher-1.2.42.dist-info/METADATA +205 -0
  41. cobweb_launcher-1.2.42.dist-info/RECORD +44 -0
  42. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/WHEEL +1 -1
  43. cobweb/bbb.py +0 -191
  44. cobweb/db/oss_db.py +0 -127
  45. cobweb/db/scheduler/__init__.py +0 -0
  46. cobweb/db/scheduler/default.py +0 -8
  47. cobweb/db/scheduler/textfile.py +0 -27
  48. cobweb/db/storer/__init__.py +0 -0
  49. cobweb/db/storer/console.py +0 -9
  50. cobweb/db/storer/loghub.py +0 -54
  51. cobweb/db/storer/redis.py +0 -15
  52. cobweb/db/storer/textfile.py +0 -15
  53. cobweb/decorators.py +0 -16
  54. cobweb/distributed/__init__.py +0 -0
  55. cobweb/distributed/launcher.py +0 -243
  56. cobweb/distributed/models.py +0 -143
  57. cobweb/interface.py +0 -34
  58. cobweb/single/__init__.py +0 -0
  59. cobweb/single/launcher.py +0 -231
  60. cobweb/single/models.py +0 -134
  61. cobweb/single/nest.py +0 -153
  62. cobweb/task.py +0 -50
  63. cobweb/utils.py +0 -90
  64. cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
  65. cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
  66. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/LICENSE +0 -0
  67. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/top_level.txt +0 -0
cobweb/__init__.py CHANGED
@@ -1,11 +1,2 @@
1
- from .bbb import Seed, Queue, DBItem
2
- from .task import Task
3
- from .log import log
4
- from .interface import SchedulerInterface, StorerInterface
5
- from .db.redis_db import RedisDB
6
- from .db.oss_db import OssDB
7
- from .distributed.launcher import launcher
8
- from .single.launcher import launcher as single_launcher
9
- from . import setting
10
-
11
-
1
+ from .launchers import LauncherAir, LauncherPro, LauncherApi
2
+ from .constant import CrawlerModel
@@ -0,0 +1,9 @@
1
+ from .common_queue import Queue
2
+ from .response import Response
3
+ from .request import Request
4
+ from .item import BaseItem, ConsoleItem
5
+ from .seed import Seed
6
+
7
+ from .log import logger
8
+ from .decorators import decorator_oss_db
9
+
cobweb/base/basic.py ADDED
@@ -0,0 +1,297 @@
1
+ import json
2
+ import random
3
+ import time
4
+ import hashlib
5
+ import requests
6
+
7
+
8
class Params:
    """Scheduling metadata attached to a seed, request or response.

    Every falsy argument (``None``, ``0``, ``""``) is replaced by the
    default for ``retry``, ``priority`` and ``version``; ``status`` is
    stored exactly as given.
    """

    def __init__(self, retry=None, priority=None, version=None, status=None):
        # Number of retries so far; defaults to 0.
        self.retry = retry if retry else 0
        # Priority value; defaults to 300.
        self.priority = priority if priority else 300
        # Version stamp; defaults to the current Unix time in whole seconds.
        if not version:
            version = int(time.time())
        self.version = version
        self.status = status
15
+
16
+
17
class Seed:
    """A crawl seed: a bag of attributes plus scheduling ``params``.

    A seed is built from a JSON object string, a plain URL string (or
    bytes), or a dict. Keys listed in ``__SEED_PARAMS__`` are never stored
    as attributes; the scheduling values come from the explicit keyword
    arguments and are kept on ``self.params``.

    Unknown (non-dunder) attributes read as ``None`` instead of raising.
    """

    __SEED_PARAMS__ = [
        "retry",
        "priority",
        "version",
        "status"
    ]

    def __init__(
            self,
            seed,
            sid=None,
            retry=None,
            priority=None,
            version=None,
            status=None,
            **kwargs
    ):
        """Populate the seed.

        Args:
            seed: JSON object text, a URL as ``str``/``bytes``, or a dict.
            sid: explicit seed id; when omitted and the seed carries no
                ``sid``, an MD5 of the serialized seed is used.
            retry, priority, version, status: forwarded to ``Params``.
            **kwargs: extra attributes stored on the seed.

        Raises:
            TypeError: if ``seed`` is not ``str``, ``bytes`` or ``dict``.
        """
        if isinstance(seed, (str, bytes)):
            try:
                item = json.loads(seed)
            except json.JSONDecodeError:
                item = None
            if isinstance(item, dict):
                self._init_seed(item)
            else:
                # Not a JSON object (plain URL, or JSON scalar/array):
                # treat the text itself as the URL. Bytes are decoded so
                # the seed stays JSON-serializable for ``to_string``.
                self.url = seed.decode("utf-8") if isinstance(seed, bytes) else seed
        elif isinstance(seed, dict):
            self._init_seed(seed)
        else:
            raise TypeError(
                f"seed type error, "
                f"must be str or dict! "
                f"seed: {seed}"
            )

        # retry/priority/version/status are named parameters, so they can
        # never appear in ``kwargs``; everything left is a plain attribute.
        if kwargs:
            self._init_seed(kwargs)

        if sid or not getattr(self, "sid", None):
            self._init_id(sid)
        self.params = Params(
            retry=retry,
            priority=priority,
            version=version,
            status=status,
        )

    def __getattr__(self, name):
        # Only called when normal lookup fails. Dunder probes must raise,
        # otherwise protocols such as copy/pickle find a "method" that is
        # actually None and try to call it.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        return None

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)

    def __str__(self):
        # ``params`` is a plain object and not JSON-serializable, so it is
        # rendered through its attribute dict.
        snapshot = dict(self.__dict__)
        params = snapshot.get("params")
        if params is not None and hasattr(params, "__dict__"):
            snapshot["params"] = dict(vars(params))
        return json.dumps(snapshot, ensure_ascii=False)

    def __repr__(self):
        chars = [f"{k}={v}" for k, v in self.__dict__.items()]
        return f'{self.__class__.__name__}({", ".join(chars)})'

    def _init_seed(self, seed_info: dict):
        """Copy every key that is not a scheduling parameter onto the seed."""
        for k, v in seed_info.items():
            if k not in self.__SEED_PARAMS__:
                setattr(self, k, v)

    def _init_id(self, sid):
        """Set ``sid``; falls back to the MD5 hex digest of ``to_string``."""
        if not sid:
            sid = hashlib.md5(self.to_string.encode()).hexdigest()
        self.sid = sid

    @property
    def to_dict(self) -> dict:
        """Attribute dict of the seed without a truthy ``params`` entry."""
        seed = self.__dict__.copy()
        if seed.get("params"):
            del seed["params"]
        return seed

    @property
    def to_string(self) -> str:
        """Compact JSON form of ``to_dict`` (non-ASCII kept verbatim)."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )

    @property
    def seed(self):
        """Alias of ``to_string``."""
        return self.to_string
116
+
117
+
118
class Request:
    """An HTTP request description built from a seed.

    Keyword values whose names appear in ``__REQUEST_ATTRS__`` are
    collected in ``request_setting`` and passed to ``requests.request``;
    all other values become attributes of the instance. Unknown
    (non-dunder) attributes read as ``None`` instead of raising.
    """

    __SEED_PARAMS__ = [
        "retry",
        "priority",
        "version",
        "status"
    ]

    __REQUEST_ATTRS__ = {
        "params",
        "headers",
        "cookies",
        "data",
        "json",
        "files",
        "auth",
        "timeout",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "allow_redirects",
    }

    def __init__(
            self,
            seed,
            random_ua=True,
            check_status_code=True,
            retry=None,
            priority=None,
            version=None,
            status=None,
            **kwargs
    ):
        """Build the request.

        Args:
            seed: a ``Seed``, a JSON object string, a plain URL string, or
                a dict; its entries are merged over ``kwargs``.
            random_ua: add a random ``user-agent`` header when none is set.
            check_status_code: make ``download`` raise on HTTP error codes.
            retry, priority, version, status: forwarded to ``Params``
                unless the seed itself carries the same keys.
            **kwargs: request options and extra attributes.
        """
        self.check_status_code = check_status_code
        self.request_setting = {}

        seed_params = {
            "retry": retry,
            "priority": priority,
            "version": version,
            "status": status,
        }

        if isinstance(seed, Seed):
            kwargs.update(seed.to_dict)
        elif isinstance(seed, str):
            try:
                parsed = json.loads(seed)
            except json.JSONDecodeError:
                parsed = None
            # A string that is not a JSON object is taken as the URL,
            # mirroring how Seed treats plain strings.
            kwargs.update(parsed if isinstance(parsed, dict) else {"url": seed})
        elif isinstance(seed, dict):
            kwargs.update(seed)

        for k, v in kwargs.items():
            if k in self.__class__.__REQUEST_ATTRS__:
                self.request_setting[k] = v
                continue
            elif k in self.__SEED_PARAMS__:
                seed_params[k] = v
            # NOTE(review): scheduling keys fall through and are also stored
            # as plain attributes (original behaviour) - confirm intended.
            setattr(self, k, v)

        if not getattr(self, "method", None):
            has_body = self.request_setting.get("data") or self.request_setting.get("json")
            self.method = "POST" if has_body else "GET"

        if random_ua:
            self._build_header()

        self.params = Params(**seed_params)

    @property
    def _random_ua(self) -> str:
        """Return a randomized desktop Edge-on-macOS user-agent string."""
        v1 = random.randint(4, 15)
        v2 = random.randint(3, 11)
        v3 = random.randint(1, 16)
        v4 = random.randint(533, 605)
        v5 = random.randint(1000, 6000)
        v6 = random.randint(10, 80)
        user_agent = (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) AppleWebKit/{v4}.{v3} "
                      f"(KHTML, like Gecko) Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
        return user_agent

    def _build_header(self) -> None:
        """Ensure ``request_setting['headers']`` carries a user-agent.

        The caller's headers mapping is copied before the user-agent is
        added, so a dict shared between requests is never modified.
        """
        headers = self.request_setting.get("headers")
        if not headers:
            self.request_setting["headers"] = {"accept": "*/*", "user-agent": self._random_ua}
        elif "user-agent" not in [key.lower() for key in headers.keys()]:
            merged = dict(headers)
            merged["user-agent"] = self._random_ua
            self.request_setting["headers"] = merged

    def download(self) -> "requests.Response":
        """Send the request; raise for HTTP errors if ``check_status_code``."""
        response = requests.request(self.method, self.url, **self.request_setting)
        if self.check_status_code:
            response.raise_for_status()
        return response

    def __getattr__(self, name):
        # Only called when normal lookup fails. Dunder probes must raise so
        # that copy/pickle do not mistake None for a protocol method.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        return None

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)

    @property
    def to_dict(self):
        """Attribute dict without ``params`` and ``check_status_code``."""
        _dict = self.__dict__.copy()
        _dict.pop('params', None)
        _dict.pop('check_status_code', None)
        return _dict

    @property
    def to_string(self) -> str:
        """Compact JSON form of ``to_dict`` (non-ASCII kept verbatim)."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )

    @property
    def seed(self):
        """Alias of ``to_string``."""
        return self.to_string
243
+
244
+
245
class Response:
    """Result of a download: the originating seed, the raw response and
    any extra attributes, plus scheduling ``params``.

    Unknown (non-dunder) attributes read as ``None`` instead of raising.
    """

    # Attributes that ``to_dict`` leaves out of its result.
    _EXCLUDED_KEYS = ("seed", "response", "method", "params", "request_setting")

    def __init__(
            self,
            seed,
            response,
            retry=None,
            priority=None,
            version=None,
            status=None,
            **kwargs
    ):
        """Store ``seed``/``response``, extra attributes and ``Params``.

        retry/priority/version/status are named parameters, so they can
        never show up in ``kwargs``; every kwarg becomes an attribute.
        """
        self.seed = seed
        self.response = response
        for k, v in kwargs.items():
            setattr(self, k, v)
        self.params = Params(
            retry=retry,
            priority=priority,
            version=version,
            status=status,
        )

    @property
    def to_dict(self):
        """Extra attributes only.

        ``method`` and ``request_setting`` are usually not set on a
        response, so every exclusion tolerates a missing key instead of
        raising ``KeyError``.
        """
        _dict = self.__dict__.copy()
        for key in self._EXCLUDED_KEYS:
            _dict.pop(key, None)
        return _dict

    @property
    def to_string(self) -> str:
        """Compact JSON form of ``to_dict`` (non-ASCII kept verbatim)."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )

    def __getattr__(self, name):
        # Only called when normal lookup fails. Dunder probes must raise so
        # that copy/pickle do not mistake None for a protocol method.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        return None

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)
@@ -0,0 +1,30 @@
1
+ from collections import deque
2
+
3
+
4
class Queue:
    """Thin wrapper around ``collections.deque`` that never raises on use."""

    def __init__(self):
        self._queue = deque()

    @property
    def length(self) -> int:
        """Number of entries currently queued."""
        return len(self._queue)

    def push(self, data, left: bool = False, direct_insertion: bool = False):
        """Add ``data`` to the queue; falsy ``data`` is ignored.

        A list or tuple is spread into individual entries unless
        ``direct_insertion`` is true, in which case it is stored as one
        entry. ``left`` selects the left end of the deque.
        """
        try:
            if not data:
                return None
            spread = (not direct_insertion) and isinstance(data, (list, tuple))
            if spread and left:
                self._queue.extendleft(data)
            elif spread:
                self._queue.extend(data)
            elif left:
                self._queue.appendleft(data)
            else:
                self._queue.append(data)
        except AttributeError:
            pass

    def pop(self, left: bool = True):
        """Remove and return one entry, or ``None`` when nothing is queued."""
        try:
            if left:
                return self._queue.popleft()
            return self._queue.pop()
        except (IndexError, AttributeError):
            return None
@@ -0,0 +1,40 @@
1
+ from functools import wraps
2
+
3
+
4
+ # def check_redis_status(func):
5
+ # @wraps(func)
6
+ # def wrapper(*args, **kwargs):
7
+ # try:
8
+ # result = func(*args, **kwargs)
9
+ # except Exception:
10
+ # result = False
11
+ # return result
12
+ #
13
+ # return wrapper
14
+
15
+
16
def decorator_oss_db(exception, retries=3):
    """Build a decorator that retries a call and re-raises as ``exception``.

    The wrapped callable is invoked up to ``retries`` times. The first
    successful return value is returned immediately. If every attempt
    raises, ``exception`` is raised with the last error as its argument
    (and as its ``__cause__``).

    Args:
        exception: exception class (or factory) used for the final raise.
        retries: maximum number of attempts; with a value below 1 the
            wrapped callable is never invoked and ``None`` is returned.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(callback_func, *args, **kwargs):
            last_error = None
            for _ in range(retries):
                try:
                    return func(callback_func, *args, **kwargs)
                except Exception as e:
                    # Remember the failure and try again.
                    last_error = e
            if last_error is not None:
                # Raise only once the configured attempts are exhausted,
                # whatever value ``retries`` has.
                raise exception(last_error) from last_error
            return None

        return wrapper

    return decorator
38
+
39
+
40
+
cobweb/base/dotting.py ADDED
@@ -0,0 +1,35 @@
1
+ import os
2
+ import json
3
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
4
+
5
+
6
class LoghubDot:
    """Sends key/value records to a fixed Aliyun Log project and logstore.

    The client is created only when the ``DOTTING_ENDPOINT``,
    ``DOTTING_ACCESS_KEY`` and ``DOTTING_SECRET_KEY`` environment variables
    are all non-empty; otherwise ``build`` does nothing.
    """

    def __init__(self):
        endpoint = os.getenv("DOTTING_ENDPOINT", "")
        key_id = os.getenv("DOTTING_ACCESS_KEY", "")
        key_secret = os.getenv("DOTTING_SECRET_KEY", "")
        if endpoint and key_id and key_secret:
            self.client = LogClient(endpoint=endpoint, accessKeyId=key_id, accessKey=key_secret)
        else:
            self.client = None

    def build(self, topic, **kwargs):
        """Ship ``kwargs`` as one log item under ``topic``.

        Non-string values are JSON-encoded; the fields are sent sorted
        by key.
        """
        if not self.client:
            return
        entry = LogItem()
        fields = {
            name: value if isinstance(value, str) else json.dumps(value, ensure_ascii=False)
            for name, value in kwargs.items()
        }
        entry.set_contents(sorted(fields.items()))
        request = PutLogsRequest(
            project="databee-download-log",
            logstore="download-logging",
            topic=topic,
            logitems=[entry],
            compress=True
        )
        self.client.put_logs(request=request)
cobweb/base/item.py ADDED
@@ -0,0 +1,46 @@
1
+ from .seed import Seed
2
+ from collections import namedtuple
3
+
4
+
5
class Item(type):
    """Metaclass that attaches a ``Data`` namedtuple to every item class.

    For each class other than the one named ``BaseItem``, ``Data`` is
    built from the class attributes ``__TABLE__`` (type name) and
    ``__FIELDS__`` (field names).
    """

    def __new__(cls, name, bases, dct):
        created = super().__new__(cls, name, bases, dct)
        if name == "BaseItem":
            # The abstract base carries no table, so no Data type.
            return created
        table_name = created.__TABLE__
        field_spec = created.__FIELDS__
        created.Data = namedtuple(table_name, field_spec)
        return created
14
+
15
+
16
class BaseItem(metaclass=Item):
    """Base class for result items.

    Subclasses declare ``__TABLE__`` and ``__FIELDS__``; the metaclass
    turns them into the ``Data`` namedtuple used to hold the record.
    """

    __TABLE__ = ""
    __FIELDS__ = ""

    def __init__(self, seed: Seed, **kwargs):
        """Split ``kwargs`` into record fields and plain attributes.

        Keyword names that are fields of ``Data`` populate the record;
        every other keyword is stored as an attribute on the item.
        """
        self.seed = seed

        # Compare against the parsed field names. ``__FIELDS__`` may be a
        # single string, where ``in`` would be a substring test and could
        # misclassify a key such as "at" for the field "data".
        field_names = set(self.Data._fields)

        data = {}
        for key, value in kwargs.items():
            if key in field_names:
                data[key] = value
            else:
                setattr(self, key, value)

        self.data = self.Data(**data)

    @property
    def to_dict(self):
        """The record as a dict, keyed by field name."""
        return self.data._asdict()

    @property
    def table(self):
        """The table name, i.e. the name of the ``Data`` namedtuple type."""
        return self.Data.__name__
40
+
41
+
42
class ConsoleItem(BaseItem):
    """Item with a single ``data`` field under the table name ``console``."""

    # Single record field.
    __FIELDS__ = "data"
    # Name given to the generated ``Data`` namedtuple.
    __TABLE__ = "console"
46
+
@@ -52,6 +52,7 @@ class ColorCodes:
52
52
 
53
53
 
54
54
  class Log:
55
+ logging.getLogger('oss2.api').setLevel(logging.WARNING)
55
56
  logging.basicConfig(
56
57
  level=logging.INFO,
57
58
  format=f'%(asctime)s %(name)s [%(filename)s:%(lineno)d %(funcName)s]'
@@ -87,10 +88,7 @@ class Log:
87
88
  return self.__class__.log.critical
88
89
 
89
90
 
90
- log = Log()
91
- # log.info("This text will be bold!")
92
- # print(ColorCodes.BOLD + "This text will be bold!" + ColorCodes.RESET)
93
- # print(ColorCodes.UNDERLINE + ColorCodes.BLUE + "This text will be underlined and blue!" + ColorCodes.RESET)
94
- # print(ColorCodes.BG_YELLOW + ColorCodes.RED + "This text will have a yellow background and red text!" + ColorCodes.RESET)
95
- # print(ColorCodes.BLINK + "This text will blink (if supported by the terminal)!" + ColorCodes.RESET)
91
+ logger = Log()
92
+
93
+
96
94
 
cobweb/base/request.py ADDED
@@ -0,0 +1,82 @@
1
+ import random
2
+ import requests
3
+
4
+
5
class Request:
    """An HTTP request description bound to a seed.

    Keyword values whose names appear in ``__REQUEST_ATTRS__`` are
    collected in ``request_setting`` and passed to ``requests.request``;
    all other keyword values become attributes of the instance.
    """

    __REQUEST_ATTRS__ = {
        "params",
        "headers",
        "cookies",
        "data",
        "json",
        "files",
        "auth",
        "timeout",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "allow_redirects",
    }

    def __init__(
            self,
            url,
            seed,
            random_ua=True,
            check_status_code=True,
            **kwargs
    ):
        """Build the request.

        Args:
            url: target URL.
            seed: the seed this request belongs to (stored as given).
            random_ua: add a random ``user-agent`` header when none is set.
            check_status_code: make ``download`` raise on HTTP error codes.
            **kwargs: request options and extra attributes.
        """
        self.url = url
        self.seed = seed
        self.check_status_code = check_status_code
        self.request_setting = {}

        for k, v in kwargs.items():
            if k in self.__class__.__REQUEST_ATTRS__:
                self.request_setting[k] = v
                continue
            setattr(self, k, v)

        # Without an explicit method, a request with a body is a POST.
        if not getattr(self, "method", None):
            has_body = self.request_setting.get("data") or self.request_setting.get("json")
            self.method = "POST" if has_body else "GET"

        if random_ua:
            self._build_header()

    @property
    def _random_ua(self) -> str:
        """Return a randomized desktop Edge-on-macOS user-agent string."""
        v1 = random.randint(4, 15)
        v2 = random.randint(3, 11)
        v3 = random.randint(1, 16)
        v4 = random.randint(533, 605)
        v5 = random.randint(1000, 6000)
        v6 = random.randint(10, 80)
        user_agent = (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) AppleWebKit/{v4}.{v3} "
                      f"(KHTML, like Gecko) Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
        return user_agent

    def _build_header(self) -> None:
        """Ensure ``request_setting['headers']`` carries a user-agent.

        The caller's headers mapping is copied before the user-agent is
        added, so a dict shared between requests is never modified.
        """
        headers = self.request_setting.get("headers")
        if not headers:
            self.request_setting["headers"] = {"accept": "*/*", "user-agent": self._random_ua}
        elif "user-agent" not in [key.lower() for key in headers.keys()]:
            merged = dict(headers)
            merged["user-agent"] = self._random_ua
            self.request_setting["headers"] = merged

    def download(self) -> "requests.Response":
        """Send the request; raise for HTTP errors if ``check_status_code``."""
        response = requests.request(self.method, self.url, **self.request_setting)
        if self.check_status_code:
            response.raise_for_status()
        return response

    @property
    def to_dict(self):
        """Extra attributes only: everything except ``url``, ``seed``,
        ``check_status_code`` and ``request_setting``."""
        _dict = self.__dict__.copy()
        _dict.pop('url')
        _dict.pop('seed')
        _dict.pop('check_status_code')
        _dict.pop('request_setting')
        return _dict
81
+
82
+
@@ -0,0 +1,23 @@
1
+
2
+
3
class Response:
    """Pairs a seed with the response obtained for it, plus extras."""

    def __init__(
            self,
            seed,
            response,
            **kwargs
    ):
        """Store ``seed`` and ``response``; each kwarg becomes an attribute."""
        self.seed = seed
        self.response = response

        for name, value in kwargs.items():
            setattr(self, name, value)

    @property
    def to_dict(self):
        """Extra attributes only: a copy without ``seed`` and ``response``."""
        extras = dict(vars(self))
        del extras['seed']
        del extras['response']
        return extras
23
+