cobweb-launcher 1.0.5__py3-none-any.whl → 3.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/__init__.py +5 -1
- cobweb/base/__init__.py +3 -3
- cobweb/base/common_queue.py +37 -16
- cobweb/base/item.py +40 -14
- cobweb/base/{log.py → logger.py} +3 -3
- cobweb/base/request.py +744 -47
- cobweb/base/response.py +381 -13
- cobweb/base/seed.py +98 -50
- cobweb/base/task_queue.py +180 -0
- cobweb/base/test.py +257 -0
- cobweb/constant.py +39 -2
- cobweb/crawlers/__init__.py +1 -2
- cobweb/crawlers/crawler.py +27 -0
- cobweb/db/__init__.py +1 -0
- cobweb/db/api_db.py +83 -0
- cobweb/db/redis_db.py +118 -27
- cobweb/launchers/__init__.py +3 -1
- cobweb/launchers/distributor.py +141 -0
- cobweb/launchers/launcher.py +103 -130
- cobweb/launchers/uploader.py +68 -0
- cobweb/log_dots/__init__.py +2 -0
- cobweb/log_dots/dot.py +258 -0
- cobweb/log_dots/loghub_dot.py +53 -0
- cobweb/pipelines/__init__.py +3 -2
- cobweb/pipelines/pipeline.py +19 -0
- cobweb/pipelines/pipeline_csv.py +25 -0
- cobweb/pipelines/pipeline_loghub.py +54 -0
- cobweb/schedulers/__init__.py +1 -0
- cobweb/schedulers/scheduler.py +66 -0
- cobweb/schedulers/scheduler_with_redis.py +189 -0
- cobweb/setting.py +37 -38
- cobweb/utils/__init__.py +5 -2
- cobweb/utils/bloom.py +58 -0
- cobweb/{base → utils}/decorators.py +14 -12
- cobweb/utils/dotting.py +300 -0
- cobweb/utils/oss.py +113 -86
- cobweb/utils/tools.py +3 -15
- cobweb_launcher-3.2.18.dist-info/METADATA +193 -0
- cobweb_launcher-3.2.18.dist-info/RECORD +44 -0
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/WHEEL +1 -1
- cobweb/crawlers/base_crawler.py +0 -121
- cobweb/crawlers/file_crawler.py +0 -181
- cobweb/launchers/launcher_pro.py +0 -174
- cobweb/pipelines/base_pipeline.py +0 -54
- cobweb/pipelines/loghub_pipeline.py +0 -34
- cobweb_launcher-1.0.5.dist-info/METADATA +0 -48
- cobweb_launcher-1.0.5.dist-info/RECORD +0 -32
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/top_level.txt +0 -0
cobweb/__init__.py
CHANGED
cobweb/base/__init__.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
+
from .item import BaseItem, CSVItem
|
|
1
2
|
from .common_queue import Queue
|
|
2
3
|
from .response import Response
|
|
3
4
|
from .request import Request
|
|
4
|
-
from .
|
|
5
|
+
from .logger import logger
|
|
5
6
|
from .seed import Seed
|
|
7
|
+
from .task_queue import TaskQueue, Status
|
|
6
8
|
|
|
7
|
-
from .log import logger
|
|
8
|
-
from .decorators import decorator_oss_db
|
|
9
9
|
|
cobweb/base/common_queue.py
CHANGED
|
@@ -1,30 +1,51 @@
|
|
|
1
1
|
from collections import deque
|
|
2
|
+
from typing import Any, Iterable, Union
|
|
2
3
|
|
|
3
4
|
|
|
4
5
|
class Queue:
|
|
5
|
-
|
|
6
6
|
def __init__(self):
|
|
7
|
+
"""初始化队列"""
|
|
7
8
|
self._queue = deque()
|
|
8
9
|
|
|
9
10
|
@property
|
|
10
11
|
def length(self) -> int:
|
|
12
|
+
"""返回队列长度"""
|
|
11
13
|
return len(self._queue)
|
|
12
14
|
|
|
13
|
-
def
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
15
|
+
def empty(self) -> bool:
|
|
16
|
+
"""检查队列是否为空"""
|
|
17
|
+
return not self._queue
|
|
18
|
+
|
|
19
|
+
def push(self, data: Union[Any, Iterable], direct_insertion: bool = False):
|
|
20
|
+
"""
|
|
21
|
+
向队列中添加数据。
|
|
22
|
+
如果数据是可迭代对象(如列表、元组或集合),可以选择直接扩展队列。
|
|
23
|
+
"""
|
|
24
|
+
if not data:
|
|
25
|
+
return
|
|
26
|
+
|
|
27
|
+
if not direct_insertion and isinstance(data, (list, tuple, set)):
|
|
28
|
+
self._queue.extend(data)
|
|
29
|
+
else:
|
|
30
|
+
self._queue.append(data)
|
|
31
|
+
|
|
32
|
+
def pop(self) -> Any:
|
|
33
|
+
"""
|
|
34
|
+
从队列左侧弹出一个元素。
|
|
35
|
+
如果队列为空,返回 None。
|
|
36
|
+
"""
|
|
25
37
|
try:
|
|
26
|
-
return self._queue.popleft()
|
|
38
|
+
return self._queue.popleft()
|
|
27
39
|
except IndexError:
|
|
28
40
|
return None
|
|
29
|
-
|
|
30
|
-
|
|
41
|
+
|
|
42
|
+
def iter_items(self, limit: int = 1) -> Iterable:
|
|
43
|
+
"""
|
|
44
|
+
按指定数量从队列中弹出多个元素并生成它们。
|
|
45
|
+
如果队列为空或达到限制,则停止生成。
|
|
46
|
+
"""
|
|
47
|
+
for _ in range(limit):
|
|
48
|
+
item = self.pop()
|
|
49
|
+
if item is None:
|
|
50
|
+
break
|
|
51
|
+
yield item
|
cobweb/base/item.py
CHANGED
|
@@ -1,19 +1,22 @@
|
|
|
1
1
|
from .seed import Seed
|
|
2
|
+
from typing import Dict, Any
|
|
2
3
|
from collections import namedtuple
|
|
3
4
|
|
|
4
5
|
|
|
5
|
-
class
|
|
6
|
+
class ItemMeta(type):
|
|
6
7
|
|
|
7
|
-
def __new__(cls, name, bases, dct):
|
|
8
|
-
|
|
8
|
+
def __new__(cls, name: str, bases: tuple, dct: dict) -> type:
|
|
9
|
+
new_class = super().__new__(cls, name, bases, dct)
|
|
9
10
|
if name != "BaseItem":
|
|
10
|
-
table = getattr(
|
|
11
|
-
fields = getattr(
|
|
12
|
-
|
|
13
|
-
|
|
11
|
+
table = getattr(new_class, "__TABLE__")
|
|
12
|
+
fields = getattr(new_class, "__FIELDS__")
|
|
13
|
+
if not table or not fields:
|
|
14
|
+
raise ValueError(f"Missing required attributes '__TABLE__' or '__FIELDS__' in class {name}")
|
|
15
|
+
new_class.Data = namedtuple(table, fields)
|
|
16
|
+
return new_class
|
|
14
17
|
|
|
15
18
|
|
|
16
|
-
class BaseItem(metaclass=
|
|
19
|
+
class BaseItem(metaclass=ItemMeta):
|
|
17
20
|
|
|
18
21
|
__TABLE__ = ""
|
|
19
22
|
__FIELDS__ = ""
|
|
@@ -23,17 +26,40 @@ class BaseItem(metaclass=Item):
|
|
|
23
26
|
|
|
24
27
|
data = {}
|
|
25
28
|
for key, value in kwargs.items():
|
|
26
|
-
if key
|
|
27
|
-
self.__setattr__(key, value)
|
|
28
|
-
else:
|
|
29
|
+
if key in self.__FIELDS__:
|
|
29
30
|
data[key] = value
|
|
31
|
+
else:
|
|
32
|
+
setattr(self, key, value)
|
|
30
33
|
|
|
31
|
-
|
|
34
|
+
try:
|
|
35
|
+
self.data = self.Data(**data)
|
|
36
|
+
except TypeError as e:
|
|
37
|
+
raise ValueError(f"Invalid field values for Data: {e}") from e
|
|
32
38
|
|
|
33
39
|
@property
|
|
34
|
-
def to_dict(self):
|
|
40
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
35
41
|
return self.data._asdict()
|
|
36
42
|
|
|
37
43
|
@property
|
|
38
|
-
def
|
|
44
|
+
def fields(self) -> tuple[str]:
|
|
45
|
+
return self.data._fields
|
|
46
|
+
|
|
47
|
+
@property
|
|
48
|
+
def table(self) -> str:
|
|
39
49
|
return self.Data.__name__
|
|
50
|
+
|
|
51
|
+
def __setitem__(self, key: str, value: Any):
|
|
52
|
+
setattr(self, key, value)
|
|
53
|
+
|
|
54
|
+
def __getitem__(self, key: str) -> Any:
|
|
55
|
+
return getattr(self, key, None)
|
|
56
|
+
|
|
57
|
+
def __getattr__(self, name: str) -> Any:
|
|
58
|
+
return None
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class CSVItem(BaseItem):
|
|
62
|
+
|
|
63
|
+
__TABLE__ = "cobweb"
|
|
64
|
+
__FIELDS__ = "data"
|
|
65
|
+
|
cobweb/base/{log.py → logger.py}
RENAMED
|
@@ -51,8 +51,8 @@ class ColorCodes:
|
|
|
51
51
|
HIDDEN = "\033[8m"
|
|
52
52
|
|
|
53
53
|
|
|
54
|
-
class
|
|
55
|
-
logging.getLogger('oss2.api').setLevel(logging.WARNING)
|
|
54
|
+
class Logger:
|
|
55
|
+
# logging.getLogger('oss2.api').setLevel(logging.WARNING)
|
|
56
56
|
logging.basicConfig(
|
|
57
57
|
level=logging.INFO,
|
|
58
58
|
format=f'%(asctime)s %(name)s [%(filename)s:%(lineno)d %(funcName)s]'
|
|
@@ -88,7 +88,7 @@ class Log:
|
|
|
88
88
|
return self.__class__.log.critical
|
|
89
89
|
|
|
90
90
|
|
|
91
|
-
logger =
|
|
91
|
+
logger = Logger()
|
|
92
92
|
|
|
93
93
|
|
|
94
94
|
|