cobweb-launcher 1.0.5__py3-none-any.whl → 3.2.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. cobweb/__init__.py +5 -1
  2. cobweb/base/__init__.py +3 -3
  3. cobweb/base/common_queue.py +37 -16
  4. cobweb/base/item.py +40 -14
  5. cobweb/base/{log.py → logger.py} +3 -3
  6. cobweb/base/request.py +744 -47
  7. cobweb/base/response.py +381 -13
  8. cobweb/base/seed.py +98 -50
  9. cobweb/base/task_queue.py +180 -0
  10. cobweb/base/test.py +257 -0
  11. cobweb/constant.py +39 -2
  12. cobweb/crawlers/__init__.py +1 -2
  13. cobweb/crawlers/crawler.py +27 -0
  14. cobweb/db/__init__.py +1 -0
  15. cobweb/db/api_db.py +83 -0
  16. cobweb/db/redis_db.py +118 -27
  17. cobweb/launchers/__init__.py +3 -1
  18. cobweb/launchers/distributor.py +141 -0
  19. cobweb/launchers/launcher.py +103 -130
  20. cobweb/launchers/uploader.py +68 -0
  21. cobweb/log_dots/__init__.py +2 -0
  22. cobweb/log_dots/dot.py +258 -0
  23. cobweb/log_dots/loghub_dot.py +53 -0
  24. cobweb/pipelines/__init__.py +3 -2
  25. cobweb/pipelines/pipeline.py +19 -0
  26. cobweb/pipelines/pipeline_csv.py +25 -0
  27. cobweb/pipelines/pipeline_loghub.py +54 -0
  28. cobweb/schedulers/__init__.py +1 -0
  29. cobweb/schedulers/scheduler.py +66 -0
  30. cobweb/schedulers/scheduler_with_redis.py +189 -0
  31. cobweb/setting.py +37 -38
  32. cobweb/utils/__init__.py +5 -2
  33. cobweb/utils/bloom.py +58 -0
  34. cobweb/{base → utils}/decorators.py +14 -12
  35. cobweb/utils/dotting.py +300 -0
  36. cobweb/utils/oss.py +113 -86
  37. cobweb/utils/tools.py +3 -15
  38. cobweb_launcher-3.2.18.dist-info/METADATA +193 -0
  39. cobweb_launcher-3.2.18.dist-info/RECORD +44 -0
  40. {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/WHEEL +1 -1
  41. cobweb/crawlers/base_crawler.py +0 -121
  42. cobweb/crawlers/file_crawler.py +0 -181
  43. cobweb/launchers/launcher_pro.py +0 -174
  44. cobweb/pipelines/base_pipeline.py +0 -54
  45. cobweb/pipelines/loghub_pipeline.py +0 -34
  46. cobweb_launcher-1.0.5.dist-info/METADATA +0 -48
  47. cobweb_launcher-1.0.5.dist-info/RECORD +0 -32
  48. {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/LICENSE +0 -0
  49. {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/top_level.txt +0 -0
cobweb/__init__.py CHANGED
@@ -1 +1,5 @@
1
- from .launchers import Launcher, LauncherPro
1
+ from .launchers import Launcher
2
+ from .constant import CrawlerModel
3
+ from .pipelines import Pipeline
4
+ from .crawlers import Crawler
5
+ from .log_dots import Dot
cobweb/base/__init__.py CHANGED
@@ -1,9 +1,9 @@
1
+ from .item import BaseItem, CSVItem
1
2
  from .common_queue import Queue
2
3
  from .response import Response
3
4
  from .request import Request
4
- from .item import BaseItem
5
+ from .logger import logger
5
6
  from .seed import Seed
7
+ from .task_queue import TaskQueue, Status
6
8
 
7
- from .log import logger
8
- from .decorators import decorator_oss_db
9
9
 
cobweb/base/common_queue.py CHANGED
@@ -1,30 +1,51 @@
1
1
  from collections import deque
2
+ from typing import Any, Iterable, Union
2
3
 
3
4
 
4
5
  class Queue:
5
-
6
6
  def __init__(self):
7
+ """初始化队列"""
7
8
  self._queue = deque()
8
9
 
9
10
  @property
10
11
  def length(self) -> int:
12
+ """返回队列长度"""
11
13
  return len(self._queue)
12
14
 
13
- def push(self, data, left: bool = False, direct_insertion: bool = False):
14
- try:
15
- if not data:
16
- return None
17
- if not direct_insertion and any(isinstance(data, t) for t in (list, tuple)):
18
- self._queue.extendleft(data) if left else self._queue.extend(data)
19
- else:
20
- self._queue.appendleft(data) if left else self._queue.append(data)
21
- except AttributeError:
22
- pass
23
-
24
- def pop(self, left: bool = True):
15
+ def empty(self) -> bool:
16
+ """检查队列是否为空"""
17
+ return not self._queue
18
+
19
+ def push(self, data: Union[Any, Iterable], direct_insertion: bool = False):
20
+ """
21
+ 向队列中添加数据。
22
+ 如果数据是可迭代对象(如列表、元组或集合),可以选择直接扩展队列。
23
+ """
24
+ if not data:
25
+ return
26
+
27
+ if not direct_insertion and isinstance(data, (list, tuple, set)):
28
+ self._queue.extend(data)
29
+ else:
30
+ self._queue.append(data)
31
+
32
+ def pop(self) -> Any:
33
+ """
34
+ 从队列左侧弹出一个元素。
35
+ 如果队列为空,返回 None。
36
+ """
25
37
  try:
26
- return self._queue.popleft() if left else self._queue.pop()
38
+ return self._queue.popleft()
27
39
  except IndexError:
28
40
  return None
29
- except AttributeError:
30
- return None
41
+
42
+ def iter_items(self, limit: int = 1) -> Iterable:
43
+ """
44
+ 按指定数量从队列中弹出多个元素并生成它们。
45
+ 如果队列为空或达到限制,则停止生成。
46
+ """
47
+ for _ in range(limit):
48
+ item = self.pop()
49
+ if item is None:
50
+ break
51
+ yield item
cobweb/base/item.py CHANGED
@@ -1,19 +1,22 @@
1
1
  from .seed import Seed
2
+ from typing import Dict, Any
2
3
  from collections import namedtuple
3
4
 
4
5
 
5
- class Item(type):
6
+ class ItemMeta(type):
6
7
 
7
- def __new__(cls, name, bases, dct):
8
- new_class_instance = type.__new__(cls, name, bases, dct)
8
+ def __new__(cls, name: str, bases: tuple, dct: dict) -> type:
9
+ new_class = super().__new__(cls, name, bases, dct)
9
10
  if name != "BaseItem":
10
- table = getattr(new_class_instance, "__TABLE__")
11
- fields = getattr(new_class_instance, "__FIELDS__")
12
- new_class_instance.Data = namedtuple(table, fields)
13
- return new_class_instance
11
+ table = getattr(new_class, "__TABLE__")
12
+ fields = getattr(new_class, "__FIELDS__")
13
+ if not table or not fields:
14
+ raise ValueError(f"Missing required attributes '__TABLE__' or '__FIELDS__' in class {name}")
15
+ new_class.Data = namedtuple(table, fields)
16
+ return new_class
14
17
 
15
18
 
16
- class BaseItem(metaclass=Item):
19
+ class BaseItem(metaclass=ItemMeta):
17
20
 
18
21
  __TABLE__ = ""
19
22
  __FIELDS__ = ""
@@ -23,17 +26,40 @@ class BaseItem(metaclass=Item):
23
26
 
24
27
  data = {}
25
28
  for key, value in kwargs.items():
26
- if key not in self.__FIELDS__:
27
- self.__setattr__(key, value)
28
- else:
29
+ if key in self.__FIELDS__:
29
30
  data[key] = value
31
+ else:
32
+ setattr(self, key, value)
30
33
 
31
- self.data = self.Data(**data)
34
+ try:
35
+ self.data = self.Data(**data)
36
+ except TypeError as e:
37
+ raise ValueError(f"Invalid field values for Data: {e}") from e
32
38
 
33
39
  @property
34
- def to_dict(self):
40
+ def to_dict(self) -> Dict[str, Any]:
35
41
  return self.data._asdict()
36
42
 
37
43
  @property
38
- def table(self):
44
+ def fields(self) -> tuple[str]:
45
+ return self.data._fields
46
+
47
+ @property
48
+ def table(self) -> str:
39
49
  return self.Data.__name__
50
+
51
+ def __setitem__(self, key: str, value: Any):
52
+ setattr(self, key, value)
53
+
54
+ def __getitem__(self, key: str) -> Any:
55
+ return getattr(self, key, None)
56
+
57
+ def __getattr__(self, name: str) -> Any:
58
+ return None
59
+
60
+
61
+ class CSVItem(BaseItem):
62
+
63
+ __TABLE__ = "cobweb"
64
+ __FIELDS__ = "data"
65
+
cobweb/base/{log.py → logger.py} RENAMED
@@ -51,8 +51,8 @@ class ColorCodes:
51
51
  HIDDEN = "\033[8m"
52
52
 
53
53
 
54
- class Log:
55
- logging.getLogger('oss2.api').setLevel(logging.WARNING)
54
+ class Logger:
55
+ # logging.getLogger('oss2.api').setLevel(logging.WARNING)
56
56
  logging.basicConfig(
57
57
  level=logging.INFO,
58
58
  format=f'%(asctime)s %(name)s [%(filename)s:%(lineno)d %(funcName)s]'
@@ -88,7 +88,7 @@ class Log:
88
88
  return self.__class__.log.critical
89
89
 
90
90
 
91
- logger = Log()
91
+ logger = Logger()
92
92
 
93
93
 
94
94