cobweb-launcher 0.1.23__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cobweb-launcher might be problematic. Click here for more details.

cobweb/__init__.py CHANGED
@@ -1,9 +1 @@
1
- from .bbb import Seed, Queue, DBItem
2
- from .task import Task
3
- from .log import log
4
- from .db.redis_db import RedisDB
5
- from .db.oss_db import OssDB
6
- from .constant import Setting
7
-
8
- from .equip.distributed.launcher import launcher
9
- from .equip.single.launcher import launcher as single_launcher
1
+ from launchers import Launcher, LauncherPro
@@ -0,0 +1,9 @@
1
+ from .common_queue import Queue
2
+ from .response import Response
3
+ from .request import Request
4
+ from .item import BaseItem
5
+ from .seed import Seed
6
+
7
+ from .log import logger
8
+ from .decorators import decorator_oss_db
9
+
@@ -0,0 +1,30 @@
1
+ from collections import deque
2
+
3
+
4
class Queue:
    """Small in-memory queue built on ``collections.deque``.

    Falsy payloads are ignored on push, and popping from an empty queue
    returns ``None`` instead of raising.
    """

    def __init__(self):
        self._queue = deque()

    @property
    def length(self) -> int:
        """Number of items currently stored."""
        return len(self._queue)

    def push(self, data, left: bool = False, direct_insertion: bool = False):
        """Add ``data`` to the queue.

        :param data: a single item, or a list/tuple of items. Falsy values
            (``None``, ``0``, ``""``, empty containers) are silently dropped.
        :param left: insert at the left end instead of the right end.
        :param direct_insertion: store a list/tuple as ONE item instead of
            unpacking its elements.
        :return: always ``None``.
        """
        try:
            if not data:
                return None
            if isinstance(data, (list, tuple)) and not direct_insertion:
                if left:
                    # deque.extendleft inserts one element at a time, so the
                    # batch ends up in reversed order at the left end.
                    self._queue.extendleft(data)
                else:
                    self._queue.extend(data)
            elif left:
                self._queue.appendleft(data)
            else:
                self._queue.append(data)
        except AttributeError:
            # Best-effort push: attribute errors are deliberately ignored.
            pass
        return None

    def pop(self, left: bool = True):
        """Remove and return one item, or ``None`` when nothing can be popped.

        :param left: pop from the left end (default) or from the right end.
        """
        try:
            if left:
                return self._queue.popleft()
            return self._queue.pop()
        except (IndexError, AttributeError):
            return None
@@ -0,0 +1,40 @@
1
+ from functools import wraps
2
+
3
+
4
+ # def check_redis_status(func):
5
+ # @wraps(func)
6
+ # def wrapper(*args, **kwargs):
7
+ # try:
8
+ # result = func(*args, **kwargs)
9
+ # except Exception:
10
+ # result = False
11
+ # return result
12
+ #
13
+ # return wrapper
14
+
15
+
16
def decorator_oss_db(exception, retries=3):
    """Build a decorator that retries the wrapped callable on any exception.

    :param exception: exception class raised once every attempt has failed;
        it is instantiated with the last caught exception as its argument.
    :param retries: maximum number of attempts. With ``retries <= 0`` the
        wrapped callable is never invoked and the wrapper returns ``None``.
    :return: a decorator. The produced wrapper expects ``callback_func`` as
        its first positional argument and forwards everything to ``func``.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(callback_func, *args, **kwargs):
            last_error = None
            for _ in range(retries):
                try:
                    return func(callback_func, *args, **kwargs)
                except Exception as e:
                    # Remember the failure and try again until attempts run out.
                    last_error = e

            if last_error is not None:
                # Raise only after the configured number of attempts, rather
                # than after a fixed third attempt.
                raise exception(last_error) from last_error
            return None

        return wrapper

    return decorator
38
+
39
+
40
+
cobweb/base/item.py ADDED
@@ -0,0 +1,39 @@
1
+ from .seed import Seed
2
+ from collections import namedtuple
3
+
4
+
5
class Item(type):
    """Metaclass that attaches a ``Data`` namedtuple to each item class.

    The namedtuple is named after ``__TABLE__`` and has the fields listed in
    ``__FIELDS__``.
    """

    def __new__(cls, name, bases, dct):
        new_class_instance = type.__new__(cls, name, bases, dct)
        # The class named "BaseItem" only carries empty placeholders, so no
        # namedtuple is generated for it.
        if name != "BaseItem":
            table = getattr(new_class_instance, "__TABLE__")
            fields = getattr(new_class_instance, "__FIELDS__")
            new_class_instance.Data = namedtuple(table, fields)
        return new_class_instance


class BaseItem(metaclass=Item):
    """Base class for uploadable records.

    Subclasses declare ``__TABLE__`` (namedtuple / table name) and
    ``__FIELDS__`` (field names in any form ``namedtuple`` accepts).
    """

    __TABLE__ = ""
    __FIELDS__ = ""

    def __init__(self, seed: "Seed", **kwargs):
        """Split ``kwargs`` into record fields and plain instance attributes.

        :param seed: the seed this item originates from.
        :param kwargs: keys that are declared fields populate ``self.data``;
            every other key becomes an attribute on the instance.
        :raises TypeError: from the namedtuple constructor when a declared
            field is missing from ``kwargs``.
        """
        self.seed = seed

        # Compare against the parsed field names. ``__FIELDS__`` may be a
        # single string, where ``in`` would be a substring test (e.g. "id"
        # would match "video_id").
        field_names = self.Data._fields

        data = {}
        for key, value in kwargs.items():
            if key in field_names:
                data[key] = value
            else:
                setattr(self, key, value)

        self.data = self.Data(**data)

    @property
    def to_dict(self):
        """The record fields as a mapping of field name to value."""
        return self.data._asdict()

    @property
    def table(self):
        """The table name, i.e. the name of the generated namedtuple."""
        return self.Data.__name__
cobweb/base/log.py ADDED
@@ -0,0 +1,94 @@
1
+ import logging
2
+
3
+
4
class ColorCodes:
    # Named constants for terminal escape sequences (strings starting with
    # the ESC character ``\033`` followed by ``[<code>m``).

    # Text Reset
    RESET = "\033[0m"

    # Regular Colors
    RED = "\033[31m"
    GREEN = "\033[32m"
    YELLOW = "\033[33m"
    BLUE = "\033[34m"
    PURPLE = "\033[35m"
    CYAN = "\033[36m"
    WHITE = "\033[37m"

    # Bright Colors
    BRIGHT_RED = "\033[91m"
    BRIGHT_GREEN = "\033[92m"
    BRIGHT_YELLOW = "\033[93m"
    BRIGHT_BLUE = "\033[94m"
    BRIGHT_PURPLE = "\033[95m"
    BRIGHT_CYAN = "\033[96m"
    BRIGHT_WHITE = "\033[97m"

    # Background Colors
    BG_RED = "\033[41m"
    BG_GREEN = "\033[42m"
    BG_YELLOW = "\033[43m"
    BG_BLUE = "\033[44m"
    BG_PURPLE = "\033[45m"
    BG_CYAN = "\033[46m"
    BG_WHITE = "\033[47m"

    # Bright Background Colors
    BG_BRIGHT_RED = "\033[101m"
    BG_BRIGHT_GREEN = "\033[102m"
    BG_BRIGHT_YELLOW = "\033[103m"
    BG_BRIGHT_BLUE = "\033[104m"
    BG_BRIGHT_PURPLE = "\033[105m"
    BG_BRIGHT_CYAN = "\033[106m"
    BG_BRIGHT_WHITE = "\033[107m"

    # Text Styles
    BOLD = "\033[1m"
    DIM = "\033[2m"
    ITALIC = "\033[3m"
    UNDERLINE = "\033[4m"
    BLINK = "\033[5m"
    REVERSE = "\033[7m"
    HIDDEN = "\033[8m"
52
+
53
+
54
class Log:
    """Facade exposing the level methods of one shared ``logging.Logger``.

    The logger is stored on the class, so all instances share it. Defining
    the class also configures the root logger via ``logging.basicConfig`` and
    raises the ``oss2.api`` logger to WARNING.
    """

    logging.getLogger('oss2.api').setLevel(logging.WARNING)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(name)s [%(filename)s:%(lineno)d %(funcName)s]'
               ' %(levelname)s -> %(message)s'
    )
    log = logging.getLogger()

    def set_log_name(self, name):
        """Replace the shared logger with the one registered under ``name``."""
        type(self).log = logging.getLogger(name)

    @property
    def debug(self):
        """Bound ``debug`` method of the shared logger."""
        shared = type(self).log
        return shared.debug

    @property
    def info(self):
        """Bound ``info`` method of the shared logger."""
        shared = type(self).log
        return shared.info

    @property
    def warning(self):
        """Bound ``warning`` method of the shared logger."""
        shared = type(self).log
        return shared.warning

    @property
    def exception(self):
        """Bound ``exception`` method of the shared logger."""
        shared = type(self).log
        return shared.exception

    @property
    def error(self):
        """Bound ``error`` method of the shared logger."""
        shared = type(self).log
        return shared.error

    @property
    def critical(self):
        """Bound ``critical`` method of the shared logger."""
        shared = type(self).log
        return shared.critical
89
+
90
+
91
+ logger = Log()
92
+
93
+
94
+
cobweb/base/request.py ADDED
@@ -0,0 +1,72 @@
1
+ import random
2
+ import requests
3
+
4
+
5
class Request:
    """Description of one HTTP request, executed through ``requests``.

    Keyword arguments named in ``__REQUEST_ATTRS__`` are collected into
    ``request_setting`` and forwarded to ``requests.request``; every other
    keyword argument (for example ``method``) becomes an instance attribute.
    """

    __REQUEST_ATTRS__ = {
        "params",
        "headers",
        "cookies",
        "data",
        "json",
        "files",
        "auth",
        "timeout",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "allow_redirects",
    }

    def __init__(
            self,
            url,
            seed,
            random_ua=True,
            check_status_code=True,
            **kwargs
    ):
        """
        :param url: target URL.
        :param seed: the seed this request was built from (stored as-is).
        :param random_ua: when true, make sure a ``user-agent`` header exists,
            generating a randomised one if the caller supplied none.
        :param check_status_code: when true, ``download`` raises on HTTP
            error statuses via ``raise_for_status``.
        :param kwargs: request options and/or extra attributes (see class doc).
        """
        self.url = url
        self.seed = seed
        self.check_status_code = check_status_code
        self.request_setting = {}

        for k, v in kwargs.items():
            if k in self.__class__.__REQUEST_ATTRS__:
                self.request_setting[k] = v
            else:
                setattr(self, k, v)

        # Without an explicit method, a request carrying a body is a POST.
        if not getattr(self, "method", None):
            has_body = self.request_setting.get("data") or self.request_setting.get("json")
            self.method = "POST" if has_body else "GET"

        if random_ua:
            self._build_header()

    @property
    def _random_ua(self) -> str:
        """A user-agent string with randomised version numbers."""
        v1 = random.randint(4, 15)
        v2 = random.randint(3, 11)
        v3 = random.randint(1, 16)
        v4 = random.randint(533, 605)
        v5 = random.randint(1000, 6000)
        v6 = random.randint(10, 80)
        user_agent = (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) AppleWebKit/{v4}.{v3} "
                      f"(KHTML, like Gecko) Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
        return user_agent

    def _build_header(self) -> None:
        """Ensure ``request_setting['headers']`` contains a user-agent.

        Caller-supplied headers are copied before the user-agent is added, so
        the caller's own mapping is never modified.
        """
        headers = self.request_setting.get("headers")
        if not headers:
            self.request_setting["headers"] = {"accept": "*/*", "user-agent": self._random_ua}
        elif not any(key.lower() == "user-agent" for key in headers):
            self.request_setting["headers"] = {**headers, "user-agent": self._random_ua}

    def download(self) -> "requests.Response":
        """Send the request and return the ``requests`` response.

        :raises requests.HTTPError: for 4xx/5xx responses when
            ``check_status_code`` is true.
        """
        response = requests.request(self.method, self.url, **self.request_setting)
        if self.check_status_code:
            response.raise_for_status()
        return response
72
+
@@ -0,0 +1,22 @@
1
+
2
+
3
class Response:
    """Pairs a downloaded response with its seed, plus arbitrary extras."""

    def __init__(
            self,
            seed,
            response,
            **kwargs
    ):
        """
        :param seed: the seed the response belongs to.
        :param response: the downloaded response object.
        :param kwargs: extra values, each stored as an instance attribute.
        """
        self.seed = seed
        self.response = response

        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)

    @property
    def to_dict(self):
        """The extra attributes only: everything except ``seed`` and ``response``."""
        extras = dict(vars(self))
        for reserved in ('seed', 'response'):
            extras.pop(reserved)
        return extras
cobweb/base/seed.py ADDED
@@ -0,0 +1,114 @@
1
+ import json
2
+ import time
3
+ import hashlib
4
+
5
+
6
class SeedParams:
    """Scheduling metadata kept separately from a seed's payload."""

    def __init__(self, retry, priority, seed_version, identifier=None):
        # Falsy inputs fall back to defaults: 0 retries, priority 300 and the
        # current Unix timestamp as version.
        self.retry = retry or 0
        self.priority = priority or 300
        self.seed_version = seed_version or int(time.time())
        self.identifier = identifier


class Seed:
    """A crawl task: free-form payload attributes plus ``SeedParams``.

    Payload values are stored as plain instance attributes. Reading an
    attribute that was never set returns ``None`` instead of raising.
    """

    __SEED_PARAMS__ = [
        "retry",
        "priority",
        "seed_version",
        "identifier"
    ]

    def __init__(
            self,
            seed,
            sid=None,
            retry=None,
            priority=None,
            seed_version=None,
            identifier=None,
            **kwargs
    ):
        """
        :param seed: a dict of payload attributes, a JSON object encoded as
            str/bytes, or any other str/bytes value which is stored as ``url``.
        :param sid: explicit seed id; when omitted and the payload has none,
            the MD5 hex digest of the compact JSON payload is used.
        :param retry: initial retry count (see ``SeedParams``).
        :param priority: scheduling priority (see ``SeedParams``).
        :param seed_version: version stamp (see ``SeedParams``).
        :param identifier: optional marker stored in ``params``.
        :param kwargs: additional payload attributes.
        :raises TypeError: when ``seed`` is not a str, bytes or dict.
        """
        if isinstance(seed, (str, bytes)):
            try:
                item = json.loads(seed)
            except ValueError:
                # Covers JSONDecodeError and undecodable bytes alike.
                item = None
            if isinstance(item, dict):
                self._init_seed(item)
            else:
                # Not a JSON object (plain URL, bare number, list, ...):
                # keep the raw text as the URL.
                if isinstance(seed, bytes):
                    seed = seed.decode("utf-8", errors="replace")
                self.url = seed
        elif isinstance(seed, dict):
            self._init_seed(seed)
        else:
            raise TypeError(
                f"seed type error, "
                f"must be str or dict! "
                f"seed: {seed}"
            )

        seed_params = {
            "retry": retry,
            "priority": priority,
            "seed_version": seed_version,
            "identifier": identifier
        }

        if kwargs:
            self._init_seed(kwargs)
            seed_params.update({
                k: v for k, v in kwargs.items()
                if k in self.__SEED_PARAMS__
            })
        if sid or not getattr(self, "sid", None):
            self._init_id(sid)
        self.params = SeedParams(**seed_params)

    def __getattr__(self, name):
        # Only invoked for attributes that are not set: unknown names read as None.
        return None

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)

    def __str__(self):
        return json.dumps(self._dump_all(), ensure_ascii=False)

    def __repr__(self):
        chars = [f"{k}={v}" for k, v in self.__dict__.items()]
        return f'{self.__class__.__name__}({", ".join(chars)})'

    def _init_seed(self, seed_info: dict):
        """Copy every key that is not a scheduling parameter onto the instance."""
        for k, v in seed_info.items():
            if k not in self.__SEED_PARAMS__:
                setattr(self, k, v)

    def _init_id(self, sid):
        """Set ``sid``, deriving it from the current payload when not given."""
        if not sid:
            sid = hashlib.md5(self.to_string.encode()).hexdigest()
        self.sid = sid

    def _dump_all(self) -> dict:
        """All attributes, with ``params`` expanded into a plain dict so the
        result can be JSON-encoded."""
        data = dict(self.__dict__)
        params = data.get("params")
        if isinstance(params, SeedParams):
            data["params"] = dict(vars(params))
        return data

    @property
    def to_dict(self) -> dict:
        """Payload attributes as a new dict, without a truthy ``params`` entry."""
        seed = self.__dict__.copy()
        if seed.get("params"):
            del seed["params"]
        return seed

    @property
    def to_string(self) -> str:
        """Compact JSON encoding of ``to_dict``."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )

    @property
    def get_all(self):
        """Compact JSON encoding of the payload together with ``params``."""
        return json.dumps(
            self._dump_all(),
            ensure_ascii=False,
            separators=(",", ":")
        )
114
+
cobweb/constant.py CHANGED
@@ -5,6 +5,11 @@ class LauncherModel:
5
5
  resident = "launcher model: resident"
6
6
 
7
7
 
8
class DownloadModel:
    # String tags naming the two download modes.
    # NOTE(review): presumably compared against setting.DOWNLOAD_MODEL —
    # confirm against the callers.
    common = "download model: common"
    file = "download model: file"
11
+
12
+
8
13
  class LogModel:
9
14
  simple = "log model: simple"
10
15
  common = "log model: common"
@@ -12,18 +17,50 @@ class LogModel:
12
17
 
13
18
 
14
19
  class DealModel:
15
- failure = "deal model: failure"
16
- success = "deal model: success"
17
- polling = "deal model: polling"
18
-
19
-
20
- class Setting:
21
- RESET_SCORE = None
22
- CHECK_LOCK_TIME = None
23
- SCHEDULER_LOCK_TIME = None
24
- DEAL_MODEL = None
25
- LAUNCHER_MODEL = None
26
- SCHEDULER_WAIT_TIME = None
27
- SCHEDULER_BLOCK_TIME = None
28
- SPIDER_WAIT_TIME = None
29
- SPIDER_SLEEP_TIME = None
20
+ fail = "deal model: fail"
21
+ done = "deal model: done"
22
+ poll = "deal model: poll"
23
+
24
+
25
+ class LogTemplate:
26
+
27
+ launcher_pro_polling = """
28
+ ----------------------- start - 轮训日志: {task} -----------------
29
+ 内存队列
30
+ 种子数: {doing_len}
31
+ 待消费: {todo_len}
32
+ 已消费: {done_len}
33
+ redis队列
34
+ 种子数: {redis_seed_count}
35
+ 待消费: {redis_todo_len}
36
+ 消费中: {redis_doing_len}
37
+ 存储队列
38
+ 待上传: {upload_len}
39
+ ----------------------- end - 轮训日志: {task} ------------------
40
+ """
41
+
42
+ download_exception = """
43
+ ----------------------- download exception -----------------------
44
+ 种子详情 \n{detail}
45
+ 种子参数
46
+ retry : {retry}
47
+ priority : {priority}
48
+ seed_version : {seed_version}
49
+ identifier : {identifier}
50
+ exception
51
+ msg : {exception}
52
+ ------------------------------------------------------------------
53
+ """
54
+
55
+ download_info = """
56
+ ------------------------ download info ---------------------------
57
+ 种子详情 \n{detail}
58
+ 种子参数
59
+ retry : {retry}
60
+ priority : {priority}
61
+ seed_version : {seed_version}
62
+ identifier : {identifier}
63
+ response
64
+ status : {status} \n{response}
65
+ ------------------------------------------------------------------
66
+ """
@@ -0,0 +1,2 @@
1
+ from .base_crawler import Crawler
2
+ from .file_crawler import CrawlerAir
@@ -0,0 +1,121 @@
1
+ import setting
2
+ import threading
3
+
4
+ from inspect import isgenerator
5
+ from typing import Union, Callable, Mapping
6
+
7
+ from cobweb.base import Queue, Seed, BaseItem, Request, Response, logger
8
+ from cobweb.constant import DealModel, LogTemplate
9
+ from cobweb.utils import download_log_info
10
+
11
+
12
class Crawler(threading.Thread):
    """Thread that starts the spider workers.

    Each worker pulls seeds from ``launcher_queue['todo']`` and runs them
    through three stages: ``request`` -> ``download`` -> ``parse``. Callables
    supplied in ``custom_func`` are set as instance attributes under their
    mapping key, so keys named ``request``, ``download`` or ``parse`` replace
    the default stage.
    """

    def __init__(
            self,
            upload_queue: Queue,
            custom_func: Union[Mapping[str, Callable]],
            launcher_queue: Union[Mapping[str, Queue]],
    ):
        """
        :param upload_queue: queue receiving parsed ``BaseItem`` objects.
        :param custom_func: mapping of attribute name to callable; entries
            that are not callable are ignored.
        :param launcher_queue: mapping of queues; the keys ``'todo'``,
            ``'done'`` and ``'new'`` are used by this class.
        """
        super().__init__()

        self.upload_queue = upload_queue
        for func_name, _callable in custom_func.items():
            if callable(_callable):
                setattr(self, func_name, _callable)

        self.launcher_queue = launcher_queue

        self.spider_thread_num = setting.SPIDER_THREAD_NUM
        self.max_retries = setting.SPIDER_MAX_RETRIES

    @staticmethod
    def request(seed: Seed) -> Union[Request, BaseItem]:
        """Default request stage: fetch ``seed.url`` with a 5 second timeout."""
        stream = True if setting.DOWNLOAD_MODEL else False
        return Request(seed.url, seed, stream=stream, timeout=5)

    @staticmethod
    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
        """Default download stage: a generator yielding one ``Response``."""
        response = item.download()
        yield Response(item.seed, response)

    @staticmethod
    def parse(item: Response) -> BaseItem:
        """Default parse stage. It returns ``None``, which ``spider`` rejects
        as a non-generator, so a generator must be supplied via
        ``custom_func`` whenever responses are produced."""
        pass

    def get(self) -> Seed:
        """Pop the next seed from the todo queue (``None`` when it is empty)."""
        return self.launcher_queue['todo'].pop()

    def spider(self):
        """Worker loop: process seeds from the todo queue forever.

        Values yielded by the download generator are handled as follows:

        * ``Response``: logged, then passed to ``parse``; every parsed
          ``BaseItem`` is pushed to the upload queue.
        * ``BaseItem``: pushed to the upload queue.
        * ``Seed``: pushed to ``launcher_queue['new']``.
        * ``DealModel.poll``: the current seed is re-queued on todo.
        * ``DealModel.done``: the seed is moved to done.
        * ``DealModel.fail``: the seed is marked failed and moved to done.

        Any exception raised while consuming the generator is logged, the
        seed's retry counter is incremented and the seed is re-queued. Seeds
        that have reached ``max_retries`` are moved to done without being
        processed.
        """
        while True:
            seed = self.get()

            # NOTE(review): an empty todo queue makes this loop spin without
            # sleeping; confirm whether a wait belongs here.
            if not seed:
                continue

            elif seed.params.retry >= self.max_retries:
                self.launcher_queue['done'].push(seed)
                continue

            item = self.request(seed)

            if isinstance(item, Request):

                download_iterators = self.download(item)

                if not isgenerator(download_iterators):
                    raise TypeError("download function isn't a generator")

                seed_detail_log_info = download_log_info(seed.to_dict)

                try:
                    for it in download_iterators:
                        if isinstance(it, Response):
                            # ``to_dict`` is a property: read it, don't call it.
                            response_detail_log_info = download_log_info(it.to_dict)
                            logger.info(LogTemplate.download_info.format(
                                detail=seed_detail_log_info, retry=item.seed.params.retry,
                                priority=item.seed.params.priority,
                                seed_version=item.seed.params.seed_version,
                                identifier=item.seed.params.identifier,
                                status=it.response, response=response_detail_log_info
                            ))
                            parse_iterators = self.parse(it)
                            if not isgenerator(parse_iterators):
                                raise TypeError("parse function isn't a generator")
                            for upload_item in parse_iterators:
                                if not isinstance(upload_item, BaseItem):
                                    raise TypeError("upload_item isn't BaseItem subclass")
                                self.upload_queue.push(upload_item)
                        elif isinstance(it, BaseItem):
                            self.upload_queue.push(it)
                        elif isinstance(it, Seed):
                            self.launcher_queue['new'].push(it)
                        elif isinstance(it, str) and it == DealModel.poll:
                            # Re-queue the seed, not the Request: the loop
                            # reads ``seed.params`` from whatever it pops.
                            self.launcher_queue['todo'].push(seed)
                            break
                        elif isinstance(it, str) and it == DealModel.done:
                            self.launcher_queue['done'].push(seed)
                            break
                        elif isinstance(it, str) and it == DealModel.fail:
                            seed.params.identifier = DealModel.fail
                            self.launcher_queue['done'].push(seed)
                            break
                        else:
                            raise TypeError("yield value type error!")

                except Exception as e:
                    logger.info(LogTemplate.download_exception.format(
                        detail=seed_detail_log_info, retry=seed.params.retry,
                        priority=seed.params.priority, seed_version=seed.params.seed_version,
                        identifier=seed.params.identifier, exception=e
                    ))
                    seed.params.retry += 1
                    self.launcher_queue['todo'].push(seed)

            elif isinstance(item, BaseItem):
                self.upload_queue.push(item)

    def run(self):
        """Start ``spider_thread_num`` worker threads, each running ``spider``."""
        for index in range(self.spider_thread_num):
            threading.Thread(name=f"spider_{index}", target=self.spider).start()
121
+