cobweb-launcher 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cobweb/base/config.py DELETED
@@ -1,164 +0,0 @@
-import json
-from collections import namedtuple
-from utils import struct_table_name
-
-StorerInfo = namedtuple(
-    "StorerInfo",
-    "DB, table, fields, length, config"
-)
-SchedulerInfo = namedtuple(
-    "SchedulerInfo",
-    "DB, table, sql, length, size, config",
-)
-RedisInfo = namedtuple(
-    "RedisInfo",
-    "host, port, username, password, db",
-    defaults=("localhost", 6379, None, None, 0)
-)
-
-# redis_info = dict(
-#     host="localhost",
-#     port=6379,
-#     username=None,
-#     password=None,
-#     db=0
-# )
-
-
-class SchedulerDB:
-
-    @staticmethod
-    def default():
-        from db.scheduler.default import Default
-        return SchedulerInfo(DB=Default, table="", sql="", length=100, size=500000, config=None)
-
-    @staticmethod
-    def textfile(table, sql=None, length=100, size=500000):
-        from db.scheduler.textfile import Textfile
-        return SchedulerInfo(DB=Textfile, table=table, sql=sql, length=length, size=size, config=None)
-
-    @staticmethod
-    def diy(DB, table, sql=None, length=100, size=500000, config=None):
-        from base.interface import SchedulerInterface
-        if not isinstance(DB, SchedulerInterface):
-            raise Exception("DB must be inherit from SchedulerInterface")
-        return SchedulerInfo(DB=DB, table=table, sql=sql, length=length, size=size, config=config)
-
-    # @staticmethod
-    # def info(scheduler_info):
-    #     if not scheduler_info:
-    #         return SchedulerDB.default()
-    #
-    #     if isinstance(scheduler_info, SchedulerInfo):
-    #         return scheduler_info
-    #
-    #     if isinstance(scheduler_info, str):
-    #         scheduler = json.loads(scheduler_info)
-    #         if isinstance(scheduler, dict):
-    #             db_name = scheduler["DB"]
-    #             if db_name in dir(SchedulerDB):
-    #                 del scheduler["DB"]
-    #             else:
-    #                 db_name = "diy"
-    #             func = getattr(SchedulerDB, db_name)
-    #             return func(**scheduler)
-
-
-class StorerDB:
-
-    @staticmethod
-    def console(table, fields, length=200):
-        from db.storer.console import Console
-        table = struct_table_name(table)
-        return StorerInfo(DB=Console, table=table, fields=fields, length=length, config=None)
-
-    @staticmethod
-    def textfile(table, fields, length=200):
-        from db.storer.textfile import Textfile
-        table = struct_table_name(table)
-        return StorerInfo(DB=Textfile, table=table, fields=fields, length=length, config=None)
-
-    @staticmethod
-    def loghub(table, fields, length=200, config=None):
-        from db.storer.loghub import Loghub
-        table = struct_table_name(table)
-        return StorerInfo(DB=Loghub, table=table, fields=fields, length=length, config=config)
-
-    @staticmethod
-    def diy(DB, table, fields, length=200, config=None):
-        from base.interface import StorerInterface
-        if not isinstance(DB, StorerInterface):
-            raise Exception("DB must be inherit from StorerInterface")
-        table = struct_table_name(table)
-        return StorerInfo(DB=DB, table=table, fields=fields, length=length, config=config)
-
-    # @staticmethod
-    # def info(storer_info):
-    #     if not storer_info:
-    #         return None
-    #
-    #     if isinstance(storer_info, str):
-    #         storer_info = json.loads(storer_info)
-    #
-    #     if any(isinstance(storer_info, t) for t in (dict, StorerInfo)):
-    #         storer_info = [storer_info]
-    #
-    #     if not isinstance(storer_info, list):
-    #         raise Exception("StorerDB.info storer_info")
-    #
-    #     storer_info_list = []
-    #     for storer in storer_info:
-    #         if isinstance(storer, StorerInfo):
-    #             storer_info_list.append(storer)
-    #         else:
-    #             db_name = storer["DB"]
-    #             if db_name in dir(StorerDB):
-    #                 del storer["DB"]
-    #             else:
-    #                 db_name = "diy"
-    #             func = getattr(StorerDB, db_name)
-    #             storer_info_list.append(func(**storer))
-    #     return storer_info_list
-
-
-
-def deal(config, tag):
-    if isinstance(config, dict):
-        if tag == 0:
-            return RedisInfo(**config)
-        elif tag == 1:
-            db_name = config["DB"]
-            if db_name in dir(SchedulerDB):
-                del config["DB"]
-            else:
-                db_name = "diy"
-            func = getattr(SchedulerDB, db_name)
-            return func(**config)
-        elif tag == 2:
-            db_name = config["DB"]
-            if db_name in dir(StorerDB):
-                del config["DB"]
-            else:
-                db_name = "diy"
-            func = getattr(StorerDB, db_name)
-            return func(**config)
-        raise ValueError("tag must be in [0, 1, 2]")
-    elif any(isinstance(config, t) for t in (StorerInfo, SchedulerInfo, RedisInfo)):
-        return config
-    raise TypeError("config must be in [StorerInfo, SchedulerInfo, RedisInfo]")
-
-
-def info(configs, tag=0):
-    if configs is None:
-        return SchedulerDB.default() if tag == 1 else None
-
-    if isinstance(configs, str):
-        configs = json.loads(configs)
-
-    if tag == 0:
-        return deal(configs, tag)
-
-    if not isinstance(configs, list):
-        configs = [configs]
-
-    return [deal(config, tag) for config in configs]
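The deleted config.py normalized user-supplied configuration (namedtuples, dicts, or JSON strings) into RedisInfo / SchedulerInfo / StorerInfo via deal() and info(). A minimal, self-contained sketch of that resolution pattern for the Redis case; resolve_redis is an illustrative name, not part of the package:

import json
from collections import namedtuple

# Same field layout and defaults as the removed RedisInfo namedtuple.
RedisInfo = namedtuple(
    "RedisInfo",
    "host, port, username, password, db",
    defaults=("localhost", 6379, None, None, 0)
)

def resolve_redis(config):
    # Accept an already-built namedtuple, a dict, or a JSON string,
    # mirroring the deal()/info() dispatch above.
    if isinstance(config, RedisInfo):
        return config
    if isinstance(config, str):
        config = json.loads(config)
    if isinstance(config, dict):
        return RedisInfo(**config)
    raise TypeError("config must be RedisInfo, dict or JSON string")

print(resolve_redis('{"host": "127.0.0.1", "db": 1}'))
# RedisInfo(host='127.0.0.1', port=6379, username=None, password=None, db=1)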
cobweb/base/decorators.py DELETED
@@ -1,95 +0,0 @@
-import time
-from functools import wraps
-
-# from config import DBType
-from log import log
-
-
-# def find_func_name(func_name, name_list):
-#     for name in name_list:
-#         if func_name.find(name) == 0:
-#             return True
-#     return False
-
-
-# def starter_decorator(execute):
-#     @wraps(execute)
-#     def wrapper(starter, *args, **kwargs):
-#         spider_dynamic_funcs = []
-#         scheduler_dynamic_funcs = []
-#         starter_functions = inspect.getmembers(starter, lambda a: inspect.isfunction(a))
-#         for starter_function in starter_functions:
-#             if find_func_name(starter_function[0], starter.scheduler_funcs):
-#                 scheduler_dynamic_funcs.append(starter_function)
-#             elif find_func_name(starter_function[0], starter.spider_funcs):
-#                 spider_dynamic_funcs.append(starter_function)
-#         return execute(starter, scheduler_dynamic_funcs, spider_dynamic_funcs, *args, **kwargs)
-#
-#     return wrapper
-
-
-# def scheduler_decorator(execute):
-#     @wraps(execute)
-#     def wrapper(scheduler, distribute_task):
-#         if not issubclass(scheduler, SchedulerInterface):
-#             scheduler.stop = True
-#         elif getattr(scheduler, "scheduler", None):
-#             execute(scheduler, distribute_task)
-#         else:
-#             log.error(f"scheduler type: {scheduler.db_type} not have add function!")
-#             scheduler.stop = True
-#     return wrapper
-
-
-def storer_decorator(execute):
-    @wraps(execute)
-    def wrapper(storer, stop_event, last_event, table_name, callback):
-        if getattr(storer, "save", None):
-            execute(storer, stop_event, last_event, table_name, callback)
-        else:
-            log.error(f"storer base_type: {storer.data_type} not have add function!")
-            storer.stop = True
-    return wrapper
-
-
-def distribute_scheduler_decorators(func):
-    @wraps(func)
-    def wrapper(distributor, callback):
-        try:
-            func(distributor, callback)
-        except TypeError:
-            pass
-        distributor.event.set()
-    return wrapper
-
-
-def distribute_spider_decorators(func):
-    @wraps(func)
-    def wrapper(distributor, stop_event, db, callback):
-        while not stop_event.is_set():
-            try:
-                seed = distributor.queue_client.pop("_seed_queue")
-                if not seed:
-                    time.sleep(3)
-                    continue
-                distributor.spider_in_progress.append(1)
-                func(distributor, db, seed, callback)
-            except Exception as e:
-                print(e)
-            finally:
-                distributor.spider_in_progress.pop()
-
-    return wrapper
-
-
-def distribute_storer_decorators(func):
-    @wraps(func)
-    def wrapper(distributor, callback, data_type, table_name, last):
-        data_list = []
-        try:
-            func(distributor, callback, data_list, data_type, table_name, last)
-        except Exception as e:
-            log.info("storage exception! " + str(e))
-            # distributor._task_queue.extendleft(data_list)
-
-    return wrapper
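These decorators wrapped the distributor loop methods. A self-contained sketch of how distribute_scheduler_decorators behaves; FakeDistributor and schedule are illustrative stand-ins, not package code:

import threading
from functools import wraps

def distribute_scheduler_decorators(func):
    # Same shape as the removed decorator: run the wrapped function once,
    # swallow a TypeError, then signal the distributor's event.
    @wraps(func)
    def wrapper(distributor, callback):
        try:
            func(distributor, callback)
        except TypeError:
            pass
        distributor.event.set()
    return wrapper

class FakeDistributor:
    # Illustrative stand-in for the real distributor, which carries an Event.
    def __init__(self):
        self.event = threading.Event()

@distribute_scheduler_decorators
def schedule(distributor, callback):
    callback("seed-1")

dist = FakeDistributor()
schedule(dist, print)        # prints seed-1
print(dist.event.is_set())   # True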
cobweb/base/hash_table.py DELETED
@@ -1,60 +0,0 @@
-# -*- coding: utf-8 -*-
-
-class DynamicHashTable:
-    def __init__(self):
-        self.capacity = 1000  # initial capacity
-        self.size = 0  # number of stored elements
-        self.table = [None] * self.capacity
-
-    def hash_function(self, key):
-        return hash(key) % self.capacity
-
-    def probe(self, index):
-        # linear probing
-        return (index + 1) % self.capacity
-
-    def insert(self, key, value):
-        index = self.hash_function(key)
-        while self.table[index] is not None:
-            if self.table[index][0] == key:
-                self.table[index][1] = value
-                return
-            index = self.probe(index)
-        self.table[index] = [key, value]
-        self.size += 1
-
-        # grow dynamically once the load factor reaches 0.7
-        if self.size / self.capacity >= 0.7:
-            self.resize()
-
-    def get(self, key):
-        index = self.hash_function(key)
-        while self.table[index] is not None:
-            if self.table[index][0] == key:
-                return self.table[index][1]
-            index = self.probe(index)
-        raise KeyError("Key not found")
-
-    def remove(self, key):
-        index = self.hash_function(key)
-        while self.table[index] is not None:
-            if self.table[index][0] == key:
-                self.table[index] = None
-                self.size -= 1
-                return
-            index = self.probe(index)
-        raise KeyError("Key not found")
-
-    def resize(self):
-        # double the capacity and rehash every entry
-        self.capacity *= 2
-        new_table = [None] * self.capacity
-        for item in self.table:
-            if item is not None:
-                key, value = item
-                new_index = self.hash_function(key)
-                while new_table[new_index] is not None:
-                    new_index = self.probe(new_index)
-                new_table[new_index] = [key, value]
-        self.table = new_table
-
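Usage sketch for the deleted DynamicHashTable (open addressing with linear probing and a 0.7 load-factor resize), assuming the class above is in scope; keys and values are illustrative:

# Assumes the DynamicHashTable class above is in scope.
table = DynamicHashTable()
table.insert("url", "https://example.com")
table.insert("retry", 3)
table.insert("retry", 4)          # same key: value is overwritten in place
print(table.get("retry"))         # 4
table.remove("url")
try:
    table.get("url")
except KeyError as exc:
    print(exc)                    # 'Key not found'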
cobweb/base/queue_tmp.py DELETED
@@ -1,60 +0,0 @@
-from typing import Iterable
-
-# from pympler import asizeof
-from collections import deque
-
-
-class Queue:
-
-    def __init__(self):
-        self._seed_queue = deque()
-
-    @property
-    def queue_names(self):
-        return tuple(self.__dict__.keys())
-
-    @property
-    def used_memory(self):
-        return asizeof.asizeof(self)
-
-    def create_queue(self, queue_name: str):
-        self.__setattr__(queue_name, deque())
-
-    def push_seed(self, seed):
-        self.push("_seed_queue", seed)
-
-    def pop_seed(self):
-        return self.pop("_seed_queue")
-
-    def push(self, queue_name: str, data, left: bool = False):
-        try:
-            if not data:
-                return None
-            queue = self.__getattribute__(queue_name)
-            if isinstance(data, Iterable):
-                queue.extend(data) if left else queue.extendleft(data)
-            else:
-                queue.appendleft(data) if left else queue.append(data)
-        except AttributeError as e:
-            print(e)
-
-    def pop(self, queue_name: str, left: bool = True):
-        try:
-            queue = self.__getattribute__(queue_name)
-            return queue.pop() if left else queue.popleft()
-        except IndexError as e:
-            print(e)
-            return None
-        except AttributeError as e:
-            print(e)
-            return None
-
-
-# qqueue = Queue()
-# # qqueue.create_queue("test")
-# print(qqueue.queue_names)
-# qqueue.push("task_queue", "key")
-# print(qqueue.used_memory)
-# c = qqueue.pop("task_queue")
-# print(c)
-
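Usage sketch for the deleted in-memory Queue wrapper, assuming the class above is in scope. Note that push() treats any iterable element-wise, so a single string seed should be wrapped in a list, and used_memory relies on pympler's asizeof, whose import is commented out above:

# Assumes the Queue class above is in scope.
q = Queue()
q.create_queue("task_queue")             # adds a new deque attribute
q.push("task_queue", ["seed-1", "seed-2"])
q.push_seed(["https://example.com"])     # shorthand for push("_seed_queue", ...);
                                         # a bare str would be split into characters
print(q.queue_names)                     # ('_seed_queue', 'task_queue')
print(q.pop("task_queue"))               # seed-1 (first pushed comes out first)
print(q.pop_seed())                      # https://example.com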
cobweb/base/request.py DELETED
@@ -1,62 +0,0 @@
-import random
-from typing import Union
-
-import requests
-
-
-class Request:
-
-    def __init__(self):
-        pass
-
-
-def gen_user_agent(platform: str = 'android', redis_client=None):
-    user_agent = ''
-    if platform == 'android':
-        os_version = f'{random.randint(4, 10)}.{random.randint(0, 9)}.{random.randint(0, 9)}'
-        model = (redis_client and redis_client.srandmember('(md)set_android_model').decode()) or ''
-        webkit_version = f'{random.randint(450, 550)}.{random.randint(0, 100)}.{random.randint(0, 100)}'
-        version = f'{random.randint(3, 6)}.{random.randint(0, 9)}.{random.randint(0, 9)}'
-        chrome_version = f'{random.randint(50, 88)}.{random.randint(0, 9)}.{random.randint(1000, 5000)}.{random.randint(0, 1000)}'
-        user_agent = f'Mozilla/5.0 (Linux; U; Android {os_version}; zh-cn; {model} Build/{model}) AppleWebKit/{webkit_version} (KHTML, like Gecko) Version/{version} Chrome/{chrome_version} Mobile Safari/{webkit_version}'
-    elif platform == 'iphone':
-        os_version = f'{random.randint(5, 13)}_{random.randint(0, 9)}_{random.randint(0, 9)}'
-        webkit_version = f'{random.randint(550, 650)}.{random.randint(0, 100)}.{random.randint(0, 100)}'
-        version = f'{random.randint(4, 13)}.{random.randint(0, 9)}.{random.randint(0, 9)}'
-        user_agent = f'Mozilla/5.0 (iPhone; CPU iPhone OS {os_version} like Mac OS X) AppleWebKit/{webkit_version} (KHTML, like Gecko) Version/{version} Mobile Safari/{webkit_version}'
-
-    return user_agent
-
-
-def config(
-        url,
-        method: str = "GET",
-        headers: dict = None,
-        proxies: dict = None,
-        cookies: dict = None,
-        params: dict = None,
-        timeout: int = None,
-        stream: bool = False,
-        data: Union[dict, str, tuple] = None,
-) -> dict:
-    if not headers:
-        headers = {"accept": "*/*", "user-agent": gen_user_agent()}
-
-    elif "user-agent" not in [key.lower() for key in headers.keys()]:
-        headers["user-agent"] = gen_user_agent()
-
-    return {
-        "method": method,
-        "url": url,
-        "data": data,
-        "params": params,
-        "cookies": cookies,
-        "headers": headers,
-        "proxies": proxies,
-        "stream": stream,
-        "timeout": timeout or 3,
-    }
-
-
-def request(**kwargs):
-    return requests.request(**kwargs)
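The deleted helpers were meant to be combined: config() builds a requests keyword dict, filling in a random user-agent when none is given, and request() forwards it to requests.request. A sketch assuming the functions above are in scope and the URL is reachable; the URL and parameters are illustrative:

# Assumes gen_user_agent/config/request from the snippet above are in scope.
kwargs = config("https://httpbin.org/get", params={"q": "cobweb"}, timeout=5)
print(kwargs["headers"]["user-agent"])   # randomly generated Android user-agent
response = request(**kwargs)
print(response.status_code)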
cobweb/base/task.py DELETED
@@ -1,38 +0,0 @@
-from config import info
-
-
-class Task:
-
-    def __init__(
-            self,
-            project=None,
-            task_name=None,
-            start_seed=None,
-            spider_num=None,
-            # queue_length=None,
-            max_retries=None,
-            scheduler_info=None,
-            storer_info=None,
-            redis_info=None
-    ):
-        """
-
-        :param project:
-        :param task_name:
-        :param start_seed:
-        :param spider_num:
-        # :param queue_length:
-        :param scheduler_info:
-        :param storer_info: Union(list, DataInfo/namedtuple); each element must provide three values (database type, table name, field names)
-        """
-        self.project = project or "test"
-        self.task_name = task_name or "spider"
-        self.start_seed = start_seed
-        self.spider_num = spider_num or 1
-        self.max_retries = max_retries or 5
-        # self.redis_info = RedisInfo(**(redis_info or dict()))
-        self.redis_info = info(redis_info, tag=0)
-        # self.scheduler_info = SchedulerDB.info(scheduler_info)
-        self.scheduler_info = info(scheduler_info, tag=1)
-        # self.storer_info = StorerDB.info(storer_info)
-        self.storer_info = info(storer_info, tag=2)
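A Task simply stored the run options and pushed the three *_info arguments through config.info() (tag 0 for Redis, 1 for the scheduler, 2 for storers). An illustrative construction, which only works inside the removed 0.0.4 package layout; all values shown are examples:

# Illustrative only: relies on the removed cobweb 0.0.4 modules being importable.
task = Task(
    project="demo",
    task_name="news_spider",
    start_seed="https://example.com",
    spider_num=2,
    redis_info={"host": "127.0.0.1", "db": 1},            # tag=0 -> RedisInfo
    scheduler_info={"DB": "textfile", "table": "seeds"},  # tag=1 -> SchedulerDB.textfile
    storer_info={"DB": "console", "table": "demo:data", "fields": ["title", "url"]},  # tag=2 -> StorerDB.console
)
print(task.redis_info)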
cobweb/base/utils.py DELETED
@@ -1,15 +0,0 @@
-import sys
-
-
-def struct_table_name(table_name):
-    return table_name.replace(".", "__p__").replace(":", "__c__")
-
-
-def restore_table_name(table_name):
-    return table_name.replace("__p__", ".").replace("__c__", ":")
-
-
-def struct_queue_name(db_name, table_name):
-    return sys.intern(f"__{db_name}_{table_name}_queue__")
-
-
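struct_table_name and restore_table_name are inverses, escaping "." and ":" so a table name can be embedded in a queue name. A quick round trip, assuming the functions above are in scope; the sample name is illustrative:

# Assumes struct_table_name/restore_table_name from the snippet above are in scope.
safe = struct_table_name("project.db:table")
print(safe)                       # project__p__db__c__table
print(restore_table_name(safe))   # project.db:table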
File without changes
@@ -1 +0,0 @@
-