cobweb-launcher 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
cobweb/base/config.py DELETED
@@ -1,164 +0,0 @@
1
- import json
2
- from collections import namedtuple
3
- from utils import struct_table_name
4
-
5
- StorerInfo = namedtuple(
6
- "StorerInfo",
7
- "DB, table, fields, length, config"
8
- )
9
- SchedulerInfo = namedtuple(
10
- "SchedulerInfo",
11
- "DB, table, sql, length, size, config",
12
- )
13
- RedisInfo = namedtuple(
14
- "RedisInfo",
15
- "host, port, username, password, db",
16
- defaults=("localhost", 6379, None, None, 0)
17
- )
18
-
19
- # redis_info = dict(
20
- # host="localhost",
21
- # port=6379,
22
- # username=None,
23
- # password=None,
24
- # db=0
25
- # )
26
-
27
-
28
- class SchedulerDB:
29
-
30
- @staticmethod
31
- def default():
32
- from db.scheduler.default import Default
33
- return SchedulerInfo(DB=Default, table="", sql="", length=100, size=500000, config=None)
34
-
35
- @staticmethod
36
- def textfile(table, sql=None, length=100, size=500000):
37
- from db.scheduler.textfile import Textfile
38
- return SchedulerInfo(DB=Textfile, table=table, sql=sql, length=length, size=size, config=None)
39
-
40
- @staticmethod
41
- def diy(DB, table, sql=None, length=100, size=500000, config=None):
42
- from base.interface import SchedulerInterface
43
- if not isinstance(DB, SchedulerInterface):
44
- raise Exception("DB must be inherit from SchedulerInterface")
45
- return SchedulerInfo(DB=DB, table=table, sql=sql, length=length, size=size, config=config)
46
-
47
- # @staticmethod
48
- # def info(scheduler_info):
49
- # if not scheduler_info:
50
- # return SchedulerDB.default()
51
- #
52
- # if isinstance(scheduler_info, SchedulerInfo):
53
- # return scheduler_info
54
- #
55
- # if isinstance(scheduler_info, str):
56
- # scheduler = json.loads(scheduler_info)
57
- # if isinstance(scheduler, dict):
58
- # db_name = scheduler["DB"]
59
- # if db_name in dir(SchedulerDB):
60
- # del scheduler["DB"]
61
- # else:
62
- # db_name = "diy"
63
- # func = getattr(SchedulerDB, db_name)
64
- # return func(**scheduler)
65
-
66
-
67
- class StorerDB:
68
-
69
- @staticmethod
70
- def console(table, fields, length=200):
71
- from db.storer.console import Console
72
- table = struct_table_name(table)
73
- return StorerInfo(DB=Console, table=table, fields=fields, length=length, config=None)
74
-
75
- @staticmethod
76
- def textfile(table, fields, length=200):
77
- from db.storer.textfile import Textfile
78
- table = struct_table_name(table)
79
- return StorerInfo(DB=Textfile, table=table, fields=fields, length=length, config=None)
80
-
81
- @staticmethod
82
- def loghub(table, fields, length=200, config=None):
83
- from db.storer.loghub import Loghub
84
- table = struct_table_name(table)
85
- return StorerInfo(DB=Loghub, table=table, fields=fields, length=length, config=config)
86
-
87
- @staticmethod
88
- def diy(DB, table, fields, length=200, config=None):
89
- from base.interface import StorerInterface
90
- if not isinstance(DB, StorerInterface):
91
- raise Exception("DB must be inherit from StorerInterface")
92
- table = struct_table_name(table)
93
- return StorerInfo(DB=DB, table=table, fields=fields, length=length, config=config)
94
-
95
- # @staticmethod
96
- # def info(storer_info):
97
- # if not storer_info:
98
- # return None
99
- #
100
- # if isinstance(storer_info, str):
101
- # storer_info = json.loads(storer_info)
102
- #
103
- # if any(isinstance(storer_info, t) for t in (dict, StorerInfo)):
104
- # storer_info = [storer_info]
105
- #
106
- # if not isinstance(storer_info, list):
107
- # raise Exception("StorerDB.info storer_info")
108
- #
109
- # storer_info_list = []
110
- # for storer in storer_info:
111
- # if isinstance(storer, StorerInfo):
112
- # storer_info_list.append(storer)
113
- # else:
114
- # db_name = storer["DB"]
115
- # if db_name in dir(StorerDB):
116
- # del storer["DB"]
117
- # else:
118
- # db_name = "diy"
119
- # func = getattr(StorerDB, db_name)
120
- # storer_info_list.append(func(**storer))
121
- # return storer_info_list
122
-
123
-
124
-
125
- def deal(config, tag):
126
- if isinstance(config, dict):
127
- if tag == 0:
128
- return RedisInfo(**config)
129
- elif tag == 1:
130
- db_name = config["DB"]
131
- if db_name in dir(SchedulerDB):
132
- del config["DB"]
133
- else:
134
- db_name = "diy"
135
- func = getattr(SchedulerDB, db_name)
136
- return func(**config)
137
- elif tag == 2:
138
- db_name = config["DB"]
139
- if db_name in dir(StorerDB):
140
- del config["DB"]
141
- else:
142
- db_name = "diy"
143
- func = getattr(StorerDB, db_name)
144
- return func(**config)
145
- raise ValueError("tag must be in [0, 1, 2]")
146
- elif any(isinstance(config, t) for t in (StorerInfo, SchedulerInfo, RedisInfo)):
147
- return config
148
- raise TypeError("config must be in [StorerInfo, SchedulerInfo, RedisInfo]")
149
-
150
-
151
- def info(configs, tag = 0):
152
- if configs is None:
153
- return SchedulerDB.default() if tag == 1 else None
154
-
155
- if isinstance(configs, str):
156
- configs = json.loads(configs)
157
-
158
- if tag == 0:
159
- return deal(configs, tag)
160
-
161
- if not isinstance(configs, list):
162
- configs = [configs]
163
-
164
- return [deal(config, tag) for config in configs]
cobweb/base/decorators.py DELETED
@@ -1,95 +0,0 @@
1
- import time
2
- from functools import wraps
3
-
4
- # from config import DBType
5
- from log import log
6
-
7
-
8
- # def find_func_name(func_name, name_list):
9
- # for name in name_list:
10
- # if func_name.find(name) == 0:
11
- # return True
12
- # return False
13
-
14
-
15
- # def starter_decorator(execute):
16
- # @wraps(execute)
17
- # def wrapper(starter, *args, **kwargs):
18
- # spider_dynamic_funcs = []
19
- # scheduler_dynamic_funcs = []
20
- # starter_functions = inspect.getmembers(starter, lambda a: inspect.isfunction(a))
21
- # for starter_function in starter_functions:
22
- # if find_func_name(starter_function[0], starter.scheduler_funcs):
23
- # scheduler_dynamic_funcs.append(starter_function)
24
- # elif find_func_name(starter_function[0], starter.spider_funcs):
25
- # spider_dynamic_funcs.append(starter_function)
26
- # return execute(starter, scheduler_dynamic_funcs, spider_dynamic_funcs, *args, **kwargs)
27
- #
28
- # return wrapper
29
-
30
-
31
- # def scheduler_decorator(execute):
32
- # @wraps(execute)
33
- # def wrapper(scheduler, distribute_task):
34
- # if not issubclass(scheduler, SchedulerInterface):
35
- # scheduler.stop = True
36
- # elif getattr(scheduler, "scheduler", None):
37
- # execute(scheduler, distribute_task)
38
- # else:
39
- # log.error(f"scheduler type: {scheduler.db_type} not have add function!")
40
- # scheduler.stop = True
41
- # return wrapper
42
-
43
-
44
- def storer_decorator(execute):
45
- @wraps(execute)
46
- def wrapper(storer, stop_event, last_event, table_name, callback):
47
- if getattr(storer, "save", None):
48
- execute(storer, stop_event, last_event, table_name, callback)
49
- else:
50
- log.error(f"storer base_type: {storer.data_type} not have add function!")
51
- storer.stop = True
52
- return wrapper
53
-
54
-
55
- def distribute_scheduler_decorators(func):
56
- @wraps(func)
57
- def wrapper(distributor, callback):
58
- try:
59
- func(distributor, callback)
60
- except TypeError:
61
- pass
62
- distributor.event.set()
63
- return wrapper
64
-
65
-
66
- def distribute_spider_decorators(func):
67
- @wraps(func)
68
- def wrapper(distributor, stop_event, db, callback):
69
- while not stop_event.is_set():
70
- try:
71
- seed = distributor.queue_client.pop("_seed_queue")
72
- if not seed:
73
- time.sleep(3)
74
- continue
75
- distributor.spider_in_progress.append(1)
76
- func(distributor, db, seed, callback)
77
- except Exception as e:
78
- print(e)
79
- finally:
80
- distributor.spider_in_progress.pop()
81
-
82
- return wrapper
83
-
84
-
85
- def distribute_storer_decorators(func):
86
- @wraps(func)
87
- def wrapper(distributor, callback, data_type, table_name, last):
88
- data_list = []
89
- try:
90
- func(distributor, callback, data_list, data_type, table_name, last)
91
- except Exception as e:
92
- log.info("storage exception! " + str(e))
93
- # distributor._task_queue.extendleft(data_list)
94
-
95
- return wrapper
cobweb/base/hash_table.py DELETED
@@ -1,60 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- class DynamicHashTable:
4
- def __init__(self):
5
- self.capacity = 1000 # 初始容量
6
- self.size = 0 # 元素个数
7
- self.table = [None] * self.capacity
8
-
9
- def hash_function(self, key):
10
- return hash(key) % self.capacity
11
-
12
- def probe(self, index):
13
- # 线性探测法
14
- return (index + 1) % self.capacity
15
-
16
- def insert(self, key, value):
17
- index = self.hash_function(key)
18
- while self.table[index] is not None:
19
- if self.table[index][0] == key:
20
- self.table[index][1] = value
21
- return
22
- index = self.probe(index)
23
- self.table[index] = [key, value]
24
- self.size += 1
25
-
26
- # 动态扩容
27
- if self.size / self.capacity >= 0.7:
28
- self.resize()
29
-
30
- def get(self, key):
31
- index = self.hash_function(key)
32
- while self.table[index] is not None:
33
- if self.table[index][0] == key:
34
- return self.table[index][1]
35
- index = self.probe(index)
36
- raise KeyError("Key not found")
37
-
38
- def remove(self, key):
39
- index = self.hash_function(key)
40
- while self.table[index] is not None:
41
- if self.table[index][0] == key:
42
- self.table[index] = None
43
- self.size -= 1
44
- return
45
- index = self.probe(index)
46
- raise KeyError("Key not found")
47
-
48
- def resize(self):
49
- # 扩容为原容量的两倍
50
- self.capacity *= 2
51
- new_table = [None] * self.capacity
52
- for item in self.table:
53
- if item is not None:
54
- key, value = item
55
- new_index = self.hash_function(key)
56
- while new_table[new_index] is not None:
57
- new_index = self.probe(new_index)
58
- new_table[new_index] = [key, value]
59
- self.table = new_table
60
-
cobweb/base/queue_tmp.py DELETED
@@ -1,60 +0,0 @@
1
- from typing import Iterable
2
-
3
- # from pympler import asizeof
4
- from collections import deque
5
-
6
-
7
- class Queue:
8
-
9
- def __init__(self):
10
- self._seed_queue = deque()
11
-
12
- @property
13
- def queue_names(self):
14
- return tuple(self.__dict__.keys())
15
-
16
- @property
17
- def used_memory(self):
18
- return asizeof.asizeof(self)
19
-
20
- def create_queue(self, queue_name: str):
21
- self.__setattr__(queue_name, deque())
22
-
23
- def push_seed(self, seed):
24
- self.push("_seed_queue", seed)
25
-
26
- def pop_seed(self):
27
- return self.pop("_seed_queue")
28
-
29
- def push(self, queue_name: str, data, left: bool = False):
30
- try:
31
- if not data:
32
- return None
33
- queue = self.__getattribute__(queue_name)
34
- if isinstance(data, Iterable):
35
- queue.extend(data) if left else queue.extendleft(data)
36
- else:
37
- queue.appendleft(data) if left else queue.append(data)
38
- except AttributeError as e:
39
- print(e)
40
-
41
- def pop(self, queue_name: str, left: bool = True):
42
- try:
43
- queue = self.__getattribute__(queue_name)
44
- return queue.pop() if left else queue.popleft()
45
- except IndexError as e:
46
- print(e)
47
- return None
48
- except AttributeError as e:
49
- print(e)
50
- return None
51
-
52
-
53
- # qqueue = Queue()
54
- # # qqueue.create_queue("test")
55
- # print(qqueue.queue_names)
56
- # qqueue.push("task_queue", "key")
57
- # print(qqueue.used_memory)
58
- # c = qqueue.pop("task_queue")
59
- # print(c)
60
-
cobweb/base/request.py DELETED
@@ -1,62 +0,0 @@
1
- import random
2
- from typing import Union
3
-
4
- import requests
5
-
6
-
7
- class Request:
8
-
9
- def __init__(self):
10
- pass
11
-
12
-
13
- def gen_user_agent(platform: str = 'android', redis_client=None):
14
- user_agent = ''
15
- if platform == 'android':
16
- os_version = f'{random.randint(4, 10)}.{random.randint(0, 9)}.{random.randint(0, 9)}'
17
- model = (redis_client and redis_client.srandmember('(md)set_android_model').decode()) or ''
18
- webkit_version = f'{random.randint(450, 550)}.{random.randint(0, 100)}.{random.randint(0, 100)}'
19
- version = f'{random.randint(3, 6)}.{random.randint(0, 9)}.{random.randint(0, 9)}'
20
- chrome_version = f'{random.randint(50, 88)}.{random.randint(0, 9)}.{random.randint(1000, 5000)}.{random.randint(0, 1000)}'
21
- user_agent = f'Mozilla/5.0 (Linux; U; Android {os_version}; zh-cn; {model} Build/{model}) AppleWebKit/{webkit_version} (KHTML, like Gecko) Version/{version} Chrome/{chrome_version} Mobile Safari/{webkit_version}'
22
- elif platform == 'iphone':
23
- os_version = f'{random.randint(5, 13)}_{random.randint(0, 9)}_{random.randint(0, 9)}'
24
- webkit_version = f'{random.randint(550, 650)}.{random.randint(0, 100)}.{random.randint(0, 100)}'
25
- version = f'{random.randint(4, 13)}.{random.randint(0, 9)}.{random.randint(0, 9)}'
26
- user_agent = f'Mozilla/5.0 (iPhone; CPU iPhone OS {os_version} like Mac OS X) AppleWebKit/{webkit_version} (KHTML, like Gecko) Version/{version} Mobile Safari/{webkit_version}'
27
-
28
- return user_agent
29
-
30
-
31
- def config(
32
- url,
33
- method: str = "GET",
34
- headers: dict = None,
35
- proxies: dict = None,
36
- cookies: dict = None,
37
- params: dict = None,
38
- timeout: int = None,
39
- stream: bool = False,
40
- data: Union[dict, str, tuple] = None,
41
- ) -> dict:
42
- if not headers:
43
- headers = {"accept": "*/*", "user-agent": gen_user_agent()}
44
-
45
- elif "user-agent" not in [key.lower() for key in headers.keys()]:
46
- headers["user-agent"] = gen_user_agent()
47
-
48
- return {
49
- "method": method,
50
- "url": url,
51
- "data": data,
52
- "params": params,
53
- "cookies": cookies,
54
- "headers": headers,
55
- "proxies": proxies,
56
- "stream": stream,
57
- "timeout": timeout or 3,
58
- }
59
-
60
-
61
- def request(**kwargs):
62
- return requests.request(**kwargs)
cobweb/base/task.py DELETED
@@ -1,38 +0,0 @@
1
- from config import info
2
-
3
-
4
- class Task:
5
-
6
- def __init__(
7
- self,
8
- project=None,
9
- task_name=None,
10
- start_seed=None,
11
- spider_num=None,
12
- # queue_length=None,
13
- max_retries=None,
14
- scheduler_info=None,
15
- storer_info=None,
16
- redis_info=None
17
- ):
18
- """
19
-
20
- :param project:
21
- :param task_name:
22
- :param start_seed:
23
- :param spider_num:
24
- # :param queue_length:
25
- :param scheduler_info:
26
- :param storer_info: Union(list, DataInfo/namedtuple), 单个元素构成必须有3个值(数据库类型,表名,字段名)
27
- """
28
- self.project = project or "test"
29
- self.task_name = task_name or "spider"
30
- self.start_seed = start_seed
31
- self.spider_num = spider_num or 1
32
- self.max_retries = max_retries or 5
33
- # self.redis_info = RedisInfo(**(redis_info or dict()))
34
- self.redis_info = info(redis_info, tag=0)
35
- # self.scheduler_info = SchedulerDB.info(scheduler_info)
36
- self.scheduler_info = info(scheduler_info, tag=1)
37
- # self.storer_info = StorerDB.info(storer_info)
38
- self.storer_info = info(storer_info, tag=2)
cobweb/base/utils.py DELETED
@@ -1,15 +0,0 @@
1
- import sys
2
-
3
-
4
- def struct_table_name(table_name):
5
- return table_name.replace(".", "__p__").replace(":", "__c__")
6
-
7
-
8
- def restore_table_name(table_name):
9
- return table_name.replace("__p__", ".").replace("__c__", ":")
10
-
11
-
12
- def struct_queue_name(db_name, table_name):
13
- return sys.intern(f"__{db_name}_{table_name}_queue__")
14
-
15
-
File without changes
@@ -1 +0,0 @@
1
-