cobweb-launcher 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,60 @@
+ from typing import Iterable
+
+ from pympler import asizeof  # needed by used_memory (was commented out)
+ from collections import deque
+
+
+ class Queue:
+
+     def __init__(self):
+         self._seed_queue = deque()
+
+     @property
+     def queue_names(self):
+         return tuple(self.__dict__.keys())
+
+     @property
+     def used_memory(self):
+         return asizeof.asizeof(self)
+
+     def create_queue(self, queue_name: str):
+         self.__setattr__(queue_name, deque())
+
+     def push_seed(self, seed):
+         self.push("_seed_queue", seed)
+
+     def pop_seed(self):
+         return self.pop("_seed_queue")
+
+     def push(self, queue_name: str, data, left: bool = False):
+         try:
+             if not data:
+                 return None
+             queue = self.__getattribute__(queue_name)
+             if isinstance(data, Iterable) and not isinstance(data, (str, bytes)):  # don't explode strings into characters
+                 queue.extendleft(data) if left else queue.extend(data)
+             else:
+                 queue.appendleft(data) if left else queue.append(data)
+         except AttributeError as e:
+             print(e)
+
+     def pop(self, queue_name: str, left: bool = True):
+         try:
+             queue = self.__getattribute__(queue_name)
+             return queue.popleft() if left else queue.pop()
+         except IndexError as e:
+             print(e)
+             return None
+         except AttributeError as e:
+             print(e)
+             return None
+
+
+ # qqueue = Queue()
+ # # qqueue.create_queue("test")
+ # print(qqueue.queue_names)
+ # qqueue.push("task_queue", "key")
+ # print(qqueue.used_memory)
+ # c = qqueue.pop("task_queue")
+ # print(c)
+
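
For orientation, a minimal usage sketch of the Queue above (a sketch only: this hunk's file name is not shown in the diff, so the import path is assumed):

    queue = Queue()
    queue.create_queue("task_queue")      # adds a new deque attribute
    print(queue.queue_names)              # ('_seed_queue', 'task_queue')

    queue.push("task_queue", "key")       # single item, appended on the right
    queue.push("task_queue", ["a", "b"])  # iterable, extended on the right
    print(queue.pop("task_queue"))        # 'key' -- default pop is FIFO (left)
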
cobweb/base/request.py ADDED
@@ -0,0 +1,62 @@
+ import random
+ from typing import Union
+
+ import requests
+
+
+ class Request:
+
+     def __init__(self):
+         pass
+
+
+ def gen_user_agent(platform: str = 'android', redis_client=None):
+     user_agent = ''
+     if platform == 'android':
+         os_version = f'{random.randint(4, 10)}.{random.randint(0, 9)}.{random.randint(0, 9)}'
+         # srandmember may return None on a missing/empty set; fall back to ''
+         model = (redis_client and (redis_client.srandmember('(md)set_android_model') or b'').decode()) or ''
+         webkit_version = f'{random.randint(450, 550)}.{random.randint(0, 100)}.{random.randint(0, 100)}'
+         version = f'{random.randint(3, 6)}.{random.randint(0, 9)}.{random.randint(0, 9)}'
+         chrome_version = f'{random.randint(50, 88)}.{random.randint(0, 9)}.{random.randint(1000, 5000)}.{random.randint(0, 1000)}'
+         user_agent = f'Mozilla/5.0 (Linux; U; Android {os_version}; zh-cn; {model} Build/{model}) AppleWebKit/{webkit_version} (KHTML, like Gecko) Version/{version} Chrome/{chrome_version} Mobile Safari/{webkit_version}'
+     elif platform == 'iphone':
+         os_version = f'{random.randint(5, 13)}_{random.randint(0, 9)}_{random.randint(0, 9)}'
+         webkit_version = f'{random.randint(550, 650)}.{random.randint(0, 100)}.{random.randint(0, 100)}'
+         version = f'{random.randint(4, 13)}.{random.randint(0, 9)}.{random.randint(0, 9)}'
+         user_agent = f'Mozilla/5.0 (iPhone; CPU iPhone OS {os_version} like Mac OS X) AppleWebKit/{webkit_version} (KHTML, like Gecko) Version/{version} Mobile Safari/{webkit_version}'
+
+     return user_agent
+
+
+ def config(
+         url,
+         method: str = "GET",
+         headers: dict = None,
+         proxies: dict = None,
+         cookies: dict = None,
+         params: dict = None,
+         timeout: int = None,
+         stream: bool = False,
+         data: Union[dict, str, tuple] = None,
+ ) -> dict:
+     if not headers:
+         headers = {"accept": "*/*", "user-agent": gen_user_agent()}
+
+     elif "user-agent" not in [key.lower() for key in headers.keys()]:
+         headers["user-agent"] = gen_user_agent()
+
+     return {
+         "method": method,
+         "url": url,
+         "data": data,
+         "params": params,
+         "cookies": cookies,
+         "headers": headers,
+         "proxies": proxies,
+         "stream": stream,
+         "timeout": timeout or 3,
+     }
+
+
+ def request(**kwargs):
+     return requests.request(**kwargs)
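
A short sketch of how config() and request() compose (assuming this module is importable; the URL below is a placeholder, not from the package):

    kwargs = config(
        "https://httpbin.org/get",    # placeholder URL
        params={"q": "cobweb"},
        timeout=5,
    )
    # config() injects a random user-agent when none is supplied
    resp = request(**kwargs)          # thin wrapper over requests.request
    print(resp.status_code)
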
cobweb/base/task.py ADDED
@@ -0,0 +1,38 @@
+ from config import info
+
+
+ class Task:
+
+     def __init__(
+             self,
+             project=None,
+             task_name=None,
+             start_seed=None,
+             spider_num=None,
+             # queue_length=None,
+             max_retries=None,
+             scheduler_info=None,
+             storer_info=None,
+             redis_info=None
+     ):
+         """
+         :param project:
+         :param task_name:
+         :param start_seed:
+         :param spider_num:
+         # :param queue_length:
+         :param max_retries:
+         :param scheduler_info:
+         :param storer_info: Union(list, DataInfo/namedtuple); each element must contain 3 values (database type, table name, field names)
+         :param redis_info:
+         """
+         self.project = project or "test"
+         self.task_name = task_name or "spider"
+         self.start_seed = start_seed
+         self.spider_num = spider_num or 1
+         self.max_retries = max_retries or 5
+         # self.redis_info = RedisInfo(**(redis_info or dict()))
+         self.redis_info = info(redis_info, tag=0)
+         # self.scheduler_info = SchedulerDB.info(scheduler_info)
+         self.scheduler_info = info(scheduler_info, tag=1)
+         # self.storer_info = StorerDB.info(storer_info)
+         self.storer_info = info(storer_info, tag=2)
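
Task only normalizes its arguments; the actual resolution of redis/scheduler/storer settings happens in config.info, which is not part of this diff. A hedged construction sketch (all argument values are assumptions):

    # Sketch only: config.info's expected input shapes are not shown here.
    task = Task(
        project="demo",                      # placeholder values
        task_name="news_spider",
        start_seed="https://example.com",
        spider_num=2,
    )
    print(task.project, task.task_name, task.max_retries)  # demo news_spider 5
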
cobweb/base/utils.py ADDED
@@ -0,0 +1,15 @@
+ import sys
+
+
+ def struct_table_name(table_name):
+     return table_name.replace(".", "__p__").replace(":", "__c__")
+
+
+ def restore_table_name(table_name):
+     return table_name.replace("__p__", ".").replace("__c__", ":")
+
+
+ def struct_queue_name(db_name, table_name):
+     return sys.intern(f"__{db_name}_{table_name}_queue__")
+
+
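
struct_table_name/restore_table_name form an invertible encoding ("." <-> "__p__", ":" <-> "__c__") so table names can be embedded safely in Redis key names. A quick round-trip check:

    name = "db.project:table"
    encoded = struct_table_name(name)          # 'db__p__project__c__table'
    assert restore_table_name(encoded) == name
    print(struct_queue_name("redis", encoded))
    # __redis_db__p__project__c__table_queue__
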
cobweb/db/__init__.py ADDED
@@ -0,0 +1 @@
+
@@ -0,0 +1,116 @@
+ import oss2
+ from typing import Union
+ from oss2.models import PartInfo
+ from requests import Response
+ from base.log import log
+
+
+ class OssDB:
+
+     def __init__(
+             self,
+             bucket_name,
+             endpoint,
+             access_key,
+             secret_key,
+             chunk_size,
+             min_size
+     ):
+         self.auth = oss2.Auth(
+             access_key_id=access_key,
+             access_key_secret=secret_key
+         )
+         self.bucket = oss2.Bucket(
+             auth=self.auth,
+             endpoint=endpoint,
+             bucket_name=bucket_name
+         )
+         self.chunk_size = chunk_size or 1024 ** 2
+         self.min_size = min_size or 1024
+
+     @staticmethod
+     def format_upload_len(length):
+         if not length:
+             raise ValueError("Length cannot be None or 0")
+
+         # walk units from largest to smallest so e.g. 5 GB is not reported in KB
+         units = ["KB", "MB", "GB", "TB"]
+         for i in range(len(units) - 1, -1, -1):
+             num = length / 1024 ** (i + 1)
+             if num >= 1:
+                 return f"{round(num, 2)} {units[i]}"
+         return f"{length} B"
+
+     def assemble(self, ready_data, part_data):
+         upload_data = None
+         ready_data = ready_data + part_data
+         if len(ready_data) >= self.chunk_size:
+             upload_data = ready_data[:self.chunk_size]
+             ready_data = ready_data[self.chunk_size:]
+
+         return ready_data, upload_data
+
+     def iter_data(self, data):
+         if isinstance(data, Response):
+             for part_data in data.iter_content(self.chunk_size):
+                 yield part_data
+         elif isinstance(data, bytes):
+             for i in range(0, len(data), self.chunk_size):
+                 yield data[i:i + self.chunk_size]
+
+     def upload_split(
+             self, oss_path: str,
+             data: Union[bytes, Response],
+             timeout: int = 300,
+     ):
+         parts = []
+         upload_id = None
+         ready_data = b""
+         upload_data_len = 0
+         headers = {"Expires": str(timeout * 1000)}
+         try:
+             upload_id = self.bucket.init_multipart_upload(oss_path).upload_id
+             for part_data in self.iter_data(data):
+                 upload_data_len += len(part_data)
+                 ready_data, upload_data = self.assemble(ready_data, part_data)
+                 if upload_data:
+                     part_index = len(parts) + 1
+                     upload_info = self.bucket.upload_part(
+                         oss_path, upload_id, part_index, upload_data
+                     )
+                     parts.append(PartInfo(part_index, upload_info.etag))
+
+             format_upload = self.format_upload_len(upload_data_len)
+
+             if parts:
+                 # upload the trailing chunk (if any), then always complete the
+                 # multipart upload -- even when the data ended exactly on a
+                 # chunk boundary and ready_data is empty
+                 if ready_data:
+                     part_index = len(parts) + 1
+                     upload_info = self.bucket.upload_part(
+                         oss_path, upload_id, part_index, ready_data
+                     )
+                     parts.append(PartInfo(part_index, upload_info.etag))
+                 self.bucket.complete_multipart_upload(
+                     oss_path, upload_id, parts
+                 )
+                 log.info(
+                     f"split upload, file path: {oss_path}"
+                     f", file size: {format_upload}"
+                 )
+
+             elif len(ready_data) > self.min_size:
+                 self.bucket.put_object(oss_path, ready_data, headers)
+                 log.info(
+                     f"upload file, file path: {oss_path}"
+                     f", file size: {format_upload}"
+                 )
+
+             else:
+                 # payloads at or below min_size are skipped, only logged
+                 log.info(
+                     f"file size smaller than min size! "
+                     f"file size: {format_upload}"
+                 )
+             status = True
+         except Exception as e:
+             if upload_id:
+                 self.bucket.abort_multipart_upload(oss_path, upload_id, headers)
+             log.exception("upload file exception: " + str(e))
+             status = False
+
+         return status
+
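
A hedged usage sketch for OssDB.upload_split (bucket, endpoint, and credentials are placeholders; oss2 and the package's base.log must be importable):

    oss = OssDB(
        bucket_name="my-bucket",                          # placeholders
        endpoint="https://oss-cn-hangzhou.aliyuncs.com",
        access_key="<ACCESS_KEY>",
        secret_key="<SECRET_KEY>",
        chunk_size=None,   # falls back to 1 MiB
        min_size=None,     # falls back to 1 KiB
    )
    ok = oss.upload_split("demo/payload.bin", b"x" * (3 * 1024 ** 2 + 512))
    # three 1 MiB parts upload during iteration; the 512 B remainder is
    # uploaded as a final part before complete_multipart_upload
    print(ok)  # True on success
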
@@ -0,0 +1,214 @@
+ import time
+ import redis
+ from base.bbb import Seed
+
+
+ class RedisDB:
+
+     def __init__(
+             self,
+             project: str,
+             task_name: str,
+             host=None,
+             port=None,
+             username=None,
+             password=None,
+             db=0
+     ):
+         pool = redis.ConnectionPool(
+             host=host,
+             port=port,
+             username=username,
+             password=password,
+             db=db
+         )
+         self.heartbeat_key = f"{project}:{task_name}:heartbeat"  # redis type string
+         self.spider_key = f"{project}:{task_name}:seed_info:spider"  # redis type zset, .format(priority)
+         self.storer_key = f"{project}:{task_name}:seed_info:storer:%s"  # redis type set,
+         self.failed_key = f"{project}:{task_name}:seed_info:failed"  # redis type set, .format(priority)
+         self.succeed_key = f"{project}:{task_name}:seed_info:succeed"  # redis type set, .format(priority)
+         self.update_lock = f"{project}:{task_name}:update_seed_lock"  # redis type string
+         self.check_lock = f"{project}:{task_name}:check_seed_lock"  # redis type string
+         self.scheduler_lock = f"{project}:{task_name}:scheduler_lock"  # redis type string
+         self.client = redis.Redis(connection_pool=pool)
+
+     # pass!
+     def _get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
+         begin_time = int(time.time())
+         while True:
+             if self.client.setnx(key, ""):
+                 self.client.expire(key, t)
+                 return True
+             if int(time.time()) - begin_time > timeout:
+                 break
+             time.sleep(sleep_time)
+
+         # the lock is held; if it has no TTL, its holder probably died
+         # between SETNX and EXPIRE, so re-arm the expiry
+         if self.client.ttl(key) == -1:
+             delete_status = True
+             for _ in range(3):
+                 if self.client.ttl(key) != -1:
+                     delete_status = False
+                     break
+                 time.sleep(0.5)
+             if delete_status:
+                 self.client.expire(key, t)
+             return False
+         else:
+             ttl = self.client.ttl(key)
+             print("ttl: " + str(ttl))
+             return False
+
+     # pass!
+     def _deal_seed(self, seeds, is_add: bool):
+         if not seeds:
+             return None
+
+         if not isinstance(seeds, list):
+             seeds = [seeds]
+
+         item_info = dict()
+
+         for seed in seeds:
+             if not isinstance(seed, Seed):
+                 seed = Seed(seed)
+             item_info[seed.format_seed] = seed._priority
+
+         if item_info:
+             self.client.zadd(self.spider_key, mapping=item_info, nx=is_add, xx=not is_add)
+
+     # pass!
+     def add_seed(self, seeds):
+         self._deal_seed(seeds, is_add=True)
+
+     def reset_seed(self, seeds):
+         self._deal_seed(seeds, is_add=False)
+
+     # pass!
+     def del_seed(self, seeds, spider_status: bool = True):
+         if not seeds:
+             return None
+
+         if not isinstance(seeds, list):
+             seeds = [seeds]
+
+         seeds = [seed if isinstance(seed, Seed) else Seed(seed) for seed in seeds]
+
+         if seeds:
+             redis_key = self.succeed_key if spider_status else self.failed_key
+             self.client.sadd(redis_key, *(str(seed) for seed in seeds))
+             self.client.zrem(self.spider_key, *(seed.format_seed for seed in seeds))
+
+     # pass!
+     def set_storer(self, key, seeds):
+         if not seeds:
+             return None
+
+         if not isinstance(seeds, list):
+             seeds = [seeds]
+
+         item_info = dict()
+         score = -int(time.time())
+         for seed in seeds:
+             if not isinstance(seed, Seed):
+                 seed = Seed(seed)
+             item_info[seed.format_seed] = score
+
+         if item_info:
+             self.client.zadd(self.storer_key % key, mapping=item_info)
+             print("zadd storer key", len(item_info.keys()))
+
+     # pass!
+     def get_seed(self, length: int = 200):
+         cs = time.time()
+
+         if self._get_lock(key=self.update_lock):
+
+             update_item, result = {}, []
+
+             version = int(time.time())
+
+             items = self.client.zrangebyscore(self.spider_key, min=0, max="+inf", start=0, num=length, withscores=True)
+
+             for value, priority in items:
+                 score = -(version + int(priority) / 1000)
+                 seed = Seed(value, priority=priority, version=version)
+                 update_item[value] = score
+                 result.append(seed)
+
+             print("\nset seeds into queue time: " + str(time.time() - cs))
+             if result:
+                 # negative scores mark the taken seeds as in flight
+                 self.client.zadd(self.spider_key, mapping=update_item, xx=True)
+
+             self.client.delete(self.update_lock)
+             print("push seeds into queue time: " + str(time.time() - cs))
+             return result
+
+     # pass!
+     def check_spider_queue(self, stop, storer_num):
+         while not stop.is_set():
+             # poll for the check lock every 3s for up to 600s; once acquired,
+             # the lock expires after 5s
+             if self._get_lock(key=self.check_lock, t=5, timeout=600, sleep_time=3):
+                 heartbeat = True if self.client.exists(self.heartbeat_key) else False
+                 # after a restart (no heartbeat) reset every score; otherwise
+                 # only reclaim seeds taken more than 10 minutes ago
+                 score = -int(time.time()) + 600 if heartbeat else "-inf"
+
+                 keys = self.client.keys(self.storer_key % "*")
+                 if len(keys) == storer_num:
+                     intersection_key = self.storer_key % "intersection"
+                     self.client.delete(intersection_key)
+                     self.client.zinterstore(intersection_key, keys)
+                     while True:
+                         members = self.client.zrange(intersection_key, 0, 1999)
+                         if not members:
+                             break
+                         for key in keys:
+                             self.client.zrem(key, *members)
+                         self.client.sadd(self.succeed_key, *members)
+                         self.client.zrem(self.spider_key, *members)
+                         self.client.zrem(intersection_key, *members)
+                         print("succeed spider data ...")
+
+                 for key in keys:
+                     self.client.zremrangebyscore(key, min=score, max="(0")
+
+                 while True:
+                     items = self.client.zrangebyscore(self.spider_key, min=score, max="(0", start=0, num=5000, withscores=True)
+                     if not items:
+                         break
+                     reset_items = {}
+                     for value, priority in items:
+                         # the fractional part of a taken seed's score holds
+                         # its original priority; restore it
+                         reset_score = "{:.3f}".format(priority).split(".")[1]
+                         reset_items[value] = int(reset_score)
+                     if reset_items:
+                         self.client.zadd(self.spider_key, mapping=reset_items, xx=True)
+
+                 if not heartbeat:
+                     self.client.setex(self.heartbeat_key, 15, "")
+
+                 self.client.delete(self.check_lock)
+                 time.sleep(3)
+
+     # pass!
+     def set_heartbeat(self, stop):
+         time.sleep(5)
+         while not stop.is_set():
+             self.client.setex(self.heartbeat_key, 5, "")
+             time.sleep(3)
+
+     # # pass!
+     # def heartbeat(self):
+     #     """
+     #     Return the remaining TTL of the heartbeat key.
+     #     """
+     #     return self.client.ttl(self.heartbeat_key)
+
+     # pass!
+     def spider_queue_length(self):
+         return self.client.zcard(self.spider_key)
+
+     # pass!
+     def ready_seed_length(self):
+         return self.client.zcount(self.spider_key, min=0, max="+inf")
+
+     def get_scheduler_lock(self):
+         return self._get_lock(self.scheduler_lock)
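
A hedged sketch of the seed lifecycle against RedisDB (connection details are placeholders; Seed comes from base.bbb elsewhere in the package):

    db = RedisDB("demo", "news_spider", host="localhost", port=6379)

    db.add_seed(["https://example.com/a", "https://example.com/b"])
    print(db.ready_seed_length())   # seeds with score >= 0 are ready

    seeds = db.get_seed(length=10)  # takes seeds, rescoring them negative
    if seeds:
        # once a seed has been fetched and stored successfully:
        db.del_seed(seeds[0], spider_status=True)  # move to the succeed set
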