cobweb-launcher 0.0.1__py3-none-any.whl

@@ -0,0 +1,60 @@
+ from typing import Iterable
+
+ from pympler import asizeof  # required by used_memory
+ from collections import deque
+
+
+ class Queue:
+
+     def __init__(self):
+         self._seed_queue = deque()
+
+     @property
+     def queue_names(self):
+         return tuple(self.__dict__.keys())
+
+     @property
+     def used_memory(self):
+         return asizeof.asizeof(self)
+
+     def create_queue(self, queue_name: str):
+         self.__setattr__(queue_name, deque())
+
+     def push_seed(self, seed):
+         self.push("_seed_queue", seed)
+
+     def pop_seed(self):
+         return self.pop("_seed_queue")
+
+     def push(self, queue_name: str, data, left: bool = False):
+         try:
+             if not data:
+                 return None
+             queue = self.__getattribute__(queue_name)
+             # str/bytes are iterable; treat them as single items, not character sequences
+             if isinstance(data, Iterable) and not isinstance(data, (str, bytes)):
+                 queue.extendleft(data) if left else queue.extend(data)
+             else:
+                 queue.appendleft(data) if left else queue.append(data)
+         except AttributeError as e:
+             print(e)
+
+     def pop(self, queue_name: str, left: bool = True):
+         try:
+             queue = self.__getattribute__(queue_name)
+             return queue.popleft() if left else queue.pop()
+         except IndexError as e:
+             print(e)
+             return None
+         except AttributeError as e:
+             print(e)
+             return None
+
+
+ # qqueue = Queue()
+ # # qqueue.create_queue("test")
+ # print(qqueue.queue_names)
+ # qqueue.push("task_queue", "key")
+ # print(qqueue.used_memory)
+ # c = qqueue.pop("task_queue")
+ # print(c)
+
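A minimal usage sketch of the Queue above (hypothetical queue and item names; note the module now imports pympler, so used_memory requires it installed):

queue = Queue()
queue.create_queue("task_queue")                 # named deques must be created before use
queue.push("task_queue", ["url_1", "url_2"])     # iterables are extended item by item
queue.push("task_queue", "url_3")                # str/bytes are appended as single items
print(queue.queue_names)                         # ('_seed_queue', 'task_queue')
print(queue.pop("task_queue"))                   # 'url_1' -- FIFO with the default left=True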
cobweb/base/request.py ADDED
@@ -0,0 +1,62 @@
+ import random
+ from typing import Union
+
+ import requests
+
+
+ class Request:
+
+     def __init__(self):
+         pass
+
+
+ def gen_user_agent(platform: str = 'android', redis_client=None):
+     user_agent = ''
+     if platform == 'android':
+         os_version = f'{random.randint(4, 10)}.{random.randint(0, 9)}.{random.randint(0, 9)}'
+         model = (redis_client and redis_client.srandmember('(md)set_android_model').decode()) or ''
+         webkit_version = f'{random.randint(450, 550)}.{random.randint(0, 100)}.{random.randint(0, 100)}'
+         version = f'{random.randint(3, 6)}.{random.randint(0, 9)}.{random.randint(0, 9)}'
+         chrome_version = f'{random.randint(50, 88)}.{random.randint(0, 9)}.{random.randint(1000, 5000)}.{random.randint(0, 1000)}'
+         user_agent = f'Mozilla/5.0 (Linux; U; Android {os_version}; zh-cn; {model} Build/{model}) AppleWebKit/{webkit_version} (KHTML, like Gecko) Version/{version} Chrome/{chrome_version} Mobile Safari/{webkit_version}'
+     elif platform == 'iphone':
+         os_version = f'{random.randint(5, 13)}_{random.randint(0, 9)}_{random.randint(0, 9)}'
+         webkit_version = f'{random.randint(550, 650)}.{random.randint(0, 100)}.{random.randint(0, 100)}'
+         version = f'{random.randint(4, 13)}.{random.randint(0, 9)}.{random.randint(0, 9)}'
+         user_agent = f'Mozilla/5.0 (iPhone; CPU iPhone OS {os_version} like Mac OS X) AppleWebKit/{webkit_version} (KHTML, like Gecko) Version/{version} Mobile Safari/{webkit_version}'
+
+     return user_agent
+
+
+ def config(
+     url,
+     method: str = "GET",
+     headers: dict = None,
+     proxies: dict = None,
+     cookies: dict = None,
+     params: dict = None,
+     timeout: int = None,
+     stream: bool = False,
+     data: Union[dict, str, tuple] = None,
+ ) -> dict:
+     if not headers:
+         headers = {"accept": "*/*", "user-agent": gen_user_agent()}
+
+     elif "user-agent" not in [key.lower() for key in headers.keys()]:
+         headers["user-agent"] = gen_user_agent()
+
+     return {
+         "method": method,
+         "url": url,
+         "data": data,
+         "params": params,
+         "cookies": cookies,
+         "headers": headers,
+         "proxies": proxies,
+         "stream": stream,
+         "timeout": timeout or 3,
+     }
+
+
+ def request(**kwargs):
+     return requests.request(**kwargs)
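A short sketch of how config and request compose (the URL is a placeholder); config injects a random user-agent whenever the caller does not supply one:

kwargs = config("https://example.com", timeout=10)
print(kwargs["headers"]["user-agent"])   # randomly generated Android UA
response = request(**kwargs)             # thin wrapper around requests.request
print(response.status_code)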
cobweb/base/task.py ADDED
@@ -0,0 +1,38 @@
+ from config import info
+
+
+ class Task:
+
+     def __init__(
+         self,
+         project=None,
+         task_name=None,
+         start_seed=None,
+         spider_num=None,
+         # queue_length=None,
+         max_retries=None,
+         scheduler_info=None,
+         storer_info=None,
+         redis_info=None
+     ):
+         """
+
+         :param project:
+         :param task_name:
+         :param start_seed:
+         :param spider_num:
+         # :param queue_length:
+         :param scheduler_info:
+         :param storer_info: Union(list, DataInfo/namedtuple); each element must contain three values (database type, table name, field names)
+         """
+         self.project = project or "test"
+         self.task_name = task_name or "spider"
+         self.start_seed = start_seed
+         self.spider_num = spider_num or 1
+         self.max_retries = max_retries or 5
+         # self.redis_info = RedisInfo(**(redis_info or dict()))
+         self.redis_info = info(redis_info, tag=0)
+         # self.scheduler_info = SchedulerDB.info(scheduler_info)
+         self.scheduler_info = info(scheduler_info, tag=1)
+         # self.storer_info = StorerDB.info(storer_info)
+         self.storer_info = info(storer_info, tag=2)
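A hypothetical Task construction for illustration; the storer_info element follows the docstring's (database type, table name, field names) convention, but the exact shapes accepted by config.info are defined elsewhere in the package:

task = Task(
    project="news",                      # placeholder project name
    task_name="article_spider",          # placeholder task name
    start_seed="https://example.com/start",
    spider_num=4,
    storer_info=[("oss", "news.article", "url,title,content")],  # assumed shape
)
print(task.project, task.spider_num, task.max_retries)  # news 4 5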
cobweb/base/utils.py ADDED
@@ -0,0 +1,15 @@
+ import sys
+
+
+ def struct_table_name(table_name):
+     return table_name.replace(".", "__p__").replace(":", "__c__")
+
+
+ def restore_table_name(table_name):
+     return table_name.replace("__p__", ".").replace("__c__", ":")
+
+
+ def struct_queue_name(db_name, table_name):
+     return sys.intern(f"__{db_name}_{table_name}_queue__")
+
+
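For illustration, struct_table_name and restore_table_name are inverses, and struct_queue_name interns the generated key so repeated lookups share one string object:

name = struct_table_name("news.article:daily")
print(name)                              # news__p__article__c__daily
print(restore_table_name(name))          # news.article:daily
print(struct_queue_name("redis", name))  # __redis_news__p__article__c__daily_queue__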
cobweb/db/__init__.py ADDED
@@ -0,0 +1 @@
+
@@ -0,0 +1,116 @@
+ import oss2
+ from typing import Union
+ from oss2.models import PartInfo
+ from requests import Response
+ from base.log import log
+
+
+ class OssDB:
+
+     def __init__(
+         self,
+         bucket_name,
+         endpoint,
+         access_key,
+         secret_key,
+         chunk_size,
+         min_size
+     ):
+         self.auth = oss2.Auth(
+             access_key_id=access_key,
+             access_key_secret=secret_key
+         )
+         self.bucket = oss2.Bucket(
+             auth=self.auth,
+             endpoint=endpoint,
+             bucket_name=bucket_name
+         )
+         self.chunk_size = chunk_size or 1024 ** 2
+         self.min_size = min_size or 1024
+
+     @staticmethod
+     def format_upload_len(length):
+         if not length:
+             raise ValueError("Length cannot be None or 0")
+
+         # Walk from the largest unit down so the biggest fitting unit wins.
+         units = ["KB", "MB", "GB", "TB"]
+         for i in range(3, -1, -1):
+             num = length / 1024 ** (i + 1)
+             if num >= 1:
+                 return f"{round(num, 2)} {units[i]}"
+         return f"{length} B"
+
+     def assemble(self, ready_data, part_data):
+         upload_data = None
+         ready_data = ready_data + part_data
+         if len(ready_data) >= self.chunk_size:
+             upload_data = ready_data[:self.chunk_size]
+             ready_data = ready_data[self.chunk_size:]
+
+         return ready_data, upload_data
+
+     def iter_data(self, data):
+         if isinstance(data, Response):
+             for part_data in data.iter_content(self.chunk_size):
+                 yield part_data
+         elif isinstance(data, bytes):
+             for i in range(0, len(data), self.chunk_size):
+                 yield data[i:i + self.chunk_size]
+
+     def upload_split(
+         self, oss_path: str,
+         data: Union[bytes, Response],
+         timeout: int = 300,
+     ):
+         parts = []
+         upload_id = None
+         ready_data = b""
+         upload_data_len = 0
+         headers = {"Expires": str(timeout * 1000)}
+         try:
+             upload_id = self.bucket.init_multipart_upload(oss_path).upload_id
+             for part_data in self.iter_data(data):
+                 upload_data_len += len(part_data)
+                 ready_data, upload_data = self.assemble(ready_data, part_data)
+                 if upload_data:
+                     part_index = len(parts) + 1
+                     upload_info = self.bucket.upload_part(
+                         oss_path, upload_id, part_index, upload_data
+                     )
+                     parts.append(PartInfo(part_index, upload_info.etag))
+
+             format_upload = self.format_upload_len(upload_data_len)
+
+             if parts and ready_data:
+                 part_index = len(parts) + 1
+                 upload_info = self.bucket.upload_part(
+                     oss_path, upload_id, part_index, ready_data
+                 )
+                 parts.append(PartInfo(part_index, upload_info.etag))
+                 self.bucket.complete_multipart_upload(
+                     oss_path, upload_id, parts
+                 )
+                 log.info(
+                     f"split upload, file path: {oss_path}"
+                     f", file size: {format_upload}"
+                 )
+
+             elif len(ready_data) > self.min_size:
+                 self.bucket.put_object(oss_path, ready_data, headers)
+                 log.info(
+                     f"upload file, file path: {oss_path}"
+                     f", file size: {format_upload}"
+                 )
+
+             else:
+                 log.info(
+                     f"file size smaller than min size! "
+                     f"file size: {format_upload}"
+                 )
+             status = True
+         except Exception as e:
+             # Abort only if the multipart upload was actually initialized.
+             if upload_id:
+                 self.bucket.abort_multipart_upload(oss_path, upload_id, headers)
+             log.exception("upload file exception: " + str(e))
+             status = False
+
+         return status
+
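A hedged usage sketch of OssDB.upload_split; every credential and name below is a placeholder. A Response fetched with stream=True is consumed chunk by chunk via iter_content, while raw bytes are sliced directly:

import requests

oss = OssDB(
    bucket_name="my-bucket",                  # placeholder
    endpoint="oss-cn-hangzhou.aliyuncs.com",  # placeholder region endpoint
    access_key="<ACCESS_KEY>",
    secret_key="<SECRET_KEY>",
    chunk_size=None,                          # falls back to 1 MiB
    min_size=None,                            # falls back to 1 KiB
)
resp = requests.get("https://example.com/big.bin", stream=True)
print("uploaded:", oss.upload_split("archive/big.bin", resp))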
@@ -0,0 +1,214 @@
+ import time
+ import redis
+ from base.bbb import Seed
+
+
+ class RedisDB:
+
+     def __init__(
+         self,
+         project: str,
+         task_name: str,
+         host=None,
+         port=None,
+         username=None,
+         password=None,
+         db=0
+     ):
+         pool = redis.ConnectionPool(
+             host=host,
+             port=port,
+             username=username,
+             password=password,
+             db=db
+         )
+         self.heartbeat_key = f"{project}:{task_name}:heartbeat"  # redis type string
+         self.spider_key = f"{project}:{task_name}:seed_info:spider"  # redis type zset, .format(priority)
+         self.storer_key = f"{project}:{task_name}:seed_info:storer:%s"  # redis type set,
+         self.failed_key = f"{project}:{task_name}:seed_info:failed"  # redis type set, .format(priority)
+         self.succeed_key = f"{project}:{task_name}:seed_info:succeed"  # redis type set, .format(priority)
+         self.update_lock = f"{project}:{task_name}:update_seed_lock"  # redis type string
+         self.check_lock = f"{project}:{task_name}:check_seed_lock"  # redis type string
+         self.scheduler_lock = f"{project}:{task_name}:scheduler_lock"  # redis type string
+         self.client = redis.Redis(connection_pool=pool)
+
+     # pass!
+     def _get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
+         begin_time = int(time.time())
+         while True:
+             if self.client.setnx(key, ""):
+                 self.client.expire(key, t)
+                 return True
+             if int(time.time()) - begin_time > timeout:
+                 break
+             time.sleep(sleep_time)
+
+         if self.client.ttl(key) == -1:
+             delete_status = True
+             for _ in range(3):
+                 if self.client.ttl(key) != -1:
+                     delete_status = False
+                     break
+                 time.sleep(0.5)
+             if delete_status:
+                 self.client.expire(key, t)
+             return False
+         else:
+             ttl = self.client.ttl(key)
+             print("ttl: " + str(ttl))
+             return False
+
+     # pass!
+     def _deal_seed(self, seeds, is_add: bool):
+         if not seeds:
+             return None
+
+         if not isinstance(seeds, list):
+             seeds = [seeds]
+
+         item_info = dict()
+
+         for seed in seeds:
+             if not isinstance(seed, Seed):
+                 seed = Seed(seed)
+             item_info[seed.format_seed] = seed._priority
+
+         if item_info:
+             self.client.zadd(self.spider_key, mapping=item_info, nx=is_add, xx=not is_add)
+
+     # pass!
+     def add_seed(self, seeds):
+         self._deal_seed(seeds, is_add=True)
+
+     def reset_seed(self, seeds):
+         self._deal_seed(seeds, is_add=False)
+
+     # pass!
+     def del_seed(self, seeds, spider_status: bool = True):
+         if not seeds:
+             return None
+
+         if not isinstance(seeds, list):
+             seeds = [seeds]
+
+         seeds = [seed if isinstance(seed, Seed) else Seed(seed) for seed in seeds]
+
+         if seeds:
+             redis_key = self.succeed_key if spider_status else self.failed_key
+             self.client.sadd(redis_key, *(str(seed) for seed in seeds))
+             self.client.zrem(self.spider_key, *(seed.format_seed for seed in seeds))
+
+     # pass!
+     def set_storer(self, key, seeds):
+         if not seeds:
+             return None
+
+         if not isinstance(seeds, list):
+             seeds = [seeds]
+
+         item_info = dict()
+         score = -int(time.time())
+         for seed in seeds:
+             if not isinstance(seed, Seed):
+                 seed = Seed(seed)
+             item_info[seed.format_seed] = score
+
+         if item_info:
+             self.client.zadd(self.storer_key % key, mapping=item_info)
+             print("zadd storer key", len(item_info.keys()))
+
+     # pass!
+     def get_seed(self, length: int = 200):
+         cs = time.time()
+
+         if self._get_lock(key=self.update_lock):
+
+             update_item, result = {}, []
+
+             version = int(time.time())
+
+             items = self.client.zrangebyscore(self.spider_key, min=0, max="+inf", start=0, num=length, withscores=True)
+
+             for value, priority in items:
+                 score = -(version + int(priority) / 1000)
+                 seed = Seed(value, priority=priority, version=version)
+                 update_item[value] = score
+                 result.append(seed)
+
+             print("\nset seeds into queue time: " + str(time.time() - cs))
+             if result:
+                 self.client.zadd(self.spider_key, mapping=update_item, xx=True)
+
+             self.client.delete(self.update_lock)
+             print("push seeds into queue time: " + str(time.time() - cs))
+             return result
+
+     # pass!
+     def check_spider_queue(self, stop, storer_num):
+         while not stop.is_set():
+             # Try to acquire the check lock (polling every 3s, giving up after 600s);
+             # once acquired, the lock expires after 5s.
+             if self._get_lock(key=self.check_lock, t=5, timeout=600, sleep_time=3):
+                 heartbeat = True if self.client.exists(self.heartbeat_key) else False
+                 # With a live heartbeat, only reclaim seeds taken more than 10 minutes
+                 # ago; otherwise (restart) reset the scores of all taken seeds.
+                 score = -int(time.time()) + 600 if heartbeat else "-inf"
+
+                 keys = self.client.keys(self.storer_key % "*")
+                 if len(keys) == storer_num:
+                     intersection_key = self.storer_key % "intersection"
+                     self.client.delete(intersection_key)
+                     self.client.zinterstore(intersection_key, keys)
+                     while True:
+                         members = self.client.zrange(intersection_key, 0, 1999)
+                         if not members:
+                             break
+                         for key in keys:
+                             self.client.zrem(key, *members)
+                         self.client.sadd(self.succeed_key, *members)
+                         self.client.zrem(self.spider_key, *members)
+                         self.client.zrem(intersection_key, *members)
+                         print("succeed spider data ...")
+
+                     for key in keys:
+                         self.client.zremrangebyscore(key, min=score, max="(0")
+
+                 while True:
+                     items = self.client.zrangebyscore(self.spider_key, min=score, max="(0", start=0, num=5000, withscores=True)
+                     if not items:
+                         break
+                     reset_items = {}
+                     for value, priority in items:
+                         reset_score = "{:.3f}".format(priority).split(".")[1]
+                         reset_items[value] = int(reset_score)
+                     if reset_items:
+                         self.client.zadd(self.spider_key, mapping=reset_items, xx=True)
+
+                 if not heartbeat:
+                     self.client.setex(self.heartbeat_key, 15, "")
+
+                 self.client.delete(self.check_lock)
+             time.sleep(3)
+
+     # pass!
+     def set_heartbeat(self, stop):
+         time.sleep(5)
+         while not stop.is_set():
+             self.client.setex(self.heartbeat_key, 5, "")
+             time.sleep(3)
+
+     # # pass!
+     # def heartbeat(self):
+     #     """
+     #     Return the remaining TTL of the heartbeat key.
+     #     """
+     #     return self.client.ttl(self.heartbeat_key)
+
+     # pass!
+     def spider_queue_length(self):
+         return self.client.zcard(self.spider_key)
+
+     # pass!
+     def ready_seed_length(self):
+         return self.client.zcount(self.spider_key, min=0, max="+inf")
+
+     def get_scheduler_lock(self):
+         return self._get_lock(self.scheduler_lock)
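A sketch of the intended claim/acknowledge cycle, assuming a reachable Redis and that Seed accepts a plain URL string; get_seed flips claimed scores negative so other workers skip them, and del_seed acknowledges each seed into the succeed or failed set:

db = RedisDB("news", "article_spider", host="localhost", port=6379)
db.add_seed(["https://example.com/a", "https://example.com/b"])
print(db.ready_seed_length())            # seeds still waiting (score >= 0)

for seed in db.get_seed(length=100):     # claim a batch under the update lock
    try:
        ...                              # fetch and store the page
        db.del_seed(seed, spider_status=True)    # ack into the succeed set
    except Exception:
        db.del_seed(seed, spider_status=False)   # ack into the failed set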