cobweb-launcher 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,231 @@
+ import json
+ import random
+ import time
+ import redis
+ from datetime import datetime
+ from base.bbb import Seed
+
+
+ class RedisDB:
+
+     def __init__(
+             self,
+             project: str,
+             task_name: str,
+             # retry_num: int = 3,
+             host=None,
+             port=None,
+             username=None,
+             password=None,
+             db=0
+     ):
+         pool = redis.ConnectionPool(
+             host=host,
+             port=port,
+             username=username,
+             password=password,
+             db=db
+         )
+         self.heartbeat_key = f"{project}:{task_name}:heartbeat"  # redis type string
+         self.ready_key = f"{project}:{task_name}:seed_info:ready"  # redis type zset, .format(priority)
+         self.spider_key = f"{project}:{task_name}:seed_info:spider"  # redis type hash, .format(priority)
+         self.store_key = f"{project}:{task_name}:seed_info:store:%s"  # redis type set
+         self.failed_key = f"{project}:{task_name}:seed_info:failed"  # redis type set, .format(priority)
+         self.succeed_key = f"{project}:{task_name}:seed_info:succeed"  # redis type set, .format(priority)
+         self.update_lock = f"{project}:{task_name}:update_seed_lock"  # redis type string
+         self.check_lock = f"{project}:{task_name}:check_seed_lock"  # redis type string
+         # self.retry_lock = f"{project}:{task_name}:retry_seed_lock"  # redis type string
+         self.scheduler_lock = f"{project}:{task_name}:scheduler_lock"  # redis type string
+         self.client = redis.Redis(connection_pool=pool)
+         # self.retry_num = retry_num
+
+     def set_heartbeat(self, t=3):
+         self.client.expire(self.heartbeat_key, t)
+
+     # @property
+     def heartbeat(self):
+         return self.client.ttl(self.heartbeat_key)
+
+     def iterate_hash(self, key, count=1000, match=None):
+         cursor = "0"
+         while cursor != 0:
+             # iterate key-value pairs with the HSCAN command
+             cursor, data = self.client.hscan(key, cursor=cursor, match=match, count=count)
+             if not data:
+                 # an HSCAN page may be empty while the cursor is still live; keep scanning
+                 continue
+             for field, value in data.items():
+                 yield field.decode(), value.decode()
+
+     def get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
+         begin_time = int(time.time())
+         while True:
+             if self.client.setnx(key, ""):
+                 self.client.expire(key, t)
+                 return True
+             if int(time.time()) - begin_time > timeout:
+                 break
+             time.sleep(sleep_time)
+
+         if self.client.ttl(key) == -1:
+             # the lock exists without a TTL; confirm it stays that way before re-arming it
+             delete_status = True
+             for _ in range(3):
+                 if self.client.ttl(key) != -1:
+                     delete_status = False
+                     break
+                 time.sleep(0.5)
+             if delete_status:
+                 self.client.expire(key, t)
+             return False
+         else:
+             ttl = self.client.ttl(key)
+             print("ttl: " + str(ttl))
+             return False
+
+     def execute_update(
+             self,
+             set_info,
+             del_info,
+             status: int = 0
+     ):
+         if status not in [0, 1, 2, 3]:
+             return None
+
+         pipe = self.client.pipeline()
+         pipe.multi()
+
+         if status == 0:
+             pipe.hset(self.spider_key, mapping=set_info)
+             pipe.zrem(self.ready_key, *del_info)
+         elif status == 1:
+             pipe.zadd(self.ready_key, mapping=set_info)
+             pipe.hdel(self.spider_key, *del_info)
+         elif status == 2:
+             pipe.sadd(self.failed_key, *set_info)
+             pipe.hdel(self.spider_key, *del_info)
+         else:
+             pipe.sadd(self.succeed_key, *set_info)
+             pipe.hdel(self.spider_key, *del_info)
+         pipe.execute()
+
+     @property
+     def seed_count(self):
+         return self.client.zcard(self.ready_key)
+
+     def deal_seeds(self, sids, status: bool):
+         if isinstance(sids, str):
+             sids = [sids]
+         # if self.get_lock(key=self.retry_lock, t=15):
+         status = 2 if status else 3
+         del_list, fail_set = [], set()
+         for sid in sids:
+             for field, value in self.iterate_hash(self.spider_key, match=f"*{sid}"):
+                 _, priority, _sid = field.split("_")
+                 if sid != _sid:
+                     continue
+                 seed = Seed(value, priority=priority)
+                 del_list.append(field)
+                 fail_set.add(seed.format_seed)
+         if del_list:
+             self.execute_update(fail_set, del_list, status=status)
+             # self.client.delete(self.retry_lock)
+         print("deal seeds, sids: {}".format(json.dumps(sids)))
+
+     def set_seeds(self, seeds):
+         item_info = {}
+         if isinstance(seeds, (list, tuple)):
+             for seed in seeds:
+                 item_info[seed.format_seed] = seed.priority
+         elif isinstance(seeds, Seed):
+             item_info[seeds.format_seed] = seeds.priority
+         if item_info:
+             self.client.zadd(self.ready_key, mapping=item_info)
+
+     def get_seeds(self, length: int = 1000):
+         """
+         fetch seeds from redis
+         """
+         cs = time.time()
+
+         if self.get_lock(key=self.update_lock):
+
+             set_dict, del_list, result = {}, [], []
+
+             # version = int(time.time() * 1e3)
+             version = time.time() * 1e6
+
+             items = self.client.zrangebyscore(self.ready_key, min=0, max="+inf", start=0, num=length, withscores=True)
+
+             for value, priority in items:
+                 seed = Seed(value, priority=priority, version=version)
+                 pty = "{:03d}".format(int(priority))
+                 key = f"{version}_{pty}_{seed.sid}"
+                 set_dict[key] = value
+                 del_list.append(value)
+                 result.append(seed)
+
+             print("\nset seeds into queue time: " + str(time.time() - cs))
+             if result:
+                 self.execute_update(set_dict, del_list)
+
+             self.client.delete(self.update_lock)
+             print("push seeds into queue time: " + str(time.time() - cs))
+             return result
+
+     def check_spider_hash(self):
+         cs = time.time()
+         set_dict, del_list, heartbeat = {}, [], False
+         if self.get_lock(key=self.check_lock, t=60, timeout=600, sleep_time=60):
+             count = self.client.hlen(self.spider_key)
+             if self.client.exists(self.heartbeat_key):
+                 heartbeat = True
+             now = int(time.time())
+             for field, value in self.iterate_hash(key=self.spider_key, count=count):
+                 version, priority, sid = field.split("_")
+                 # NOTE: get_seeds writes version as time.time() * 1e6; this 600-second
+                 # window only works as intended if version is written in seconds
+                 if heartbeat and int(version) + 600 > now:
+                     continue
+                 set_dict[value] = priority
+                 del_list.append(field)
+
+                 if len(del_list) >= 1000:
+                     self.client.expire(self.check_lock, 60)
+                     self.execute_update(set_dict, del_list, status=1)
+                     set_dict, del_list = {}, []
+
+             if set_dict and del_list:
+                 self.execute_update(set_dict, del_list, status=1)
+
+             # self.client.delete(self.check_lock)
+             print("init seeds time: " + str(time.time() - cs))
+             if not heartbeat:
+                 self.client.setnx(self.heartbeat_key, "")
+                 self.set_heartbeat(t=15)
+
211
+ def add_store_sid(self, key, data):
212
+ redis_key = self.store_key % key
213
+ self.client.sadd(redis_key, *data)
214
+
215
+
216
+ current_time = datetime.now()
217
+ # 格式化日期时间字符串
218
+ formatted_time = current_time.strftime("%m%d%H%M%S%f")
219
+ c = int(formatted_time)
220
+ print(c)
221
+ d = 200 + 0.9 * random.random()
222
+ print(d)
223
+ print(time.time())
224
+ print(c + d / 1000)
225
+ # for _ in range(100):
226
+ # redis_db.get_seeds(1000)
227
+ # redis_db.get_seeds(1000)
228
+ # redis_db.check_spider_hash()
229
+ # redis_db.retry_seeds(["dc895aee47f8fc39c479f7cac6025879"])
230
+ # "1705996980_200_dc895aee47f8fc39c479f7cac6025879"
231
+
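For orientation, a minimal usage sketch of the RedisDB class above. The connection parameters and the seed payload are hypothetical, and Seed's constructor and attributes (priority, sid, format_seed) are assumed from the calls in this file:

    from base.bbb import Seed
    from db.base.redis_db import RedisDB  # import path as used elsewhere in the package

    db = RedisDB("demo_project", "demo_task", host="127.0.0.1", port=6379)  # hypothetical
    db.set_seeds([Seed('{"url": "https://example.com"}', priority=200)])    # payload shape assumed
    batch = db.get_seeds(length=100) or []  # moves seeds from the ready zset into the spider hash
    for seed in batch:
        db.deal_seeds(seed.sid, status=False)  # False marks the seed succeed (status 3)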
File without changes
@@ -0,0 +1,8 @@
+ from base.interface import SchedulerInterface
+
+
+ class Default(SchedulerInterface):
+
+     def schedule(self):
+         pass
+
@@ -0,0 +1,29 @@
+ from base.log import log
+ from base.bbb import Seed
+ from base.interface import SchedulerInterface
+
+
+ class Textfile(SchedulerInterface):
+
+     index = None
+
+     def schedule(self):
+         try:
+             seeds = []
+             with open(self.table, "r") as fp:
+                 fp.seek(self.index or 0, 0)
+                 for _ in range(self.length):
+                     data = fp.readline().strip()
+                     if not data:
+                         log.info("scheduler end!")
+                         self.stop = True
+                         break
+                     seeds.append(Seed(data))
+                 self.index = fp.tell()
+             return seeds
+         except FileNotFoundError:
+             log.error("task table not found!")
+             return None
+         except TypeError:
+             log.error("task table type error!")
+             return None
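Textfile.schedule resumes by byte offset: fp.tell() after each batch is saved in self.index and passed to fp.seek() on the next call. A standalone sketch of the same pattern, without the package's interfaces (the seed file name is a placeholder):

    # batch reading with resume-by-offset, the pattern Textfile.schedule relies on
    def read_batch(path, offset, length):
        with open(path, "r") as fp:
            fp.seek(offset or 0, 0)
            lines = []
            for _ in range(length):
                line = fp.readline().strip()
                if not line:
                    break  # end of file
                lines.append(line)
            return lines, fp.tell()  # next offset to resume from

    batch, offset = read_batch("seeds.txt", 0, 100)  # hypothetical seed file, one seed per line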
File without changes
@@ -0,0 +1,10 @@
+ from base.log import log
+ from base.interface import StorerInterface
+
+
+ class Console(StorerInterface):
+
+     def store(self, data_list):
+         for item in data_list:
+             log.info(f"item info: {item}")
+
@@ -0,0 +1,55 @@
+ import json
+ from base.log import log
+ from base.interface import StorerInterface
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
+
+
+ class Loghub(StorerInterface):
+
+     def __init__(self, table, fields, length, queue, config):
+         super().__init__(table, fields, length, queue, config)
+         self.client = None
+
+     def init_loghub_client(self):
+         try:
+             self.client = LogClient(
+                 self.config['endpoint'],
+                 self.config['access_key_id'],
+                 self.config['access_key']
+             )
+         except Exception as e:
+             log.exception(e)
+             self.client = None
+             return False
+
+     def store(self, data_list):
+         try:
+             if not self.client:
+                 self.init_loghub_client()
+
+             log_items = list()
+             for item in data_list:
+                 temp = item._asdict()
+                 for key, value in temp.items():
+                     if isinstance(value, str):
+                         temp[key] = value
+                     else:
+                         temp[key] = json.dumps(value, ensure_ascii=False)
+                 log_item = LogItem()
+                 contents = sorted(temp.items())  # dict to list of tuples
+                 log_item.set_contents(contents)
+                 log_items.append(log_item)
+             request = PutLogsRequest(
+                 project=self.config["project"],
+                 logstore=self.table,
+                 topic=self.config["topic"],
+                 source=self.config.get("source"),
+                 logitems=log_items,
+                 compress=True
+             )
+             self.client.put_logs(request=request)
+             log.info(f"save data, data length: {len(data_list)}")
+             return True
+         except Exception as e:
+             log.exception(e)
+             return False
+
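The Loghub storer reads its connection details from self.config. Collecting the keys accessed above gives a config shape like the following; the values are placeholders, only the key names come from the code:

    loghub_config = {
        "endpoint": "cn-hangzhou.log.aliyuncs.com",  # placeholder region endpoint
        "access_key_id": "<access-key-id>",
        "access_key": "<access-key-secret>",
        "project": "my-log-project",
        "topic": "crawler",
        "source": None,  # optional; read with .get()
    }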
@@ -0,0 +1,16 @@
+ from base.log import log
+ from base.interface import StorerInterface
+
+
+ class Redis(StorerInterface):
+
+     # NOTE: despite the name, this storer appends to a local file,
+     # mirroring the Textfile storer below
+     def store(self, data_list):
+         try:
+             data_str = "\n".join(str(data) for data in data_list)
+             with open(self.table, "a") as fp:
+                 fp.write(data_str + "\n")  # trailing newline so batches don't run together
+             log.info(f"save data, data length: {len(data_list)}")
+             return True
+         except Exception as e:
+             log.exception(e)
+             return False
+
@@ -0,0 +1,16 @@
+ from base.log import log
+ from base.interface import StorerInterface
+
+
+ class Textfile(StorerInterface):
+
+     def store(self, data_list):
+         try:
+             data_str = "\n".join(str(data) for data in data_list)
+             with open(self.table, "a") as fp:
+                 fp.write(data_str + "\n")  # trailing newline so batches don't run together
+             log.info(f"save data, data length: {len(data_list)}")
+             return True
+         except Exception as e:
+             log.exception(e)
+             return False
+
File without changes
@@ -0,0 +1,194 @@
+ import time
+ import threading
+ from threading import Thread
+ from base.log import log
+ from db.base.redis_db import RedisDB
+ from base.bbb import Queue, Seed, DBItem
+ from base.utils import struct_queue_name, restore_table_name
+ from models import Scheduler, Spider, Storer
+
+
+ def start_seeds(seeds):
+     if not seeds:
+         return None
+     if isinstance(seeds, (list, tuple)):
+         return [Seed(seed) for seed in seeds]
+     elif isinstance(seeds, (str, dict)):
+         return Seed(seeds)
+
+
+ def parse_storer_info(storer_info):
+     storer_data = {}
+     storer_info_list = []
+     if storer_info.__class__.__name__ == 'StorerInfo':
+         storer_info_list.append(storer_info)
+     elif isinstance(storer_info, (list, tuple)):
+         storer_info_list = storer_info
+     for info in storer_info_list:
+         db_name = info.DB.__name__
+         storer_data.setdefault(db_name, {"StorerDB": info.DB, "db_args_list": []})
+         storer_data[db_name]["db_args_list"].append(info[1:])
+     return storer_data
+
+
+ def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
+     time.sleep(5)
+     while True:
+         if (
+             scheduler.stop and
+             not ready_seed_length() and
+             not scheduler.queue.length and
+             not spider.spider_in_progress.length
+         ):
+             log.info("spider is done?")
+             last.set()
+             time.sleep(5)
+             storer_queue_empty = True
+             for storer in storer_list:
+                 if storer.queue.length:
+                     storer_queue_empty = False
+                     break
+             if storer_queue_empty and not spider_queue_length():
+                 log.info("spider done!")
+                 break
+             last.clear()
+         time.sleep(3)
+     stop.set()
+
+
+ def launcher(task):
+     """
+     task launch decorator
+     :param task: task configuration
+     """
+     def decorator(func):
+         """
+         Item:
+             Textfile()
+             Loghub()
+             Console()
+         e.g.
+             task.fields = "a,b"
+             func(item, seed)
+                 a = "a"
+                 b = "b"
+                 data = {"a": "a", "b": "b"}
+                 yield item.Loghub(**data)
+                 yield item.Loghub(a=a, b=b)
+         """
+         storer_list = []
+
+         # event: the whole run has finished
+         last = threading.Event()
+         # event: stop crawling
+         stop = threading.Event()
+
+         # initialize the redis connection
+         redis_db = RedisDB(task.project, task.task_name, *task.redis_info)
+
+         log.info("initializing cobweb!")
+
+         seed_queue = Queue()
+
+         # dynamically subclass the scheduler
+         SchedulerDB, table, sql, length, size, config = task.scheduler_info
+         SchedulerTmp = type(SchedulerDB.__name__, (Scheduler, SchedulerDB), {})
+
+         # initialize the scheduler
+         scheduler = SchedulerTmp(table, sql, length, size, seed_queue, config)
+
+         # initialize the spider
+         spider = Spider(seed_queue, task.max_retries)
+
+         # parse storer info
+         storer_data = parse_storer_info(task.storer_info)
+
+         # new item
+         item = type("Item", (object,), {"redis_client": redis_db})()
+         for db_name in storer_data.keys():
+             # dynamically subclass the storer
+             StorerDB = storer_data[db_name]["StorerDB"]
+             StorerTmp = type(StorerDB.__name__, (Storer, StorerDB), {})
+             db_args_list = storer_data[db_name]["db_args_list"]
+             for storer_db_args in db_args_list:
+                 table, fields, length, config = storer_db_args
+                 if not getattr(item, db_name, None):
+                     instance = type(db_name, (DBItem,), {})
+                     setattr(item, db_name, instance)
+                 # create the storer item and its storage queue
+                 storer_item_instance = getattr(item, db_name)
+                 storer_item_instance.init_item(table, fields)
+
+                 storer_queue = struct_queue_name(db_name, table)
+                 queue = getattr(storer_item_instance, storer_queue)
+                 # initialize the storer
+                 table_name = restore_table_name(table_name=table)
+                 storer = StorerTmp(table_name, fields, length, queue, config)
+                 storer_list.append(storer)
+
+         Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
+         Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
+
+         # push the initial seeds
+         seeds = start_seeds(task.start_seed)
+         redis_db.add_seed(seeds)
+         # start the scheduler: schedule seeds into the redis queue
+         Thread(
+             # name="xxxx_schedule_seeds",
+             target=scheduler.schedule_seed,
+             args=(
+                 redis_db.ready_seed_length,
+                 redis_db.get_scheduler_lock,
+                 redis_db.add_seed
+             )
+         ).start()
+
+         # start the scheduler: schedule the task queue
+         Thread(
+             # name="xxxx_schedule_task",
+             target=scheduler.schedule_task,
+             args=(
+                 stop, redis_db.get_seed,
+                 redis_db.ready_seed_length
+             )
+         ).start()
+
+         # start the spiders
+         for index in range(task.spider_num):
+             Thread(
+                 # name=f"xxxx_spider_task:{index}",
+                 target=spider.spider_task,
+                 args=(
+                     stop, func, item,
+                     redis_db.del_seed
+                 )
+             ).start()
+
+         # start the storers
+         for storer in storer_list:
+             Thread(
+                 # name=f"xxxx_store_task:{storer.table}",
+                 target=storer.store_task,
+                 args=(
+                     stop, last,
+                     redis_db.reset_seed,
+                     redis_db.set_storer
+                 )
+             ).start()
+
+         Thread(
+             # name="check_spider",
+             target=check,
+             args=(
+                 stop, last, spider,
+                 scheduler, storer_list,
+                 redis_db.ready_seed_length,
+                 redis_db.spider_queue_length,
+             )
+         ).start()
+
+     return decorator
+
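Putting the pieces together, a hypothetical task definition for the launcher decorator above. The task object and the attributes launcher() reads from it (project, task_name, redis_info, scheduler_info, storer_info, start_seed, spider_num, max_retries) are not defined in this diff, so everything here is an assumed sketch; the yield pattern follows the decorator's own docstring:

    @launcher(task)  # task: hypothetical config object carrying the attributes listed above
    def crawl(item, seed):
        # fetch seed, build a record whose keys match task.fields ("a,b"),
        # and yield it to a storer queue
        data = {"a": "value-a", "b": "value-b"}
        yield item.Console(**data)  # route the record to the Console storer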