cobweb-launcher 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,231 +0,0 @@
1
- import json
2
- import random
3
- import time
4
- import redis
5
- from datetime import datetime
6
- from base.bbb import Seed
7
-
8
-
9
class RedisDB:
    """Redis-backed seed bookkeeping for one ``project`` / ``task_name`` pair.

    Every key is prefixed with ``{project}:{task_name}:``:

    * ``seed_info:ready`` (zset)   - seeds waiting to be handed out, scored
      by priority.
    * ``seed_info:spider`` (hash)  - seeds handed out by :meth:`get_seeds`;
      each field is ``{version}_{priority:03d}_{sid}`` and the value is the
      raw seed.
    * ``seed_info:failed`` / ``seed_info:succeed`` (sets) - seeds moved out
      of the spider hash by :meth:`deal_seeds`.
    * ``seed_info:store:<name>`` (set) - ids recorded by
      :meth:`add_store_sid`.
    * ``heartbeat`` and the ``*_lock`` keys (strings) - liveness marker and
      short-lived locks.
    """

    def __init__(
            self,
            project: str,
            task_name: str,
            host=None,
            port=None,
            username=None,
            password=None,
            db=0
    ):
        """Build the key names and the pooled Redis client.

        Args:
            project: First component of every key name.
            task_name: Second component of every key name.
            host, port, username, password: Connection options; an option
                left as ``None`` is not forwarded to redis-py.
            db: Redis database index.
        """
        # Forward only the options the caller supplied. Passing an explicit
        # ``None`` would override redis-py's own defaults (presumably
        # localhost:6379 - verify against the installed redis-py version).
        supplied = {
            "host": host,
            "port": port,
            "username": username,
            "password": password,
        }
        pool = redis.ConnectionPool(
            db=db,
            **{name: value for name, value in supplied.items() if value is not None}
        )
        prefix = f"{project}:{task_name}"
        self.heartbeat_key = f"{prefix}:heartbeat"  # redis type string
        self.ready_key = f"{prefix}:seed_info:ready"  # redis type zset
        self.spider_key = f"{prefix}:seed_info:spider"  # redis type hash
        self.store_key = f"{prefix}:seed_info:store:%s"  # redis type set, %-formatted with a name
        self.failed_key = f"{prefix}:seed_info:failed"  # redis type set
        self.succeed_key = f"{prefix}:seed_info:succeed"  # redis type set
        self.update_lock = f"{prefix}:update_seed_lock"  # redis type string
        self.check_lock = f"{prefix}:check_seed_lock"  # redis type string
        self.scheduler_lock = f"{prefix}:scheduler_lock"  # redis type string
        self.client = redis.Redis(connection_pool=pool)

    def set_heartbeat(self, t=3):
        """Set the heartbeat key's TTL to ``t`` seconds.

        This only calls ``EXPIRE``; it does not create the key.
        """
        self.client.expire(self.heartbeat_key, t)

    def heartbeat(self):
        """Return the TTL Redis reports for the heartbeat key."""
        return self.client.ttl(self.heartbeat_key)

    def iterate_hash(self, key, count=1000, match=None):
        """Yield decoded ``(field, value)`` pairs of the hash at ``key``.

        Args:
            key: Name of the hash to scan.
            count: HSCAN ``COUNT`` hint; a falsy value (``0``/``None``) omits
                the hint, because Redis rejects ``COUNT 0``.
            match: Optional HSCAN ``MATCH`` glob pattern.
        """
        batch_size = count or None
        cursor = 0
        while True:
            # Use HSCAN to walk the hash batch by batch.
            cursor, batch = self.client.hscan(
                key, cursor=cursor, match=match, count=batch_size
            )
            # An empty batch does not mean the scan is over (common with
            # MATCH); only a returned cursor of 0 does.
            for field, value in batch.items():
                yield field.decode(), value.decode()
            if int(cursor) == 0:
                break

    def get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
        """Try to take the lock ``key``.

        Args:
            key: Redis key used as the lock.
            t: Lifetime of the lock in seconds.
            timeout: How long to keep retrying, in seconds.
            sleep_time: Pause between attempts, in seconds.

        Returns:
            ``True`` when the lock was taken, ``False`` otherwise.
        """
        started = time.monotonic()
        while True:
            # SET NX EX creates the lock and its expiry in one command, so a
            # crash between the two steps cannot leave a lock without a TTL.
            if self.client.set(key, "", nx=True, ex=t):
                return True
            if time.monotonic() - started >= timeout:
                break
            time.sleep(sleep_time)

        if self.client.ttl(key) == -1:
            # The lock exists but has no expiry. Re-check a few times and,
            # if it is still in that state, give it an expiry so it cannot
            # block everyone forever.
            still_without_ttl = True
            for _ in range(3):
                if self.client.ttl(key) != -1:
                    still_without_ttl = False
                    break
                time.sleep(0.5)
            if still_without_ttl:
                self.client.expire(key, t)
        else:
            print("ttl: " + str(self.client.ttl(key)))
        return False

    def execute_update(
            self,
            set_info,
            del_info,
            status: int = 0
    ):
        """Move seeds between keys inside one MULTI/EXEC transaction.

        Args:
            set_info: Entries to add to the destination key.
            del_info: Members/fields to remove from the source key.
            status: ``0`` ready zset -> spider hash (``set_info`` is a
                field->value mapping); ``1`` spider hash -> ready zset
                (``set_info`` is a member->score mapping); ``2`` spider hash
                -> failed set; ``3`` spider hash -> succeed set.

        Returns:
            Always ``None``. Unknown ``status`` values and calls where both
            arguments are empty are ignored.
        """
        if status not in (0, 1, 2, 3):
            return None
        if not set_info and not del_info:
            return None

        with self.client.pipeline() as pipe:
            pipe.multi()
            # Redis rejects these commands when given no members, which
            # would abort the whole transaction - queue only non-empty halves.
            if set_info:
                if status == 0:
                    pipe.hset(self.spider_key, mapping=set_info)
                elif status == 1:
                    pipe.zadd(self.ready_key, mapping=set_info)
                elif status == 2:
                    pipe.sadd(self.failed_key, *set_info)
                else:
                    pipe.sadd(self.succeed_key, *set_info)
            if del_info:
                if status == 0:
                    pipe.zrem(self.ready_key, *del_info)
                else:
                    pipe.hdel(self.spider_key, *del_info)
            pipe.execute()
        return None

    @property
    def seed_count(self):
        """Number of members in the ready zset."""
        return self.client.zcard(self.ready_key)

    def deal_seeds(self, sids, status: bool):
        """Move the given seeds out of the spider hash.

        Args:
            sids: One seed id or a list of seed ids.
            status: ``True`` moves the seeds to the failed set, ``False`` to
                the succeed set.
        """
        if isinstance(sids, str):
            sids = [sids]
        target = 2 if status else 3
        del_list, moved = [], set()
        for sid in sids:
            for field, value in self.iterate_hash(self.spider_key, match=f"*{sid}"):
                # maxsplit=2 keeps a sid that itself contains "_" intact.
                _, priority, field_sid = field.split("_", 2)
                if sid != field_sid:
                    continue
                seed = Seed(value, priority=priority)
                del_list.append(field)
                moved.add(seed.format_seed)
        if del_list:
            self.execute_update(moved, del_list, status=target)
        print("retry seeds, sids: {}".format(json.dumps(sids)))

    def set_seeds(self, seeds):
        """Add one ``Seed`` or a list/tuple of seeds to the ready zset.

        Each seed is stored under ``seed.format_seed`` with
        ``seed.priority`` as its score. Nothing is sent to Redis when there
        is nothing to add.
        """
        item_info = {}
        if isinstance(seeds, (list, tuple)):
            for seed in seeds:
                item_info[seed.format_seed] = seed.priority
        elif isinstance(seeds, Seed):
            item_info[seeds.format_seed] = seeds.priority
        if item_info:
            self.client.zadd(self.ready_key, mapping=item_info)

    def get_seeds(self, length: int = 1000):
        """Fetch seeds from Redis and mark them as handed out.

        Takes up to ``length`` members of the ready zset with score >= 0,
        moves them into the spider hash and returns them as ``Seed`` objects.

        Returns:
            A list of ``Seed`` objects (possibly empty), or ``None`` when the
            update lock could not be taken.
        """
        cs = time.time()

        if not self.get_lock(key=self.update_lock):
            return None

        try:
            set_dict, del_list, result = {}, [], []
            # Microsecond timestamp; it becomes the first component of the
            # spider-hash field name.
            version = time.time() * 1e6

            items = self.client.zrangebyscore(
                self.ready_key, min=0, max="+inf", start=0, num=length, withscores=True
            )
            for value, priority in items:
                seed = Seed(value, priority=priority, version=version)
                pty = "{:03d}".format(int(priority))
                set_dict[f"{version}_{pty}_{seed.sid}"] = value
                del_list.append(value)
                result.append(seed)

            print("\nset seeds into queue time: " + str(time.time() - cs))
            if result:
                self.execute_update(set_dict, del_list)
        finally:
            # Release the lock even if Redis or Seed construction raised.
            self.client.delete(self.update_lock)

        print("push seeds into queue time: " + str(time.time() - cs))
        return result

    @staticmethod
    def _version_seconds(version):
        """Convert the version part of a spider-hash field to epoch seconds.

        ``get_seeds`` writes ``time.time() * 1e6`` (a float, microseconds);
        values below 1e12 are taken to be plain epoch seconds already.
        """
        stamp = float(version)
        return stamp / 1e6 if stamp >= 1e12 else stamp

    def check_spider_hash(self):
        """Return seeds from the spider hash to the ready zset.

        While the heartbeat key exists, entries whose version is less than
        600 seconds old are left alone; without a heartbeat every entry is
        returned. Entries are moved in batches of 1000. When no heartbeat
        existed, one is created with a 15 second TTL.

        NOTE(review): the indentation of this method was lost in the source;
        the heartbeat initialisation is kept inside the lock branch - confirm.
        """
        cs = time.time()
        if not self.get_lock(key=self.check_lock, t=60, timeout=600, sleep_time=60):
            return

        set_dict, del_list = {}, []
        count = self.client.hlen(self.spider_key)
        heartbeat = bool(self.client.exists(self.heartbeat_key))
        now = int(time.time())
        for field, value in self.iterate_hash(key=self.spider_key, count=count):
            version, priority, _ = field.split("_", 2)
            if heartbeat and self._version_seconds(version) + 600 > now:
                continue
            set_dict[value] = priority
            del_list.append(field)

            if len(del_list) >= 1000:
                # Extend the lock before each batch so it does not expire
                # in the middle of a long scan.
                self.client.expire(self.check_lock, 60)
                self.execute_update(set_dict, del_list, status=1)
                set_dict, del_list = {}, []

        if set_dict and del_list:
            self.execute_update(set_dict, del_list, status=1)

        # The check lock is not deleted here; it is left to expire.
        print("init seeds time: " + str(time.time() - cs))
        if not heartbeat:
            # Create the key together with its TTL; if it appeared in the
            # meantime, just reset its TTL as before.
            if not self.client.set(self.heartbeat_key, "", nx=True, ex=15):
                self.set_heartbeat(t=15)

    def add_store_sid(self, key, data):
        """Add ``data`` to the store set named by ``key``.

        Args:
            key: Value substituted into the ``seed_info:store:%s`` key name.
            data: One id (``str``/``bytes``) or an iterable of ids; an empty
                iterable is a no-op.
        """
        if isinstance(data, (str, bytes)):
            # A lone string would otherwise be splatted into characters.
            data = [data]
        members = list(data)
        if members:
            self.client.sadd(self.store_key % key, *members)
214
-
215
-
216
if __name__ == "__main__":
    # Scratch output kept from development. It is guarded so that importing
    # this module no longer prints anything.
    current_time = datetime.now()
    # Format the date-time as a digit string (month through microsecond).
    formatted_time = current_time.strftime("%m%d%H%M%S%f")
    c = int(formatted_time)
    print(c)
    d = 200 + 0.9 * random.random()
    print(d)
    print(time.time())
    print(c + d / 1000)
    # Manual calls kept for reference:
    # for _ in range(100):
    #     redis_db.get_seeds(1000)
    # redis_db.get_seeds(1000)
    # redis_db.check_spider_hash()
    # redis_db.retry_seeds(["dc895aee47f8fc39c479f7cac6025879"])
    # "1705996980_200_dc895aee47f8fc39c479f7cac6025879"
231
-
@@ -1,37 +0,0 @@
1
- cobweb/__init__.py,sha256=cuMlo5UdlfMRnMf6TfMHL8-FIw83zCiCUtfOrrG6gog,356
2
- cobweb/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- cobweb/base/bbb.py,sha256=iX2Xn5AukRussL8xHKTGFfgODWTMaFA3JnvQWd02XDQ,5615
4
- cobweb/base/config.py,sha256=qZY26tziCeENdopGyVSTCyd_8B30S37GbWx0-_g7EiY,5357
5
- cobweb/base/decorators.py,sha256=gb2puZLRHv_348ZTIAFLhKuJWKZ6a9_tzduCS8p1-UQ,3089
6
- cobweb/base/hash_table.py,sha256=-EPHRMNOHHbdjxZJUuDxghUfwrbMA05sqrldHAgrIco,1885
7
- cobweb/base/interface.py,sha256=LM6C0eh-d1b2CxjtiHKfP8I3XhhlYQR5r-3MD6TMIc4,1037
8
- cobweb/base/log.py,sha256=Gb3_y4IzTo5pJohTggBCU9rK6-ZN3hgTOHkoXHyN6CU,2384
9
- cobweb/base/queue_tmp.py,sha256=NS4qBHKq2o-R78Jpv5xp7TtOHIMg8y0livilTVK49M8,1527
10
- cobweb/base/request.py,sha256=dHTR7qMHbIIW4ggpTAg4io1TBAYH77teYU4bmcWPXH0,2318
11
- cobweb/base/task.py,sha256=ztgNh4_tgy95pe3REBfMLKkwf7HaShvp-fdRIWJiJXo,1230
12
- cobweb/base/utils.py,sha256=NSSgCBE4u1yTpXZrjg8RIepYedo4ZdM38rhDObVfRhI,325
13
- cobweb/db/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- cobweb/db/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- cobweb/db/base/client_db.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
16
- cobweb/db/base/oss_db.py,sha256=LYRsh26-Fttc6sjc9oOVsCd93jZSja6dtRc8G2Em-1E,3812
17
- cobweb/db/base/redis_db.py,sha256=2-YMrTpiLNGwA7_bt62HYKevVeJr1Y_n88JKjPg1V3s,7636
18
- cobweb/db/base/redis_dbv3.py,sha256=u-Tmexl0nrYVVRCCbxAjcH6fyRx7CP4J7iW4BdO7q98,8354
19
- cobweb/db/scheduler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
- cobweb/db/scheduler/default.py,sha256=m-zzC2cbHGxplEW5OoB9Vj3nJm30Xl0sAQ94wpPb7Yw,122
21
- cobweb/db/scheduler/textfile.py,sha256=EiOxV8h99ouIr2HvmpM9B90QY3hqGNPMeQEnps_RG-c,869
22
- cobweb/db/storer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- cobweb/db/storer/console.py,sha256=QbHnZ3ou0AR6c35iVy2hYEbKlYYTnFaD8-bUODeNt14,210
24
- cobweb/db/storer/loghub.py,sha256=Pb0OwVteIllYjL2cIyBlc1WWau1PHn9K1rlAGrr0M3k,1815
25
- cobweb/db/storer/redis.py,sha256=jK_RirqgSaV4aIWSuySIm5f1ZfZiULqFj2kman2H-Qw,440
26
- cobweb/db/storer/textfile.py,sha256=yAvtbPkScjZ298H25kWsI0MDg2JuI2Im4m2qmPEUNTM,443
27
- cobweb/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
- cobweb/distributed/launcher.py,sha256=IV2jd1hLyt1YyhyN3leSMrtbN5APsmvqblr2NXy18xg,6163
29
- cobweb/distributed/models.py,sha256=7ypYQaiHP91LbPE0u5Lb-9LDazg74UW7KacK4-ai1tM,4438
30
- cobweb/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
- cobweb/single/models.py,sha256=lu8teNWnWcUwZFra8XmqyhzOAf3UyuEztwBr1Ne6pUs,2898
32
- cobweb/single/nest.py,sha256=mL8q9a5BjtoeUyzXCIVw_vyUsNY8ltbvQpYIIpZEDFU,5012
33
- cobweb_launcher-0.0.4.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
34
- cobweb_launcher-0.0.4.dist-info/METADATA,sha256=4JShM9pxXi1aouOY1OFrEuW7yrTM1K35GUJLKQpr78w,1225
35
- cobweb_launcher-0.0.4.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
36
- cobweb_launcher-0.0.4.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
37
- cobweb_launcher-0.0.4.dist-info/RECORD,,
File without changes