cobweb-launcher 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/__init__.py +6 -8
- cobweb/{base/bbb.py → bbb.py} +2 -2
- cobweb/db/__init__.py +2 -0
- cobweb/db/{base/oss_db.py → oss_db.py} +1 -1
- cobweb/db/{base/redis_db.py → redis_db.py} +3 -13
- cobweb/db/scheduler/default.py +1 -1
- cobweb/db/scheduler/textfile.py +1 -3
- cobweb/db/storer/console.py +1 -2
- cobweb/db/storer/loghub.py +3 -4
- cobweb/db/storer/redis.py +1 -2
- cobweb/db/storer/textfile.py +1 -2
- cobweb/distributed/launcher.py +135 -56
- cobweb/distributed/models.py +1 -3
- cobweb/{base/interface.py → interface.py} +3 -13
- cobweb/task.py +44 -0
- cobweb/utils.py +85 -0
- {cobweb_launcher-0.0.4.dist-info → cobweb_launcher-0.0.6.dist-info}/METADATA +1 -1
- cobweb_launcher-0.0.6.dist-info/RECORD +28 -0
- cobweb/base/__init__.py +0 -0
- cobweb/base/config.py +0 -164
- cobweb/base/decorators.py +0 -95
- cobweb/base/hash_table.py +0 -60
- cobweb/base/queue_tmp.py +0 -60
- cobweb/base/request.py +0 -62
- cobweb/base/task.py +0 -38
- cobweb/base/utils.py +0 -15
- cobweb/db/base/__init__.py +0 -0
- cobweb/db/base/client_db.py +0 -1
- cobweb/db/base/redis_dbv3.py +0 -231
- cobweb_launcher-0.0.4.dist-info/RECORD +0 -37
- /cobweb/{base/log.py → log.py} +0 -0
- {cobweb_launcher-0.0.4.dist-info → cobweb_launcher-0.0.6.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.0.4.dist-info → cobweb_launcher-0.0.6.dist-info}/WHEEL +0 -0
- {cobweb_launcher-0.0.4.dist-info → cobweb_launcher-0.0.6.dist-info}/top_level.txt +0 -0
cobweb/db/base/redis_dbv3.py
DELETED
@@ -1,231 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
import random
|
3
|
-
import time
|
4
|
-
import redis
|
5
|
-
from datetime import datetime
|
6
|
-
from base.bbb import Seed
|
7
|
-
|
8
|
-
|
9
|
-
class RedisDB:
    """Redis-backed seed queue for a single ``project`` / ``task_name`` pair.

    Every key is namespaced as ``{project}:{task_name}:...``:

    * ``seed_info:ready``   - zset of seeds waiting to be handed out,
      scored by seed priority (see :meth:`set_seeds`).
    * ``seed_info:spider``  - hash of seeds handed out by :meth:`get_seeds`;
      each field is ``{version}_{priority}_{sid}`` and the value is the seed.
    * ``seed_info:failed`` / ``seed_info:succeed`` - sets filled by
      :meth:`deal_seeds`.
    * ``seed_info:store:%s`` - per-store sets filled by :meth:`add_store_sid`.
    * ``heartbeat``, ``update_seed_lock``, ``check_seed_lock``,
      ``scheduler_lock`` - plain string keys used as liveness flag / locks.
    """

    # Epoch seconds stay far below this bound; millisecond / microsecond
    # timestamps are far above it (see ``_version_to_seconds``).
    _MAX_EPOCH_SECONDS = 1e11

    def __init__(
            self,
            project: str,
            task_name: str,
            host=None,
            port=None,
            username=None,
            password=None,
            db=0
    ):
        """Build the key names and open a pooled Redis connection."""
        pool = redis.ConnectionPool(
            host=host,
            port=port,
            username=username,
            password=password,
            db=db
        )
        self.heartbeat_key = f"{project}:{task_name}:heartbeat"  # redis type string
        self.ready_key = f"{project}:{task_name}:seed_info:ready"  # redis type zset
        self.spider_key = f"{project}:{task_name}:seed_info:spider"  # redis type hash
        self.store_key = f"{project}:{task_name}:seed_info:store:%s"  # redis type set
        self.failed_key = f"{project}:{task_name}:seed_info:failed"  # redis type set
        self.succeed_key = f"{project}:{task_name}:seed_info:succeed"  # redis type set
        self.update_lock = f"{project}:{task_name}:update_seed_lock"  # redis type string
        self.check_lock = f"{project}:{task_name}:check_seed_lock"  # redis type string
        self.scheduler_lock = f"{project}:{task_name}:scheduler_lock"  # redis type string
        self.client = redis.Redis(connection_pool=pool)

    @staticmethod
    def _version_to_seconds(version):
        """Convert the ``version`` part of a spider-hash field to epoch seconds.

        :meth:`get_seeds` writes the version as ``time.time() * 1e6`` (a float
        rendered as text, e.g. ``"1705996980123456.0"``), which ``int()``
        cannot parse and which is not in seconds.  Second- and
        millisecond-resolution values are accepted too, so fields written in
        another resolution are still compared correctly.

        Raises:
            ValueError: if *version* is not numeric.
        """
        stamp = float(version)
        while stamp > RedisDB._MAX_EPOCH_SECONDS:
            stamp /= 1000
        return stamp

    def set_heartbeat(self, t=3):
        """Set the heartbeat key's TTL to *t* seconds (the key must already exist)."""
        self.client.expire(self.heartbeat_key, t)

    def heartbeat(self):
        """Return the remaining TTL of the heartbeat key as reported by Redis."""
        return self.client.ttl(self.heartbeat_key)

    def iterate_hash(self, key, count=1000, match=None):
        """Yield every ``(field, value)`` pair of the hash at *key* as ``str``.

        Args:
            key: name of the Redis hash.
            count: HSCAN ``COUNT`` hint; values below 1 fall back to the
                server default because Redis rejects them.
            match: optional HSCAN glob pattern applied to field names.

        The scan only stops when Redis returns cursor ``0``.  An empty page
        with a non-zero cursor is normal (especially with ``match``) and must
        not end the iteration early.
        """
        if count is not None and count < 1:
            count = None
        cursor = 0
        while True:
            # Use the HSCAN command to walk the hash page by page.
            cursor, data = self.client.hscan(key, cursor=cursor, match=match, count=count)
            for field, value in data.items():
                yield field.decode(), value.decode()
            if int(cursor) == 0:
                break

    def get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
        """Try to acquire the lock *key*; return ``True`` on success.

        Args:
            key: lock key name.
            t: lock lifetime in seconds.
            timeout: how long to keep retrying, in seconds.
            sleep_time: pause between attempts, in seconds.

        The lock is taken with a single ``SET key "" NX EX t`` so it can never
        be left without an expiry by a crash between two commands.  If the
        lock could not be taken and the existing key has no TTL (for instance
        one written by an older, non-atomic client), an expiry is attached so
        it cannot block forever; the call still returns ``False``.
        """
        deadline = time.monotonic() + timeout
        while True:
            if self.client.set(key, "", nx=True, ex=t):
                return True
            if time.monotonic() > deadline:
                break
            time.sleep(sleep_time)

        if self.client.ttl(key) == -1:
            # Re-check a few times before touching someone else's key: the
            # holder may be about to set the expiry itself.
            for _ in range(3):
                if self.client.ttl(key) != -1:
                    break
                time.sleep(0.5)
            else:
                self.client.expire(key, t)
        else:
            print("ttl: " + str(self.client.ttl(key)))
        return False

    def execute_update(
            self,
            set_info,
            del_info,
            status: int = 0
    ):
        """Move seeds between structures in one MULTI/EXEC transaction.

        Args:
            set_info: items to add to the destination (mapping for status 0/1,
                iterable of members for status 2/3).
            del_info: members/fields to remove from the source.
            status: ``0`` ready zset -> spider hash, ``1`` spider hash -> ready
                zset, ``2`` spider hash -> failed set, ``3`` spider hash ->
                succeed set.  Any other value is ignored.

        Empty ``set_info`` / ``del_info`` are skipped instead of being sent,
        because Redis rejects those commands when given no members.
        """
        if status not in (0, 1, 2, 3):
            return None
        if not set_info and not del_info:
            return None

        pipe = self.client.pipeline()
        pipe.multi()

        if status == 0:
            if set_info:
                pipe.hset(self.spider_key, mapping=set_info)
            if del_info:
                pipe.zrem(self.ready_key, *del_info)
        else:
            if set_info:
                if status == 1:
                    pipe.zadd(self.ready_key, mapping=set_info)
                elif status == 2:
                    pipe.sadd(self.failed_key, *set_info)
                else:
                    pipe.sadd(self.succeed_key, *set_info)
            if del_info:
                pipe.hdel(self.spider_key, *del_info)
        pipe.execute()

    @property
    def seed_count(self):
        """Number of seeds currently in the ready zset."""
        return self.client.zcard(self.ready_key)

    def deal_seeds(self, sids, status: bool):
        """Remove the seeds with the given sid(s) from the spider hash.

        Args:
            sids: one sid string or an iterable of sid strings.
            status: truthy -> matched seeds go to the failed set, falsy -> to
                the succeed set.
                NOTE(review): confirm with callers that this polarity is the
                intended one.
        """
        if isinstance(sids, str):
            sids = [sids]
        status = 2 if status else 3
        del_list, fail_set = [], set()
        for sid in sids:
            for field, value in self.iterate_hash(self.spider_key, match=f"*{sid}"):
                # Bounded split: only the first two "_" separate version and
                # priority; anything after belongs to the sid.
                _, priority, _sid = field.split("_", 2)
                if sid != _sid:
                    continue
                seed = Seed(value, priority=priority)
                del_list.append(field)
                fail_set.add(seed.format_seed)
        if del_list:
            self.execute_update(fail_set, del_list, status=status)
        print("retry seeds, sids: {}".format(json.dumps(sids)))

    def set_seeds(self, seeds):
        """Add one ``Seed`` or a list/tuple of them to the ready zset.

        Each seed is stored as ``seed.format_seed`` scored by
        ``seed.priority``.  An empty list/tuple is a no-op.

        Raises:
            TypeError: if *seeds* is neither a ``Seed`` nor a list/tuple.
        """
        if isinstance(seeds, (list, tuple)):
            batch = seeds
        elif isinstance(seeds, Seed):
            batch = (seeds,)
        else:
            raise TypeError(
                "seeds must be a Seed or a list/tuple of Seed, got "
                + type(seeds).__name__
            )
        item_info = {seed.format_seed: seed.priority for seed in batch}
        if item_info:
            self.client.zadd(self.ready_key, mapping=item_info)

    def get_seeds(self, length: int = 1000):
        """Fetch up to *length* seeds from Redis and mark them as in progress.

        The seeds are read from the ready zset (scores ``0..+inf``), moved to
        the spider hash under ``{version}_{priority}_{sid}`` and returned as
        ``Seed`` objects.

        Returns:
            A list of ``Seed`` (possibly empty), or ``None`` when the update
            lock could not be acquired.
        """
        cs = time.time()

        if not self.get_lock(key=self.update_lock):
            return None

        set_dict, del_list, result = {}, [], []
        try:
            version = time.time() * 1e6

            items = self.client.zrangebyscore(
                self.ready_key, min=0, max="+inf", start=0, num=length, withscores=True
            )

            for value, priority in items:
                seed = Seed(value, priority=priority, version=version)
                pty = "{:03d}".format(int(priority))
                set_dict[f"{version}_{pty}_{seed.sid}"] = value
                del_list.append(value)
                result.append(seed)

            print("\nset seeds into queue time: " + str(time.time() - cs))
            if result:
                self.execute_update(set_dict, del_list)
        finally:
            # Always release the lock, even if Redis or Seed raised above.
            self.client.delete(self.update_lock)

        print("push seeds into queue time: " + str(time.time() - cs))
        return result

    def check_spider_hash(self):
        """Push seeds from the spider hash back into the ready zset.

        While the heartbeat key exists, only entries whose version is at
        least 600 seconds old are requeued; when it does not exist, every
        entry is requeued and the heartbeat key is then created with a
        15-second TTL.  Does nothing if the check lock cannot be acquired.
        The check lock is not deleted afterwards; it is left to expire.
        """
        cs = time.time()
        if not self.get_lock(key=self.check_lock, t=60, timeout=600, sleep_time=60):
            return

        set_dict, del_list = {}, []
        count = self.client.hlen(self.spider_key)
        heartbeat = bool(self.client.exists(self.heartbeat_key))
        now = int(time.time())
        for field, value in self.iterate_hash(key=self.spider_key, count=count):
            version, priority, _ = field.split("_", 2)
            if heartbeat and self._version_to_seconds(version) + 600 > now:
                continue
            set_dict[value] = priority
            del_list.append(field)

            if len(del_list) >= 1000:
                # Flush in batches and extend the lock while still working.
                self.client.expire(self.check_lock, 60)
                self.execute_update(set_dict, del_list, status=1)
                set_dict, del_list = {}, []

        if set_dict and del_list:
            self.execute_update(set_dict, del_list, status=1)

        print("init seeds time: " + str(time.time() - cs))
        if not heartbeat:
            self.client.setnx(self.heartbeat_key, "")
            self.set_heartbeat(t=15)

    def add_store_sid(self, key, data):
        """Add the members in *data* to the store set for *key* (no-op if empty)."""
        redis_key = self.store_key % key
        members = list(data)
        if members:
            self.client.sadd(redis_key, *members)
|
214
|
-
|
215
|
-
|
216
|
-
# Ad-hoc scratch output left at module level: prints a timestamp-derived
# integer, a jittered priority-like float, the current epoch time and their
# combination.
# NOTE(review): this runs on every import of the module - consider moving it
# under an ``if __name__ == "__main__":`` guard or deleting it.
current_time = datetime.now()
# Format the date/time as month, day, hour, minute, second, microsecond digits.
formatted_time = current_time.strftime("%m%d%H%M%S%f")
c = int(formatted_time)
print(c)
d = random.random() * 0.9 + 200
print(d)
print(time.time())
print(d / 1000 + c)
|
225
|
-
# for _ in range(100):
|
226
|
-
# redis_db.get_seeds(1000)
|
227
|
-
# redis_db.get_seeds(1000)
|
228
|
-
# redis_db.check_spider_hash()
|
229
|
-
# redis_db.retry_seeds(["dc895aee47f8fc39c479f7cac6025879"])
|
230
|
-
# "1705996980_200_dc895aee47f8fc39c479f7cac6025879"
|
231
|
-
|
@@ -1,37 +0,0 @@
|
|
1
|
-
cobweb/__init__.py,sha256=cuMlo5UdlfMRnMf6TfMHL8-FIw83zCiCUtfOrrG6gog,356
|
2
|
-
cobweb/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
cobweb/base/bbb.py,sha256=iX2Xn5AukRussL8xHKTGFfgODWTMaFA3JnvQWd02XDQ,5615
|
4
|
-
cobweb/base/config.py,sha256=qZY26tziCeENdopGyVSTCyd_8B30S37GbWx0-_g7EiY,5357
|
5
|
-
cobweb/base/decorators.py,sha256=gb2puZLRHv_348ZTIAFLhKuJWKZ6a9_tzduCS8p1-UQ,3089
|
6
|
-
cobweb/base/hash_table.py,sha256=-EPHRMNOHHbdjxZJUuDxghUfwrbMA05sqrldHAgrIco,1885
|
7
|
-
cobweb/base/interface.py,sha256=LM6C0eh-d1b2CxjtiHKfP8I3XhhlYQR5r-3MD6TMIc4,1037
|
8
|
-
cobweb/base/log.py,sha256=Gb3_y4IzTo5pJohTggBCU9rK6-ZN3hgTOHkoXHyN6CU,2384
|
9
|
-
cobweb/base/queue_tmp.py,sha256=NS4qBHKq2o-R78Jpv5xp7TtOHIMg8y0livilTVK49M8,1527
|
10
|
-
cobweb/base/request.py,sha256=dHTR7qMHbIIW4ggpTAg4io1TBAYH77teYU4bmcWPXH0,2318
|
11
|
-
cobweb/base/task.py,sha256=ztgNh4_tgy95pe3REBfMLKkwf7HaShvp-fdRIWJiJXo,1230
|
12
|
-
cobweb/base/utils.py,sha256=NSSgCBE4u1yTpXZrjg8RIepYedo4ZdM38rhDObVfRhI,325
|
13
|
-
cobweb/db/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
-
cobweb/db/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
cobweb/db/base/client_db.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
16
|
-
cobweb/db/base/oss_db.py,sha256=LYRsh26-Fttc6sjc9oOVsCd93jZSja6dtRc8G2Em-1E,3812
|
17
|
-
cobweb/db/base/redis_db.py,sha256=2-YMrTpiLNGwA7_bt62HYKevVeJr1Y_n88JKjPg1V3s,7636
|
18
|
-
cobweb/db/base/redis_dbv3.py,sha256=u-Tmexl0nrYVVRCCbxAjcH6fyRx7CP4J7iW4BdO7q98,8354
|
19
|
-
cobweb/db/scheduler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
-
cobweb/db/scheduler/default.py,sha256=m-zzC2cbHGxplEW5OoB9Vj3nJm30Xl0sAQ94wpPb7Yw,122
|
21
|
-
cobweb/db/scheduler/textfile.py,sha256=EiOxV8h99ouIr2HvmpM9B90QY3hqGNPMeQEnps_RG-c,869
|
22
|
-
cobweb/db/storer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
|
-
cobweb/db/storer/console.py,sha256=QbHnZ3ou0AR6c35iVy2hYEbKlYYTnFaD8-bUODeNt14,210
|
24
|
-
cobweb/db/storer/loghub.py,sha256=Pb0OwVteIllYjL2cIyBlc1WWau1PHn9K1rlAGrr0M3k,1815
|
25
|
-
cobweb/db/storer/redis.py,sha256=jK_RirqgSaV4aIWSuySIm5f1ZfZiULqFj2kman2H-Qw,440
|
26
|
-
cobweb/db/storer/textfile.py,sha256=yAvtbPkScjZ298H25kWsI0MDg2JuI2Im4m2qmPEUNTM,443
|
27
|
-
cobweb/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
|
-
cobweb/distributed/launcher.py,sha256=IV2jd1hLyt1YyhyN3leSMrtbN5APsmvqblr2NXy18xg,6163
|
29
|
-
cobweb/distributed/models.py,sha256=7ypYQaiHP91LbPE0u5Lb-9LDazg74UW7KacK4-ai1tM,4438
|
30
|
-
cobweb/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
-
cobweb/single/models.py,sha256=lu8teNWnWcUwZFra8XmqyhzOAf3UyuEztwBr1Ne6pUs,2898
|
32
|
-
cobweb/single/nest.py,sha256=mL8q9a5BjtoeUyzXCIVw_vyUsNY8ltbvQpYIIpZEDFU,5012
|
33
|
-
cobweb_launcher-0.0.4.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
34
|
-
cobweb_launcher-0.0.4.dist-info/METADATA,sha256=4JShM9pxXi1aouOY1OFrEuW7yrTM1K35GUJLKQpr78w,1225
|
35
|
-
cobweb_launcher-0.0.4.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
|
36
|
-
cobweb_launcher-0.0.4.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
|
37
|
-
cobweb_launcher-0.0.4.dist-info/RECORD,,
|
/cobweb/{base/log.py → log.py}
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|