cobweb-launcher 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- cobweb/__init__.py +6 -8
- cobweb/{base/bbb.py → bbb.py} +2 -2
- cobweb/db/__init__.py +2 -0
- cobweb/db/{base/oss_db.py → oss_db.py} +1 -1
- cobweb/db/{base/redis_db.py → redis_db.py} +3 -13
- cobweb/db/scheduler/default.py +1 -1
- cobweb/db/scheduler/textfile.py +1 -3
- cobweb/db/storer/console.py +1 -2
- cobweb/db/storer/loghub.py +3 -4
- cobweb/db/storer/redis.py +1 -2
- cobweb/db/storer/textfile.py +1 -2
- cobweb/distributed/launcher.py +135 -56
- cobweb/distributed/models.py +1 -3
- cobweb/{base/interface.py → interface.py} +3 -13
- cobweb/task.py +44 -0
- cobweb/utils.py +85 -0
- {cobweb_launcher-0.0.4.dist-info → cobweb_launcher-0.0.6.dist-info}/METADATA +1 -1
- cobweb_launcher-0.0.6.dist-info/RECORD +28 -0
- cobweb/base/__init__.py +0 -0
- cobweb/base/config.py +0 -164
- cobweb/base/decorators.py +0 -95
- cobweb/base/hash_table.py +0 -60
- cobweb/base/queue_tmp.py +0 -60
- cobweb/base/request.py +0 -62
- cobweb/base/task.py +0 -38
- cobweb/base/utils.py +0 -15
- cobweb/db/base/__init__.py +0 -0
- cobweb/db/base/client_db.py +0 -1
- cobweb/db/base/redis_dbv3.py +0 -231
- cobweb_launcher-0.0.4.dist-info/RECORD +0 -37
- /cobweb/{base/log.py → log.py} +0 -0
- {cobweb_launcher-0.0.4.dist-info → cobweb_launcher-0.0.6.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.0.4.dist-info → cobweb_launcher-0.0.6.dist-info}/WHEEL +0 -0
- {cobweb_launcher-0.0.4.dist-info → cobweb_launcher-0.0.6.dist-info}/top_level.txt +0 -0
cobweb/db/base/redis_dbv3.py
DELETED
@@ -1,231 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
import random
|
3
|
-
import time
|
4
|
-
import redis
|
5
|
-
from datetime import datetime
|
6
|
-
from base.bbb import Seed
|
7
|
-
|
8
|
-
|
9
|
-
class RedisDB:
    """Redis-backed seed queue for one ``project``/``task_name`` pair.

    Seeds move between four redis structures:

    * ``ready_key``   (zset) - seeds waiting to be fetched, scored by priority
    * ``spider_key``  (hash) - seeds handed to a worker, keyed by
      ``"{version}_{priority}_{sid}"``
    * ``failed_key``  (set)  - seeds recorded by ``deal_seeds(..., status=True)``
    * ``succeed_key`` (set)  - seeds recorded by ``deal_seeds(..., status=False)``
    """

    def __init__(
            self,
            project: str,
            task_name: str,
            host=None,
            port=None,
            username=None,
            password=None,
            db=0
    ):
        """Create the connection pool/client and derive every key name.

        Args:
            project: first component of every redis key.
            task_name: second component of every redis key.
            host, port, username, password, db: forwarded unchanged to
                ``redis.ConnectionPool``.
        """
        pool = redis.ConnectionPool(
            host=host,
            port=port,
            username=username,
            password=password,
            db=db
        )
        self.heartbeat_key = f"{project}:{task_name}:heartbeat"  # redis type string
        self.ready_key = f"{project}:{task_name}:seed_info:ready"  # redis type zset, .format(priority)
        self.spider_key = f"{project}:{task_name}:seed_info:spider"  # redis type hash, .format(priority)
        self.store_key = f"{project}:{task_name}:seed_info:store:%s"  # redis type set,
        self.failed_key = f"{project}:{task_name}:seed_info:failed"  # redis type set, .format(priority)
        self.succeed_key = f"{project}:{task_name}:seed_info:succeed"  # redis type set, .format(priority)
        self.update_lock = f"{project}:{task_name}:update_seed_lock"  # redis type string
        self.check_lock = f"{project}:{task_name}:check_seed_lock"  # redis type string
        self.scheduler_lock = f"{project}:{task_name}:scheduler_lock"  # redis type string
        self.client = redis.Redis(connection_pool=pool)

    def set_heartbeat(self, t=3):
        """Set the time-to-live of the heartbeat key to ``t`` seconds."""
        self.client.expire(self.heartbeat_key, t)

    def heartbeat(self):
        """Return the TTL redis reports for the heartbeat key."""
        return self.client.ttl(self.heartbeat_key)

    def iterate_hash(self, key, count=1000, match=None):
        """Yield every ``(field, value)`` pair of the hash ``key`` as ``str``.

        Args:
            key: name of the redis hash.
            count: ``COUNT`` hint passed to HSCAN.
            match: optional glob pattern passed to HSCAN as ``MATCH``.

        The scan ends only when the server hands back cursor ``0``. An empty
        page does not end it, because HSCAN may return no elements for a step
        (typical with ``MATCH``) while more data is still to come.
        """
        cursor = 0
        while True:
            # Iterate key/value pairs with the HSCAN command.
            cursor, data = self.client.hscan(key, cursor=cursor, match=match, count=count)
            for field, value in data.items():
                yield field.decode(), value.decode()
            if int(cursor) == 0:
                return

    def get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
        """Try to acquire the lock ``key``; return ``True`` on success.

        Args:
            key: redis key used as the lock.
            t: lifetime of the lock in seconds.
            timeout: how long, in seconds, to keep retrying.
            sleep_time: pause between two attempts, in seconds.

        Returns:
            ``True`` if the lock was taken, ``False`` if ``timeout`` elapsed.
        """
        deadline = time.monotonic() + timeout
        while True:
            # SET NX EX creates the key and its expiry in one command, so a
            # crash can no longer leave behind a lock that never expires.
            if self.client.set(key, "", nx=True, ex=t):
                return True
            if time.monotonic() > deadline:
                break
            time.sleep(sleep_time)

        if self.client.ttl(key) == -1:
            # The lock exists without an expiry. Re-check a few times and, if
            # it still has none, give it one so it eventually frees itself.
            without_expiry = True
            for _ in range(3):
                if self.client.ttl(key) != -1:
                    without_expiry = False
                    break
                time.sleep(0.5)
            if without_expiry:
                self.client.expire(key, t)
        else:
            ttl = self.client.ttl(key)
            print("ttl: " + str(ttl))
        return False

    def execute_update(
            self,
            set_info,
            del_info,
            status: int = 0
    ):
        """Move seeds between structures inside one MULTI/EXEC transaction.

        Args:
            set_info: data to add to the target structure.
            del_info: members/fields to remove from the source structure.
            status: selects the transition:
                0 - add ``set_info`` to the spider hash, remove from ready zset
                1 - add ``set_info`` to the ready zset, remove from spider hash
                2 - add ``set_info`` to the failed set, remove from spider hash
                3 - add ``set_info`` to the succeed set, remove from spider hash

        Returns:
            ``None``; any other ``status`` is ignored without touching redis.
        """
        if status not in [0, 1, 2, 3]:
            return None

        # The context manager resets the pipeline even if a command raises.
        with self.client.pipeline() as pipe:
            pipe.multi()

            if status == 0:
                pipe.hset(self.spider_key, mapping=set_info)
                pipe.zrem(self.ready_key, *del_info)
            elif status == 1:
                pipe.zadd(self.ready_key, mapping=set_info)
                pipe.hdel(self.spider_key, *del_info)
            elif status == 2:
                pipe.sadd(self.failed_key, *set_info)
                pipe.hdel(self.spider_key, *del_info)
            else:
                pipe.sadd(self.succeed_key, *set_info)
                pipe.hdel(self.spider_key, *del_info)
            pipe.execute()

    @property
    def seed_count(self):
        """Number of members currently in the ready zset."""
        return self.client.zcard(self.ready_key)

    def deal_seeds(self, sids, status: bool):
        """Move the seeds identified by ``sids`` out of the spider hash.

        Args:
            sids: one seed id or an iterable of seed ids.
            status: truthy records the seeds in the failed set, falsy in the
                succeed set.
        """
        if isinstance(sids, str):
            sids = [sids]
        status = 2 if status else 3
        del_list, fail_set = [], set()
        for sid in sids:
            for field, value in self.iterate_hash(self.spider_key, match=f"*{sid}"):
                # maxsplit keeps a sid that itself contains "_" in one piece.
                _, priority, _sid = field.split("_", 2)
                # "*{sid}" also matches longer ids ending in sid; skip those.
                if sid != _sid:
                    continue
                seed = Seed(value, priority=priority)
                del_list.append(field)
                fail_set.add(seed.format_seed)
        if del_list:
            self.execute_update(fail_set, del_list, status=status)
        print("retry seeds, sids: {}".format(json.dumps(sids)))

    def set_seeds(self, seeds):
        """Add one ``Seed`` or a list/tuple of seeds to the ready zset.

        Each seed is stored under ``seed.format_seed`` with ``seed.priority``
        as its score. Nothing is sent to redis when there is nothing to add.
        """
        item_info = {}
        if isinstance(seeds, (list, tuple)):
            for seed in seeds:
                item_info[seed.format_seed] = seed.priority
        elif isinstance(seeds, Seed):
            item_info[seeds.format_seed] = seeds.priority
        if item_info:
            self.client.zadd(self.ready_key, mapping=item_info)

    def get_seeds(self, length: int = 1000):
        """Fetch seeds from redis.

        Takes up to ``length`` members from the ready zset, records them in
        the spider hash under ``"{version}_{priority}_{sid}"`` and removes
        them from the ready zset, all while holding ``update_lock``.

        Returns:
            The fetched seeds as a list; empty when the lock could not be
            acquired or nothing is ready.
        """
        cs = time.time()
        result = []

        if not self.get_lock(key=self.update_lock):
            return result

        try:
            set_dict, del_list = {}, []

            # Whole epoch seconds: check_spider_hash parses this component
            # and compares it against a timestamp in seconds.
            version = int(time.time())

            items = self.client.zrangebyscore(
                self.ready_key, min=0, max="+inf", start=0, num=length, withscores=True
            )

            for value, priority in items:
                seed = Seed(value, priority=priority, version=version)
                pty = "{:03d}".format(int(priority))
                key = f"{version}_{pty}_{seed.sid}"
                set_dict[key] = value
                del_list.append(value)
                result.append(seed)

            print("\nset seeds into queue time: " + str(time.time() - cs))
            if result:
                self.execute_update(set_dict, del_list)
        finally:
            # Release the lock even when a redis call above raised.
            self.client.delete(self.update_lock)

        print("push seeds into queue time: " + str(time.time() - cs))
        return result

    @staticmethod
    def _claimed_at(version):
        """Return the version component of a spider-hash field in epoch seconds."""
        stamp = float(version)
        # Fields written while the version was time.time() * 1e6 are in
        # microseconds; plain epoch seconds are far below 1e11.
        if stamp >= 1e11:
            stamp /= 1e6
        return stamp

    def check_spider_hash(self):
        """Return seeds from the spider hash to the ready zset.

        While the heartbeat key exists, only fields whose version is at least
        600 seconds old are moved; without a heartbeat every field is moved.
        Afterwards a missing heartbeat key is created with a 15 second TTL.
        Does nothing if ``check_lock`` cannot be acquired. The lock is not
        deleted here; it is left to expire.
        """
        cs = time.time()
        if not self.get_lock(key=self.check_lock, t=60, timeout=600, sleep_time=60):
            return

        set_dict, del_list = {}, []
        heartbeat = bool(self.client.exists(self.heartbeat_key))
        now = int(time.time())
        # COUNT must be a positive integer, and hlen() is 0 for an empty hash.
        count = max(self.client.hlen(self.spider_key), 1)

        for field, value in self.iterate_hash(key=self.spider_key, count=count):
            version, priority, _ = field.split("_", 2)
            if heartbeat and self._claimed_at(version) + 600 > now:
                continue
            set_dict[value] = priority
            del_list.append(field)

            if len(del_list) >= 1000:
                # Keep the lock alive while flushing a full batch.
                self.client.expire(self.check_lock, 60)
                self.execute_update(set_dict, del_list, status=1)
                set_dict, del_list = {}, []

        if set_dict and del_list:
            self.execute_update(set_dict, del_list, status=1)

        print("init seeds time: " + str(time.time() - cs))
        if not heartbeat:
            self.client.setnx(self.heartbeat_key, "")
            self.set_heartbeat(t=15)

    def add_store_sid(self, key, data):
        """Add every item of ``data`` to the store set named by ``key``.

        ``key`` is substituted into the ``store_key`` template. Nothing is
        sent to redis when ``data`` is empty.
        """
        if not data:
            return
        redis_key = self.store_key % key
        self.client.sadd(redis_key, *data)
|
214
|
-
|
215
|
-
|
216
|
-
# Scratch output that runs whenever this module is imported: it prints a
# timestamp-derived integer, a random offset, the current epoch time and the
# two combined.
current_time = datetime.now()
# Format the date-time string (month, day, hour, minute, second, microsecond).
formatted_time = current_time.strftime("%m%d%H%M%S%f")
c = int(formatted_time)
d = 200 + 0.9 * random.random()
print(c)
print(d)
print(time.time())
print(c + d / 1000)
# Example calls kept for reference (disabled):
# for _ in range(100):
#     redis_db.get_seeds(1000)
#     redis_db.get_seeds(1000)
# redis_db.check_spider_hash()
# redis_db.retry_seeds(["dc895aee47f8fc39c479f7cac6025879"])
# "1705996980_200_dc895aee47f8fc39c479f7cac6025879"
|
231
|
-
|
@@ -1,37 +0,0 @@
|
|
1
|
-
cobweb/__init__.py,sha256=cuMlo5UdlfMRnMf6TfMHL8-FIw83zCiCUtfOrrG6gog,356
|
2
|
-
cobweb/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
cobweb/base/bbb.py,sha256=iX2Xn5AukRussL8xHKTGFfgODWTMaFA3JnvQWd02XDQ,5615
|
4
|
-
cobweb/base/config.py,sha256=qZY26tziCeENdopGyVSTCyd_8B30S37GbWx0-_g7EiY,5357
|
5
|
-
cobweb/base/decorators.py,sha256=gb2puZLRHv_348ZTIAFLhKuJWKZ6a9_tzduCS8p1-UQ,3089
|
6
|
-
cobweb/base/hash_table.py,sha256=-EPHRMNOHHbdjxZJUuDxghUfwrbMA05sqrldHAgrIco,1885
|
7
|
-
cobweb/base/interface.py,sha256=LM6C0eh-d1b2CxjtiHKfP8I3XhhlYQR5r-3MD6TMIc4,1037
|
8
|
-
cobweb/base/log.py,sha256=Gb3_y4IzTo5pJohTggBCU9rK6-ZN3hgTOHkoXHyN6CU,2384
|
9
|
-
cobweb/base/queue_tmp.py,sha256=NS4qBHKq2o-R78Jpv5xp7TtOHIMg8y0livilTVK49M8,1527
|
10
|
-
cobweb/base/request.py,sha256=dHTR7qMHbIIW4ggpTAg4io1TBAYH77teYU4bmcWPXH0,2318
|
11
|
-
cobweb/base/task.py,sha256=ztgNh4_tgy95pe3REBfMLKkwf7HaShvp-fdRIWJiJXo,1230
|
12
|
-
cobweb/base/utils.py,sha256=NSSgCBE4u1yTpXZrjg8RIepYedo4ZdM38rhDObVfRhI,325
|
13
|
-
cobweb/db/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
-
cobweb/db/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
cobweb/db/base/client_db.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
16
|
-
cobweb/db/base/oss_db.py,sha256=LYRsh26-Fttc6sjc9oOVsCd93jZSja6dtRc8G2Em-1E,3812
|
17
|
-
cobweb/db/base/redis_db.py,sha256=2-YMrTpiLNGwA7_bt62HYKevVeJr1Y_n88JKjPg1V3s,7636
|
18
|
-
cobweb/db/base/redis_dbv3.py,sha256=u-Tmexl0nrYVVRCCbxAjcH6fyRx7CP4J7iW4BdO7q98,8354
|
19
|
-
cobweb/db/scheduler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
-
cobweb/db/scheduler/default.py,sha256=m-zzC2cbHGxplEW5OoB9Vj3nJm30Xl0sAQ94wpPb7Yw,122
|
21
|
-
cobweb/db/scheduler/textfile.py,sha256=EiOxV8h99ouIr2HvmpM9B90QY3hqGNPMeQEnps_RG-c,869
|
22
|
-
cobweb/db/storer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
|
-
cobweb/db/storer/console.py,sha256=QbHnZ3ou0AR6c35iVy2hYEbKlYYTnFaD8-bUODeNt14,210
|
24
|
-
cobweb/db/storer/loghub.py,sha256=Pb0OwVteIllYjL2cIyBlc1WWau1PHn9K1rlAGrr0M3k,1815
|
25
|
-
cobweb/db/storer/redis.py,sha256=jK_RirqgSaV4aIWSuySIm5f1ZfZiULqFj2kman2H-Qw,440
|
26
|
-
cobweb/db/storer/textfile.py,sha256=yAvtbPkScjZ298H25kWsI0MDg2JuI2Im4m2qmPEUNTM,443
|
27
|
-
cobweb/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
|
-
cobweb/distributed/launcher.py,sha256=IV2jd1hLyt1YyhyN3leSMrtbN5APsmvqblr2NXy18xg,6163
|
29
|
-
cobweb/distributed/models.py,sha256=7ypYQaiHP91LbPE0u5Lb-9LDazg74UW7KacK4-ai1tM,4438
|
30
|
-
cobweb/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
-
cobweb/single/models.py,sha256=lu8teNWnWcUwZFra8XmqyhzOAf3UyuEztwBr1Ne6pUs,2898
|
32
|
-
cobweb/single/nest.py,sha256=mL8q9a5BjtoeUyzXCIVw_vyUsNY8ltbvQpYIIpZEDFU,5012
|
33
|
-
cobweb_launcher-0.0.4.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
34
|
-
cobweb_launcher-0.0.4.dist-info/METADATA,sha256=4JShM9pxXi1aouOY1OFrEuW7yrTM1K35GUJLKQpr78w,1225
|
35
|
-
cobweb_launcher-0.0.4.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
|
36
|
-
cobweb_launcher-0.0.4.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
|
37
|
-
cobweb_launcher-0.0.4.dist-info/RECORD,,
|
/cobweb/{base/log.py → log.py}
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|