cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.42__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- cobweb/__init__.py +2 -11
- cobweb/base/__init__.py +9 -0
- cobweb/base/basic.py +297 -0
- cobweb/base/common_queue.py +30 -0
- cobweb/base/decorators.py +40 -0
- cobweb/base/dotting.py +35 -0
- cobweb/base/item.py +46 -0
- cobweb/{log.py → base/log.py} +4 -6
- cobweb/base/request.py +82 -0
- cobweb/base/response.py +23 -0
- cobweb/base/seed.py +114 -0
- cobweb/constant.py +94 -0
- cobweb/crawlers/__init__.py +1 -0
- cobweb/crawlers/base_crawler.py +144 -0
- cobweb/crawlers/crawler.py +212 -0
- cobweb/crawlers/file_crawler.py +98 -0
- cobweb/db/__init__.py +2 -2
- cobweb/db/api_db.py +82 -0
- cobweb/db/redis_db.py +125 -218
- cobweb/exceptions/__init__.py +1 -0
- cobweb/exceptions/oss_db_exception.py +28 -0
- cobweb/launchers/__init__.py +3 -0
- cobweb/launchers/launcher.py +235 -0
- cobweb/launchers/launcher_air.py +88 -0
- cobweb/launchers/launcher_api.py +209 -0
- cobweb/launchers/launcher_pro.py +208 -0
- cobweb/pipelines/__init__.py +3 -0
- cobweb/pipelines/pipeline.py +69 -0
- cobweb/pipelines/pipeline_console.py +22 -0
- cobweb/pipelines/pipeline_loghub.py +34 -0
- cobweb/schedulers/__init__.py +3 -0
- cobweb/schedulers/scheduler_api.py +72 -0
- cobweb/schedulers/scheduler_redis.py +72 -0
- cobweb/setting.py +67 -6
- cobweb/utils/__init__.py +5 -0
- cobweb/utils/bloom.py +58 -0
- cobweb/utils/dotting.py +32 -0
- cobweb/utils/oss.py +94 -0
- cobweb/utils/tools.py +42 -0
- cobweb_launcher-1.2.42.dist-info/METADATA +205 -0
- cobweb_launcher-1.2.42.dist-info/RECORD +44 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/WHEEL +1 -1
- cobweb/bbb.py +0 -191
- cobweb/db/oss_db.py +0 -127
- cobweb/db/scheduler/__init__.py +0 -0
- cobweb/db/scheduler/default.py +0 -8
- cobweb/db/scheduler/textfile.py +0 -27
- cobweb/db/storer/__init__.py +0 -0
- cobweb/db/storer/console.py +0 -9
- cobweb/db/storer/loghub.py +0 -54
- cobweb/db/storer/redis.py +0 -15
- cobweb/db/storer/textfile.py +0 -15
- cobweb/decorators.py +0 -16
- cobweb/distributed/__init__.py +0 -0
- cobweb/distributed/launcher.py +0 -243
- cobweb/distributed/models.py +0 -143
- cobweb/interface.py +0 -34
- cobweb/single/__init__.py +0 -0
- cobweb/single/launcher.py +0 -231
- cobweb/single/models.py +0 -134
- cobweb/single/nest.py +0 -153
- cobweb/task.py +0 -50
- cobweb/utils.py +0 -90
- cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
- cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/top_level.txt +0 -0
cobweb/db/api_db.py
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
import os
|
2
|
+
import json
|
3
|
+
import requests
|
4
|
+
|
5
|
+
|
6
|
+
class ApiDB:
    """HTTP client that mirrors a small set of Redis-style commands.

    Each public method calls one endpoint below ``self.host`` and returns the
    ``"data"`` field of the JSON reply. Any failure while sending the request,
    decoding the reply or reading ``"data"`` yields ``None`` instead of an
    exception.
    """

    # Seconds to wait for the service when the caller supplies no ``timeout``.
    # Without a timeout ``requests`` may block indefinitely on a dead peer.
    DEFAULT_TIMEOUT = 30

    def __init__(self, host=None, **kwargs):
        """Store the base URL and the per-request timeout.

        :param host: base URL of the API; when falsy, the ``REDIS_API_HOST``
            environment variable is used, then ``http://127.0.0.1:4396``.
        :param kwargs: an optional ``timeout`` (seconds) applied to every
            request; any other keyword is accepted and ignored.
        """
        self.host = host or os.getenv("REDIS_API_HOST", "http://127.0.0.1:4396")
        self.timeout = kwargs.get("timeout", self.DEFAULT_TIMEOUT)

    def _get_response(self, api, params: dict = None):
        """Send a GET to ``self.host + api`` and return the reply's ``"data"``.

        Returns ``None`` on any error (network, timeout, bad JSON, no "data").
        """
        try:
            response = requests.get(self.host + api, params=params, timeout=self.timeout)
            try:
                return response.json()["data"]
            finally:
                # Release the connection even when decoding the body fails.
                response.close()
        except Exception:
            # Deliberate best effort: callers receive None on failure, while
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return None

    def _post_response(self, api, params: dict = None, data: dict = None):
        """POST ``data`` as a JSON body to ``self.host + api``.

        Returns the reply's ``"data"`` field, or ``None`` on any error
        (including a payload that cannot be serialised to JSON).
        """
        try:
            response = requests.post(
                self.host + api,
                headers={"Content-Type": "application/json"},
                params=params,
                data=json.dumps(data),
                timeout=self.timeout,
            )
            try:
                return response.json()["data"]
            finally:
                # Release the connection even when decoding the body fails.
                response.close()
        except Exception:
            # Same best-effort contract as _get_response.
            return None

    def get(self, name):
        """GET ``/get`` with ``name``."""
        return self._get_response(api="/get", params=dict(name=name))

    def setnx(self, name, value=""):
        """GET ``/setnx`` with ``name`` and ``value``."""
        return self._get_response(api="/setnx", params=dict(name=name, value=value))

    def setex(self, name, t, value=""):
        """GET ``/setex`` with ``name``, ``value`` and ``t``."""
        return self._get_response(api="/setex", params=dict(name=name, value=value, t=t))

    def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
        """GET ``/expire`` with ``name``, ``t`` and the four flag values."""
        return self._get_response(api="/expire", params=dict(name=name, t=t, nx=nx, xx=xx, gt=gt, lt=lt))

    def ttl(self, name):
        """GET ``/ttl`` with ``name``."""
        return self._get_response(api="/ttl", params=dict(name=name))

    def delete(self, name):
        """GET ``/delete`` with ``name``."""
        return self._get_response(api="/delete", params=dict(name=name))

    def exists(self, name):
        """GET ``/exists`` with ``name``."""
        return self._get_response(api="/exists", params=dict(name=name))

    def incrby(self, name, value):
        """GET ``/incrby`` with ``name`` and ``value``."""
        return self._get_response(api="/incrby", params=dict(name=name, value=value))

    def zcard(self, name) -> bool:
        """GET ``/zcard`` with ``name``.

        NOTE(review): the ``bool`` annotation is kept from the original; the
        value is whatever the service puts in "data" (``None`` on failure).
        """
        return self._get_response(api="/zcard", params=dict(name=name))

    def zadd(self, name, item: dict, **kwargs):
        """POST ``/zadd`` with ``name``, ``mapping=item`` and extra ``kwargs``."""
        return self._post_response(api="/zadd", data=dict(name=name, mapping=item, **kwargs))

    def zrem(self, name, *values):
        """POST ``/zrem`` with ``name`` and the given ``values``."""
        return self._post_response(api="/zrem", data=dict(name=name, values=values))

    def zcount(self, name, _min, _max):
        """GET ``/zcount`` with ``name``, ``min`` and ``max``."""
        return self._get_response(api="/zcount", params=dict(name=name, min=_min, max=_max))

    def lock(self, name, t=15) -> bool:
        """GET ``/lock`` with ``name`` and ``t``; ``None`` on failure."""
        return self._get_response(api="/lock", params=dict(name=name, t=t))

    def auto_incr(self, name, t=15, limit=1000) -> bool:
        """GET ``/auto_incr`` with ``name``, ``t`` and ``limit``; ``None`` on failure."""
        return self._get_response(api="/auto_incr", params=dict(name=name, t=t, limit=limit))

    def members(self, name, score, start=0, count=5000, _min="-inf", _max="+inf"):
        """GET ``/members`` with ``name``, ``score``, ``start``, ``count``, ``min`` and ``max``."""
        return self._get_response(
            api="/members",
            params=dict(name=name, score=score, start=start, count=count, min=_min, max=_max),
        )

    def done(self, name: list, *values):
        """POST ``/done`` with ``name`` and the given ``values``."""
        return self._post_response(api="/done", data=dict(name=name, values=values))
|
79
|
+
|
80
|
+
|
81
|
+
|
82
|
+
|
cobweb/db/redis_db.py
CHANGED
@@ -1,223 +1,130 @@
|
|
1
|
-
import time
|
2
1
|
import redis
|
3
|
-
from cobweb import
|
4
|
-
from cobweb.decorators import check_redis_status
|
2
|
+
from cobweb import setting
|
5
3
|
|
6
4
|
|
7
5
|
class RedisDB:
|
8
6
|
|
9
|
-
def __init__(
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
):
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
self.
|
22
|
-
|
23
|
-
|
24
|
-
self.
|
25
|
-
|
26
|
-
|
27
|
-
self.
|
28
|
-
|
29
|
-
|
30
|
-
self.
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
def
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
if
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
else
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
version = int(time.time())
|
134
|
-
|
135
|
-
items = self.client.zrangebyscore(self.spider_key, min=0, max="+inf", start=0, num=length, withscores=True)
|
136
|
-
|
137
|
-
for value, priority in items:
|
138
|
-
score = -(version + int(priority) / 1000)
|
139
|
-
seed = Seed(value, priority=priority, version=version)
|
140
|
-
update_item[value] = score
|
141
|
-
result.append(seed)
|
142
|
-
|
143
|
-
log.info("set seeds into queue time: " + str(time.time() - cs))
|
144
|
-
if result:
|
145
|
-
self.client.zadd(self.spider_key, mapping=update_item, xx=True)
|
146
|
-
|
147
|
-
self.client.delete(self.update_lock)
|
148
|
-
log.info("push seeds into queue time: " + str(time.time() - cs))
|
149
|
-
return result
|
150
|
-
|
151
|
-
@check_redis_status
|
152
|
-
def check_spider_queue(self, stop, storer_num):
|
153
|
-
while not stop.is_set():
|
154
|
-
# 每15s获取check锁,等待600s后仍获取不到锁则重试;获取到锁后,设置锁的存活时间为${cs_lct}s
|
155
|
-
if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
|
156
|
-
heartbeat = True if self.client.exists(self.heartbeat_key) else False
|
157
|
-
# 重启重制score值,否则获取${rs_time}分钟前的分数值
|
158
|
-
score = -int(time.time()) + self.rs_time if heartbeat else "-inf"
|
159
|
-
|
160
|
-
keys = self.client.keys(self.storer_key % "*")
|
161
|
-
|
162
|
-
if keys and len(keys) >= storer_num:
|
163
|
-
intersection_key = self.storer_key % "intersection"
|
164
|
-
self.client.delete(intersection_key)
|
165
|
-
self.client.zinterstore(intersection_key, keys)
|
166
|
-
|
167
|
-
while True:
|
168
|
-
members = self.client.zrange(intersection_key, 0, 1999)
|
169
|
-
if not members:
|
170
|
-
break
|
171
|
-
for key in keys:
|
172
|
-
self.client.zrem(key, *members)
|
173
|
-
if self.model == 2:
|
174
|
-
self.client.sadd(self.succeed_key, *members)
|
175
|
-
self.client.zrem(self.spider_key, *members)
|
176
|
-
self.client.zrem(intersection_key, *members)
|
177
|
-
log.info("succeed spider data ...")
|
178
|
-
|
179
|
-
for key in keys:
|
180
|
-
self.client.zremrangebyscore(key, min=score, max="(0")
|
181
|
-
|
182
|
-
while True:
|
183
|
-
items = self.client.zrangebyscore(self.spider_key, min=score, max="(0", start=0, num=5000, withscores=True)
|
184
|
-
if not items:
|
185
|
-
break
|
186
|
-
reset_items = {}
|
187
|
-
for value, priority in items:
|
188
|
-
reset_score = "{:.3f}".format(priority).split(".")[1]
|
189
|
-
reset_items[value] = int(reset_score)
|
190
|
-
if reset_items:
|
191
|
-
self.client.zadd(self.spider_key, mapping=reset_items, xx=True)
|
192
|
-
|
193
|
-
if not heartbeat:
|
194
|
-
self.client.setex(self.heartbeat_key, 15, "")
|
195
|
-
|
196
|
-
# self.client.delete(self.check_lock)
|
197
|
-
# time.sleep(3)
|
198
|
-
|
199
|
-
@check_redis_status
|
200
|
-
def set_heartbeat(self, stop):
|
201
|
-
time.sleep(5)
|
202
|
-
while not stop.is_set():
|
203
|
-
self.client.setex(self.heartbeat_key, 5, "")
|
204
|
-
time.sleep(3)
|
205
|
-
|
206
|
-
# @check_redis_status
|
207
|
-
# def heartbeat(self):
|
208
|
-
# """
|
209
|
-
# 返回心跳key剩余存活时间
|
210
|
-
# """
|
211
|
-
# return self.client.ttl(self.heartbeat_key)
|
212
|
-
|
213
|
-
@check_redis_status
|
214
|
-
def spider_queue_length(self):
|
215
|
-
return self.client.zcard(self.spider_key)
|
216
|
-
|
217
|
-
@check_redis_status
|
218
|
-
def ready_seed_length(self):
|
219
|
-
return self.client.zcount(self.spider_key, min=0, max="+inf")
|
220
|
-
|
221
|
-
@check_redis_status
|
222
|
-
def get_scheduler_lock(self):
|
223
|
-
return self._get_lock(self.scheduler_lock)
|
7
|
+
def __init__(self, **kwargs):
    """Create the Redis client backed by a connection pool.

    The pool is configured from ``kwargs`` when any are given, otherwise
    from ``setting.REDIS_CONFIG``.
    """
    config = kwargs if kwargs else setting.REDIS_CONFIG
    self._client = redis.Redis(connection_pool=redis.ConnectionPool(**config))
|
11
|
+
|
12
|
+
def setnx(self, name, value=""):
    """Forward ``name`` and ``value`` to the client's ``setnx`` and return its reply."""
    client = self._client
    return client.setnx(name, value)
|
14
|
+
|
15
|
+
def setex(self, name, t, value=""):
    """Forward to the client's ``setex`` as ``(name, t, value)`` and return its reply."""
    client = self._client
    return client.setex(name, t, value)
|
17
|
+
|
18
|
+
def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
    """Forward to the client's ``expire``.

    ``name``, ``t`` and the four flags are passed positionally in the order
    ``nx, xx, gt, lt``; the client's reply is returned unchanged.
    """
    flags = (nx, xx, gt, lt)
    return self._client.expire(name, t, *flags)
|
20
|
+
|
21
|
+
def ttl(self, name):
    """Return the reply of the client's ``ttl`` for ``name``."""
    client = self._client
    return client.ttl(name)
|
23
|
+
|
24
|
+
def delete(self, name):
    """Return the reply of the client's ``delete`` for ``name``."""
    client = self._client
    return client.delete(name)
|
26
|
+
|
27
|
+
def exists(self, *name) -> int:
    """Return the reply of the client's ``exists`` for the given key names.

    The reply is passed through unchanged. redis-py reports how many of the
    names exist (an ``int``), so the annotation is ``int`` rather than the
    original ``bool``; truthiness checks by callers behave the same.
    """
    return self._client.exists(*name)
|
29
|
+
|
30
|
+
def sadd(self, name, value):
    """Forward ``name`` and ``value`` to the client's ``sadd`` and return its reply."""
    client = self._client
    return client.sadd(name, value)
|
32
|
+
|
33
|
+
def zcard(self, name) -> int:
    """Return the reply of the client's ``zcard`` for ``name``.

    The reply is passed through unchanged. redis-py returns the sorted set's
    cardinality as an ``int``, so the annotation is ``int`` rather than the
    original ``bool``.
    """
    return self._client.zcard(name)
|
35
|
+
|
36
|
+
def zadd(self, name, item: dict, **kwargs):
    """Forward to the client's ``zadd``.

    ``item`` is passed as the second positional argument and ``kwargs`` are
    forwarded as keyword options; the client's reply is returned unchanged.
    """
    client = self._client
    return client.zadd(name, item, **kwargs)
|
38
|
+
|
39
|
+
def zrem(self, name, *value):
    """Forward ``name`` and every entry of ``value`` to the client's ``zrem``."""
    client = self._client
    return client.zrem(name, *value)
|
41
|
+
|
42
|
+
def zcount(self, name, _min, _max):
    """Forward to the client's ``zcount`` as ``(name, _min, _max)`` and return its reply."""
    client = self._client
    return client.zcount(name, _min, _max)
|
44
|
+
|
45
|
+
# def zrangebyscore(self, name, _min, _max, start, num, withscores: bool = False, *args):
|
46
|
+
# return self._client.zrangebyscore(name, _min, _max, start, num, withscores, *args)
|
47
|
+
|
48
|
+
def lua(self, script: str, keys: list = None, args: list = None):
    """Run ``script`` through the client's ``eval``.

    ``keys`` and ``args`` default to empty lists when falsy. The client is
    called as ``eval(script, <number of keys>, *keys, *args)`` and its reply
    is returned unchanged.
    """
    key_list = keys if keys else []
    arg_list = args if args else []
    return self._client.eval(script, len(key_list), *key_list, *arg_list)
|
53
|
+
|
54
|
+
def lua_sha(self, sha1: str, keys: list = None, args: list = None):
    """Run a script identified by ``sha1`` through the client's ``evalsha``.

    ``keys`` and ``args`` default to empty lists when falsy. The client is
    called as ``evalsha(sha1, <number of keys>, *keys, *args)`` and its reply
    is returned unchanged.
    """
    key_list = keys if keys else []
    arg_list = args if args else []
    return self._client.evalsha(sha1, len(key_list), *key_list, *arg_list)
|
59
|
+
|
60
|
+
def execute_lua(self, lua_script: str, keys: list, *args):
    """Register ``lua_script`` with the client and invoke it once.

    The callable returned by the client's ``register_script`` is called with
    ``keys=keys`` and ``args=args`` (the tuple of extra positional
    arguments); its result is returned unchanged.
    """
    script = self._client.register_script(lua_script)
    return script(keys=keys, args=args)
|
63
|
+
|
64
|
+
def lock(self, key, t=15) -> bool:
    """Try to take the lock ``key`` and report whether it was acquired.

    The script issues ``setnx`` on ``key``; only when that succeeds does it
    set an expiry of ``t`` on the key. The script's status is converted to
    ``bool`` before being returned.
    """
    # setnx and expire run inside one script call.
    acquire_script = """
        local status = redis.call('setnx', KEYS[1], 1)
        if ( status == 1 ) then
            redis.call('expire', KEYS[1], ARGV[1])
        end
        return status
    """
    return bool(self.execute_lua(acquire_script, [key], t))
|
74
|
+
|
75
|
+
def members(self, key, score, start=0, count=5000, _min="-inf", _max="+inf") -> list:
    """Fetch members of the sorted set ``key`` and re-score them in one script.

    The script reads up to ``count`` members (offset ``start``) whose score
    lies in ``[_min, _max]``. For each member it derives an "origin
    priority" from the current score, then writes a new score chosen by the
    ``score`` argument:

    * ``score >= 1000``: new score is ``-score - originPriority / 1000``;
    * ``score == 0``: new score is ``originPriority``;
    * otherwise: both the new score and the reported priority are ``score``.

    :return: a list of ``(member, priority)`` tuples, ``member`` as ``str``
        and ``priority`` as ``int``.
    """
    # Changes to the script compared with the previous revision:
    # * The ``type(count) == string`` branch was dropped. It compared a type
    #   name with the ``string`` library table, so it was never taken and
    #   the LIMIT form was always used.
    # * The negative-score recovery is rounded to the nearest integer.
    #   Redis truncates Lua numbers in replies, so float error such as
    #   4.9998 used to come back as 4.
    lua_script = """
        local min = ARGV[1]
        local max = ARGV[2]
        local start = ARGV[3]
        local count = ARGV[4]
        local score = ARGV[5]

        local members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)

        local result = {}

        for i = 1, #members, 2 do
            local priority = nil
            local member = members[i]
            local originPriority = nil
            if ( members[i+1] + 0 < 0 ) then
                originPriority = math.floor(math.ceil(members[i+1]) * 1000 - members[i+1] * 1000 + 0.5)
            else
                originPriority = math.floor(members[i+1])
            end

            if ( score + 0 >= 1000 ) then
                priority = -score - originPriority / 1000
            elseif ( score + 0 == 0 ) then
                priority = originPriority
            else
                originPriority = score
                priority = score
            end
            redis.call('zadd', KEYS[1], priority, member)
            table.insert(result, member)
            table.insert(result, originPriority)
        end

        return result
    """
    reply = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
    # The reply is a flat [member, priority, member, priority, ...] list.
    # Members arrive as bytes unless the client decodes responses itself.
    return [
        (member.decode() if isinstance(member, bytes) else member, int(priority))
        for member, priority in zip(reply[0::2], reply[1::2])
    ]
|
119
|
+
|
120
|
+
def done(self, keys: list, *args) -> None:
    """Move finished members from one key to another in a single script.

    For every value in ``args`` the script removes it from ``keys[0]``
    (``zrem``) and adds it to ``keys[1]`` (``sadd``). Nothing is returned.

    :param keys: two key names, the sorted set to remove from and the set
        to add to.
    :param args: the members to move; with none given no script is sent.
    """
    if not args:
        # Nothing to move: skip the round trip to the server.
        return
    lua_script = """
        for i, member in ipairs(ARGV) do
            redis.call("zrem", KEYS[1], member)
            redis.call("sadd", KEYS[2], member)
        end
    """
    self.execute_lua(lua_script, keys, *args)
|
128
|
+
|
129
|
+
|
130
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
from .oss_db_exception import *
|
@@ -0,0 +1,28 @@
|
|
1
|
+
class OssDBException(Exception):
    """Base OSS client exception; every other OssDB* error inherits from it."""
|
3
|
+
|
4
|
+
|
5
|
+
class OssDBMergeError(OssDBException):
    """
    Exception raised when a merge operation fails.
    """
|
9
|
+
|
10
|
+
|
11
|
+
class OssDBPutPartError(OssDBException):
    """
    Exception raised when an upload-part operation fails.
    """
|
15
|
+
|
16
|
+
|
17
|
+
class OssDBPutObjError(OssDBException):
    """
    Exception raised when an object upload operation fails.
    """
|
21
|
+
|
22
|
+
|
23
|
+
class OssDBAppendObjError(OssDBException):
    """Exception raised when an append-object operation fails."""
|
25
|
+
|
26
|
+
|
27
|
+
class OssDBInitPartError(OssDBException):
    """Exception raised when initialising a multipart upload fails."""
|