cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/__init__.py +2 -11
- cobweb/base/__init__.py +9 -0
- cobweb/base/basic.py +297 -0
- cobweb/base/common_queue.py +30 -0
- cobweb/base/decorators.py +40 -0
- cobweb/base/dotting.py +35 -0
- cobweb/base/item.py +46 -0
- cobweb/{log.py → base/log.py} +4 -6
- cobweb/base/request.py +82 -0
- cobweb/base/response.py +23 -0
- cobweb/base/seed.py +114 -0
- cobweb/constant.py +94 -0
- cobweb/crawlers/__init__.py +1 -0
- cobweb/crawlers/base_crawler.py +144 -0
- cobweb/crawlers/crawler.py +209 -0
- cobweb/crawlers/file_crawler.py +98 -0
- cobweb/db/__init__.py +2 -2
- cobweb/db/api_db.py +82 -0
- cobweb/db/redis_db.py +125 -218
- cobweb/exceptions/__init__.py +1 -0
- cobweb/exceptions/oss_db_exception.py +28 -0
- cobweb/launchers/__init__.py +3 -0
- cobweb/launchers/launcher.py +235 -0
- cobweb/launchers/launcher_air.py +88 -0
- cobweb/launchers/launcher_api.py +209 -0
- cobweb/launchers/launcher_pro.py +208 -0
- cobweb/pipelines/__init__.py +3 -0
- cobweb/pipelines/pipeline.py +69 -0
- cobweb/pipelines/pipeline_console.py +22 -0
- cobweb/pipelines/pipeline_loghub.py +34 -0
- cobweb/schedulers/__init__.py +3 -0
- cobweb/schedulers/scheduler_api.py +72 -0
- cobweb/schedulers/scheduler_redis.py +72 -0
- cobweb/setting.py +67 -6
- cobweb/utils/__init__.py +5 -0
- cobweb/utils/bloom.py +58 -0
- cobweb/utils/dotting.py +32 -0
- cobweb/utils/oss.py +94 -0
- cobweb/utils/tools.py +42 -0
- cobweb_launcher-1.2.41.dist-info/METADATA +205 -0
- cobweb_launcher-1.2.41.dist-info/RECORD +44 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/WHEEL +1 -1
- cobweb/bbb.py +0 -191
- cobweb/db/oss_db.py +0 -127
- cobweb/db/scheduler/__init__.py +0 -0
- cobweb/db/scheduler/default.py +0 -8
- cobweb/db/scheduler/textfile.py +0 -27
- cobweb/db/storer/__init__.py +0 -0
- cobweb/db/storer/console.py +0 -9
- cobweb/db/storer/loghub.py +0 -54
- cobweb/db/storer/redis.py +0 -15
- cobweb/db/storer/textfile.py +0 -15
- cobweb/decorators.py +0 -16
- cobweb/distributed/__init__.py +0 -0
- cobweb/distributed/launcher.py +0 -243
- cobweb/distributed/models.py +0 -143
- cobweb/interface.py +0 -34
- cobweb/single/__init__.py +0 -0
- cobweb/single/launcher.py +0 -231
- cobweb/single/models.py +0 -134
- cobweb/single/nest.py +0 -153
- cobweb/task.py +0 -50
- cobweb/utils.py +0 -90
- cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
- cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/top_level.txt +0 -0
cobweb/db/api_db.py
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
import os
|
2
|
+
import json
|
3
|
+
import requests
|
4
|
+
|
5
|
+
|
6
|
+
class ApiDB:
    """HTTP client for a service exposing Redis-like commands as endpoints.

    Every public method performs one HTTP request against ``self.host`` and
    returns the ``"data"`` field of the JSON reply. Any failure (transport
    error, non-JSON body, missing ``"data"`` key) yields ``None`` instead of
    raising, so callers must treat ``None`` as "request failed".
    """

    def __init__(self, host=None, **kwargs):
        """Store the base URL of the API service.

        :param host: Base URL of the service. When falsy, the
            ``REDIS_API_HOST`` environment variable is used, falling back to
            ``http://127.0.0.1:4396``.
        :param kwargs: Optional ``timeout`` (forwarded to ``requests``);
            ``None`` (the default) keeps requests' own no-timeout behaviour.
            Any other keyword argument is accepted and ignored.
        """
        self.host = host or os.getenv("REDIS_API_HOST", "http://127.0.0.1:4396")
        self.timeout = kwargs.get("timeout")

    @staticmethod
    def _extract_data(response):
        """Return ``response.json()["data"]`` and always close the response."""
        try:
            return response.json()["data"]
        finally:
            # Close even when decoding fails so the connection is released.
            response.close()

    def _get_response(self, api, params: dict = None):
        """GET ``self.host + api`` and return the reply's ``"data"`` field.

        :param api: Endpoint path appended to the base URL (e.g. ``"/get"``).
        :param params: Query-string parameters.
        :return: The ``"data"`` payload, or ``None`` on any failure.
        """
        try:
            response = requests.get(self.host + api, params=params, timeout=self.timeout)
            return self._extract_data(response)
        except Exception:
            # Best-effort by design; unlike a bare ``except`` this lets
            # KeyboardInterrupt / SystemExit propagate.
            return None

    def _post_response(self, api, params: dict = None, data: dict = None):
        """POST ``data`` as a JSON body and return the reply's ``"data"`` field.

        :param api: Endpoint path appended to the base URL.
        :param params: Query-string parameters.
        :param data: Object serialised with ``json.dumps`` as the request body.
        :return: The ``"data"`` payload, or ``None`` on any failure.
        """
        try:
            headers = {"Content-Type": "application/json"}
            response = requests.post(
                self.host + api,
                headers=headers,
                params=params,
                data=json.dumps(data),
                timeout=self.timeout,
            )
            return self._extract_data(response)
        except Exception:
            return None

    def get(self, name):
        """Call the ``/get`` endpoint for ``name``."""
        return self._get_response(api="/get", params=dict(name=name))

    def setnx(self, name, value=""):
        """Call the ``/setnx`` endpoint with ``name`` and ``value``."""
        return self._get_response(api="/setnx", params=dict(name=name, value=value))

    def setex(self, name, t, value=""):
        """Call the ``/setex`` endpoint with ``name``, ``value`` and ``t``."""
        return self._get_response(api="/setex", params=dict(name=name, value=value, t=t))

    def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
        """Call the ``/expire`` endpoint, forwarding ``t`` and the nx/xx/gt/lt flags."""
        return self._get_response(api="/expire", params=dict(name=name, t=t, nx=nx, xx=xx, gt=gt, lt=lt))

    def ttl(self, name):
        """Call the ``/ttl`` endpoint for ``name``."""
        return self._get_response(api="/ttl", params=dict(name=name))

    def delete(self, name):
        """Call the ``/delete`` endpoint for ``name``."""
        return self._get_response(api="/delete", params=dict(name=name))

    def exists(self, name):
        """Call the ``/exists`` endpoint for ``name``."""
        return self._get_response(api="/exists", params=dict(name=name))

    def incrby(self, name, value):
        """Call the ``/incrby`` endpoint with ``name`` and ``value``."""
        return self._get_response(api="/incrby", params=dict(name=name, value=value))

    def zcard(self, name) -> bool:
        """Call the ``/zcard`` endpoint for ``name``."""
        return self._get_response(api="/zcard", params=dict(name=name))

    def zadd(self, name, item: dict, **kwargs):
        """POST to ``/zadd`` with ``item`` as ``mapping`` plus any extra options."""
        return self._post_response(api="/zadd", data=dict(name=name, mapping=item, **kwargs))

    def zrem(self, name, *values):
        """POST to ``/zrem`` with the given ``values``."""
        return self._post_response(api="/zrem", data=dict(name=name, values=values))

    def zcount(self, name, _min, _max):
        """Call the ``/zcount`` endpoint with ``min``/``max`` bounds."""
        return self._get_response(api="/zcount", params=dict(name=name, min=_min, max=_max))

    def lock(self, name, t=15) -> bool:
        """Call the ``/lock`` endpoint with ``name`` and ``t``."""
        return self._get_response(api="/lock", params=dict(name=name, t=t))

    def auto_incr(self, name, t=15, limit=1000) -> bool:
        """Call the ``/auto_incr`` endpoint with ``name``, ``t`` and ``limit``."""
        return self._get_response(api="/auto_incr", params=dict(name=name, t=t, limit=limit))

    def members(self, name, score, start=0, count=5000, _min="-inf", _max="+inf"):
        """Call the ``/members`` endpoint with the score window and paging arguments."""
        return self._get_response(
            api="/members",
            params=dict(name=name, score=score, start=start, count=count, min=_min, max=_max),
        )

    def done(self, name: list, *values):
        """POST to ``/done`` with ``name`` and the given ``values``."""
        return self._post_response(api="/done", data=dict(name=name, values=values))
|
79
|
+
|
80
|
+
|
81
|
+
|
82
|
+
|
cobweb/db/redis_db.py
CHANGED
@@ -1,223 +1,130 @@
|
|
1
|
-
import time
|
2
1
|
import redis
|
3
|
-
from cobweb import
|
4
|
-
from cobweb.decorators import check_redis_status
|
2
|
+
from cobweb import setting
|
5
3
|
|
6
4
|
|
7
5
|
class RedisDB:
|
8
6
|
|
9
|
-
def __init__(
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
):
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
self.
|
22
|
-
|
23
|
-
|
24
|
-
self.
|
25
|
-
|
26
|
-
|
27
|
-
self.
|
28
|
-
|
29
|
-
|
30
|
-
self.
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
def
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
if
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
else
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
version = int(time.time())
|
134
|
-
|
135
|
-
items = self.client.zrangebyscore(self.spider_key, min=0, max="+inf", start=0, num=length, withscores=True)
|
136
|
-
|
137
|
-
for value, priority in items:
|
138
|
-
score = -(version + int(priority) / 1000)
|
139
|
-
seed = Seed(value, priority=priority, version=version)
|
140
|
-
update_item[value] = score
|
141
|
-
result.append(seed)
|
142
|
-
|
143
|
-
log.info("set seeds into queue time: " + str(time.time() - cs))
|
144
|
-
if result:
|
145
|
-
self.client.zadd(self.spider_key, mapping=update_item, xx=True)
|
146
|
-
|
147
|
-
self.client.delete(self.update_lock)
|
148
|
-
log.info("push seeds into queue time: " + str(time.time() - cs))
|
149
|
-
return result
|
150
|
-
|
151
|
-
@check_redis_status
|
152
|
-
def check_spider_queue(self, stop, storer_num):
|
153
|
-
while not stop.is_set():
|
154
|
-
# 每15s获取check锁,等待600s后仍获取不到锁则重试;获取到锁后,设置锁的存活时间为${cs_lct}s
|
155
|
-
if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
|
156
|
-
heartbeat = True if self.client.exists(self.heartbeat_key) else False
|
157
|
-
# 重启重制score值,否则获取${rs_time}分钟前的分数值
|
158
|
-
score = -int(time.time()) + self.rs_time if heartbeat else "-inf"
|
159
|
-
|
160
|
-
keys = self.client.keys(self.storer_key % "*")
|
161
|
-
|
162
|
-
if keys and len(keys) >= storer_num:
|
163
|
-
intersection_key = self.storer_key % "intersection"
|
164
|
-
self.client.delete(intersection_key)
|
165
|
-
self.client.zinterstore(intersection_key, keys)
|
166
|
-
|
167
|
-
while True:
|
168
|
-
members = self.client.zrange(intersection_key, 0, 1999)
|
169
|
-
if not members:
|
170
|
-
break
|
171
|
-
for key in keys:
|
172
|
-
self.client.zrem(key, *members)
|
173
|
-
if self.model == 2:
|
174
|
-
self.client.sadd(self.succeed_key, *members)
|
175
|
-
self.client.zrem(self.spider_key, *members)
|
176
|
-
self.client.zrem(intersection_key, *members)
|
177
|
-
log.info("succeed spider data ...")
|
178
|
-
|
179
|
-
for key in keys:
|
180
|
-
self.client.zremrangebyscore(key, min=score, max="(0")
|
181
|
-
|
182
|
-
while True:
|
183
|
-
items = self.client.zrangebyscore(self.spider_key, min=score, max="(0", start=0, num=5000, withscores=True)
|
184
|
-
if not items:
|
185
|
-
break
|
186
|
-
reset_items = {}
|
187
|
-
for value, priority in items:
|
188
|
-
reset_score = "{:.3f}".format(priority).split(".")[1]
|
189
|
-
reset_items[value] = int(reset_score)
|
190
|
-
if reset_items:
|
191
|
-
self.client.zadd(self.spider_key, mapping=reset_items, xx=True)
|
192
|
-
|
193
|
-
if not heartbeat:
|
194
|
-
self.client.setex(self.heartbeat_key, 15, "")
|
195
|
-
|
196
|
-
# self.client.delete(self.check_lock)
|
197
|
-
# time.sleep(3)
|
198
|
-
|
199
|
-
@check_redis_status
|
200
|
-
def set_heartbeat(self, stop):
|
201
|
-
time.sleep(5)
|
202
|
-
while not stop.is_set():
|
203
|
-
self.client.setex(self.heartbeat_key, 5, "")
|
204
|
-
time.sleep(3)
|
205
|
-
|
206
|
-
# @check_redis_status
|
207
|
-
# def heartbeat(self):
|
208
|
-
# """
|
209
|
-
# 返回心跳key剩余存活时间
|
210
|
-
# """
|
211
|
-
# return self.client.ttl(self.heartbeat_key)
|
212
|
-
|
213
|
-
@check_redis_status
|
214
|
-
def spider_queue_length(self):
|
215
|
-
return self.client.zcard(self.spider_key)
|
216
|
-
|
217
|
-
@check_redis_status
|
218
|
-
def ready_seed_length(self):
|
219
|
-
return self.client.zcount(self.spider_key, min=0, max="+inf")
|
220
|
-
|
221
|
-
@check_redis_status
|
222
|
-
def get_scheduler_lock(self):
|
223
|
-
return self._get_lock(self.scheduler_lock)
|
7
|
+
class RedisDB:
    """Thin wrapper around a pooled ``redis.Redis`` client.

    Most methods forward directly to the redis client; ``lock``, ``members``
    and ``done`` run small Lua scripts so their multi-step updates execute
    as one server-side script.
    """

    def __init__(self, **kwargs):
        """Create the client from ``kwargs`` or, when empty, ``setting.REDIS_CONFIG``."""
        redis_config = kwargs or setting.REDIS_CONFIG
        pool = redis.ConnectionPool(**redis_config)
        self._client = redis.Redis(connection_pool=pool)

    def setnx(self, name, value=""):
        """Forward to the client's ``setnx``."""
        return self._client.setnx(name, value)

    def setex(self, name, t, value=""):
        """Forward to the client's ``setex`` with expiry ``t``."""
        return self._client.setex(name, t, value)

    def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
        """Forward to the client's ``expire``, passing the nx/xx/gt/lt flags positionally."""
        return self._client.expire(name, t, nx, xx, gt, lt)

    def ttl(self, name):
        """Forward to the client's ``ttl``."""
        return self._client.ttl(name)

    def delete(self, name):
        """Forward to the client's ``delete``."""
        return self._client.delete(name)

    def exists(self, *name) -> int:
        """Forward to the client's ``exists`` for all given keys."""
        return self._client.exists(*name)

    def sadd(self, name, value):
        """Forward to the client's ``sadd`` with a single value."""
        return self._client.sadd(name, value)

    def zcard(self, name) -> int:
        """Forward to the client's ``zcard``."""
        return self._client.zcard(name)

    def zadd(self, name, item: dict, **kwargs):
        """Forward to the client's ``zadd`` with ``item`` as the member mapping."""
        return self._client.zadd(name, item, **kwargs)

    def zrem(self, name, *value):
        """Forward to the client's ``zrem``."""
        return self._client.zrem(name, *value)

    def zcount(self, name, _min, _max):
        """Forward to the client's ``zcount``."""
        return self._client.zcount(name, _min, _max)

    # def zrangebyscore(self, name, _min, _max, start, num, withscores: bool = False, *args):
    #     return self._client.zrangebyscore(name, _min, _max, start, num, withscores, *args)

    def lua(self, script: str, keys: list = None, args: list = None):
        """Run ``script`` through ``EVAL`` with the given keys and arguments."""
        keys = keys or []
        args = args or []
        keys_count = len(keys)
        return self._client.eval(script, keys_count, *keys, *args)

    def lua_sha(self, sha1: str, keys: list = None, args: list = None):
        """Run an already-loaded script through ``EVALSHA``."""
        keys = keys or []
        args = args or []
        keys_count = len(keys)
        return self._client.evalsha(sha1, keys_count, *keys, *args)

    def execute_lua(self, lua_script: str, keys: list, *args):
        """Register ``lua_script`` on the client and invoke it with ``keys``/``args``."""
        execute = self._client.register_script(lua_script)
        return execute(keys=keys, args=args)

    def lock(self, key, t=15) -> bool:
        """Try to take a lock on ``key`` that expires after ``t``.

        The script sets the key only if it is absent and, when that succeeds,
        attaches the expiry.

        :return: ``True`` when the key was set by this call, else ``False``.
        """
        lua_script = """
            local status = redis.call('setnx', KEYS[1], 1)
            if ( status == 1 ) then
                redis.call('expire', KEYS[1], ARGV[1])
            end
            return status
        """
        status = self.execute_lua(lua_script, [key], t)
        return bool(status)

    @staticmethod
    def _pair_members(flat) -> list:
        """Turn ``[member, priority, member, priority, ...]`` into tuples.

        Members are decoded only when they arrive as ``bytes`` so the result
        is the same whether or not the client uses ``decode_responses``.
        """
        flat = flat or []
        return [
            (member.decode() if isinstance(member, bytes) else member, int(priority))
            for member, priority in zip(flat[::2], flat[1::2])
        ]

    def members(self, key, score, start=0, count=5000, _min="-inf", _max="+inf") -> list:
        """Fetch members of sorted set ``key`` in a score window and re-score them.

        For every member returned by ``ZRANGEBYSCORE`` the script derives an
        "origin priority" from the stored score (for a negative score, the
        fractional part times 1000; otherwise the floored score) and then
        writes a new score:

        * ``score >= 1000``: ``-score - originPriority / 1000``
        * ``score == 0``: the origin priority itself
        * otherwise: ``score`` (which also replaces the origin priority)

        :return: List of ``(member, origin_priority)`` tuples.
        """
        # NOTE(review): `type(count) == string` compares against the Lua
        # `string` library table rather than the text 'string', so the first
        # branch never runs and the LIMIT form is always used. Script text is
        # left unchanged on purpose - confirm intent before altering it.
        lua_script = """
            local min = ARGV[1]
            local max = ARGV[2]
            local start = ARGV[3]
            local count = ARGV[4]
            local score = ARGV[5]
            local members = nil

            if ( type(count) == string ) then
                members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES')
            else
                members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)
            end

            local result = {}

            for i = 1, #members, 2 do
                local priority = nil
                local member = members[i]
                local originPriority = nil
                if ( members[i+1] + 0 < 0 ) then
                    originPriority = math.ceil(members[i+1]) * 1000 - members[i+1] * 1000
                else
                    originPriority = math.floor(members[i+1])
                end

                if ( score + 0 >= 1000 ) then
                    priority = -score - originPriority / 1000
                elseif ( score + 0 == 0 ) then
                    priority = originPriority
                else
                    originPriority = score
                    priority = score
                end
                redis.call('zadd', KEYS[1], priority, member)
                table.insert(result, member)
                table.insert(result, originPriority)
            end

            return result
        """
        members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
        return self._pair_members(members)

    def done(self, keys: list, *args) -> None:
        """Move every member in ``args`` from sorted set ``keys[0]`` to set ``keys[1]``.

        Each member is removed with ``zrem`` from the first key and added with
        ``sadd`` to the second. Nothing is returned.
        """
        lua_script = """
            for i, member in ipairs(ARGV) do
                redis.call("zrem", KEYS[1], member)
                redis.call("sadd", KEYS[2], member)
            end
        """
        self.execute_lua(lua_script, keys, *args)
|
128
|
+
|
129
|
+
|
130
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
from .oss_db_exception import *
|
@@ -0,0 +1,28 @@
|
|
1
|
+
class OssDBException(Exception):
    """Base oss client exception that all others inherit."""


class OssDBMergeError(OssDBException):
    """Exception raised when a merge operation fails."""


class OssDBPutPartError(OssDBException):
    """Exception raised when an upload-part operation fails."""


class OssDBPutObjError(OssDBException):
    """Exception raised when an object upload operation fails."""


class OssDBAppendObjError(OssDBException):
    """Exception raised when an append upload operation fails."""


class OssDBInitPartError(OssDBException):
    """Exception raised when initialising a part upload fails."""
|