cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.42__py3-none-any.whl
- cobweb/__init__.py +2 -11
- cobweb/base/__init__.py +9 -0
- cobweb/base/basic.py +297 -0
- cobweb/base/common_queue.py +30 -0
- cobweb/base/decorators.py +40 -0
- cobweb/base/dotting.py +35 -0
- cobweb/base/item.py +46 -0
- cobweb/{log.py → base/log.py} +4 -6
- cobweb/base/request.py +82 -0
- cobweb/base/response.py +23 -0
- cobweb/base/seed.py +114 -0
- cobweb/constant.py +94 -0
- cobweb/crawlers/__init__.py +1 -0
- cobweb/crawlers/base_crawler.py +144 -0
- cobweb/crawlers/crawler.py +212 -0
- cobweb/crawlers/file_crawler.py +98 -0
- cobweb/db/__init__.py +2 -2
- cobweb/db/api_db.py +82 -0
- cobweb/db/redis_db.py +125 -218
- cobweb/exceptions/__init__.py +1 -0
- cobweb/exceptions/oss_db_exception.py +28 -0
- cobweb/launchers/__init__.py +3 -0
- cobweb/launchers/launcher.py +235 -0
- cobweb/launchers/launcher_air.py +88 -0
- cobweb/launchers/launcher_api.py +209 -0
- cobweb/launchers/launcher_pro.py +208 -0
- cobweb/pipelines/__init__.py +3 -0
- cobweb/pipelines/pipeline.py +69 -0
- cobweb/pipelines/pipeline_console.py +22 -0
- cobweb/pipelines/pipeline_loghub.py +34 -0
- cobweb/schedulers/__init__.py +3 -0
- cobweb/schedulers/scheduler_api.py +72 -0
- cobweb/schedulers/scheduler_redis.py +72 -0
- cobweb/setting.py +67 -6
- cobweb/utils/__init__.py +5 -0
- cobweb/utils/bloom.py +58 -0
- cobweb/utils/dotting.py +32 -0
- cobweb/utils/oss.py +94 -0
- cobweb/utils/tools.py +42 -0
- cobweb_launcher-1.2.42.dist-info/METADATA +205 -0
- cobweb_launcher-1.2.42.dist-info/RECORD +44 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/WHEEL +1 -1
- cobweb/bbb.py +0 -191
- cobweb/db/oss_db.py +0 -127
- cobweb/db/scheduler/__init__.py +0 -0
- cobweb/db/scheduler/default.py +0 -8
- cobweb/db/scheduler/textfile.py +0 -27
- cobweb/db/storer/__init__.py +0 -0
- cobweb/db/storer/console.py +0 -9
- cobweb/db/storer/loghub.py +0 -54
- cobweb/db/storer/redis.py +0 -15
- cobweb/db/storer/textfile.py +0 -15
- cobweb/decorators.py +0 -16
- cobweb/distributed/__init__.py +0 -0
- cobweb/distributed/launcher.py +0 -243
- cobweb/distributed/models.py +0 -143
- cobweb/interface.py +0 -34
- cobweb/single/__init__.py +0 -0
- cobweb/single/launcher.py +0 -231
- cobweb/single/models.py +0 -134
- cobweb/single/nest.py +0 -153
- cobweb/task.py +0 -50
- cobweb/utils.py +0 -90
- cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
- cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/top_level.txt +0 -0
cobweb/launchers/launcher_pro.py
ADDED
@@ -0,0 +1,208 @@
+import time
+import threading
+
+from cobweb.db import RedisDB
+from cobweb.base import Seed, logger
+from cobweb.utils import BloomFilter
+from cobweb.constant import DealModel, LogTemplate
+from .launcher import Launcher, check_pause
+
+
+class LauncherPro(Launcher):
+
+    def __init__(self, task, project, custom_setting=None, **kwargs):
+        super().__init__(task, project, custom_setting, **kwargs)
+        self._todo_key = "{%s:%s}:todo" % (project, task)
+        self._done_key = "{%s:%s}:done" % (project, task)
+        self._fail_key = "{%s:%s}:fail" % (project, task)
+        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
+
+        self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
+        self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
+        self._speed_control_key = "speed_control:%s_%s" % (project, task)
+
+        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
+
+        # self._bf_key = "bloom_%s_%s" % (project, task)
+        #
+        self._db = RedisDB()
+        #
+        # self._bf = BloomFilter(self._bf_key)
+
+        self._heartbeat_start_event = threading.Event()
+        self._redis_queue_empty_event = threading.Event()
+
+    @property
+    def heartbeat(self):
+        return self._db.exists(self._heartbeat_key)
+
+    def statistics(self, key, count):
+        if not self._task_model and not self._db.exists(key):
+            self._db.setex(key, 86400 * 30, int(count))
+        else:
+            self._db._client.incrby(key, count)
+
+    def _get_seed(self) -> Seed:
+        spider_speed = self._db._client.get(self._speed_control_key)
+        if int(spider_speed or 0) > self._spider_max_count:
+            expire_time = self._db.ttl(self._speed_control_key)
+            if expire_time == -1:
+                self._db.delete(self._speed_control_key)
+            else:
+                logger.info(f"Too fast! Please wait {expire_time} seconds...")
+                time.sleep(expire_time / 2)
+            return None
+        seed = self.__LAUNCHER_QUEUE__["todo"].pop()
+        if seed and not self._db.lock(self._speed_control_key, t=self._time_window):
+            self._db._client.incrby(self._speed_control_key, 1)
+        return seed
+
+    @check_pause
+    def _execute_heartbeat(self):
+        if self._heartbeat_start_event.is_set():
+            self._db.setex(self._heartbeat_key, 5)
+        time.sleep(3)
+
+    @check_pause
+    def _reset(self):
+        """
+        Check for expired seeds and put them back into the Redis queue.
+        """
+        reset_wait_seconds = 30
+        if self._db.lock(self._reset_lock_key, t=120):
+
+            _min = -int(time.time()) + self._seed_reset_seconds \
+                if self.heartbeat else "-inf"
+
+            self._db.members(self._todo_key, 0, _min=_min, _max="(0")
+            self._db.delete(self._reset_lock_key)
+
+            if not self.heartbeat:
+                self._heartbeat_start_event.set()
+
+        time.sleep(reset_wait_seconds)
+
+    @check_pause
+    def _scheduler(self):
+        """
+        Schedule work: fetch seeds from the Redis queue and record them in the doing dict.
+        """
+        if not self._db.zcount(self._todo_key, 0, "(1000"):
+            time.sleep(self._scheduler_wait_seconds)
+        elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
+            time.sleep(self._todo_queue_full_wait_seconds)
+        else:
+            members = self._db.members(
+                self._todo_key, int(time.time()),
+                count=self._todo_queue_size,
+                _min=0, _max="(1000"
+            )
+            for member, priority in members:
+                seed = Seed(member, priority=priority)
+                self.__LAUNCHER_QUEUE__['todo'].push(seed)
+                self.__DOING__[seed.to_string] = seed.params.priority
+
+    @check_pause
+    def _insert(self):
+        """
+        Insert new seeds into the Redis queue.
+        """
+        seeds = {}
+        status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
+        for _ in range(self._new_queue_max_size):
+            seed = self.__LAUNCHER_QUEUE__['new'].pop()
+            if seed:
+                seeds[seed.to_string] = seed.params.priority
+        if seeds:
+            self._db.zadd(self._todo_key, seeds, nx=True)
+        if status:
+            time.sleep(self._new_queue_wait_seconds)
+
+    @check_pause
+    def _refresh(self):
+        """
+        Refresh the expiry of in-flight (doing) seeds so reset does not reclaim them.
+        """
+        if self.__DOING__:
+            refresh_time = int(time.time())
+            seeds = {k: -refresh_time - v / 1000 for k, v in self.__DOING__.items()}
+            self._db.zadd(self._todo_key, item=seeds, xx=True)
+        time.sleep(15)
+
+    @check_pause
+    def _delete(self):
+        """
+        Remove finished seeds from the queue, route them to the done or fail
+        queue by status, and drop their entries from the doing dict.
+        """
+        seed_list = []
+        status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
+
+        for _ in range(self._done_queue_max_size):
+            seed = self.__LAUNCHER_QUEUE__['done'].pop()
+            if not seed:
+                break
+            seed_list.append(seed.to_string)
+
+        if seed_list:
+
+            self._db.zrem(self._todo_key, *seed_list)
+            self._remove_doing_seeds(seed_list)
+
+        if status:
+            time.sleep(self._done_queue_wait_seconds)
+
+    def _polling(self):
+        wait_scheduler_execute = True
+        check_empty_times = 0
+        while not self._stop.is_set():
+            queue_not_empty_count = 0
+            polling_wait_seconds = 30
+
+            for q in self.__LAUNCHER_QUEUE__.values():
+                if q.length != 0:
+                    queue_not_empty_count += 1
+                    wait_scheduler_execute = False
+
+            if queue_not_empty_count == 0:
+                polling_wait_seconds = 3
+                if self._pause.is_set():
+                    check_empty_times = 0
+                    if not self._task_model and (
+                        not wait_scheduler_execute or
+                        int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
+                    ):
+                        logger.info("Done! Ready to close thread...")
+                        self._stop.set()
+
+                    elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
+                        logger.info(f"Recovery {self.task} task run!")
+                        self._pause.clear()
+                        self._execute()
+                    else:
+                        logger.info("pause! waiting for resume...")
+                elif check_empty_times > 2:
+                    self.__DOING__ = {}
+                    if not self._db.zcount(self._todo_key, _min="-inf", _max="(1000"):
+                        self._pause.set()
+                else:
+                    logger.info(
+                        "check whether the task is complete, "
+                        f"reset times {3 - check_empty_times}"
+                    )
+                    check_empty_times += 1
+            else:
+                logger.info(LogTemplate.launcher_pro_polling.format(
+                    task=self.task,
+                    doing_len=len(self.__DOING__.keys()),
+                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
+                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
+                    redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
+                    redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
+                    redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
+                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
+                ))
+
+            time.sleep(polling_wait_seconds)
+
+        logger.info("Done! Ready to close thread...")
+
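A note on the sorted-set convention this file relies on: all seeds for a task live in the single `{project:task}:todo` zset, with state encoded in the score. Scores in [0, 1000) appear to mean "pending, score = priority" (hence `zcount(self._todo_key, 0, "(1000")`), while a claimed seed carries a negative score of `-claim_time - priority / 1000`, exactly as `_refresh` writes it, which is why `_reset` scans with `_max="(0"`. A minimal sketch of that encoding; the helper names are mine, not cobweb's:

import time

def mark_in_flight(priority, now=None):
    # Hypothetical helper: encode "claimed at `now` with priority p" the way
    # _refresh does, i.e. score = -claim_time - priority / 1000.
    now = now or int(time.time())
    return -now - priority / 1000

def recover(score):
    # Invert the encoding: the integer part of -score is the claim time and
    # the fractional part times 1000 is the priority (assumes priority < 1000).
    claim_time = int(-score)
    priority = round((-score - claim_time) * 1000)
    return claim_time, priority

score = mark_in_flight(priority=5, now=1_700_000_000)
print(score)                                 # -1700000000.005
assert recover(score) == (1_700_000_000, 5)

Packing the priority into the fractional part keeps a single zset member per seed while still letting a reset recover both the claim time and the original priority.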
cobweb/pipelines/pipeline.py
ADDED
@@ -0,0 +1,69 @@
+import time
+import threading
+
+from abc import ABC, abstractmethod
+from cobweb.base import BaseItem, Queue, logger
+
+
+class Pipeline(threading.Thread, ABC):
+
+    def __init__(
+            self,
+            stop: threading.Event,
+            pause: threading.Event,
+            upload: Queue, done: Queue,
+            upload_size: int,
+            wait_seconds: int
+    ):
+        super().__init__()
+        self._stop = stop
+        self._pause = pause
+        self._upload = upload
+        self._done = done
+
+        self.upload_size = upload_size
+        self.wait_seconds = wait_seconds
+
+    @abstractmethod
+    def build(self, item: BaseItem) -> dict:
+        pass
+
+    @abstractmethod
+    def upload(self, table: str, data: list) -> bool:
+        pass
+
+    def run(self):
+        while not self._stop.is_set():
+            if not self._upload.length:
+                time.sleep(self.wait_seconds)
+                continue
+            if self._upload.length < self.upload_size:
+                time.sleep(self.wait_seconds)
+            status = True
+            data_info, seeds = {}, []
+            try:
+                for _ in range(self.upload_size):
+                    item = self._upload.pop()
+                    if not item:
+                        break
+                    seeds.append(item.seed)
+                    data = self.build(item)
+                    data_info.setdefault(item.table, []).append(data)
+                for table, datas in data_info.items():
+                    try:
+                        self.upload(table, datas)
+                    except Exception as e:
+                        logger.info(e)
+                        status = False
+            except Exception as e:
+                logger.info(e)
+                status = False
+            if not status:
+                for seed in seeds:
+                    seed.params.seed_status = "deal model: fail"
+            if seeds:
+                self._done.push(seeds)
+
+        logger.info("upload pipeline closed!")
+
+
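The base class splits responsibilities: `run` handles batching, per-table grouping, error handling, and routing seeds back to the done queue, so a concrete pipeline only implements `build` (one item to one row) and `upload` (one batch per table). A minimal hypothetical subclass, presumably selected like the built-ins via the `PIPELINE` setting shown further below; the class name and print-based upload are illustrative only:

from cobweb.base import BaseItem
from cobweb.pipelines import Pipeline


class StdoutPipeline(Pipeline):

    def build(self, item: BaseItem) -> dict:
        # One item in, one uploadable row out; run() groups rows by item.table.
        return item.to_dict

    def upload(self, table: str, data: list) -> bool:
        # One batch per table; raising here makes run() mark every seed in
        # the batch as failed before pushing it to the done queue.
        print(f"[{table}] {len(data)} rows")
        return True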
cobweb/pipelines/pipeline_console.py
ADDED
@@ -0,0 +1,22 @@
+from cobweb.base import ConsoleItem, logger
+from cobweb.constant import LogTemplate
+from cobweb.pipelines import Pipeline
+
+
+class Console(Pipeline):
+
+    def build(self, item: ConsoleItem):
+        return {
+            "seed": item.seed.to_dict,
+            "data": item.to_dict
+        }
+
+    def upload(self, table, datas):
+        for data in datas:
+            parse_detail = LogTemplate.log_info(data["data"])
+            if len(parse_detail) > 500:
+                parse_detail = parse_detail[:500] + " ...\n" + " " * 12 + "-- Text is too long and details are omitted!"
+            logger.info(LogTemplate.console_item.format(
+                seed_detail=LogTemplate.log_info(data["seed"]),
+                parse_detail=parse_detail
+            ))
cobweb/pipelines/pipeline_loghub.py
ADDED
@@ -0,0 +1,34 @@
+import json
+
+from cobweb import setting
+from cobweb.base import BaseItem
+from cobweb.pipelines import Pipeline
+from aliyun.log import LogClient, LogItem, PutLogsRequest
+
+
+class Loghub(Pipeline):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.client = LogClient(**setting.LOGHUB_CONFIG)
+
+    def build(self, item: BaseItem):
+        log_item = LogItem()
+        temp = item.to_dict
+        for key, value in temp.items():
+            if not isinstance(value, str):
+                temp[key] = json.dumps(value, ensure_ascii=False)
+        contents = sorted(temp.items())
+        log_item.set_contents(contents)
+        return log_item
+
+    def upload(self, table, datas):
+        request = PutLogsRequest(
+            project=setting.LOGHUB_PROJECT,
+            logstore=table,
+            topic=setting.LOGHUB_TOPIC,
+            source=setting.LOGHUB_SOURCE,
+            logitems=datas,
+            compress=True
+        )
+        self.client.put_logs(request=request)
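For reference, `Loghub.build` JSON-encodes non-string values and sorts the key/value pairs before handing them to `LogItem.set_contents`. A standalone sketch of that transformation, with example field names:

import json

from aliyun.log import LogItem

# Stand-in for item.to_dict; "url" and "meta" are example fields.
fields = {"url": "https://example.com", "meta": {"retry": 1}}
contents = sorted(
    (k, v if isinstance(v, str) else json.dumps(v, ensure_ascii=False))
    for k, v in fields.items()
)
log_item = LogItem()
log_item.set_contents(contents)  # [("meta", '{"retry": 1}'), ("url", "https://example.com")]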
cobweb/schedulers/scheduler_api.py
ADDED
@@ -0,0 +1,72 @@
+import threading
+import time
+
+# from cobweb.base import Seed
+from cobweb.db import ApiDB
+
+
+class ApiScheduler:
+
+    def __init__(self, task, project, scheduler_wait_seconds=30):
+        self._todo_key = "{%s:%s}:todo" % (project, task)
+        self._download_key = "{%s:%s}:download" % (project, task)
+        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
+        self._speed_control_key = "speed_control:%s_%s" % (project, task)
+        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
+        self._db = ApiDB()
+
+        self.scheduler_wait_seconds = scheduler_wait_seconds
+        self.working = threading.Event()
+
+    @property
+    def heartbeat(self):
+        return self._db.exists(self._heartbeat_key)
+
+    def set_heartbeat(self):
+        return self._db.setex(self._heartbeat_key, 5)
+
+    def schedule(self, key, count):
+        if not self._db.zcount(key, 0, "(1000"):
+            time.sleep(self.scheduler_wait_seconds)
+        else:
+            source = int(time.time())
+            members = self._db.members(key, source, count=count, _min=0, _max="(1000")
+            for member, priority in members:
+                # seed = Seed(member, priority=priority)
+                yield member, priority
+
+    def insert(self, key, items):
+        if items:
+            self._db.zadd(key, items, nx=True)
+
+    def reset(self, keys, reset_time=30):
+        if self._db.lock(self._reset_lock_key, t=120):
+
+            if isinstance(keys, str):
+                keys = [keys]
+
+            _min = reset_time - int(time.time()) if self.heartbeat else "-inf"
+
+            for key in keys:
+                if self._db.exists(key):
+                    self._db.members(key, 0, _min=_min, _max="(0")
+
+            if not self.heartbeat:
+                self.working.set()
+                time.sleep(10)
+
+            self._db.delete(self._reset_lock_key)
+
+    def refresh(self, key, items: dict[str, int]):
+        refresh_time = int(time.time())
+        its = {k: -refresh_time - v / 1000 for k, v in items.items()}
+        if its:
+            self._db.zadd(key, item=its, xx=True)
+
+    def delete(self, key, values):
+        if values:
+            self._db.zrem(key, *values)
+
+
+
+
cobweb/schedulers/scheduler_redis.py
ADDED
@@ -0,0 +1,72 @@
+import threading
+import time
+
+# from cobweb.base import Seed
+from cobweb.db import RedisDB
+
+
+class RedisScheduler:
+
+    def __init__(self, task, project, scheduler_wait_seconds=30):
+        self._todo_key = "{%s:%s}:todo" % (project, task)
+        self._download_key = "{%s:%s}:download" % (project, task)
+        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
+        self._speed_control_key = "speed_control:%s_%s" % (project, task)
+        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
+        self._db = RedisDB()
+
+        self.scheduler_wait_seconds = scheduler_wait_seconds
+        self.working = threading.Event()
+
+    @property
+    def heartbeat(self):
+        return self._db.exists(self._heartbeat_key)
+
+    def set_heartbeat(self):
+        return self._db.setex(self._heartbeat_key, 5)
+
+    def schedule(self, key, count):
+        if not self._db.zcount(key, 0, "(1000"):
+            time.sleep(self.scheduler_wait_seconds)
+        else:
+            source = int(time.time())
+            members = self._db.members(key, source, count=count, _min=0, _max="(1000")
+            for member, priority in members:
+                # seed = Seed(member, priority=priority)
+                yield member, priority
+
+    def insert(self, key, items):
+        if items:
+            self._db.zadd(key, items, nx=True)
+
+    def reset(self, keys, reset_time=30):
+        if self._db.lock(self._reset_lock_key, t=120):
+
+            if isinstance(keys, str):
+                keys = [keys]
+
+            _min = reset_time - int(time.time()) if self.heartbeat else "-inf"
+
+            for key in keys:
+                if self._db.exists(key):
+                    self._db.members(key, 0, _min=_min, _max="(0")
+
+            if not self.heartbeat:
+                self.working.set()
+                time.sleep(10)
+
+            self._db.delete(self._reset_lock_key)
+
+    def refresh(self, key, items: dict[str, int]):
+        refresh_time = int(time.time())
+        its = {k: -refresh_time - v / 1000 for k, v in items.items()}
+        if its:
+            self._db.zadd(key, item=its, xx=True)
+
+    def delete(self, key, values):
+        if values:
+            self._db.zrem(key, *values)
+
+
+
+
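Both schedulers expose the same contract over different backends: `schedule` yields `(member, priority)` pairs, `delete` removes finished members from the sorted set, `refresh` keeps unfinished ones from being reclaimed, and `reset` requeues stale in-flight members. A hypothetical driver loop under those assumptions; `crawl` is a stand-in for real work, and the import assumes `cobweb.schedulers` re-exports the class (otherwise import from `cobweb.schedulers.scheduler_redis`):

from cobweb.schedulers import RedisScheduler

def drain(project: str, task: str, crawl) -> None:
    scheduler = RedisScheduler(task, project)
    todo_key = "{%s:%s}:todo" % (project, task)
    doing = {}  # member -> priority, mirroring the launcher's __DOING__ dict
    for member, priority in scheduler.schedule(todo_key, count=100):
        doing[member] = priority
        crawl(member)
        scheduler.delete(todo_key, [member])  # done: drop from the sorted set
        doing.pop(member)
    # A long-running consumer would also call scheduler.refresh(todo_key, doing)
    # periodically so unfinished members are not reclaimed by reset().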
cobweb/setting.py
CHANGED
@@ -1,13 +1,74 @@
 import os
 
+# redis db config
+REDIS_CONFIG = {
+    "host": os.getenv("REDIS_HOST"),
+    "password": os.getenv("REDIS_PASSWORD"),
+    "port": int(os.getenv("REDIS_PORT", 6379)),
+    "db": int(os.getenv("REDIS_DB", 0)),
+}
 
-#
-
+# loghub db config
+LOGHUB_TOPIC = os.getenv("LOGHUB_TOPIC")
+LOGHUB_SOURCE = os.getenv("LOGHUB_SOURCE")
+LOGHUB_PROJECT = os.getenv("LOGHUB_PROJECT")
+LOGHUB_CONFIG = {
+    "endpoint": os.getenv("LOGHUB_ENDPOINT"),
+    "accessKeyId": os.getenv("LOGHUB_ACCESS_KEY"),
+    "accessKey": os.getenv("LOGHUB_SECRET_KEY")
+}
 
-#
-
+# oss util config
+OSS_BUCKET = os.getenv("OSS_BUCKET")
+OSS_ENDPOINT = os.getenv("OSS_ENDPOINT")
+OSS_ACCESS_KEY = os.getenv("OSS_ACCESS_KEY")
+OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
+OSS_CHUNK_SIZE = 10 * 1024 ** 2
+OSS_MIN_UPLOAD_SIZE = 1024
 
-# by default, the spider queue lock liveness check TTL is 30s
-CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", 30))
 
+# crawler selection
+CRAWLER = "cobweb.crawlers.Crawler"
 
+# data storage pipeline
+PIPELINE = "cobweb.pipelines.pipeline_console.Console"
+
+
+# Launcher wait times
+
+BEFORE_SCHEDULER_WAIT_SECONDS = 60  # wait before scheduling; only applies to one-off tasks
+SCHEDULER_WAIT_SECONDS = 15  # scheduler wait time
+TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait when the todo queue is full
+NEW_QUEUE_WAIT_SECONDS = 30  # new queue wait time
+DONE_QUEUE_WAIT_SECONDS = 5  # done queue wait time
+UPLOAD_QUEUE_WAIT_SECONDS = 15  # upload queue wait time
+SEED_RESET_SECONDS = 30  # seed reset interval
+
+
+# Launcher queue sizes
+TODO_QUEUE_SIZE = 100  # todo queue size
+NEW_QUEUE_MAX_SIZE = 100  # new queue size
+DONE_QUEUE_MAX_SIZE = 100  # done queue size
+UPLOAD_QUEUE_MAX_SIZE = 100  # upload queue size
+
+# DONE_MODEL in (0, 1): seed completion mode
+DONE_MODEL = 0  # 0: on success remove the seed from the queue directly, on failure push it to the fail queue; 1: on success push to the done queue, on failure push to the fail queue
+
+# spider
+SPIDER_THREAD_NUM = 10
+SPIDER_MAX_RETRIES = 5
+SPIDER_TIME_SLEEP = 10
+
+SPIDER_MAX_COUNT = 1000  # max number of requests within the time window
+TIME_WINDOW = 60  # fixed rate-control time window (seconds)
+
+# task mode
+TASK_MODEL = 0  # 0: one-off, 1: resident (long-running)
+
+
+# bloom filter
+CAPACITY = 100000000
+ERROR_RATE = 0.001
+FILTER_FIELD = "url"
+# file download response Content-Type filter
+# FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
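Since every value above is read via `os.getenv` at import time, configuration has to be in the environment before the first cobweb import. A minimal sketch; the host and port values are examples:

import os

# Export configuration before importing anything from cobweb.
os.environ.setdefault("REDIS_HOST", "127.0.0.1")
os.environ.setdefault("REDIS_PORT", "6379")
os.environ.setdefault("REDIS_DB", "0")

from cobweb import setting

assert setting.REDIS_CONFIG["port"] == 6379
assert setting.REDIS_CONFIG["db"] == 0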
cobweb/utils/__init__.py
ADDED
cobweb/utils/bloom.py
ADDED
@@ -0,0 +1,58 @@
+import math
+import time
+
+import mmh3
+import redis
+from cobweb import setting
+
+
+class BloomFilter:
+
+    def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
+        redis_config = redis_config or setting.REDIS_CONFIG
+        capacity = capacity or setting.CAPACITY
+        error_rate = error_rate or setting.ERROR_RATE
+        redis_config['db'] = 3
+
+        self.key = key
+
+        pool = redis.ConnectionPool(**redis_config)
+        self._client = redis.Redis(connection_pool=pool)
+        self.bit_size = self.get_bit_size(capacity, error_rate)
+        self.hash_count = self.get_hash_count(self.bit_size, capacity)
+        self._init_bloom_key()
+
+    def add(self, value):
+        for seed in range(self.hash_count):
+            result = mmh3.hash(value, seed) % self.bit_size
+            self._client.setbit(self.key, result, 1)
+        return True
+
+    def exists(self, value):
+        if not self._client.exists(self.key):
+            return False
+        for seed in range(self.hash_count):
+            result = mmh3.hash(value, seed) % self.bit_size
+            if not self._client.getbit(self.key, result):
+                return False
+        return True
+
+    def _init_bloom_key(self):
+        lua_script = """
+        redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
+        redis.call("EXPIRE", KEYS[1], 604800)
+        """
+        if self._client.exists(self.key):
+            return True
+        execute = self._client.register_script(lua_script)
+        execute(keys=[self.key], args=[self.bit_size - 1, 1])
+
+    @classmethod
+    def get_bit_size(cls, n, p):
+        return int(-(n * math.log(p)) / (math.log(2) ** 2))
+
+    @classmethod
+    def get_hash_count(cls, m, n):
+        return int((m / n) * math.log(2))
+
+
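`get_bit_size` and `get_hash_count` are the standard Bloom-filter sizing formulas m = -n·ln(p) / (ln 2)² and k = (m/n)·ln 2, so the defaults (CAPACITY = 1e8, ERROR_RATE = 0.001) give a bitmap of roughly 1.44 × 10⁹ bits (about 171 MiB) probed with 9 hash rounds. A usage sketch, assuming a reachable Redis (the bitmap is forced into logical db 3 and expires after 7 days per `_init_bloom_key`); the key name is an example:

from cobweb.utils import BloomFilter

bf = BloomFilter("bloom_myproject_mytask")
if not bf.exists("https://example.com/page/1"):
    bf.add("https://example.com/page/1")

# Sizing with the defaults (CAPACITY = 1e8, ERROR_RATE = 0.001):
m = BloomFilter.get_bit_size(100_000_000, 0.001)
k = BloomFilter.get_hash_count(m, 100_000_000)
print(m, k)  # ~1.44e9 bits (about 171 MiB), 9 hash rounds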
cobweb/utils/dotting.py
ADDED
@@ -0,0 +1,32 @@
+import json
+
+from aliyun.log import LogClient, LogItem, PutLogsRequest
+from cobweb import setting
+
+
+class LoghubDot:
+
+    def __init__(self):
+        self.client = LogClient(**setting.LOGHUB_CONFIG)
+
+    def build(self, topic, **kwargs):
+
+        temp = {}
+        log_items = []
+        log_item = LogItem()
+        for key, value in kwargs.items():
+            if not isinstance(value, str):
+                temp[key] = json.dumps(value, ensure_ascii=False)
+            else:
+                temp[key] = value
+        contents = sorted(temp.items())
+        log_item.set_contents(contents)
+        log_items.append(log_item)
+        request = PutLogsRequest(
+            project="databee-download-log",
+            logstore="log",
+            topic=topic,
+            logitems=log_items,
+            compress=True
+        )
+        self.client.put_logs(request=request)