cobweb-launcher 1.2.15__py3-none-any.whl → 1.2.17__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cobweb-launcher might be problematic. Click here for more details.

cobweb/__init__.py CHANGED
@@ -1,2 +1,2 @@
1
- from .launchers import LauncherAir, LauncherPro
1
+ from .launchers import LauncherAir, LauncherPro, LauncherApi
2
2
  from .constant import CrawlerModel
cobweb/db/__init__.py CHANGED
@@ -1 +1,2 @@
1
1
  from .redis_db import RedisDB
2
+ from .api_db import ApiDB
cobweb/db/api_db.py ADDED
@@ -0,0 +1,71 @@
1
+ import json
2
+ import os
3
+ import requests
4
+
5
+
6
+ class ApiDB:
7
+
8
+ def __init__(self, host=None, **kwargs):
9
+ self.host = host or os.getenv("REDIS_API_HOST", "http://127.0.0.1:4396")
10
+
11
+ def _get_response(self, api, params: dict = None):
12
+ url = self.host + api
13
+ response = requests.get(url, params=params)
14
+ json_data = response.json()
15
+ return json_data["data"]
16
+
17
+ def _post_response(self, api, params: dict = None, data: dict = None):
18
+ url = self.host + api
19
+ headers = {"Content-Type": "application/json"}
20
+ response = requests.post(url, headers=headers, params=params, data=json.dumps(data))
21
+ json_data = response.json()
22
+ return json_data["data"]
23
+
24
+ def get(self, name):
25
+ return self._get_response(api="/get", params=dict(name=name))
26
+
27
+ def setnx(self, name, value=""):
28
+ return self._get_response(api="/setnx", params=dict(name=name, value=value))
29
+
30
+ def setex(self, name, t, value=""):
31
+ return self._get_response(api="/setex", params=dict(name=name, value=value, t=t))
32
+
33
+ def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
34
+ return self._get_response(api="/expire", params=dict(name=name, t=t, nx=nx, xx=xx, gt=gt, lt=lt))
35
+
36
+ def ttl(self, name):
37
+ return self._get_response(api="/ttl", params=dict(name=name))
38
+
39
+ def delete(self, name):
40
+ return self._get_response(api="/delete", params=dict(name=name))
41
+
42
+ def exists(self, name):
43
+ return self._get_response(api="/exists", params=dict(name=name))
44
+
45
+ def incrby(self, name, value):
46
+ return self._get_response(api="/incrby", params=dict(name=name, value=value))
47
+
48
+ def zcard(self, name) -> bool:
49
+ return self._get_response(api="/zcard", params=dict(name=name))
50
+
51
+ def zadd(self, name, item: dict, **kwargs):
52
+ return self._post_response(api="/zadd", data=dict(name=name, mapping=item, **kwargs))
53
+
54
+ def zrem(self, name, *values):
55
+ return self._post_response(api="/zrem", data=dict(name=name, values=values))
56
+
57
+ def zcount(self, name, _min, _max):
58
+ return self._get_response(api="/zcount", params=dict(name=name, min=_min, max=_max))
59
+
60
+ def lock(self, name, t=15) -> bool:
61
+ return self._get_response(api="/lock", params=dict(name=name, t=t))
62
+
63
+ def members(self, name, score, start=0, count=5000, _min="-inf", _max="+inf"):
64
+ return self._get_response(api="/members", params=dict(name=name, score=score, start=start, count=count, min=_min, max=_max))
65
+
66
+ def done(self, name: list, *values):
67
+ return self._post_response(api="/done", data=dict(name=name, values=values))
68
+
69
+
70
+
71
+
@@ -1,2 +1,3 @@
1
1
  from .launcher_air import LauncherAir
2
2
  from .launcher_pro import LauncherPro
3
+ from .launcher_api import LauncherApi
@@ -97,7 +97,8 @@ class Launcher(threading.Thread):
97
97
  self._spider_max_retries = setting.SPIDER_MAX_RETRIES
98
98
  self._spider_thread_num = setting.SPIDER_THREAD_NUM
99
99
  self._spider_time_sleep = setting.SPIDER_TIME_SLEEP
100
- self._spider_max_speed = setting.SPIDER_MAX_SPEED
100
+ self._spider_max_count = setting.SPIDER_MAX_COUNT
101
+ self._time_window = setting.TIME_WINDOW
101
102
 
102
103
  self._done_model = setting.DONE_MODEL
103
104
  self._task_model = setting.TASK_MODEL
@@ -0,0 +1,225 @@
1
+ import time
2
+ import threading
3
+
4
+ from cobweb.db import ApiDB
5
+ from cobweb.base import Seed, logger
6
+ from cobweb.constant import DealModel, LogTemplate
7
+ from .launcher import Launcher, check_pause
8
+
9
+
10
+ class LauncherApi(Launcher):
11
+
12
+ def __init__(self, task, project, custom_setting=None, **kwargs):
13
+ super().__init__(task, project, custom_setting, **kwargs)
14
+ self._db = ApiDB()
15
+
16
+ self._todo_key = "{%s:%s}:todo" % (project, task)
17
+ self._done_key = "{%s:%s}:done" % (project, task)
18
+ self._fail_key = "{%s:%s}:fail" % (project, task)
19
+ self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
20
+
21
+ self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
22
+ self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
23
+ self._speed_control_key = "speed_control:%s_%s" % (project, task)
24
+
25
+ self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
26
+
27
+ # self._bf_key = "bloom_%s_%s" % (project, task)
28
+ # self._bf = BloomFilter(self._bf_key)
29
+
30
+ self._heartbeat_start_event = threading.Event()
31
+ self._redis_queue_empty_event = threading.Event()
32
+
33
+ @property
34
+ def heartbeat(self):
35
+ return self._db.exists(self._heartbeat_key)
36
+
37
+ def statistics(self, key, count):
38
+ if not self._task_model and not self._db.exists(key):
39
+ self._db.setex(key, 86400 * 30, int(count))
40
+ else:
41
+ self._db.incrby(key, count)
42
+
43
+ def _get_seed(self) -> Seed:
44
+ """
45
+ 从队列中获取种子(频控)
46
+ 设置时间窗口为self._time_window(秒),判断在该窗口内的采集量是否满足阈值(self._spider_max_speed)
47
+ :return: True -> 种子, False -> None
48
+ """
49
+ spider_speed = self._db.get(self._speed_control_key)
50
+ if int(spider_speed or 0) > self._spider_max_count:
51
+ expire_time = self._db.ttl(self._speed_control_key)
52
+ if expire_time == -1:
53
+ self._db.delete(self._speed_control_key)
54
+ elif expire_time and expire_time > 0:
55
+ logger.info(f"Too fast! Please wait {expire_time} seconds...")
56
+ time.sleep(expire_time / 2)
57
+ return None
58
+ seed = self.__LAUNCHER_QUEUE__["todo"].pop()
59
+ if seed and not self._db.lock(self._speed_control_key, t=self._time_window):
60
+ self._db.incrby(self._speed_control_key, 1)
61
+ return seed
62
+
63
+ @check_pause
64
+ def _execute_heartbeat(self):
65
+ if self._heartbeat_start_event.is_set():
66
+ self._db.setex(self._heartbeat_key, 5)
67
+ time.sleep(3)
68
+
69
+ @check_pause
70
+ def _reset(self):
71
+ """
72
+ 检查过期种子,重新添加到redis缓存中
73
+ """
74
+ reset_wait_seconds = 30
75
+ if self._db.lock(self._reset_lock_key, t=120):
76
+
77
+ _min = -int(time.time()) + self._seed_reset_seconds \
78
+ if self.heartbeat else "-inf"
79
+
80
+ self._db.members(self._todo_key, 0, _min=_min, _max="(0")
81
+ self._db.delete(self._reset_lock_key)
82
+
83
+ if not self.heartbeat:
84
+ self._heartbeat_start_event.set()
85
+
86
+ time.sleep(reset_wait_seconds)
87
+
88
+ @check_pause
89
+ def _scheduler(self):
90
+ """
91
+ 调度任务,获取redis队列种子,同时添加到doing字典中
92
+ """
93
+ if not self._db.zcount(self._todo_key, 0, "(1000"):
94
+ time.sleep(self._scheduler_wait_seconds)
95
+ elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
96
+ time.sleep(self._todo_queue_full_wait_seconds)
97
+ else:
98
+ members = self._db.members(
99
+ self._todo_key, int(time.time()),
100
+ count=self._todo_queue_size,
101
+ _min=0, _max="(1000"
102
+ )
103
+ for member, priority in members:
104
+ seed = Seed(member, priority=priority)
105
+ self.__LAUNCHER_QUEUE__['todo'].push(seed)
106
+ self.__DOING__[seed.to_string] = seed.params.priority
107
+
108
+ @check_pause
109
+ def _insert(self):
110
+ """
111
+ 添加新种子到redis队列中
112
+ """
113
+ seeds = {}
114
+ status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
115
+ for _ in range(self._new_queue_max_size):
116
+ seed = self.__LAUNCHER_QUEUE__['new'].pop()
117
+ if seed:
118
+ seeds[seed.to_string] = seed.params.priority
119
+ if seeds:
120
+ self._db.zadd(self._todo_key, seeds, nx=True)
121
+ if status:
122
+ time.sleep(self._new_queue_wait_seconds)
123
+
124
+ @check_pause
125
+ def _refresh(self):
126
+ """
127
+ 刷新doing种子过期时间,防止reset重新消费
128
+ """
129
+ if self.__DOING__:
130
+ refresh_time = int(time.time())
131
+ seeds = {k:-refresh_time - v / 1000 for k, v in self.__DOING__.items()}
132
+ self._db.zadd(self._todo_key, item=seeds, xx=True)
133
+ time.sleep(15)
134
+
135
+ @check_pause
136
+ def _delete(self):
137
+ """
138
+ 删除队列种子,根据状态添加至成功或失败队列,移除doing字典种子索引
139
+ """
140
+ seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
141
+ status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
142
+
143
+ for _ in range(self._done_queue_max_size):
144
+ seed = self.__LAUNCHER_QUEUE__['done'].pop()
145
+ if not seed:
146
+ break
147
+ if seed.params.seed_status == DealModel.fail:
148
+ seed_info["failed"].append(seed.to_string)
149
+ elif self._done_model == 1:
150
+ seed_info["succeed"].append(seed.to_string)
151
+ else:
152
+ seed_info["common"].append(seed.to_string)
153
+ seed_info['count'] += 1
154
+
155
+ if seed_info["count"]:
156
+
157
+ succeed_count = self._db.zrem(self._todo_key, *seed_info["common"])
158
+ succeed_count += self._db.done([self._todo_key, self._done_key], *seed_info["succeed"])
159
+ failed_count = self._db.done([self._todo_key, self._fail_key], *seed_info["failed"])
160
+
161
+ if failed_count:
162
+ self.statistics(self._statistics_fail_key, failed_count)
163
+ if succeed_count:
164
+ self.statistics(self._statistics_done_key, succeed_count)
165
+
166
+ self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])
167
+
168
+ if status:
169
+ time.sleep(self._done_queue_wait_seconds)
170
+
171
+ def _polling(self):
172
+ wait_scheduler_execute = True
173
+ check_emtpy_times = 0
174
+ while not self._stop.is_set():
175
+ queue_not_empty_count = 0
176
+ pooling_wait_seconds = 30
177
+
178
+ for q in self.__LAUNCHER_QUEUE__.values():
179
+ if q.length != 0:
180
+ queue_not_empty_count += 1
181
+ wait_scheduler_execute = False
182
+
183
+ if queue_not_empty_count == 0:
184
+ pooling_wait_seconds = 3
185
+ if self._pause.is_set():
186
+ check_emtpy_times = 0
187
+ if not self._task_model and (
188
+ not wait_scheduler_execute or
189
+ int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
190
+ ):
191
+ logger.info("Done! ready to close thread...")
192
+ self._stop.set()
193
+
194
+ elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
195
+ logger.info(f"Recovery {self.task} task run!")
196
+ self._pause.clear()
197
+ self._execute()
198
+ else:
199
+ logger.info("pause! waiting for resume...")
200
+ elif check_emtpy_times > 2:
201
+ self.__DOING__ = {}
202
+ if not self._db.zcount(self._todo_key, _min="-inf", _max="(1000"):
203
+ self._pause.set()
204
+ else:
205
+ logger.info(
206
+ "check whether the task is complete, "
207
+ f"reset times {3 - check_emtpy_times}"
208
+ )
209
+ check_emtpy_times += 1
210
+ else:
211
+ logger.info(LogTemplate.launcher_pro_polling.format(
212
+ task=self.task,
213
+ doing_len=len(self.__DOING__.keys()),
214
+ todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
215
+ done_len=self.__LAUNCHER_QUEUE__['done'].length,
216
+ redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
217
+ redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
218
+ redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
219
+ upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
220
+ ))
221
+
222
+ time.sleep(pooling_wait_seconds)
223
+
224
+ logger.info("Done! Ready to close thread...")
225
+
@@ -44,7 +44,7 @@ class LauncherPro(Launcher):
44
44
 
45
45
  def _get_seed(self) -> Seed:
46
46
  spider_speed = self._db._client.get(self._speed_control_key)
47
- if int(spider_speed or 0) > self._spider_max_speed:
47
+ if int(spider_speed or 0) > self._spider_max_count:
48
48
  expire_time = self._db.ttl(self._speed_control_key)
49
49
  if expire_time == -1:
50
50
  self._db.delete(self._speed_control_key)
@@ -53,7 +53,7 @@ class LauncherPro(Launcher):
53
53
  time.sleep(expire_time / 2)
54
54
  return None
55
55
  seed = self.__LAUNCHER_QUEUE__["todo"].pop()
56
- if seed and not self._db.lock(self._speed_control_key, t=60):
56
+ if seed and not self._db.lock(self._speed_control_key, t=self._time_window):
57
57
  self._db._client.incrby(self._speed_control_key, 1)
58
58
  return seed
59
59
 
cobweb/setting.py CHANGED
@@ -58,7 +58,9 @@ DONE_MODEL = 0 # 0:种子消费成功直接从队列移除,失败则添加
58
58
  SPIDER_THREAD_NUM = 10
59
59
  SPIDER_MAX_RETRIES = 5
60
60
  SPIDER_TIME_SLEEP = 10
61
- SPIDER_MAX_SPEED = 1000 # 一分钟最大采集数
61
+
62
+ SPIDER_MAX_COUNT = 1000 # 在规定时间窗口内最大采集数
63
+ TIME_WINDOW = 60 # 频控固定时间窗口(秒)
62
64
 
63
65
  # 任务模式
64
66
  TASK_MODEL = 0 # 0:单次,1:常驻
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.2.15
3
+ Version: 1.2.17
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -1,6 +1,6 @@
1
- cobweb/__init__.py,sha256=uMHyf4Fekbyw2xBCbkA8R0LwCpBJf5p_7pWbh60ZWYk,83
1
+ cobweb/__init__.py,sha256=CBd2oByCfc5EmH2dCZYVHkxXYZG-oWrLyTtZU5sEoP0,96
2
2
  cobweb/constant.py,sha256=zy3XYsc1qp2B76_Fn_hVQ8eGHlPBd3OFlZK2cryE6FY,2839
3
- cobweb/setting.py,sha256=MGe4QGnE5XOTh9Z7NhakaTFK7f-lZtzlA9PFcuc1qoY,2145
3
+ cobweb/setting.py,sha256=47HZsw40HLpsmOmvij1lyQALPQQCN_tWlKZ0wbn2MtM,2216
4
4
  cobweb/base/__init__.py,sha256=4gwWWQ0Q8cYG9cD7Lwf4XMqRGc5M_mapS3IczR6zeCE,222
5
5
  cobweb/base/common_queue.py,sha256=W7PPZZFl52j3Mc916T0imHj7oAUelA6aKJwW-FecDPE,872
6
6
  cobweb/base/decorators.py,sha256=wDCaQ94aAZGxks9Ljc0aXq6omDXT1_yzFy83ZW6VbVI,930
@@ -13,14 +13,16 @@ cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,2
13
13
  cobweb/crawlers/base_crawler.py,sha256=ee_WSDnPQpPTk6wlFuY2UEx5L3hcsAZFcr6i3GLSry8,5751
14
14
  cobweb/crawlers/crawler.py,sha256=xiFNM0t69f5xlm59hPbO2MpqtdirVAUhD84-CLpyHPM,6349
15
15
  cobweb/crawlers/file_crawler.py,sha256=2Sjbdgxzqd41WykKUQE3QQlGai3T8k-pmHNmPlTchjQ,4454
16
- cobweb/db/__init__.py,sha256=ut0iEyBLjcJL06WNG_5_d4hO5PJWvDrKWMkDOdmgh2M,30
16
+ cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
17
+ cobweb/db/api_db.py,sha256=53-avN4zkqWLxmRRils8K3RFx2lBG5jNOuNFOHDMCD8,2663
17
18
  cobweb/db/redis_db.py,sha256=fumNZJiio-uQqRcSrymx8eJ1PqsdOwITe_Y-9JOXxrQ,4298
18
19
  cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
19
20
  cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
20
- cobweb/launchers/__init__.py,sha256=af0Y6wrGX8SQZ7w7XL2sOtREjCT3dwad-uCc3nIontY,76
21
- cobweb/launchers/launcher.py,sha256=AbkrytfJEyj8FhTbLgjmOOIuvOYV3cpVknE9yt31WbM,6930
21
+ cobweb/launchers/__init__.py,sha256=qMuVlQcjErVK67HyKFZEsXf_rfZD5ODjx1QucSCKMOM,114
22
+ cobweb/launchers/launcher.py,sha256=SK4f3Fpuv-QMMriHruXGQ1sh1lxT1DZ2PdG0p2wAzNw,6978
22
23
  cobweb/launchers/launcher_air.py,sha256=KAk_M8F3029cXYe7m4nn3Nzyi89lbxJ2cqZjqW8iZ0E,2832
23
- cobweb/launchers/launcher_pro.py,sha256=8QKhToKoD2WonIaqRQAhUWRhbNOIgYXzGFRK1id_3yM,8638
24
+ cobweb/launchers/launcher_api.py,sha256=Gcaj38_CdKN6dqzju9NtWBAwbm7KM4J2BG0STXnIh8Y,8877
25
+ cobweb/launchers/launcher_pro.py,sha256=2JdN4khAFGBWLXDKaWknjG72XBhfTAnwtrgVmkF0K3M,8653
24
26
  cobweb/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
25
27
  cobweb/pipelines/base_pipeline.py,sha256=fYnWf79GmhufXpcnMa3te18SbmnVeYLwxfyo-zLd9CY,1577
26
28
  cobweb/pipelines/loghub_pipeline.py,sha256=cjPO6w6UJ0jNw2fVvdX0BCdlm58T7dmYXlxzXOBpvfY,1027
@@ -31,8 +33,8 @@ cobweb/utils/__init__.py,sha256=vBtZTy3EfRE0MmH43URhmr7nw6_oOWTEbGOM9xR_9o8,78
31
33
  cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
32
34
  cobweb/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
33
35
  cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
34
- cobweb_launcher-1.2.15.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
35
- cobweb_launcher-1.2.15.dist-info/METADATA,sha256=rP73tkfQPQB8MCEYg31ZOdEbBYUUoyXzAcUevfnwqec,6510
36
- cobweb_launcher-1.2.15.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
37
- cobweb_launcher-1.2.15.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
38
- cobweb_launcher-1.2.15.dist-info/RECORD,,
36
+ cobweb_launcher-1.2.17.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
37
+ cobweb_launcher-1.2.17.dist-info/METADATA,sha256=z1BLGBzB7XsJtv15rxaT_IZRdeGNevCtMXdpzzLKB1k,6510
38
+ cobweb_launcher-1.2.17.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
39
+ cobweb_launcher-1.2.17.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
40
+ cobweb_launcher-1.2.17.dist-info/RECORD,,