cobweb-launcher 0.1.7__py3-none-any.whl → 1.2.41__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. cobweb/__init__.py +2 -11
  2. cobweb/base/__init__.py +9 -0
  3. cobweb/base/basic.py +297 -0
  4. cobweb/base/common_queue.py +30 -0
  5. cobweb/base/decorators.py +40 -0
  6. cobweb/base/dotting.py +35 -0
  7. cobweb/base/item.py +46 -0
  8. cobweb/{log.py → base/log.py} +4 -6
  9. cobweb/base/request.py +82 -0
  10. cobweb/base/response.py +23 -0
  11. cobweb/base/seed.py +114 -0
  12. cobweb/constant.py +94 -0
  13. cobweb/crawlers/__init__.py +1 -0
  14. cobweb/crawlers/base_crawler.py +144 -0
  15. cobweb/crawlers/crawler.py +209 -0
  16. cobweb/crawlers/file_crawler.py +98 -0
  17. cobweb/db/__init__.py +2 -2
  18. cobweb/db/api_db.py +82 -0
  19. cobweb/db/redis_db.py +125 -218
  20. cobweb/exceptions/__init__.py +1 -0
  21. cobweb/exceptions/oss_db_exception.py +28 -0
  22. cobweb/launchers/__init__.py +3 -0
  23. cobweb/launchers/launcher.py +235 -0
  24. cobweb/launchers/launcher_air.py +88 -0
  25. cobweb/launchers/launcher_api.py +209 -0
  26. cobweb/launchers/launcher_pro.py +208 -0
  27. cobweb/pipelines/__init__.py +3 -0
  28. cobweb/pipelines/pipeline.py +69 -0
  29. cobweb/pipelines/pipeline_console.py +22 -0
  30. cobweb/pipelines/pipeline_loghub.py +34 -0
  31. cobweb/schedulers/__init__.py +3 -0
  32. cobweb/schedulers/scheduler_api.py +72 -0
  33. cobweb/schedulers/scheduler_redis.py +72 -0
  34. cobweb/setting.py +67 -6
  35. cobweb/utils/__init__.py +5 -0
  36. cobweb/utils/bloom.py +58 -0
  37. cobweb/utils/dotting.py +32 -0
  38. cobweb/utils/oss.py +94 -0
  39. cobweb/utils/tools.py +42 -0
  40. cobweb_launcher-1.2.41.dist-info/METADATA +205 -0
  41. cobweb_launcher-1.2.41.dist-info/RECORD +44 -0
  42. {cobweb_launcher-0.1.7.dist-info → cobweb_launcher-1.2.41.dist-info}/WHEEL +1 -1
  43. cobweb/bbb.py +0 -191
  44. cobweb/db/oss_db.py +0 -127
  45. cobweb/db/scheduler/__init__.py +0 -0
  46. cobweb/db/scheduler/default.py +0 -8
  47. cobweb/db/scheduler/textfile.py +0 -27
  48. cobweb/db/storer/__init__.py +0 -0
  49. cobweb/db/storer/console.py +0 -9
  50. cobweb/db/storer/loghub.py +0 -54
  51. cobweb/db/storer/redis.py +0 -15
  52. cobweb/db/storer/textfile.py +0 -15
  53. cobweb/decorators.py +0 -16
  54. cobweb/distributed/__init__.py +0 -0
  55. cobweb/distributed/launcher.py +0 -243
  56. cobweb/distributed/models.py +0 -143
  57. cobweb/interface.py +0 -34
  58. cobweb/single/__init__.py +0 -0
  59. cobweb/single/launcher.py +0 -231
  60. cobweb/single/models.py +0 -134
  61. cobweb/single/nest.py +0 -153
  62. cobweb/task.py +0 -50
  63. cobweb/utils.py +0 -90
  64. cobweb_launcher-0.1.7.dist-info/METADATA +0 -45
  65. cobweb_launcher-0.1.7.dist-info/RECORD +0 -31
  66. {cobweb_launcher-0.1.7.dist-info → cobweb_launcher-1.2.41.dist-info}/LICENSE +0 -0
  67. {cobweb_launcher-0.1.7.dist-info → cobweb_launcher-1.2.41.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,208 @@
1
+ import time
2
+ import threading
3
+
4
+ from cobweb.db import RedisDB
5
+ from cobweb.base import Seed, logger
6
+ from cobweb.utils import BloomFilter
7
+ from cobweb.constant import DealModel, LogTemplate
8
+ from .launcher import Launcher, check_pause
9
+
10
+
11
class LauncherPro(Launcher):
    """Launcher that keeps its seed queue in a Redis sorted set.

    Scores in the ``todo`` sorted set are used in two bands, as reported by
    the polling log below: ``[0, 1000)`` counts as "todo" and ``(-inf, 0)``
    counts as "doing".

    NOTE(review): this class was reconstructed from a diff rendering that had
    lost all indentation; block nesting follows the if/elif/else structure
    but should be confirmed against the packaged source.
    """

    def __init__(self, task, project, custom_setting=None, **kwargs):
        """Build the Redis key names for ``project``/``task`` and open the DB.

        :param task: task name, used in every Redis key.
        :param project: project name, used in every Redis key.
        :param custom_setting: forwarded unchanged to ``Launcher.__init__``.
        :param kwargs: forwarded unchanged to ``Launcher.__init__``.
        """
        super().__init__(task, project, custom_setting, **kwargs)
        self._todo_key = "{%s:%s}:todo" % (project, task)
        self._done_key = "{%s:%s}:done" % (project, task)
        self._fail_key = "{%s:%s}:fail" % (project, task)
        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)

        self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
        self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
        self._speed_control_key = "speed_control:%s_%s" % (project, task)

        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)

        # self._bf_key = "bloom_%s_%s" % (project, task)
        #
        self._db = RedisDB()
        #
        # self._bf = BloomFilter(self._bf_key)

        self._heartbeat_start_event = threading.Event()
        self._redis_queue_empty_event = threading.Event()

    @property
    def heartbeat(self):
        """Return whether the heartbeat key currently exists in Redis."""
        return self._db.exists(self._heartbeat_key)

    def statistics(self, key, count):
        """Add ``count`` to the counter stored at ``key``.

        When ``_task_model`` is falsy and the key does not exist yet, the
        counter is created with a 30-day expiry; otherwise it is incremented.
        """
        if not self._task_model and not self._db.exists(key):
            self._db.setex(key, 86400 * 30, int(count))
        else:
            self._db._client.incrby(key, count)

    def _get_seed(self) -> Seed:
        """Pop the next seed from the local todo queue, honouring speed control.

        Returns ``None`` without popping when the speed-control counter is
        above ``_spider_max_count``.
        """
        spider_speed = self._db._client.get(self._speed_control_key)
        if int(spider_speed or 0) > self._spider_max_count:
            expire_time = self._db.ttl(self._speed_control_key)
            if expire_time == -1:
                # Counter exists without an expiry: drop it so the window restarts.
                self._db.delete(self._speed_control_key)
            elif expire_time and expire_time > 0:
                logger.info(f"Too fast! Please wait {expire_time} seconds...")
                time.sleep(expire_time / 2)
            # Any other TTL (e.g. -2: the key expired between the GET and the
            # TTL call) means there is nothing to wait for; sleeping on a
            # negative value would raise ValueError.
            return None
        seed = self.__LAUNCHER_QUEUE__["todo"].pop()
        # lock() returning falsy means the window key already exists, so count
        # this seed against the current window.
        if seed and not self._db.lock(self._speed_control_key, t=self._time_window):
            self._db._client.incrby(self._speed_control_key, 1)
        return seed

    @check_pause
    def _execute_heartbeat(self):
        """Refresh the 5-second heartbeat key once the start event is set."""
        if self._heartbeat_start_event.is_set():
            self._db.setex(self._heartbeat_key, 5)
        # NOTE(review): sleep assumed to be unconditional — confirm nesting.
        time.sleep(3)

    @check_pause
    def _reset(self):
        """Check for expired seeds and put them back into the Redis queue."""
        reset_wait_seconds = 30
        if self._db.lock(self._reset_lock_key, t=120):

            # With a live heartbeat only seeds older than the reset window are
            # selected; without one, every negative-score seed is selected.
            _min = -int(time.time()) + self._seed_reset_seconds \
                if self.heartbeat else "-inf"

            self._db.members(self._todo_key, 0, _min=_min, _max="(0")
            self._db.delete(self._reset_lock_key)

            # NOTE(review): assumed to sit inside the lock branch — confirm.
            if not self.heartbeat:
                self._heartbeat_start_event.set()

        time.sleep(reset_wait_seconds)

    @check_pause
    def _scheduler(self):
        """Fetch seeds from the Redis queue and record them in the doing dict."""
        if not self._db.zcount(self._todo_key, 0, "(1000"):
            time.sleep(self._scheduler_wait_seconds)
        elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
            time.sleep(self._todo_queue_full_wait_seconds)
        else:
            members = self._db.members(
                self._todo_key, int(time.time()),
                count=self._todo_queue_size,
                _min=0, _max="(1000"
            )
            for member, priority in members:
                seed = Seed(member, priority=priority)
                self.__LAUNCHER_QUEUE__['todo'].push(seed)
                self.__DOING__[seed.to_string] = seed.params.priority

    @check_pause
    def _insert(self):
        """Add new seeds from the local ``new`` queue to the Redis queue."""
        seeds = {}
        # Decide up front whether to wait afterwards: only when the queue was
        # below its maximum size before draining.
        status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
        for _ in range(self._new_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['new'].pop()
            if seed:
                seeds[seed.to_string] = seed.params.priority
        if seeds:
            self._db.zadd(self._todo_key, seeds, nx=True)
        if status:
            time.sleep(self._new_queue_wait_seconds)

    @check_pause
    def _refresh(self):
        """Refresh the score of in-progress seeds so ``_reset`` does not re-queue them."""
        if self.__DOING__:
            refresh_time = int(time.time())
            seeds = {k: -refresh_time - v / 1000 for k, v in self.__DOING__.items()}
            self._db.zadd(self._todo_key, item=seeds, xx=True)
        time.sleep(15)

    @check_pause
    def _delete(self):
        """Remove finished seeds from the Redis queue and from the doing dict."""
        seed_list = []
        status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size

        for _ in range(self._done_queue_max_size):
            seed = self.__LAUNCHER_QUEUE__['done'].pop()
            if not seed:
                break
            seed_list.append(seed.to_string)

        if seed_list:
            self._db.zrem(self._todo_key, *seed_list)
            self._remove_doing_seeds(seed_list)

        if status:
            time.sleep(self._done_queue_wait_seconds)

    def _polling(self):
        """Monitor the local queues until the stop event is set.

        While any local queue is non-empty a status line is logged every 30
        seconds. Once all are empty the loop checks every 3 seconds whether to
        pause, resume, or stop the task.
        """
        wait_scheduler_execute = True
        check_empty_times = 0
        while not self._stop.is_set():
            queue_not_empty_count = 0
            pooling_wait_seconds = 30

            for q in self.__LAUNCHER_QUEUE__.values():
                if q.length != 0:
                    queue_not_empty_count += 1
                    wait_scheduler_execute = False

            if queue_not_empty_count == 0:
                pooling_wait_seconds = 3
                if self._pause.is_set():
                    check_empty_times = 0
                    if not self._task_model and (
                        not wait_scheduler_execute or
                        int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
                    ):
                        logger.info("Done! ready to close thread...")
                        self._stop.set()

                    elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
                        logger.info(f"Recovery {self.task} task run!")
                        self._pause.clear()
                        self._execute()
                    else:
                        logger.info("pause! waiting for resume...")
                elif check_empty_times > 2:
                    # Local queues stayed empty for several checks: forget the
                    # local doing index and pause if Redis has nothing below 1000.
                    self.__DOING__ = {}
                    if not self._db.zcount(self._todo_key, _min="-inf", _max="(1000"):
                        self._pause.set()
                else:
                    logger.info(
                        "check whether the task is complete, "
                        f"reset times {3 - check_empty_times}"
                    )
                    check_empty_times += 1
            else:
                logger.info(LogTemplate.launcher_pro_polling.format(
                    task=self.task,
                    doing_len=len(self.__DOING__.keys()),
                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
                    redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
                    redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
                    redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
                ))

            time.sleep(pooling_wait_seconds)

        logger.info("Done! Ready to close thread...")
208
+
@@ -0,0 +1,3 @@
1
+ from .pipeline import Pipeline
2
+ from .pipeline_console import Console
3
+ from .pipeline_loghub import Loghub
@@ -0,0 +1,69 @@
1
+ import time
2
+ import threading
3
+
4
+ from abc import ABC, abstractmethod
5
+ from cobweb.base import BaseItem, Queue, logger
6
+
7
+
8
class Pipeline(threading.Thread, ABC):
    """Background thread that drains the upload queue in batches.

    Subclasses implement :meth:`build` (item -> record) and :meth:`upload`
    (send one table's records). After each batch the seeds of the processed
    items are pushed to the ``done`` queue; if anything failed, every seed in
    the batch is first marked with ``seed_status = "deal model: fail"``.
    """

    def __init__(
            self,
            stop: threading.Event,
            pause: threading.Event,
            upload: "Queue", done: "Queue",
            upload_size: int,
            wait_seconds: int
    ):
        """
        :param stop: event that ends :meth:`run` once set.
        :param pause: pause event; stored but not read by this class.
        :param upload: queue of items waiting to be uploaded.
        :param done: queue that receives the seeds of processed items.
        :param upload_size: maximum number of items taken per batch.
        :param wait_seconds: sleep used while waiting for the queue to fill.
        """
        super().__init__()
        # Must not be stored as ``self._stop``: on CPython <= 3.12 that name is
        # a private threading.Thread method used by join()/is_alive(), and
        # shadowing it with an Event makes those calls raise TypeError.
        self._stop_event = stop
        self._pause = pause
        self._upload = upload
        self._done = done

        self.upload_size = upload_size
        self.wait_seconds = wait_seconds

    @abstractmethod
    def build(self, item: "BaseItem") -> dict:
        """Convert one item into the record handed to :meth:`upload`."""
        pass

    @abstractmethod
    def upload(self, table: str, data: list) -> bool:
        """Send the records ``data`` that belong to ``table``."""
        pass

    def run(self):
        """Process upload batches until the stop event is set."""
        while not self._stop_event.is_set():
            if not self._upload.length:
                time.sleep(self.wait_seconds)
                continue
            # Fewer items than a full batch: wait once for more, then go on
            # with whatever is available.
            if self._upload.length < self.upload_size:
                time.sleep(self.wait_seconds)
            status = True
            data_info, seeds = {}, []
            try:
                for _ in range(self.upload_size):
                    item = self._upload.pop()
                    if not item:
                        break
                    seeds.append(item.seed)
                    data = self.build(item)
                    data_info.setdefault(item.table, []).append(data)
                for table, datas in data_info.items():
                    # One failing table must not stop the remaining tables.
                    try:
                        self.upload(table, datas)
                    except Exception as e:
                        logger.info(e)
                        status = False
            except Exception as e:
                logger.info(e)
                status = False
            if not status:
                for seed in seeds:
                    seed.params.seed_status = "deal model: fail"
            if seeds:
                self._done.push(seeds)

        logger.info("upload pipeline close!")
68
+
69
+
@@ -0,0 +1,22 @@
1
+ from cobweb.base import ConsoleItem, logger
2
+ from cobweb.constant import LogTemplate
3
+ from cobweb.pipelines import Pipeline
4
+
5
+
6
class Console(Pipeline):
    """Pipeline that writes parsed items to the logger instead of a store."""

    def build(self, item: ConsoleItem):
        """Return the item's seed and data as plain dictionaries."""
        seed_info = item.seed.to_dict
        parsed = item.to_dict
        return {"seed": seed_info, "data": parsed}

    def upload(self, table, datas):
        """Log every record; parse details longer than 500 characters are cut."""
        max_length = 500
        omitted_note = " ...\n" + " " * 12 + "-- Text is too long and details are omitted!"
        for record in datas:
            parse_detail = LogTemplate.log_info(record["data"])
            if len(parse_detail) > max_length:
                parse_detail = parse_detail[:max_length] + omitted_note
            seed_detail = LogTemplate.log_info(record["seed"])
            message = LogTemplate.console_item.format(
                seed_detail=seed_detail,
                parse_detail=parse_detail
            )
            logger.info(message)
@@ -0,0 +1,34 @@
1
+ import json
2
+
3
+ from cobweb import setting
4
+ from cobweb.base import BaseItem
5
+ from cobweb.pipelines import Pipeline
6
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
7
+
8
+
9
class Loghub(Pipeline):
    """Pipeline that uploads items to Aliyun Loghub."""

    def __init__(self, *args, **kwargs):
        """Create the pipeline and a ``LogClient`` from ``setting.LOGHUB_CONFIG``."""
        super().__init__(*args, **kwargs)
        self.client = LogClient(**setting.LOGHUB_CONFIG)

    def build(self, item: BaseItem):
        """Turn an item into a ``LogItem`` whose non-string values are JSON-encoded."""
        entry = LogItem()
        fields = item.to_dict
        for name, raw in fields.items():
            if isinstance(raw, str):
                continue
            # Only existing keys are reassigned, so iterating stays safe.
            fields[name] = json.dumps(raw, ensure_ascii=False)
        entry.set_contents(sorted(fields.items()))
        return entry

    def upload(self, table, datas):
        """Send ``datas`` to the logstore named ``table`` in one compressed request."""
        self.client.put_logs(request=PutLogsRequest(
            project=setting.LOGHUB_PROJECT,
            logstore=table,
            topic=setting.LOGHUB_TOPIC,
            source=setting.LOGHUB_SOURCE,
            logitems=datas,
            compress=True
        ))
@@ -0,0 +1,3 @@
1
+ from .scheduler_redis import RedisScheduler
2
+ from .scheduler_api import ApiScheduler
3
+
@@ -0,0 +1,72 @@
1
+ import threading
2
+ import time
3
+
4
+ # from cobweb.base import Seed
5
+ from cobweb.db import ApiDB
6
+
7
+
8
class ApiScheduler:
    """Seed scheduler backed by ``ApiDB`` sorted-set operations.

    NOTE(review): reconstructed from a diff rendering that had lost its
    indentation; confirm block nesting against the packaged source.
    """

    def __init__(self, task, project, scheduler_wait_seconds=30):
        """
        :param task: task name, used in every key.
        :param project: project name, used in every key.
        :param scheduler_wait_seconds: sleep used by :meth:`schedule` when the
            queried score range is empty.
        """
        self._todo_key = "{%s:%s}:todo" % (project, task)
        self._download_key = "{%s:%s}:download" % (project, task)
        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
        self._speed_control_key = "speed_control:%s_%s" % (project, task)
        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
        self._db = ApiDB()

        self.scheduler_wait_seconds = scheduler_wait_seconds
        self.working = threading.Event()

    @property
    def heartbeat(self):
        """Return whether the heartbeat key currently exists."""
        return self._db.exists(self._heartbeat_key)

    def set_heartbeat(self):
        """Write the heartbeat key with a 5-second expiry."""
        return self._db.setex(self._heartbeat_key, 5)

    def schedule(self, key, count):
        """Yield up to ``count`` ``(member, priority)`` pairs scored in ``[0, 1000)``.

        This is a generator: nothing runs until it is iterated. When the
        range is empty it sleeps ``scheduler_wait_seconds`` and yields nothing.
        """
        if not self._db.zcount(key, 0, "(1000"):
            time.sleep(self.scheduler_wait_seconds)
        else:
            source = int(time.time())
            members = self._db.members(key, source, count=count, _min=0, _max="(1000")
            for member, priority in members:
                # seed = Seed(member, priority=priority)
                yield member, priority

    def insert(self, key, items):
        """Add ``items`` to ``key`` without overwriting existing members."""
        if items:
            self._db.zadd(key, items, nx=True)

    def reset(self, keys, reset_time=30):
        """Re-queue negative-score members of ``keys`` while holding the reset lock.

        :param keys: one key or a list of keys.
        :param reset_time: age threshold used when a heartbeat exists; without
            a heartbeat every negative-score member is selected.
        """
        if not self._db.lock(self._reset_lock_key, t=120):
            return
        try:
            if isinstance(keys, str):
                keys = [keys]

            _min = reset_time - int(time.time()) if self.heartbeat else "-inf"

            for key in keys:
                if self._db.exists(key):
                    self._db.members(key, 0, _min=_min, _max="(0")

            if not self.heartbeat:
                self.working.set()
                # NOTE(review): sleep assumed to belong to this branch — confirm.
                time.sleep(10)
        finally:
            # Release the lock even when a DB call fails, instead of leaving
            # it held until its 120-second expiry.
            self._db.delete(self._reset_lock_key)

    def refresh(self, key, items: dict[str, int]):
        """Rewrite the scores of existing members to ``-now - priority / 1000``."""
        refresh_time = int(time.time())
        its = {k: -refresh_time - v / 1000 for k, v in items.items()}
        if its:
            self._db.zadd(key, item=its, xx=True)

    def delete(self, key, values):
        """Remove ``values`` from the sorted set at ``key``."""
        if values:
            self._db.zrem(key, *values)
69
+
70
+
71
+
72
+
@@ -0,0 +1,72 @@
1
+ import threading
2
+ import time
3
+
4
+ # from cobweb.base import Seed
5
+ from cobweb.db import RedisDB
6
+
7
+
8
class RedisScheduler:
    """Seed scheduler backed by ``RedisDB`` sorted-set operations.

    NOTE(review): reconstructed from a diff rendering that had lost its
    indentation; confirm block nesting against the packaged source.
    """

    def __init__(self, task, project, scheduler_wait_seconds=30):
        """
        :param task: task name, used in every key.
        :param project: project name, used in every key.
        :param scheduler_wait_seconds: sleep used by :meth:`schedule` when the
            queried score range is empty.
        """
        self._todo_key = "{%s:%s}:todo" % (project, task)
        self._download_key = "{%s:%s}:download" % (project, task)
        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
        self._speed_control_key = "speed_control:%s_%s" % (project, task)
        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
        self._db = RedisDB()

        self.scheduler_wait_seconds = scheduler_wait_seconds
        self.working = threading.Event()

    @property
    def heartbeat(self):
        """Return whether the heartbeat key currently exists."""
        return self._db.exists(self._heartbeat_key)

    def set_heartbeat(self):
        """Write the heartbeat key with a 5-second expiry."""
        return self._db.setex(self._heartbeat_key, 5)

    def schedule(self, key, count):
        """Yield up to ``count`` ``(member, priority)`` pairs scored in ``[0, 1000)``.

        This is a generator: nothing runs until it is iterated. When the
        range is empty it sleeps ``scheduler_wait_seconds`` and yields nothing.
        """
        if not self._db.zcount(key, 0, "(1000"):
            time.sleep(self.scheduler_wait_seconds)
        else:
            source = int(time.time())
            members = self._db.members(key, source, count=count, _min=0, _max="(1000")
            for member, priority in members:
                # seed = Seed(member, priority=priority)
                yield member, priority

    def insert(self, key, items):
        """Add ``items`` to ``key`` without overwriting existing members."""
        if items:
            self._db.zadd(key, items, nx=True)

    def reset(self, keys, reset_time=30):
        """Re-queue negative-score members of ``keys`` while holding the reset lock.

        :param keys: one key or a list of keys.
        :param reset_time: age threshold used when a heartbeat exists; without
            a heartbeat every negative-score member is selected.
        """
        if not self._db.lock(self._reset_lock_key, t=120):
            return
        try:
            if isinstance(keys, str):
                keys = [keys]

            _min = reset_time - int(time.time()) if self.heartbeat else "-inf"

            for key in keys:
                if self._db.exists(key):
                    self._db.members(key, 0, _min=_min, _max="(0")

            if not self.heartbeat:
                self.working.set()
                # NOTE(review): sleep assumed to belong to this branch — confirm.
                time.sleep(10)
        finally:
            # Release the lock even when a DB call fails, instead of leaving
            # it held until its 120-second expiry.
            self._db.delete(self._reset_lock_key)

    def refresh(self, key, items: dict[str, int]):
        """Rewrite the scores of existing members to ``-now - priority / 1000``."""
        refresh_time = int(time.time())
        its = {k: -refresh_time - v / 1000 for k, v in items.items()}
        if its:
            self._db.zadd(key, item=its, xx=True)

    def delete(self, key, values):
        """Remove ``values`` from the sorted set at ``key``."""
        if values:
            self._db.zrem(key, *values)
69
+
70
+
71
+
72
+
cobweb/setting.py CHANGED
@@ -1,13 +1,74 @@
1
1
  import os
2
2
 
3
# Redis connection settings, read from environment variables.
# REDIS_PORT falls back to 6379 and REDIS_DB to 0 when unset; host and
# password are None when their variables are unset.
REDIS_CONFIG = {
    "host": os.getenv("REDIS_HOST"),
    "password": os.getenv("REDIS_PASSWORD"),
    "port": int(os.getenv("REDIS_PORT", 6379)),
    "db": int(os.getenv("REDIS_DB", 0)),
}
3
10
 
4
- # model: 0, 1, 2
5
- MODEL = int(os.getenv("MODEL", "0"))
11
# Loghub settings, read from environment variables (None when unset).
LOGHUB_TOPIC = os.getenv("LOGHUB_TOPIC")
LOGHUB_SOURCE = os.getenv("LOGHUB_SOURCE")
LOGHUB_PROJECT = os.getenv("LOGHUB_PROJECT")
# Keyword arguments for the Loghub client constructor.
LOGHUB_CONFIG = {
    "endpoint": os.getenv("LOGHUB_ENDPOINT"),
    "accessKeyId": os.getenv("LOGHUB_ACCESS_KEY"),
    "accessKey": os.getenv("LOGHUB_SECRET_KEY")
}
6
20
 
7
- # 重制score值的等待时间, 默认10分钟
8
- RESET_SCORE = int(os.getenv("RESET_SCORE", "600"))
21
# OSS utility settings; credentials and endpoint come from environment
# variables (None when unset).
OSS_BUCKET = os.getenv("OSS_BUCKET")
OSS_ENDPOINT = os.getenv("OSS_ENDPOINT")
OSS_ACCESS_KEY = os.getenv("OSS_ACCESS_KEY")
OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
OSS_CHUNK_SIZE = 10 * 1024 ** 2  # 10 * 1024 * 1024 (presumably bytes)
OSS_MIN_UPLOAD_SIZE = 1024
9
28
 
10
- # 默认设置检查spider queue队列锁的存活时间为30s
11
- CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", 30))
12
29
 
30
# Crawler selection
CRAWLER = "cobweb.crawlers.Crawler"

# Data storage pipeline
PIPELINE = "cobweb.pipelines.pipeline_console.Console"


# Launcher wait times

BEFORE_SCHEDULER_WAIT_SECONDS = 60  # wait before scheduling; only applies to one-shot tasks
SCHEDULER_WAIT_SECONDS = 15  # scheduler wait time
TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait time when the todo queue is full
NEW_QUEUE_WAIT_SECONDS = 30  # new queue wait time
DONE_QUEUE_WAIT_SECONDS = 5  # done queue wait time
UPLOAD_QUEUE_WAIT_SECONDS = 15  # upload queue wait time
SEED_RESET_SECONDS = 30  # seed reset time


# Launcher queue sizes
TODO_QUEUE_SIZE = 100  # todo queue size
NEW_QUEUE_MAX_SIZE = 100  # new queue size
DONE_QUEUE_MAX_SIZE = 100  # done queue size
UPLOAD_QUEUE_MAX_SIZE = 100  # upload queue size

# DONE_MODEL IN (0, 1): seed completion mode
# 0: a successfully consumed seed is removed from the queue, a failed one is
#    added to the fail queue
# 1: a successfully consumed seed is added to the success queue, a failed one
#    is added to the fail queue
DONE_MODEL = 0

# spider
SPIDER_THREAD_NUM = 10
SPIDER_MAX_RETRIES = 5
SPIDER_TIME_SLEEP = 10

SPIDER_MAX_COUNT = 1000  # maximum number of crawls within the time window
TIME_WINDOW = 60  # fixed rate-control time window (seconds)

# Task mode
TASK_MODEL = 0  # 0: one-shot, 1: resident

# Bloom filter
CAPACITY = 100000000
ERROR_RATE = 0.001
FILTER_FIELD = "url"
# Response content-type filter for file downloads
# FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
@@ -0,0 +1,5 @@
1
+ from .oss import OssUtil
2
+ from .tools import *
3
+ from .bloom import BloomFilter
4
+ from .dotting import LoghubDot
5
+
cobweb/utils/bloom.py ADDED
@@ -0,0 +1,58 @@
1
+ import math
2
+ import time
3
+
4
+ import mmh3
5
+ import redis
6
+ from cobweb import setting
7
+
8
+
9
class BloomFilter:
    """Bloom filter stored as a Redis bitmap (always in Redis db 3)."""

    def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
        """
        :param key: Redis key that holds the bitmap.
        :param redis_config: connection kwargs; defaults to ``setting.REDIS_CONFIG``.
            The ``db`` entry is always overridden with 3.
        :param capacity: expected number of elements; defaults to ``setting.CAPACITY``.
        :param error_rate: target false-positive rate; defaults to ``setting.ERROR_RATE``.
        """
        # Work on a copy: writing ``db`` into the shared settings dict (or the
        # caller's dict) would move every later Redis client to db 3 as well.
        redis_config = dict(redis_config or setting.REDIS_CONFIG)
        capacity = capacity or setting.CAPACITY
        error_rate = error_rate or setting.ERROR_RATE
        redis_config['db'] = 3

        self.key = key

        pool = redis.ConnectionPool(**redis_config)
        self._client = redis.Redis(connection_pool=pool)
        self.bit_size = self.get_bit_size(capacity, error_rate)
        self.hash_count = self.get_hash_count(self.bit_size, capacity)
        self._init_bloom_key()

    def add(self, value):
        """Set every bit that ``value`` hashes to; always returns True."""
        for seed in range(self.hash_count):
            result = mmh3.hash(value, seed) % self.bit_size
            self._client.setbit(self.key, result, 1)
        return True

    def exists(self, value):
        """Return True when every bit for ``value`` is set (may be a false positive)."""
        if not self._client.exists(self.key):
            return False
        for seed in range(self.hash_count):
            result = mmh3.hash(value, seed) % self.bit_size
            if not self._client.getbit(self.key, result):
                return False
        return True

    def _init_bloom_key(self):
        """Create the bitmap with a 604800-second (7-day) expiry if it is missing."""
        # Setting the last bit allocates the bitmap; SETBIT and EXPIRE run in
        # one script call.
        lua_script = """
        redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
        redis.call("EXPIRE", KEYS[1], 604800)
        """
        if self._client.exists(self.key):
            return True
        execute = self._client.register_script(lua_script)
        execute(keys=[self.key], args=[self.bit_size-1, 1])

    @classmethod
    def get_bit_size(cls, n, p):
        """Return the bitmap size ``-n * ln(p) / ln(2)^2`` truncated to an int."""
        return int(-(n * math.log(p)) / (math.log(2) ** 2))

    @classmethod
    def get_hash_count(cls, m, n):
        """Return the hash count ``(m / n) * ln(2)`` truncated to an int, at least 1.

        Zero hash functions would make ``add`` a no-op and ``exists`` always
        true, so the result is clamped to 1.
        """
        return max(1, int((m / n) * math.log(2)))
57
+
58
+
@@ -0,0 +1,32 @@
1
+ import json
2
+
3
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
4
+ from cobweb import setting
5
+
6
+
7
class LoghubDot:
    """Sends single tracking records to a fixed Loghub project and logstore."""

    def __init__(self):
        """Create a ``LogClient`` from ``setting.LOGHUB_CONFIG``."""
        self.client = LogClient(**setting.LOGHUB_CONFIG)

    def build(self, topic, **kwargs):
        """Build one log item from ``kwargs`` and upload it under ``topic``.

        Non-string values are JSON-encoded; fields are sent sorted by key.
        """
        entry = LogItem()
        fields = {
            name: raw if isinstance(raw, str) else json.dumps(raw, ensure_ascii=False)
            for name, raw in kwargs.items()
        }
        entry.set_contents(sorted(fields.items()))
        self.client.put_logs(request=PutLogsRequest(
            project="databee-download-log",
            logstore="log",
            topic=topic,
            logitems=[entry],
            compress=True
        ))