cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. cobweb/__init__.py +2 -11
  2. cobweb/base/__init__.py +9 -0
  3. cobweb/base/basic.py +297 -0
  4. cobweb/base/common_queue.py +30 -0
  5. cobweb/base/decorators.py +40 -0
  6. cobweb/base/dotting.py +35 -0
  7. cobweb/base/item.py +46 -0
  8. cobweb/{log.py → base/log.py} +4 -6
  9. cobweb/base/request.py +82 -0
  10. cobweb/base/response.py +23 -0
  11. cobweb/base/seed.py +114 -0
  12. cobweb/constant.py +94 -0
  13. cobweb/crawlers/__init__.py +1 -0
  14. cobweb/crawlers/base_crawler.py +144 -0
  15. cobweb/crawlers/crawler.py +209 -0
  16. cobweb/crawlers/file_crawler.py +98 -0
  17. cobweb/db/__init__.py +2 -2
  18. cobweb/db/api_db.py +82 -0
  19. cobweb/db/redis_db.py +125 -218
  20. cobweb/exceptions/__init__.py +1 -0
  21. cobweb/exceptions/oss_db_exception.py +28 -0
  22. cobweb/launchers/__init__.py +3 -0
  23. cobweb/launchers/launcher.py +235 -0
  24. cobweb/launchers/launcher_air.py +88 -0
  25. cobweb/launchers/launcher_api.py +209 -0
  26. cobweb/launchers/launcher_pro.py +208 -0
  27. cobweb/pipelines/__init__.py +3 -0
  28. cobweb/pipelines/pipeline.py +69 -0
  29. cobweb/pipelines/pipeline_console.py +22 -0
  30. cobweb/pipelines/pipeline_loghub.py +34 -0
  31. cobweb/schedulers/__init__.py +3 -0
  32. cobweb/schedulers/scheduler_api.py +72 -0
  33. cobweb/schedulers/scheduler_redis.py +72 -0
  34. cobweb/setting.py +67 -6
  35. cobweb/utils/__init__.py +5 -0
  36. cobweb/utils/bloom.py +58 -0
  37. cobweb/utils/dotting.py +32 -0
  38. cobweb/utils/oss.py +94 -0
  39. cobweb/utils/tools.py +42 -0
  40. cobweb_launcher-1.2.41.dist-info/METADATA +205 -0
  41. cobweb_launcher-1.2.41.dist-info/RECORD +44 -0
  42. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/WHEEL +1 -1
  43. cobweb/bbb.py +0 -191
  44. cobweb/db/oss_db.py +0 -127
  45. cobweb/db/scheduler/__init__.py +0 -0
  46. cobweb/db/scheduler/default.py +0 -8
  47. cobweb/db/scheduler/textfile.py +0 -27
  48. cobweb/db/storer/__init__.py +0 -0
  49. cobweb/db/storer/console.py +0 -9
  50. cobweb/db/storer/loghub.py +0 -54
  51. cobweb/db/storer/redis.py +0 -15
  52. cobweb/db/storer/textfile.py +0 -15
  53. cobweb/decorators.py +0 -16
  54. cobweb/distributed/__init__.py +0 -0
  55. cobweb/distributed/launcher.py +0 -243
  56. cobweb/distributed/models.py +0 -143
  57. cobweb/interface.py +0 -34
  58. cobweb/single/__init__.py +0 -0
  59. cobweb/single/launcher.py +0 -231
  60. cobweb/single/models.py +0 -134
  61. cobweb/single/nest.py +0 -153
  62. cobweb/task.py +0 -50
  63. cobweb/utils.py +0 -90
  64. cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
  65. cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
  66. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/LICENSE +0 -0
  67. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/top_level.txt +0 -0
cobweb/launchers/launcher_pro.py ADDED
@@ -0,0 +1,208 @@
+ import time
+ import threading
+
+ from cobweb.db import RedisDB
+ from cobweb.base import Seed, logger
+ from cobweb.utils import BloomFilter
+ from cobweb.constant import DealModel, LogTemplate
+ from .launcher import Launcher, check_pause
+
+
+ class LauncherPro(Launcher):
+
+     def __init__(self, task, project, custom_setting=None, **kwargs):
+         super().__init__(task, project, custom_setting, **kwargs)
+         self._todo_key = "{%s:%s}:todo" % (project, task)
+         self._done_key = "{%s:%s}:done" % (project, task)
+         self._fail_key = "{%s:%s}:fail" % (project, task)
+         self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
+
+         self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
+         self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
+         self._speed_control_key = "speed_control:%s_%s" % (project, task)
+
+         self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
+
+         # self._bf_key = "bloom_%s_%s" % (project, task)
+         #
+         self._db = RedisDB()
+         #
+         # self._bf = BloomFilter(self._bf_key)
+
+         self._heartbeat_start_event = threading.Event()
+         self._redis_queue_empty_event = threading.Event()
+
+     @property
+     def heartbeat(self):
+         return self._db.exists(self._heartbeat_key)
+
+     def statistics(self, key, count):
+         if not self._task_model and not self._db.exists(key):
+             self._db.setex(key, 86400 * 30, int(count))
+         else:
+             self._db._client.incrby(key, count)
+
+     def _get_seed(self) -> Seed:
+         spider_speed = self._db._client.get(self._speed_control_key)
+         if int(spider_speed or 0) > self._spider_max_count:
+             expire_time = self._db.ttl(self._speed_control_key)
+             if expire_time == -1:
+                 self._db.delete(self._speed_control_key)
+             else:
+                 logger.info(f"Too fast! Please wait {expire_time} seconds...")
+                 time.sleep(expire_time / 2)
+             return None
+         seed = self.__LAUNCHER_QUEUE__["todo"].pop()
+         if seed and not self._db.lock(self._speed_control_key, t=self._time_window):
+             self._db._client.incrby(self._speed_control_key, 1)
+         return seed
+
+     @check_pause
+     def _execute_heartbeat(self):
+         if self._heartbeat_start_event.is_set():
+             self._db.setex(self._heartbeat_key, 5)
+         time.sleep(3)
+
+     @check_pause
+     def _reset(self):
+         """
+         Check for expired seeds and push them back into the redis cache.
+         """
+         reset_wait_seconds = 30
+         if self._db.lock(self._reset_lock_key, t=120):
+
+             _min = -int(time.time()) + self._seed_reset_seconds \
+                 if self.heartbeat else "-inf"
+
+             self._db.members(self._todo_key, 0, _min=_min, _max="(0")
+             self._db.delete(self._reset_lock_key)
+
+             if not self.heartbeat:
+                 self._heartbeat_start_event.set()
+
+         time.sleep(reset_wait_seconds)
+
+     @check_pause
+     def _scheduler(self):
+         """
+         Schedule tasks: fetch seeds from the redis queue and add them to the doing dict.
+         """
+         if not self._db.zcount(self._todo_key, 0, "(1000"):
+             time.sleep(self._scheduler_wait_seconds)
+         elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
+             time.sleep(self._todo_queue_full_wait_seconds)
+         else:
+             members = self._db.members(
+                 self._todo_key, int(time.time()),
+                 count=self._todo_queue_size,
+                 _min=0, _max="(1000"
+             )
+             for member, priority in members:
+                 seed = Seed(member, priority=priority)
+                 self.__LAUNCHER_QUEUE__['todo'].push(seed)
+                 self.__DOING__[seed.to_string] = seed.params.priority
+
+     @check_pause
+     def _insert(self):
+         """
+         Insert new seeds into the redis queue.
+         """
+         seeds = {}
+         status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
+         for _ in range(self._new_queue_max_size):
+             seed = self.__LAUNCHER_QUEUE__['new'].pop()
+             if seed:
+                 seeds[seed.to_string] = seed.params.priority
+         if seeds:
+             self._db.zadd(self._todo_key, seeds, nx=True)
+         if status:
+             time.sleep(self._new_queue_wait_seconds)
+
+     @check_pause
+     def _refresh(self):
+         """
+         Refresh the expiry of doing seeds so that reset does not re-consume them.
+         """
+         if self.__DOING__:
+             refresh_time = int(time.time())
+             seeds = {k: -refresh_time - v / 1000 for k, v in self.__DOING__.items()}
+             self._db.zadd(self._todo_key, item=seeds, xx=True)
+         time.sleep(15)
+
+     @check_pause
+     def _delete(self):
+         """
+         Delete seeds from the queue, push them to the done or fail queue by status, and drop their doing entries.
+         """
+         seed_list = []
+         status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
+
+         for _ in range(self._done_queue_max_size):
+             seed = self.__LAUNCHER_QUEUE__['done'].pop()
+             if not seed:
+                 break
+             seed_list.append(seed.to_string)
+
+         if seed_list:
+
+             self._db.zrem(self._todo_key, *seed_list)
+             self._remove_doing_seeds(seed_list)
+
+         if status:
+             time.sleep(self._done_queue_wait_seconds)
+
+     def _polling(self):
+         wait_scheduler_execute = True
+         check_emtpy_times = 0
+         while not self._stop.is_set():
+             queue_not_empty_count = 0
+             pooling_wait_seconds = 30
+
+             for q in self.__LAUNCHER_QUEUE__.values():
+                 if q.length != 0:
+                     queue_not_empty_count += 1
+                     wait_scheduler_execute = False
+
+             if queue_not_empty_count == 0:
+                 pooling_wait_seconds = 3
+                 if self._pause.is_set():
+                     check_emtpy_times = 0
+                     if not self._task_model and (
+                         not wait_scheduler_execute or
+                         int(time.time()) - self._app_time > self._before_scheduler_wait_seconds
+                     ):
+                         logger.info("Done! ready to close thread...")
+                         self._stop.set()
+
+                     elif self._db.zcount(self._todo_key, _min=0, _max="(1000"):
+                         logger.info(f"Recovery {self.task} task run!")
+                         self._pause.clear()
+                         self._execute()
+                     else:
+                         logger.info("pause! waiting for resume...")
+                 elif check_emtpy_times > 2:
+                     self.__DOING__ = {}
+                     if not self._db.zcount(self._todo_key, _min="-inf", _max="(1000"):
+                         self._pause.set()
+                 else:
+                     logger.info(
+                         "check whether the task is complete, "
+                         f"reset times {3 - check_emtpy_times}"
+                     )
+                     check_emtpy_times += 1
+             else:
+                 logger.info(LogTemplate.launcher_pro_polling.format(
+                     task=self.task,
+                     doing_len=len(self.__DOING__.keys()),
+                     todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
+                     done_len=self.__LAUNCHER_QUEUE__['done'].length,
+                     redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
+                     redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
+                     redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
+                     upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
+                 ))
+
+             time.sleep(pooling_wait_seconds)
+
+         logger.info("Done! Ready to close thread...")
+
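Every coordination structure above is a Redis key derived from the (project, task) pair. A quick sketch of how the formats expand, using hypothetical names (the curly braces presumably act as a Redis Cluster hash tag so that the todo/done/fail keys of one task hash to the same slot):

    # Hypothetical project/task names, not part of the package.
    project, task = "demo_project", "demo_task"

    todo_key = "{%s:%s}:todo" % (project, task)          # "{demo_project:demo_task}:todo"
    heartbeat_key = "heartbeat:%s_%s" % (project, task)  # "heartbeat:demo_project_demo_task"
    speed_key = "speed_control:%s_%s" % (project, task)  # "speed_control:demo_project_demo_task"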
cobweb/pipelines/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .pipeline import Pipeline
+ from .pipeline_console import Console
+ from .pipeline_loghub import Loghub
cobweb/pipelines/pipeline.py ADDED
@@ -0,0 +1,69 @@
+ import time
+ import threading
+
+ from abc import ABC, abstractmethod
+ from cobweb.base import BaseItem, Queue, logger
+
+
+ class Pipeline(threading.Thread, ABC):
+
+     def __init__(
+             self,
+             stop: threading.Event,
+             pause: threading.Event,
+             upload: Queue, done: Queue,
+             upload_size: int,
+             wait_seconds: int
+     ):
+         super().__init__()
+         self._stop = stop
+         self._pause = pause
+         self._upload = upload
+         self._done = done
+
+         self.upload_size = upload_size
+         self.wait_seconds = wait_seconds
+
+     @abstractmethod
+     def build(self, item: BaseItem) -> dict:
+         pass
+
+     @abstractmethod
+     def upload(self, table: str, data: list) -> bool:
+         pass
+
+     def run(self):
+         while not self._stop.is_set():
+             if not self._upload.length:
+                 time.sleep(self.wait_seconds)
+                 continue
+             if self._upload.length < self.upload_size:
+                 time.sleep(self.wait_seconds)
+             status = True
+             data_info, seeds = {}, []
+             try:
+                 for _ in range(self.upload_size):
+                     item = self._upload.pop()
+                     if not item:
+                         break
+                     seeds.append(item.seed)
+                     data = self.build(item)
+                     data_info.setdefault(item.table, []).append(data)
+                 for table, datas in data_info.items():
+                     try:
+                         self.upload(table, datas)
+                     except Exception as e:
+                         logger.info(e)
+                         status = False
+             except Exception as e:
+                 logger.info(e)
+                 status = False
+             if not status:
+                 for seed in seeds:
+                     seed.params.seed_status = "deal model: fail"
+             if seeds:
+                 self._done.push(seeds)
+
+         logger.info("upload pipeline close!")
+
+
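Pipeline is the storage extension point: build() turns one queued item into a row, upload() ships one batch per table, and run() handles batching, error handling, and returning processed seeds to the done queue. A minimal sketch of a custom subclass, assuming only the abstract interface shown above (the class name JsonLines is hypothetical; the constructor arguments are supplied by the launcher):

    import json

    from cobweb.base import BaseItem
    from cobweb.pipelines import Pipeline


    class JsonLines(Pipeline):  # hypothetical example subclass

        def build(self, item: BaseItem) -> dict:
            # One queued item -> one row for upload(); mirrors the Console pipeline below.
            return {"seed": item.seed.to_dict, "data": item.to_dict}

        def upload(self, table: str, data: list) -> bool:
            # One batch per table; here the rows are simply printed as JSON lines.
            for row in data:
                print(json.dumps({"table": table, **row}, ensure_ascii=False))
            return True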
cobweb/pipelines/pipeline_console.py ADDED
@@ -0,0 +1,22 @@
+ from cobweb.base import ConsoleItem, logger
+ from cobweb.constant import LogTemplate
+ from cobweb.pipelines import Pipeline
+
+
+ class Console(Pipeline):
+
+     def build(self, item: ConsoleItem):
+         return {
+             "seed": item.seed.to_dict,
+             "data": item.to_dict
+         }
+
+     def upload(self, table, datas):
+         for data in datas:
+             parse_detail = LogTemplate.log_info(data["data"])
+             if len(parse_detail) > 500:
+                 parse_detail = parse_detail[:500] + " ...\n" + " " * 12 + "-- Text is too long and details are omitted!"
+             logger.info(LogTemplate.console_item.format(
+                 seed_detail=LogTemplate.log_info(data["seed"]),
+                 parse_detail=parse_detail
+             ))
cobweb/pipelines/pipeline_loghub.py ADDED
@@ -0,0 +1,34 @@
+ import json
+
+ from cobweb import setting
+ from cobweb.base import BaseItem
+ from cobweb.pipelines import Pipeline
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
+
+
+ class Loghub(Pipeline):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.client = LogClient(**setting.LOGHUB_CONFIG)
+
+     def build(self, item: BaseItem):
+         log_item = LogItem()
+         temp = item.to_dict
+         for key, value in temp.items():
+             if not isinstance(value, str):
+                 temp[key] = json.dumps(value, ensure_ascii=False)
+         contents = sorted(temp.items())
+         log_item.set_contents(contents)
+         return log_item
+
+     def upload(self, table, datas):
+         request = PutLogsRequest(
+             project=setting.LOGHUB_PROJECT,
+             logstore=table,
+             topic=setting.LOGHUB_TOPIC,
+             source=setting.LOGHUB_SOURCE,
+             logitems=datas,
+             compress=True
+         )
+         self.client.put_logs(request=request)
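The Loghub pipeline reads its credentials from the LOGHUB_* values in cobweb/setting.py (shown further down), so LOGHUB_ENDPOINT, LOGHUB_ACCESS_KEY, LOGHUB_SECRET_KEY, LOGHUB_PROJECT, LOGHUB_TOPIC and LOGHUB_SOURCE must be present in the environment. Selecting it presumably means pointing the PIPELINE setting at this class instead of the Console default, mirroring the path style used in setting.py:

    # Assumed override, mirroring the default "cobweb.pipelines.pipeline_console.Console".
    PIPELINE = "cobweb.pipelines.pipeline_loghub.Loghub"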
cobweb/schedulers/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .scheduler_redis import RedisScheduler
+ from .scheduler_api import ApiScheduler
+
cobweb/schedulers/scheduler_api.py ADDED
@@ -0,0 +1,72 @@
+ import threading
+ import time
+
+ # from cobweb.base import Seed
+ from cobweb.db import ApiDB
+
+
+ class ApiScheduler:
+
+     def __init__(self, task, project, scheduler_wait_seconds=30):
+         self._todo_key = "{%s:%s}:todo" % (project, task)
+         self._download_key = "{%s:%s}:download" % (project, task)
+         self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
+         self._speed_control_key = "speed_control:%s_%s" % (project, task)
+         self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
+         self._db = ApiDB()
+
+         self.scheduler_wait_seconds = scheduler_wait_seconds
+         self.working = threading.Event()
+
+     @property
+     def heartbeat(self):
+         return self._db.exists(self._heartbeat_key)
+
+     def set_heartbeat(self):
+         return self._db.setex(self._heartbeat_key, 5)
+
+     def schedule(self, key, count):
+         if not self._db.zcount(key, 0, "(1000"):
+             time.sleep(self.scheduler_wait_seconds)
+         else:
+             source = int(time.time())
+             members = self._db.members(key, source, count=count, _min=0, _max="(1000")
+             for member, priority in members:
+                 # seed = Seed(member, priority=priority)
+                 yield member, priority
+
+     def insert(self, key, items):
+         if items:
+             self._db.zadd(key, items, nx=True)
+
+     def reset(self, keys, reset_time=30):
+         if self._db.lock(self._reset_lock_key, t=120):
+
+             if isinstance(keys, str):
+                 keys = [keys]
+
+             _min = reset_time - int(time.time()) if self.heartbeat else "-inf"
+
+             for key in keys:
+                 if self._db.exists(key):
+                     self._db.members(key, 0, _min=_min, _max="(0")
+
+             if not self.heartbeat:
+                 self.working.set()
+                 time.sleep(10)
+
+             self._db.delete(self._reset_lock_key)
+
+     def refresh(self, key, items: dict[str, int]):
+         refresh_time = int(time.time())
+         its = {k: -refresh_time - v / 1000 for k, v in items.items()}
+         if its:
+             self._db.zadd(key, item=its, xx=True)
+
+     def delete(self, key, values):
+         if values:
+             self._db.zrem(key, *values)
+
+
+
+
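Reading the code, pending members sit in the sorted set with their priority (0-999) as the score, and refresh() re-scores a claimed member to a negative value that encodes both the claim time and the priority; reset() can then sweep anything still negative (via _max="(0") once it looks stale. The arithmetic for one hypothetical member:

    # Hypothetical values, not part of the package.
    refresh_time = 1700000000                  # int(time.time()) at refresh
    priority = 500                             # the member's priority score
    score = -refresh_time - priority / 1000    # -1700000000.5
    print(score < 0)                           # True, so reset() will consider it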
cobweb/schedulers/scheduler_redis.py ADDED
@@ -0,0 +1,72 @@
+ import threading
+ import time
+
+ # from cobweb.base import Seed
+ from cobweb.db import RedisDB
+
+
+ class RedisScheduler:
+
+     def __init__(self, task, project, scheduler_wait_seconds=30):
+         self._todo_key = "{%s:%s}:todo" % (project, task)
+         self._download_key = "{%s:%s}:download" % (project, task)
+         self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
+         self._speed_control_key = "speed_control:%s_%s" % (project, task)
+         self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
+         self._db = RedisDB()
+
+         self.scheduler_wait_seconds = scheduler_wait_seconds
+         self.working = threading.Event()
+
+     @property
+     def heartbeat(self):
+         return self._db.exists(self._heartbeat_key)
+
+     def set_heartbeat(self):
+         return self._db.setex(self._heartbeat_key, 5)
+
+     def schedule(self, key, count):
+         if not self._db.zcount(key, 0, "(1000"):
+             time.sleep(self.scheduler_wait_seconds)
+         else:
+             source = int(time.time())
+             members = self._db.members(key, source, count=count, _min=0, _max="(1000")
+             for member, priority in members:
+                 # seed = Seed(member, priority=priority)
+                 yield member, priority
+
+     def insert(self, key, items):
+         if items:
+             self._db.zadd(key, items, nx=True)
+
+     def reset(self, keys, reset_time=30):
+         if self._db.lock(self._reset_lock_key, t=120):
+
+             if isinstance(keys, str):
+                 keys = [keys]
+
+             _min = reset_time - int(time.time()) if self.heartbeat else "-inf"
+
+             for key in keys:
+                 if self._db.exists(key):
+                     self._db.members(key, 0, _min=_min, _max="(0")
+
+             if not self.heartbeat:
+                 self.working.set()
+                 time.sleep(10)
+
+             self._db.delete(self._reset_lock_key)
+
+     def refresh(self, key, items: dict[str, int]):
+         refresh_time = int(time.time())
+         its = {k: -refresh_time - v / 1000 for k, v in items.items()}
+         if its:
+             self._db.zadd(key, item=its, xx=True)
+
+     def delete(self, key, values):
+         if values:
+             self._db.zrem(key, *values)
+
+
+
+
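RedisScheduler and ApiScheduler expose the same interface over different backends (RedisDB vs. ApiDB). A minimal usage sketch, assuming the REDIS_* environment variables from cobweb/setting.py are set; the key name and members are hypothetical:

    from cobweb.schedulers import RedisScheduler

    scheduler = RedisScheduler(task="demo_task", project="demo_project")
    todo_key = "{demo_project:demo_task}:todo"

    # Queue two seeds: member -> priority score.
    scheduler.insert(todo_key, {"https://example.com/a": 100, "https://example.com/b": 200})

    # schedule() is a generator yielding up to `count` pending members.
    for member, priority in scheduler.schedule(todo_key, count=10):
        print(member, priority)

    # Drop members once they have been processed.
    scheduler.delete(todo_key, ["https://example.com/a"])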
cobweb/setting.py CHANGED
@@ -1,13 +1,74 @@
  import os

+ # redis db config
+ REDIS_CONFIG = {
+     "host": os.getenv("REDIS_HOST"),
+     "password": os.getenv("REDIS_PASSWORD"),
+     "port": int(os.getenv("REDIS_PORT", 6379)),
+     "db": int(os.getenv("REDIS_DB", 0)),
+ }

- # model: 0, 1, 2
- MODEL = int(os.getenv("MODEL", "0"))
+ # loghub db config
+ LOGHUB_TOPIC = os.getenv("LOGHUB_TOPIC")
+ LOGHUB_SOURCE = os.getenv("LOGHUB_SOURCE")
+ LOGHUB_PROJECT = os.getenv("LOGHUB_PROJECT")
+ LOGHUB_CONFIG = {
+     "endpoint": os.getenv("LOGHUB_ENDPOINT"),
+     "accessKeyId": os.getenv("LOGHUB_ACCESS_KEY"),
+     "accessKey": os.getenv("LOGHUB_SECRET_KEY")
+ }

- # wait time before resetting the score value; default 10 minutes
- RESET_SCORE = int(os.getenv("RESET_SCORE", "600"))
+ # oss util config
+ OSS_BUCKET = os.getenv("OSS_BUCKET")
+ OSS_ENDPOINT = os.getenv("OSS_ENDPOINT")
+ OSS_ACCESS_KEY = os.getenv("OSS_ACCESS_KEY")
+ OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
+ OSS_CHUNK_SIZE = 10 * 1024 ** 2
+ OSS_MIN_UPLOAD_SIZE = 1024

- # default TTL of the spider queue check lock: 30s
- CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", 30))

+ # crawler selection
+ CRAWLER = "cobweb.crawlers.Crawler"

+ # data storage pipeline
+ PIPELINE = "cobweb.pipelines.pipeline_console.Console"
+
+
+ # Launcher wait times
+
+ BEFORE_SCHEDULER_WAIT_SECONDS = 60  # wait time before scheduling; only applies to one-off tasks
+ SCHEDULER_WAIT_SECONDS = 15  # scheduler wait time
+ TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait time when the todo queue is full
+ NEW_QUEUE_WAIT_SECONDS = 30  # new queue wait time
+ DONE_QUEUE_WAIT_SECONDS = 5  # done queue wait time
+ UPLOAD_QUEUE_WAIT_SECONDS = 15  # upload queue wait time
+ SEED_RESET_SECONDS = 30  # seed reset interval
+
+
+ # Launcher queue sizes
+ TODO_QUEUE_SIZE = 100  # todo queue size
+ NEW_QUEUE_MAX_SIZE = 100  # new queue size
+ DONE_QUEUE_MAX_SIZE = 100  # done queue size
+ UPLOAD_QUEUE_MAX_SIZE = 100  # upload queue size
+
+ # DONE_MODEL in (0, 1): seed completion mode
+ DONE_MODEL = 0  # 0: a successful seed is removed from the queue and a failed one goes to the fail queue; 1: a successful seed goes to the done queue and a failed one goes to the fail queue
+
+ # spider
+ SPIDER_THREAD_NUM = 10
+ SPIDER_MAX_RETRIES = 5
+ SPIDER_TIME_SLEEP = 10
+
+ SPIDER_MAX_COUNT = 1000  # maximum number of fetches within the time window
+ TIME_WINDOW = 60  # fixed rate-control time window (seconds)
+
+ # task mode
+ TASK_MODEL = 0  # 0: one-off, 1: long-running
+
+
+ # bloom filter
+ CAPACITY = 100000000
+ ERROR_RATE = 0.001
+ FILTER_FIELD = "url"
+ # content-type filter for file downloads
+ # FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
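All credentials in the new setting.py are read from the environment at import time. A minimal sketch of the Redis-related variables (values are hypothetical; the Loghub and OSS variables are only needed when those pipelines and utilities are used):

    import os

    os.environ["REDIS_HOST"] = "127.0.0.1"   # hypothetical
    os.environ["REDIS_PASSWORD"] = "secret"  # hypothetical
    os.environ["REDIS_PORT"] = "6379"
    os.environ["REDIS_DB"] = "0"

    from cobweb import setting               # import after the environment is set
    print(setting.REDIS_CONFIG["host"])      # 127.0.0.1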
cobweb/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .oss import OssUtil
+ from .tools import *
+ from .bloom import BloomFilter
+ from .dotting import LoghubDot
+
cobweb/utils/bloom.py ADDED
@@ -0,0 +1,58 @@
+ import math
+ import time
+
+ import mmh3
+ import redis
+ from cobweb import setting
+
+
+ class BloomFilter:
+
+     def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
+         redis_config = redis_config or setting.REDIS_CONFIG
+         capacity = capacity or setting.CAPACITY
+         error_rate = error_rate or setting.ERROR_RATE
+         redis_config['db'] = 3
+
+         self.key = key
+
+         pool = redis.ConnectionPool(**redis_config)
+         self._client = redis.Redis(connection_pool=pool)
+         self.bit_size = self.get_bit_size(capacity, error_rate)
+         self.hash_count = self.get_hash_count(self.bit_size, capacity)
+         self._init_bloom_key()
+
+     def add(self, value):
+         for seed in range(self.hash_count):
+             result = mmh3.hash(value, seed) % self.bit_size
+             self._client.setbit(self.key, result, 1)
+         return True
+
+     def exists(self, value):
+         if not self._client.exists(self.key):
+             return False
+         for seed in range(self.hash_count):
+             result = mmh3.hash(value, seed) % self.bit_size
+             if not self._client.getbit(self.key, result):
+                 return False
+         return True
+
+     def _init_bloom_key(self):
+         lua_script = """
+         redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
+         redis.call("EXPIRE", KEYS[1], 604800)
+         """
+         if self._client.exists(self.key):
+             return True
+         execute = self._client.register_script(lua_script)
+         execute(keys=[self.key], args=[self.bit_size-1, 1])
+
+     @classmethod
+     def get_bit_size(cls, n, p):
+         return int(-(n * math.log(p)) / (math.log(2) ** 2))
+
+     @classmethod
+     def get_hash_count(cls, m, n):
+         return int((m / n) * math.log(2))
+
+
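A usage sketch plus the sizing implied by the defaults in cobweb/setting.py (CAPACITY=100000000, ERROR_RATE=0.001). The key name is hypothetical; note that the constructor forces db=3 and the Lua script gives the bitmap a 7-day TTL (604800 seconds):

    from cobweb.utils import BloomFilter

    bf = BloomFilter("bloom_demo_project_demo_task")   # hypothetical key
    bf.add("https://example.com/page")
    print(bf.exists("https://example.com/page"))       # True
    print(bf.exists("https://example.com/other"))      # False (subject to ~0.1% false positives)

    # Sizing with the defaults:
    #   bit_size   = -(1e8 * ln 0.001) / (ln 2)^2 ≈ 1.44e9 bits ≈ 171 MiB of bitmap
    #   hash_count = (bit_size / 1e8) * ln 2 ≈ 9 hash functions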
cobweb/utils/dotting.py ADDED
@@ -0,0 +1,32 @@
+ import json
+
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
+ from cobweb import setting
+
+
+ class LoghubDot:
+
+     def __init__(self):
+         self.client = LogClient(**setting.LOGHUB_CONFIG)
+
+     def build(self, topic, **kwargs):
+
+         temp = {}
+         log_items = []
+         log_item = LogItem()
+         for key, value in kwargs.items():
+             if not isinstance(value, str):
+                 temp[key] = json.dumps(value, ensure_ascii=False)
+             else:
+                 temp[key] = value
+         contents = sorted(temp.items())
+         log_item.set_contents(contents)
+         log_items.append(log_item)
+         request = PutLogsRequest(
+             project="databee-download-log",
+             logstore="log",
+             topic=topic,
+             logitems=log_items,
+             compress=True
+         )
+         self.client.put_logs(request=request)
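LoghubDot.build() both assembles and ships a single dotting record: non-string values are JSON-encoded, and the record goes to the hard-coded "databee-download-log" project and "log" logstore. A minimal usage sketch with hypothetical field names and values; it requires the LOGHUB_* environment variables from setting.py:

    from cobweb.utils import LoghubDot

    dot = LoghubDot()
    dot.build("download-metrics", url="https://example.com", status=200, cost=1.23)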