cobweb-launcher 1.2.25__py3-none-any.whl → 3.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. cobweb/__init__.py +4 -1
  2. cobweb/base/__init__.py +3 -3
  3. cobweb/base/common_queue.py +37 -16
  4. cobweb/base/item.py +35 -16
  5. cobweb/base/{log.py → logger.py} +3 -3
  6. cobweb/base/request.py +741 -54
  7. cobweb/base/response.py +380 -13
  8. cobweb/base/seed.py +96 -48
  9. cobweb/base/task_queue.py +180 -0
  10. cobweb/base/test.py +257 -0
  11. cobweb/constant.py +10 -1
  12. cobweb/crawlers/crawler.py +12 -155
  13. cobweb/db/api_db.py +3 -2
  14. cobweb/db/redis_db.py +117 -28
  15. cobweb/launchers/__init__.py +4 -3
  16. cobweb/launchers/distributor.py +141 -0
  17. cobweb/launchers/launcher.py +95 -157
  18. cobweb/launchers/uploader.py +68 -0
  19. cobweb/log_dots/__init__.py +2 -0
  20. cobweb/log_dots/dot.py +258 -0
  21. cobweb/log_dots/loghub_dot.py +53 -0
  22. cobweb/pipelines/__init__.py +1 -1
  23. cobweb/pipelines/pipeline.py +5 -55
  24. cobweb/pipelines/pipeline_csv.py +25 -0
  25. cobweb/pipelines/pipeline_loghub.py +32 -12
  26. cobweb/schedulers/__init__.py +1 -0
  27. cobweb/schedulers/scheduler.py +66 -0
  28. cobweb/schedulers/scheduler_with_redis.py +189 -0
  29. cobweb/setting.py +27 -40
  30. cobweb/utils/__init__.py +5 -3
  31. cobweb/utils/bloom.py +58 -58
  32. cobweb/{base → utils}/decorators.py +14 -12
  33. cobweb/utils/dotting.py +300 -0
  34. cobweb/utils/oss.py +113 -94
  35. cobweb/utils/tools.py +3 -15
  36. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.20.dist-info}/METADATA +31 -43
  37. cobweb_launcher-3.2.20.dist-info/RECORD +44 -0
  38. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.20.dist-info}/WHEEL +1 -1
  39. cobweb/crawlers/base_crawler.py +0 -144
  40. cobweb/crawlers/file_crawler.py +0 -98
  41. cobweb/launchers/launcher_air.py +0 -88
  42. cobweb/launchers/launcher_api.py +0 -221
  43. cobweb/launchers/launcher_pro.py +0 -222
  44. cobweb/pipelines/base_pipeline.py +0 -54
  45. cobweb/pipelines/loghub_pipeline.py +0 -34
  46. cobweb/pipelines/pipeline_console.py +0 -22
  47. cobweb_launcher-1.2.25.dist-info/RECORD +0 -40
  48. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.20.dist-info}/LICENSE +0 -0
  49. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.20.dist-info}/top_level.txt +0 -0
cobweb/schedulers/scheduler_with_redis.py ADDED
@@ -0,0 +1,189 @@
+ import os
+ import time
+ import threading
+ from typing import Callable
+ from cobweb.db import RedisDB, ApiDB
+ from cobweb.utils import check_pause
+ from cobweb.base import Seed, logger, TaskQueue, Status
+ from cobweb.constant import LogTemplate
+ from .scheduler import Scheduler
+ use_api = bool(os.getenv("REDIS_API_HOST", 0))
+
+
+ class RedisScheduler(Scheduler):
+
+     def __init__(
+             self,
+             task,
+             project,
+             stop: threading.Event,
+             pause: threading.Event,
+             task_queue: TaskQueue,
+             callback_register: Callable
+     ):
+         super().__init__(task, project, stop, pause, task_queue, callback_register)
+         self.todo_key = f"{{{project}:{task}}}:todo"
+         self.done_key = f"{{{project}:{task}}}:done"
+         self.fail_key = f"{{{project}:{task}}}:fail"
+         self.heartbeat_key = f"heartbeat:{project}_{task}"
+         self.heartbeat_run_key = f"run:{project}_{task}"
+         self.speed_control_key = f"speed_control:{project}_{task}"
+         self.reset_lock_key = f"lock:reset:{project}_{task}"
+         self.db = ApiDB() if use_api else RedisDB()
+
+     def reset(self):
+         """
+         Check for expired seeds and re-add them to the redis cache.
+         """
+         while not self.stop.is_set():
+             if self.db.lock(self.reset_lock_key, t=360):
+
+                 _min = -int(time.time()) + self.seed_reset_seconds
+                 self.db.members(self.todo_key, 0, _min=_min, _max="(0")
+                 self.db.delete(self.reset_lock_key)
+
+             time.sleep(self.seed_reset_seconds)
+
+     @check_pause
+     def schedule(self):
+         """
+         Schedule tasks: fetch seeds from the redis queue and add them to the doing dict.
+         """
+         if not self.db.zcount(self.todo_key, 0, "(1000"):
+             time.sleep(self.scheduler_wait_seconds)
+             return
+
+         if self.task_queue.status_length(Status.PENDING) >= self.todo_queue_size\
+                 or self.task_queue.length() > 5 * self.todo_queue_size:
+             time.sleep(self.todo_queue_full_wait_seconds)
+             return
+
+         if members := self.db.members(
+                 self.todo_key, int(time.time()),
+                 count=self.todo_queue_size,
+                 _min=0, _max="(1000"
+         ):
+             for member, priority in members:
+                 seed = Seed(member, priority=int(priority % 1000))
+                 seed.params.get_time = time.time()
+                 self.task_queue.add_task(
+                     task_id=seed.sid, data=seed,
+                     status=Status.PENDING,
+                     priority=seed.params.priority
+                 )
+
+     @check_pause
+     def insert(self):
+         """
+         Insert new seeds into the redis queue.
+         """
+         if task_list := self.task_queue.get_task_by_status(
+                 status=Status.INSERT, limit=self.new_queue_max_size
+         ):
+             seed_info, task_ids = dict(), set()
+
+             for task_item in task_list:
+                 seed = task_item.data
+                 task_ids.add(task_item.task_id)
+                 seed_info[seed.to_string] = seed.params.priority
+
+             self.db.zadd(self.todo_key, seed_info, nx=True)
+             self.task_queue.remove(task_ids)
+
+         if self.task_queue.status_length(status=Status.INSERT) < self.new_queue_max_size:
+             time.sleep(self.scheduler_wait_seconds)
+
+     @check_pause
+     def refresh(self):
+         """
+         Refresh the expiry time of doing seeds, so reset does not re-consume them.
+         """
+         if task_list := self.task_queue.get_task_by_status(
+                 status=[Status.PENDING, Status.PROCESSING, Status.FINISHED],
+         ):
+             refresh_time = int(time.time())
+             seed_info = {it.data.to_string: -refresh_time - it.data.params.priority / 1000 for it in task_list}
+             self.db.zadd(self.todo_key, seed_info, xx=True)
+         time.sleep(self.seed_reset_seconds // 3)
+
+     @check_pause
+     def delete(self):
+         """
+         Remove seeds from the queue: add them to the success or failure queue
+         by status and drop the seed index from the doing dict.
+         """
+         if task_list := self.task_queue.get_task_by_status(
+                 status=Status.FINISHED, limit=self.done_queue_max_size
+         ):
+             zrem_items = [it.data.to_string for it in task_list]
+             remove_task_ids = [it.task_id for it in task_list]
+             self.db.zrem(self.todo_key, *zrem_items)
+             self.task_queue.remove(remove_task_ids)
+
+         if self.task_queue.status_length(status=Status.FINISHED) < self.done_queue_max_size:
+             time.sleep(self.done_queue_wait_seconds)
+
+     def run(self):
+         start_time = int(time.time())
+
+         for func in [self.reset, self.insert, self.delete, self.refresh, self.schedule]:
+             self.callback_register(func, tag="scheduler")
+
+         while not self.stop.is_set():
+             todo_len = self.task_queue.status_length(status=Status.PENDING)
+             doing_len = self.task_queue.status_length(status=Status.PROCESSING)
+             done_len = self.task_queue.status_length(status=Status.FINISHED)
+             upload_len = self.task_queue.status_length(status=Status.UPLOAD)
+
+             redis_doing_count = self.db.zcount(self.todo_key, "-inf", "(0")
+             redis_todo_len = self.db.zcount(self.todo_key, 0, "(1000")
+             redis_seed_count = self.db.zcard(self.todo_key)
+
+             if self.pause.is_set():
+                 execute_time = int(time.time()) - start_time
+                 if redis_todo_len and self.task_queue.length() > 0:
+                     logger.info(
+                         f"Recovery {self.task} task run!"
+                         f"Todo seeds count: {redis_todo_len}"
+                         f", queue length: {redis_seed_count}"
+                     )
+                     self.pause.clear()
+                 elif not self.task_model and execute_time > self.before_scheduler_wait_seconds:
+                     logger.info("Done! ready to close thread...")
+                     self.stop.set()
+                 else:
+                     logger.info("Pause! waiting for resume...")
+
+             elif self.task_queue.length() == 0:
+                 if redis_seed_count:
+                     logger.info(
+                         f"Todo seeds count: {redis_todo_len}"
+                         f", queue length: {redis_seed_count}"
+                     )
+                     self.pause.clear()
+                 else:
+                     count = 0
+                     for _ in range(3):
+                         if not redis_seed_count:
+                             count += 1
+                             time.sleep(5)
+                             logger.info("Checking count...")
+                         else:
+                             break
+                     if count >= 3:
+                         logger.info("Todo queue is empty! Pause set...")
+                         self.pause.set()
+
+             else:
+                 self.db.setex(self.heartbeat_run_key, 60, 1)
+                 logger.info(LogTemplate.launcher_pro_polling.format(
+                     task=self.task,
+                     doing_len=doing_len,
+                     todo_len=todo_len,
+                     done_len=done_len,
+                     redis_seed_count=redis_seed_count,
+                     redis_todo_len=redis_todo_len,
+                     redis_doing_len=redis_doing_count,
+                     upload_len=upload_len,
+                 ))
+
+             time.sleep(30)
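
Note: the new scheduler keeps the whole seed lifecycle in one Redis sorted set and encodes state in the score. Pending seeds carry their priority as a score in [0, 1000); claimed ("doing") seeds are re-scored to roughly `-timestamp - priority/1000`, so `reset()` can recover expired claims with a single score-range query. A minimal redis-py sketch of that encoding, using plain `ZADD`/`ZRANGEBYSCORE` in place of cobweb's `db.members` wrapper (the key name and seed value below are illustrative only):

```python
import time
import redis

r = redis.Redis(decode_responses=True)
todo_key = "{demo_project:demo_task}:todo"

# Pending seed: score is its priority, always inside [0, 1000).
r.zadd(todo_key, {"https://example.com/page/1": 100}, nx=True)

# Claim up to 10 pending seeds: everything scored 0 <= score < 1000.
pending = r.zrangebyscore(todo_key, 0, "(1000", start=0, num=10, withscores=True)

# Re-score claimed seeds as "doing": negative claim timestamp, with the
# original priority folded into the fractional part (as refresh() does).
now = int(time.time())
r.zadd(todo_key, {member: -now - score / 1000 for member, score in pending}, xx=True)

# reset() equivalent: a seed claimed more than 60s ago has a score above
# -(now - 60), so one range query over [-(now - 60), 0) finds expired claims.
expired = r.zrangebyscore(todo_key, -(int(time.time()) - 60), "(0", withscores=True)
print(expired)
```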
cobweb/setting.py CHANGED
@@ -1,37 +1,11 @@
- import os
- 
- # redis db config
- REDIS_CONFIG = {
-     "host": os.getenv("REDIS_HOST"),
-     "password": os.getenv("REDIS_PASSWORD"),
-     "port": int(os.getenv("REDIS_PORT", 6379)),
-     "db": int(os.getenv("REDIS_DB", 0)),
- }
- 
- # loghub db config
- LOGHUB_TOPIC = os.getenv("LOGHUB_TOPIC")
- LOGHUB_SOURCE = os.getenv("LOGHUB_SOURCE")
- LOGHUB_PROJECT = os.getenv("LOGHUB_PROJECT")
- LOGHUB_CONFIG = {
-     "endpoint": os.getenv("LOGHUB_ENDPOINT"),
-     "accessKeyId": os.getenv("LOGHUB_ACCESS_KEY"),
-     "accessKey": os.getenv("LOGHUB_SECRET_KEY")
- }
- 
- # oss util config
- OSS_BUCKET = os.getenv("OSS_BUCKET")
- OSS_ENDPOINT = os.getenv("OSS_ENDPOINT")
- OSS_ACCESS_KEY = os.getenv("OSS_ACCESS_KEY")
- OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
- OSS_CHUNK_SIZE = 10 * 1024 ** 2
- OSS_MIN_UPLOAD_SIZE = 1024
- 
- 
  # crawler selection
  CRAWLER = "cobweb.crawlers.Crawler"
  
- # data storage pipeline
- PIPELINE = "cobweb.pipelines.pipeline_console.Console"
+ # data pipeline
+ PIPELINE = "cobweb.pipelines.CSV"
+ 
+ # scheduler
+ SCHEDULER = "cobweb.schedulers.RedisScheduler"
  
  
  # Launcher wait times
@@ -42,7 +16,7 @@ TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait time when the todo queue is full
  NEW_QUEUE_WAIT_SECONDS = 30  # new queue wait time
  DONE_QUEUE_WAIT_SECONDS = 5  # done queue wait time
  UPLOAD_QUEUE_WAIT_SECONDS = 15  # upload queue wait time
- SEED_RESET_SECONDS = 30  # seed reset interval
+ SEED_RESET_SECONDS = 60  # seed reset interval
  
  
  # Launcher queue lengths
@@ -58,17 +32,30 @@ DONE_MODEL = 0  # 0: on success, seeds are removed from the queue; on failure, they are added
  SPIDER_THREAD_NUM = 10
  SPIDER_MAX_RETRIES = 5
  SPIDER_TIME_SLEEP = 10
+ RECORD_FAILED_SPIDER = True
  
  SPIDER_MAX_COUNT = 1000  # max fetch count within the time window
  TIME_WINDOW = 60  # fixed rate-control time window (seconds)
  
- # task mode
- TASK_MODEL = 0  # 0: one-shot, 1: resident
+ # task mode, 0: one-shot, 1: resident
+ TASK_MODEL = 0
+ 
+ # speed control, 0: off, 1: on
+ SPEED_CONTROL = 1
  
+ DOT = 0
+ 
+ # redis config
+ REDIS_CONFIG = {
+     "host": "127.0.0.1",
+     "port": 6379,
+     "db": 0
+ }
  
- # bloom filter
- CAPACITY = 100000000
- ERROR_RATE = 0.001
- FILTER_FIELD = "url"
- # file download response content-type filter
- # FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
+ # loghub pipeline config
+ # os.getenv("LOGHUB_ENDPOINT"),
+ # os.getenv("LOGHUB_ACCESS_KEY"),
+ # os.getenv("LOGHUB_SECRET_KEY")
+ # os.getenv("LOGHUB_PROJECT")
+ # os.getenv("LOGHUB_SOURCE")
+ # os.getenv("LOGHUB_TOPIC")
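
Note: `SPIDER_MAX_COUNT` and `TIME_WINDOW` describe a fixed-window budget of at most 1000 fetches per 60-second window, enabled by `SPEED_CONTROL = 1`. The package tracks this under its `speed_control` Redis key; the sketch below only illustrates the fixed-window idea with a counter-per-bucket scheme. The key layout and `INCR`/`EXPIRE` mechanics here are assumptions, not the package's actual implementation:

```python
import time
import redis

r = redis.Redis()

SPIDER_MAX_COUNT = 1000   # max fetches per window
TIME_WINDOW = 60          # window length in seconds

def try_acquire(project: str, task: str) -> bool:
    """Fixed-window rate limit: allow at most SPIDER_MAX_COUNT
    increments per TIME_WINDOW-second bucket."""
    bucket = int(time.time()) // TIME_WINDOW
    key = f"speed_control:{project}_{task}:{bucket}"
    count = r.incr(key)
    if count == 1:
        # First hit in this window: let the bucket expire on its own.
        r.expire(key, TIME_WINDOW)
    return count <= SPIDER_MAX_COUNT

# Usage: skip or delay a fetch once the window budget is spent.
if not try_acquire("demo_project", "demo_task"):
    time.sleep(1)
```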
cobweb/utils/__init__.py CHANGED
@@ -1,4 +1,6 @@
- from .oss import OssUtil
- from .tools import *
- from .bloom import BloomFilter
+ # from .oss import OssUtil
+ # from .bloom import BloomFilter
+ # from .dotting import LoghubDot
+ from .decorators import check_pause
+ from .tools import md5, dynamic_load_class
  
cobweb/utils/bloom.py CHANGED
@@ -1,58 +1,58 @@
- import math
- import time
- 
- import mmh3
- import redis
- from cobweb import setting
- 
- 
- class BloomFilter:
- 
-     def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
-         redis_config = redis_config or setting.REDIS_CONFIG
-         capacity = capacity or setting.CAPACITY
-         error_rate = error_rate or setting.ERROR_RATE
-         redis_config['db'] = 3
- 
-         self.key = key
- 
-         pool = redis.ConnectionPool(**redis_config)
-         self._client = redis.Redis(connection_pool=pool)
-         self.bit_size = self.get_bit_size(capacity, error_rate)
-         self.hash_count = self.get_hash_count(self.bit_size, capacity)
-         self._init_bloom_key()
- 
-     def add(self, value):
-         for seed in range(self.hash_count):
-             result = mmh3.hash(value, seed) % self.bit_size
-             self._client.setbit(self.key, result, 1)
-         return True
- 
-     def exists(self, value):
-         if not self._client.exists(self.key):
-             return False
-         for seed in range(self.hash_count):
-             result = mmh3.hash(value, seed) % self.bit_size
-             if not self._client.getbit(self.key, result):
-                 return False
-         return True
- 
-     def _init_bloom_key(self):
-         lua_script = """
-         redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
-         redis.call("EXPIRE", KEYS[1], 604800)
-         """
-         if self._client.exists(self.key):
-             return True
-         execute = self._client.register_script(lua_script)
-         execute(keys=[self.key], args=[self.bit_size-1, 1])
- 
-     @classmethod
-     def get_bit_size(cls, n, p):
-         return int(-(n * math.log(p)) / (math.log(2) ** 2))
- 
-     @classmethod
-     def get_hash_count(cls, m, n):
-         return int((m / n) * math.log(2))
- 
- 
+ # import math
+ # import time
+ #
+ # import mmh3
+ # import redis
+ # from cobweb import setting
+ #
+ #
+ # class BloomFilter:
+ #
+ #     def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
+ #         redis_config = redis_config or setting.REDIS_CONFIG
+ #         capacity = capacity or setting.CAPACITY
+ #         error_rate = error_rate or setting.ERROR_RATE
+ #         redis_config['db'] = 3
+ #
+ #         self.key = key
+ #
+ #         pool = redis.ConnectionPool(**redis_config)
+ #         self._client = redis.Redis(connection_pool=pool)
+ #         self.bit_size = self.get_bit_size(capacity, error_rate)
+ #         self.hash_count = self.get_hash_count(self.bit_size, capacity)
+ #         self._init_bloom_key()
+ #
+ #     def add(self, value):
+ #         for seed in range(self.hash_count):
+ #             result = mmh3.hash(value, seed) % self.bit_size
+ #             self._client.setbit(self.key, result, 1)
+ #         return True
+ #
+ #     def exists(self, value):
+ #         if not self._client.exists(self.key):
+ #             return False
+ #         for seed in range(self.hash_count):
+ #             result = mmh3.hash(value, seed) % self.bit_size
+ #             if not self._client.getbit(self.key, result):
+ #                 return False
+ #         return True
+ #
+ #     def _init_bloom_key(self):
+ #         lua_script = """
+ #         redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
+ #         redis.call("EXPIRE", KEYS[1], 604800)
+ #         """
+ #         if self._client.exists(self.key):
+ #             return True
+ #         execute = self._client.register_script(lua_script)
+ #         execute(keys=[self.key], args=[self.bit_size-1, 1])
+ #
+ #     @classmethod
+ #     def get_bit_size(cls, n, p):
+ #         return int(-(n * math.log(p)) / (math.log(2) ** 2))
+ #
+ #     @classmethod
+ #     def get_hash_count(cls, m, n):
+ #         return int((m / n) * math.log(2))
+ #
+ #
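
Note: the sizing helpers in the now-disabled filter are the standard Bloom-filter formulas: bit count m = -n·ln(p) / (ln 2)² and hash count k = (m/n)·ln 2. With the removed defaults (CAPACITY = 100000000, ERROR_RATE = 0.001) they work out to roughly 1.44 × 10⁹ bits (~171 MiB of Redis bitmap) and 9 hashes per lookup:

```python
import math

def bit_size(n: int, p: float) -> int:
    # m = -n * ln(p) / (ln 2)^2
    return int(-(n * math.log(p)) / (math.log(2) ** 2))

def hash_count(m: int, n: int) -> int:
    # k = (m / n) * ln 2
    return int((m / n) * math.log(2))

m = bit_size(100_000_000, 0.001)   # ≈ 1.44e9 bits (~171 MiB)
k = hash_count(m, 100_000_000)     # 9 hash functions after truncation
```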
cobweb/{base → utils}/decorators.py RENAMED
@@ -1,16 +1,6 @@
+ import time
  from functools import wraps
- 
- 
- # def check_redis_status(func):
- #     @wraps(func)
- #     def wrapper(*args, **kwargs):
- #         try:
- #             result = func(*args, **kwargs)
- #         except Exception:
- #             result = False
- #         return result
- #
- #     return wrapper
+ from cobweb.base import logger
  
  
  def decorator_oss_db(exception, retries=3):
@@ -37,4 +27,16 @@ def decorator_oss_db(exception, retries=3):
      return decorator
  
  
+ def check_pause(func):
+     @wraps(func)
+     def wrapper(self, *args, **kwargs):
+         while not self.pause.is_set():
+             try:
+                 func(self, *args, **kwargs)
+             except Exception as e:
+                 logger.info(f"{func.__name__}: " + str(e))
+             finally:
+                 time.sleep(0.1)
+         logger.info(f"Pause detected: {func.__name__} thread closing...")
  
+     return wrapper
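
Note: `check_pause` assumes the decorated method lives on an object exposing a `threading.Event` named `pause`; the wrapped loop re-runs the method (with a 0.1s sleep between calls, logging and swallowing per-call exceptions) until the event is set, as the scheduler threads above do. A minimal usage sketch, with a hypothetical `Worker` class standing in for a scheduler:

```python
import time
import threading
from cobweb.utils import check_pause

class Worker:
    def __init__(self):
        self.pause = threading.Event()

    @check_pause
    def poll(self):
        # Runs repeatedly until self.pause is set.
        print("polling...")

worker = Worker()
threading.Thread(target=worker.poll, daemon=True).start()
time.sleep(1)
worker.pause.set()   # poll loop exits and logs the "thread closing" message
```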