cobweb-launcher 1.0.5__py3-none-any.whl → 3.2.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. cobweb/__init__.py +5 -1
  2. cobweb/base/__init__.py +3 -3
  3. cobweb/base/common_queue.py +37 -16
  4. cobweb/base/item.py +40 -14
  5. cobweb/base/{log.py → logger.py} +3 -3
  6. cobweb/base/request.py +744 -47
  7. cobweb/base/response.py +381 -13
  8. cobweb/base/seed.py +98 -50
  9. cobweb/base/task_queue.py +180 -0
  10. cobweb/base/test.py +257 -0
  11. cobweb/constant.py +39 -2
  12. cobweb/crawlers/__init__.py +1 -2
  13. cobweb/crawlers/crawler.py +27 -0
  14. cobweb/db/__init__.py +1 -0
  15. cobweb/db/api_db.py +83 -0
  16. cobweb/db/redis_db.py +118 -27
  17. cobweb/launchers/__init__.py +3 -1
  18. cobweb/launchers/distributor.py +141 -0
  19. cobweb/launchers/launcher.py +103 -130
  20. cobweb/launchers/uploader.py +68 -0
  21. cobweb/log_dots/__init__.py +2 -0
  22. cobweb/log_dots/dot.py +258 -0
  23. cobweb/log_dots/loghub_dot.py +53 -0
  24. cobweb/pipelines/__init__.py +3 -2
  25. cobweb/pipelines/pipeline.py +19 -0
  26. cobweb/pipelines/pipeline_csv.py +25 -0
  27. cobweb/pipelines/pipeline_loghub.py +54 -0
  28. cobweb/schedulers/__init__.py +1 -0
  29. cobweb/schedulers/scheduler.py +66 -0
  30. cobweb/schedulers/scheduler_with_redis.py +189 -0
  31. cobweb/setting.py +37 -38
  32. cobweb/utils/__init__.py +5 -2
  33. cobweb/utils/bloom.py +58 -0
  34. cobweb/{base → utils}/decorators.py +14 -12
  35. cobweb/utils/dotting.py +300 -0
  36. cobweb/utils/oss.py +113 -86
  37. cobweb/utils/tools.py +3 -15
  38. cobweb_launcher-3.2.18.dist-info/METADATA +193 -0
  39. cobweb_launcher-3.2.18.dist-info/RECORD +44 -0
  40. {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/WHEEL +1 -1
  41. cobweb/crawlers/base_crawler.py +0 -121
  42. cobweb/crawlers/file_crawler.py +0 -181
  43. cobweb/launchers/launcher_pro.py +0 -174
  44. cobweb/pipelines/base_pipeline.py +0 -54
  45. cobweb/pipelines/loghub_pipeline.py +0 -34
  46. cobweb_launcher-1.0.5.dist-info/METADATA +0 -48
  47. cobweb_launcher-1.0.5.dist-info/RECORD +0 -32
  48. {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/LICENSE +0 -0
  49. {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/top_level.txt +0 -0
cobweb/schedulers/scheduler_with_redis.py ADDED
@@ -0,0 +1,189 @@
+ import os
+ import time
+ import threading
+ from typing import Callable
+ from cobweb.db import RedisDB, ApiDB
+ from cobweb.utils import check_pause
+ from cobweb.base import Seed, logger, TaskQueue, Status
+ from cobweb.constant import LogTemplate
+ from .scheduler import Scheduler
+ use_api = bool(os.getenv("REDIS_API_HOST", 0))
+
+
+ class RedisScheduler(Scheduler):
+
+     def __init__(
+             self,
+             task,
+             project,
+             stop: threading.Event,
+             pause: threading.Event,
+             task_queue: TaskQueue,
+             callback_register: Callable
+     ):
+         super().__init__(task, project, stop, pause, task_queue, callback_register)
+         self.todo_key = f"{{{project}:{task}}}:todo"
+         self.done_key = f"{{{project}:{task}}}:done"
+         self.fail_key = f"{{{project}:{task}}}:fail"
+         self.heartbeat_key = f"heartbeat:{project}_{task}"
+         self.heartbeat_run_key = f"run:{project}_{task}"
+         self.speed_control_key = f"speed_control:{project}_{task}"
+         self.reset_lock_key = f"lock:reset:{project}_{task}"
+         self.db = ApiDB() if use_api else RedisDB()
+
+     def reset(self):
+         """
+         Check for expired seeds and re-add them to the redis cache.
+         """
+         while not self.stop.is_set():
+             if self.db.lock(self.reset_lock_key, t=360):
+
+                 _min = -int(time.time()) + self.seed_reset_seconds
+                 self.db.members(self.todo_key, 0, _min=_min, _max="(0")
+                 self.db.delete(self.reset_lock_key)
+
+             time.sleep(self.seed_reset_seconds)
+
+     @check_pause
+     def schedule(self):
+         """
+         Schedule tasks: fetch seeds from the redis queue and add them to the doing dict.
+         """
+         if not self.db.zcount(self.todo_key, 0, "(1000"):
+             time.sleep(self.scheduler_wait_seconds)
+             return
+
+         if self.task_queue.status_length(Status.PENDING) >= self.todo_queue_size\
+                 or self.task_queue.length() > 5 * self.todo_queue_size:
+             time.sleep(self.todo_queue_full_wait_seconds)
+             return
+
+         if members := self.db.members(
+             self.todo_key, int(time.time()),
+             count=self.todo_queue_size,
+             _min=0, _max="(1000"
+         ):
+             for member, priority in members:
+                 seed = Seed(member, priority=int(priority % 1000))
+                 seed.params.get_time = time.time()
+                 self.task_queue.add_task(
+                     task_id=seed.sid, data=seed,
+                     status=Status.PENDING,
+                     priority=seed.params.priority
+                 )
+
+     @check_pause
+     def insert(self):
+         """
+         Insert new seeds into the redis queue.
+         """
+         if task_list := self.task_queue.get_task_by_status(
+             status=Status.INSERT, limit=self.new_queue_max_size
+         ):
+             seed_info, task_ids = dict(), set()
+
+             for task_item in task_list:
+                 seed = task_item.data
+                 task_ids.add(task_item.task_id)
+                 seed_info[seed.to_string] = seed.params.priority
+
+             self.db.zadd(self.todo_key, seed_info, nx=True)
+             self.task_queue.remove(task_ids)
+
+         if self.task_queue.status_length(status=Status.INSERT) < self.new_queue_max_size:
+             time.sleep(self.scheduler_wait_seconds)
+
+     @check_pause
+     def refresh(self):
+         """
+         Refresh the expiry of in-flight (doing) seeds so reset does not re-consume them.
+         """
+         if task_list := self.task_queue.get_task_by_status(
+             status=[Status.PENDING, Status.PROCESSING, Status.FINISHED],
+         ):
+             refresh_time = int(time.time())
+             seed_info = {it.data.to_string: -refresh_time - it.data.params.priority / 1000 for it in task_list}
+             self.db.zadd(self.todo_key, seed_info, xx=True)
+         time.sleep(self.seed_reset_seconds // 3)
+
+     @check_pause
+     def delete(self):
+         """
+         Remove finished seeds from the queue: route them to the done or fail queue by status and drop their doing-dict index.
+         """
+         if task_list := self.task_queue.get_task_by_status(
+             status=Status.FINISHED, limit=self.done_queue_max_size
+         ):
+             zrem_items = [it.data.to_string for it in task_list]
+             remove_task_ids = [it.task_id for it in task_list]
+             self.db.zrem(self.todo_key, *zrem_items)
+             self.task_queue.remove(remove_task_ids)
+
+         if self.task_queue.status_length(status=Status.FINISHED) < self.done_queue_max_size:
+             time.sleep(self.done_queue_wait_seconds)
+
+     def run(self):
+         start_time = int(time.time())
+
+         for func in [self.reset, self.insert, self.delete, self.refresh, self.schedule]:
+             self.callback_register(func, tag="scheduler")
+
+         while not self.stop.is_set():
+             todo_len = self.task_queue.status_length(status=Status.PENDING)
+             doing_len = self.task_queue.status_length(status=Status.PROCESSING)
+             done_len = self.task_queue.status_length(status=Status.FINISHED)
+             upload_len = self.task_queue.status_length(status=Status.UPLOAD)
+
+             redis_doing_count = self.db.zcount(self.todo_key, "-inf", "(0")
+             redis_todo_len = self.db.zcount(self.todo_key, 0, "(1000")
+             redis_seed_count = self.db.zcard(self.todo_key)
+
+             if self.pause.is_set():
+                 execute_time = int(time.time()) - start_time
+                 if not self.task_model and execute_time > self.before_scheduler_wait_seconds:
+                     logger.info("Done! ready to close thread...")
+                     self.stop.set()
+                 elif redis_todo_len:
+                     logger.info(
+                         f"Recovery {self.task} task run!"
+                         f"Todo seeds count: {redis_todo_len}"
+                         f", queue length: {redis_seed_count}"
+                     )
+                     self.pause.clear()
+                 else:
+                     logger.info("Pause! waiting for resume...")
+
+             elif self.task_queue.length() == 0:
+                 if redis_seed_count:
+                     logger.info(
+                         f"Todo seeds count: {redis_todo_len}"
+                         f", queue length: {redis_seed_count}"
+                     )
+                     self.pause.clear()
+                 else:
+                     count = 0
+                     for _ in range(3):
+                         if not redis_seed_count:
+                             count += 1
+                             time.sleep(5)
+                             logger.info("Checking count...")
+                         else:
+                             break
+                     if count >= 3:
+                         logger.info("Todo queue is empty! Pause set...")
+                         self.pause.set()
+
+             else:
+                 self.db.setex(self.heartbeat_run_key, 60, 1)
+                 logger.info(LogTemplate.launcher_pro_polling.format(
+                     task=self.task,
+                     doing_len=doing_len,
+                     todo_len=todo_len,
+                     done_len=done_len,
+                     redis_seed_count=redis_seed_count,
+                     redis_todo_len=redis_todo_len,
+                     redis_doing_len=redis_doing_count,
+                     upload_len=upload_len,
+                 ))
+
+             time.sleep(30)
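
Note: the scheduler above keeps pending and in-flight seeds in the same sorted set (todo_key) and tells them apart purely by score range. Pending seeds carry their priority as a score in [0, 1000); once handed out, refresh() re-scores them to -(claim time) - priority / 1000, so zcount(todo_key, 0, "(1000") counts pending seeds, zcount(todo_key, "-inf", "(0") counts in-flight ones, and reset() recovers members whose negative score has aged past seed_reset_seconds. A minimal standalone sketch of that scoring convention (plain Python, no Redis; the helper names are illustrative, not part of the package):

    import time

    RESET_SECONDS = 60
    todo = {}  # member -> score; stands in for the Redis sorted set behind todo_key

    def add_pending(member, priority):
        # pending seeds keep their priority as the score, always in [0, 1000)
        todo[member] = priority % 1000

    def claim(member):
        # claimed seeds are re-scored negative: -(claim time) - priority / 1000
        todo[member] = -int(time.time()) - todo[member] / 1000

    def pending():
        return [m for m, s in todo.items() if 0 <= s < 1000]   # zcount(todo_key, 0, "(1000")

    def doing():
        return [m for m, s in todo.items() if s < 0]            # zcount(todo_key, "-inf", "(0")

    def expired():
        # what reset() recovers: members claimed longer ago than RESET_SECONDS
        _min = -int(time.time()) + RESET_SECONDS
        return [m for m, s in todo.items() if _min <= s < 0]

    add_pending("https://example.com/page/1", 100)
    claim("https://example.com/page/1")
    print(pending(), doing(), expired())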
cobweb/setting.py CHANGED
@@ -1,45 +1,22 @@
- import os
-
- # redis db config
- REDIS_CONFIG = {
-     "host": os.getenv("REDIS_HOST"),
-     "password": os.getenv("REDIS_PASSWORD"),
-     "port": int(os.getenv("REDIS_PORT", 6379)),
-     "db": int(os.getenv("REDIS_DB", 0)),
- }
-
- # loghub db config
- LOGHUB_TOPIC = os.getenv("LOGHUB_TOPIC")
- LOGHUB_SOURCE = os.getenv("LOGHUB_SOURCE")
- LOGHUB_PROJECT = os.getenv("LOGHUB_PROJECT")
- LOGHUB_CONFIG = {
-     "endpoint": os.getenv("LOGHUB_ENDPOINT"),
-     "accessKeyId": os.getenv("LOGHUB_ACCESS_KEY"),
-     "accessKey": os.getenv("LOGHUB_SECRET_KEY")
- }
-
- # oss util config
- OSS_BUCKET = os.getenv("OSS_BUCKET")
- OSS_ENDPOINT = os.getenv("OSS_ENDPOINT")
- OSS_ACCESS_KEY = os.getenv("OSS_ACCESS_KEY")
- OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
- OSS_MIN_UPLOAD_SIZE = 1024 * 100
- OSS_CHUNK_SIZE = 1024 ** 2
-
  # Crawler selection
- CRAWLER = "cobweb.crawlers.CrawlerAir"
+ CRAWLER = "cobweb.crawlers.Crawler"

- # Data upload pipeline
- PIPELINE = "cobweb.pipelines.loghub_pipeline.LoghubPipeline"
+ # Data pipeline
+ PIPELINE = "cobweb.pipelines.CSV"
+
+ # Scheduler
+ SCHEDULER = "cobweb.schedulers.RedisScheduler"


  # Launcher wait times
+
+ BEFORE_SCHEDULER_WAIT_SECONDS = 60  # wait before scheduling; only applies to one-shot tasks
  SCHEDULER_WAIT_SECONDS = 15  # scheduler wait time
  TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait when the todo queue is full
  NEW_QUEUE_WAIT_SECONDS = 30  # new queue wait time
- DONE_QUEUE_WAIT_SECONDS = 15  # done queue wait time
+ DONE_QUEUE_WAIT_SECONDS = 5  # done queue wait time
  UPLOAD_QUEUE_WAIT_SECONDS = 15  # upload queue wait time
- SEED_RESET_SECONDS = 600  # seed reset interval
+ SEED_RESET_SECONDS = 60  # seed reset interval


  # Launcher queue sizes
@@ -51,12 +28,34 @@ UPLOAD_QUEUE_MAX_SIZE = 100  # upload queue size
  # DONE_MODEL in (0, 1), seed completion mode
  DONE_MODEL = 0  # 0: a successfully consumed seed is removed from the queue directly, a failed one goes to the fail queue; 1: successes go to the done queue, failures to the fail queue

- # DOWNLOAD_MODEL in (0, 1), download mode
- DOWNLOAD_MODEL = 0  # 0: generic download; 1: file download
-
  # spider
  SPIDER_THREAD_NUM = 10
  SPIDER_MAX_RETRIES = 5
+ SPIDER_TIME_SLEEP = 10
+ RECORD_FAILED_SPIDER = True
+
+ SPIDER_MAX_COUNT = 1000  # maximum number of fetches within the time window
+ TIME_WINDOW = 60  # fixed rate-control time window (seconds)
+
+ # Task mode: 0 = one-shot, 1 = resident
+ TASK_MODEL = 0
+
+ # Speed control: 0 = off, 1 = on
+ SPEED_CONTROL = 1
+
+ DOT = 0
+
+ # redis config
+ REDIS_CONFIG = {
+     "host": "127.0.0.1",
+     "port": 6379,
+     "db": 0
+ }

- # file-download response content-type filter
- FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
+ # loghub pipeline config
+ # os.getenv("LOGHUB_ENDPOINT"),
+ # os.getenv("LOGHUB_ACCESS_KEY"),
+ # os.getenv("LOGHUB_SECRET_KEY")
+ # os.getenv("LOGHUB_PROJECT")
+ # os.getenv("LOGHUB_SOURCE")
+ # os.getenv("LOGHUB_TOPIC")
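
Note: setting.py is now a plain module of constants with hard-coded defaults (the env-var driven REDIS/LOGHUB/OSS blocks were dropped). A minimal sketch of how such module-level settings are typically overridden in user code before the launcher starts; this is an assumption about usage, not an API documented in the diff:

    from cobweb import setting

    # illustrative overrides; the names come from setting.py above
    setting.SPIDER_THREAD_NUM = 20
    setting.TASK_MODEL = 1  # resident task instead of one-shot
    setting.REDIS_CONFIG = {"host": "10.0.0.5", "port": 6379, "db": 1}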
cobweb/utils/__init__.py CHANGED
@@ -1,3 +1,6 @@
- from .oss import OssUtil
- from .tools import *
+ # from .oss import OssUtil
+ # from .bloom import BloomFilter
+ # from .dotting import LoghubDot
+ from .decorators import check_pause
+ from .tools import md5, dynamic_load_class

cobweb/utils/bloom.py ADDED
@@ -0,0 +1,58 @@
+ # import math
+ # import time
+ #
+ # import mmh3
+ # import redis
+ # from cobweb import setting
+ #
+ #
+ # class BloomFilter:
+ #
+ #     def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
+ #         redis_config = redis_config or setting.REDIS_CONFIG
+ #         capacity = capacity or setting.CAPACITY
+ #         error_rate = error_rate or setting.ERROR_RATE
+ #         redis_config['db'] = 3
+ #
+ #         self.key = key
+ #
+ #         pool = redis.ConnectionPool(**redis_config)
+ #         self._client = redis.Redis(connection_pool=pool)
+ #         self.bit_size = self.get_bit_size(capacity, error_rate)
+ #         self.hash_count = self.get_hash_count(self.bit_size, capacity)
+ #         self._init_bloom_key()
+ #
+ #     def add(self, value):
+ #         for seed in range(self.hash_count):
+ #             result = mmh3.hash(value, seed) % self.bit_size
+ #             self._client.setbit(self.key, result, 1)
+ #         return True
+ #
+ #     def exists(self, value):
+ #         if not self._client.exists(self.key):
+ #             return False
+ #         for seed in range(self.hash_count):
+ #             result = mmh3.hash(value, seed) % self.bit_size
+ #             if not self._client.getbit(self.key, result):
+ #                 return False
+ #         return True
+ #
+ #     def _init_bloom_key(self):
+ #         lua_script = """
+ #         redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
+ #         redis.call("EXPIRE", KEYS[1], 604800)
+ #         """
+ #         if self._client.exists(self.key):
+ #             return True
+ #         execute = self._client.register_script(lua_script)
+ #         execute(keys=[self.key], args=[self.bit_size-1, 1])
+ #
+ #     @classmethod
+ #     def get_bit_size(cls, n, p):
+ #         return int(-(n * math.log(p)) / (math.log(2) ** 2))
+ #
+ #     @classmethod
+ #     def get_hash_count(cls, m, n):
+ #         return int((m / n) * math.log(2))
+ #
+ #
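
The commented-out BloomFilter sizes itself with the standard formulas m = -n·ln(p) / (ln 2)² bits and k = (m / n)·ln 2 hash rounds. A quick standalone check of those two class methods (plain math, no Redis or mmh3 required):

    import math

    def bit_size(n, p):
        # m = -(n * ln p) / (ln 2)^2
        return int(-(n * math.log(p)) / (math.log(2) ** 2))

    def hash_count(m, n):
        # k = (m / n) * ln 2
        return int((m / n) * math.log(2))

    m = bit_size(1_000_000, 0.001)   # ~14.4 million bits, roughly 1.7 MB
    k = hash_count(m, 1_000_000)     # int() truncates ~9.97 to 9 hash rounds
    print(m, k)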
cobweb/{base → utils}/decorators.py RENAMED
@@ -1,16 +1,6 @@
+ import time
  from functools import wraps
-
-
- # def check_redis_status(func):
- #     @wraps(func)
- #     def wrapper(*args, **kwargs):
- #         try:
- #             result = func(*args, **kwargs)
- #         except Exception:
- #             result = False
- #         return result
- #
- #     return wrapper
+ from cobweb.base import logger


  def decorator_oss_db(exception, retries=3):
@@ -37,4 +27,16 @@ def decorator_oss_db(exception, retries=3):
      return decorator


+ def check_pause(func):
+     @wraps(func)
+     def wrapper(self, *args, **kwargs):
+         while not self.pause.is_set():
+             try:
+                 func(self, *args, **kwargs)
+             except Exception as e:
+                 logger.info(f"{func.__name__}: " + str(e))
+             finally:
+                 time.sleep(0.1)
+         logger.info(f"Pause detected: {func.__name__} thread closing...")

+     return wrapper
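
check_pause assumes it decorates a bound method of an object that exposes a `pause` threading.Event: the wrapped call loops until pause is set, logging rather than raising exceptions. A minimal usage sketch (the Worker class is illustrative, not part of the package; check_pause itself is re-exported from cobweb.utils as shown in the __init__ diff above):

    import threading
    import time
    from cobweb.utils import check_pause

    class Worker:
        # check_pause expects the instance to carry a `pause` threading.Event
        def __init__(self):
            self.pause = threading.Event()

        @check_pause
        def poll(self):
            print("polling once")

    w = Worker()
    t = threading.Thread(target=w.poll)
    t.start()
    time.sleep(0.3)
    w.pause.set()   # the decorated loop notices the Event and the thread winds down
    t.join()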
cobweb/utils/dotting.py ADDED
@@ -0,0 +1,300 @@
+ import os
+ import json
+ import time
+ from threading import Event
+ from requests import RequestException, Response as requests_Response
+
+ from cobweb.base import Queue, Request, Seed, Response, BaseItem, logger
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
+
+
+ class LoghubDot:
+
+     def __init__(self, stop: Event, project: str, task: str) -> None:
+         self._stop = stop
+         self._queue = Queue()
+         self._client = LogClient(
+             endpoint=os.getenv("LOGHUB_ENDPOINT"),
+             accessKeyId=os.getenv("LOGHUB_ACCESS_KEY"),
+             accessKey=os.getenv("LOGHUB_SECRET_KEY")
+         )
+         self.project = project
+         self.task = task
+
+     def logging(self, topic, msg):
+         log_item = LogItem()
+         log_data = {
+             "stage": topic,
+             "message": msg,
+             "project": self.project,
+             "task": self.task,
+         }
+
+         for key, value in log_data.items():
+             if not isinstance(value, str):
+                 log_data[key] = json.dumps(value, ensure_ascii=False)
+             else:
+                 log_data[key] = value
+
+         contents = sorted(log_data.items())
+         log_item.set_contents(contents)
+         self._queue.push(log_item)
+
+     def _build_request_log(self, request_item: Request):
+         log_item = LogItem()
+
+         seed: Seed = request_item.seed
+         get_time = seed.params.get_time
+         start_time = seed.params.start_time
+         request_time = seed.params.request_time
+         stage_cost = request_time - start_time
+         cost = request_time - start_time
+
+         request_settings = json.dumps(
+             request_item.request_settings,
+             ensure_ascii=False, separators=(',', ':')
+         )
+
+         log_data = {
+             "stage": "request",
+             "project": self.project,
+             "task": self.task,
+             "seed": seed.to_string,
+             "request": repr(request_item),
+             "request_settings": request_settings,
+             "get_time": get_time,
+             "start_time": start_time,
+             "stage_cost": stage_cost,
+             "cost": cost,
+             "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(request_time)),
+         }
+
+         for key, value in log_data.items():
+             if not isinstance(value, str):
+                 log_data[key] = json.dumps(value, ensure_ascii=False)
+             else:
+                 log_data[key] = value
+
+         contents = sorted(log_data.items())
+         log_item.set_contents(contents)
+         self._queue.push(log_item)
+
+     def _build_download_log(self, response_item: Response):
+         """
+         Build the log item for the download stage.
+
+         Args:
+             response_item: the response object
+         """
+         log_item = LogItem()
+
+         seed: Seed = response_item.seed
+         get_time = seed.params.get_time
+         start_time = seed.params.start_time
+         request_time = seed.params.request_time
+         download_time = seed.params.download_time
+         stage_cost = download_time - request_time
+         cost = download_time - start_time
+
+         log_data = {
+             "stage": "download",
+             "project": self.project,
+             "task": self.task,
+             "seed": seed.to_string,
+             "response": repr(response_item),
+             "get_time": get_time,
+             "start_time": start_time,
+             "request_time": request_time,
+             "download_time": download_time,
+             "stage_cost": stage_cost,
+             "cost": cost,
+             "proxy": seed.params.proxy or '-',
+             "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(download_time)),
+         }
+
+         response = response_item.response
+         if isinstance(response, requests_Response):
+             log_data['request_info'] = {
+                 'method': response.request.method,
+                 'url': response.request.url,
+                 'headers': dict(response.request.headers),
+                 'body': response.request.body or "-",
+             }
+             log_data['response_info'] = {
+                 "status_code": response.status_code,
+                 "reason": response.reason,
+                 "headers": dict(response.headers),
+                 "content": response.text[:500],  # truncated content
+                 "content_type": response.headers.get('content-type', '-'),
+                 "content_length": response.headers.get('content-length', '-'),
+                 "server": response.headers.get('server', '-'),
+                 "date": response.headers.get('date', '-'),
+             }
+
+         for key, value in log_data.items():
+             if not isinstance(value, str):
+                 log_data[key] = json.dumps(value, ensure_ascii=False)
+             else:
+                 log_data[key] = value
+
+         contents = sorted(log_data.items())
+         log_item.set_contents(contents)
+         self._queue.push(log_item)
+
+     def _build_parse_log(self, parse_item: BaseItem):
+         log_item = LogItem()
+
+         seed: Seed = parse_item.seed
+         get_time = seed.params.get_time
+         start_time = seed.params.start_time
+         request_time = seed.params.request_time
+         response_time = seed.params.response_time
+         parse_time = seed.params.parse_time
+
+         pre_time = request_time or response_time
+         stage_cost = parse_time - pre_time
+         cost = parse_time - start_time
+
+         log_data = {
+             "stage": "parse",
+             "project": self.project,
+             "task": self.task,
+             "seed": seed.to_string,
+             "parse": repr(parse_item),
+             "get_time": get_time,
+             "start_time": start_time,
+             "parse_time": parse_time,
+             "stage_cost": stage_cost,
+             "cost": cost,
+             "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(parse_time)),
+         }
+
+         for key, value in log_data.items():
+             if not isinstance(value, str):
+                 log_data[key] = json.dumps(value, ensure_ascii=False)
+             else:
+                 log_data[key] = value
+
+         contents = sorted(log_data.items())
+         log_item.set_contents(contents)
+         self._queue.push(log_item)
+
+     def _build_http_error_log(self, seed: Seed, e: RequestException):
+         log_item = LogItem()
+
+         status_code = getattr(e.response, 'status_code', '-')
+
+         request_info = {
+             'method': getattr(e.request, 'method', '-'),
+             'url': getattr(e.request, 'url', '-'),
+             'headers': dict(getattr(e.request, 'headers', {})),
+             'body': getattr(e.request, 'body', '-'),
+         }
+
+         response_info = {
+             'status_code': getattr(e.response, 'status_code', '-'),
+             'reason': getattr(e.response, 'reason', '-'),
+             'headers': dict(getattr(e.response, 'headers', {})),
+             'content': getattr(e.response, 'text', '')[:500],
+             'content_type': e.response.headers.get('content-type', '-') if e.response else '-',
+             'content_length': e.response.headers.get('content-length', '-') if e.response else '-',
+             'server': e.response.headers.get('server', '-') if e.response else '-',
+             'date': e.response.headers.get('date', '-') if e.response else '-',
+         }
+         retry = seed.params.retry
+         get_time = seed.params.get_time
+         start_time = seed.params.start_time
+         failed_time = seed.params.failed_time
+         cost = failed_time - start_time
+
+         log_data = {
+             "stage": "http_error",
+             "project": self.project,
+             "task": self.task,
+             "seed": seed.to_string,
+             "status_code": status_code,
+             "request_info": request_info,
+             "response_info": response_info,
+             "retry": retry,
+             "proxy": seed.params.proxy or '-',
+             "exception_type": type(e).__name__,
+             "exception_message": str(e),
+             "traceback": seed.params.traceback or '-',
+             "get_time": get_time,
+             "start_time": start_time,
+             "error_time": failed_time,
+             "stage_cost": cost,
+             "cost": cost,
+             "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(failed_time)),
+         }
+
+         for key, value in log_data.items():
+             if not isinstance(value, str):
+                 log_data[key] = json.dumps(value, ensure_ascii=False)
+             else:
+                 log_data[key] = value
+
+         contents = sorted(log_data.items())
+         log_item.set_contents(contents)
+         self._queue.push(log_item)
+
+     def _build_exception_log(self, seed: Seed, e: Exception):
+         log_item = LogItem()
+
+         retry = seed.params.retry
+         get_time = seed.params.get_time
+         start_time = seed.params.start_time
+         failed_time = seed.params.failed_time
+         cost = failed_time - start_time
+
+         log_data = {
+             "stage": "exception",
+             "project": self.project,
+             "task": self.task,
+             "seed": seed.to_string,
+             "retry": retry,
+             "exception_type": type(e).__name__,
+             "exception_message": str(e),
+             "traceback": seed.params.traceback or '-',
+             "proxy": seed.params.proxy or '-',
+             "get_time": get_time,
+             "start_time": start_time,
+             "error_time": failed_time,
+             "stage_cost": cost,
+             "cost": cost,
+             "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(failed_time)),
+         }
+
+         for key, value in log_data.items():
+             if not isinstance(value, str):
+                 log_data[key] = json.dumps(value, ensure_ascii=False)
+             else:
+                 log_data[key] = value
+
+         contents = sorted(log_data.items())
+         log_item.set_contents(contents)
+         self._queue.push(log_item)
+
+     def _build_run(self):
+         while not self._stop.is_set():
+             try:
+                 items = []
+                 start_time = int(time.time())
+
+                 while len(items) < 1000:
+                     log_item = self._queue.pop()
+                     if not log_item or (int(time.time()) - start_time > 10):
+                         break
+                     items.append(log_item)
+
+                 if items:
+                     request = PutLogsRequest(
+                         project="databee-download-log",
+                         logstore="log",
+                         topic="cobweb",
+                         logitems=items,
+                         compress=True
+                     )
+                     self._client.put_logs(request=request)
+             except Exception as e:
+                 logger.info(str(e))
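
LoghubDot buffers LogItem objects on an internal queue and the background loop _build_run flushes up to 1000 of them at a time to Aliyun SLS via PutLogsRequest. A rough usage sketch under the assumptions visible above (the LOGHUB_* environment variables are set; in cobweb the flush loop is normally registered by the launcher, so running it on a plain thread here is only illustrative):

    import threading
    from cobweb.utils.dotting import LoghubDot

    stop = threading.Event()
    dot = LoghubDot(stop=stop, project="my_project", task="my_task")

    # flush loop; inside cobweb this is wired up via callback_register
    threading.Thread(target=dot._build_run, daemon=True).start()

    # push one custom log record onto the internal queue
    dot.logging("custom_stage", {"url": "https://example.com", "note": "hello"})

    stop.set()  # stop the flush loop when the run ends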