cobweb-launcher 1.0.5__py3-none-any.whl → 3.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/__init__.py +5 -1
- cobweb/base/__init__.py +3 -3
- cobweb/base/common_queue.py +37 -16
- cobweb/base/item.py +40 -14
- cobweb/base/{log.py → logger.py} +3 -3
- cobweb/base/request.py +744 -47
- cobweb/base/response.py +381 -13
- cobweb/base/seed.py +98 -50
- cobweb/base/task_queue.py +180 -0
- cobweb/base/test.py +257 -0
- cobweb/constant.py +39 -2
- cobweb/crawlers/__init__.py +1 -2
- cobweb/crawlers/crawler.py +27 -0
- cobweb/db/__init__.py +1 -0
- cobweb/db/api_db.py +83 -0
- cobweb/db/redis_db.py +118 -27
- cobweb/launchers/__init__.py +3 -1
- cobweb/launchers/distributor.py +141 -0
- cobweb/launchers/launcher.py +103 -130
- cobweb/launchers/uploader.py +68 -0
- cobweb/log_dots/__init__.py +2 -0
- cobweb/log_dots/dot.py +258 -0
- cobweb/log_dots/loghub_dot.py +53 -0
- cobweb/pipelines/__init__.py +3 -2
- cobweb/pipelines/pipeline.py +19 -0
- cobweb/pipelines/pipeline_csv.py +25 -0
- cobweb/pipelines/pipeline_loghub.py +54 -0
- cobweb/schedulers/__init__.py +1 -0
- cobweb/schedulers/scheduler.py +66 -0
- cobweb/schedulers/scheduler_with_redis.py +189 -0
- cobweb/setting.py +37 -38
- cobweb/utils/__init__.py +5 -2
- cobweb/utils/bloom.py +58 -0
- cobweb/{base → utils}/decorators.py +14 -12
- cobweb/utils/dotting.py +300 -0
- cobweb/utils/oss.py +113 -86
- cobweb/utils/tools.py +3 -15
- cobweb_launcher-3.2.18.dist-info/METADATA +193 -0
- cobweb_launcher-3.2.18.dist-info/RECORD +44 -0
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/WHEEL +1 -1
- cobweb/crawlers/base_crawler.py +0 -121
- cobweb/crawlers/file_crawler.py +0 -181
- cobweb/launchers/launcher_pro.py +0 -174
- cobweb/pipelines/base_pipeline.py +0 -54
- cobweb/pipelines/loghub_pipeline.py +0 -34
- cobweb_launcher-1.0.5.dist-info/METADATA +0 -48
- cobweb_launcher-1.0.5.dist-info/RECORD +0 -32
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/top_level.txt +0 -0
cobweb/schedulers/scheduler_with_redis.py
ADDED
@@ -0,0 +1,189 @@
+import os
+import time
+import threading
+from typing import Callable
+from cobweb.db import RedisDB, ApiDB
+from cobweb.utils import check_pause
+from cobweb.base import Seed, logger, TaskQueue, Status
+from cobweb.constant import LogTemplate
+from .scheduler import Scheduler
+use_api = bool(os.getenv("REDIS_API_HOST", 0))
+
+
+class RedisScheduler(Scheduler):
+
+    def __init__(
+            self,
+            task,
+            project,
+            stop: threading.Event,
+            pause: threading.Event,
+            task_queue: TaskQueue,
+            callback_register: Callable
+    ):
+        super().__init__(task, project, stop, pause, task_queue, callback_register)
+        self.todo_key = f"{{{project}:{task}}}:todo"
+        self.done_key = f"{{{project}:{task}}}:done"
+        self.fail_key = f"{{{project}:{task}}}:fail"
+        self.heartbeat_key = f"heartbeat:{project}_{task}"
+        self.heartbeat_run_key = f"run:{project}_{task}"
+        self.speed_control_key = f"speed_control:{project}_{task}"
+        self.reset_lock_key = f"lock:reset:{project}_{task}"
+        self.db = ApiDB() if use_api else RedisDB()
+
+    def reset(self):
+        """
+        Check for expired seeds and put them back into the Redis cache.
+        """
+        while not self.stop.is_set():
+            if self.db.lock(self.reset_lock_key, t=360):
+
+                _min = -int(time.time()) + self.seed_reset_seconds
+                self.db.members(self.todo_key, 0, _min=_min, _max="(0")
+                self.db.delete(self.reset_lock_key)
+
+            time.sleep(self.seed_reset_seconds)
+
+    @check_pause
+    def schedule(self):
+        """
+        Schedule tasks: fetch seeds from the Redis queue and add them to the doing dict.
+        """
+        if not self.db.zcount(self.todo_key, 0, "(1000"):
+            time.sleep(self.scheduler_wait_seconds)
+            return
+
+        if self.task_queue.status_length(Status.PENDING) >= self.todo_queue_size\
+                or self.task_queue.length() > 5 * self.todo_queue_size:
+            time.sleep(self.todo_queue_full_wait_seconds)
+            return
+
+        if members := self.db.members(
+                self.todo_key, int(time.time()),
+                count=self.todo_queue_size,
+                _min=0, _max="(1000"
+        ):
+            for member, priority in members:
+                seed = Seed(member, priority=int(priority % 1000))
+                seed.params.get_time = time.time()
+                self.task_queue.add_task(
+                    task_id=seed.sid, data=seed,
+                    status=Status.PENDING,
+                    priority=seed.params.priority
+                )
+
+    @check_pause
+    def insert(self):
+        """
+        Insert new seeds into the Redis queue.
+        """
+        if task_list := self.task_queue.get_task_by_status(
+                status=Status.INSERT, limit=self.new_queue_max_size
+        ):
+            seed_info, task_ids = dict(), set()
+
+            for task_item in task_list:
+                seed = task_item.data
+                task_ids.add(task_item.task_id)
+                seed_info[seed.to_string] = seed.params.priority
+
+            self.db.zadd(self.todo_key, seed_info, nx=True)
+            self.task_queue.remove(task_ids)
+
+        if self.task_queue.status_length(status=Status.INSERT) < self.new_queue_max_size:
+            time.sleep(self.scheduler_wait_seconds)
+
+    @check_pause
+    def refresh(self):
+        """
+        Refresh the expiry of in-flight (doing) seeds so reset() does not re-consume them.
+        """
+        if task_list := self.task_queue.get_task_by_status(
+                status=[Status.PENDING, Status.PROCESSING, Status.FINISHED],
+        ):
+            refresh_time = int(time.time())
+            seed_info = {it.data.to_string: -refresh_time - it.data.params.priority / 1000 for it in task_list}
+            self.db.zadd(self.todo_key, seed_info, xx=True)
+        time.sleep(self.seed_reset_seconds // 3)
+
+    @check_pause
+    def delete(self):
+        """
+        Remove seeds from the queue, add them to the done or fail queue by status, and drop their index from the doing dict.
+        """
+        if task_list := self.task_queue.get_task_by_status(
+                status=Status.FINISHED, limit=self.done_queue_max_size
+        ):
+            zrem_items = [it.data.to_string for it in task_list]
+            remove_task_ids = [it.task_id for it in task_list]
+            self.db.zrem(self.todo_key, *zrem_items)
+            self.task_queue.remove(remove_task_ids)
+
+        if self.task_queue.status_length(status=Status.FINISHED) < self.done_queue_max_size:
+            time.sleep(self.done_queue_wait_seconds)
+
+    def run(self):
+        start_time = int(time.time())
+
+        for func in [self.reset, self.insert, self.delete, self.refresh, self.schedule]:
+            self.callback_register(func, tag="scheduler")
+
+        while not self.stop.is_set():
+            todo_len = self.task_queue.status_length(status=Status.PENDING)
+            doing_len = self.task_queue.status_length(status=Status.PROCESSING)
+            done_len = self.task_queue.status_length(status=Status.FINISHED)
+            upload_len = self.task_queue.status_length(status=Status.UPLOAD)
+
+            redis_doing_count = self.db.zcount(self.todo_key, "-inf", "(0")
+            redis_todo_len = self.db.zcount(self.todo_key, 0, "(1000")
+            redis_seed_count = self.db.zcard(self.todo_key)
+
+            if self.pause.is_set():
+                execute_time = int(time.time()) - start_time
+                if not self.task_model and execute_time > self.before_scheduler_wait_seconds:
+                    logger.info("Done! ready to close thread...")
+                    self.stop.set()
+                elif redis_todo_len:
+                    logger.info(
+                        f"Recovery {self.task} task run!"
+                        f"Todo seeds count: {redis_todo_len}"
+                        f", queue length: {redis_seed_count}"
+                    )
+                    self.pause.clear()
+                else:
+                    logger.info("Pause! waiting for resume...")
+
+            elif self.task_queue.length() == 0:
+                if redis_seed_count:
+                    logger.info(
+                        f"Todo seeds count: {redis_todo_len}"
+                        f", queue length: {redis_seed_count}"
+                    )
+                    self.pause.clear()
+                else:
+                    count = 0
+                    for _ in range(3):
+                        if not redis_seed_count:
+                            count += 1
+                            time.sleep(5)
+                            logger.info("Checking count...")
+                        else:
+                            break
+                    if count >= 3:
+                        logger.info("Todo queue is empty! Pause set...")
+                        self.pause.set()
+
+            else:
+                self.db.setex(self.heartbeat_run_key, 60, 1)
+                logger.info(LogTemplate.launcher_pro_polling.format(
+                    task=self.task,
+                    doing_len=doing_len,
+                    todo_len=todo_len,
+                    done_len=done_len,
+                    redis_seed_count=redis_seed_count,
+                    redis_todo_len=redis_todo_len,
+                    redis_doing_len=redis_doing_count,
+                    upload_len=upload_len,
+                ))
+
+            time.sleep(30)
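Note: the new scheduler keeps every seed in one Redis sorted set and appears to encode state in the score: pending seeds sit in [0, 1000) with their priority as the score, claimed seeds are re-scored negative by refresh(), and reset() recovers claimed seeds whose last refresh is older than SEED_RESET_SECONDS. The RedisDB/ApiDB helpers (members, zadd, zcount) are not part of this diff, so the following is only a self-contained sketch of that score arithmetic under those assumptions; the function names are illustrative, not package APIs.

SEED_RESET_SECONDS = 60  # mirrors the new setting.py default

def pending_score(priority: int) -> float:
    # Pending seeds appear to live in [0, 1000): schedule() reads the range
    # [0, "(1000") and recovers the priority as score % 1000.
    return priority

def inflight_score(refresh_time: int, priority: int) -> float:
    # refresh() re-scores claimed seeds below zero so they leave the pending
    # range, keeping the priority in the fractional part.
    return -refresh_time - priority / 1000

def is_expired(score: float, now: int) -> bool:
    # reset() scans [-now + SEED_RESET_SECONDS, "(0"): any in-flight seed whose
    # last refresh is at least SEED_RESET_SECONDS old falls in that window.
    return (-now + SEED_RESET_SECONDS) <= score < 0

now = 1_700_000_000
fresh = inflight_score(now - 10, priority=300)    # refreshed 10 s ago
stale = inflight_score(now - 120, priority=300)   # refreshed 2 min ago
assert not is_expired(fresh, now)
assert is_expired(stale, now)
assert 0 <= pending_score(300) < 1000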
cobweb/setting.py
CHANGED
@@ -1,45 +1,22 @@
-import os
-
-# redis db config
-REDIS_CONFIG = {
-    "host": os.getenv("REDIS_HOST"),
-    "password": os.getenv("REDIS_PASSWORD"),
-    "port": int(os.getenv("REDIS_PORT", 6379)),
-    "db": int(os.getenv("REDIS_DB", 0)),
-}
-
-# loghub db config
-LOGHUB_TOPIC = os.getenv("LOGHUB_TOPIC")
-LOGHUB_SOURCE = os.getenv("LOGHUB_SOURCE")
-LOGHUB_PROJECT = os.getenv("LOGHUB_PROJECT")
-LOGHUB_CONFIG = {
-    "endpoint": os.getenv("LOGHUB_ENDPOINT"),
-    "accessKeyId": os.getenv("LOGHUB_ACCESS_KEY"),
-    "accessKey": os.getenv("LOGHUB_SECRET_KEY")
-}
-
-# oss util config
-OSS_BUCKET = os.getenv("OSS_BUCKET")
-OSS_ENDPOINT = os.getenv("OSS_ENDPOINT")
-OSS_ACCESS_KEY = os.getenv("OSS_ACCESS_KEY")
-OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
-OSS_MIN_UPLOAD_SIZE = 1024 * 100
-OSS_CHUNK_SIZE = 1024 ** 2
-
 # Crawler selection
-CRAWLER = "cobweb.crawlers.
+CRAWLER = "cobweb.crawlers.Crawler"
 
-#
-PIPELINE = "cobweb.pipelines.
+# Data pipeline
+PIPELINE = "cobweb.pipelines.CSV"
+
+# Scheduler
+SCHEDULER = "cobweb.schedulers.RedisScheduler"
 
 
 # Launcher wait times
+
+BEFORE_SCHEDULER_WAIT_SECONDS = 60  # wait before scheduling; only applies to one-shot tasks
 SCHEDULER_WAIT_SECONDS = 15  # scheduler wait time
 TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait time when the todo queue is full
 NEW_QUEUE_WAIT_SECONDS = 30  # new-queue wait time
-DONE_QUEUE_WAIT_SECONDS =
+DONE_QUEUE_WAIT_SECONDS = 5  # done-queue wait time
 UPLOAD_QUEUE_WAIT_SECONDS = 15  # upload-queue wait time
-SEED_RESET_SECONDS =
+SEED_RESET_SECONDS = 60  # seed reset interval
 
 
 # Launcher queue sizes
@@ -51,12 +28,34 @@ UPLOAD_QUEUE_MAX_SIZE = 100  # upload queue size
 # DONE_MODEL IN (0, 1), seed completion mode
 DONE_MODEL = 0  # 0: successfully consumed seeds are removed from the queue and failures go to the fail queue; 1: successes go to the done queue and failures go to the fail queue
 
-# DOWNLOAD_MODEL IN (0, 1), download mode
-DOWNLOAD_MODEL = 0  # 0: generic download; 1: file download
-
 # spider
 SPIDER_THREAD_NUM = 10
 SPIDER_MAX_RETRIES = 5
+SPIDER_TIME_SLEEP = 10
+RECORD_FAILED_SPIDER = True
+
+SPIDER_MAX_COUNT = 1000  # max number of fetches within the time window
+TIME_WINDOW = 60  # fixed rate-control window (seconds)
+
+# task mode: 0 = one-shot, 1 = resident
+TASK_MODEL = 0
+
+# speed control: 0 = off, 1 = on
+SPEED_CONTROL = 1
+
+DOT = 0
+
+# redis config
+REDIS_CONFIG = {
+    "host": "127.0.0.1",
+    "port": 6379,
+    "db": 0
+}
 
-#
-
+# loghub pipeline config
+# os.getenv("LOGHUB_ENDPOINT"),
+# os.getenv("LOGHUB_ACCESS_KEY"),
+# os.getenv("LOGHUB_SECRET_KEY")
+# os.getenv("LOGHUB_PROJECT")
+# os.getenv("LOGHUB_SOURCE")
+# os.getenv("LOGHUB_TOPIC")
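Note: SPIDER_MAX_COUNT and TIME_WINDOW describe a fixed-window rate limit (here 1000 fetches per 60 s, roughly 16 requests per second). How cobweb enforces it is not shown in this diff (the speed_control_key in the new scheduler suggests a Redis-backed counter); the sketch below only illustrates the semantics of the two settings with a hypothetical in-process counter.

import time

SPIDER_MAX_COUNT = 1000  # max fetches per window (from setting.py)
TIME_WINDOW = 60         # window length in seconds (from setting.py)

class FixedWindowLimiter:
    """In-process stand-in for the Redis-backed speed control; illustrative only."""

    def __init__(self, max_count: int, window: int) -> None:
        self.max_count = max_count
        self.window = window
        self.window_start = time.time()
        self.count = 0

    def allow(self) -> bool:
        now = time.time()
        if now - self.window_start >= self.window:
            # New window: reset the counter.
            self.window_start = now
            self.count = 0
        if self.count < self.max_count:
            self.count += 1
            return True
        return False

limiter = FixedWindowLimiter(SPIDER_MAX_COUNT, TIME_WINDOW)
allowed = sum(limiter.allow() for _ in range(1500))
print(allowed)  # 1000: calls beyond SPIDER_MAX_COUNT inside one window are rejected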
cobweb/utils/__init__.py
CHANGED
cobweb/utils/bloom.py
ADDED
@@ -0,0 +1,58 @@
+# import math
+# import time
+#
+# import mmh3
+# import redis
+# from cobweb import setting
+#
+#
+# class BloomFilter:
+#
+#     def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
+#         redis_config = redis_config or setting.REDIS_CONFIG
+#         capacity = capacity or setting.CAPACITY
+#         error_rate = error_rate or setting.ERROR_RATE
+#         redis_config['db'] = 3
+#
+#         self.key = key
+#
+#         pool = redis.ConnectionPool(**redis_config)
+#         self._client = redis.Redis(connection_pool=pool)
+#         self.bit_size = self.get_bit_size(capacity, error_rate)
+#         self.hash_count = self.get_hash_count(self.bit_size, capacity)
+#         self._init_bloom_key()
+#
+#     def add(self, value):
+#         for seed in range(self.hash_count):
+#             result = mmh3.hash(value, seed) % self.bit_size
+#             self._client.setbit(self.key, result, 1)
+#         return True
+#
+#     def exists(self, value):
+#         if not self._client.exists(self.key):
+#             return False
+#         for seed in range(self.hash_count):
+#             result = mmh3.hash(value, seed) % self.bit_size
+#             if not self._client.getbit(self.key, result):
+#                 return False
+#         return True
+#
+#     def _init_bloom_key(self):
+#         lua_script = """
+#         redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
+#         redis.call("EXPIRE", KEYS[1], 604800)
+#         """
+#         if self._client.exists(self.key):
+#             return True
+#         execute = self._client.register_script(lua_script)
+#         execute(keys=[self.key], args=[self.bit_size-1, 1])
+#
+#     @classmethod
+#     def get_bit_size(cls, n, p):
+#         return int(-(n * math.log(p)) / (math.log(2) ** 2))
+#
+#     @classmethod
+#     def get_hash_count(cls, m, n):
+#         return int((m / n) * math.log(2))
+#
+#
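Note: the commented-out BloomFilter sizes its bitmap with the standard formulas m = -n·ln(p)/(ln 2)² and k = (m/n)·ln 2. A worked example for one million keys at a 0.1% false-positive rate (the capacity and error rate below are illustrative, not package defaults):

import math

def get_bit_size(n: int, p: float) -> int:
    # m = -n * ln(p) / (ln 2)^2, as in the commented-out BloomFilter
    return int(-(n * math.log(p)) / (math.log(2) ** 2))

def get_hash_count(m: int, n: int) -> int:
    # k = (m / n) * ln 2
    return int((m / n) * math.log(2))

n, p = 1_000_000, 0.001      # example capacity and error rate
m = get_bit_size(n, p)       # ≈ 14,377,587 bits, i.e. a Redis bitmap of roughly 1.7 MB
k = get_hash_count(m, n)     # 9 probes per key (the formula gives ≈ 9.97; int() truncates)
print(m, k)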
cobweb/{base → utils}/decorators.py
RENAMED
@@ -1,16 +1,6 @@
+import time
 from functools import wraps
-
-
-# def check_redis_status(func):
-#     @wraps(func)
-#     def wrapper(*args, **kwargs):
-#         try:
-#             result = func(*args, **kwargs)
-#         except Exception:
-#             result = False
-#         return result
-#
-#     return wrapper
+from cobweb.base import logger
 
 
 def decorator_oss_db(exception, retries=3):
@@ -37,4 +27,16 @@ def decorator_oss_db(exception, retries=3):
     return decorator
 
 
+def check_pause(func):
+    @wraps(func)
+    def wrapper(self, *args, **kwargs):
+        while not self.pause.is_set():
+            try:
+                func(self, *args, **kwargs)
+            except Exception as e:
+                logger.info(f"{func.__name__}: " + str(e))
+            finally:
+                time.sleep(0.1)
+        logger.info(f"Pause detected: {func.__name__} thread closing...")
 
+    return wrapper
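Note: check_pause replaces the old commented-out check_redis_status decorator. It keeps calling the wrapped method in a loop until the owning object's pause event is set, logging exceptions instead of propagating them. A minimal usage sketch, assuming the package is installed; the Worker class here is hypothetical (the real callers are the scheduler, distributor and uploader threads):

import threading
from cobweb.utils import check_pause  # exported as shown in the scheduler's imports

class Worker:
    def __init__(self) -> None:
        # The decorator expects a `pause` threading.Event on the instance.
        self.pause = threading.Event()

    @check_pause
    def poll(self) -> None:
        # Runs repeatedly (with a 0.1 s sleep between calls) until pause is set;
        # any exception is logged and the loop keeps going.
        print("polling once")

worker = Worker()
threading.Thread(target=worker.poll, daemon=True).start()
# ... later, stop the loop:
worker.pause.set()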
cobweb/utils/dotting.py
ADDED
@@ -0,0 +1,300 @@
+import os
+import json
+import time
+from threading import Event
+from requests import RequestException, Response as requests_Response
+
+from cobweb.base import Queue, Request, Seed, Response, BaseItem, logger
+from aliyun.log import LogClient, LogItem, PutLogsRequest
+
+
+class LoghubDot:
+
+    def __init__(self, stop: Event, project: str, task: str) -> None:
+        self._stop = stop
+        self._queue = Queue()
+        self._client = LogClient(
+            endpoint=os.getenv("LOGHUB_ENDPOINT"),
+            accessKeyId=os.getenv("LOGHUB_ACCESS_KEY"),
+            accessKey=os.getenv("LOGHUB_SECRET_KEY")
+        )
+        self.project = project
+        self.task = task
+
+    def logging(self, topic, msg):
+        log_item = LogItem()
+        log_data = {
+            "stage": topic,
+            "message": msg,
+            "project": self.project,
+            "task": self.task,
+        }
+
+        for key, value in log_data.items():
+            if not isinstance(value, str):
+                log_data[key] = json.dumps(value, ensure_ascii=False)
+            else:
+                log_data[key] = value
+
+        contents = sorted(log_data.items())
+        log_item.set_contents(contents)
+        self._queue.push(log_item)
+
+    def _build_request_log(self, request_item: Request):
+        log_item = LogItem()
+
+        seed: Seed = request_item.seed
+        get_time = seed.params.get_time
+        start_time = seed.params.start_time
+        request_time = seed.params.request_time
+        stage_cost = request_time - start_time
+        cost = request_time - start_time
+
+        request_settings = json.dumps(
+            request_item.request_settings,
+            ensure_ascii=False, separators=(',', ':')
+        )
+
+        log_data = {
+            "stage": "request",
+            "project": self.project,
+            "task": self.task,
+            "seed": seed.to_string,
+            "request": repr(request_item),
+            "request_settings": request_settings,
+            "get_time": get_time,
+            "start_time": start_time,
+            "stage_cost": stage_cost,
+            "cost": cost,
+            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(request_time)),
+        }
+
+        for key, value in log_data.items():
+            if not isinstance(value, str):
+                log_data[key] = json.dumps(value, ensure_ascii=False)
+            else:
+                log_data[key] = value
+
+        contents = sorted(log_data.items())
+        log_item.set_contents(contents)
+        self._queue.push(log_item)
+
+    def _build_download_log(self, response_item: Response):
+        """
+        Build the log item for the download stage.
+
+        Args:
+            response_item: the response object
+        """
+        log_item = LogItem()
+
+        seed: Seed = response_item.seed
+        get_time = seed.params.get_time
+        start_time = seed.params.start_time
+        request_time = seed.params.request_time
+        download_time = seed.params.download_time
+        stage_cost = download_time - request_time
+        cost = download_time - start_time
+
+        log_data = {
+            "stage": "download",
+            "project": self.project,
+            "task": self.task,
+            "seed": seed.to_string,
+            "response": repr(response_item),
+            "get_time": get_time,
+            "start_time": start_time,
+            "request_time": request_time,
+            "download_time": download_time,
+            "stage_cost": stage_cost,
+            "cost": cost,
+            "proxy": seed.params.proxy or '-',
+            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(download_time)),
+        }
+
+        response = response_item.response
+        if isinstance(response, requests_Response):
+            log_data['request_info'] = {
+                'method': response.request.method,
+                'url': response.request.url,
+                'headers': dict(response.request.headers),
+                'body': response.request.body or "-",
+            }
+            log_data['response_info'] = {
+                "status_code": response.status_code,
+                "reason": response.reason,
+                "headers": dict(response.headers),
+                "content": response.text[:500],  # truncate content
+                "content_type": response.headers.get('content-type', '-'),
+                "content_length": response.headers.get('content-length', '-'),
+                "server": response.headers.get('server', '-'),
+                "date": response.headers.get('date', '-'),
+            }
+
+        for key, value in log_data.items():
+            if not isinstance(value, str):
+                log_data[key] = json.dumps(value, ensure_ascii=False)
+            else:
+                log_data[key] = value
+
+        contents = sorted(log_data.items())
+        log_item.set_contents(contents)
+        self._queue.push(log_item)
+
+    def _build_parse_log(self, parse_item: BaseItem):
+        log_item = LogItem()
+
+        seed: Seed = parse_item.seed
+        get_time = seed.params.get_time
+        start_time = seed.params.start_time
+        request_time = seed.params.request_time
+        response_time = seed.params.response_time
+        parse_time = seed.params.parse_time
+
+        pre_time = request_time or response_time
+        stage_cost = parse_time - pre_time
+        cost = parse_time - start_time
+
+        log_data = {
+            "stage": "parse",
+            "project": self.project,
+            "task": self.task,
+            "seed": seed.to_string,
+            "parse": repr(parse_item),
+            "get_time": get_time,
+            "start_time": start_time,
+            "parse_time": parse_time,
+            "stage_cost": stage_cost,
+            "cost": cost,
+            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(parse_time)),
+        }
+
+        for key, value in log_data.items():
+            if not isinstance(value, str):
+                log_data[key] = json.dumps(value, ensure_ascii=False)
+            else:
+                log_data[key] = value
+
+        contents = sorted(log_data.items())
+        log_item.set_contents(contents)
+        self._queue.push(log_item)
+
+    def _build_http_error_log(self, seed: Seed, e: RequestException):
+        log_item = LogItem()
+
+        status_code = getattr(e.response, 'status_code', '-')
+
+        request_info = {
+            'method': getattr(e.request, 'method', '-'),
+            'url': getattr(e.request, 'url', '-'),
+            'headers': dict(getattr(e.request, 'headers', {})),
+            'body': getattr(e.request, 'body', '-'),
+        }
+
+        response_info = {
+            'status_code': getattr(e.response, 'status_code', '-'),
+            'reason': getattr(e.response, 'reason', '-'),
+            'headers': dict(getattr(e.response, 'headers', {})),
+            'content': getattr(e.response, 'text', '')[:500],
+            'content_type': e.response.headers.get('content-type', '-') if e.response else '-',
+            'content_length': e.response.headers.get('content-length', '-') if e.response else '-',
+            'server': e.response.headers.get('server', '-') if e.response else '-',
+            'date': e.response.headers.get('date', '-') if e.response else '-',
+        }
+        retry = seed.params.retry
+        get_time = seed.params.get_time
+        start_time = seed.params.start_time
+        failed_time = seed.params.failed_time
+        cost = failed_time - start_time
+
+        log_data = {
+            "stage": "http_error",
+            "project": self.project,
+            "task": self.task,
+            "seed": seed.to_string,
+            "status_code": status_code,
+            "request_info": request_info,
+            "response_info": response_info,
+            "retry": retry,
+            "proxy": seed.params.proxy or '-',
+            "exception_type": type(e).__name__,
+            "exception_message": str(e),
+            "traceback": seed.params.traceback or '-',
+            "get_time": get_time,
+            "start_time": start_time,
+            "error_time": failed_time,
+            "stage_cost": cost,
+            "cost": cost,
+            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(failed_time)),
+        }
+
+        for key, value in log_data.items():
+            if not isinstance(value, str):
+                log_data[key] = json.dumps(value, ensure_ascii=False)
+            else:
+                log_data[key] = value
+
+        contents = sorted(log_data.items())
+        log_item.set_contents(contents)
+        self._queue.push(log_item)
+
+    def _build_exception_log(self, seed: Seed, e: Exception):
+        log_item = LogItem()
+
+        retry = seed.params.retry
+        get_time = seed.params.get_time
+        start_time = seed.params.start_time
+        failed_time = seed.params.failed_time
+        cost = failed_time - start_time
+
+        log_data = {
+            "stage": "exception",
+            "project": self.project,
+            "task": self.task,
+            "seed": seed.to_string,
+            "retry": retry,
+            "exception_type": type(e).__name__,
+            "exception_message": str(e),
+            "traceback": seed.params.traceback or '-',
+            "proxy": seed.params.proxy or '-',
+            "get_time": get_time,
+            "start_time": start_time,
+            "error_time": failed_time,
+            "stage_cost": cost,
+            "cost": cost,
+            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(failed_time)),
+        }
+
+        for key, value in log_data.items():
+            if not isinstance(value, str):
+                log_data[key] = json.dumps(value, ensure_ascii=False)
+            else:
+                log_data[key] = value
+
+        contents = sorted(log_data.items())
+        log_item.set_contents(contents)
+        self._queue.push(log_item)
+
+    def _build_run(self):
+        while not self._stop.is_set():
+            try:
+                items = []
+                start_time = int(time.time())
+
+                while len(items) < 1000:
+                    log_item = self._queue.pop()
+                    if not log_item or (int(time.time()) - start_time > 10):
+                        break
+                    items.append(log_item)
+
+                if items:
+                    request = PutLogsRequest(
+                        project="databee-download-log",
+                        logstore="log",
+                        topic="cobweb",
+                        logitems=items,
+                        compress=True
+                    )
+                    self._client.put_logs(request=request)
+            except Exception as e:
+                logger.info(str(e))
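Note: LoghubDot buffers LogItems on an internal queue, and the _build_run loop flushes them in batches of up to 1000 (or every ~10 s) to the hard-coded databee-download-log project on Aliyun SLS. A rough wiring sketch, assuming the LOGHUB_* environment variables are set; inside the package the launcher registers _build_run on its own worker thread, so driving it manually as below is illustration only:

import threading
from cobweb.utils.dotting import LoghubDot

stop = threading.Event()
dot = LoghubDot(stop=stop, project="my_project", task="my_task")  # names are placeholders

# Flush loop: pops queued LogItems and ships them via PutLogsRequest.
threading.Thread(target=dot._build_run, daemon=True).start()

# Queue a free-form dot; the _build_* helpers are called by cobweb's own components.
dot.logging("launcher", "task started")

# Shutting down: stop the flush loop (items still queued at this point are not sent).
stop.set()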