cobweb-launcher 0.1.24__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of cobweb-launcher might be problematic.
- cobweb/__init__.py +1 -9
- cobweb/base/__init__.py +9 -0
- cobweb/base/common_queue.py +30 -0
- cobweb/base/decorators.py +40 -0
- cobweb/base/item.py +39 -0
- cobweb/base/log.py +94 -0
- cobweb/base/request.py +72 -0
- cobweb/base/response.py +22 -0
- cobweb/base/seed.py +114 -0
- cobweb/constant.py +52 -15
- cobweb/crawlers/__init__.py +2 -0
- cobweb/crawlers/base_crawler.py +121 -0
- cobweb/crawlers/file_crawler.py +182 -0
- cobweb/db/__init__.py +1 -3
- cobweb/db/redis_db.py +123 -205
- cobweb/exceptions/__init__.py +1 -0
- cobweb/exceptions/oss_db_exception.py +28 -0
- cobweb/launchers/__init__.py +2 -0
- cobweb/launchers/launcher.py +199 -0
- cobweb/launchers/launcher_pro.py +174 -0
- cobweb/pipelines/__init__.py +2 -0
- cobweb/pipelines/base_pipeline.py +54 -0
- cobweb/pipelines/loghub_pipeline.py +34 -0
- cobweb/setting.py +55 -6
- cobweb/utils/__init__.py +3 -0
- cobweb/utils/oss.py +87 -0
- cobweb/utils/tools.py +42 -0
- {cobweb_launcher-0.1.24.dist-info → cobweb_launcher-1.0.1.dist-info}/METADATA +1 -1
- {cobweb_launcher-0.1.24.dist-info → cobweb_launcher-1.0.1.dist-info}/RECORD +32 -10
- {cobweb_launcher-0.1.24.dist-info → cobweb_launcher-1.0.1.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.1.24.dist-info → cobweb_launcher-1.0.1.dist-info}/WHEEL +0 -0
- {cobweb_launcher-0.1.24.dist-info → cobweb_launcher-1.0.1.dist-info}/top_level.txt +0 -0
cobweb/launchers/launcher.py ADDED
@@ -0,0 +1,199 @@
+import time
+
+import setting
+import inspect
+import threading
+import importlib
+
+from cobweb.base import Seed, Queue
+from cobweb.utils.tools import dynamic_load_class
+
+
+class Launcher(threading.Thread):
+
+    SEEDS = []
+
+    __DOING__ = {}
+
+    __CUSTOM_FUNC__ = {
+        "download": None,
+        "download_midware": None,
+        "parse": None,
+    }
+
+    __LAUNCHER_QUEUE__ = {
+        "new": Queue(),
+        "todo": Queue(),
+        "done": Queue(),
+    }
+
+    __LAUNCHER_FUNC__ = [
+        "_reset",
+        "_scheduler",
+        "_insert",
+        "_refresh",
+        "_delete",
+    ]
+
+    def __init__(self, task, project, custom_setting=None):
+        super().__init__()
+        self.task = task
+        self.project = project
+
+        self._stop = threading.Event()  # stop event
+        self._pause = threading.Event()  # pause event
+
+        if custom_setting:
+            setting_ = dict()
+            if isinstance(custom_setting, dict):
+                setting_ = custom_setting
+            else:
+                if isinstance(custom_setting, str):
+                    custom_setting = importlib.import_module(custom_setting)
+                if not inspect.ismodule(custom_setting):
+                    raise Exception
+                for k, v in custom_setting.__dict__.items():
+                    if not k.startswith("__") and not inspect.ismodule(v):
+                        setting_[k] = v
+            for k, v in setting_.items():
+                setattr(setting, k, v)
+
+        self._Crawler = dynamic_load_class(setting.CRAWLER)
+        self._Pipeline = dynamic_load_class(setting.PIPELINE)
+
+        self._scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
+        self._todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
+        self._new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
+        self._done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
+        self._upload_queue_wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
+        self._seed_reset_seconds = setting.SEED_RESET_SECONDS
+
+        self._todo_queue_size = setting.TODO_QUEUE_SIZE
+        self._new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
+        self._done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
+        self._upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE
+
+        self._done_model = setting.DONE_MODEL
+
+        self._upload_queue = Queue()
+
+    @property
+    def start_seeds(self):
+        return [Seed(seed) for seed in self.SEEDS]
+
+    @property
+    def request(self):
+        """
+        Custom request function.
+        use case:
+            from cobweb.base import Request, BaseItem
+            @launcher.request
+            def request(seed: Seed) -> Union[Request, BaseItem]:
+                ...
+                return Request(seed.url, seed)
+        """
+        def decorator(func):
+            self.__CUSTOM_FUNC__["request"] = func
+        return decorator
+
+    @property
+    def download(self):
+        """
+        Custom download function.
+        use case:
+            from cobweb.base import Request, Response, Seed, BaseItem
+            @launcher.download
+            def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+                ...
+                yield Response(item.seed, response)
+        """
+        def decorator(func):
+            self.__CUSTOM_FUNC__["download"] = func
+        return decorator
+
+    @property
+    def parse(self):
+        """
+        Custom parse function; xxxItem is a user-defined storage item type.
+        use case:
+            from cobweb.base import Request, Response
+            @launcher.parse
+            def parse(item: Response) -> BaseItem:
+                ...
+                yield xxxItem(seed, **kwargs)
+        """
+        def decorator(func):
+            self.__CUSTOM_FUNC__["parse"] = func
+        return decorator
+
+    def _remove_doing_seeds(self, seeds):
+        for seed in seeds:
+            self.__DOING__.pop(seed, None)
+
+    def _execute_heartbeat(self):
+        pass
+
+    def _reset(self):
+        pass
+
+    def _scheduler(self):
+        pass
+
+    def _insert(self):
+        pass
+
+    def _refresh(self):
+        pass
+
+    def _delete(self):
+        pass
+
+    def _execute(self):
+        for func_name in self.__LAUNCHER_FUNC__:
+            threading.Thread(name=func_name, target=getattr(self, func_name)).start()
+            time.sleep(2)
+
+    def _polling(self):
+
+        check_empty_times = 0
+
+        while not self._stop.is_set():
+
+            queue_not_empty_count = 0
+
+            for q in self.__LAUNCHER_QUEUE__.values():
+                if q.length != 0:
+                    queue_not_empty_count += 1
+
+            if self._pause.is_set() and queue_not_empty_count != 0:
+                self._pause.clear()
+                self._execute()
+
+            elif queue_not_empty_count == 0:
+                check_empty_times += 1
+            else:
+                check_empty_times = 0
+
+            if check_empty_times > 2:
+                check_empty_times = 0
+                self.__DOING__ = {}
+                self._pause.set()
+
+    def run(self):
+        threading.Thread(target=self._execute_heartbeat).start()
+
+        self._Crawler(
+            upload_queue=self._upload_queue,
+            custom_func=self.__CUSTOM_FUNC__,
+            launcher_queue=self.__LAUNCHER_QUEUE__,
+        ).start()
+
+        self._Pipeline(
+            upload_queue=self._upload_queue,
+            done_queue=self.__LAUNCHER_QUEUE__["done"],
+            upload_queue_size=self._upload_queue_max_size,
+            upload_wait_seconds=self._upload_queue_wait_seconds
+        ).start()
+
+        self._execute()
+        self._polling()
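The three registration properties above (`request`, `download`, `parse`) wire user callbacks into `__CUSTOM_FUNC__` instead of requiring a subclass. A minimal sketch of that wiring, assuming a concrete launcher such as `LauncherPro` below and a reachable Redis instance; the task/project names and seed URL are hypothetical:

from cobweb.base import Request, Response
from cobweb.launchers import LauncherPro

app = LauncherPro(task="demo_task", project="demo_project")
app.SEEDS = ["https://example.com"]  # hypothetical start seed, consumed by _scheduler

@app.download
def download(item: Request):
    # Per the docstring above: yield Seed, BaseItem, Response, or str.
    ...

@app.parse
def parse(item: Response):
    # Per the docstring above: yield a user-defined xxxItem(seed, **kwargs).
    ...

app.start()  # Launcher is a threading.Thread; run() boots the crawler, pipeline, and polling loop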
cobweb/launchers/launcher_pro.py ADDED
@@ -0,0 +1,174 @@
+import time
+import threading
+
+from cobweb.db import RedisDB
+from cobweb.base import Seed, logger
+from cobweb.launchers import Launcher
+from cobweb.constant import DealModel, LogTemplate
+
+
+class LauncherPro(Launcher):
+
+    def __init__(self, task, project, custom_setting=None):
+        super().__init__(task, project, custom_setting)
+        self._todo = "{%s:%s}:todo" % (project, task)
+        self._done = "{%s:%s}:done" % (project, task)
+        self._fail = "{%s:%s}:fail" % (project, task)
+        self._heartbeat = "heartbeat:%s_%s" % (project, task)
+        self._reset_lock = "lock:reset:%s_%s" % (project, task)
+        self._heartbeat_lock = "lock:heartbeat:%s_%s" % (project, task)
+        self._db = RedisDB()
+
+        self._heartbeat_start_event = threading.Event()
+        self._redis_queue_empty_event = threading.Event()
+
+    @property
+    def heartbeat(self):
+        return self._db.exists(self._heartbeat)
+
+    def _execute_heartbeat(self):
+        while not self._stop.is_set():
+            if self._heartbeat_start_event.is_set():
+                if self._db.lock(self._heartbeat_lock, t=1):
+                    self._db.setex(self._heartbeat, 3)
+                    time.sleep(1)
+            time.sleep(0.5)
+
+    def _reset(self):
+        """
+        Check for expired seeds and put them back into the Redis cache.
+        """
+        first = True
+        while not self._pause.is_set():
+            reset_wait_seconds = 15
+            if self._db.lock(self._reset_lock, t=120):
+                if not self.heartbeat:
+                    self._heartbeat_start_event.set()
+
+                _min = -int(time.time()) + self._seed_reset_seconds \
+                    if self.heartbeat or not first else "-inf"
+
+                self._db.members(
+                    self._todo, 0,
+                    _min=_min, _max="(0"
+                )
+                self._db.delete(self._reset_lock)
+                reset_wait_seconds = 60
+
+            time.sleep(reset_wait_seconds)
+            first = False
+
+    def _scheduler(self):
+        """
+        Schedule tasks: fetch seeds from the Redis queue and record them in the doing dict.
+        """
+        if self.start_seeds:
+            self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
+        while not self._pause.is_set():
+            if not self._db.zcount(self._todo, 0, "(1000"):
+                time.sleep(self._scheduler_wait_seconds)
+                continue
+            if self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
+                time.sleep(self._todo_queue_full_wait_seconds)
+                continue
+            members = self._db.members(
+                self._todo, int(time.time()),
+                count=self._todo_queue_size,
+                _min=0, _max="(1000"
+            )
+            for member, priority in members:
+                seed = Seed(member, priority=priority)
+                self.__LAUNCHER_QUEUE__['todo'].push(seed)
+                self.__DOING__[seed.to_string] = seed.params.priority
+
+    def _insert(self):
+        """
+        Push new seeds into the Redis queue.
+        """
+        while not self._pause.is_set():
+            seeds = {}
+            status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
+            for _ in range(self._new_queue_max_size):
+                seed = self.__LAUNCHER_QUEUE__['new'].pop()
+                if not seed:
+                    break
+                seeds[seed.to_string] = seed.params.priority
+            if seeds:
+                self._db.zadd(self._todo, seeds, nx=True)
+            if status:
+                time.sleep(self._new_queue_wait_seconds)
+
+    def _refresh(self):
+        """
+        Refresh the expiry of in-flight (doing) seeds so _reset does not re-consume them.
+        """
+        while not self._pause.is_set():
+            if self.__DOING__:
+                refresh_time = int(time.time())
+                seeds = {k: -refresh_time - v / 1000 for k, v in self.__DOING__.items()}
+                self._db.zadd(self._todo, item=seeds, xx=True)
+            time.sleep(30)
+
+    def _delete(self):
+        """
+        Remove finished seeds from the queue, route them to the done or fail
+        queue by status, and drop their entries from the doing dict.
+        """
+        while not self._pause.is_set():
+            seeds, s_seeds, f_seeds = [], [], []
+            status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
+
+            for _ in range(self._done_queue_max_size):
+                seed = self.__LAUNCHER_QUEUE__['done'].pop()
+                if not seed:
+                    break
+                if seed.params.identifier == DealModel.fail:
+                    f_seeds.append(seed.to_string)
+                elif self._done_model == 1:
+                    s_seeds.append(seed.to_string)
+                else:
+                    seeds.append(seed.to_string)
+            if seeds:
+                self._db.zrem(self._todo, *seeds)
+            if s_seeds:
+                self._db.done([self._todo, self._done], *s_seeds)
+            if f_seeds:
+                self._db.done([self._todo, self._fail], *f_seeds)
+
+            self._remove_doing_seeds(seeds)
+
+            if status:
+                time.sleep(self._done_queue_wait_seconds)
+
+    def _polling(self):
+        check_empty_times = 0
+        while not self._stop.is_set():
+            queue_not_empty_count = 0
+            polling_wait_seconds = 30
+            if not self._db.zcard(self._todo):
+                for q in self.__LAUNCHER_QUEUE__.values():
+                    if q.length != 0:
+                        queue_not_empty_count += 1
+            if self._pause.is_set() and queue_not_empty_count != 0:
+                self._pause.clear()
+                self._execute()
+            elif queue_not_empty_count == 0:
+                polling_wait_seconds = 3
+                check_empty_times += 1
+            else:
+                check_empty_times = 0
+            if check_empty_times > 2:
+                check_empty_times = 0
+                self.__DOING__ = {}
+                self._pause.set()
+            if not self._pause.is_set():
+                logger.info(LogTemplate.launcher_pro_polling.format(
+                    task=self.task,
+                    doing_len=len(self.__DOING__.keys()),
+                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
+                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
+                    redis_seed_count=self._db.zcount(self._todo, "-inf", "+inf"),
+                    redis_todo_len=self._db.zcount(self._todo, 0, "(1000"),
+                    redis_doing_len=self._db.zcount(self._todo, "-inf", "(0"),
+                    upload_len=self._upload_queue.length
+                ))
+            time.sleep(polling_wait_seconds)
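One non-obvious detail above is the sorted-set score convention shared by `_scheduler`, `_refresh`, and `_reset`: pending seeds carry scores in [0, 1000) (their priority), while in-flight seeds are rewritten to a negative score encoding both the last refresh time and the priority. This is my reading of the code, not documented behavior; a small sketch of the arithmetic:

import time

SEED_RESET_SECONDS = 600  # default from cobweb/setting.py

def doing_score(priority: float) -> float:
    # Score written by _refresh for an in-flight seed. Negative, so it
    # falls outside the [0, 1000) window that _scheduler polls.
    return -int(time.time()) - priority / 1000

def reset_floor() -> float:
    # _reset re-enqueues members with score in [reset_floor(), 0), i.e.
    # seeds whose last refresh is older than SEED_RESET_SECONDS.
    return -int(time.time()) + SEED_RESET_SECONDS

score = doing_score(priority=500)
# The score stays fixed until the next _refresh pass, so as wall-clock
# time advances, score >= reset_floor() eventually holds and the seed
# is treated as expired and handed out again.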
cobweb/pipelines/base_pipeline.py ADDED
@@ -0,0 +1,54 @@
+import time
+import threading
+
+from abc import ABC, abstractmethod
+from cobweb.base import BaseItem, Queue, logger
+
+
+class Pipeline(threading.Thread, ABC):
+
+    def __init__(
+        self,
+        done_queue: Queue,
+        upload_queue: Queue,
+        upload_queue_size: int,
+        upload_wait_seconds: int
+    ):
+        super().__init__()
+        self.done_queue = done_queue
+        self.upload_queue = upload_queue
+        self.upload_queue_size = upload_queue_size
+        self.upload_wait_seconds = upload_wait_seconds
+
+    @abstractmethod
+    def build(self, item: BaseItem) -> dict:
+        pass
+
+    @abstractmethod
+    def upload(self, table: str, data: list) -> bool:
+        pass
+
+    def run(self):
+        while True:
+            status = self.upload_queue.length < self.upload_queue_size
+            if status:
+                time.sleep(self.upload_wait_seconds)
+            data_info, seeds = {}, []
+            for _ in range(self.upload_queue_size):
+                item = self.upload_queue.pop()
+                if not item:
+                    break
+                data = self.build(item)
+                seeds.append(item.seed)
+                data_info.setdefault(item.table, []).append(data)
+            for table, datas in data_info.items():
+                try:
+                    self.upload(table, datas)
+                    status = True
+                except Exception as e:
+                    logger.info(e)
+                    status = False
+            if status:
+                self.done_queue.push(seeds)
+
+
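Because `Pipeline` leaves only `build` and `upload` abstract, a custom sink is a small subclass. A minimal sketch (a hypothetical console sink, not part of the package):

from cobweb.base import BaseItem
from cobweb.pipelines import Pipeline


class ConsolePipeline(Pipeline):
    """Hypothetical sink that prints batches instead of uploading them."""

    def build(self, item: BaseItem) -> dict:
        # Flatten the item into the per-row dict handed to upload().
        return item.to_dict

    def upload(self, table: str, data: list) -> bool:
        print(f"{table}: {len(data)} rows")
        return True

Such a class would be selected through the `PIPELINE` setting, e.g. `PIPELINE = "my_pkg.pipelines.ConsolePipeline"`, and loaded by `dynamic_load_class` at launcher start-up.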
cobweb/pipelines/loghub_pipeline.py ADDED
@@ -0,0 +1,34 @@
+import json
+import setting
+
+from cobweb.base import BaseItem
+from cobweb.pipelines import Pipeline
+from aliyun.log import LogClient, LogItem, PutLogsRequest
+
+
+class LoghubPipeline(Pipeline):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.client = LogClient(**setting.LOGHUB_CONFIG)
+
+    def build(self, item: BaseItem):
+        log_item = LogItem()
+        temp = item.to_dict
+        for key, value in temp.items():
+            if not isinstance(value, str):
+                temp[key] = json.dumps(value, ensure_ascii=False)
+        contents = sorted(temp.items())
+        log_item.set_contents(contents)
+        return log_item
+
+    def upload(self, table, datas):
+        request = PutLogsRequest(
+            project=setting.LOGHUB_PROJECT,
+            logstore=table,
+            topic=setting.LOGHUB_TOPIC,
+            source=setting.LOGHUB_SOURCE,
+            logitems=datas,
+            compress=True
+        )
+        self.client.put_logs(request=request)
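`LoghubPipeline` takes no connection arguments of its own: everything comes from `setting.LOGHUB_*`, which `cobweb/setting.py` (below) reads from environment variables at import time. A sketch of the required variables with placeholder values; note they must be set before `setting` is first imported:

import os

# Placeholder credentials; consumed via setting.LOGHUB_CONFIG and the
# LOGHUB_PROJECT / LOGHUB_TOPIC / LOGHUB_SOURCE constants.
os.environ["LOGHUB_ENDPOINT"] = "cn-hangzhou.log.aliyuncs.com"
os.environ["LOGHUB_ACCESS_KEY"] = "<access-key-id>"
os.environ["LOGHUB_SECRET_KEY"] = "<access-key-secret>"
os.environ["LOGHUB_PROJECT"] = "my-log-project"
os.environ["LOGHUB_TOPIC"] = "crawler"
os.environ["LOGHUB_SOURCE"] = "cobweb"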
cobweb/setting.py CHANGED
@@ -1,13 +1,62 @@
 import os
 
+# redis db config
+REDIS_CONFIG = {
+    "host": os.getenv("REDIS_HOST"),
+    "password": os.getenv("REDIS_PASSWORD"),
+    "port": int(os.getenv("REDIS_PORT", 6379)),
+    "db": int(os.getenv("REDIS_DB", 0)),
+}
 
-#
-
+# loghub db config
+LOGHUB_TOPIC = os.getenv("LOGHUB_TOPIC")
+LOGHUB_SOURCE = os.getenv("LOGHUB_SOURCE")
+LOGHUB_PROJECT = os.getenv("LOGHUB_PROJECT")
+LOGHUB_CONFIG = {
+    "endpoint": os.getenv("LOGHUB_ENDPOINT"),
+    "accessKeyId": os.getenv("LOGHUB_ACCESS_KEY"),
+    "accessKey": os.getenv("LOGHUB_SECRET_KEY")
+}
 
-#
-
+# oss util config
+OSS_BUCKET = os.getenv("OSS_BUCKET")
+OSS_ENDPOINT = os.getenv("OSS_ENDPOINT")
+OSS_ACCESS_KEY = os.getenv("OSS_ACCESS_KEY")
+OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
+OSS_MIN_UPLOAD_SIZE = 1024 * 100
+OSS_CHUNK_SIZE = 1024 ** 2
 
-#
-
+# crawler selection
+CRAWLER = "cobweb.crawlers.CrawlerAir"
 
+# data upload pipeline
+PIPELINE = "cobweb.pipelines.loghub_pipeline.LoghubPipeline"
 
+
+# Launcher wait times
+SCHEDULER_WAIT_SECONDS = 15  # scheduler wait time
+TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait time when the todo queue is full
+NEW_QUEUE_WAIT_SECONDS = 30  # new queue wait time
+DONE_QUEUE_WAIT_SECONDS = 15  # done queue wait time
+UPLOAD_QUEUE_WAIT_SECONDS = 15  # upload queue wait time
+SEED_RESET_SECONDS = 600  # seed reset interval
+
+
+# Launcher queue sizes
+TODO_QUEUE_SIZE = 100  # todo queue size
+NEW_QUEUE_MAX_SIZE = 100  # new queue max size
+DONE_QUEUE_MAX_SIZE = 100  # done queue max size
+UPLOAD_QUEUE_MAX_SIZE = 100  # upload queue max size
+
+# DONE_MODEL in (0, 1): seed completion mode
+DONE_MODEL = 0  # 0: remove seeds from the queue on success, move failures to the fail queue; 1: move successes to the done queue and failures to the fail queue
+
+# DOWNLOAD_MODEL in (0, 1): download mode
+DOWNLOAD_MODEL = 0  # 0: generic download; 1: file download
+
+# spider
+SPIDER_THREAD_NUM = 10
+SPIDER_MAX_RETRIES = 5
+
+# response content types filtered out during file download
+FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
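These module-level constants are only defaults: `Launcher.__init__` (above) overlays anything passed as `custom_setting`, which may be a dict, a module, or an importable module path. A sketch of the dict form, with hypothetical values:

from cobweb.launchers import LauncherPro

custom_setting = {
    "REDIS_CONFIG": {"host": "127.0.0.1", "password": None, "port": 6379, "db": 0},
    "TODO_QUEUE_SIZE": 200,  # overrides the default of 100
    "DONE_MODEL": 1,         # keep successful seeds in the done queue
}

# Launcher.__init__ runs setattr(setting, k, v) for each key, so the
# overrides take effect before CRAWLER/PIPELINE are dynamically loaded.
app = LauncherPro(task="demo_task", project="demo_project", custom_setting=custom_setting)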
cobweb/utils/__init__.py ADDED
cobweb/utils/oss.py ADDED
@@ -0,0 +1,87 @@
+import setting
+
+from requests import Response
+from oss2 import Auth, Bucket, models
+from cobweb.exceptions import oss_db_exception
+from cobweb.base.decorators import decorator_oss_db
+
+
+class OssUtil:
+
+    def __init__(
+        self,
+        bucket=None,
+        endpoint=None,
+        access_key=None,
+        secret_key=None,
+        chunk_size=None,
+        min_upload_size=None,
+    ):
+        self.bucket = bucket or setting.OSS_BUCKET
+        self.endpoint = endpoint or setting.OSS_ENDPOINT
+        self.chunk_size = int(chunk_size or setting.OSS_CHUNK_SIZE)
+        self.min_upload_size = int(min_upload_size or setting.OSS_MIN_UPLOAD_SIZE)
+        self._auth = Auth(
+            access_key_id=access_key or setting.OSS_ACCESS_KEY,
+            access_key_secret=secret_key or setting.OSS_SECRET_KEY
+        )
+        self._client = Bucket(
+            auth=self._auth,
+            endpoint=self.endpoint,
+            bucket_name=self.bucket,
+        )
+
+    def exists(self, key: str) -> bool:
+        return self._client.object_exists(key)
+
+    def head(self, key: str) -> models.HeadObjectResult:
+        return self._client.head_object(key)
+
+    @decorator_oss_db(exception=oss_db_exception.OssDBInitPartError)
+    def init_part(self, key) -> models.InitMultipartUploadResult:
+        """Initialize a multipart upload"""
+        return self._client.init_multipart_upload(key)
+
+    @decorator_oss_db(exception=oss_db_exception.OssDBPutObjError)
+    def put(self, key, data) -> models.PutObjectResult:
+        """Upload an object"""
+        return self._client.put_object(key, data)
+
+    @decorator_oss_db(exception=oss_db_exception.OssDBPutPartError)
+    def put_part(self, key, upload_id, position, data) -> models.PutObjectResult:
+        """Upload a single part"""
+        return self._client.upload_part(key, upload_id, position, data)
+
+    @decorator_oss_db(exception=oss_db_exception.OssDBMergeError)
+    def merge(self, key, upload_id, parts=None) -> models.PutObjectResult:
+        """Complete a multipart upload"""
+        headers = None if parts else {"x-oss-complete-all": "yes"}
+        return self._client.complete_multipart_upload(key, upload_id, parts, headers=headers)
+
+    @decorator_oss_db(exception=oss_db_exception.OssDBAppendObjError)
+    def append(self, key, position, data) -> models.AppendObjectResult:
+        """Append upload"""
+        return self._client.append_object(key, position, data)
+
+    def iter_data(self, data, chunk_size=None):
+        chunk_size = chunk_size or self.chunk_size
+        if isinstance(data, Response):
+            for part_data in data.iter_content(chunk_size):
+                yield part_data
+        if isinstance(data, bytes):
+            for i in range(0, len(data), chunk_size):
+                yield data[i:i + chunk_size]
+
+    def assemble(self, ready_data, data, chunk_size=None):
+        upload_data = b""
+        ready_data = ready_data + data
+        chunk_size = chunk_size or self.chunk_size
+        if len(ready_data) >= chunk_size:
+            upload_data = ready_data[:chunk_size]
+            ready_data = ready_data[chunk_size:]
+        return ready_data, upload_data
+
+    def content_length(self, key: str) -> int:
+        head = self.head(key)
+        return head.content_length
+
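Taken together, `init_part`/`put_part`/`merge` plus `iter_data` and `assemble` support streaming a download into OSS in `chunk_size` pieces. A minimal sketch of that flow, assuming valid `OSS_*` settings and that `decorator_oss_db` passes each result through; the URL and object key are hypothetical:

import requests
from cobweb.utils.oss import OssUtil

oss = OssUtil()  # credentials come from setting.OSS_* (environment variables)
key = "demo_site/example.pdf"  # hypothetical object key

resp = requests.get("https://example.com/big.pdf", stream=True)
upload_id = oss.init_part(key).upload_id

ready, part_number = b"", 0
for chunk in oss.iter_data(resp):
    # assemble() buffers until a full chunk_size piece is available.
    ready, upload_data = oss.assemble(ready, chunk)
    if upload_data:
        part_number += 1
        oss.put_part(key, upload_id, part_number, upload_data)
if ready:  # tail shorter than chunk_size
    part_number += 1
    oss.put_part(key, upload_id, part_number, ready)

# parts=None makes merge() send "x-oss-complete-all: yes", letting OSS
# assemble the object from all parts it has received.
oss.merge(key, upload_id)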
cobweb/utils/tools.py ADDED
@@ -0,0 +1,42 @@
+import re
+import hashlib
+from typing import Union
+from importlib import import_module
+
+
+def md5(text: Union[str, bytes]) -> str:
+    if isinstance(text, str):
+        text = text.encode('utf-8')
+    return hashlib.md5(text).hexdigest()
+
+
+def build_path(site, url, file_type):
+    return f"{site}/{md5(url)}.{file_type}"
+
+
+def format_size(content_length: int) -> str:
+    units = ["KB", "MB", "GB", "TB"]
+    for i in range(4):
+        num = content_length / (1024 ** (i + 1))
+        if num < 1024:
+            return f"{round(num, 2)} {units[i]}"
+
+
+def dynamic_load_class(model_info):
+    if isinstance(model_info, str):
+        if "import" in model_info:
+            model_path, class_name = re.search(
+                r"from (.*?) import (.*?)$", model_info
+            ).groups()
+            model = import_module(model_path)
+            class_object = getattr(model, class_name)
+        else:
+            model_path, class_name = model_info.rsplit(".", 1)
+            model = import_module(model_path)
+            class_object = getattr(model, class_name)
+        return class_object
+    raise TypeError()
+
+
+def download_log_info(item: dict) -> str:
+    return "\n".join([" " * 12 + f"{k.ljust(14)}: {v}" for k, v in item.items()])