cobweb-launcher 1.0.5__py3-none-any.whl → 3.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/__init__.py +5 -1
- cobweb/base/__init__.py +3 -3
- cobweb/base/common_queue.py +37 -16
- cobweb/base/item.py +40 -14
- cobweb/base/{log.py → logger.py} +3 -3
- cobweb/base/request.py +744 -47
- cobweb/base/response.py +381 -13
- cobweb/base/seed.py +98 -50
- cobweb/base/task_queue.py +180 -0
- cobweb/base/test.py +257 -0
- cobweb/constant.py +39 -2
- cobweb/crawlers/__init__.py +1 -2
- cobweb/crawlers/crawler.py +27 -0
- cobweb/db/__init__.py +1 -0
- cobweb/db/api_db.py +83 -0
- cobweb/db/redis_db.py +118 -27
- cobweb/launchers/__init__.py +3 -1
- cobweb/launchers/distributor.py +141 -0
- cobweb/launchers/launcher.py +103 -130
- cobweb/launchers/uploader.py +68 -0
- cobweb/log_dots/__init__.py +2 -0
- cobweb/log_dots/dot.py +258 -0
- cobweb/log_dots/loghub_dot.py +53 -0
- cobweb/pipelines/__init__.py +3 -2
- cobweb/pipelines/pipeline.py +19 -0
- cobweb/pipelines/pipeline_csv.py +25 -0
- cobweb/pipelines/pipeline_loghub.py +54 -0
- cobweb/schedulers/__init__.py +1 -0
- cobweb/schedulers/scheduler.py +66 -0
- cobweb/schedulers/scheduler_with_redis.py +189 -0
- cobweb/setting.py +37 -38
- cobweb/utils/__init__.py +5 -2
- cobweb/utils/bloom.py +58 -0
- cobweb/{base → utils}/decorators.py +14 -12
- cobweb/utils/dotting.py +300 -0
- cobweb/utils/oss.py +113 -86
- cobweb/utils/tools.py +3 -15
- cobweb_launcher-3.2.18.dist-info/METADATA +193 -0
- cobweb_launcher-3.2.18.dist-info/RECORD +44 -0
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/WHEEL +1 -1
- cobweb/crawlers/base_crawler.py +0 -121
- cobweb/crawlers/file_crawler.py +0 -181
- cobweb/launchers/launcher_pro.py +0 -174
- cobweb/pipelines/base_pipeline.py +0 -54
- cobweb/pipelines/loghub_pipeline.py +0 -34
- cobweb_launcher-1.0.5.dist-info/METADATA +0 -48
- cobweb_launcher-1.0.5.dist-info/RECORD +0 -32
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/top_level.txt +0 -0
cobweb/launchers/uploader.py
ADDED
@@ -0,0 +1,68 @@
+import time
+import threading
+from typing import Callable, Type
+from cobweb.pipelines import Pipeline
+from cobweb.base import TaskQueue, logger, Status
+from cobweb.utils import check_pause
+
+
+class Uploader(threading.Thread):
+
+    def __init__(
+        self,
+        task: str,
+        project: str,
+        stop: threading.Event,
+        pause: threading.Event,
+        task_queue: TaskQueue,
+        callback_register: Callable,
+        SpiderPipeline: Type[Pipeline]
+    ):
+        super().__init__()
+        self.task = task
+        self.project = project
+
+        self.stop = stop
+        self.pause = pause
+
+        self.task_queue = task_queue
+        self.callback_register = callback_register
+
+        from cobweb import setting
+
+        self.upload_size = setting.UPLOAD_QUEUE_MAX_SIZE
+        self.wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
+
+        self.pipeline = SpiderPipeline(task=self.task, project=self.project)
+
+        logger.debug(f"Uploader instance attrs: {self.__dict__}")
+
+    @check_pause
+    def upload_data(self):
+        try:
+            data_info, task_ids = dict(), set()
+            if task_list := self.task_queue.get_task_by_status(
+                status=Status.UPLOAD, limit=self.upload_size
+            ):
+                for task_item in task_list:
+                    upload_data = self.pipeline.build(task_item.data)
+                    data_info.setdefault(task_item.data.table, []).append(upload_data)
+                    task_ids.add(task_item.task_id)
+
+                for table, datas in data_info.items():
+                    try:
+                        self.pipeline.upload(table, datas)
+                    except Exception as e:
+                        logger.info(e)
+
+                self.task_queue.remove(task_ids)
+        except Exception as e:
+            logger.info(e)
+
+        if self.task_queue.status_length(status=Status.UPLOAD) < self.upload_size:
+            time.sleep(self.wait_seconds)
+
+    def run(self):
+        self.callback_register(self.upload_data, tag="Uploader")
+
+
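
The Uploader thread drains tasks in UPLOAD status from the shared TaskQueue, groups them by target table, pushes each batch through the configured Pipeline, and removes the finished task ids. A minimal wiring sketch, assuming TaskQueue takes no constructor arguments, that Uploader is re-exported from cobweb.launchers, and that callback_register repeatedly invokes the registered function until stop is set (in the real package the launcher supplies this registry):

    import threading

    from cobweb.base import TaskQueue
    from cobweb.launchers import Uploader   # assumption: re-exported in launchers/__init__
    from cobweb.pipelines import CSV

    stop, pause = threading.Event(), threading.Event()
    task_queue = TaskQueue()                # assumption: no-arg constructor

    def callback_register(func, tag=""):
        # stand-in for the launcher's registry: call the worker in a loop until stop is set
        def loop():
            while not stop.is_set():
                func()
        threading.Thread(target=loop, name=tag, daemon=True).start()

    uploader = Uploader(
        task="demo_task", project="demo_project",
        stop=stop, pause=pause,
        task_queue=task_queue,
        callback_register=callback_register,
        SpiderPipeline=CSV,
    )
    uploader.start()   # run() registers upload_data under the "Uploader" tag
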
cobweb/log_dots/dot.py
ADDED
@@ -0,0 +1,258 @@
+import json
+import os
+import time
+from threading import Event
+from requests import RequestException, Response as requests_Response
+from cobweb.base import Queue, Request, Seed, Response, BaseItem, logger
+
+
+class Dot:
+
+    def __init__(self, stop: Event, project: str, task: str) -> None:
+        self._stop = stop
+        self._queue = Queue()
+        self.project = project
+        self.task = task
+
+    def logging(self, topic, msg):
+        log_data = {
+            "stage": topic,
+            "message": msg,
+            "project": self.project,
+            "task": self.task,
+        }
+        self._queue.push(log_data)
+
+    def _build_request_log(self, request_item: Request):
+        seed: Seed = request_item.seed
+        get_time = seed.params.get_time
+        start_time = seed.params.start_time
+        request_time = seed.params.request_time
+        stage_cost = request_time - start_time if request_time and start_time else 0
+        cost = request_time - start_time if request_time and start_time else 0
+
+        instance_id = os.getenv("INSTANCE_ID")
+
+        request_settings = json.dumps(
+            request_item.request_settings,
+            ensure_ascii=False, separators=(',', ':')
+        )
+
+        log_data = {
+            "stage": "request",
+            "project": self.project,
+            "task": self.task,
+            "seed": seed.to_string,
+            "request": repr(request_item),
+            "instance_id": instance_id,
+            "request_settings": request_settings,
+            "get_time": get_time,
+            "start_time": start_time,
+            "stage_cost": stage_cost,
+            "cost": cost,
+            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(request_time)),
+        }
+
+        self._queue.push(log_data)
+
+    def _build_download_log(self, response_item: Response):
+        seed: Seed = response_item.seed
+        get_time = seed.params.get_time
+        start_time = seed.params.start_time
+        request_time = seed.params.request_time
+        download_time = seed.params.download_time
+        stage_cost = download_time - request_time if request_time and download_time else 0
+        cost = download_time - start_time if download_time and start_time else 0
+
+        instance_id = os.getenv("INSTANCE_ID")
+
+        log_data = {
+            "stage": "download",
+            "project": self.project,
+            "task": self.task,
+            "seed": seed.to_string,
+            "response": repr(response_item),
+            "get_time": get_time,
+            "start_time": start_time,
+            "request_time": request_time,
+            "download_time": download_time,
+            "stage_cost": stage_cost,
+            "cost": cost,
+            "instance_id": instance_id,
+            "proxy": seed.params.proxy or '-',
+            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(download_time)),
+        }
+
+        if (response := response_item.response) and isinstance(response, requests_Response):
+            log_data['request_info'] = {
+                'method': response.request.method,
+                'url': response.request.url,
+                'headers': dict(response.request.headers),
+                'body': response.request.body or "-",
+            }
+            log_data['response_info'] = {
+                "status_code": response.status_code,
+                "reason": response.reason,
+                "headers": dict(response.headers),
+                # "content": response.text[:500],  # truncated content
+                "content_type": response.headers.get('content-type', '-'),
+                "content_length": response.headers.get('content-length', '-'),
+                "server": response.headers.get('server', '-'),
+                "date": response.headers.get('date', '-'),
+            }
+
+        self._queue.push(log_data)
+
+    def _build_parse_log(self, parse_item: BaseItem):
+        seed: Seed = parse_item.seed
+        get_time = seed.params.get_time
+        start_time = seed.params.start_time
+        request_time = seed.params.request_time
+        response_time = seed.params.response_time
+        parse_time = seed.params.parse_time
+        instance_id = os.getenv("INSTANCE_ID")
+
+        pre_time = request_time or response_time
+        stage_cost = parse_time - pre_time if parse_time and pre_time else 0
+        cost = parse_time - start_time if parse_time and start_time else 0
+
+        log_data = {
+            "stage": "parse",
+            "project": self.project,
+            "task": self.task,
+            "seed": seed.to_string,
+            "parse": repr(parse_item),
+            "get_time": get_time,
+            "start_time": start_time,
+            "instance_id": instance_id,
+            "parse_time": parse_time,
+            "stage_cost": stage_cost,
+            "cost": cost,
+            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(parse_time)),
+        }
+
+        self._queue.push(log_data)
+
+    def _build_http_error_log(self, seed: Seed, e: RequestException):
+        status_code = getattr(e.response, 'status_code', '-')
+        instance_id = os.getenv("INSTANCE_ID")
+        request_info = {
+            'method': getattr(e.request, 'method', '-'),
+            'url': getattr(e.request, 'url', '-'),
+            'headers': dict(getattr(e.request, 'headers', {})),
+            'body': getattr(e.request, 'body', '-'),
+        }
+
+        response_info = {
+            'status_code': getattr(e.response, 'status_code', '-'),
+            'reason': getattr(e.response, 'reason', '-'),
+            'headers': dict(getattr(e.response, 'headers', {})),
+            # 'content': getattr(e.response, 'text', '')[:500],
+            'content_type': e.response.headers.get('content-type', '-') if e.response else '-',
+            'content_length': e.response.headers.get('content-length', '-') if e.response else '-',
+            'server': e.response.headers.get('server', '-') if e.response else '-',
+            'date': e.response.headers.get('date', '-') if e.response else '-',
+        }
+        retry = seed.params.retry
+        get_time = seed.params.get_time
+        start_time = seed.params.start_time
+        failed_time = seed.params.failed_time
+        cost = failed_time - start_time if failed_time and start_time else 0
+
+        log_data = {
+            "stage": "http_error",
+            "project": self.project,
+            "task": self.task,
+            "seed": seed.to_string,
+            "status_code": status_code,
+            "request_info": request_info,
+            "response_info": response_info,
+            "instance_id": instance_id,
+            "retry": retry,
+            "proxy": seed.params.proxy or '-',
+            "exception_type": type(e).__name__,
+            "exception_message": str(e),
+            "traceback": seed.params.traceback or '-',
+            "get_time": get_time,
+            "start_time": start_time,
+            "error_time": failed_time,
+            "stage_cost": cost,
+            "cost": cost,
+            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(failed_time)),
+        }
+
+        self._queue.push(log_data)
+
+    def _build_exception_log(self, seed: Seed, e: Exception):
+        retry = seed.params.retry
+        get_time = seed.params.get_time
+        start_time = seed.params.start_time
+        failed_time = seed.params.failed_time
+        cost = failed_time - start_time if failed_time and start_time else 0
+        instance_id = os.getenv("INSTANCE_ID")
+        log_data = {
+            "stage": "exception",
+            "project": self.project,
+            "task": self.task,
+            "seed": seed.to_string,
+            "instance_id": instance_id,
+            "retry": retry,
+            "exception_type": type(e).__name__,
+            "exception_message": str(e),
+            "traceback": seed.params.traceback or '-',
+            "proxy": seed.params.proxy or '-',
+            "get_time": get_time,
+            "start_time": start_time,
+            "error_time": failed_time,
+            "stage_cost": cost,
+            "cost": cost,
+            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(failed_time)),
+        }
+        self._queue.push(log_data)
+
+    def _build_finish_log(self, seed: Seed, status: bool):
+        retry = seed.params.retry
+        get_time = seed.params.get_time
+        start_time = seed.params.start_time
+        request_time = seed.params.request_time
+        response_time = seed.params.response_time
+        parse_time = seed.params.parse_time
+        finsh_time = seed.params.finsh_time
+
+        cost = finsh_time - start_time if finsh_time and start_time else 0
+        instance_id = os.getenv("instance_id")
+        log_data = {
+            "retry": retry,
+            "stage": "finish",
+            "status": "succeed" if status else "failed",
+            "project": self.project,
+            "task": self.task,
+            "seed": seed.to_string,
+            "instance_id": instance_id,
+            "traceback": seed.params.traceback or '-',
+            "proxy": seed.params.proxy or '-',
+            "get_time": get_time,
+            "start_time": start_time,
+            "request_time": request_time or '-',
+            "response_time": response_time or '-',
+            "parse_time": parse_time or '-',
+            "finsh_time": finsh_time or '-',
+            "cost": cost,
+            "time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(finsh_time)),
+        }
+        self._queue.push(log_data)
+
+    def _build_run(self):
+        while not self._stop.is_set():
+            try:
+                items = []
+                start_time = int(time.time())
+
+                while len(items) < 1000:
+                    log_item = self._queue.pop()
+                    print(log_item)
+                    if not log_item or (int(time.time()) - start_time > 10):
+                        break
+
+            except Exception as e:
+                logger.info(str(e))
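
Each _build_* method reads per-stage timestamps off seed.params and emits two durations: stage_cost (time spent in the current stage) and cost (cumulative time since start_time). The base _build_run only drains the internal queue (the stray print reads as leftover debug scaffolding); subclasses such as LoghubDot below override it to actually ship the dots. The public entry point is logging(); a minimal sketch:

    from threading import Event

    stop = Event()
    dot = Dot(stop=stop, project="demo_project", task="demo_task")

    # pushes {"stage": "scheduler", "message": ..., "project": ..., "task": ...}
    # onto the internal queue for the _build_run consumer to drain
    dot.logging("scheduler", "todo queue refilled with 500 seeds")
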
cobweb/log_dots/loghub_dot.py
ADDED
@@ -0,0 +1,53 @@
+import json
+import os
+import time
+
+from aliyun.log import LogClient, PutLogsRequest, LogItem
+from .dot import Dot, logger, Event
+
+
+class LoghubDot(Dot):
+
+    def __init__(self, stop: Event, project: str, task: str):
+        super().__init__(stop, project, task)
+        self._client = LogClient(
+            endpoint=os.getenv("DOT_LOGHUB_ENDPOINT"),
+            accessKeyId=os.getenv("DOT_LOGHUB_ACCESS_KEY"),
+            accessKey=os.getenv("DOT_LOGHUB_SECRET_KEY")
+        )
+
+    def _build_run(self):
+        while not self._stop.is_set():
+            try:
+                items = []
+                start_time = int(time.time())
+
+                while len(items) < 1000 and (int(time.time()) - start_time) < 10:
+                    log_data = self._queue.pop()
+
+                    if not log_data:
+                        break
+
+                    for key, value in log_data.items():
+                        if not isinstance(value, str):
+                            log_data[key] = json.dumps(value, ensure_ascii=False)
+                        else:
+                            log_data[key] = value
+
+                    log_item = LogItem()
+                    contents = sorted(log_data.items())
+                    log_item.set_contents(contents)
+                    items.append(log_item)
+
+                if items:
+                    request = PutLogsRequest(
+                        project="databee-download-log",
+                        logstore="log",
+                        topic="cobweb",
+                        logitems=items,
+                        compress=True
+                    )
+                    self._client.put_logs(request=request)
+            except Exception as e:
+                logger.info(str(e))
+
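
LoghubDot overrides _build_run to batch up to 1,000 dots (or roughly 10 seconds' worth) per PutLogsRequest against the hard-coded databee-download-log project. Its credentials come from the three DOT_LOGHUB_* environment variables; a configuration sketch with placeholder values:

    import os

    # placeholder values, not real credentials
    os.environ["DOT_LOGHUB_ENDPOINT"] = "cn-hangzhou.log.aliyuncs.com"
    os.environ["DOT_LOGHUB_ACCESS_KEY"] = "<access-key-id>"
    os.environ["DOT_LOGHUB_SECRET_KEY"] = "<access-key-secret>"

    from cobweb.log_dots import LoghubDot   # assumption: exported via log_dots/__init__
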
cobweb/pipelines/__init__.py
CHANGED
@@ -1,2 +1,3 @@
-from .
-from .
+from .pipeline import Pipeline
+from .pipeline_loghub import Loghub
+from .pipeline_csv import CSV
cobweb/pipelines/pipeline.py
ADDED
@@ -0,0 +1,19 @@
+from abc import ABC, abstractmethod
+from cobweb.base import BaseItem
+
+
+class Pipeline(ABC):
+
+    def __init__(self, task, project):
+        self.task = task
+        self.project = project
+
+    @abstractmethod
+    def build(self, item: BaseItem) -> dict:
+        pass
+
+    @abstractmethod
+    def upload(self, table: str, data: list) -> bool:
+        pass
+
+
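
Pipeline is the two-method contract the Uploader consumes: build() turns one BaseItem into an uploadable record, and upload() flushes a batch destined for one table. A hypothetical JSON-lines implementation, mirroring the bundled CSV pipeline's use of item.to_dict:

    import json

    from cobweb.base import BaseItem
    from cobweb.pipelines import Pipeline


    class JsonLines(Pipeline):
        # hypothetical example, not part of the package

        def build(self, item: BaseItem) -> dict:
            return item.to_dict

        def upload(self, table: str, data: list) -> bool:
            # append each record of the batch to <table>.jsonl
            with open(f"{table}.jsonl", "a", encoding="utf-8") as fh:
                for record in data:
                    fh.write(json.dumps(record, ensure_ascii=False) + "\n")
            return True
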
cobweb/pipelines/pipeline_csv.py
ADDED
@@ -0,0 +1,25 @@
+import os
+import csv
+
+from cobweb.base import BaseItem
+from cobweb.pipelines import Pipeline
+
+
+class CSV(Pipeline):
+
+    def __init__(self, *args, **kwargs):
+        super(CSV, self).__init__(*args, **kwargs)
+        self.log_path = rf"{os.getcwd()}\{self.project}\{self.task}\%s.csv"
+
+    def build(self, item: BaseItem):
+        return item.to_dict
+
+    def upload(self, table, datas):
+        fieldnames = datas[0].keys()
+        file_path = self.log_path % table
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+        with open(file_path, mode='a', encoding='utf-8', newline="") as file:
+            writer = csv.DictWriter(file, fieldnames=fieldnames)
+            if file.tell() == 0:  # check whether the file is empty
+                writer.writeheader()
+            writer.writerows(datas)
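
Note the path template is joined with backslashes, so the CSV pipeline is Windows-flavored as written. A usage sketch with hypothetical rows:

    from cobweb.pipelines import CSV

    pipeline = CSV(task="demo_task", project="demo_project")
    rows = [{"url": "https://example.com", "status": 200}]
    pipeline.upload("items", rows)   # appends to <cwd>\demo_project\demo_task\items.csv
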
cobweb/pipelines/pipeline_loghub.py
ADDED
@@ -0,0 +1,54 @@
+import os
+import json
+
+from cobweb.base import BaseItem
+from cobweb.pipelines import Pipeline
+from aliyun.log import LogClient, LogItem, PutLogsRequest
+from collections import defaultdict
+
+
+class Loghub(Pipeline):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.client = LogClient(
+            endpoint=os.getenv("LOGHUB_ENDPOINT"),
+            accessKeyId=os.getenv("LOGHUB_ACCESS_KEY"),
+            accessKey=os.getenv("LOGHUB_SECRET_KEY")
+        )
+        self.project = os.getenv("LOGHUB_PROJECT")
+        self.source = os.getenv("LOGHUB_SOURCE")
+        self.topic = os.getenv("LOGHUB_TOPIC")
+
+    def build(self, item: BaseItem):
+        log_item = LogItem()
+        temp = item.to_dict
+        for key, value in temp.items():
+            if not isinstance(value, str):
+                temp[key] = json.dumps(value, ensure_ascii=False)
+        contents = sorted(temp.items())
+        log_item.set_contents(contents)
+        return (
+            log_item,
+            item.baseitem_topic or self.topic,
+            item.baseitem_source or self.source,
+            item.baseitem_project or self.project,
+        )
+
+    def upload(self, table, datas):
+
+        upload_items = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+
+        for log_item, topic, source, project in datas:
+            upload_items[project][source][topic].append(log_item)
+
+        for request in [
+            PutLogsRequest(
+                logstore=table, project=project,
+                topic=topic, source=source,
+                logitems=log_items, compress=True
+            ) for project, sources in upload_items.items()
+            for source, topics in sources.items()
+            for topic, log_items in topics.items()
+        ]:
+            self.client.put_logs(request=request)
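
Here build() tags each LogItem with its (topic, source, project) destination, and upload() regroups a mixed batch so that one PutLogsRequest is issued per distinct (project, source, topic) triple. The nested-defaultdict grouping in isolation, with string stand-ins for LogItem instances:

    from collections import defaultdict

    datas = [
        ("item_a", "topic-1", "source-1", "project-1"),
        ("item_b", "topic-1", "source-1", "project-1"),
        ("item_c", "topic-2", "source-1", "project-1"),
    ]

    grouped = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for log_item, topic, source, project in datas:
        grouped[project][source][topic].append(log_item)

    # two destinations -> two PutLogsRequests:
    #   project-1 / source-1 / topic-1: ["item_a", "item_b"]
    #   project-1 / source-1 / topic-2: ["item_c"]
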
cobweb/schedulers/__init__.py
ADDED
@@ -0,0 +1 @@
+from .scheduler_with_redis import RedisScheduler
cobweb/schedulers/scheduler.py
ADDED
@@ -0,0 +1,66 @@
+import threading
+
+from typing import Callable
+from cobweb.base import TaskQueue
+from abc import ABC, abstractmethod
+
+
+class Scheduler(ABC, threading.Thread):
+
+    # __LAUNCHER_FUNC__ = ["_reset", "_scheduler", "_insert", "_refresh", "_delete"]
+
+    def __init__(
+        self,
+        task,
+        project,
+        stop: threading.Event,
+        pause: threading.Event,
+        task_queue: TaskQueue,
+        callback_register: Callable
+    ):
+        super().__init__()
+        self.task = task
+        self.project = project
+        from cobweb import setting
+
+        self.task_model = setting.TASK_MODEL
+        self.seed_reset_seconds = setting.SEED_RESET_SECONDS
+        self.scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
+        self.new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
+        self.done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
+        self.todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
+        self.before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS
+
+        self.todo_queue_size = setting.TODO_QUEUE_SIZE
+        self.new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
+        self.done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
+        self.upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE
+
+        self.stop = stop
+        self.pause = pause
+
+        self.task_queue = task_queue
+
+        self.callback_register = callback_register
+
+    @abstractmethod
+    def reset(self):
+        ...
+
+    @abstractmethod
+    def schedule(self):
+        ...
+
+    @abstractmethod
+    def insert(self):
+        ...
+
+    @abstractmethod
+    def refresh(self):
+        ...
+
+    @abstractmethod
+    def delete(self):
+        ...
+
+
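
Scheduler fixes the constructor plumbing (a snapshot of cobweb.setting, the stop/pause events, and the shared TaskQueue) and leaves five abstract hooks for concrete backends; RedisScheduler in scheduler_with_redis.py is the shipped implementation. A skeletal, hypothetical subclass showing the required surface:

    from cobweb.schedulers.scheduler import Scheduler   # assumption: module path as in this diff


    class MemoryScheduler(Scheduler):
        # hypothetical no-op backend illustrating the five required hooks

        def reset(self):      # reclaim seeds stuck longer than seed_reset_seconds
            ...

        def schedule(self):   # move stored seeds into the todo queue
            ...

        def insert(self):     # persist newly produced seeds
            ...

        def refresh(self):    # extend the heartbeat of in-flight seeds
            ...

        def delete(self):     # drop seeds acknowledged as done
            ...
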