cobweb-launcher 1.1.23__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cobweb-launcher might be problematic.
- {cobweb-launcher-1.1.23/cobweb_launcher.egg-info → cobweb-launcher-1.2.0}/PKG-INFO +1 -1
- cobweb-launcher-1.2.0/cobweb/__init__.py +2 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/base/__init__.py +1 -1
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/base/item.py +7 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/constant.py +23 -1
- cobweb-launcher-1.2.0/cobweb/crawlers/__init__.py +1 -0
- cobweb-launcher-1.1.23/cobweb/crawlers/base_crawler.py → cobweb-launcher-1.2.0/cobweb/crawlers/crawler.py +45 -29
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/launchers/__init__.py +1 -1
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/launchers/launcher.py +38 -52
- cobweb-launcher-1.2.0/cobweb/launchers/launcher_air.py +88 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/launchers/launcher_pro.py +75 -34
- cobweb-launcher-1.2.0/cobweb/pipelines/__init__.py +3 -0
- cobweb-launcher-1.1.23/cobweb/pipelines/base_pipeline.py → cobweb-launcher-1.2.0/cobweb/pipelines/pipeline.py +20 -14
- cobweb-launcher-1.2.0/cobweb/pipelines/pipeline_console.py +24 -0
- cobweb-launcher-1.1.23/cobweb/pipelines/loghub_pipeline.py → cobweb-launcher-1.2.0/cobweb/pipelines/pipeline_loghub.py +1 -1
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/setting.py +6 -6
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/utils/tools.py +2 -2
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0/cobweb_launcher.egg-info}/PKG-INFO +1 -1
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb_launcher.egg-info/SOURCES.txt +5 -4
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/setup.py +1 -1
- cobweb-launcher-1.1.23/cobweb/__init__.py +0 -2
- cobweb-launcher-1.1.23/cobweb/crawlers/__init__.py +0 -2
- cobweb-launcher-1.1.23/cobweb/crawlers/file_crawler.py +0 -98
- cobweb-launcher-1.1.23/cobweb/pipelines/__init__.py +0 -2
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/LICENSE +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/README.md +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/base/common_queue.py +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/base/decorators.py +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/base/log.py +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/base/request.py +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/base/response.py +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/base/seed.py +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/db/redis_db.py +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/utils/__init__.py +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/setup.cfg +0 -0
{cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/constant.py CHANGED

@@ -30,6 +30,24 @@ class DealModel:
 
 class LogTemplate:
 
+    console_item = """
+        ----------------------- start - console pipeline -----------------
+        种子详情 \n{seed_detail}
+        解析详情 \n{parse_detail}
+        ----------------------- end - console pipeline ------------------
+    """
+
+    launcher_air_polling = """
+        ----------------------- start - 轮训日志: {task} -----------------
+        内存队列
+            种子数: {doing_len}
+            待消费: {todo_len}
+            已消费: {done_len}
+        存储队列
+            待上传: {upload_len}
+        ----------------------- end - 轮训日志: {task} ------------------
+    """
+
     launcher_pro_polling = """
         ----------------------- start - 轮训日志: {task} -----------------
         内存队列
@@ -69,4 +87,8 @@ class LogTemplate:
         response
             status : {status} \n{response}
         ------------------------------------------------------------------
-    """
+    """
+
+    @staticmethod
+    def log_info(item: dict) -> str:
+        return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])
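The new LogTemplate.log_info staticmethod replaces the module-level download_log_info helper that is commented out in cobweb/utils/tools.py further down. A standalone sketch of its output; the function body is copied from the diff, the sample dict is invented:

def log_info(item: dict) -> str:
    # Indent each "key: value" pair 12 spaces and left-pad keys to 14 chars,
    # matching the indentation of the polling/console templates above.
    return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])

print(log_info({"url": "https://example.com", "retry": 0, "priority": 300}))
#             url           : https://example.com
#             retry         : 0
#             priority      : 300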
cobweb-launcher-1.2.0/cobweb/crawlers/__init__.py ADDED

@@ -0,0 +1 @@
+from .crawler import Crawler
cobweb-launcher-1.1.23/cobweb/crawlers/base_crawler.py → cobweb-launcher-1.2.0/cobweb/crawlers/crawler.py RENAMED

@@ -1,40 +1,52 @@
+import json
 import threading
 import time
 import traceback
-
 from inspect import isgenerator
 from typing import Union, Callable, Mapping
 
-from cobweb.base import Queue, Seed, BaseItem, Request, Response, logger
 from cobweb.constant import DealModel, LogTemplate
-from cobweb.
-
+from cobweb.base import (
+    Queue,
+    Seed,
+    BaseItem,
+    Request,
+    Response,
+    ConsoleItem,
+    logger
+)
 
 
 class Crawler(threading.Thread):
 
     def __init__(
             self,
-
-
+            stop: threading.Event,
+            pause: threading.Event,
             launcher_queue: Union[Mapping[str, Queue]],
+            custom_func: Union[Mapping[str, Callable]],
+            thread_num: int,
+            max_retries: int
     ):
         super().__init__()
 
-        self.
+        self._stop = stop
+        self._pause = pause
+        self._new = launcher_queue["new"]
+        self._todo = launcher_queue["todo"]
+        self._done = launcher_queue["done"]
+        self._upload = launcher_queue["upload"]
+
         for func_name, _callable in custom_func.items():
             if isinstance(_callable, Callable):
                 self.__setattr__(func_name, _callable)
 
-        self.
-
-        self.spider_thread_num = setting.SPIDER_THREAD_NUM
-        self.max_retries = setting.SPIDER_MAX_RETRIES
+        self.thread_num = thread_num
+        self.max_retries = max_retries
 
     @staticmethod
     def request(seed: Seed) -> Union[Request, BaseItem]:
-
-        yield Request(seed.url, seed, stream=stream, timeout=5)
+        yield Request(seed.url, seed, timeout=5)
 
     @staticmethod
     def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
@@ -43,39 +55,43 @@ class Crawler(threading.Thread):
 
     @staticmethod
     def parse(item: Response) -> BaseItem:
-
+        upload_item = item.to_dict
+        upload_item["text"] = item.response.text
+        yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
 
-    def get_seed(self) -> Seed:
-
+    # def get_seed(self) -> Seed:
+    #     return self._todo.pop()
 
     def distribute(self, item, seed):
         if isinstance(item, BaseItem):
-            self.
+            self._upload.push(item)
         elif isinstance(item, Seed):
-            self.
+            self._new.push(item)
         elif isinstance(item, str) and item == DealModel.poll:
-            self.
+            self._todo.push(seed)
        elif isinstance(item, str) and item == DealModel.done:
-            self.
+            self._done.push(seed)
         elif isinstance(item, str) and item == DealModel.fail:
             seed.params.seed_status = DealModel.fail
-            self.
+            self._done.push(seed)
         else:
             raise TypeError("yield value type error!")
 
     def spider(self):
-        while
-
+        while not self._stop.is_set():
+
+            seed = self._todo.pop()
 
             if not seed:
+                time.sleep(1)
                 continue
 
             elif seed.params.retry >= self.max_retries:
                 seed.params.seed_status = DealModel.fail
-                self.
+                self._done.push(seed)
                 continue
 
-            seed_detail_log_info =
+            seed_detail_log_info = LogTemplate.log_info(seed.to_dict)
 
             try:
                 request_iterators = self.request(seed)
@@ -106,7 +122,7 @@ class Crawler(threading.Thread):
                         seed_version=seed.params.seed_version,
                         identifier=seed.identifier or "",
                         status=download_item.response,
-                        response=
+                        response=LogTemplate.log_info(download_item.to_dict)
                     ))
                     parse_iterators = self.parse(download_item)
                     if not isgenerator(parse_iterators):
@@ -123,7 +139,6 @@ class Crawler(threading.Thread):
 
                 if not iterator_status:
                     raise ValueError("request/download/parse function yield value error!")
-
             except Exception as e:
                 logger.info(LogTemplate.download_exception.format(
                     detail=seed_detail_log_info,
@@ -134,11 +149,12 @@ class Crawler(threading.Thread):
                     exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
                 ))
                 seed.params.retry += 1
-                self.
+                self._todo.push(seed)
             finally:
                 time.sleep(0.1)
+        logger.info("spider thread close")
 
     def run(self):
-        for index in range(self.
+        for index in range(self.thread_num):
             threading.Thread(name=f"spider_{index}", target=self.spider).start()
 
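The renamed Crawler now receives the shared stop/pause events, the four launcher queues, and its thread/retry limits through the constructor instead of reading cobweb.setting directly, and its default parse yields a ConsoleItem. The request/download/parse stages are generators whose yielded values distribute routes: a BaseItem goes to the upload queue, a Seed to the new queue, and the DealModel.poll/done/fail strings push the seed back to todo or done. A hypothetical pair of custom stages, using only classes and yield conventions visible in this diff; registration would go through the launcher decorators shown in launcher.py's docstring below:

from cobweb.base import Seed, Request, Response, ConsoleItem

def request(seed: Seed):
    # The stock stage yields Request(seed.url, seed, timeout=5);
    # this variant only shortens the timeout.
    yield Request(seed.url, seed, timeout=3)

def parse(item: Response):
    # Any BaseItem subclass works here; ConsoleItem is the built-in one
    # and is routed to the upload queue by distribute.
    yield ConsoleItem(item.seed, data=item.response.text)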
{cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/launchers/__init__.py CHANGED

@@ -1,2 +1,2 @@
-from .launcher import Launcher
+from .launcher_air import LauncherAir
 from .launcher_pro import LauncherPro
{cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/launchers/launcher.py CHANGED

@@ -15,15 +15,16 @@ class Launcher(threading.Thread):
     __DOING__ = {}
 
     __CUSTOM_FUNC__ = {
-        "download": None,
-        "request": None,
-        "parse": None,
+        # "download": None,
+        # "request": None,
+        # "parse": None,
     }
 
     __LAUNCHER_QUEUE__ = {
         "new": Queue(),
         "todo": Queue(),
         "done": Queue(),
+        "upload": Queue()
     }
 
     __LAUNCHER_FUNC__ = [
@@ -76,9 +77,13 @@ class Launcher(threading.Thread):
         self._done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
         self._upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE
 
+        self._spider_thread_num = setting.SPIDER_MAX_RETRIES
+        self._spider_max_retries = setting.SPIDER_THREAD_NUM
+
         self._done_model = setting.DONE_MODEL
+        self._task_model = setting.TASK_MODEL
 
-        self._upload_queue = Queue()
+        # self._upload_queue = Queue()
 
     @property
     def start_seeds(self):
@@ -121,7 +126,7 @@ class Launcher(threading.Thread):
         use case:
             from cobweb.base import Request, Response
             @launcher.download
-            def
+            def parse(item: Response) -> BaseItem:
                 ...
                 yield xxxItem(seed, **kwargs)
         """
@@ -133,6 +138,33 @@ class Launcher(threading.Thread):
         for seed in seeds:
             self.__DOING__.pop(seed, None)
 
+    def _execute(self):
+        for func_name in self.__LAUNCHER_FUNC__:
+            threading.Thread(name=func_name, target=getattr(self, func_name)).start()
+            time.sleep(2)
+
+    def run(self):
+        threading.Thread(target=self._execute_heartbeat).start()
+
+        self._Crawler(
+            stop=self._stop, pause=self._pause,
+            launcher_queue=self.__LAUNCHER_QUEUE__,
+            custom_func=self.__CUSTOM_FUNC__,
+            thread_num = self._spider_thread_num,
+            max_retries = self._spider_max_retries
+        ).start()
+
+        self._Pipeline(
+            stop=self._stop, pause=self._pause,
+            upload=self.__LAUNCHER_QUEUE__["upload"],
+            done=self.__LAUNCHER_QUEUE__["done"],
+            upload_size=self._upload_queue_max_size,
+            wait_seconds=self._upload_queue_wait_seconds
+        ).start()
+
+        self._execute()
+        self._polling()
+
     def _execute_heartbeat(self):
         pass
 
@@ -151,52 +183,6 @@ class Launcher(threading.Thread):
     def _delete(self):
         pass
 
-    def _execute(self):
-        for func_name in self.__LAUNCHER_FUNC__:
-            threading.Thread(name=func_name, target=getattr(self, func_name)).start()
-            time.sleep(2)
-
     def _polling(self):
+        pass
 
-        check_emtpy_times = 0
-
-        while not self._stop.is_set():
-
-            queue_not_empty_count = 0
-
-            for q in self.__LAUNCHER_QUEUE__.values():
-                if q.length != 0:
-                    queue_not_empty_count += 1
-
-            if self._pause.is_set() and queue_not_empty_count != 0:
-                self._pause.clear()
-                self._execute()
-
-            elif queue_not_empty_count == 0:
-                check_emtpy_times += 1
-            else:
-                check_emtpy_times = 0
-
-            if check_emtpy_times > 2:
-                check_emtpy_times = 0
-                self.__DOING__ = {}
-                self._pause.set()
-
-    def run(self):
-        threading.Thread(target=self._execute_heartbeat).start()
-
-        self._Crawler(
-            upload_queue=self._upload_queue,
-            custom_func=self.__CUSTOM_FUNC__,
-            launcher_queue=self.__LAUNCHER_QUEUE__,
-        ).start()
-
-        self._Pipeline(
-            upload_queue=self._upload_queue,
-            done_queue=self.__LAUNCHER_QUEUE__["done"],
-            upload_queue_size=self._upload_queue_max_size,
-            upload_wait_seconds=self._upload_queue_wait_seconds
-        ).start()
-
-        self._execute()
-        self._polling()
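run() is now implemented once on the base Launcher: it starts the heartbeat thread, passes the shared stop/pause events and queue map into the Crawler and Pipeline threads, launches the __LAUNCHER_FUNC__ workers via _execute(), and ends in the subclass's _polling() loop. Note that _spider_thread_num is assigned from setting.SPIDER_MAX_RETRIES and _spider_max_retries from setting.SPIDER_THREAD_NUM, which looks transposed in this release. A standalone sketch of the pause/stop contract the worker threads follow; the names are invented for illustration:

import threading
import time

stop, pause = threading.Event(), threading.Event()

def worker():
    # _insert/_delete style workers run until pause is set; _polling sets
    # pause after repeated all-queues-empty checks and clears it (then
    # re-runs _execute) once seeds show up again.
    while not pause.is_set():
        time.sleep(0.1)  # drain a queue here

t = threading.Thread(target=worker)
t.start()
pause.set()   # what _polling does after check_emtpy_times > 2
t.join()
stop.set()    # what _polling does when TASK_MODEL == 0 (single-run mode)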
cobweb-launcher-1.2.0/cobweb/launchers/launcher_air.py ADDED

@@ -0,0 +1,88 @@
+import time
+
+from cobweb.constant import LogTemplate
+from cobweb.base import logger
+from .launcher import Launcher
+
+
+class LauncherAir(Launcher):
+
+    def _scheduler(self):
+        if self.start_seeds:
+            self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
+
+    def _insert(self):
+        while not self._pause.is_set():
+            seeds = {}
+            status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
+            for _ in range(self._new_queue_max_size):
+                seed = self.__LAUNCHER_QUEUE__['new'].pop()
+                if not seed:
+                    break
+                seeds[seed.to_string] = seed.params.priority
+            if seeds:
+                self.__LAUNCHER_QUEUE__['todo'].push(seeds)
+            if status:
+                time.sleep(self._new_queue_wait_seconds)
+
+    def _delete(self):
+        while not self._pause.is_set():
+            seeds = []
+            status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
+
+            for _ in range(self._done_queue_max_size):
+                seed = self.__LAUNCHER_QUEUE__['done'].pop()
+                if not seed:
+                    break
+                seeds.append(seed.to_string)
+
+            if seeds:
+                self._remove_doing_seeds(seeds)
+
+            if status:
+                time.sleep(self._done_queue_wait_seconds)
+
+    def _polling(self):
+
+        check_emtpy_times = 0
+
+        while not self._stop.is_set():
+
+            queue_not_empty_count = 0
+            pooling_wait_seconds = 30
+
+            for q in self.__LAUNCHER_QUEUE__.values():
+                if q.length != 0:
+                    queue_not_empty_count += 1
+
+            if queue_not_empty_count == 0:
+                pooling_wait_seconds = 3
+                if self._pause.is_set():
+                    check_emtpy_times = 0
+                    if not self._task_model:
+                        logger.info("Done! Ready to close thread...")
+                        self._stop.set()
+                elif check_emtpy_times > 2:
+                    self.__DOING__ = {}
+                    self._pause.set()
+                else:
+                    logger.info(
+                        "check whether the task is complete, "
+                        f"reset times {3 - check_emtpy_times}"
+                    )
+                    check_emtpy_times += 1
+            elif self._pause.is_set():
+                self._pause.clear()
+                self._execute()
+            else:
+                logger.info(LogTemplate.launcher_air_polling.format(
+                    task=self.task,
+                    doing_len=len(self.__DOING__.keys()),
+                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
+                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
+                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
+                ))
+
+            time.sleep(pooling_wait_seconds)
+
+
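LauncherAir is the new Redis-free launcher: seeds move new → todo → done purely through the in-process queues, and _polling shuts the task down once every queue stays empty, unless TASK_MODEL is 1 (resident mode). A hypothetical minimal run, inferred from the base-class signature super().__init__(task, project, custom_setting, **kwargs); the import path and the SEEDS attribute are guesses at how the new two-line cobweb/__init__.py and start_seeds are wired:

from cobweb import LauncherAir  # assumed re-export from the new cobweb/__init__.py (+2 lines)

app = LauncherAir(task="demo", project="test")
app.SEEDS = [{"url": "https://www.baidu.com"}]  # hypothetical seed wiring
app.start()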
{cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/launchers/launcher_pro.py CHANGED

@@ -3,19 +3,19 @@ import threading
 
 from cobweb.db import RedisDB
 from cobweb.base import Seed, logger
-from cobweb.launchers import Launcher
 from cobweb.constant import DealModel, LogTemplate
+from .launcher import Launcher
 
 
 class LauncherPro(Launcher):
 
     def __init__(self, task, project, custom_setting=None, **kwargs):
         super().__init__(task, project, custom_setting, **kwargs)
-        self.
-        self.
-        self.
-        self.
-        self.
+        self._todo_key = "{%s:%s}:todo" % (project, task)
+        self._done_key = "{%s:%s}:done" % (project, task)
+        self._fail_key = "{%s:%s}:fail" % (project, task)
+        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
+        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
         self._db = RedisDB()
 
         self._heartbeat_start_event = threading.Event()
@@ -23,12 +23,12 @@ class LauncherPro(Launcher):
 
     @property
     def heartbeat(self):
-        return self._db.exists(self.
+        return self._db.exists(self._heartbeat_key)
 
     def _execute_heartbeat(self):
         while not self._stop.is_set():
             if self._heartbeat_start_event.is_set():
-                self._db.setex(self.
+                self._db.setex(self._heartbeat_key, 3)
             time.sleep(2)
 
     def _reset(self):
@@ -39,15 +39,15 @@ class LauncherPro(Launcher):
         while not self._pause.is_set():
             reset_wait_seconds = 30
             start_reset_time = int(time.time())
-            if self._db.lock(self.
+            if self._db.lock(self._reset_lock_key, t=120):
                 if not self.heartbeat:
                     self._heartbeat_start_event.set()
 
                 _min = -int(time.time()) + self._seed_reset_seconds \
                     if self.heartbeat or not init else "-inf"
 
-                self._db.members(self.
-                self._db.delete(self.
+                self._db.members(self._todo_key, 0, _min=_min, _max="(0")
+                self._db.delete(self._reset_lock_key)
 
                 ttl = 120 - int(time.time()) + start_reset_time
                 reset_wait_seconds = max(ttl, 1)
@@ -61,14 +61,14 @@ class LauncherPro(Launcher):
             if self.start_seeds:
                 self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
         while not self._pause.is_set():
-            if not self._db.zcount(self.
+            if not self._db.zcount(self._todo_key, 0, "(1000"):
                 time.sleep(self._scheduler_wait_seconds)
                 continue
             if self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
                 time.sleep(self._todo_queue_full_wait_seconds)
                 continue
             members = self._db.members(
-                self.
+                self._todo_key, int(time.time()),
                 count=self._todo_queue_size,
                 _min=0, _max="(1000"
             )
@@ -90,7 +90,7 @@ class LauncherPro(Launcher):
                     break
                 seeds[seed.to_string] = seed.params.priority
             if seeds:
-                self._db.zadd(self.
+                self._db.zadd(self._todo_key, seeds, nx=True)
             if status:
                 time.sleep(self._new_queue_wait_seconds)
 
@@ -102,7 +102,7 @@ class LauncherPro(Launcher):
             if self.__DOING__:
                 refresh_time = int(time.time())
                 seeds = {k:-refresh_time - v / 1000 for k, v in self.__DOING__.items()}
-                self._db.zadd(self.
+                self._db.zadd(self._todo_key, item=seeds, xx=True)
             time.sleep(30)
 
     def _delete(self):
@@ -124,13 +124,13 @@ class LauncherPro(Launcher):
             else:
                 seeds.append(seed.to_string)
             if seeds:
-                self._db.zrem(self.
+                self._db.zrem(self._todo_key, *seeds)
                 self._remove_doing_seeds(seeds)
             if s_seeds:
-                self._db.done([self.
+                self._db.done([self._todo_key, self._done_key], *s_seeds)
                 self._remove_doing_seeds(s_seeds)
             if f_seeds:
-                self._db.done([self.
+                self._db.done([self._todo_key, self._fail_key], *f_seeds)
                 self._remove_doing_seeds(f_seeds)
 
             if status:
@@ -141,32 +141,73 @@ class LauncherPro(Launcher):
         while not self._stop.is_set():
             queue_not_empty_count = 0
             pooling_wait_seconds = 30
+
             for q in self.__LAUNCHER_QUEUE__.values():
                 if q.length != 0:
                     queue_not_empty_count += 1
-
+
+            if queue_not_empty_count == 0:
+                pooling_wait_seconds = 3
+                if self._pause.is_set():
+                    check_emtpy_times = 0
+                    if not self._task_model:
+                        logger.info("Done! Ready to close thread...")
+                        self._stop.set()
+                elif not self._db.zcount(self._todo_key, _min=0, _max="(1000") and check_emtpy_times > 2:
+                    self.__DOING__ = {}
+                    self._pause.set()
+                else:
+                    logger.info(
+                        "check whether the task is complete, "
+                        f"reset times {3 - check_emtpy_times}"
+                    )
+                    check_emtpy_times += 1
+            elif self._pause.is_set():
                 self._pause.clear()
                 self._execute()
-            elif queue_not_empty_count == 0:
-                pooling_wait_seconds = 5
-                check_emtpy_times += 1
             else:
-                check_emtpy_times = 0
-
-                if not self._db.zcount(self._todo, _min=0, _max="(1000") and check_emtpy_times > 2:
-                    check_emtpy_times = 0
-                    self.__DOING__ = {}
-                    self._pause.set()
-
-            if not self._pause.is_set():
                 logger.info(LogTemplate.launcher_pro_polling.format(
                     task=self.task,
                     doing_len=len(self.__DOING__.keys()),
                     todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
-                    redis_seed_count=self._db.zcount(self.
-                    redis_todo_len=self._db.zcount(self.
-                    redis_doing_len=self._db.zcount(self.
-                    upload_len=self.
+                    redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
+                    redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
+                    redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
+                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
                 ))
+
             time.sleep(pooling_wait_seconds)
+            # if self._pause.is_set():
+            #     self._pause.clear()
+            #     self._execute()
+            #
+            # elif queue_not_empty_count == 0:
+            #     pooling_wait_seconds = 5
+            #     check_emtpy_times += 1
+            # else:
+            #     check_emtpy_times = 0
+            #
+            #     if not self._db.zcount(self._todo, _min=0, _max="(1000") and check_emtpy_times > 2:
+            #         check_emtpy_times = 0
+            #         self.__DOING__ = {}
+            #         self._pause.set()
+            #
+            #     time.sleep(pooling_wait_seconds)
+            #
+            #     if not self._pause.is_set():
+            #         logger.info(LogTemplate.launcher_pro_polling.format(
+            #             task=self.task,
+            #             doing_len=len(self.__DOING__.keys()),
+            #             todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
+            #             done_len=self.__LAUNCHER_QUEUE__['done'].length,
+            #             redis_seed_count=self._db.zcount(self._todo, "-inf", "+inf"),
+            #             redis_todo_len=self._db.zcount(self._todo, 0, "(1000"),
+            #             redis_doing_len=self._db.zcount(self._todo, "-inf", "(0"),
+            #             upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
            #             ))
+            # elif not self._task_model:
+            #     self._stop.set()
+
+        logger.info("Done! Ready to close thread...")
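LauncherPro keeps every seed of a task in one Redis sorted set, now named "{project:task}:todo": a score in [0, 1000) means pending, with the score acting as priority, while a negative score marks an in-flight seed, encoded by _refresh as -refresh_time - priority / 1000 so the claim time and the original priority both survive the round trip. That is why the polling log counts members over (0, "(1000") for todo and ("-inf", "(0") for doing. A standalone sketch of that score arithmetic, using a plain dict instead of Redis and an invented seed string:

import time

def claim(scores, seed, priority):
    # In-flight marker, mirroring _refresh: score = -refresh_time - priority/1000.
    scores[seed] = -int(time.time()) - priority / 1000

def release(scores, seed):
    # Recover the original priority, as _reset must when returning seeds to todo:
    # the fractional part of the (negated) score carries priority/1000.
    packed = -scores[seed]
    scores[seed] = round((packed % 1) * 1000)

scores = {"https://example.com": 300}      # pending: 0 <= score < 1000
claim(scores, "https://example.com", 300)  # doing: score < 0, counted by zcount("-inf", "(0")
release(scores, "https://example.com")
assert scores["https://example.com"] == 300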
cobweb-launcher-1.1.23/cobweb/pipelines/base_pipeline.py → cobweb-launcher-1.2.0/cobweb/pipelines/pipeline.py RENAMED

@@ -9,16 +9,20 @@ class Pipeline(threading.Thread, ABC):
 
     def __init__(
             self,
-
-
-
-
+            stop: threading.Event,
+            pause: threading.Event,
+            upload: Queue, done: Queue,
+            upload_size: int,
+            wait_seconds: int
     ):
         super().__init__()
-        self.
-        self.
-        self.
-        self.
+        self._stop = stop
+        self._pause = pause
+        self._upload = upload
+        self._done = done
+
+        self.upload_size = upload_size
+        self.wait_seconds = wait_seconds
 
     @abstractmethod
     def build(self, item: BaseItem) -> dict:
@@ -29,13 +33,13 @@ class Pipeline(threading.Thread, ABC):
         pass
 
     def run(self):
-        while
-            status = self.
+        while not self._stop.is_set():
+            status = self._upload.length < self.upload_size
             if status:
-                time.sleep(self.
+                time.sleep(self.wait_seconds)
             data_info, seeds = {}, []
-            for _ in range(self.
-                item = self.
+            for _ in range(self.upload_size):
+                item = self._upload.pop()
                 if not item:
                     break
                 data = self.build(item)
@@ -49,6 +53,8 @@ class Pipeline(threading.Thread, ABC):
                 logger.info(e)
                 status = False
             if status:
-                self.
+                self._done.push(seeds)
+
+        logger.info("upload pipeline close!")
 
 
cobweb-launcher-1.2.0/cobweb/pipelines/pipeline_console.py ADDED

@@ -0,0 +1,24 @@
+import json
+
+from cobweb.base import ConsoleItem, logger
+from cobweb.constant import LogTemplate
+from cobweb.pipelines import Pipeline
+
+
+class Console(Pipeline):
+
+    def build(self, item: ConsoleItem):
+        return {
+            "seed": item.seed.to_dict,
+            "data": item.to_dict
+        }
+
+    def upload(self, table, datas):
+        for data in datas:
+            parse_detail = LogTemplate.log_info(data["data"])
+            if len(parse_detail) > 500:
+                parse_detail = parse_detail[:500] + " ...\n" + " " * 12 + "-- Text is too long and details are omitted!"
+            logger.info(LogTemplate.console_item.format(
+                seed_detail=LogTemplate.log_info(data["seed"]),
+                parse_detail=parse_detail
+            ))
{cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/setting.py CHANGED

@@ -30,8 +30,8 @@ OSS_MIN_UPLOAD_SIZE = 1024
 # 采集器选择
 CRAWLER = "cobweb.crawlers.Crawler"
 
-#
-PIPELINE = "cobweb.pipelines.
+# 数据存储链路
+PIPELINE = "cobweb.pipelines.pipeline_console.Console"
 
 
 # Launcher 等待时间
@@ -52,12 +52,12 @@ UPLOAD_QUEUE_MAX_SIZE = 100  # upload队列长度
 # DONE_MODEL IN (0, 1), 种子完成模式
 DONE_MODEL = 0  # 0:种子消费成功直接从队列移除,失败则添加至失败队列;1:种子消费成功添加至成功队列,失败添加至失败队列
 
-# DOWNLOAD_MODEL IN (0, 1), 下载模式
-DOWNLOAD_MODEL = 0  # 0: 通用下载;1:文件下载
-
 # spider
 SPIDER_THREAD_NUM = 10
 SPIDER_MAX_RETRIES = 5
 
+# 任务模式
+TASK_MODEL = 0  # 0:单次,1:常驻
+
 # 文件下载响应类型过滤
-FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
+# FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
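PIPELINE now points at the dotted path of the new console pipeline, which cobweb/utils/tools.py resolves at runtime via dynamic_load_class. The loader's body is not shown in this diff; a generic sketch of how such a dotted-path import usually works:

import importlib

def load_class(dotted_path: str):
    # "cobweb.pipelines.pipeline_console.Console" -> module path + class name
    module_path, _, class_name = dotted_path.rpartition(".")
    return getattr(importlib.import_module(module_path), class_name)

Pipeline = load_class("cobweb.pipelines.pipeline_console.Console")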
{cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb/utils/tools.py CHANGED

@@ -38,5 +38,5 @@ def dynamic_load_class(model_info):
         raise TypeError()
 
 
-def download_log_info(item:dict) -> str:
-    return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])
+# def download_log_info(item:dict) -> str:
+#     return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])
{cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb_launcher.egg-info/SOURCES.txt CHANGED

@@ -13,18 +13,19 @@ cobweb/base/request.py
 cobweb/base/response.py
 cobweb/base/seed.py
 cobweb/crawlers/__init__.py
-cobweb/crawlers/base_crawler.py
-cobweb/crawlers/file_crawler.py
+cobweb/crawlers/crawler.py
 cobweb/db/__init__.py
 cobweb/db/redis_db.py
 cobweb/exceptions/__init__.py
 cobweb/exceptions/oss_db_exception.py
 cobweb/launchers/__init__.py
 cobweb/launchers/launcher.py
+cobweb/launchers/launcher_air.py
 cobweb/launchers/launcher_pro.py
 cobweb/pipelines/__init__.py
-cobweb/pipelines/base_pipeline.py
-cobweb/pipelines/loghub_pipeline.py
+cobweb/pipelines/pipeline.py
+cobweb/pipelines/pipeline_console.py
+cobweb/pipelines/pipeline_loghub.py
 cobweb/utils/__init__.py
 cobweb/utils/oss.py
 cobweb/utils/tools.py
cobweb-launcher-1.1.23/cobweb/crawlers/file_crawler.py DELETED

@@ -1,98 +0,0 @@
-import os
-from typing import Union
-from cobweb import setting
-from cobweb.utils import OssUtil
-from cobweb.crawlers import Crawler
-from cobweb.base import Seed, BaseItem, Request, Response
-from cobweb.exceptions import OssDBPutPartError, OssDBMergeError
-
-
-oss_util = OssUtil(is_path_style=bool(int(os.getenv("PRIVATE_LINK", 0))))
-
-
-class FileCrawlerAir(Crawler):
-
-    @staticmethod
-    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
-        seed_dict = item.seed.to_dict
-        seed_dict["bucket_name"] = oss_util.bucket
-        try:
-            seed_dict["oss_path"] = key = item.seed.oss_path or getattr(item, "oss_path")
-
-            if oss_util.exists(key):
-                seed_dict["data_size"] = oss_util.head(key).content_length
-                yield Response(item.seed, "exists", **seed_dict)
-
-            else:
-                seed_dict.setdefault("end", "")
-                seed_dict.setdefault("start", 0)
-
-                if seed_dict["end"] or seed_dict["start"]:
-                    start, end = seed_dict["start"], seed_dict["end"]
-                    item.request_setting["headers"]['Range'] = f'bytes={start}-{end}'
-
-                if not item.seed.identifier:
-                    content = b""
-                    chunk_size = oss_util.chunk_size
-                    min_upload_size = oss_util.min_upload_size
-                    seed_dict.setdefault("position", 1)
-
-                    response = item.download()
-
-                    content_type = response.headers.get("content-type", "").split(";")[0]
-                    seed_dict["data_size"] = content_length = int(response.headers.get("content-length", 0))
-
-                    if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
-                        """过滤响应文件类型"""
-                        response.close()
-                        seed_dict["filter"] = True
-                        seed_dict["msg"] = f"response content type is {content_type}"
-                        yield Response(item.seed, response, **seed_dict)
-
-                    elif seed_dict['position'] == 1 and min_upload_size >= content_length > 0:
-                        """过小文件标识返回"""
-                        response.close()
-                        seed_dict["filter"] = True
-                        seed_dict["msg"] = "file size is too small"
-                        yield Response(item.seed, response, **seed_dict)
-
-                    elif seed_dict['position'] == 1 and chunk_size > content_length > min_upload_size:
-                        """小文件直接下载"""
-                        for part_data in response.iter_content(chunk_size):
-                            content += part_data
-                        response.close()
-                        oss_util.put(key, content)
-                        yield Response(item.seed, response, **seed_dict)
-
-                    else:
-                        """中大文件同步分片下载"""
-                        seed_dict.setdefault("upload_id", oss_util.init_part(key).upload_id)
-
-                        for part_data in response.iter_content(chunk_size):
-                            content += part_data
-                            if len(content) >= chunk_size:
-                                upload_data = content[:chunk_size]
-                                content = content[chunk_size:]
-                                oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], content)
-                                seed_dict['start'] += len(upload_data)
-                                seed_dict['position'] += 1
-
-                        response.close()
-
-                        if content:
-                            oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], content)
-                        oss_util.merge(key, seed_dict["upload_id"])
-                        seed_dict["data_size"] = oss_util.head(key).content_length
-                        yield Response(item.seed, response, **seed_dict)
-
-                elif item.seed.identifier == "merge":
-                    oss_util.merge(key, seed_dict["upload_id"])
-                    seed_dict["data_size"] = oss_util.head(key).content_length
-                    yield Response(item.seed, "merge", **seed_dict)
-
-        except OssDBPutPartError:
-            yield Seed(seed_dict)
-        except OssDBMergeError:
-            yield Seed(seed_dict, identifier="merge")
-
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cobweb-launcher-1.1.23 → cobweb-launcher-1.2.0}/cobweb_launcher.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|