cobweb-launcher 0.1.7__py3-none-any.whl → 1.2.41__py3-none-any.whl
- cobweb/__init__.py +2 -11
- cobweb/base/__init__.py +9 -0
- cobweb/base/basic.py +297 -0
- cobweb/base/common_queue.py +30 -0
- cobweb/base/decorators.py +40 -0
- cobweb/base/dotting.py +35 -0
- cobweb/base/item.py +46 -0
- cobweb/{log.py → base/log.py} +4 -6
- cobweb/base/request.py +82 -0
- cobweb/base/response.py +23 -0
- cobweb/base/seed.py +114 -0
- cobweb/constant.py +94 -0
- cobweb/crawlers/__init__.py +1 -0
- cobweb/crawlers/base_crawler.py +144 -0
- cobweb/crawlers/crawler.py +209 -0
- cobweb/crawlers/file_crawler.py +98 -0
- cobweb/db/__init__.py +2 -2
- cobweb/db/api_db.py +82 -0
- cobweb/db/redis_db.py +125 -218
- cobweb/exceptions/__init__.py +1 -0
- cobweb/exceptions/oss_db_exception.py +28 -0
- cobweb/launchers/__init__.py +3 -0
- cobweb/launchers/launcher.py +235 -0
- cobweb/launchers/launcher_air.py +88 -0
- cobweb/launchers/launcher_api.py +209 -0
- cobweb/launchers/launcher_pro.py +208 -0
- cobweb/pipelines/__init__.py +3 -0
- cobweb/pipelines/pipeline.py +69 -0
- cobweb/pipelines/pipeline_console.py +22 -0
- cobweb/pipelines/pipeline_loghub.py +34 -0
- cobweb/schedulers/__init__.py +3 -0
- cobweb/schedulers/scheduler_api.py +72 -0
- cobweb/schedulers/scheduler_redis.py +72 -0
- cobweb/setting.py +67 -6
- cobweb/utils/__init__.py +5 -0
- cobweb/utils/bloom.py +58 -0
- cobweb/utils/dotting.py +32 -0
- cobweb/utils/oss.py +94 -0
- cobweb/utils/tools.py +42 -0
- cobweb_launcher-1.2.41.dist-info/METADATA +205 -0
- cobweb_launcher-1.2.41.dist-info/RECORD +44 -0
- {cobweb_launcher-0.1.7.dist-info → cobweb_launcher-1.2.41.dist-info}/WHEEL +1 -1
- cobweb/bbb.py +0 -191
- cobweb/db/oss_db.py +0 -127
- cobweb/db/scheduler/__init__.py +0 -0
- cobweb/db/scheduler/default.py +0 -8
- cobweb/db/scheduler/textfile.py +0 -27
- cobweb/db/storer/__init__.py +0 -0
- cobweb/db/storer/console.py +0 -9
- cobweb/db/storer/loghub.py +0 -54
- cobweb/db/storer/redis.py +0 -15
- cobweb/db/storer/textfile.py +0 -15
- cobweb/decorators.py +0 -16
- cobweb/distributed/__init__.py +0 -0
- cobweb/distributed/launcher.py +0 -243
- cobweb/distributed/models.py +0 -143
- cobweb/interface.py +0 -34
- cobweb/single/__init__.py +0 -0
- cobweb/single/launcher.py +0 -231
- cobweb/single/models.py +0 -134
- cobweb/single/nest.py +0 -153
- cobweb/task.py +0 -50
- cobweb/utils.py +0 -90
- cobweb_launcher-0.1.7.dist-info/METADATA +0 -45
- cobweb_launcher-0.1.7.dist-info/RECORD +0 -31
- {cobweb_launcher-0.1.7.dist-info → cobweb_launcher-1.2.41.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.1.7.dist-info → cobweb_launcher-1.2.41.dist-info}/top_level.txt +0 -0
cobweb/base/seed.py
ADDED
@@ -0,0 +1,114 @@
+import json
+import time
+import hashlib
+
+
+class SeedParams:
+
+    def __init__(self, retry, priority, seed_version, seed_status=None):
+        self.retry = retry or 0
+        self.priority = priority or 300
+        self.seed_version = seed_version or int(time.time())
+        self.seed_status = seed_status
+
+
+class Seed:
+
+    __SEED_PARAMS__ = [
+        "retry",
+        "priority",
+        "seed_version",
+        "seed_status"
+    ]
+
+    def __init__(
+            self,
+            seed,
+            sid=None,
+            retry=None,
+            priority=None,
+            seed_version=None,
+            seed_status=None,
+            **kwargs
+    ):
+        if any(isinstance(seed, t) for t in (str, bytes)):
+            try:
+                item = json.loads(seed)
+                self._init_seed(item)
+            except json.JSONDecodeError:
+                self.__setattr__("url", seed)
+        elif isinstance(seed, dict):
+            self._init_seed(seed)
+        else:
+            raise TypeError(Exception(
+                f"seed type error, "
+                f"must be str or dict! "
+                f"seed: {seed}"
+            ))
+
+        seed_params = {
+            "retry": retry,
+            "priority": priority,
+            "seed_version": seed_version,
+            "seed_status": seed_status,
+        }
+
+        if kwargs:
+            self._init_seed(kwargs)
+            seed_params.update({
+                k: v for k, v in kwargs.items()
+                if k in self.__SEED_PARAMS__
+            })
+        if sid or not getattr(self, "sid", None):
+            self._init_id(sid)
+        self.params = SeedParams(**seed_params)
+
+    def __getattr__(self, name):
+        return None
+
+    def __setitem__(self, key, value):
+        setattr(self, key, value)
+
+    def __getitem__(self, item):
+        return getattr(self, item)
+
+    def __str__(self):
+        return json.dumps(self.__dict__, ensure_ascii=False)
+
+    def __repr__(self):
+        chars = [f"{k}={v}" for k, v in self.__dict__.items()]
+        return f'{self.__class__.__name__}({", ".join(chars)})'
+
+    def _init_seed(self, seed_info: dict):
+        for k, v in seed_info.items():
+            if k not in self.__SEED_PARAMS__:
+                self.__setattr__(k, v)
+
+    def _init_id(self, sid):
+        if not sid:
+            sid = hashlib.md5(self.to_string.encode()).hexdigest()
+        self.__setattr__("sid", sid)
+
+    @property
+    def to_dict(self) -> dict:
+        seed = self.__dict__.copy()
+        if seed.get("params"):
+            del seed["params"]
+        return seed
+
+    @property
+    def to_string(self) -> str:
+        return json.dumps(
+            self.to_dict,
+            ensure_ascii=False,
+            separators=(",", ":")
+        )
+
+    @property
+    def get_all(self):
+        return json.dumps(
+            self.__dict__,
+            ensure_ascii=False,
+            separators=(",", ":")
+        )
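
For orientation, a minimal usage sketch of the Seed class above; the URL and the extra "page" field are invented for illustration:

from cobweb.base import Seed

# A plain string that is not valid JSON becomes the "url" attribute.
seed = Seed("https://example.com/page")
print(seed.url)           # https://example.com/page
print(seed.sid)           # md5 of the serialized seed, generated by _init_id
print(seed.params.retry)  # 0, the SeedParams default

# A dict sets arbitrary attributes; reserved names go to SeedParams instead.
seed = Seed({"url": "https://example.com/page", "page": 1}, priority=100)
print(seed["page"])       # 1, via __getitem__
print(seed.missing)       # None: __getattr__ falls back to None
print(seed.to_string)     # compact JSON of the attributes, "params" excluded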
cobweb/constant.py
ADDED
@@ -0,0 +1,94 @@
+
+class CrawlerModel:
+
+    default = "cobweb.crawlers.Crawler"
+    file_air = "cobweb.crawlers.FileCrawlerAir"
+    file_pro = "cobweb.crawlers.FileCrawlerPro"
+
+
+class LauncherModel:
+    task = "launcher model: task"
+    resident = "launcher model: resident"
+
+
+class DownloadModel:
+    common = "download model: common"
+    file = "download model: file"
+
+
+class LogModel:
+    simple = "log model: simple"
+    common = "log model: common"
+    detailed = "log model: detailed"
+
+
+class DealModel:
+    fail = "deal model: fail"
+    done = "deal model: done"
+    poll = "deal model: poll"
+
+
+class LogTemplate:
+
+    console_item = """
+----------------------- start - console pipeline -----------------
+Seed detail \n{seed_detail}
+Parse detail \n{parse_detail}
+----------------------- end - console pipeline ------------------
+"""
+
+    launcher_air_polling = """
+----------------------- start - polling log: {task} -----------------
+memory queues
+    seeds      : {doing_len}
+    to consume : {todo_len}
+    consumed   : {done_len}
+storage queue
+    to upload  : {upload_len}
+----------------------- end - polling log: {task} ------------------
+"""
+
+    launcher_pro_polling = """
+----------------------- start - polling log: {task} -----------------
+memory queues
+    seeds      : {doing_len}
+    to consume : {todo_len}
+    consumed   : {done_len}
+redis queues
+    seeds      : {redis_seed_count}
+    to consume : {redis_todo_len}
+    consuming  : {redis_doing_len}
+storage queue
+    to upload  : {upload_len}
+----------------------- end - polling log: {task} ------------------
+"""
+
+    download_exception = """
+----------------------- download exception -----------------------
+Seed detail \n{detail}
+Seed params
+    retry        : {retry}
+    priority     : {priority}
+    seed_version : {seed_version}
+    identifier   : {identifier}
+exception
+    msg : {exception}
+------------------------------------------------------------------
+"""
+
+    download_info = """
+------------------------ download info ---------------------------
+Seed detail \n{detail}
+Seed params
+    retry        : {retry}
+    priority     : {priority}
+    seed_version : {seed_version}
+    identifier   : {identifier}
+response
+    status : {status} \n{response}
+------------------------------------------------------------------
+"""
+
+    @staticmethod
+    def log_info(item: dict) -> str:
+        return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])
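
As a quick illustration, LogTemplate.log_info renders a dict in the indented key/value layout the templates above embed; the values here are made up:

from cobweb.constant import DealModel, LogTemplate

detail = LogTemplate.log_info({"url": "https://example.com", "sid": "abc123"})
print(LogTemplate.download_exception.format(
    detail=detail,
    retry=1,
    priority=300,
    seed_version=1700000000,
    identifier="",
    exception="TimeoutError: request timed out",
))

# The *Model classes are plain string sentinels; crawlers yield e.g.
# DealModel.done ("deal model: done") to mark a seed as finished.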
cobweb/crawlers/__init__.py
ADDED
@@ -0,0 +1 @@
+from .crawler import Crawler
cobweb/crawlers/base_crawler.py
ADDED
@@ -0,0 +1,144 @@
+import threading
+import time
+import traceback
+
+from inspect import isgenerator
+from typing import Union, Callable, Mapping
+
+from cobweb.base import Queue, Seed, BaseItem, Request, Response, logger
+from cobweb.constant import DealModel, LogTemplate
+from cobweb.utils import download_log_info
+from cobweb import setting
+
+
+class Crawler(threading.Thread):
+
+    def __init__(
+        self,
+        upload_queue: Queue,
+        custom_func: Union[Mapping[str, Callable]],
+        launcher_queue: Union[Mapping[str, Queue]],
+    ):
+        super().__init__()
+
+        self.upload_queue = upload_queue
+        for func_name, _callable in custom_func.items():
+            if isinstance(_callable, Callable):
+                self.__setattr__(func_name, _callable)
+
+        self.launcher_queue = launcher_queue
+
+        self.spider_thread_num = setting.SPIDER_THREAD_NUM
+        self.max_retries = setting.SPIDER_MAX_RETRIES
+
+    @staticmethod
+    def request(seed: Seed) -> Union[Request, BaseItem]:
+        stream = True if setting.DOWNLOAD_MODEL else False
+        yield Request(seed.url, seed, stream=stream, timeout=5)
+
+    @staticmethod
+    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+        response = item.download()
+        yield Response(item.seed, response, **item.to_dict)
+
+    @staticmethod
+    def parse(item: Response) -> BaseItem:
+        pass
+
+    def get_seed(self) -> Seed:
+        return self.launcher_queue['todo'].pop()
+
+    def distribute(self, item, seed):
+        if isinstance(item, BaseItem):
+            self.upload_queue.push(item)
+        elif isinstance(item, Seed):
+            self.launcher_queue['new'].push(item)
+        elif isinstance(item, str) and item == DealModel.poll:
+            self.launcher_queue['todo'].push(seed)
+        elif isinstance(item, str) and item == DealModel.done:
+            self.launcher_queue['done'].push(seed)
+        elif isinstance(item, str) and item == DealModel.fail:
+            seed.params.seed_status = DealModel.fail
+            self.launcher_queue['done'].push(seed)
+        else:
+            raise TypeError("yield value type error!")
+
+    def spider(self):
+        while True:
+            seed = self.get_seed()
+
+            if not seed:
+                continue
+
+            elif seed.params.retry >= self.max_retries:
+                seed.params.seed_status = DealModel.fail
+                self.launcher_queue['done'].push(seed)
+                continue
+
+            seed_detail_log_info = download_log_info(seed.to_dict)
+
+            try:
+                request_iterators = self.request(seed)
+
+                if not isgenerator(request_iterators):
+                    raise TypeError("request function isn't a generator!")
+
+                iterator_status = False
+
+                for request_item in request_iterators:
+
+                    iterator_status = True
+
+                    if isinstance(request_item, Request):
+                        iterator_status = False
+                        download_iterators = self.download(request_item)
+                        if not isgenerator(download_iterators):
+                            raise TypeError("download function isn't a generator")
+
+                        for download_item in download_iterators:
+                            iterator_status = True
+                            if isinstance(download_item, Response):
+                                iterator_status = False
+                                logger.info(LogTemplate.download_info.format(
+                                    detail=seed_detail_log_info,
+                                    retry=seed.params.retry,
+                                    priority=seed.params.priority,
+                                    seed_version=seed.params.seed_version,
+                                    identifier=seed.identifier or "",
+                                    status=download_item.response,
+                                    response=download_log_info(download_item.to_dict)
+                                ))
+                                parse_iterators = self.parse(download_item)
+                                if not isgenerator(parse_iterators):
+                                    raise TypeError("parse function isn't a generator")
+                                for parse_item in parse_iterators:
+                                    iterator_status = True
+                                    if isinstance(parse_item, Response):
+                                        raise TypeError("upload_item can't be a Response instance")
+                                    self.distribute(parse_item, seed)
+                            else:
+                                self.distribute(download_item, seed)
+                    else:
+                        self.distribute(request_item, seed)
+
+                if not iterator_status:
+                    raise ValueError("request/download/parse function yield value error!")
+
+            except Exception as e:
+                logger.info(LogTemplate.download_exception.format(
+                    detail=seed_detail_log_info,
+                    retry=seed.params.retry,
+                    priority=seed.params.priority,
+                    seed_version=seed.params.seed_version,
+                    identifier=seed.identifier or "",
+                    exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                ))
+                seed.params.retry += 1
+                self.launcher_queue['todo'].push(seed)
+            finally:
+                time.sleep(0.1)
+
+    def run(self):
+        for index in range(self.spider_thread_num):
+            threading.Thread(name=f"spider_{index}", target=self.spider).start()
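
To show the intended extension point, a hypothetical subclass of the Crawler above; the launcher supplies the queues, so only the parse override is sketched:

from cobweb.base import Response, Seed
from cobweb.constant import DealModel
from cobweb.crawlers.base_crawler import Crawler


class MyCrawler(Crawler):

    @staticmethod
    def parse(item: Response):
        # distribute() routes each yielded value by type: Seed objects are
        # pushed to the "new" queue, BaseItem subclasses to the upload queue,
        # and DealModel sentinel strings settle the current seed.
        yield Seed({"url": f"{item.seed.url}?page=2"})
        yield DealModel.done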
cobweb/crawlers/crawler.py
ADDED
@@ -0,0 +1,209 @@
+import json
+import threading
+import time
+import traceback
+from inspect import isgenerator
+from typing import Union, Callable, Mapping
+from urllib.parse import urlparse
+
+from requests import HTTPError, Response as Res
+
+from cobweb.constant import DealModel, LogTemplate
+from cobweb.base import (
+    Seed,
+    BaseItem,
+    Request,
+    Response,
+    ConsoleItem,
+    logger
+)
+from cobweb.utils import LoghubDot
+
+
+class Crawler(threading.Thread):
+
+    def __init__(
+        self,
+        task: str,
+        project: str,
+        stop: threading.Event,
+        pause: threading.Event,
+        # launcher_queue: Union[Mapping[str, Queue]],
+        get_seed: Callable,
+        set_seed: Callable,
+        add_seed: Callable,
+        delete_seed: Callable,
+        upload_data: Callable,
+        custom_func: Union[Mapping[str, Callable]],
+        thread_num: int,
+        max_retries: int,
+        time_sleep: int,
+    ):
+        super().__init__()
+        self.task = task
+        self.project = project
+        self._stop = stop
+        self._pause = pause
+        self._get_seed = get_seed
+        self._set_seed = set_seed
+        self._add_seed = add_seed
+        self._delete_seed = delete_seed
+        self._upload_data = upload_data
+
+        for func_name, _callable in custom_func.items():
+            if isinstance(_callable, Callable):
+                self.__setattr__(func_name, _callable)
+
+        self.thread_num = thread_num
+        self.time_sleep = time_sleep
+        self.max_retries = max_retries
+
+        self.loghub_dot = LoghubDot()
+
+    @staticmethod
+    def request(seed: Seed) -> Union[Request, BaseItem]:
+        yield Request(seed.url, seed, timeout=5)
+
+    @staticmethod
+    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+        response = item.download()
+        yield Response(item.seed, response, **item.to_dict)
+
+    @staticmethod
+    def parse(item: Response) -> BaseItem:
+        upload_item = item.to_dict
+        upload_item["text"] = item.response.text
+        yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
+
+    # def get_seed(self) -> Seed:
+    #     return self._todo.pop()
+
+    def distribute(self, item, seed):
+        if isinstance(item, BaseItem):
+            self._upload_data(item)
+        elif isinstance(item, Seed):
+            self._add_seed(item)
+        elif isinstance(item, str) and item == DealModel.poll:
+            self._set_seed(seed)
+        elif isinstance(item, str) and item == DealModel.done:
+            self._delete_seed(seed)
+        elif isinstance(item, str) and item == DealModel.fail:
+            seed.params.seed_status = DealModel.fail
+            self._delete_seed(seed)
+        else:
+            raise TypeError("yield value type error!")
+
+    def spider(self):
+        while not self._stop.is_set():
+
+            seed = self._get_seed()
+
+            if not seed:
+                time.sleep(1)
+                continue
+
+            elif seed.params.retry > self.max_retries:
+                seed.params.seed_status = DealModel.fail
+                self._delete_seed(seed)
+                continue
+
+            seed_detail_log_info = LogTemplate.log_info(seed.to_dict)
+
+            try:
+                request_iterators = self.request(seed)
+
+                if not isgenerator(request_iterators):
+                    raise TypeError("request function isn't a generator!")
+
+                iterator_status = False
+
+                for request_item in request_iterators:
+
+                    iterator_status = True
+
+                    if isinstance(request_item, Request):
+                        iterator_status = False
+                        start_time = time.time()
+                        download_iterators = self.download(request_item)
+                        if not isgenerator(download_iterators):
+                            raise TypeError("download function isn't a generator")
+
+                        for download_item in download_iterators:
+                            iterator_status = True
+                            if isinstance(download_item, Response):
+                                iterator_status = False
+                                logger.info(LogTemplate.download_info.format(
+                                    detail=seed_detail_log_info,
+                                    retry=seed.params.retry,
+                                    priority=seed.params.priority,
+                                    seed_version=seed.params.seed_version,
+                                    identifier=seed.identifier or "",
+                                    status=download_item.response,
+                                    response=LogTemplate.log_info(download_item.to_dict)
+                                ))
+                                if isinstance(download_item.response, Res):
+                                    end_time = time.time()
+                                    self.loghub_dot.build(
+                                        topic=urlparse(download_item.response.request.url).netloc,
+                                        data_size=int(download_item.response.headers.get("content-length", 0)),
+                                        cost_time=end_time - start_time, status=200,
+                                        url=download_item.response.url,
+                                    )
+                                parse_iterators = self.parse(download_item)
+                                if not isgenerator(parse_iterators):
+                                    raise TypeError("parse function isn't a generator")
+                                for parse_item in parse_iterators:
+                                    iterator_status = True
+                                    if isinstance(parse_item, Response):
+                                        raise TypeError("upload_item can't be a Response instance")
+                                    self.distribute(parse_item, seed)
+                            else:
+                                self.distribute(download_item, seed)
+                    else:
+                        self.distribute(request_item, seed)
+
+                if not iterator_status:
+                    raise ValueError("request/download/parse function yield value error!")
+            except HTTPError as e:
+                if isinstance(e.response, Res):
+                    url = e.response.request.url
+                    status = e.response.status_code
+                    exception_msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                    self.loghub_dot.build(
+                        topic=urlparse(url).netloc,
+                        data_size=-1, cost_time=-1,
+                        status=status, url=url,
+                        msg=exception_msg
+                    )
+                logger.info(LogTemplate.download_exception.format(
+                    detail=seed_detail_log_info,
+                    retry=seed.params.retry,
+                    priority=seed.params.priority,
+                    seed_version=seed.params.seed_version,
+                    identifier=seed.identifier or "",
+                    exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                ))
+                seed.params.retry += 1
+                self._set_seed(seed)
+                time.sleep(self.time_sleep * seed.params.retry)
+            except Exception as e:
+                logger.info(LogTemplate.download_exception.format(
+                    detail=seed_detail_log_info,
+                    retry=seed.params.retry,
+                    priority=seed.params.priority,
+                    seed_version=seed.params.seed_version,
+                    identifier=seed.identifier or "",
+                    exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                ))
+                seed.params.retry += 1
+                # self._todo.push(seed)
+                self._set_seed(seed)
+                # time.sleep(self.time_sleep * seed.params.retry)
+            finally:
+                time.sleep(0.1)
+        logger.info("spider thread close")
+
+    def run(self):
+        for index in range(self.thread_num):
+            threading.Thread(name=f"spider_{index}", target=self.spider).start()
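
The same routing contract applies here, but through the injected callables (_upload_data, _add_seed, _set_seed, _delete_seed). Below is a hypothetical override of all three stages; the raise_for_status call is illustrative and assumes item.download() returns a requests.Response, as the HTTPError handling above suggests:

from cobweb.base import ConsoleItem, Request, Response, Seed
from cobweb.crawlers import Crawler


class NewsCrawler(Crawler):

    @staticmethod
    def request(seed: Seed):
        yield Request(seed.url, seed, timeout=10)

    @staticmethod
    def download(item: Request):
        response = item.download()
        response.raise_for_status()  # an HTTPError here hits the backoff branch above
        yield Response(item.seed, response, **item.to_dict)

    @staticmethod
    def parse(item: Response):
        yield ConsoleItem(item.seed, data=item.response.text)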
cobweb/crawlers/file_crawler.py
ADDED
@@ -0,0 +1,98 @@
+import os
+from typing import Union
+from cobweb import setting
+from cobweb.utils import OssUtil
+from cobweb.crawlers import Crawler
+from cobweb.base import Seed, BaseItem, Request, Response
+from cobweb.exceptions import OssDBPutPartError, OssDBMergeError
+
+
+oss_util = OssUtil(is_path_style=bool(int(os.getenv("PRIVATE_LINK", 0))))
+
+
+class FileCrawlerAir(Crawler):
+
+    @staticmethod
+    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+        seed_dict = item.seed.to_dict
+        seed_dict["bucket_name"] = oss_util.bucket
+        try:
+            seed_dict["oss_path"] = key = item.seed.oss_path or getattr(item, "oss_path")
+
+            if oss_util.exists(key):
+                seed_dict["data_size"] = oss_util.head(key).content_length
+                yield Response(item.seed, "exists", **seed_dict)
+
+            else:
+                seed_dict.setdefault("end", "")
+                seed_dict.setdefault("start", 0)
+
+                if seed_dict["end"] or seed_dict["start"]:
+                    start, end = seed_dict["start"], seed_dict["end"]
+                    item.request_setting["headers"]['Range'] = f'bytes={start}-{end}'
+
+                if not item.seed.identifier:
+                    content = b""
+                    chunk_size = oss_util.chunk_size
+                    min_upload_size = oss_util.min_upload_size
+                    seed_dict.setdefault("position", 1)
+
+                    response = item.download()
+
+                    content_type = response.headers.get("content-type", "").split(";")[0]
+                    seed_dict["data_size"] = content_length = int(response.headers.get("content-length", 0))
+
+                    if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
+                        """filter out unwanted response content types"""
+                        response.close()
+                        seed_dict["filter"] = True
+                        seed_dict["msg"] = f"response content type is {content_type}"
+                        yield Response(item.seed, response, **seed_dict)
+
+                    elif seed_dict['position'] == 1 and min_upload_size >= content_length > 0:
+                        """flag files that are too small and return"""
+                        response.close()
+                        seed_dict["filter"] = True
+                        seed_dict["msg"] = "file size is too small"
+                        yield Response(item.seed, response, **seed_dict)
+
+                    elif seed_dict['position'] == 1 and chunk_size > content_length > min_upload_size:
+                        """download small files directly"""
+                        for part_data in response.iter_content(chunk_size):
+                            content += part_data
+                        response.close()
+                        oss_util.put(key, content)
+                        yield Response(item.seed, response, **seed_dict)
+
+                    else:
+                        """synchronous multipart download for medium and large files"""
+                        seed_dict.setdefault("upload_id", oss_util.init_part(key).upload_id)
+
+                        for part_data in response.iter_content(chunk_size):
+                            content += part_data
+                            if len(content) >= chunk_size:
+                                upload_data = content[:chunk_size]
+                                content = content[chunk_size:]
+                                oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], upload_data)
+                                seed_dict['start'] += len(upload_data)
+                                seed_dict['position'] += 1
+
+                        response.close()
+
+                        if content:
+                            oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], content)
+                        oss_util.merge(key, seed_dict["upload_id"])
+                        seed_dict["data_size"] = oss_util.head(key).content_length
+                        yield Response(item.seed, response, **seed_dict)
+
+                elif item.seed.identifier == "merge":
+                    oss_util.merge(key, seed_dict["upload_id"])
+                    seed_dict["data_size"] = oss_util.head(key).content_length
+                    yield Response(item.seed, "merge", **seed_dict)
+
+        except OssDBPutPartError:
+            yield Seed(seed_dict)
+        except OssDBMergeError:
+            yield Seed(seed_dict, identifier="merge")
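
All of the branching above is driven by fields on the seed itself. A hypothetical seed resuming a ranged multipart transfer might look like this; the field names come from the code, the values are invented:

from cobweb.base import Seed

seed = Seed({
    "url": "https://example.com/big.zip",
    "oss_path": "files/big.zip",  # object key probed via oss_util.exists()
    "start": 10485760,            # resume offset, sent as a Range header
    "end": "",                    # open-ended byte range
    "position": 2,                # next multipart part number
    "upload_id": "abc123",        # reuse an existing multipart upload session
})

# With identifier="merge", download() skips fetching and only merges parts:
merge_seed = Seed(seed.to_dict, identifier="merge")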
cobweb/db/__init__.py
CHANGED
@@ -1,2 +1,2 @@
-from . import
-from . import
+from .redis_db import RedisDB
+from .api_db import ApiDB