cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.42__py3-none-any.whl

Files changed (67)
  1. cobweb/__init__.py +2 -11
  2. cobweb/base/__init__.py +9 -0
  3. cobweb/base/basic.py +297 -0
  4. cobweb/base/common_queue.py +30 -0
  5. cobweb/base/decorators.py +40 -0
  6. cobweb/base/dotting.py +35 -0
  7. cobweb/base/item.py +46 -0
  8. cobweb/{log.py → base/log.py} +4 -6
  9. cobweb/base/request.py +82 -0
  10. cobweb/base/response.py +23 -0
  11. cobweb/base/seed.py +114 -0
  12. cobweb/constant.py +94 -0
  13. cobweb/crawlers/__init__.py +1 -0
  14. cobweb/crawlers/base_crawler.py +144 -0
  15. cobweb/crawlers/crawler.py +212 -0
  16. cobweb/crawlers/file_crawler.py +98 -0
  17. cobweb/db/__init__.py +2 -2
  18. cobweb/db/api_db.py +82 -0
  19. cobweb/db/redis_db.py +125 -218
  20. cobweb/exceptions/__init__.py +1 -0
  21. cobweb/exceptions/oss_db_exception.py +28 -0
  22. cobweb/launchers/__init__.py +3 -0
  23. cobweb/launchers/launcher.py +235 -0
  24. cobweb/launchers/launcher_air.py +88 -0
  25. cobweb/launchers/launcher_api.py +209 -0
  26. cobweb/launchers/launcher_pro.py +208 -0
  27. cobweb/pipelines/__init__.py +3 -0
  28. cobweb/pipelines/pipeline.py +69 -0
  29. cobweb/pipelines/pipeline_console.py +22 -0
  30. cobweb/pipelines/pipeline_loghub.py +34 -0
  31. cobweb/schedulers/__init__.py +3 -0
  32. cobweb/schedulers/scheduler_api.py +72 -0
  33. cobweb/schedulers/scheduler_redis.py +72 -0
  34. cobweb/setting.py +67 -6
  35. cobweb/utils/__init__.py +5 -0
  36. cobweb/utils/bloom.py +58 -0
  37. cobweb/utils/dotting.py +32 -0
  38. cobweb/utils/oss.py +94 -0
  39. cobweb/utils/tools.py +42 -0
  40. cobweb_launcher-1.2.42.dist-info/METADATA +205 -0
  41. cobweb_launcher-1.2.42.dist-info/RECORD +44 -0
  42. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/WHEEL +1 -1
  43. cobweb/bbb.py +0 -191
  44. cobweb/db/oss_db.py +0 -127
  45. cobweb/db/scheduler/__init__.py +0 -0
  46. cobweb/db/scheduler/default.py +0 -8
  47. cobweb/db/scheduler/textfile.py +0 -27
  48. cobweb/db/storer/__init__.py +0 -0
  49. cobweb/db/storer/console.py +0 -9
  50. cobweb/db/storer/loghub.py +0 -54
  51. cobweb/db/storer/redis.py +0 -15
  52. cobweb/db/storer/textfile.py +0 -15
  53. cobweb/decorators.py +0 -16
  54. cobweb/distributed/__init__.py +0 -0
  55. cobweb/distributed/launcher.py +0 -243
  56. cobweb/distributed/models.py +0 -143
  57. cobweb/interface.py +0 -34
  58. cobweb/single/__init__.py +0 -0
  59. cobweb/single/launcher.py +0 -231
  60. cobweb/single/models.py +0 -134
  61. cobweb/single/nest.py +0 -153
  62. cobweb/task.py +0 -50
  63. cobweb/utils.py +0 -90
  64. cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
  65. cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
  66. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/LICENSE +0 -0
  67. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.42.dist-info}/top_level.txt +0 -0
cobweb/base/seed.py ADDED
@@ -0,0 +1,114 @@
+ import json
+ import time
+ import hashlib
+
+
+ class SeedParams:
+
+     def __init__(self, retry, priority, seed_version, seed_status=None):
+         self.retry = retry or 0
+         self.priority = priority or 300
+         self.seed_version = seed_version or int(time.time())
+         self.seed_status = seed_status
+
+
+ class Seed:
+
+     __SEED_PARAMS__ = [
+         "retry",
+         "priority",
+         "seed_version",
+         "seed_status"
+     ]
+
+     def __init__(
+             self,
+             seed,
+             sid=None,
+             retry=None,
+             priority=None,
+             seed_version=None,
+             seed_status=None,
+             **kwargs
+     ):
+         if any(isinstance(seed, t) for t in (str, bytes)):
+             try:
+                 item = json.loads(seed)
+                 self._init_seed(item)
+             except json.JSONDecodeError:
+                 self.__setattr__("url", seed)
+         elif isinstance(seed, dict):
+             self._init_seed(seed)
+         else:
+             raise TypeError(
+                 f"seed type error, "
+                 f"must be str or dict! "
+                 f"seed: {seed}"
+             )
+
+         seed_params = {
+             "retry": retry,
+             "priority": priority,
+             "seed_version": seed_version,
+             "seed_status": seed_status,
+         }
+
+         if kwargs:
+             self._init_seed(kwargs)
+             seed_params.update({
+                 k: v for k, v in kwargs.items()
+                 if k in self.__SEED_PARAMS__
+             })
+         if sid or not getattr(self, "sid", None):
+             self._init_id(sid)
+         self.params = SeedParams(**seed_params)
+
+     def __getattr__(self, name):
+         return None
+
+     def __setitem__(self, key, value):
+         setattr(self, key, value)
+
+     def __getitem__(self, item):
+         return getattr(self, item)
+
+     def __str__(self):
+         return json.dumps(self.__dict__, ensure_ascii=False)
+
+     def __repr__(self):
+         chars = [f"{k}={v}" for k, v in self.__dict__.items()]
+         return f'{self.__class__.__name__}({", ".join(chars)})'
+
+     def _init_seed(self, seed_info: dict):
+         for k, v in seed_info.items():
+             if k not in self.__SEED_PARAMS__:
+                 self.__setattr__(k, v)
+
+     def _init_id(self, sid):
+         if not sid:
+             sid = hashlib.md5(self.to_string.encode()).hexdigest()
+         self.__setattr__("sid", sid)
+
+     @property
+     def to_dict(self) -> dict:
+         seed = self.__dict__.copy()
+         if seed.get("params"):
+             del seed["params"]
+         return seed
+
+     @property
+     def to_string(self) -> str:
+         return json.dumps(
+             self.to_dict,
+             ensure_ascii=False,
+             separators=(",", ":")
+         )
+
+     @property
+     def get_all(self):
+         return json.dumps(
+             self.__dict__,
+             ensure_ascii=False,
+             separators=(",", ":")
+         )
+
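
For orientation, a minimal usage sketch for the Seed class above (not part of the diff; assumes the class imports as shown and illustrative values throughout):

    from cobweb.base.seed import Seed

    # A bare URL string fails json.loads, so it is stored as the `url` attribute.
    s1 = Seed("https://example.com/page", priority=100)

    # A dict (or its JSON string) maps non-param keys onto attributes; the keys
    # listed in __SEED_PARAMS__ are collected into SeedParams instead.
    s2 = Seed({"url": "https://example.com/page", "category": "news"})

    print(s2.sid)        # md5 of the compact JSON, set by _init_id
    print(s2.to_string)  # compact JSON of the attributes, `params` excluded
    print(s2.topic)      # None: unknown attributes fall through __getattr__
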
cobweb/constant.py ADDED
@@ -0,0 +1,94 @@
+
+ class CrawlerModel:
+
+     default = "cobweb.crawlers.Crawler"
+     file_air = "cobweb.crawlers.FileCrawlerAir"
+     file_pro = "cobweb.crawlers.FileCrawlerPro"
+
+
+ class LauncherModel:
+     task = "launcher model: task"
+     resident = "launcher model: resident"
+
+
+ class DownloadModel:
+     common = "download model: common"
+     file = "download model: file"
+
+
+ class LogModel:
+     simple = "log model: simple"
+     common = "log model: common"
+     detailed = "log model: detailed"
+
+
+ class DealModel:
+     fail = "deal model: fail"
+     done = "deal model: done"
+     poll = "deal model: poll"
+
+
+ class LogTemplate:
+
+     console_item = """
+ ----------------------- start - console pipeline -----------------
+ 种子详情 \n{seed_detail}
+ 解析详情 \n{parse_detail}
+ ----------------------- end - console pipeline ------------------
+ """
+
+     launcher_air_polling = """
+ ----------------------- start - 轮训日志: {task} -----------------
+ 内存队列
+ 种子数: {doing_len}
+ 待消费: {todo_len}
+ 已消费: {done_len}
+ 存储队列
+ 待上传: {upload_len}
+ ----------------------- end - 轮训日志: {task} ------------------
+ """
+
+     launcher_pro_polling = """
+ ----------------------- start - 轮训日志: {task} -----------------
+ 内存队列
+ 种子数: {doing_len}
+ 待消费: {todo_len}
+ 已消费: {done_len}
+ redis队列
+ 种子数: {redis_seed_count}
+ 待消费: {redis_todo_len}
+ 消费中: {redis_doing_len}
+ 存储队列
+ 待上传: {upload_len}
+ ----------------------- end - 轮训日志: {task} ------------------
+ """
+
+     download_exception = """
+ ----------------------- download exception -----------------------
+ 种子详情 \n{detail}
+ 种子参数
+ retry : {retry}
+ priority : {priority}
+ seed_version : {seed_version}
+ identifier : {identifier}
+ exception
+ msg : {exception}
+ ------------------------------------------------------------------
+ """
+
+     download_info = """
+ ------------------------ download info ---------------------------
+ 种子详情 \n{detail}
+ 种子参数
+ retry : {retry}
+ priority : {priority}
+ seed_version : {seed_version}
+ identifier : {identifier}
+ response
+ status : {status} \n{response}
+ ------------------------------------------------------------------
+ """
+
+     @staticmethod
+     def log_info(item: dict) -> str:
+         return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])
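
As an illustration (not from the diff), LogTemplate.log_info renders the dict that fills the {detail} and {response} slots of the templates above, one padded key-value pair per line:

    print(LogTemplate.log_info({"url": "https://example.com", "sid": "abc"}))
    # 12 leading spaces, keys left-justified to 14 characters:
    #             url           : https://example.com
    #             sid           : abc
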
cobweb/crawlers/__init__.py ADDED
@@ -0,0 +1 @@
+ from .crawler import Crawler
cobweb/crawlers/base_crawler.py ADDED
@@ -0,0 +1,144 @@
+ import threading
+ import time
+ import traceback
+
+ from inspect import isgenerator
+ from typing import Union, Callable, Mapping
+
+ from cobweb.base import Queue, Seed, BaseItem, Request, Response, logger
+ from cobweb.constant import DealModel, LogTemplate
+ from cobweb.utils import download_log_info
+ from cobweb import setting
+
+
+ class Crawler(threading.Thread):
+
+     def __init__(
+             self,
+             upload_queue: Queue,
+             custom_func: Union[Mapping[str, Callable]],
+             launcher_queue: Union[Mapping[str, Queue]],
+     ):
+         super().__init__()
+
+         self.upload_queue = upload_queue
+         for func_name, _callable in custom_func.items():
+             if isinstance(_callable, Callable):
+                 self.__setattr__(func_name, _callable)
+
+         self.launcher_queue = launcher_queue
+
+         self.spider_thread_num = setting.SPIDER_THREAD_NUM
+         self.max_retries = setting.SPIDER_MAX_RETRIES
+
+     @staticmethod
+     def request(seed: Seed) -> Union[Request, BaseItem]:
+         stream = True if setting.DOWNLOAD_MODEL else False
+         yield Request(seed.url, seed, stream=stream, timeout=5)
+
+     @staticmethod
+     def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+         response = item.download()
+         yield Response(item.seed, response, **item.to_dict)
+
+     @staticmethod
+     def parse(item: Response) -> BaseItem:
+         pass
+
+     def get_seed(self) -> Seed:
+         return self.launcher_queue['todo'].pop()
+
+     def distribute(self, item, seed):
+         if isinstance(item, BaseItem):
+             self.upload_queue.push(item)
+         elif isinstance(item, Seed):
+             self.launcher_queue['new'].push(item)
+         elif isinstance(item, str) and item == DealModel.poll:
+             self.launcher_queue['todo'].push(seed)
+         elif isinstance(item, str) and item == DealModel.done:
+             self.launcher_queue['done'].push(seed)
+         elif isinstance(item, str) and item == DealModel.fail:
+             seed.params.seed_status = DealModel.fail
+             self.launcher_queue['done'].push(seed)
+         else:
+             raise TypeError("yield value type error!")
+
+     def spider(self):
+         while True:
+             seed = self.get_seed()
+
+             if not seed:
+                 continue
+
+             elif seed.params.retry >= self.max_retries:
+                 seed.params.seed_status = DealModel.fail
+                 self.launcher_queue['done'].push(seed)
+                 continue
+
+             seed_detail_log_info = download_log_info(seed.to_dict)
+
+             try:
+                 request_iterators = self.request(seed)
+
+                 if not isgenerator(request_iterators):
+                     raise TypeError("request function isn't a generator!")
+
+                 iterator_status = False
+
+                 for request_item in request_iterators:
+
+                     iterator_status = True
+
+                     if isinstance(request_item, Request):
+                         iterator_status = False
+                         download_iterators = self.download(request_item)
+                         if not isgenerator(download_iterators):
+                             raise TypeError("download function isn't a generator")
+
+                         for download_item in download_iterators:
+                             iterator_status = True
+                             if isinstance(download_item, Response):
+                                 iterator_status = False
+                                 logger.info(LogTemplate.download_info.format(
+                                     detail=seed_detail_log_info,
+                                     retry=seed.params.retry,
+                                     priority=seed.params.priority,
+                                     seed_version=seed.params.seed_version,
+                                     identifier=seed.identifier or "",
+                                     status=download_item.response,
+                                     response=download_log_info(download_item.to_dict)
+                                 ))
+                                 parse_iterators = self.parse(download_item)
+                                 if not isgenerator(parse_iterators):
+                                     raise TypeError("parse function isn't a generator")
+                                 for parse_item in parse_iterators:
+                                     iterator_status = True
+                                     if isinstance(parse_item, Response):
+                                         raise TypeError("upload_item can't be a Response instance")
+                                     self.distribute(parse_item, seed)
+                             else:
+                                 self.distribute(download_item, seed)
+                     else:
+                         self.distribute(request_item, seed)
+
+                 if not iterator_status:
+                     raise ValueError("request/download/parse function yield value error!")
+
+             except Exception as e:
+                 logger.info(LogTemplate.download_exception.format(
+                     detail=seed_detail_log_info,
+                     retry=seed.params.retry,
+                     priority=seed.params.priority,
+                     seed_version=seed.params.seed_version,
+                     identifier=seed.identifier or "",
+                     exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                 ))
+                 seed.params.retry += 1
+                 self.launcher_queue['todo'].push(seed)
+             finally:
+                 time.sleep(0.1)
+
+     def run(self):
+         for index in range(self.spider_thread_num):
+             threading.Thread(name=f"spider_{index}", target=self.spider).start()
+
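
The three hooks above (request/download/parse) must be generators; spider() dispatches every value they yield by type through distribute(): Request routes into download(), Response into parse(), Seed onto the 'new' queue, BaseItem onto upload_queue, and DealModel strings settle the seed. A hypothetical subclass sketch under that contract (names from the diff, logic illustrative only):

    from cobweb.base import Seed, Request, Response
    from cobweb.constant import DealModel

    class NewsCrawler(Crawler):  # hypothetical subclass, illustration only

        @staticmethod
        def request(seed: Seed):
            # Yielding a Request hands off to download(); yielding a Seed
            # would instead push a new seed onto launcher_queue['new'].
            yield Request(seed.url, seed, timeout=10)

        @staticmethod
        def parse(item: Response):
            # Yield a BaseItem subclass to push onto upload_queue, or a
            # DealModel string to settle the seed; yielding a Response raises.
            if "captcha" in getattr(item.response, "text", ""):
                yield DealModel.fail   # marks the seed failed, pushes to 'done'
            else:
                yield DealModel.done   # acknowledges the seed as consumed
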
cobweb/crawlers/crawler.py ADDED
@@ -0,0 +1,212 @@
+ import json
+ import threading
+ import time
+ import traceback
+ from inspect import isgenerator
+ from typing import Union, Callable, Mapping
+ from urllib.parse import urlparse
+
+ import urllib3
+ from requests import HTTPError, Response as Res
+
+ from cobweb.constant import DealModel, LogTemplate
+ from cobweb.base import (
+     Seed,
+     BaseItem,
+     Request,
+     Response,
+     ConsoleItem,
+     logger
+ )
+ from cobweb.utils import LoghubDot
+
+
+ class Crawler(threading.Thread):
+
+     def __init__(
+             self,
+             task: str,
+             project: str,
+             stop: threading.Event,
+             pause: threading.Event,
+             # launcher_queue: Union[Mapping[str, Queue]],
+             get_seed: Callable,
+             set_seed: Callable,
+             add_seed: Callable,
+             delete_seed: Callable,
+             upload_data: Callable,
+             custom_func: Union[Mapping[str, Callable]],
+             thread_num: int,
+             max_retries: int,
+             time_sleep: int,
+     ):
+         super().__init__()
+         self.task = task
+         self.project = project
+         self._stop = stop
+         self._pause = pause
+         self._get_seed = get_seed
+         self._set_seed = set_seed
+         self._add_seed = add_seed
+         self._delete_seed = delete_seed
+         self._upload_data = upload_data
+
+         for func_name, _callable in custom_func.items():
+             if isinstance(_callable, Callable):
+                 self.__setattr__(func_name, _callable)
+
+         self.thread_num = thread_num
+         self.time_sleep = time_sleep
+         self.max_retries = max_retries
+
+         self.loghub_dot = LoghubDot()
+
+     @staticmethod
+     def request(seed: Seed) -> Union[Request, BaseItem]:
+         yield Request(seed.url, seed, timeout=5)
+
+     @staticmethod
+     def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+         response = item.download()
+         yield Response(item.seed, response, **item.to_dict)
+
+     @staticmethod
+     def parse(item: Response) -> BaseItem:
+         upload_item = item.to_dict
+         upload_item["text"] = item.response.text
+         yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
+
+     # def get_seed(self) -> Seed:
+     #     return self._todo.pop()
+
+     def distribute(self, item, seed):
+         if isinstance(item, BaseItem):
+             self._upload_data(item)
+         elif isinstance(item, Seed):
+             self._add_seed(item)
+         elif isinstance(item, str) and item == DealModel.poll:
+             self._set_seed(seed)
+         elif isinstance(item, str) and item == DealModel.done:
+             self._delete_seed(seed)
+         elif isinstance(item, str) and item == DealModel.fail:
+             seed.params.seed_status = DealModel.fail
+             self._delete_seed(seed)
+         else:
+             raise TypeError("yield value type error!")
+
+     def spider(self):
+         while not self._stop.is_set():
+
+             seed = self._get_seed()
+
+             if not seed:
+                 time.sleep(1)
+                 continue
+
+             elif seed.params.retry > self.max_retries:
+                 seed.params.seed_status = DealModel.fail
+                 self._delete_seed(seed)
+                 continue
+
+             seed_detail_log_info = LogTemplate.log_info(seed.to_dict)
+
+             try:
+                 request_iterators = self.request(seed)
+
+                 if not isgenerator(request_iterators):
+                     raise TypeError("request function isn't a generator!")
+
+                 iterator_status = False
+
+                 for request_item in request_iterators:
+
+                     iterator_status = True
+
+                     if isinstance(request_item, Request):
+                         iterator_status = False
+                         start_time = time.time()
+                         download_iterators = self.download(request_item)
+                         if not isgenerator(download_iterators):
+                             raise TypeError("download function isn't a generator")
+
+                         for download_item in download_iterators:
+                             iterator_status = True
+                             if isinstance(download_item, Response):
+                                 iterator_status = False
+                                 logger.info(LogTemplate.download_info.format(
+                                     detail=seed_detail_log_info,
+                                     retry=seed.params.retry,
+                                     priority=seed.params.priority,
+                                     seed_version=seed.params.seed_version,
+                                     identifier=seed.identifier or "",
+                                     status=download_item.response,
+                                     response=LogTemplate.log_info(download_item.to_dict)
+                                 ))
+                                 if isinstance(download_item.response, Res):
+                                     end_time = time.time()
+                                     self.loghub_dot.build(
+                                         topic=urlparse(download_item.response.request.url).netloc,
+                                         data_size=int(download_item.response.headers.get("content-length", 0)),
+                                         cost_time=end_time - start_time, status=200,
+                                         url=download_item.response.url,
+                                     )
+                                 parse_iterators = self.parse(download_item)
+                                 if not isgenerator(parse_iterators):
+                                     raise TypeError("parse function isn't a generator")
+                                 for parse_item in parse_iterators:
+                                     iterator_status = True
+                                     if isinstance(parse_item, Response):
+                                         raise TypeError("upload_item can't be a Response instance")
+                                     self.distribute(parse_item, seed)
+                             else:
+                                 self.distribute(download_item, seed)
+                     else:
+                         self.distribute(request_item, seed)
+
+                 if not iterator_status:
+                     raise ValueError("request/download/parse function yield value error!")
+             except (HTTPError, urllib3.exceptions.HTTPError, urllib3.exceptions.PoolError) as e:
+                 exception_msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                 url = seed.url
+                 status = str(e)
+                 if getattr(e, "response", None) and isinstance(e.response, Res):
+                     url = e.response.request.url
+                     status = e.response.status_code
+                 self.loghub_dot.build(
+                     topic=urlparse(url).netloc,
+                     data_size=-1, cost_time=-1,
+                     status=status, url=url,
+                     msg=exception_msg
+                 )
+                 logger.info(LogTemplate.download_exception.format(
+                     detail=seed_detail_log_info,
+                     retry=seed.params.retry,
+                     priority=seed.params.priority,
+                     seed_version=seed.params.seed_version,
+                     identifier=seed.identifier or "",
+                     exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                 ))
+                 seed.params.retry += 1
+                 self._set_seed(seed)
+                 # time.sleep(self.time_sleep * seed.params.retry)
+             except Exception as e:
+                 logger.info(LogTemplate.download_exception.format(
+                     detail=seed_detail_log_info,
+                     retry=seed.params.retry,
+                     priority=seed.params.priority,
+                     seed_version=seed.params.seed_version,
+                     identifier=seed.identifier or "",
+                     exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                 ))
+                 seed.params.retry += 1
+                 # self._todo.push(seed)
+                 self._set_seed(seed)
+                 # time.sleep(self.time_sleep * seed.params.retry)
+             finally:
+                 time.sleep(0.1)
+         logger.info("spider thread close")
+
+     def run(self):
+         for index in range(self.thread_num):
+             threading.Thread(name=f"spider_{index}", target=self.spider).start()
+
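
For context, a hypothetical wiring sketch (not from the diff): unlike the base crawler, this Crawler receives its storage operations as plain callables, so the launcher can back them with Redis, the API scheduler, or, as here, a bare in-memory queue. Assumes LoghubDot() can be constructed without extra configuration:

    import threading
    from collections import deque

    todo = deque()
    crawler = Crawler(
        task="demo", project="demo",
        stop=threading.Event(), pause=threading.Event(),
        get_seed=lambda: todo.popleft() if todo else None,  # Seed or None
        set_seed=todo.append,            # re-queue on retry/poll
        add_seed=todo.append,            # newly discovered seeds
        delete_seed=lambda seed: None,   # acknowledge done/fail
        upload_data=print,               # stand-in for a pipeline
        custom_func={}, thread_num=4, max_retries=5, time_sleep=10,
    )
    crawler.start()  # run() spawns `thread_num` spider threads
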
cobweb/crawlers/file_crawler.py ADDED
@@ -0,0 +1,98 @@
+ import os
+ from typing import Union
+ from cobweb import setting
+ from cobweb.utils import OssUtil
+ from cobweb.crawlers import Crawler
+ from cobweb.base import Seed, BaseItem, Request, Response
+ from cobweb.exceptions import OssDBPutPartError, OssDBMergeError
+
+
+ oss_util = OssUtil(is_path_style=bool(int(os.getenv("PRIVATE_LINK", 0))))
+
+
+ class FileCrawlerAir(Crawler):
+
+     @staticmethod
+     def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+         seed_dict = item.seed.to_dict
+         seed_dict["bucket_name"] = oss_util.bucket
+         try:
+             seed_dict["oss_path"] = key = item.seed.oss_path or getattr(item, "oss_path")
+
+             if oss_util.exists(key):
+                 seed_dict["data_size"] = oss_util.head(key).content_length
+                 yield Response(item.seed, "exists", **seed_dict)
+
+             else:
+                 seed_dict.setdefault("end", "")
+                 seed_dict.setdefault("start", 0)
+
+                 if seed_dict["end"] or seed_dict["start"]:
+                     start, end = seed_dict["start"], seed_dict["end"]
+                     item.request_setting["headers"]['Range'] = f'bytes={start}-{end}'
+
+                 if not item.seed.identifier:
+                     content = b""
+                     chunk_size = oss_util.chunk_size
+                     min_upload_size = oss_util.min_upload_size
+                     seed_dict.setdefault("position", 1)
+
+                     response = item.download()
+
+                     content_type = response.headers.get("content-type", "").split(";")[0]
+                     seed_dict["data_size"] = content_length = int(response.headers.get("content-length", 0))
+
+                     if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
+                         """过滤响应文件类型"""  # filtered response content type
+                         response.close()
+                         seed_dict["filter"] = True
+                         seed_dict["msg"] = f"response content type is {content_type}"
+                         yield Response(item.seed, response, **seed_dict)
+
+                     elif seed_dict['position'] == 1 and min_upload_size >= content_length > 0:
+                         """过小文件标识返回"""  # file below min size: flag and return
+                         response.close()
+                         seed_dict["filter"] = True
+                         seed_dict["msg"] = "file size is too small"
+                         yield Response(item.seed, response, **seed_dict)
+
+                     elif seed_dict['position'] == 1 and chunk_size > content_length > min_upload_size:
+                         """小文件直接下载"""  # small file: single direct upload
+                         for part_data in response.iter_content(chunk_size):
+                             content += part_data
+                         response.close()
+                         oss_util.put(key, content)
+                         yield Response(item.seed, response, **seed_dict)
+
+                     else:
+                         """中大文件同步分片下载"""  # medium/large file: synchronous multipart upload
+                         seed_dict.setdefault("upload_id", oss_util.init_part(key).upload_id)
+
+                         for part_data in response.iter_content(chunk_size):
+                             content += part_data
+                             if len(content) >= chunk_size:
+                                 upload_data = content[:chunk_size]
+                                 content = content[chunk_size:]
+                                 oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], upload_data)
+                                 seed_dict['start'] += len(upload_data)
+                                 seed_dict['position'] += 1
+
+                         response.close()
+
+                         if content:
+                             oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], content)
+                         oss_util.merge(key, seed_dict["upload_id"])
+                         seed_dict["data_size"] = oss_util.head(key).content_length
+                         yield Response(item.seed, response, **seed_dict)
+
+                 elif item.seed.identifier == "merge":
+                     oss_util.merge(key, seed_dict["upload_id"])
+                     seed_dict["data_size"] = oss_util.head(key).content_length
+                     yield Response(item.seed, "merge", **seed_dict)
+
+         except OssDBPutPartError:
+             yield Seed(seed_dict)
+         except OssDBMergeError:
+             yield Seed(seed_dict, identifier="merge")
+
+
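
To make the control flow above concrete, hypothetical seed shapes this downloader would consume (field names from the diff, all values illustrative):

    from cobweb.base import Seed

    # Fresh file seed: routed to direct put or multipart upload depending on
    # content-length vs. oss_util.min_upload_size / oss_util.chunk_size.
    fresh = Seed({"url": "https://example.com/big.bin", "oss_path": "files/big.bin"})

    # Interrupted multipart upload: OssDBPutPartError re-yields the seed with
    # start/position/upload_id intact, so the retry resumes via a Range header.
    resumed = Seed({
        "url": "https://example.com/big.bin",
        "oss_path": "files/big.bin",
        "start": 8388608, "position": 2, "upload_id": "example-upload-id",
    })

    # Failed merge: OssDBMergeError re-yields with identifier="merge", which
    # skips downloading and only retries oss_util.merge.
    merge_retry = Seed(resumed.to_dict, identifier="merge")
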
cobweb/db/__init__.py CHANGED
@@ -1,2 +1,2 @@
- from . import oss_db, redis_db
- from . import scheduler, storer
+ from .redis_db import RedisDB
+ from .api_db import ApiDB