cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. cobweb/__init__.py +2 -11
  2. cobweb/base/__init__.py +9 -0
  3. cobweb/base/basic.py +297 -0
  4. cobweb/base/common_queue.py +30 -0
  5. cobweb/base/decorators.py +40 -0
  6. cobweb/base/dotting.py +35 -0
  7. cobweb/base/item.py +46 -0
  8. cobweb/{log.py → base/log.py} +4 -6
  9. cobweb/base/request.py +82 -0
  10. cobweb/base/response.py +23 -0
  11. cobweb/base/seed.py +114 -0
  12. cobweb/constant.py +94 -0
  13. cobweb/crawlers/__init__.py +1 -0
  14. cobweb/crawlers/base_crawler.py +144 -0
  15. cobweb/crawlers/crawler.py +209 -0
  16. cobweb/crawlers/file_crawler.py +98 -0
  17. cobweb/db/__init__.py +2 -2
  18. cobweb/db/api_db.py +82 -0
  19. cobweb/db/redis_db.py +125 -218
  20. cobweb/exceptions/__init__.py +1 -0
  21. cobweb/exceptions/oss_db_exception.py +28 -0
  22. cobweb/launchers/__init__.py +3 -0
  23. cobweb/launchers/launcher.py +235 -0
  24. cobweb/launchers/launcher_air.py +88 -0
  25. cobweb/launchers/launcher_api.py +209 -0
  26. cobweb/launchers/launcher_pro.py +208 -0
  27. cobweb/pipelines/__init__.py +3 -0
  28. cobweb/pipelines/pipeline.py +69 -0
  29. cobweb/pipelines/pipeline_console.py +22 -0
  30. cobweb/pipelines/pipeline_loghub.py +34 -0
  31. cobweb/schedulers/__init__.py +3 -0
  32. cobweb/schedulers/scheduler_api.py +72 -0
  33. cobweb/schedulers/scheduler_redis.py +72 -0
  34. cobweb/setting.py +67 -6
  35. cobweb/utils/__init__.py +5 -0
  36. cobweb/utils/bloom.py +58 -0
  37. cobweb/utils/dotting.py +32 -0
  38. cobweb/utils/oss.py +94 -0
  39. cobweb/utils/tools.py +42 -0
  40. cobweb_launcher-1.2.41.dist-info/METADATA +205 -0
  41. cobweb_launcher-1.2.41.dist-info/RECORD +44 -0
  42. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/WHEEL +1 -1
  43. cobweb/bbb.py +0 -191
  44. cobweb/db/oss_db.py +0 -127
  45. cobweb/db/scheduler/__init__.py +0 -0
  46. cobweb/db/scheduler/default.py +0 -8
  47. cobweb/db/scheduler/textfile.py +0 -27
  48. cobweb/db/storer/__init__.py +0 -0
  49. cobweb/db/storer/console.py +0 -9
  50. cobweb/db/storer/loghub.py +0 -54
  51. cobweb/db/storer/redis.py +0 -15
  52. cobweb/db/storer/textfile.py +0 -15
  53. cobweb/decorators.py +0 -16
  54. cobweb/distributed/__init__.py +0 -0
  55. cobweb/distributed/launcher.py +0 -243
  56. cobweb/distributed/models.py +0 -143
  57. cobweb/interface.py +0 -34
  58. cobweb/single/__init__.py +0 -0
  59. cobweb/single/launcher.py +0 -231
  60. cobweb/single/models.py +0 -134
  61. cobweb/single/nest.py +0 -153
  62. cobweb/task.py +0 -50
  63. cobweb/utils.py +0 -90
  64. cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
  65. cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
  66. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/LICENSE +0 -0
  67. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/top_level.txt +0 -0
cobweb/base/seed.py ADDED
@@ -0,0 +1,114 @@
+ import json
+ import time
+ import hashlib
+
+
+ class SeedParams:
+
+     def __init__(self, retry, priority, seed_version, seed_status=None):
+         self.retry = retry or 0
+         self.priority = priority or 300
+         self.seed_version = seed_version or int(time.time())
+         self.seed_status = seed_status
+
+
+ class Seed:
+
+     __SEED_PARAMS__ = [
+         "retry",
+         "priority",
+         "seed_version",
+         "seed_status"
+     ]
+
+     def __init__(
+             self,
+             seed,
+             sid=None,
+             retry=None,
+             priority=None,
+             seed_version=None,
+             seed_status=None,
+             **kwargs
+     ):
+         if isinstance(seed, (str, bytes)):
+             try:
+                 item = json.loads(seed)
+                 self._init_seed(item)
+             except json.JSONDecodeError:
+                 self.__setattr__("url", seed)
+         elif isinstance(seed, dict):
+             self._init_seed(seed)
+         else:
+             raise TypeError(
+                 f"seed type error, "
+                 f"must be str or dict! "
+                 f"seed: {seed}"
+             )
+
+         seed_params = {
+             "retry": retry,
+             "priority": priority,
+             "seed_version": seed_version,
+             "seed_status": seed_status,
+         }
+
+         if kwargs:
+             self._init_seed(kwargs)
+             seed_params.update({
+                 k: v for k, v in kwargs.items()
+                 if k in self.__SEED_PARAMS__
+             })
+         if sid or not getattr(self, "sid", None):
+             self._init_id(sid)
+         self.params = SeedParams(**seed_params)
+
+     def __getattr__(self, name):
+         return None
+
+     def __setitem__(self, key, value):
+         setattr(self, key, value)
+
+     def __getitem__(self, item):
+         return getattr(self, item)
+
+     def __str__(self):
+         return json.dumps(self.__dict__, ensure_ascii=False)
+
+     def __repr__(self):
+         chars = [f"{k}={v}" for k, v in self.__dict__.items()]
+         return f'{self.__class__.__name__}({", ".join(chars)})'
+
+     def _init_seed(self, seed_info: dict):
+         for k, v in seed_info.items():
+             if k not in self.__SEED_PARAMS__:
+                 self.__setattr__(k, v)
+
+     def _init_id(self, sid):
+         if not sid:
+             sid = hashlib.md5(self.to_string.encode()).hexdigest()
+         self.__setattr__("sid", sid)
+
+     @property
+     def to_dict(self) -> dict:
+         seed = self.__dict__.copy()
+         if seed.get("params"):
+             del seed["params"]
+         return seed
+
+     @property
+     def to_string(self) -> str:
+         return json.dumps(
+             self.to_dict,
+             ensure_ascii=False,
+             separators=(",", ":")
+         )
+
+     @property
+     def get_all(self):
+         return json.dumps(
+             self.__dict__,
+             ensure_ascii=False,
+             separators=(",", ":")
+         )
+
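For orientation, a minimal usage sketch of the Seed class above, assuming only that cobweb/base/__init__.py re-exports it (the imports in base_crawler.py below suggest it does):

    from cobweb.base import Seed

    # A plain string is not valid JSON, so it is stored as seed.url;
    # param-like kwargs are routed into SeedParams.
    seed = Seed("https://example.com/page", priority=100)
    print(seed.url)              # https://example.com/page
    print(seed.params.priority)  # 100
    print(seed.sid)              # md5 hex digest of the compact JSON body

    # A dict seed: non-param keys become attributes, param keys become params.
    seed2 = Seed({"url": "https://example.com", "category": "demo"}, retry=1)
    print(seed2.category)        # demo
    print(seed2.to_string)       # compact JSON without the params object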
cobweb/constant.py ADDED
@@ -0,0 +1,94 @@
+
+ class CrawlerModel:
+
+     default = "cobweb.crawlers.Crawler"
+     file_air = "cobweb.crawlers.FileCrawlerAir"
+     file_pro = "cobweb.crawlers.FileCrawlerPro"
+
+
+ class LauncherModel:
+     task = "launcher model: task"
+     resident = "launcher model: resident"
+
+
+ class DownloadModel:
+     common = "download model: common"
+     file = "download model: file"
+
+
+ class LogModel:
+     simple = "log model: simple"
+     common = "log model: common"
+     detailed = "log model: detailed"
+
+
+ class DealModel:
+     fail = "deal model: fail"
+     done = "deal model: done"
+     poll = "deal model: poll"
+
+
+ class LogTemplate:
+
+     console_item = """
+ ----------------------- start - console pipeline -----------------
+ seed details \n{seed_detail}
+ parse details \n{parse_detail}
+ ----------------------- end - console pipeline ------------------
+ """
+
+     launcher_air_polling = """
+ ----------------------- start - polling log: {task} -----------------
+ memory queue
+     seed count : {doing_len}
+     to consume : {todo_len}
+     consumed   : {done_len}
+ storage queue
+     to upload  : {upload_len}
+ ----------------------- end - polling log: {task} ------------------
+ """
+
+     launcher_pro_polling = """
+ ----------------------- start - polling log: {task} -----------------
+ memory queue
+     seed count : {doing_len}
+     to consume : {todo_len}
+     consumed   : {done_len}
+ redis queue
+     seed count : {redis_seed_count}
+     to consume : {redis_todo_len}
+     consuming  : {redis_doing_len}
+ storage queue
+     to upload  : {upload_len}
+ ----------------------- end - polling log: {task} ------------------
+ """
+
+     download_exception = """
+ ----------------------- download exception -----------------------
+ seed details \n{detail}
+ seed params
+     retry        : {retry}
+     priority     : {priority}
+     seed_version : {seed_version}
+     identifier   : {identifier}
+ exception
+     msg : {exception}
+ ------------------------------------------------------------------
+ """
+
+     download_info = """
+ ------------------------ download info ---------------------------
+ seed details \n{detail}
+ seed params
+     retry        : {retry}
+     priority     : {priority}
+     seed_version : {seed_version}
+     identifier   : {identifier}
+ response
+     status : {status} \n{response}
+ ------------------------------------------------------------------
+ """
+
+     @staticmethod
+     def log_info(item: dict) -> str:
+         return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])
cobweb/crawlers/__init__.py ADDED
@@ -0,0 +1 @@
+ from .crawler import Crawler
cobweb/crawlers/base_crawler.py ADDED
@@ -0,0 +1,144 @@
+ import threading
+ import time
+ import traceback
+
+ from inspect import isgenerator
+ from typing import Union, Callable, Mapping
+
+ from cobweb.base import Queue, Seed, BaseItem, Request, Response, logger
+ from cobweb.constant import DealModel, LogTemplate
+ from cobweb.utils import download_log_info
+ from cobweb import setting
+
+
+ class Crawler(threading.Thread):
+
+     def __init__(
+             self,
+             upload_queue: Queue,
+             custom_func: Mapping[str, Callable],
+             launcher_queue: Mapping[str, Queue],
+     ):
+         super().__init__()
+
+         self.upload_queue = upload_queue
+         for func_name, _callable in custom_func.items():
+             if callable(_callable):
+                 self.__setattr__(func_name, _callable)
+
+         self.launcher_queue = launcher_queue
+
+         self.spider_thread_num = setting.SPIDER_THREAD_NUM
+         self.max_retries = setting.SPIDER_MAX_RETRIES
+
+     @staticmethod
+     def request(seed: Seed) -> Union[Request, BaseItem]:
+         stream = bool(setting.DOWNLOAD_MODEL)
+         yield Request(seed.url, seed, stream=stream, timeout=5)
+
+     @staticmethod
+     def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+         response = item.download()
+         yield Response(item.seed, response, **item.to_dict)
+
+     @staticmethod
+     def parse(item: Response) -> BaseItem:
+         pass
+
+     def get_seed(self) -> Seed:
+         return self.launcher_queue['todo'].pop()
+
+     def distribute(self, item, seed):
+         if isinstance(item, BaseItem):
+             self.upload_queue.push(item)
+         elif isinstance(item, Seed):
+             self.launcher_queue['new'].push(item)
+         elif isinstance(item, str) and item == DealModel.poll:
+             self.launcher_queue['todo'].push(seed)
+         elif isinstance(item, str) and item == DealModel.done:
+             self.launcher_queue['done'].push(seed)
+         elif isinstance(item, str) and item == DealModel.fail:
+             seed.params.seed_status = DealModel.fail
+             self.launcher_queue['done'].push(seed)
+         else:
+             raise TypeError("yield value type error!")
+
+     def spider(self):
+         while True:
+             seed = self.get_seed()
+
+             if not seed:
+                 continue
+
+             elif seed.params.retry >= self.max_retries:
+                 seed.params.seed_status = DealModel.fail
+                 self.launcher_queue['done'].push(seed)
+                 continue
+
+             seed_detail_log_info = download_log_info(seed.to_dict)
+
+             try:
+                 request_iterators = self.request(seed)
+
+                 if not isgenerator(request_iterators):
+                     raise TypeError("request function isn't a generator!")
+
+                 iterator_status = False
+
+                 for request_item in request_iterators:
+
+                     iterator_status = True
+
+                     if isinstance(request_item, Request):
+                         iterator_status = False
+                         download_iterators = self.download(request_item)
+                         if not isgenerator(download_iterators):
+                             raise TypeError("download function isn't a generator")
+
+                         for download_item in download_iterators:
+                             iterator_status = True
+                             if isinstance(download_item, Response):
+                                 iterator_status = False
+                                 logger.info(LogTemplate.download_info.format(
+                                     detail=seed_detail_log_info,
+                                     retry=seed.params.retry,
+                                     priority=seed.params.priority,
+                                     seed_version=seed.params.seed_version,
+                                     identifier=seed.identifier or "",
+                                     status=download_item.response,
+                                     response=download_log_info(download_item.to_dict)
+                                 ))
+                                 parse_iterators = self.parse(download_item)
+                                 if not isgenerator(parse_iterators):
+                                     raise TypeError("parse function isn't a generator")
+                                 for parse_item in parse_iterators:
+                                     iterator_status = True
+                                     if isinstance(parse_item, Response):
+                                         raise TypeError("upload_item can't be a Response instance")
+                                     self.distribute(parse_item, seed)
+                             else:
+                                 self.distribute(download_item, seed)
+                     else:
+                         self.distribute(request_item, seed)
+
+                 if not iterator_status:
+                     raise ValueError("request/download/parse function yield value error!")
+
+             except Exception as e:
+                 logger.info(LogTemplate.download_exception.format(
+                     detail=seed_detail_log_info,
+                     retry=seed.params.retry,
+                     priority=seed.params.priority,
+                     seed_version=seed.params.seed_version,
+                     identifier=seed.identifier or "",
+                     exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                 ))
+                 seed.params.retry += 1
+                 self.launcher_queue['todo'].push(seed)
+             finally:
+                 time.sleep(0.1)
+
+     def run(self):
+         for index in range(self.spider_thread_num):
+             threading.Thread(name=f"spider_{index}", target=self.spider).start()
+
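In this design, request/download/parse must each be generators; distribute() routes whatever they yield: BaseItem instances go to the upload queue, new Seeds to the 'new' queue, and the DealModel marker strings move the current seed to 'todo' or 'done'. A minimal subclass sketch under those rules (ConsoleItem is used here with the seed-plus-data signature seen in cobweb/crawlers/crawler.py):

    from cobweb.base import ConsoleItem, Response
    from cobweb.constant import DealModel
    from cobweb.crawlers.base_crawler import Crawler


    class MyCrawler(Crawler):

        @staticmethod
        def parse(item: Response):
            # Ship the page body downstream as a BaseItem subclass...
            yield ConsoleItem(item.seed, data=item.response.text)
            # ...then mark the seed consumed so it lands in the 'done' queue.
            yield DealModel.done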
cobweb/crawlers/crawler.py ADDED
@@ -0,0 +1,209 @@
+ import json
+ import threading
+ import time
+ import traceback
+ from inspect import isgenerator
+ from typing import Union, Callable, Mapping
+ from urllib.parse import urlparse
+
+ from requests import HTTPError, Response as Res
+
+ from cobweb.constant import DealModel, LogTemplate
+ from cobweb.base import (
+     Seed,
+     BaseItem,
+     Request,
+     Response,
+     ConsoleItem,
+     logger
+ )
+ from cobweb.utils import LoghubDot
+
+
+ class Crawler(threading.Thread):
+
+     def __init__(
+             self,
+             task: str,
+             project: str,
+             stop: threading.Event,
+             pause: threading.Event,
+             # launcher_queue: Union[Mapping[str, Queue]],
+             get_seed: Callable,
+             set_seed: Callable,
+             add_seed: Callable,
+             delete_seed: Callable,
+             upload_data: Callable,
+             custom_func: Mapping[str, Callable],
+             thread_num: int,
+             max_retries: int,
+             time_sleep: int,
+     ):
+         super().__init__()
+         self.task = task
+         self.project = project
+         self._stop = stop
+         self._pause = pause
+         self._get_seed = get_seed
+         self._set_seed = set_seed
+         self._add_seed = add_seed
+         self._delete_seed = delete_seed
+         self._upload_data = upload_data
+
+         for func_name, _callable in custom_func.items():
+             if callable(_callable):
+                 self.__setattr__(func_name, _callable)
+
+         self.thread_num = thread_num
+         self.time_sleep = time_sleep
+         self.max_retries = max_retries
+
+         self.loghub_dot = LoghubDot()
+
+     @staticmethod
+     def request(seed: Seed) -> Union[Request, BaseItem]:
+         yield Request(seed.url, seed, timeout=5)
+
+     @staticmethod
+     def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+         response = item.download()
+         yield Response(item.seed, response, **item.to_dict)
+
+     @staticmethod
+     def parse(item: Response) -> BaseItem:
+         upload_item = item.to_dict
+         upload_item["text"] = item.response.text
+         yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
+
+     # def get_seed(self) -> Seed:
+     #     return self._todo.pop()
+
+     def distribute(self, item, seed):
+         if isinstance(item, BaseItem):
+             self._upload_data(item)
+         elif isinstance(item, Seed):
+             self._add_seed(item)
+         elif isinstance(item, str) and item == DealModel.poll:
+             self._set_seed(seed)
+         elif isinstance(item, str) and item == DealModel.done:
+             self._delete_seed(seed)
+         elif isinstance(item, str) and item == DealModel.fail:
+             seed.params.seed_status = DealModel.fail
+             self._delete_seed(seed)
+         else:
+             raise TypeError("yield value type error!")
+
+     def spider(self):
+         while not self._stop.is_set():
+
+             seed = self._get_seed()
+
+             if not seed:
+                 time.sleep(1)
+                 continue
+
+             elif seed.params.retry > self.max_retries:
+                 seed.params.seed_status = DealModel.fail
+                 self._delete_seed(seed)
+                 continue
+
+             seed_detail_log_info = LogTemplate.log_info(seed.to_dict)
+
+             try:
+                 request_iterators = self.request(seed)
+
+                 if not isgenerator(request_iterators):
+                     raise TypeError("request function isn't a generator!")
+
+                 iterator_status = False
+
+                 for request_item in request_iterators:
+
+                     iterator_status = True
+
+                     if isinstance(request_item, Request):
+                         iterator_status = False
+                         start_time = time.time()
+                         download_iterators = self.download(request_item)
+                         if not isgenerator(download_iterators):
+                             raise TypeError("download function isn't a generator")
+
+                         for download_item in download_iterators:
+                             iterator_status = True
+                             if isinstance(download_item, Response):
+                                 iterator_status = False
+                                 logger.info(LogTemplate.download_info.format(
+                                     detail=seed_detail_log_info,
+                                     retry=seed.params.retry,
+                                     priority=seed.params.priority,
+                                     seed_version=seed.params.seed_version,
+                                     identifier=seed.identifier or "",
+                                     status=download_item.response,
+                                     response=LogTemplate.log_info(download_item.to_dict)
+                                 ))
+                                 if isinstance(download_item.response, Res):
+                                     end_time = time.time()
+                                     self.loghub_dot.build(
+                                         topic=urlparse(download_item.response.request.url).netloc,
+                                         data_size=int(download_item.response.headers.get("content-length", 0)),
+                                         cost_time=end_time - start_time, status=200,
+                                         url=download_item.response.url,
+                                     )
+                                 parse_iterators = self.parse(download_item)
+                                 if not isgenerator(parse_iterators):
+                                     raise TypeError("parse function isn't a generator")
+                                 for parse_item in parse_iterators:
+                                     iterator_status = True
+                                     if isinstance(parse_item, Response):
+                                         raise TypeError("upload_item can't be a Response instance")
+                                     self.distribute(parse_item, seed)
+                             else:
+                                 self.distribute(download_item, seed)
+                     else:
+                         self.distribute(request_item, seed)
+
+                 if not iterator_status:
+                     raise ValueError("request/download/parse function yield value error!")
+             except HTTPError as e:
+                 if isinstance(e.response, Res):
+                     url = e.response.request.url
+                     status = e.response.status_code
+                     exception_msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                     self.loghub_dot.build(
+                         topic=urlparse(url).netloc,
+                         data_size=-1, cost_time=-1,
+                         status=status, url=url,
+                         msg=exception_msg
+                     )
+                 logger.info(LogTemplate.download_exception.format(
+                     detail=seed_detail_log_info,
+                     retry=seed.params.retry,
+                     priority=seed.params.priority,
+                     seed_version=seed.params.seed_version,
+                     identifier=seed.identifier or "",
+                     exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                 ))
+                 seed.params.retry += 1
+                 self._set_seed(seed)
+                 time.sleep(self.time_sleep * seed.params.retry)
+             except Exception as e:
+                 logger.info(LogTemplate.download_exception.format(
+                     detail=seed_detail_log_info,
+                     retry=seed.params.retry,
+                     priority=seed.params.priority,
+                     seed_version=seed.params.seed_version,
+                     identifier=seed.identifier or "",
+                     exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                 ))
+                 seed.params.retry += 1
+                 # self._todo.push(seed)
+                 self._set_seed(seed)
+                 # time.sleep(self.time_sleep * seed.params.retry)
+             finally:
+                 time.sleep(0.1)
+         logger.info("spider thread close")
+
+     def run(self):
+         for index in range(self.thread_num):
+             threading.Thread(name=f"spider_{index}", target=self.spider).start()
+
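Compared with base_crawler.py, this Crawler takes scheduler callbacks instead of shared queues. A wiring sketch with in-memory stubs standing in for the Redis/API schedulers and pipelines that the launchers normally supply; the stub names are illustrative, and constructing Crawler also instantiates LoghubDot, which may require LogHub settings:

    import threading
    import time

    from cobweb.base import Seed
    from cobweb.crawlers.crawler import Crawler

    stop = threading.Event()
    todo = [Seed("https://httpbin.org/get")]

    def get_seed():                       # pull the next seed, or None
        return todo.pop() if todo else None

    crawler = Crawler(
        task="demo", project="demo",
        stop=stop, pause=threading.Event(),
        get_seed=get_seed,
        set_seed=todo.append,             # requeue on retry / DealModel.poll
        add_seed=todo.append,             # newly discovered seeds
        delete_seed=lambda seed: None,    # DealModel.done / DealModel.fail
        upload_data=print,                # dump BaseItems to stdout
        custom_func={},
        thread_num=1, max_retries=3, time_sleep=10,
    )
    crawler.start()

    time.sleep(10)
    stop.set()                            # let the spider threads exit their loop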
cobweb/crawlers/file_crawler.py ADDED
@@ -0,0 +1,98 @@
+ import os
+ from typing import Union
+ from cobweb import setting
+ from cobweb.utils import OssUtil
+ from cobweb.crawlers import Crawler
+ from cobweb.base import Seed, BaseItem, Request, Response
+ from cobweb.exceptions import OssDBPutPartError, OssDBMergeError
+
+
+ oss_util = OssUtil(is_path_style=bool(int(os.getenv("PRIVATE_LINK", 0))))
+
+
+ class FileCrawlerAir(Crawler):
+
+     @staticmethod
+     def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+         seed_dict = item.seed.to_dict
+         seed_dict["bucket_name"] = oss_util.bucket
+         try:
+             seed_dict["oss_path"] = key = item.seed.oss_path or getattr(item, "oss_path")
+
+             if oss_util.exists(key):
+                 seed_dict["data_size"] = oss_util.head(key).content_length
+                 yield Response(item.seed, "exists", **seed_dict)
+
+             else:
+                 seed_dict.setdefault("end", "")
+                 seed_dict.setdefault("start", 0)
+
+                 if seed_dict["end"] or seed_dict["start"]:
+                     start, end = seed_dict["start"], seed_dict["end"]
+                     item.request_setting["headers"]['Range'] = f'bytes={start}-{end}'
+
+                 if not item.seed.identifier:
+                     content = b""
+                     chunk_size = oss_util.chunk_size
+                     min_upload_size = oss_util.min_upload_size
+                     seed_dict.setdefault("position", 1)
+
+                     response = item.download()
+
+                     content_type = response.headers.get("content-type", "").split(";")[0]
+                     seed_dict["data_size"] = content_length = int(response.headers.get("content-length", 0))
+
+                     if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
+                         """Filter out unwanted response content types."""
+                         response.close()
+                         seed_dict["filter"] = True
+                         seed_dict["msg"] = f"response content type is {content_type}"
+                         yield Response(item.seed, response, **seed_dict)
+
+                     elif seed_dict['position'] == 1 and min_upload_size >= content_length > 0:
+                         """Flag files that are too small and return."""
+                         response.close()
+                         seed_dict["filter"] = True
+                         seed_dict["msg"] = "file size is too small"
+                         yield Response(item.seed, response, **seed_dict)
+
+                     elif seed_dict['position'] == 1 and chunk_size > content_length > min_upload_size:
+                         """Download small files in one piece."""
+                         for part_data in response.iter_content(chunk_size):
+                             content += part_data
+                         response.close()
+                         oss_util.put(key, content)
+                         yield Response(item.seed, response, **seed_dict)
+
+                     else:
+                         """Download medium/large files in synchronous multipart chunks."""
+                         seed_dict.setdefault("upload_id", oss_util.init_part(key).upload_id)
+
+                         for part_data in response.iter_content(chunk_size):
+                             content += part_data
+                             if len(content) >= chunk_size:
+                                 upload_data = content[:chunk_size]
+                                 content = content[chunk_size:]
+                                 oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], upload_data)
+                                 seed_dict['start'] += len(upload_data)
+                                 seed_dict['position'] += 1
+
+                         response.close()
+
+                         if content:
+                             oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], content)
+                         oss_util.merge(key, seed_dict["upload_id"])
+                         seed_dict["data_size"] = oss_util.head(key).content_length
+                         yield Response(item.seed, response, **seed_dict)
+
+                 elif item.seed.identifier == "merge":
+                     oss_util.merge(key, seed_dict["upload_id"])
+                     seed_dict["data_size"] = oss_util.head(key).content_length
+                     yield Response(item.seed, "merge", **seed_dict)
+
+         except OssDBPutPartError:
+             yield Seed(seed_dict)
+         except OssDBMergeError:
+             yield Seed(seed_dict, identifier="merge")
+
+
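The multipart branch above keeps a rolling buffer: streamed parts accumulate until a full chunk can be flushed with put_part, and whatever remains is uploaded before merge. A dependency-free sketch of just that buffering rule, with print standing in for the OSS calls:

    def buffer_parts(parts, chunk_size):
        content, position = b"", 1
        for part in parts:
            content += part
            if len(content) >= chunk_size:
                upload_data, content = content[:chunk_size], content[chunk_size:]
                print(f"put_part position={position} size={len(upload_data)}")
                position += 1
        if content:  # trailing remainder becomes the final part
            print(f"put_part position={position} size={len(content)}")
        print("merge")

    buffer_parts([b"x" * 300] * 4, chunk_size=500)
    # put_part position=1 size=500
    # put_part position=2 size=500
    # put_part position=3 size=200
    # merge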
cobweb/db/__init__.py CHANGED
@@ -1,2 +1,2 @@
- from . import oss_db, redis_db
- from . import scheduler, storer
+ from .redis_db import RedisDB
+ from .api_db import ApiDB
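After this change the db package re-exports the new backends directly, so downstream code imports them as:

    from cobweb.db import RedisDB, ApiDB

The constructor arguments live in redis_db.py and api_db.py, which this excerpt does not show.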