cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.41__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. cobweb/__init__.py +2 -11
  2. cobweb/base/__init__.py +9 -0
  3. cobweb/base/basic.py +297 -0
  4. cobweb/base/common_queue.py +30 -0
  5. cobweb/base/decorators.py +40 -0
  6. cobweb/base/dotting.py +35 -0
  7. cobweb/base/item.py +46 -0
  8. cobweb/{log.py → base/log.py} +4 -6
  9. cobweb/base/request.py +82 -0
  10. cobweb/base/response.py +23 -0
  11. cobweb/base/seed.py +114 -0
  12. cobweb/constant.py +94 -0
  13. cobweb/crawlers/__init__.py +1 -0
  14. cobweb/crawlers/base_crawler.py +144 -0
  15. cobweb/crawlers/crawler.py +209 -0
  16. cobweb/crawlers/file_crawler.py +98 -0
  17. cobweb/db/__init__.py +2 -2
  18. cobweb/db/api_db.py +82 -0
  19. cobweb/db/redis_db.py +125 -218
  20. cobweb/exceptions/__init__.py +1 -0
  21. cobweb/exceptions/oss_db_exception.py +28 -0
  22. cobweb/launchers/__init__.py +3 -0
  23. cobweb/launchers/launcher.py +235 -0
  24. cobweb/launchers/launcher_air.py +88 -0
  25. cobweb/launchers/launcher_api.py +209 -0
  26. cobweb/launchers/launcher_pro.py +208 -0
  27. cobweb/pipelines/__init__.py +3 -0
  28. cobweb/pipelines/pipeline.py +69 -0
  29. cobweb/pipelines/pipeline_console.py +22 -0
  30. cobweb/pipelines/pipeline_loghub.py +34 -0
  31. cobweb/schedulers/__init__.py +3 -0
  32. cobweb/schedulers/scheduler_api.py +72 -0
  33. cobweb/schedulers/scheduler_redis.py +72 -0
  34. cobweb/setting.py +67 -6
  35. cobweb/utils/__init__.py +5 -0
  36. cobweb/utils/bloom.py +58 -0
  37. cobweb/utils/dotting.py +32 -0
  38. cobweb/utils/oss.py +94 -0
  39. cobweb/utils/tools.py +42 -0
  40. cobweb_launcher-1.2.41.dist-info/METADATA +205 -0
  41. cobweb_launcher-1.2.41.dist-info/RECORD +44 -0
  42. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/WHEEL +1 -1
  43. cobweb/bbb.py +0 -191
  44. cobweb/db/oss_db.py +0 -127
  45. cobweb/db/scheduler/__init__.py +0 -0
  46. cobweb/db/scheduler/default.py +0 -8
  47. cobweb/db/scheduler/textfile.py +0 -27
  48. cobweb/db/storer/__init__.py +0 -0
  49. cobweb/db/storer/console.py +0 -9
  50. cobweb/db/storer/loghub.py +0 -54
  51. cobweb/db/storer/redis.py +0 -15
  52. cobweb/db/storer/textfile.py +0 -15
  53. cobweb/decorators.py +0 -16
  54. cobweb/distributed/__init__.py +0 -0
  55. cobweb/distributed/launcher.py +0 -243
  56. cobweb/distributed/models.py +0 -143
  57. cobweb/interface.py +0 -34
  58. cobweb/single/__init__.py +0 -0
  59. cobweb/single/launcher.py +0 -231
  60. cobweb/single/models.py +0 -134
  61. cobweb/single/nest.py +0 -153
  62. cobweb/task.py +0 -50
  63. cobweb/utils.py +0 -90
  64. cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
  65. cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
  66. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/LICENSE +0 -0
  67. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/top_level.txt +0 -0
cobweb/base/seed.py ADDED
@@ -0,0 +1,114 @@
1
+ import json
2
+ import time
3
+ import hashlib
4
+
5
+
6
class SeedParams:
    """Scheduling metadata attached to a :class:`Seed`.

    Falsy ``retry``, ``priority`` and ``seed_version`` arguments are replaced
    by ``0``, ``300`` and the current Unix timestamp respectively.
    """

    def __init__(self, retry, priority, seed_version, seed_status=None):
        self.retry = retry or 0
        self.priority = priority or 300
        self.seed_version = seed_version or int(time.time())
        self.seed_status = seed_status


class Seed:
    """A crawl task: free-form attributes plus scheduling params.

    ``seed`` may be a dict, a JSON object encoded as ``str``/``bytes``, or a
    plain URL string (stored as ``url``).  Extra keyword arguments become
    attributes as well.  Keys listed in ``__SEED_PARAMS__`` are never stored
    as attributes; the scheduling values live on ``self.params`` and are
    taken from the explicit keyword parameters only.

    Reading an attribute that was never set yields ``None`` instead of
    raising, except for dunder names (see ``__getattr__``).

    Raises:
        TypeError: if ``seed`` is not ``str``, ``bytes`` or ``dict``.
    """

    __SEED_PARAMS__ = [
        "retry",
        "priority",
        "seed_version",
        "seed_status"
    ]

    def __init__(
            self,
            seed,
            sid=None,
            retry=None,
            priority=None,
            seed_version=None,
            seed_status=None,
            **kwargs
    ):
        if isinstance(seed, (str, bytes)):
            try:
                item = json.loads(seed)
            except ValueError:
                # JSONDecodeError and UnicodeDecodeError are both ValueErrors.
                item = None
            if isinstance(item, dict):
                self._init_seed(item)
            else:
                # Not a JSON object (plain URL, or a JSON scalar/array that
                # cannot carry seed fields): keep the raw text as the URL.
                # Bytes are decoded so the seed stays JSON-serialisable.
                self.url = seed.decode() if isinstance(seed, bytes) else seed
        elif isinstance(seed, dict):
            self._init_seed(seed)
        else:
            raise TypeError(
                f"seed type error, "
                f"must be str or dict! "
                f"seed: {seed}"
            )

        if kwargs:
            # The four scheduling names are explicit parameters, so they can
            # never show up in **kwargs; everything here is a plain attribute.
            self._init_seed(kwargs)

        # An explicit sid always wins; otherwise derive one if none was loaded.
        if sid or not getattr(self, "sid", None):
            self._init_id(sid)

        self.params = SeedParams(
            retry=retry,
            priority=priority,
            seed_version=seed_version,
            seed_status=seed_status,
        )

    def __getattr__(self, name):
        # Only called when normal lookup fails.  Unknown seed fields read as
        # None, but dunder probes must raise: protocols such as copy/pickle
        # test for optional hooks (e.g. __setstate__) with hasattr() and
        # would otherwise try to call the returned None.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        return None

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)

    def __str__(self):
        # default=vars serialises the nested SeedParams via its attributes.
        return json.dumps(self.__dict__, ensure_ascii=False, default=vars)

    def __repr__(self):
        chars = [f"{k}={v}" for k, v in self.__dict__.items()]
        return f'{self.__class__.__name__}({", ".join(chars)})'

    def _init_seed(self, seed_info: dict):
        """Copy every non-scheduling key of ``seed_info`` onto the instance."""
        for k, v in seed_info.items():
            if k not in self.__SEED_PARAMS__:
                setattr(self, k, v)

    def _init_id(self, sid):
        """Set ``sid``, defaulting to the MD5 of the compact JSON form."""
        if not sid:
            sid = hashlib.md5(self.to_string.encode()).hexdigest()
        self.sid = sid

    @property
    def to_dict(self) -> dict:
        """Shallow copy of the seed attributes without ``params``."""
        seed = self.__dict__.copy()
        seed.pop("params", None)
        return seed

    @property
    def to_string(self) -> str:
        """Compact JSON encoding of :attr:`to_dict`."""
        return json.dumps(
            self.to_dict,
            ensure_ascii=False,
            separators=(",", ":")
        )

    @property
    def get_all(self):
        """Compact JSON of every attribute, with ``params`` as a nested object."""
        return json.dumps(
            self.__dict__,
            ensure_ascii=False,
            separators=(",", ":"),
            default=vars
        )
114
+
cobweb/constant.py ADDED
@@ -0,0 +1,94 @@
1
+
2
class CrawlerModel:
    """Names of the available crawler implementations.

    Each value is a dotted path string (presumably resolved to a class by
    the launcher -- confirm against the caller).
    """

    file_pro = "cobweb.crawlers.FileCrawlerPro"
    file_air = "cobweb.crawlers.FileCrawlerAir"
    default = "cobweb.crawlers.Crawler"
7
+
8
+
9
class LauncherModel:
    """String tags identifying the launcher modes."""

    resident = "launcher model: resident"
    task = "launcher model: task"
12
+
13
+
14
class DownloadModel:
    """String tags identifying the download modes."""

    file = "download model: file"
    common = "download model: common"
17
+
18
+
19
class LogModel:
    """String tags identifying the logging verbosity modes."""

    detailed = "log model: detailed"
    common = "log model: common"
    simple = "log model: simple"
23
+
24
+
25
class DealModel:
    """Sentinel strings a crawler hook may yield to settle the current seed.

    The crawlers' ``distribute`` method compares yielded strings against
    these values: ``poll`` hands the seed back for another round, ``done``
    retires it, and ``fail`` retires it after marking its status as failed.
    """

    poll = "deal model: poll"
    done = "deal model: done"
    fail = "deal model: fail"
29
+
30
+
31
+ class LogTemplate:
32
+
33
+ console_item = """
34
+ ----------------------- start - console pipeline -----------------
35
+ 种子详情 \n{seed_detail}
36
+ 解析详情 \n{parse_detail}
37
+ ----------------------- end - console pipeline ------------------
38
+ """
39
+
40
+ launcher_air_polling = """
41
+ ----------------------- start - 轮训日志: {task} -----------------
42
+ 内存队列
43
+ 种子数: {doing_len}
44
+ 待消费: {todo_len}
45
+ 已消费: {done_len}
46
+ 存储队列
47
+ 待上传: {upload_len}
48
+ ----------------------- end - 轮训日志: {task} ------------------
49
+ """
50
+
51
+ launcher_pro_polling = """
52
+ ----------------------- start - 轮训日志: {task} -----------------
53
+ 内存队列
54
+ 种子数: {doing_len}
55
+ 待消费: {todo_len}
56
+ 已消费: {done_len}
57
+ redis队列
58
+ 种子数: {redis_seed_count}
59
+ 待消费: {redis_todo_len}
60
+ 消费中: {redis_doing_len}
61
+ 存储队列
62
+ 待上传: {upload_len}
63
+ ----------------------- end - 轮训日志: {task} ------------------
64
+ """
65
+
66
+ download_exception = """
67
+ ----------------------- download exception -----------------------
68
+ 种子详情 \n{detail}
69
+ 种子参数
70
+ retry : {retry}
71
+ priority : {priority}
72
+ seed_version : {seed_version}
73
+ identifier : {identifier}
74
+ exception
75
+ msg : {exception}
76
+ ------------------------------------------------------------------
77
+ """
78
+
79
+ download_info = """
80
+ ------------------------ download info ---------------------------
81
+ 种子详情 \n{detail}
82
+ 种子参数
83
+ retry : {retry}
84
+ priority : {priority}
85
+ seed_version : {seed_version}
86
+ identifier : {identifier}
87
+ response
88
+ status : {status} \n{response}
89
+ ------------------------------------------------------------------
90
+ """
91
+
92
+ @staticmethod
93
+ def log_info(item: dict) -> str:
94
+ return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])
@@ -0,0 +1 @@
1
+ from .crawler import Crawler
@@ -0,0 +1,144 @@
1
+ import threading
2
+ import time
3
+ import traceback
4
+
5
+ from inspect import isgenerator
6
+ from typing import Union, Callable, Mapping
7
+
8
+ from cobweb.base import Queue, Seed, BaseItem, Request, Response, logger
9
+ from cobweb.constant import DealModel, LogTemplate
10
+ from cobweb.utils import download_log_info
11
+ from cobweb import setting
12
+
13
+
14
class Crawler(threading.Thread):
    """Queue-driven crawler that runs the request -> download -> parse pipeline.

    Seeds are popped from ``launcher_queue['todo']``; each pipeline stage is
    a generator whose yielded values are routed by :meth:`distribute`.
    ``run`` starts ``setting.SPIDER_THREAD_NUM`` worker threads, each of
    which executes :meth:`spider`.

    Args:
        upload_queue: queue receiving every yielded ``BaseItem``.
        custom_func: mapping of attribute name -> callable; each callable is
            set on the instance, so it can replace the default ``request`` /
            ``download`` / ``parse`` hooks.  Non-callable values are ignored.
        launcher_queue: mapping that provides the ``'todo'``, ``'new'`` and
            ``'done'`` queues used by this class.
    """

    def __init__(
            self,
            upload_queue: Queue,
            custom_func: Union[Mapping[str, Callable]],
            launcher_queue: Union[Mapping[str, Queue]],
    ):
        super().__init__()

        self.upload_queue = upload_queue
        for func_name, _callable in custom_func.items():
            if callable(_callable):
                setattr(self, func_name, _callable)

        self.launcher_queue = launcher_queue

        self.spider_thread_num = setting.SPIDER_THREAD_NUM
        self.max_retries = setting.SPIDER_MAX_RETRIES

    @staticmethod
    def request(seed: Seed) -> Union[Request, BaseItem]:
        """Default request hook: one ``Request`` for ``seed.url``."""
        stream = bool(setting.DOWNLOAD_MODEL)
        yield Request(seed.url, seed, stream=stream, timeout=5)

    @staticmethod
    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
        """Default download hook: perform the request and wrap the result."""
        response = item.download()
        yield Response(item.seed, response, **item.to_dict)

    @staticmethod
    def parse(item: Response) -> BaseItem:
        """Placeholder parse hook.

        It returns ``None``, which :meth:`spider` rejects as "not a
        generator", so a real ``parse`` generator has to be supplied (for
        example through ``custom_func``).
        """
        pass

    def get_seed(self) -> Seed:
        """Pop the next seed from the ``'todo'`` queue."""
        return self.launcher_queue['todo'].pop()

    def distribute(self, item, seed):
        """Route one value yielded by a pipeline stage.

        ``BaseItem`` -> upload queue; ``Seed`` -> ``'new'`` queue;
        ``DealModel.poll`` -> ``seed`` back to ``'todo'``;
        ``DealModel.done`` / ``DealModel.fail`` -> ``seed`` to ``'done'``
        (``fail`` also sets ``seed.params.seed_status``).

        Raises:
            TypeError: for any other yielded value.
        """
        if isinstance(item, BaseItem):
            self.upload_queue.push(item)
        elif isinstance(item, Seed):
            self.launcher_queue['new'].push(item)
        elif isinstance(item, str) and item == DealModel.poll:
            self.launcher_queue['todo'].push(seed)
        elif isinstance(item, str) and item == DealModel.done:
            self.launcher_queue['done'].push(seed)
        elif isinstance(item, str) and item == DealModel.fail:
            seed.params.seed_status = DealModel.fail
            self.launcher_queue['done'].push(seed)
        else:
            raise TypeError("yield value type error!")

    def _crawl(self, seed, seed_detail_log_info):
        """Run the three-stage pipeline once for ``seed``.

        ``iterator_status`` tracks whether the innermost stage that was
        entered yielded anything: it is reset when descending into the next
        stage and set again by that stage's first item.  If it ends up
        false, some stage produced nothing and a ``ValueError`` is raised.
        """
        request_iterators = self.request(seed)
        if not isgenerator(request_iterators):
            raise TypeError("request function isn't a generator!")

        iterator_status = False

        for request_item in request_iterators:
            iterator_status = True

            if not isinstance(request_item, Request):
                self.distribute(request_item, seed)
                continue

            iterator_status = False
            download_iterators = self.download(request_item)
            if not isgenerator(download_iterators):
                raise TypeError("download function isn't a generator")

            for download_item in download_iterators:
                iterator_status = True

                if not isinstance(download_item, Response):
                    self.distribute(download_item, seed)
                    continue

                iterator_status = False
                logger.info(LogTemplate.download_info.format(
                    detail=seed_detail_log_info,
                    retry=seed.params.retry,
                    priority=seed.params.priority,
                    seed_version=seed.params.seed_version,
                    identifier=seed.identifier or "",
                    status=download_item.response,
                    response=download_log_info(download_item.to_dict)
                ))
                parse_iterators = self.parse(download_item)
                if not isgenerator(parse_iterators):
                    raise TypeError("parse function isn't a generator")
                for parse_item in parse_iterators:
                    iterator_status = True
                    if isinstance(parse_item, Response):
                        raise TypeError("upload_item can't be a Response instance")
                    self.distribute(parse_item, seed)

        if not iterator_status:
            raise ValueError("request/download/parse function yield value error!")

    def spider(self):
        """Worker loop: process seeds forever, re-queueing failures.

        A seed whose retry count has reached ``max_retries`` is marked failed
        and pushed to ``'done'``.  Any exception raised while processing is
        logged, the retry counter is incremented and the seed goes back to
        ``'todo'``.
        """
        while True:
            seed = self.get_seed()

            if not seed:
                # Back off briefly instead of spinning on an empty queue.
                time.sleep(0.1)
                continue

            if seed.params.retry >= self.max_retries:
                seed.params.seed_status = DealModel.fail
                self.launcher_queue['done'].push(seed)
                continue

            seed_detail_log_info = download_log_info(seed.to_dict)

            try:
                self._crawl(seed, seed_detail_log_info)
            except Exception as e:
                logger.info(LogTemplate.download_exception.format(
                    detail=seed_detail_log_info,
                    retry=seed.params.retry,
                    priority=seed.params.priority,
                    seed_version=seed.params.seed_version,
                    identifier=seed.identifier or "",
                    exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
                ))
                seed.params.retry += 1
                self.launcher_queue['todo'].push(seed)
            finally:
                time.sleep(0.1)

    def run(self):
        """Start ``spider_thread_num`` worker threads running :meth:`spider`."""
        for index in range(self.spider_thread_num):
            threading.Thread(name=f"spider_{index}", target=self.spider).start()
144
+
@@ -0,0 +1,209 @@
1
+ import json
2
+ import threading
3
+ import time
4
+ import traceback
5
+ from inspect import isgenerator
6
+ from typing import Union, Callable, Mapping
7
+ from urllib.parse import urlparse
8
+
9
+ from requests import HTTPError, Response as Res
10
+
11
+ from cobweb.constant import DealModel, LogTemplate
12
+ from cobweb.base import (
13
+ Seed,
14
+ BaseItem,
15
+ Request,
16
+ Response,
17
+ ConsoleItem,
18
+ logger
19
+ )
20
+ from cobweb.utils import LoghubDot
21
+
22
+
23
class Crawler(threading.Thread):
    """Callback-driven crawler that runs the request -> download -> parse pipeline.

    Seeds are obtained and settled through the callables passed in by the
    launcher; ``run`` starts ``thread_num`` worker threads executing
    :meth:`spider` until the ``stop`` event is set.

    Args:
        task: task name (stored on the instance).
        project: project name (stored on the instance).
        stop: event that ends the worker loops once set.
        pause: event stored as ``_pause``; it is not read in this class.
        get_seed: returns the next seed, or a falsy value when none is ready.
        set_seed: hands a seed back for another attempt.
        add_seed: registers a newly discovered seed.
        delete_seed: retires a finished or failed seed.
        upload_data: receives every yielded ``BaseItem``.
        custom_func: mapping of attribute name -> callable set on the
            instance (can replace ``request`` / ``download`` / ``parse``).
        thread_num: number of worker threads.
        max_retries: a seed is failed once its retry count exceeds this.
        time_sleep: base back-off (seconds) after an ``HTTPError``.
    """

    def __init__(
            self,
            task: str,
            project: str,
            stop: threading.Event,
            pause: threading.Event,
            get_seed: Callable,
            set_seed: Callable,
            add_seed: Callable,
            delete_seed: Callable,
            upload_data: Callable,
            custom_func: Union[Mapping[str, Callable]],
            thread_num: int,
            max_retries: int,
            time_sleep: int,
    ):
        super().__init__()
        self.task = task
        self.project = project
        # Deliberately NOT named ``_stop``: CPython's threading.Thread uses
        # that name for a private method invoked by join()/is_alive() (up to
        # 3.12); shadowing it with an Event makes those calls raise
        # "TypeError: 'Event' object is not callable".
        self._stop_event = stop
        self._pause = pause
        self._get_seed = get_seed
        self._set_seed = set_seed
        self._add_seed = add_seed
        self._delete_seed = delete_seed
        self._upload_data = upload_data

        for func_name, _callable in custom_func.items():
            if callable(_callable):
                setattr(self, func_name, _callable)

        self.thread_num = thread_num
        self.time_sleep = time_sleep
        self.max_retries = max_retries

        self.loghub_dot = LoghubDot()

    @staticmethod
    def request(seed: Seed) -> Union[Request, BaseItem]:
        """Default request hook: one ``Request`` for ``seed.url``."""
        yield Request(seed.url, seed, timeout=5)

    @staticmethod
    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
        """Default download hook: perform the request and wrap the result."""
        response = item.download()
        yield Response(item.seed, response, **item.to_dict)

    @staticmethod
    def parse(item: Response) -> BaseItem:
        """Default parse hook: emit the response fields plus body text as a ``ConsoleItem``."""
        upload_item = item.to_dict
        upload_item["text"] = item.response.text
        yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))

    def distribute(self, item, seed):
        """Route one value yielded by a pipeline stage to the matching callback.

        ``BaseItem`` -> ``upload_data``; ``Seed`` -> ``add_seed``;
        ``DealModel.poll`` -> ``set_seed(seed)``; ``DealModel.done`` /
        ``DealModel.fail`` -> ``delete_seed(seed)`` (``fail`` also sets
        ``seed.params.seed_status``).

        Raises:
            TypeError: for any other yielded value.
        """
        if isinstance(item, BaseItem):
            self._upload_data(item)
        elif isinstance(item, Seed):
            self._add_seed(item)
        elif isinstance(item, str) and item == DealModel.poll:
            self._set_seed(seed)
        elif isinstance(item, str) and item == DealModel.done:
            self._delete_seed(seed)
        elif isinstance(item, str) and item == DealModel.fail:
            seed.params.seed_status = DealModel.fail
            self._delete_seed(seed)
        else:
            raise TypeError("yield value type error!")

    def _log_exception(self, seed, seed_detail_log_info, exception_msg):
        """Write the standard download-exception record for ``seed``."""
        logger.info(LogTemplate.download_exception.format(
            detail=seed_detail_log_info,
            retry=seed.params.retry,
            priority=seed.params.priority,
            seed_version=seed.params.seed_version,
            identifier=seed.identifier or "",
            exception=exception_msg
        ))

    def _crawl(self, seed, seed_detail_log_info):
        """Run the three-stage pipeline once for ``seed``.

        ``iterator_status`` tracks whether the innermost stage that was
        entered yielded anything: it is reset when descending into the next
        stage and set again by that stage's first item.  If it ends up
        false, some stage produced nothing and a ``ValueError`` is raised.
        """
        request_iterators = self.request(seed)
        if not isgenerator(request_iterators):
            raise TypeError("request function isn't a generator!")

        iterator_status = False

        for request_item in request_iterators:
            iterator_status = True

            if not isinstance(request_item, Request):
                self.distribute(request_item, seed)
                continue

            iterator_status = False
            start_time = time.time()
            download_iterators = self.download(request_item)
            if not isgenerator(download_iterators):
                raise TypeError("download function isn't a generator")

            for download_item in download_iterators:
                iterator_status = True

                if not isinstance(download_item, Response):
                    self.distribute(download_item, seed)
                    continue

                iterator_status = False
                logger.info(LogTemplate.download_info.format(
                    detail=seed_detail_log_info,
                    retry=seed.params.retry,
                    priority=seed.params.priority,
                    seed_version=seed.params.seed_version,
                    identifier=seed.identifier or "",
                    status=download_item.response,
                    response=LogTemplate.log_info(download_item.to_dict)
                ))
                if isinstance(download_item.response, Res):
                    end_time = time.time()
                    # NOTE(review): status is reported as a constant 200
                    # rather than response.status_code -- confirm intended.
                    self.loghub_dot.build(
                        topic=urlparse(download_item.response.request.url).netloc,
                        data_size=int(download_item.response.headers.get("content-length", 0)),
                        cost_time=end_time - start_time, status=200,
                        url=download_item.response.url,
                    )
                parse_iterators = self.parse(download_item)
                if not isgenerator(parse_iterators):
                    raise TypeError("parse function isn't a generator")
                for parse_item in parse_iterators:
                    iterator_status = True
                    if isinstance(parse_item, Response):
                        raise TypeError("upload_item can't be a Response instance")
                    self.distribute(parse_item, seed)

        if not iterator_status:
            raise ValueError("request/download/parse function yield value error!")

    def spider(self):
        """Worker loop: process seeds until the stop event is set.

        A seed whose retry count exceeds ``max_retries`` is marked failed and
        deleted.  On ``HTTPError`` the failure is reported to ``loghub_dot``
        (when a real response is attached), the seed is re-queued and the
        worker backs off for ``time_sleep * retry`` seconds.  Any other
        exception is logged and the seed is re-queued without the back-off.
        """
        while not self._stop_event.is_set():

            seed = self._get_seed()

            if not seed:
                time.sleep(1)
                continue

            if seed.params.retry > self.max_retries:
                seed.params.seed_status = DealModel.fail
                self._delete_seed(seed)
                continue

            seed_detail_log_info = LogTemplate.log_info(seed.to_dict)

            try:
                self._crawl(seed, seed_detail_log_info)
            except HTTPError as e:
                exception_msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
                if isinstance(e.response, Res):
                    url = e.response.request.url
                    status = e.response.status_code
                    self.loghub_dot.build(
                        topic=urlparse(url).netloc,
                        data_size=-1, cost_time=-1,
                        status=status, url=url,
                        msg=exception_msg
                    )
                self._log_exception(seed, seed_detail_log_info, exception_msg)
                seed.params.retry += 1
                self._set_seed(seed)
                time.sleep(self.time_sleep * seed.params.retry)
            except Exception as e:
                exception_msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
                self._log_exception(seed, seed_detail_log_info, exception_msg)
                seed.params.retry += 1
                self._set_seed(seed)
            finally:
                time.sleep(0.1)
        logger.info("spider thread close")

    def run(self):
        """Start ``thread_num`` worker threads running :meth:`spider`."""
        for index in range(self.thread_num):
            threading.Thread(name=f"spider_{index}", target=self.spider).start()
209
+
@@ -0,0 +1,98 @@
1
+ import os
2
+ from typing import Union
3
+ from cobweb import setting
4
+ from cobweb.utils import OssUtil
5
+ from cobweb.crawlers import Crawler
6
+ from cobweb.base import Seed, BaseItem, Request, Response
7
+ from cobweb.exceptions import OssDBPutPartError, OssDBMergeError
8
+
9
+
10
+ oss_util = OssUtil(is_path_style=bool(int(os.getenv("PRIVATE_LINK", 0))))
11
+
12
+
13
class FileCrawlerAir(Crawler):
    """Crawler whose download stage streams the fetched file into OSS."""

    @staticmethod
    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
        """Download ``item`` and store it under its ``oss_path`` key.

        Yields a ``Response`` when the object already exists, was filtered
        out (unwanted content type or too small), or was uploaded.  Small
        files are uploaded in one call; everything else goes through a
        multipart upload whose progress (``upload_id``, ``position``,
        ``start``) is tracked in the seed fields.

        If a part upload fails, a new ``Seed`` carrying the current progress
        is yielded; if the final merge fails, a ``Seed`` with
        ``identifier="merge"`` is yielded so that only the merge is retried.
        """
        seed_dict = item.seed.to_dict
        seed_dict["bucket_name"] = oss_util.bucket
        try:
            seed_dict["oss_path"] = key = item.seed.oss_path or getattr(item, "oss_path")

            if oss_util.exists(key):
                seed_dict["data_size"] = oss_util.head(key).content_length
                yield Response(item.seed, "exists", **seed_dict)

            else:
                seed_dict.setdefault("end", "")
                seed_dict.setdefault("start", 0)

                # Resume / partial fetch: ask the server for the byte range.
                if seed_dict["end"] or seed_dict["start"]:
                    start, end = seed_dict["start"], seed_dict["end"]
                    item.request_setting["headers"]['Range'] = f'bytes={start}-{end}'

                if not item.seed.identifier:
                    content = b""
                    chunk_size = oss_util.chunk_size
                    min_upload_size = oss_util.min_upload_size
                    seed_dict.setdefault("position", 1)

                    response = item.download()

                    content_type = response.headers.get("content-type", "").split(";")[0]
                    seed_dict["data_size"] = content_length = int(response.headers.get("content-length", 0))

                    if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
                        # Filter out unwanted response content types.
                        response.close()
                        seed_dict["filter"] = True
                        seed_dict["msg"] = f"response content type is {content_type}"
                        yield Response(item.seed, response, **seed_dict)

                    elif seed_dict['position'] == 1 and min_upload_size >= content_length > 0:
                        # File too small: flag it and return without uploading.
                        response.close()
                        seed_dict["filter"] = True
                        seed_dict["msg"] = "file size is too small"
                        yield Response(item.seed, response, **seed_dict)

                    elif seed_dict['position'] == 1 and chunk_size > content_length > min_upload_size:
                        # Small file: read it fully and upload in one call.
                        content = b"".join(response.iter_content(chunk_size))
                        response.close()
                        oss_util.put(key, content)
                        yield Response(item.seed, response, **seed_dict)

                    else:
                        # Medium/large file (or unknown length): synchronous
                        # multipart upload.
                        seed_dict.setdefault("upload_id", oss_util.init_part(key).upload_id)

                        for part_data in response.iter_content(chunk_size):
                            content += part_data
                            if len(content) >= chunk_size:
                                upload_data = content[:chunk_size]
                                content = content[chunk_size:]
                                # Upload the full chunk that was just cut
                                # off, not the leftover kept in ``content``.
                                oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], upload_data)
                                seed_dict['start'] += len(upload_data)
                                seed_dict['position'] += 1

                        response.close()

                        # Flush whatever is left as the final part.
                        if content:
                            oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], content)
                        oss_util.merge(key, seed_dict["upload_id"])
                        seed_dict["data_size"] = oss_util.head(key).content_length
                        yield Response(item.seed, response, **seed_dict)

                elif item.seed.identifier == "merge":
                    # All parts were uploaded earlier; only the merge is pending.
                    oss_util.merge(key, seed_dict["upload_id"])
                    seed_dict["data_size"] = oss_util.head(key).content_length
                    yield Response(item.seed, "merge", **seed_dict)

        except OssDBPutPartError:
            yield Seed(seed_dict)
        except OssDBMergeError:
            yield Seed(seed_dict, identifier="merge")
97
+
98
+
cobweb/db/__init__.py CHANGED
@@ -1,2 +1,2 @@
1
- from . import oss_db, redis_db
2
- from . import scheduler, storer
1
+ from .redis_db import RedisDB
2
+ from .api_db import ApiDB