cobweb-launcher 1.3.6-py3-none-any.whl → 1.3.8-py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (80)
  1. cobweb/base/__init__.py +9 -9
  2. cobweb/base/dotting.py +1 -1
  3. cobweb/utils/oss.py +7 -7
  4. {cobweb_launcher-1.3.6.dist-info → cobweb_launcher-1.3.8.dist-info}/METADATA +1 -1
  5. cobweb_launcher-1.3.8.dist-info/RECORD +40 -0
  6. cobweb/base/decorators.py +0 -40
  7. cobweb/crawlers/base_crawler.py +0 -144
  8. cobweb/crawlers/file_crawler.py +0 -98
  9. cobweb/pipelines/base_pipeline.py +0 -54
  10. cobweb/pipelines/loghub_pipeline.py +0 -34
  11. cobweb/utils/dotting.py +0 -32
  12. cobweb_/__init__.py +0 -2
  13. cobweb_/base/__init__.py +0 -9
  14. cobweb_/base/common_queue.py +0 -30
  15. cobweb_/base/decorators.py +0 -40
  16. cobweb_/base/item.py +0 -46
  17. cobweb_/base/log.py +0 -94
  18. cobweb_/base/request.py +0 -82
  19. cobweb_/base/response.py +0 -23
  20. cobweb_/base/seed.py +0 -114
  21. cobweb_/constant.py +0 -94
  22. cobweb_/crawlers/__init__.py +0 -1
  23. cobweb_/crawlers/crawler.py +0 -184
  24. cobweb_/db/__init__.py +0 -2
  25. cobweb_/db/api_db.py +0 -82
  26. cobweb_/db/redis_db.py +0 -130
  27. cobweb_/exceptions/__init__.py +0 -1
  28. cobweb_/exceptions/oss_db_exception.py +0 -28
  29. cobweb_/launchers/__init__.py +0 -3
  30. cobweb_/launchers/launcher.py +0 -235
  31. cobweb_/launchers/launcher_air.py +0 -88
  32. cobweb_/launchers/launcher_api.py +0 -221
  33. cobweb_/launchers/launcher_pro.py +0 -222
  34. cobweb_/pipelines/__init__.py +0 -3
  35. cobweb_/pipelines/pipeline.py +0 -69
  36. cobweb_/pipelines/pipeline_console.py +0 -22
  37. cobweb_/pipelines/pipeline_loghub.py +0 -34
  38. cobweb_/setting.py +0 -74
  39. cobweb_/utils/__init__.py +0 -5
  40. cobweb_/utils/bloom.py +0 -58
  41. cobweb_/utils/dotting.py +0 -32
  42. cobweb_/utils/oss.py +0 -94
  43. cobweb_/utils/tools.py +0 -42
  44. cobweb_launcher-1.3.6.dist-info/RECORD +0 -111
  45. cobweb_new/__init__.py +0 -2
  46. cobweb_new/base/__init__.py +0 -72
  47. cobweb_new/base/common_queue.py +0 -53
  48. cobweb_new/base/decorators.py +0 -72
  49. cobweb_new/base/item.py +0 -46
  50. cobweb_new/base/log.py +0 -94
  51. cobweb_new/base/request.py +0 -82
  52. cobweb_new/base/response.py +0 -23
  53. cobweb_new/base/seed.py +0 -118
  54. cobweb_new/constant.py +0 -105
  55. cobweb_new/crawlers/__init__.py +0 -1
  56. cobweb_new/crawlers/crawler-new.py +0 -85
  57. cobweb_new/crawlers/crawler.py +0 -170
  58. cobweb_new/db/__init__.py +0 -2
  59. cobweb_new/db/api_db.py +0 -82
  60. cobweb_new/db/redis_db.py +0 -158
  61. cobweb_new/exceptions/__init__.py +0 -1
  62. cobweb_new/exceptions/oss_db_exception.py +0 -28
  63. cobweb_new/launchers/__init__.py +0 -3
  64. cobweb_new/launchers/launcher.py +0 -237
  65. cobweb_new/launchers/launcher_air.py +0 -88
  66. cobweb_new/launchers/launcher_api.py +0 -161
  67. cobweb_new/launchers/launcher_pro.py +0 -96
  68. cobweb_new/launchers/tesss.py +0 -47
  69. cobweb_new/pipelines/__init__.py +0 -3
  70. cobweb_new/pipelines/pipeline.py +0 -68
  71. cobweb_new/pipelines/pipeline_console.py +0 -22
  72. cobweb_new/pipelines/pipeline_loghub.py +0 -34
  73. cobweb_new/setting.py +0 -95
  74. cobweb_new/utils/__init__.py +0 -5
  75. cobweb_new/utils/bloom.py +0 -58
  76. cobweb_new/utils/oss.py +0 -94
  77. cobweb_new/utils/tools.py +0 -42
  78. {cobweb_launcher-1.3.6.dist-info → cobweb_launcher-1.3.8.dist-info}/LICENSE +0 -0
  79. {cobweb_launcher-1.3.6.dist-info → cobweb_launcher-1.3.8.dist-info}/WHEEL +0 -0
  80. {cobweb_launcher-1.3.6.dist-info → cobweb_launcher-1.3.8.dist-info}/top_level.txt +0 -0
cobweb/base/__init__.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import time
 import traceback
 import threading
@@ -12,7 +13,7 @@ from .basic import Seed, Request, Response
 from .item import BaseItem, ConsoleItem
 # from .seed import Seed
 from .log import logger
-# from .dotting import LoghubDot
+from .dotting import LoghubDot
 
 
 class TaskQueue:
@@ -25,8 +26,7 @@ class TaskQueue:
     DONE = Queue()    # download-complete queue
     UPLOAD = Queue()  # task upload queue
     DELETE = Queue()  # task delete queue
-
-    # DOT = LoghubDot()
+    DOT = LoghubDot()
 
     @staticmethod
     def is_empty():
@@ -43,6 +43,7 @@ class TaskQueue:
     @staticmethod
     def process_task(it: Union[Seed, Request, Response, BaseItem], crawler_func: Callable):
         try:
+            start_time = time.time()
             iterators = crawler_func(it)
             if not isgenerator(iterators):
                 raise TypeError(f"{crawler_func.__name__} function isn't a generator")
@@ -57,12 +58,11 @@ class TaskQueue:
                 TaskQueue.SEED.push(tk)
             else:
                 raise TypeError(f"{crawler_func.__name__} function return type isn't supported")
-            # TaskQueue.DOT.build(
-            #     topic=f"{self.project}:{self.task}",
-            #     cost_time=end_time - start_time,
-            #     **download_item.to_dict
-            # )
-            # todo: metrics instrumentation
+            TaskQueue.DOT.build(
+                topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
+                cost_time=round(time.time() - start_time, 2),
+                **tk.to_dict
+            )
         except Exception as e:
             it.params.retry += 1
             if isinstance(it, Request):
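Note on the re-enabled instrumentation above: 1.3.8 builds the dot topic from the PROJECT and TASK environment variables rather than instance attributes, and times the full drain of the crawler generator. A minimal sketch of how that payload is assembled (the env values are hypothetical; if either variable is unset, os.getenv returns None and the topic degrades to the string "None:None"):

# Hedged sketch, not part of the package: how topic and cost_time passed to
# TaskQueue.DOT.build() are assembled. PROJECT/TASK values are illustrative.
import os
import time

os.environ["PROJECT"] = "databee"   # assumption for illustration
os.environ["TASK"] = "image-sync"   # assumption for illustration

start_time = time.time()
time.sleep(0.05)                    # stand-in for draining crawler_func(it)

topic = f"{os.getenv('PROJECT')}:{os.getenv('TASK')}"
cost_time = round(time.time() - start_time, 2)
print(topic, cost_time)             # e.g. databee:image-sync 0.05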
cobweb/base/dotting.py CHANGED
@@ -27,7 +27,7 @@ class LoghubDot:
         log_items.append(log_item)
         request = PutLogsRequest(
             project="databee-download-log",
-            logstore="cobweb_log",
+            logstore="download-logging",
             topic=topic,
             logitems=log_items,
             compress=True
cobweb/utils/oss.py CHANGED
@@ -1,9 +1,9 @@
-from typing import List
+# from typing import List
 from cobweb import setting
 from requests import Response
 from oss2 import Auth, Bucket, models, PartIterator
 from cobweb.exceptions import oss_db_exception
-from cobweb.base.decorators import decorator_oss_db
+from cobweb.base import Decorators
 
 
 class OssUtil:
@@ -40,17 +40,17 @@ class OssUtil:
     def head(self, key: str) -> models.HeadObjectResult:
         return self._client.head_object(key)
 
-    @decorator_oss_db(exception=oss_db_exception.OssDBInitPartError)
+    @Decorators.decorator_oss_db(exception=oss_db_exception.OssDBInitPartError)
     def init_part(self, key) -> models.InitMultipartUploadResult:
         """Initialize a multipart upload."""
         return self._client.init_multipart_upload(key)
 
-    @decorator_oss_db(exception=oss_db_exception.OssDBPutObjError)
+    @Decorators.decorator_oss_db(exception=oss_db_exception.OssDBPutObjError)
     def put(self, key, data) -> models.PutObjectResult:
         """Upload an object."""
         return self._client.put_object(key, data)
 
-    @decorator_oss_db(exception=oss_db_exception.OssDBPutPartError)
+    @Decorators.decorator_oss_db(exception=oss_db_exception.OssDBPutPartError)
     def put_part(self, key, upload_id, position, data) -> models.PutObjectResult:
         """Upload a single part."""
         return self._client.upload_part(key, upload_id, position, data)
@@ -59,13 +59,13 @@ class OssUtil:
         """List uploaded parts."""
         return [part_info for part_info in PartIterator(self._client, key, upload_id)]
 
-    @decorator_oss_db(exception=oss_db_exception.OssDBMergeError)
+    @Decorators.decorator_oss_db(exception=oss_db_exception.OssDBMergeError)
     def merge(self, key, upload_id, parts=None) -> models.PutObjectResult:
         """Complete a multipart upload."""
         headers = None if parts else {"x-oss-complete-all": "yes"}
         return self._client.complete_multipart_upload(key, upload_id, parts, headers=headers)
 
-    @decorator_oss_db(exception=oss_db_exception.OssDBAppendObjError)
+    @Decorators.decorator_oss_db(exception=oss_db_exception.OssDBAppendObjError)
     def append(self, key, position, data) -> models.AppendObjectResult:
         """Append upload."""
         return self._client.append_object(key, position, data)
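The import swap above moves decorator_oss_db from the standalone module cobweb.base.decorators (deleted later in this diff) onto a Decorators namespace exported by cobweb.base. The 1.3.8 implementation of that namespace is not visible in this diff; a minimal sketch of what it might look like, reusing the deleted decorator's contract:

# Hedged sketch of the assumed Decorators namespace; the real cobweb.base
# code is not shown in this diff. Retry semantics mirror the deleted
# decorator_oss_db: retry on any exception, then wrap the last error.
from functools import wraps


class Decorators:

    @staticmethod
    def decorator_oss_db(exception, retries=3):
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                last_error = None
                for _ in range(retries):
                    try:
                        return func(*args, **kwargs)
                    except Exception as e:
                        last_error = e
                raise exception(last_error)
            return wrapper
        return decorator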
{cobweb_launcher-1.3.6.dist-info → cobweb_launcher-1.3.8.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cobweb-launcher
-Version: 1.3.6
+Version: 1.3.8
 Summary: spider_hole
 Home-page: https://github.com/Juannie-PP/cobweb
 Author: Juannie-PP
cobweb_launcher-1.3.8.dist-info/RECORD ADDED
@@ -0,0 +1,40 @@
+cobweb/__init__.py,sha256=oaEfsGUuGP0s39UbFRwrnsjMUeuB6QvQIAwStKFyUTk,83
+cobweb/constant.py,sha256=eofONAntk9O6S-cb4KbYGYHL_u7nBlOqqFOw_HzJHAU,3588
+cobweb/setting.py,sha256=pY6LKsgWI3164GiGA1z_y26LVf5-3mpiEgmm86mKRdY,3135
+cobweb/base/__init__.py,sha256=HHxaDy0x6HKOq5BPhl8eNgqPx06-XYWe-WrM0GQZDDY,5108
+cobweb/base/basic.py,sha256=s5G4LBZiLUfoymV-gLSIqeH-OJ7q7-L35sBa6xEH3EI,7666
+cobweb/base/common_queue.py,sha256=Gor7sR3h1hlZWaI0XcNAbf0S15Ftjr3DFRWNTGL13uU,1137
+cobweb/base/dotting.py,sha256=lfFXXqnVP__hxlW3qH5Bnuq69KtnFaQLbcz1M8e2Ajg,1239
+cobweb/base/item.py,sha256=hYheVTV2Bozp4iciJpE2ZwBIXkaqBg4QQkRccP8yoVk,1049
+cobweb/base/log.py,sha256=L01hXdk3L2qEm9X1FOXQ9VmWIoHSELe0cyZvrdAN61A,2003
+cobweb/base/request.py,sha256=acGm3OzxsPed5VUTk7D9eeHZPMh7KUNQRUv44G5znZg,2659
+cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
+cobweb/base/seed.py,sha256=PN5J4gKPEXylwyQeSGOBfauxHktxFr7RJe8nVX1hBw4,2987
+cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
+cobweb/crawlers/crawler.py,sha256=ZQ6yVA1EaQRdKJEY3DNqShzp9HPMwlSXapnsRW9E5Wc,2987
+cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
+cobweb/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
+cobweb/db/redis_db.py,sha256=FvMzckJtmhwKhZqKoS23iXmJti5P2dnMVD5rJ__5LUw,5139
+cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
+cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
+cobweb/launchers/__init__.py,sha256=m_XNG2bWuMbirPt3d0_s-Ezl1xycfUxeqZnwq_kkfuo,116
+cobweb/launchers/launcher.py,sha256=NFwpc_0Um0hbDm1A8glWA4fcW6mNYL1eon4t3JAQUlw,7411
+cobweb/launchers/launcher_air.py,sha256=yPr395HVIIHAq6lqRcYJu7c0KkfO9V8O-2sn0hC96p0,2990
+cobweb/launchers/launcher_api.py,sha256=TfLrLXazFWsOJLI7caMGfZozCttL1WTwTo3uUpN_FV0,3370
+cobweb/launchers/launcher_pro.py,sha256=2H-TcvQx-ga78GLNTa-GXMLYAj9nEeCJSWf8xl-1ISQ,3374
+cobweb/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
+cobweb/pipelines/pipeline.py,sha256=Pycm22bHId9a3gdP81D5y7SsuMndYooTb5n4zQxP7dM,1321
+cobweb/pipelines/pipeline_console.py,sha256=NEh-4zhuVAQOqwXLsqeb-rcNZ9_KXFUpL3otUTL5qBs,754
+cobweb/pipelines/pipeline_loghub.py,sha256=xZ6D55BGdiM71WUv83jyLGbEyUwhBHLJRZoXthBxxTs,1019
+cobweb/schedulers/__init__.py,sha256=y7Lv_7b0zfTl0OhIONb_8u1K1C9gVlBA-xz_XG_kI9g,85
+cobweb/schedulers/scheduler_api.py,sha256=mC54QOS0PEu4SFvxfD5Qr9239hAxwMrKTg-33rirANE,2112
+cobweb/schedulers/scheduler_redis.py,sha256=Aw7de0sXigRAxJgqUhHWu30hMBzgEWjkj-3OXXqmldg,2118
+cobweb/utils/__init__.py,sha256=YvD4mIDBd9jmGA6WJBcwkgDU2jRFNBCEbarZCSUBAHE,114
+cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
+cobweb/utils/oss.py,sha256=6Qlhdde7CcwD69bBe2rGWHY3-aptG9NXB_DZLhjgDRQ,3553
+cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
+cobweb_launcher-1.3.8.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
+cobweb_launcher-1.3.8.dist-info/METADATA,sha256=vxO6miknf7w1rom7AF3FubCwBiiYCHgS5xXVGKNdQdk,6509
+cobweb_launcher-1.3.8.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
+cobweb_launcher-1.3.8.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
+cobweb_launcher-1.3.8.dist-info/RECORD,,
cobweb/base/decorators.py DELETED
@@ -1,40 +0,0 @@
-from functools import wraps
-
-
-# def check_redis_status(func):
-#     @wraps(func)
-#     def wrapper(*args, **kwargs):
-#         try:
-#             result = func(*args, **kwargs)
-#         except Exception:
-#             result = False
-#         return result
-#
-#     return wrapper
-
-
-def decorator_oss_db(exception, retries=3):
-    def decorator(func):
-        @wraps(func)
-        def wrapper(callback_func, *args, **kwargs):
-            result = None
-            for i in range(retries):
-                msg = None
-                try:
-                    return func(callback_func, *args, **kwargs)
-                except Exception as e:
-                    result = None
-                    msg = e
-                finally:
-                    if result:
-                        return result
-
-                    if i >= 2 and msg:
-                        raise exception(msg)
-
-        return wrapper
-
-    return decorator
-
-
-
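For reference, the observable behavior of the deleted decorator: each call is re-attempted on failure, and only after the third attempt (the i >= 2 check is hardcoded, regardless of the retries argument) is the last exception re-raised, wrapped in the supplied exception type. A hedged demo, assuming the deleted decorator_oss_db above is in scope; WrappedOssError and flaky_call are illustrative stand-ins:

# Hedged demo of the deleted decorator's retry semantics.
class WrappedOssError(Exception):
    pass


attempts = []


@decorator_oss_db(exception=WrappedOssError)
def flaky_call(self):
    attempts.append(1)
    raise RuntimeError("simulated OSS failure")


try:
    flaky_call(None)  # first positional arg fills the wrapper's callback_func slot
except WrappedOssError as e:
    print(f"failed after {len(attempts)} attempts: {e}")  # failed after 3 attempts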
cobweb/crawlers/base_crawler.py DELETED
@@ -1,144 +0,0 @@
-import threading
-import time
-import traceback
-
-from inspect import isgenerator
-from typing import Union, Callable, Mapping
-
-from cobweb.base import Queue, Seed, BaseItem, Request, Response, logger
-from cobweb.constant import DealModel, LogTemplate
-from cobweb.utils import download_log_info
-from cobweb import setting
-
-
-class Crawler(threading.Thread):
-
-    def __init__(
-        self,
-        upload_queue: Queue,
-        custom_func: Union[Mapping[str, Callable]],
-        launcher_queue: Union[Mapping[str, Queue]],
-    ):
-        super().__init__()
-
-        self.upload_queue = upload_queue
-        for func_name, _callable in custom_func.items():
-            if isinstance(_callable, Callable):
-                self.__setattr__(func_name, _callable)
-
-        self.launcher_queue = launcher_queue
-
-        self.spider_thread_num = setting.SPIDER_THREAD_NUM
-        self.max_retries = setting.SPIDER_MAX_RETRIES
-
-    @staticmethod
-    def request(seed: Seed) -> Union[Request, BaseItem]:
-        stream = True if setting.DOWNLOAD_MODEL else False
-        yield Request(seed.url, seed, stream=stream, timeout=5)
-
-    @staticmethod
-    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
-        response = item.download()
-        yield Response(item.seed, response, **item.to_dict)
-
-    @staticmethod
-    def parse(item: Response) -> BaseItem:
-        pass
-
-    def get_seed(self) -> Seed:
-        return self.launcher_queue['todo'].pop()
-
-    def distribute(self, item, seed):
-        if isinstance(item, BaseItem):
-            self.upload_queue.push(item)
-        elif isinstance(item, Seed):
-            self.launcher_queue['new'].push(item)
-        elif isinstance(item, str) and item == DealModel.poll:
-            self.launcher_queue['todo'].push(seed)
-        elif isinstance(item, str) and item == DealModel.done:
-            self.launcher_queue['done'].push(seed)
-        elif isinstance(item, str) and item == DealModel.fail:
-            seed.params.seed_status = DealModel.fail
-            self.launcher_queue['done'].push(seed)
-        else:
-            raise TypeError("yield value type error!")
-
-    def spider(self):
-        while True:
-            seed = self.get_seed()
-
-            if not seed:
-                continue
-
-            elif seed.params.retry >= self.max_retries:
-                seed.params.seed_status = DealModel.fail
-                self.launcher_queue['done'].push(seed)
-                continue
-
-            seed_detail_log_info = download_log_info(seed.to_dict)
-
-            try:
-                request_iterators = self.request(seed)
-
-                if not isgenerator(request_iterators):
-                    raise TypeError("request function isn't a generator!")
-
-                iterator_status = False
-
-                for request_item in request_iterators:
-
-                    iterator_status = True
-
-                    if isinstance(request_item, Request):
-                        iterator_status = False
-                        download_iterators = self.download(request_item)
-                        if not isgenerator(download_iterators):
-                            raise TypeError("download function isn't a generator")
-
-                        for download_item in download_iterators:
-                            iterator_status = True
-                            if isinstance(download_item, Response):
-                                iterator_status = False
-                                logger.info(LogTemplate.download_info.format(
-                                    detail=seed_detail_log_info,
-                                    retry=seed.params.retry,
-                                    priority=seed.params.priority,
-                                    seed_version=seed.params.seed_version,
-                                    identifier=seed.identifier or "",
-                                    status=download_item.response,
-                                    response=download_log_info(download_item.to_dict)
-                                ))
-                                parse_iterators = self.parse(download_item)
-                                if not isgenerator(parse_iterators):
-                                    raise TypeError("parse function isn't a generator")
-                                for parse_item in parse_iterators:
-                                    iterator_status = True
-                                    if isinstance(parse_item, Response):
-                                        raise TypeError("upload_item can't be a Response instance")
-                                    self.distribute(parse_item, seed)
-                            else:
-                                self.distribute(download_item, seed)
-                    else:
-                        self.distribute(request_item, seed)
-
-                if not iterator_status:
-                    raise ValueError("request/download/parse function yield value error!")
-
-            except Exception as e:
-                logger.info(LogTemplate.download_exception.format(
-                    detail=seed_detail_log_info,
-                    retry=seed.params.retry,
-                    priority=seed.params.priority,
-                    seed_version=seed.params.seed_version,
-                    identifier=seed.identifier or "",
-                    exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
-                ))
-                seed.params.retry += 1
-                self.launcher_queue['todo'].push(seed)
-            finally:
-                time.sleep(0.1)
-
-    def run(self):
-        for index in range(self.spider_thread_num):
-            threading.Thread(name=f"spider_{index}", target=self.spider).start()
-
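The deleted Crawler above chained three generator stages (request → download → parse) and routed every yielded value through distribute(). A hedged summary of that routing contract, with plain Python stand-ins for the cobweb types and queues:

# Hedged stand-in for the deleted distribute() routing; queues are plain
# lists and BaseItem/Seed are simplified classes, not the real cobweb types.
class BaseItem: ...
class Seed: ...

POLL, DONE, FAIL = "poll", "done", "fail"  # mirrors the DealModel constants

def distribute(item, seed, upload, todo, new, done):
    if isinstance(item, BaseItem):
        upload.append(item)            # parsed data -> upload queue
    elif isinstance(item, Seed):
        new.append(item)               # newly discovered seed -> new queue
    elif item == POLL:
        todo.append(seed)              # re-queue the current seed
    elif item in (DONE, FAIL):
        done.append(seed)              # finished (FAIL also marks seed_status)
    else:
        raise TypeError("yield value type error!")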
cobweb/crawlers/file_crawler.py DELETED
@@ -1,98 +0,0 @@
-import os
-from typing import Union
-from cobweb import setting
-from cobweb.utils import OssUtil
-from cobweb.crawlers import Crawler
-from cobweb.base import Seed, BaseItem, Request, Response
-from cobweb.exceptions import OssDBPutPartError, OssDBMergeError
-
-
-oss_util = OssUtil(is_path_style=bool(int(os.getenv("PRIVATE_LINK", 0))))
-
-
-class FileCrawlerAir(Crawler):
-
-    @staticmethod
-    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
-        seed_dict = item.seed.to_dict
-        seed_dict["bucket_name"] = oss_util.bucket
-        try:
-            seed_dict["oss_path"] = key = item.seed.oss_path or getattr(item, "oss_path")
-
-            if oss_util.exists(key):
-                seed_dict["data_size"] = oss_util.head(key).content_length
-                yield Response(item.seed, "exists", **seed_dict)
-
-            else:
-                seed_dict.setdefault("end", "")
-                seed_dict.setdefault("start", 0)
-
-                if seed_dict["end"] or seed_dict["start"]:
-                    start, end = seed_dict["start"], seed_dict["end"]
-                    item.request_setting["headers"]['Range'] = f'bytes={start}-{end}'
-
-                if not item.seed.identifier:
-                    content = b""
-                    chunk_size = oss_util.chunk_size
-                    min_upload_size = oss_util.min_upload_size
-                    seed_dict.setdefault("position", 1)
-
-                    response = item.download()
-
-                    content_type = response.headers.get("content-type", "").split(";")[0]
-                    seed_dict["data_size"] = content_length = int(response.headers.get("content-length", 0))
-
-                    if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
-                        """Filter by response content type"""
-                        response.close()
-                        seed_dict["filter"] = True
-                        seed_dict["msg"] = f"response content type is {content_type}"
-                        yield Response(item.seed, response, **seed_dict)
-
-                    elif seed_dict['position'] == 1 and min_upload_size >= content_length > 0:
-                        """Flag files that are too small"""
-                        response.close()
-                        seed_dict["filter"] = True
-                        seed_dict["msg"] = "file size is too small"
-                        yield Response(item.seed, response, **seed_dict)
-
-                    elif seed_dict['position'] == 1 and chunk_size > content_length > min_upload_size:
-                        """Download small files directly"""
-                        for part_data in response.iter_content(chunk_size):
-                            content += part_data
-                        response.close()
-                        oss_util.put(key, content)
-                        yield Response(item.seed, response, **seed_dict)
-
-                    else:
-                        """Synchronous multipart download for medium and large files"""
-                        seed_dict.setdefault("upload_id", oss_util.init_part(key).upload_id)
-
-                        for part_data in response.iter_content(chunk_size):
-                            content += part_data
-                            if len(content) >= chunk_size:
-                                upload_data = content[:chunk_size]
-                                content = content[chunk_size:]
-                                oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], content)
-                                seed_dict['start'] += len(upload_data)
-                                seed_dict['position'] += 1
-
-                        response.close()
-
-                        if content:
-                            oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], content)
-                        oss_util.merge(key, seed_dict["upload_id"])
-                        seed_dict["data_size"] = oss_util.head(key).content_length
-                        yield Response(item.seed, response, **seed_dict)
-
-                elif item.seed.identifier == "merge":
-                    oss_util.merge(key, seed_dict["upload_id"])
-                    seed_dict["data_size"] = oss_util.head(key).content_length
-                    yield Response(item.seed, "merge", **seed_dict)
-
-        except OssDBPutPartError:
-            yield Seed(seed_dict)
-        except OssDBMergeError:
-            yield Seed(seed_dict, identifier="merge")
-
-
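The deleted FileCrawlerAir.download picked an OSS upload strategy from the response's content-length. A hedged distillation of that decision table; the thresholds below are illustrative stand-ins for the real OssUtil settings:

# Hedged distillation of the deleted size-based upload strategy.
def choose_strategy(content_length: int, chunk_size: int = 1 << 20,
                    min_upload_size: int = 1 << 10) -> str:
    if 0 < content_length <= min_upload_size:
        return "filter: file size is too small"
    if min_upload_size < content_length < chunk_size:
        return "put: single put_object call"
    # unknown (0) or large lengths fall through to multipart
    return "multipart: init_part -> put_part xN -> merge"

print(choose_strategy(512))          # filter: file size is too small
print(choose_strategy(100_000))      # put: single put_object call
print(choose_strategy(50_000_000))   # multipart: init_part -> put_part xN -> merge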
cobweb/pipelines/base_pipeline.py DELETED
@@ -1,54 +0,0 @@
-import time
-import threading
-
-from abc import ABC, abstractmethod
-from cobweb.base import BaseItem, Queue, logger
-
-
-class Pipeline(threading.Thread, ABC):
-
-    def __init__(
-        self,
-        done_queue: Queue,
-        upload_queue: Queue,
-        upload_queue_size: int,
-        upload_wait_seconds: int
-    ):
-        super().__init__()
-        self.done_queue = done_queue
-        self.upload_queue = upload_queue
-        self.upload_queue_size = upload_queue_size
-        self.upload_wait_seconds = upload_wait_seconds
-
-    @abstractmethod
-    def build(self, item: BaseItem) -> dict:
-        pass
-
-    @abstractmethod
-    def upload(self, table: str, data: list) -> bool:
-        pass
-
-    def run(self):
-        while True:
-            status = self.upload_queue.length < self.upload_queue_size
-            if status:
-                time.sleep(self.upload_wait_seconds)
-            data_info, seeds = {}, []
-            for _ in range(self.upload_queue_size):
-                item = self.upload_queue.pop()
-                if not item:
-                    break
-                data = self.build(item)
-                seeds.append(item.seed)
-                data_info.setdefault(item.table, []).append(data)
-            for table, datas in data_info.items():
-                try:
-                    self.upload(table, datas)
-                    status = True
-                except Exception as e:
-                    logger.info(e)
-                    status = False
-            if status:
-                self.done_queue.push(seeds)
-
-
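The deleted Pipeline.run loop drained up to upload_queue_size items per pass, grouped the built rows by target table, and acknowledged the batch's seeds into done_queue only if the uploads succeeded. A hedged sketch of one pass, with callables standing in for the queue and the abstract methods:

# Hedged sketch of one flush pass; pop/build/upload are stand-ins. Note the
# deleted loop actually kept only the *last* table's upload status when
# deciding whether to acknowledge the seeds; this sketch blocks on any failure.
from collections import defaultdict

def flush_once(pop, build, upload, batch_size):
    data_info, seeds = defaultdict(list), []
    for _ in range(batch_size):
        item = pop()
        if not item:
            break
        seeds.append(item["seed"])                 # stand-in for item.seed
        data_info[item["table"]].append(build(item))
    ok = True
    for table, rows in data_info.items():
        try:
            upload(table, rows)
        except Exception:
            ok = False
    return seeds if ok else []                     # acknowledged seeds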
cobweb/pipelines/loghub_pipeline.py DELETED
@@ -1,34 +0,0 @@
-import json
-
-from cobweb import setting
-from cobweb.base import BaseItem
-from cobweb.pipelines import Pipeline
-from aliyun.log import LogClient, LogItem, PutLogsRequest
-
-
-class LoghubPipeline(Pipeline):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.client = LogClient(**setting.LOGHUB_CONFIG)
-
-    def build(self, item: BaseItem):
-        log_item = LogItem()
-        temp = item.to_dict
-        for key, value in temp.items():
-            if not isinstance(value, str):
-                temp[key] = json.dumps(value, ensure_ascii=False)
-        contents = sorted(temp.items())
-        log_item.set_contents(contents)
-        return log_item
-
-    def upload(self, table, datas):
-        request = PutLogsRequest(
-            project=setting.LOGHUB_PROJECT,
-            logstore=table,
-            topic=setting.LOGHUB_TOPIC,
-            source=setting.LOGHUB_SOURCE,
-            logitems=datas,
-            compress=True
-        )
-        self.client.put_logs(request=request)
cobweb/utils/dotting.py DELETED
@@ -1,32 +0,0 @@
-import json
-
-from aliyun.log import LogClient, LogItem, PutLogsRequest
-from cobweb import setting
-
-
-class LoghubDot:
-
-    def __init__(self):
-        self.client = LogClient(**setting.LOGHUB_CONFIG)
-
-    def build(self, topic, **kwargs):
-
-        temp = {}
-        log_items = []
-        log_item = LogItem()
-        for key, value in kwargs.items():
-            if not isinstance(value, str):
-                temp[key] = json.dumps(value, ensure_ascii=False)
-            else:
-                temp[key] = value
-        contents = sorted(temp.items())
-        log_item.set_contents(contents)
-        log_items.append(log_item)
-        request = PutLogsRequest(
-            project="databee-download-log",
-            logstore="cobweb_log",
-            topic=topic,
-            logitems=log_items,
-            compress=True
-        )
-        self.client.put_logs(request=request)
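Both the deleted pipeline and the deleted dotting helper normalize their payload the same way before handing it to LogItem.set_contents: non-string values are JSON-encoded and the key/value pairs are sorted. A hedged, dependency-free sketch of that normalization:

# Hedged sketch of the shared payload normalization; no aliyun.log import
# is needed to demonstrate it.
import json

def normalize(fields: dict) -> list:
    temp = {
        key: value if isinstance(value, str) else json.dumps(value, ensure_ascii=False)
        for key, value in fields.items()
    }
    return sorted(temp.items())  # what LogItem.set_contents would receive

print(normalize({"status": 200, "url": "https://example.com", "meta": {"a": 1}}))
# [('meta', '{"a": 1}'), ('status', '200'), ('url', 'https://example.com')]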
cobweb_/__init__.py DELETED
@@ -1,2 +0,0 @@
-from .launchers import LauncherAir, LauncherPro, LauncherApi
-from .constant import CrawlerModel
cobweb_/base/__init__.py DELETED
@@ -1,9 +0,0 @@
-from .common_queue import Queue
-from .response import Response
-from .request import Request
-from .item import BaseItem, ConsoleItem
-from .seed import Seed
-
-from .log import logger
-from .decorators import decorator_oss_db
-
cobweb_/base/common_queue.py DELETED
@@ -1,30 +0,0 @@
-from collections import deque
-
-
-class Queue:
-
-    def __init__(self):
-        self._queue = deque()
-
-    @property
-    def length(self) -> int:
-        return len(self._queue)
-
-    def push(self, data, left: bool = False, direct_insertion: bool = False):
-        try:
-            if not data:
-                return None
-            if not direct_insertion and any(isinstance(data, t) for t in (list, tuple)):
-                self._queue.extendleft(data) if left else self._queue.extend(data)
-            else:
-                self._queue.appendleft(data) if left else self._queue.append(data)
-        except AttributeError:
-            pass
-
-    def pop(self, left: bool = True):
-        try:
-            return self._queue.popleft() if left else self._queue.pop()
-        except IndexError:
-            return None
-        except AttributeError:
-            return None
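The deleted Queue wrapper gives deque two crawler-friendly behaviors worth noting: push flattens lists and tuples unless direct_insertion=True, and pop returns None instead of raising on an empty queue. A brief usage sketch against the code above:

# Usage sketch for the deleted Queue wrapper (assumes the class above).
q = Queue()
q.push([1, 2, 3])                      # list is flattened: three entries
q.push((4, 5), direct_insertion=True)  # tuple stored as a single entry
q.push(0)                              # falsy data is silently dropped
print(q.length)                        # 4
print(q.pop())                         # 1  (FIFO: popleft by default)
print([q.pop() for _ in range(4)])     # [2, 3, (4, 5), None] - empty pop gives None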
cobweb_/base/decorators.py DELETED
@@ -1,40 +0,0 @@
-from functools import wraps
-
-
-# def check_redis_status(func):
-#     @wraps(func)
-#     def wrapper(*args, **kwargs):
-#         try:
-#             result = func(*args, **kwargs)
-#         except Exception:
-#             result = False
-#         return result
-#
-#     return wrapper
-
-
-def decorator_oss_db(exception, retries=3):
-    def decorator(func):
-        @wraps(func)
-        def wrapper(callback_func, *args, **kwargs):
-            result = None
-            for i in range(retries):
-                msg = None
-                try:
-                    return func(callback_func, *args, **kwargs)
-                except Exception as e:
-                    result = None
-                    msg = e
-                finally:
-                    if result:
-                        return result
-
-                    if i >= 2 and msg:
-                        raise exception(msg)
-
-        return wrapper
-
-    return decorator
-
-
-