cobweb-launcher 1.1.2__tar.gz → 1.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cobweb-launcher might be problematic.

Files changed (38)
  1. {cobweb-launcher-1.1.2/cobweb_launcher.egg-info → cobweb-launcher-1.1.4}/PKG-INFO +1 -1
  2. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/__init__.py +1 -0
  3. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/constant.py +6 -0
  4. cobweb-launcher-1.1.4/cobweb/crawlers/__init__.py +2 -0
  5. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/crawlers/file_crawler.py +29 -30
  6. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/launchers/launcher_pro.py +8 -9
  7. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/setting.py +2 -2
  8. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4/cobweb_launcher.egg-info}/PKG-INFO +1 -1
  9. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/setup.py +1 -1
  10. cobweb-launcher-1.1.2/cobweb/crawlers/__init__.py +0 -2
  11. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/LICENSE +0 -0
  12. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/README.md +0 -0
  13. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/__init__.py +0 -0
  14. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/common_queue.py +0 -0
  15. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/decorators.py +0 -0
  16. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/item.py +0 -0
  17. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/log.py +0 -0
  18. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/request.py +0 -0
  19. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/response.py +0 -0
  20. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/seed.py +0 -0
  21. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/crawlers/base_crawler.py +0 -0
  22. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/db/__init__.py +0 -0
  23. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/db/redis_db.py +0 -0
  24. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/exceptions/__init__.py +0 -0
  25. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/exceptions/oss_db_exception.py +0 -0
  26. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/launchers/__init__.py +0 -0
  27. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/launchers/launcher.py +0 -0
  28. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/pipelines/__init__.py +0 -0
  29. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/pipelines/base_pipeline.py +0 -0
  30. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/pipelines/loghub_pipeline.py +0 -0
  31. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/utils/__init__.py +0 -0
  32. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/utils/oss.py +0 -0
  33. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/utils/tools.py +0 -0
  34. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb_launcher.egg-info/SOURCES.txt +0 -0
  35. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  36. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb_launcher.egg-info/requires.txt +0 -0
  37. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb_launcher.egg-info/top_level.txt +0 -0
  38. {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/setup.cfg +0 -0
{cobweb-launcher-1.1.2/cobweb_launcher.egg-info → cobweb-launcher-1.1.4}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cobweb-launcher
-Version: 1.1.2
+Version: 1.1.4
 Summary: spider_hole
 Home-page: https://github.com/Juannie-PP/cobweb
 Author: Juannie-PP
{cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/__init__.py
@@ -1 +1,2 @@
 from .launchers import Launcher, LauncherPro
+from .constant import Crawler
{cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/constant.py
@@ -1,4 +1,10 @@
 
+class Crawler:
+
+    default = "cobweb.crawlers.Crawler"
+    file_air = "cobweb.crawlers.FileCrawlerAir"
+    file_pro = "cobweb.crawlers.FileCrawlerPro"
+
 
 class LauncherModel:
     task = "launcher model: task"
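The new `Crawler` constants are dotted import paths, matching the string form of the `CRAWLER` setting changed later in this diff (`CRAWLER = "cobweb.crawlers.Crawler"`). As a rough sketch of how such a path can be resolved to a class at runtime — the `load_class` helper below is illustrative, not cobweb's actual loader:

```python
# Illustrative only: cobweb's real loader may differ. This shows how a
# dotted-path constant such as Crawler.file_air can be resolved to a class.
import importlib

def load_class(dotted_path: str):
    """Import 'pkg.module.ClassName' and return the class object."""
    module_path, _, class_name = dotted_path.rpartition(".")
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

# Example: load_class("cobweb.crawlers.FileCrawlerAir") would return the
# class re-exported by the new cobweb/crawlers/__init__.py below.
```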
cobweb-launcher-1.1.4/cobweb/crawlers/__init__.py (new file)
@@ -0,0 +1,2 @@
+from .base_crawler import Crawler
+from .file_crawler import FileCrawlerAir, FileCrawlerPro
{cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/crawlers/file_crawler.py
@@ -7,18 +7,19 @@ from cobweb.base import Seed, BaseItem, Request, Response
 from cobweb.exceptions import OssDBPutPartError, OssDBMergeError
 
 
-class CrawlerAir(Crawler):
-
-    oss_util = OssUtil()
+oss_util = OssUtil()
+
+
+class FileCrawlerAir(Crawler):
 
     @staticmethod
     def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
         seed_dict = item.seed.to_dict
-        bucket_name = CrawlerAir.oss_util.bucket
+        bucket_name = oss_util.bucket
         try:
             key = item.seed.oss_path
-            if CrawlerAir.oss_util.exists(key):
-                content_length = CrawlerAir.oss_util.head(key).content_length
+            if oss_util.exists(key):
+                content_length = oss_util.head(key).content_length
                 yield Response(item.seed, "exists", bucket_name=bucket_name, data_size=content_length, **seed_dict)
 
             end = seed_dict.get("end", "")
@@ -29,8 +30,8 @@ class CrawlerAir(Crawler):
 
             if not item.seed.params.identifier:
                 content = b""
-                chunk_size = CrawlerAir.oss_util.chunk_size
-                min_upload_size = CrawlerAir.oss_util.min_upload_size
+                chunk_size = oss_util.chunk_size
+                min_upload_size = oss_util.min_upload_size
                 position = seed_dict.get("position", 1)
 
                 response = item.download()
@@ -52,21 +53,21 @@ class CrawlerAir(Crawler):
                     """Small files are downloaded directly."""
                     for part_data in response.iter_content(chunk_size):
                         content += part_data
-                    CrawlerAir.oss_util.put(key, content)
+                    oss_util.put(key, content)
                     yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
                     response.close()
                 else:
                     """Medium/large files are downloaded in synchronous multipart chunks."""
                     upload_content_length = 0
                     if not seed_dict.get("upload_id"):
-                        seed_dict["upload_id"] = CrawlerAir.oss_util.init_part(key).upload_id
+                        seed_dict["upload_id"] = oss_util.init_part(key).upload_id
                     upload_id = seed_dict["upload_id"]
                     for part_data in response.iter_content(chunk_size):
                         content += part_data
                         if len(content) >= chunk_size:
                             upload_data = content[:chunk_size]
                             content = content[chunk_size:]
-                            CrawlerAir.oss_util.put_part(key, upload_id, position, upload_data)
+                            oss_util.put_part(key, upload_id, position, upload_data)
                             upload_content_length += len(upload_data)
                             position += 1
                             seed_dict['position'] = position
@@ -74,14 +75,14 @@ class CrawlerAir(Crawler):
 
                     response.close()
                     if content:
-                        CrawlerAir.oss_util.put_part(key, upload_id, position, content)
+                        oss_util.put_part(key, upload_id, position, content)
                         content_length += len(content)
-                    CrawlerAir.oss_util.merge(key, upload_id)
+                    oss_util.merge(key, upload_id)
                     yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
 
             elif item.seed.params.identifier == "merge":
-                CrawlerAir.oss_util.merge(key, seed_dict["upload_id"])
-                content_length = CrawlerAir.oss_util.head(key).content_length
+                oss_util.merge(key, seed_dict["upload_id"])
+                content_length = oss_util.head(key).content_length
                 yield Response(item.seed, "merge", bucket_name=bucket_name, data_size=content_length, **seed_dict)
         except OssDBPutPartError:
             yield Seed(seed_dict)
@@ -89,18 +90,16 @@ class CrawlerAir(Crawler):
             yield Seed(seed_dict, identifier="merge")
 
 
-class CrawlerPro(Crawler):
-
-    oss_util = OssUtil()
+class FileCrawlerPro(Crawler):
 
     @staticmethod
     def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
         seed_dict = item.seed.to_dict
-        bucket_name = CrawlerAir.oss_util.bucket
+        bucket_name = oss_util.bucket
         try:
             key = item.seed.oss_path
-            if CrawlerAir.oss_util.exists(key):
-                content_length = CrawlerAir.oss_util.head(key).content_length
+            if oss_util.exists(key):
+                content_length = oss_util.head(key).content_length
                 yield Response(item.seed, "exists", bucket_name=bucket_name, data_size=content_length, **seed_dict)
 
             end = seed_dict.get("end", "")
@@ -111,8 +110,8 @@ class CrawlerPro(Crawler):
 
             if not item.seed.params.identifier:
                 content = b""
-                chunk_size = CrawlerAir.oss_util.chunk_size
-                min_upload_size = CrawlerAir.oss_util.min_upload_size
+                chunk_size = oss_util.chunk_size
+                min_upload_size = oss_util.min_upload_size
                 position = seed_dict.get("position", 1)
 
                 response = item.download()
@@ -134,21 +133,21 @@ class CrawlerPro(Crawler):
                     """Small files are downloaded directly."""
                     for part_data in response.iter_content(chunk_size):
                         content += part_data
-                    CrawlerAir.oss_util.put(key, content)
+                    oss_util.put(key, content)
                     yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
                     response.close()
                 else:
                     """Medium/large files are downloaded in synchronous multipart chunks."""
                     upload_content_length = 0
                     if not seed_dict.get("upload_id"):
-                        seed_dict["upload_id"] = CrawlerAir.oss_util.init_part(key).upload_id
+                        seed_dict["upload_id"] = oss_util.init_part(key).upload_id
                     upload_id = seed_dict["upload_id"]
                     for part_data in response.iter_content(chunk_size):
                         content += part_data
                         if len(content) >= chunk_size:
                             upload_data = content[:chunk_size]
                             content = content[chunk_size:]
-                            CrawlerAir.oss_util.put_part(key, upload_id, position, upload_data)
+                            oss_util.put_part(key, upload_id, position, upload_data)
                             upload_content_length += len(upload_data)
                             position += 1
                             seed_dict['position'] = position
@@ -156,14 +155,14 @@ class CrawlerPro(Crawler):
 
                     response.close()
                     if content:
-                        CrawlerAir.oss_util.put_part(key, upload_id, position, content)
+                        oss_util.put_part(key, upload_id, position, content)
                         content_length += len(content)
-                    CrawlerAir.oss_util.merge(key, upload_id)
+                    oss_util.merge(key, upload_id)
                     yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
 
             elif item.seed.params.identifier == "merge":
-                CrawlerAir.oss_util.merge(key, seed_dict["upload_id"])
-                content_length = CrawlerAir.oss_util.head(key).content_length
+                oss_util.merge(key, seed_dict["upload_id"])
+                content_length = oss_util.head(key).content_length
                 yield Response(item.seed, "merge", bucket_name=bucket_name, data_size=content_length, **seed_dict)
 
         except OssDBPutPartError:
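Taken together, the file_crawler.py changes rename `CrawlerAir`/`CrawlerPro` to `FileCrawlerAir`/`FileCrawlerPro` and hoist `oss_util = OssUtil()` from a class attribute to a single module-level instance, so `FileCrawlerPro` no longer reaches into the other class's attribute and both crawlers share one OSS client per process. The download flow itself is unchanged: below `min_upload_size` a single `put`, otherwise `init_part`/`put_part`/`merge`. A condensed sketch of that shared flow follows; only the `OssUtil` method names come from the diff, while the `upload_stream` wrapper and its signature are illustrative assumptions:

```python
# Condensed, illustrative version of the upload flow both crawlers share.
# Only the OssUtil method names (put, init_part, put_part, merge) come from
# the diff; the function and its arguments are assumptions.
def upload_stream(oss_util, key, response, chunk_size, min_upload_size, content_length):
    content = b""
    if content_length < min_upload_size:
        # Small file: buffer everything, then a single put.
        for part_data in response.iter_content(chunk_size):
            content += part_data
        oss_util.put(key, content)
        return
    # Medium/large file: synchronous multipart upload.
    upload_id = oss_util.init_part(key).upload_id
    position = 1
    for part_data in response.iter_content(chunk_size):
        content += part_data
        if len(content) >= chunk_size:
            # Flush exactly one full chunk as a numbered part.
            oss_util.put_part(key, upload_id, position, content[:chunk_size])
            content = content[chunk_size:]
            position += 1
    if content:
        # Flush the final partial part before merging.
        oss_util.put_part(key, upload_id, position, content)
    oss_util.merge(key, upload_id)
```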
{cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/launchers/launcher_pro.py
@@ -35,25 +35,24 @@ class LauncherPro(Launcher):
         """
         Check for expired seeds and re-add them to the redis cache.
         """
-        first = True
+        init = True
         while not self._pause.is_set():
-            reset_wait_seconds = 15
+            reset_wait_seconds = 30
+            start_reset_time = int(time.time())
             if self._db.lock(self._reset_lock, t=120):
                 if not self.heartbeat:
                     self._heartbeat_start_event.set()
 
                 _min = -int(time.time()) + self._seed_reset_seconds \
-                    if self.heartbeat or not first else "-inf"
+                    if self.heartbeat or not init else "-inf"
 
-                self._db.members(
-                    self._todo, 0,
-                    _min=_min, _max="(0"
-                )
+                self._db.members(self._todo, 0, _min=_min, _max="(0")
                 self._db.delete(self._reset_lock)
-                reset_wait_seconds = 60
 
+            ttl = 120 - int(time.time()) + start_reset_time
+            reset_wait_seconds = max(ttl, 1)
             time.sleep(reset_wait_seconds)
-            first = False
+            init = False
 
     def _scheduler(self):
         """
{cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/setting.py
@@ -27,7 +27,7 @@ OSS_CHUNK_SIZE = 10 * 1024 ** 2
 OSS_MIN_UPLOAD_SIZE = 1024
 
 # crawler selection
-CRAWLER = "cobweb.crawlers.CrawlerAir"
+CRAWLER = "cobweb.crawlers.Crawler"
 
 # data upload pipeline
 PIPELINE = "cobweb.pipelines.loghub_pipeline.LoghubPipeline"
@@ -39,7 +39,7 @@ TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait time when the todo queue is full
 NEW_QUEUE_WAIT_SECONDS = 30  # new queue wait time
 DONE_QUEUE_WAIT_SECONDS = 15  # done queue wait time
 UPLOAD_QUEUE_WAIT_SECONDS = 15  # upload queue wait time
-SEED_RESET_SECONDS = 600  # seed reset interval
+SEED_RESET_SECONDS = 300  # seed reset interval
 
 
 # Launcher queue lengths
{cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4/cobweb_launcher.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cobweb-launcher
-Version: 1.1.2
+Version: 1.1.4
 Summary: spider_hole
 Home-page: https://github.com/Juannie-PP/cobweb
 Author: Juannie-PP
{cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="cobweb-launcher",
-    version="1.1.2",
+    version="1.1.4",
     packages=find_packages(),
     url="https://github.com/Juannie-PP/cobweb",
     license="MIT",
cobweb-launcher-1.1.2/cobweb/crawlers/__init__.py (deleted)
@@ -1,2 +0,0 @@
-from .base_crawler import Crawler
-from .file_crawler import CrawlerAir
The remaining files (11–38 above, +0 -0) have no content changes; only their top-level version directory was renamed.