cobweb-launcher 1.1.2__tar.gz → 1.1.4__tar.gz
This diff shows the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Potentially problematic release.
This version of cobweb-launcher might be problematic. More details are linked from the original release page.
- {cobweb-launcher-1.1.2/cobweb_launcher.egg-info → cobweb-launcher-1.1.4}/PKG-INFO +1 -1
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/__init__.py +1 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/constant.py +6 -0
- cobweb-launcher-1.1.4/cobweb/crawlers/__init__.py +2 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/crawlers/file_crawler.py +29 -30
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/launchers/launcher_pro.py +8 -9
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/setting.py +2 -2
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4/cobweb_launcher.egg-info}/PKG-INFO +1 -1
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/setup.py +1 -1
- cobweb-launcher-1.1.2/cobweb/crawlers/__init__.py +0 -2
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/LICENSE +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/README.md +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/__init__.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/common_queue.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/decorators.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/item.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/log.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/request.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/response.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/base/seed.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/crawlers/base_crawler.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/db/redis_db.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/launchers/__init__.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/launchers/launcher.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/pipelines/__init__.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/pipelines/base_pipeline.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/pipelines/loghub_pipeline.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/utils/__init__.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb/utils/tools.py +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb_launcher.egg-info/SOURCES.txt +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/setup.cfg +0 -0
|
@@ -7,18 +7,19 @@ from cobweb.base import Seed, BaseItem, Request, Response
|
|
|
7
7
|
from cobweb.exceptions import OssDBPutPartError, OssDBMergeError
|
|
8
8
|
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
oss_util = OssUtil()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class FileCrawlerAir(Crawler):
|
|
13
14
|
|
|
14
15
|
@staticmethod
|
|
15
16
|
def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
|
|
16
17
|
seed_dict = item.seed.to_dict
|
|
17
|
-
bucket_name =
|
|
18
|
+
bucket_name = oss_util.bucket
|
|
18
19
|
try:
|
|
19
20
|
key = item.seed.oss_path
|
|
20
|
-
if
|
|
21
|
-
content_length =
|
|
21
|
+
if oss_util.exists(key):
|
|
22
|
+
content_length = oss_util.head(key).content_length
|
|
22
23
|
yield Response(item.seed, "exists", bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
23
24
|
|
|
24
25
|
end = seed_dict.get("end", "")
|
|
@@ -29,8 +30,8 @@ class CrawlerAir(Crawler):
|
|
|
29
30
|
|
|
30
31
|
if not item.seed.params.identifier:
|
|
31
32
|
content = b""
|
|
32
|
-
chunk_size =
|
|
33
|
-
min_upload_size =
|
|
33
|
+
chunk_size = oss_util.chunk_size
|
|
34
|
+
min_upload_size = oss_util.min_upload_size
|
|
34
35
|
position = seed_dict.get("position", 1)
|
|
35
36
|
|
|
36
37
|
response = item.download()
|
|
@@ -52,21 +53,21 @@ class CrawlerAir(Crawler):
|
|
|
52
53
|
"""小文件直接下载"""
|
|
53
54
|
for part_data in response.iter_content(chunk_size):
|
|
54
55
|
content += part_data
|
|
55
|
-
|
|
56
|
+
oss_util.put(key, content)
|
|
56
57
|
yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
57
58
|
response.close()
|
|
58
59
|
else:
|
|
59
60
|
"""中大文件同步分片下载"""
|
|
60
61
|
upload_content_length = 0
|
|
61
62
|
if not seed_dict.get("upload_id"):
|
|
62
|
-
seed_dict["upload_id"] =
|
|
63
|
+
seed_dict["upload_id"] = oss_util.init_part(key).upload_id
|
|
63
64
|
upload_id = seed_dict["upload_id"]
|
|
64
65
|
for part_data in response.iter_content(chunk_size):
|
|
65
66
|
content += part_data
|
|
66
67
|
if len(content) >= chunk_size:
|
|
67
68
|
upload_data = content[:chunk_size]
|
|
68
69
|
content = content[chunk_size:]
|
|
69
|
-
|
|
70
|
+
oss_util.put_part(key, upload_id, position, upload_data)
|
|
70
71
|
upload_content_length += len(upload_data)
|
|
71
72
|
position += 1
|
|
72
73
|
seed_dict['position'] = position
|
|
@@ -74,14 +75,14 @@ class CrawlerAir(Crawler):
|
|
|
74
75
|
|
|
75
76
|
response.close()
|
|
76
77
|
if content:
|
|
77
|
-
|
|
78
|
+
oss_util.put_part(key, upload_id, position, content)
|
|
78
79
|
content_length += len(content)
|
|
79
|
-
|
|
80
|
+
oss_util.merge(key, upload_id)
|
|
80
81
|
yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
81
82
|
|
|
82
83
|
elif item.seed.params.identifier == "merge":
|
|
83
|
-
|
|
84
|
-
content_length =
|
|
84
|
+
oss_util.merge(key, seed_dict["upload_id"])
|
|
85
|
+
content_length = oss_util.head(key).content_length
|
|
85
86
|
yield Response(item.seed, "merge", bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
86
87
|
except OssDBPutPartError:
|
|
87
88
|
yield Seed(seed_dict)
|
|
@@ -89,18 +90,16 @@ class CrawlerAir(Crawler):
|
|
|
89
90
|
yield Seed(seed_dict, identifier="merge")
|
|
90
91
|
|
|
91
92
|
|
|
92
|
-
class
|
|
93
|
-
|
|
94
|
-
oss_util = OssUtil()
|
|
93
|
+
class FileCrawlerPro(Crawler):
|
|
95
94
|
|
|
96
95
|
@staticmethod
|
|
97
96
|
def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
|
|
98
97
|
seed_dict = item.seed.to_dict
|
|
99
|
-
bucket_name =
|
|
98
|
+
bucket_name = oss_util.bucket
|
|
100
99
|
try:
|
|
101
100
|
key = item.seed.oss_path
|
|
102
|
-
if
|
|
103
|
-
content_length =
|
|
101
|
+
if oss_util.exists(key):
|
|
102
|
+
content_length = oss_util.head(key).content_length
|
|
104
103
|
yield Response(item.seed, "exists", bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
105
104
|
|
|
106
105
|
end = seed_dict.get("end", "")
|
|
@@ -111,8 +110,8 @@ class CrawlerPro(Crawler):
|
|
|
111
110
|
|
|
112
111
|
if not item.seed.params.identifier:
|
|
113
112
|
content = b""
|
|
114
|
-
chunk_size =
|
|
115
|
-
min_upload_size =
|
|
113
|
+
chunk_size = oss_util.chunk_size
|
|
114
|
+
min_upload_size = oss_util.min_upload_size
|
|
116
115
|
position = seed_dict.get("position", 1)
|
|
117
116
|
|
|
118
117
|
response = item.download()
|
|
@@ -134,21 +133,21 @@ class CrawlerPro(Crawler):
|
|
|
134
133
|
"""小文件直接下载"""
|
|
135
134
|
for part_data in response.iter_content(chunk_size):
|
|
136
135
|
content += part_data
|
|
137
|
-
|
|
136
|
+
oss_util.put(key, content)
|
|
138
137
|
yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
139
138
|
response.close()
|
|
140
139
|
else:
|
|
141
140
|
"""中大文件同步分片下载"""
|
|
142
141
|
upload_content_length = 0
|
|
143
142
|
if not seed_dict.get("upload_id"):
|
|
144
|
-
seed_dict["upload_id"] =
|
|
143
|
+
seed_dict["upload_id"] = oss_util.init_part(key).upload_id
|
|
145
144
|
upload_id = seed_dict["upload_id"]
|
|
146
145
|
for part_data in response.iter_content(chunk_size):
|
|
147
146
|
content += part_data
|
|
148
147
|
if len(content) >= chunk_size:
|
|
149
148
|
upload_data = content[:chunk_size]
|
|
150
149
|
content = content[chunk_size:]
|
|
151
|
-
|
|
150
|
+
oss_util.put_part(key, upload_id, position, upload_data)
|
|
152
151
|
upload_content_length += len(upload_data)
|
|
153
152
|
position += 1
|
|
154
153
|
seed_dict['position'] = position
|
|
@@ -156,14 +155,14 @@ class CrawlerPro(Crawler):
|
|
|
156
155
|
|
|
157
156
|
response.close()
|
|
158
157
|
if content:
|
|
159
|
-
|
|
158
|
+
oss_util.put_part(key, upload_id, position, content)
|
|
160
159
|
content_length += len(content)
|
|
161
|
-
|
|
160
|
+
oss_util.merge(key, upload_id)
|
|
162
161
|
yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
163
162
|
|
|
164
163
|
elif item.seed.params.identifier == "merge":
|
|
165
|
-
|
|
166
|
-
content_length =
|
|
164
|
+
oss_util.merge(key, seed_dict["upload_id"])
|
|
165
|
+
content_length = oss_util.head(key).content_length
|
|
167
166
|
yield Response(item.seed, "merge", bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
168
167
|
|
|
169
168
|
except OssDBPutPartError:
|
|
@@ -35,25 +35,24 @@ class LauncherPro(Launcher):
|
|
|
35
35
|
"""
|
|
36
36
|
检查过期种子,重新添加到redis缓存中
|
|
37
37
|
"""
|
|
38
|
-
|
|
38
|
+
init = True
|
|
39
39
|
while not self._pause.is_set():
|
|
40
|
-
reset_wait_seconds =
|
|
40
|
+
reset_wait_seconds = 30
|
|
41
|
+
start_reset_time = int(time.time())
|
|
41
42
|
if self._db.lock(self._reset_lock, t=120):
|
|
42
43
|
if not self.heartbeat:
|
|
43
44
|
self._heartbeat_start_event.set()
|
|
44
45
|
|
|
45
46
|
_min = -int(time.time()) + self._seed_reset_seconds \
|
|
46
|
-
if self.heartbeat or not
|
|
47
|
+
if self.heartbeat or not init else "-inf"
|
|
47
48
|
|
|
48
|
-
self._db.members(
|
|
49
|
-
self._todo, 0,
|
|
50
|
-
_min=_min, _max="(0"
|
|
51
|
-
)
|
|
49
|
+
self._db.members(self._todo, 0, _min=_min, _max="(0")
|
|
52
50
|
self._db.delete(self._reset_lock)
|
|
53
|
-
reset_wait_seconds = 60
|
|
54
51
|
|
|
52
|
+
ttl = 120 - int(time.time()) + start_reset_time
|
|
53
|
+
reset_wait_seconds = max(ttl, 1)
|
|
55
54
|
time.sleep(reset_wait_seconds)
|
|
56
|
-
|
|
55
|
+
init = False
|
|
57
56
|
|
|
58
57
|
def _scheduler(self):
|
|
59
58
|
"""
|
|
@@ -27,7 +27,7 @@ OSS_CHUNK_SIZE = 10 * 1024 ** 2
|
|
|
27
27
|
OSS_MIN_UPLOAD_SIZE = 1024
|
|
28
28
|
|
|
29
29
|
# 采集器选择
|
|
30
|
-
CRAWLER = "cobweb.crawlers.
|
|
30
|
+
CRAWLER = "cobweb.crawlers.Crawler"
|
|
31
31
|
|
|
32
32
|
# 数据上传链路
|
|
33
33
|
PIPELINE = "cobweb.pipelines.loghub_pipeline.LoghubPipeline"
|
|
@@ -39,7 +39,7 @@ TODO_QUEUE_FULL_WAIT_SECONDS = 5 # todo队列已满时等待时间
|
|
|
39
39
|
NEW_QUEUE_WAIT_SECONDS = 30 # new队列等待时间
|
|
40
40
|
DONE_QUEUE_WAIT_SECONDS = 15 # done队列等待时间
|
|
41
41
|
UPLOAD_QUEUE_WAIT_SECONDS = 15 # upload队列等待时间
|
|
42
|
-
SEED_RESET_SECONDS =
|
|
42
|
+
SEED_RESET_SECONDS = 300 # 种子重制时间
|
|
43
43
|
|
|
44
44
|
|
|
45
45
|
# Launcher 队列长度
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cobweb-launcher-1.1.2 → cobweb-launcher-1.1.4}/cobweb_launcher.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|