cobweb-launcher 1.0.6__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cobweb-launcher might be problematic. Click here for more details.
- cobweb/crawlers/file_crawler.py +7 -16
- cobweb/setting.py +1 -1
- cobweb/utils/tools.py +1 -1
- {cobweb_launcher-1.0.6.dist-info → cobweb_launcher-1.0.8.dist-info}/METADATA +1 -1
- {cobweb_launcher-1.0.6.dist-info → cobweb_launcher-1.0.8.dist-info}/RECORD +8 -8
- {cobweb_launcher-1.0.6.dist-info → cobweb_launcher-1.0.8.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.0.6.dist-info → cobweb_launcher-1.0.8.dist-info}/WHEEL +0 -0
- {cobweb_launcher-1.0.6.dist-info → cobweb_launcher-1.0.8.dist-info}/top_level.txt +0 -0
cobweb/crawlers/file_crawler.py
CHANGED
|
@@ -20,8 +20,6 @@ class CrawlerAir(Crawler):
|
|
|
20
20
|
if CrawlerAir.oss_util.exists(key):
|
|
21
21
|
content_length = CrawlerAir.oss_util.head(key).content_length
|
|
22
22
|
yield Response(item.seed, "exists", bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
23
|
-
# data, cols = download_meta(item.seed, bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
24
|
-
# yield DownloadItem(item.seed, sid=item.seed.sid, cols=cols, data=data)
|
|
25
23
|
|
|
26
24
|
end = seed_dict.get("end", "")
|
|
27
25
|
start = seed_dict.get("start", "0")
|
|
@@ -37,20 +35,20 @@ class CrawlerAir(Crawler):
|
|
|
37
35
|
|
|
38
36
|
response = item.download()
|
|
39
37
|
|
|
40
|
-
content_length = response.headers.get("content-length"
|
|
38
|
+
content_length = int(response.headers.get("content-length", 0))
|
|
41
39
|
content_type = response.headers.get("content-type", "").split(";")[0]
|
|
42
40
|
if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
|
|
43
41
|
yield Response(
|
|
44
42
|
item.seed, response, filter=True, msg=f"response content type is {content_type}",
|
|
45
43
|
bucket_name=bucket_name, data_size=content_length, **seed_dict
|
|
46
44
|
)
|
|
47
|
-
elif position == 1 and min_upload_size >=
|
|
45
|
+
elif position == 1 and min_upload_size >= content_length > 0:
|
|
48
46
|
"""过小文件标识返回"""
|
|
49
47
|
yield Response(
|
|
50
48
|
item.seed, response, filter=True, msg="file size is too small",
|
|
51
49
|
bucket_name=bucket_name, data_size=content_length, **seed_dict
|
|
52
50
|
)
|
|
53
|
-
elif position == 1 and chunk_size >
|
|
51
|
+
elif position == 1 and chunk_size > content_length > min_upload_size:
|
|
54
52
|
"""小文件直接下载"""
|
|
55
53
|
for part_data in response.iter_content(chunk_size):
|
|
56
54
|
content += part_data
|
|
@@ -80,15 +78,11 @@ class CrawlerAir(Crawler):
|
|
|
80
78
|
content_length += len(content)
|
|
81
79
|
CrawlerAir.oss_util.merge(key, upload_id)
|
|
82
80
|
yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
83
|
-
# data, cols = download_meta(item.seed, bucket_name, data_size=content_length, **seed_dict)
|
|
84
|
-
# yield DownloadItem(item.seed, sid=item.seed.sid, cols=cols, data=data)
|
|
85
81
|
|
|
86
82
|
elif item.seed.params.identifier == "merge":
|
|
87
83
|
CrawlerAir.oss_util.merge(key, seed_dict["upload_id"])
|
|
88
84
|
content_length = CrawlerAir.oss_util.head(key).content_length
|
|
89
85
|
yield Response(item.seed, "merge", bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
90
|
-
# data, cols = download_meta(item.seed, bucket_name, data_size=content_length, **seed_dict)
|
|
91
|
-
# yield DownloadItem(item.seed, sid=item.seed.sid, cols=cols, data=data)
|
|
92
86
|
except OssDBPutPartError:
|
|
93
87
|
yield Seed(seed_dict)
|
|
94
88
|
except OssDBMergeError:
|
|
@@ -123,20 +117,20 @@ class CrawlerPro(Crawler):
|
|
|
123
117
|
|
|
124
118
|
response = item.download()
|
|
125
119
|
|
|
126
|
-
content_length = response.headers.get("content-length"
|
|
120
|
+
content_length = int(response.headers.get("content-length", 0))
|
|
127
121
|
content_type = response.headers.get("content-type", "").split(";")[0]
|
|
128
122
|
if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
|
|
129
123
|
yield Response(
|
|
130
124
|
item.seed, response, filter=True, msg=f"response content type is {content_type}",
|
|
131
125
|
bucket_name=bucket_name, data_size=content_length, **seed_dict
|
|
132
126
|
)
|
|
133
|
-
elif position == 1 and min_upload_size >=
|
|
127
|
+
elif position == 1 and min_upload_size >= content_length > 0:
|
|
134
128
|
"""过小文件标识返回"""
|
|
135
129
|
yield Response(
|
|
136
130
|
item.seed, response, filter=True, msg="file size is too small",
|
|
137
131
|
bucket_name=bucket_name, data_size=content_length, **seed_dict
|
|
138
132
|
)
|
|
139
|
-
elif position == 1 and chunk_size >
|
|
133
|
+
elif position == 1 and chunk_size > content_length > min_upload_size:
|
|
140
134
|
"""小文件直接下载"""
|
|
141
135
|
for part_data in response.iter_content(chunk_size):
|
|
142
136
|
content += part_data
|
|
@@ -166,15 +160,12 @@ class CrawlerPro(Crawler):
|
|
|
166
160
|
content_length += len(content)
|
|
167
161
|
CrawlerAir.oss_util.merge(key, upload_id)
|
|
168
162
|
yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
169
|
-
# data, cols = download_meta(item.seed, bucket_name, data_size=content_length, **seed_dict)
|
|
170
|
-
# yield DownloadItem(item.seed, sid=item.seed.sid, cols=cols, data=data)
|
|
171
163
|
|
|
172
164
|
elif item.seed.params.identifier == "merge":
|
|
173
165
|
CrawlerAir.oss_util.merge(key, seed_dict["upload_id"])
|
|
174
166
|
content_length = CrawlerAir.oss_util.head(key).content_length
|
|
175
167
|
yield Response(item.seed, "merge", bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
176
|
-
|
|
177
|
-
# yield DownloadItem(item.seed, sid=item.seed.sid, cols=cols, data=data)
|
|
168
|
+
|
|
178
169
|
except OssDBPutPartError:
|
|
179
170
|
yield Seed(seed_dict)
|
|
180
171
|
except OssDBMergeError:
|
cobweb/setting.py
CHANGED
|
@@ -23,7 +23,7 @@ OSS_BUCKET = os.getenv("OSS_BUCKET")
|
|
|
23
23
|
OSS_ENDPOINT = os.getenv("OSS_ENDPOINT")
|
|
24
24
|
OSS_ACCESS_KEY = os.getenv("OSS_ACCESS_KEY")
|
|
25
25
|
OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
|
|
26
|
-
OSS_MIN_UPLOAD_SIZE = 1024
|
|
26
|
+
OSS_MIN_UPLOAD_SIZE = 1024
|
|
27
27
|
OSS_CHUNK_SIZE = 1024 ** 2
|
|
28
28
|
|
|
29
29
|
# 采集器选择
|
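cobweb/utils/tools.py
CHANGED
(diff body not captured in this extract)
{cobweb_launcher-1.0.6.dist-info → cobweb_launcher-1.0.8.dist-info}/RECORD
CHANGED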
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
cobweb/__init__.py,sha256=IkGcdTU6fEBNyzWowcJSSMdErntFM1kmu4WUp1BgImU,45
|
|
2
2
|
cobweb/constant.py,sha256=Aw2ES_nohVRLTWylZp6WMiUAlgyw4kLbae7LpwdZ5y4,1867
|
|
3
|
-
cobweb/setting.py,sha256=
|
|
3
|
+
cobweb/setting.py,sha256=fYK8KgrZYwSoJC9ywsopAtV0mdRw0_MGAoYORgET01M,1949
|
|
4
4
|
cobweb/base/__init__.py,sha256=diiK5MygQaWjlWNLbW6eUIg-93O6glMGC9WLNM5jyOc,209
|
|
5
5
|
cobweb/base/common_queue.py,sha256=W7PPZZFl52j3Mc916T0imHj7oAUelA6aKJwW-FecDPE,872
|
|
6
6
|
cobweb/base/decorators.py,sha256=wDCaQ94aAZGxks9Ljc0aXq6omDXT1_yzFy83ZW6VbVI,930
|
|
@@ -11,7 +11,7 @@ cobweb/base/response.py,sha256=7h9TwCNqRlwM_fvNmid9zOoRfHbKB8ABSU0eaVUJdVo,405
|
|
|
11
11
|
cobweb/base/seed.py,sha256=XswH16eEd6iwIBpt71E2S_AsV5UVCcOEOBFoP0r5QRo,2900
|
|
12
12
|
cobweb/crawlers/__init__.py,sha256=1sMhQ0-NJxiff3IqF2aMCXkSXcJFzzoCKIayQ5go4aI,71
|
|
13
13
|
cobweb/crawlers/base_crawler.py,sha256=snfjGYV9oZl3UnrxUcpAD9cK0n_rPeuyUYQkU1aCBrA,4891
|
|
14
|
-
cobweb/crawlers/file_crawler.py,sha256=
|
|
14
|
+
cobweb/crawlers/file_crawler.py,sha256=AeK22bakMbwDfqrITlX6ssvR2KSpHQ3s8YEuzX_Bluk,8670
|
|
15
15
|
cobweb/db/__init__.py,sha256=ut0iEyBLjcJL06WNG_5_d4hO5PJWvDrKWMkDOdmgh2M,30
|
|
16
16
|
cobweb/db/redis_db.py,sha256=XE1ebIi_4e0KBKiyPdKX7l2lSgV5vhMwEhNDlAcsccU,4247
|
|
17
17
|
cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
|
|
@@ -24,9 +24,9 @@ cobweb/pipelines/base_pipeline.py,sha256=fYnWf79GmhufXpcnMa3te18SbmnVeYLwxfyo-zL
|
|
|
24
24
|
cobweb/pipelines/loghub_pipeline.py,sha256=cjPO6w6UJ0jNw2fVvdX0BCdlm58T7dmYXlxzXOBpvfY,1027
|
|
25
25
|
cobweb/utils/__init__.py,sha256=JTE4sBfHnKHhD6w9Auk0MIT7O9BMOamCeryhlHNx3Zg,47
|
|
26
26
|
cobweb/utils/oss.py,sha256=uD5aN2oVYImit3amE6TjxWMaTAcbAh9dCnpIQhf4M9Q,3238
|
|
27
|
-
cobweb/utils/tools.py,sha256=
|
|
28
|
-
cobweb_launcher-1.0.
|
|
29
|
-
cobweb_launcher-1.0.
|
|
30
|
-
cobweb_launcher-1.0.
|
|
31
|
-
cobweb_launcher-1.0.
|
|
32
|
-
cobweb_launcher-1.0.
|
|
27
|
+
cobweb/utils/tools.py,sha256=bVd3iRGBvwhohQAH7AXTTjbmQ54Z35K0O-fatEyhePU,1249
|
|
28
|
+
cobweb_launcher-1.0.8.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
|
29
|
+
cobweb_launcher-1.0.8.dist-info/METADATA,sha256=XxTFgYdij-0ct7QV4cqz8Tg4b7JhoOWQ3j_K1_SplEc,1245
|
|
30
|
+
cobweb_launcher-1.0.8.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
|
|
31
|
+
cobweb_launcher-1.0.8.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
|
|
32
|
+
cobweb_launcher-1.0.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|