cobweb-launcher 1.0.7__tar.gz → 1.0.9__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cobweb-launcher might be problematic. Click here for more details.
- {cobweb-launcher-1.0.7/cobweb_launcher.egg-info → cobweb-launcher-1.0.9}/PKG-INFO +1 -1
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/base/request.py +10 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/crawlers/file_crawler.py +7 -10
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9/cobweb_launcher.egg-info}/PKG-INFO +1 -1
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/setup.py +1 -1
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/LICENSE +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/README.md +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/__init__.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/base/__init__.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/base/common_queue.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/base/decorators.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/base/item.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/base/log.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/base/response.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/base/seed.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/constant.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/crawlers/__init__.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/crawlers/base_crawler.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/db/redis_db.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/launchers/__init__.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/launchers/launcher.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/launchers/launcher_pro.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/pipelines/__init__.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/pipelines/base_pipeline.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/pipelines/loghub_pipeline.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/setting.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/utils/__init__.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb/utils/tools.py +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb_launcher.egg-info/SOURCES.txt +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/setup.cfg +0 -0
|
@@ -70,3 +70,13 @@ class Request:
|
|
|
70
70
|
response.raise_for_status()
|
|
71
71
|
return response
|
|
72
72
|
|
|
73
|
+
@property
|
|
74
|
+
def to_dict(self):
|
|
75
|
+
_dict = self.__dict__.copy()
|
|
76
|
+
_dict.pop('url')
|
|
77
|
+
_dict.pop('seed')
|
|
78
|
+
_dict.pop('check_status_code')
|
|
79
|
+
_dict.pop('request_setting')
|
|
80
|
+
return _dict
|
|
81
|
+
|
|
82
|
+
|
|
@@ -35,20 +35,20 @@ class CrawlerAir(Crawler):
|
|
|
35
35
|
|
|
36
36
|
response = item.download()
|
|
37
37
|
|
|
38
|
-
content_length = response.headers.get("content-length"
|
|
38
|
+
content_length = int(response.headers.get("content-length", 0))
|
|
39
39
|
content_type = response.headers.get("content-type", "").split(";")[0]
|
|
40
40
|
if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
|
|
41
41
|
yield Response(
|
|
42
42
|
item.seed, response, filter=True, msg=f"response content type is {content_type}",
|
|
43
43
|
bucket_name=bucket_name, data_size=content_length, **seed_dict
|
|
44
44
|
)
|
|
45
|
-
elif position == 1 and min_upload_size >=
|
|
45
|
+
elif position == 1 and min_upload_size >= content_length > 0:
|
|
46
46
|
"""过小文件标识返回"""
|
|
47
47
|
yield Response(
|
|
48
48
|
item.seed, response, filter=True, msg="file size is too small",
|
|
49
49
|
bucket_name=bucket_name, data_size=content_length, **seed_dict
|
|
50
50
|
)
|
|
51
|
-
elif position == 1 and chunk_size >
|
|
51
|
+
elif position == 1 and chunk_size > content_length > min_upload_size:
|
|
52
52
|
"""小文件直接下载"""
|
|
53
53
|
for part_data in response.iter_content(chunk_size):
|
|
54
54
|
content += part_data
|
|
@@ -117,20 +117,20 @@ class CrawlerPro(Crawler):
|
|
|
117
117
|
|
|
118
118
|
response = item.download()
|
|
119
119
|
|
|
120
|
-
content_length = response.headers.get("content-length"
|
|
120
|
+
content_length = int(response.headers.get("content-length", 0))
|
|
121
121
|
content_type = response.headers.get("content-type", "").split(";")[0]
|
|
122
122
|
if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
|
|
123
123
|
yield Response(
|
|
124
124
|
item.seed, response, filter=True, msg=f"response content type is {content_type}",
|
|
125
125
|
bucket_name=bucket_name, data_size=content_length, **seed_dict
|
|
126
126
|
)
|
|
127
|
-
elif position == 1 and min_upload_size >=
|
|
127
|
+
elif position == 1 and min_upload_size >= content_length > 0:
|
|
128
128
|
"""过小文件标识返回"""
|
|
129
129
|
yield Response(
|
|
130
130
|
item.seed, response, filter=True, msg="file size is too small",
|
|
131
131
|
bucket_name=bucket_name, data_size=content_length, **seed_dict
|
|
132
132
|
)
|
|
133
|
-
elif position == 1 and chunk_size >
|
|
133
|
+
elif position == 1 and chunk_size > content_length > min_upload_size:
|
|
134
134
|
"""小文件直接下载"""
|
|
135
135
|
for part_data in response.iter_content(chunk_size):
|
|
136
136
|
content += part_data
|
|
@@ -160,15 +160,12 @@ class CrawlerPro(Crawler):
|
|
|
160
160
|
content_length += len(content)
|
|
161
161
|
CrawlerAir.oss_util.merge(key, upload_id)
|
|
162
162
|
yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
163
|
-
# data, cols = download_meta(item.seed, bucket_name, data_size=content_length, **seed_dict)
|
|
164
|
-
# yield DownloadItem(item.seed, sid=item.seed.sid, cols=cols, data=data)
|
|
165
163
|
|
|
166
164
|
elif item.seed.params.identifier == "merge":
|
|
167
165
|
CrawlerAir.oss_util.merge(key, seed_dict["upload_id"])
|
|
168
166
|
content_length = CrawlerAir.oss_util.head(key).content_length
|
|
169
167
|
yield Response(item.seed, "merge", bucket_name=bucket_name, data_size=content_length, **seed_dict)
|
|
170
|
-
|
|
171
|
-
# yield DownloadItem(item.seed, sid=item.seed.sid, cols=cols, data=data)
|
|
168
|
+
|
|
172
169
|
except OssDBPutPartError:
|
|
173
170
|
yield Seed(seed_dict)
|
|
174
171
|
except OssDBMergeError:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cobweb-launcher-1.0.7 → cobweb-launcher-1.0.9}/cobweb_launcher.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|