cobweb-launcher 1.0.5__py3-none-any.whl → 3.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/__init__.py +5 -1
- cobweb/base/__init__.py +3 -3
- cobweb/base/common_queue.py +37 -16
- cobweb/base/item.py +40 -14
- cobweb/base/{log.py → logger.py} +3 -3
- cobweb/base/request.py +744 -47
- cobweb/base/response.py +381 -13
- cobweb/base/seed.py +98 -50
- cobweb/base/task_queue.py +180 -0
- cobweb/base/test.py +257 -0
- cobweb/constant.py +39 -2
- cobweb/crawlers/__init__.py +1 -2
- cobweb/crawlers/crawler.py +27 -0
- cobweb/db/__init__.py +1 -0
- cobweb/db/api_db.py +83 -0
- cobweb/db/redis_db.py +118 -27
- cobweb/launchers/__init__.py +3 -1
- cobweb/launchers/distributor.py +141 -0
- cobweb/launchers/launcher.py +103 -130
- cobweb/launchers/uploader.py +68 -0
- cobweb/log_dots/__init__.py +2 -0
- cobweb/log_dots/dot.py +258 -0
- cobweb/log_dots/loghub_dot.py +53 -0
- cobweb/pipelines/__init__.py +3 -2
- cobweb/pipelines/pipeline.py +19 -0
- cobweb/pipelines/pipeline_csv.py +25 -0
- cobweb/pipelines/pipeline_loghub.py +54 -0
- cobweb/schedulers/__init__.py +1 -0
- cobweb/schedulers/scheduler.py +66 -0
- cobweb/schedulers/scheduler_with_redis.py +189 -0
- cobweb/setting.py +37 -38
- cobweb/utils/__init__.py +5 -2
- cobweb/utils/bloom.py +58 -0
- cobweb/{base → utils}/decorators.py +14 -12
- cobweb/utils/dotting.py +300 -0
- cobweb/utils/oss.py +113 -86
- cobweb/utils/tools.py +3 -15
- cobweb_launcher-3.2.18.dist-info/METADATA +193 -0
- cobweb_launcher-3.2.18.dist-info/RECORD +44 -0
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/WHEEL +1 -1
- cobweb/crawlers/base_crawler.py +0 -121
- cobweb/crawlers/file_crawler.py +0 -181
- cobweb/launchers/launcher_pro.py +0 -174
- cobweb/pipelines/base_pipeline.py +0 -54
- cobweb/pipelines/loghub_pipeline.py +0 -34
- cobweb_launcher-1.0.5.dist-info/METADATA +0 -48
- cobweb_launcher-1.0.5.dist-info/RECORD +0 -32
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/top_level.txt +0 -0
cobweb/base/request.py
CHANGED
|
@@ -1,72 +1,769 @@
|
|
|
1
1
|
import random
|
|
2
|
+
import logging
|
|
3
|
+
import time
|
|
4
|
+
|
|
2
5
|
import requests
|
|
3
6
|
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
from typing import Any, Set, Dict, Optional
|
|
9
|
+
from requests.exceptions import RequestException
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class FileTypeDetector:
    """Guess a remote file's type from its URL, Content-Type and leading bytes.

    Three independent hints are combined by :meth:`get_detailed_info`: the
    MIME type from the ``Content-Type`` header, the file extension in the URL
    path, and the magic-number signature at the start of the payload.

    Each instance exposes its own copies of the lookup tables as
    ``file_signatures``, ``extension_map`` and ``mime_type_map``, plus a
    ``requests.Session`` in ``session`` used by :meth:`get_partial_content`.
    """

    # Magic-number prefixes -> type label.  Prefixes are tested in insertion
    # order with ``bytes.startswith``, so longer/more specific entries must
    # come before shorter ones that share the same leading bytes.
    _FILE_SIGNATURES = {
        # Images
        b'\x89PNG\r\n\x1a\n': 'PNG',
        b'\xff\xd8\xff': 'JPEG',
        b'GIF87a': 'GIF',
        b'GIF89a': 'GIF',
        # RIFF is a container shared by WEBP, AVI and WAV; the form type at
        # bytes 8-12 decides (see detect_by_signature).  'WAV' is only the
        # fallback label used when fewer than 12 bytes are available.
        b'RIFF': 'WAV',
        b'BM': 'BMP',
        b'II*\x00': 'TIFF',
        b'MM\x00*': 'TIFF',
        b'\x00\x00\x01\x00': 'ICO',
        b'\x00\x00\x02\x00': 'CUR',
        # JPEG 2000 signature box: 12 bytes, "jP" followed by TWO spaces.
        b'\x00\x00\x00\x0cjP  \r\n\x87\n': 'JP2',
        b'\xff\x4f\xff\x51': 'JP2',  # JPEG 2000 codestream
        b'FORM': 'IFF',  # IFF container (ILBM/PBM not distinguished)
        b'\x0a\x05\x01\x08': 'PCX',
        b'P1\n': 'PBM',  # Portable Bitmap
        b'P2\n': 'PGM',  # Portable Graymap
        b'P3\n': 'PPM',  # Portable Pixmap
        b'P4\n': 'PBM',  # Portable Bitmap (binary)
        b'P5\n': 'PGM',  # Portable Graymap (binary)
        b'P6\n': 'PPM',  # Portable Pixmap (binary)
        b'\x59\xa6\x6a\x95': 'RAS',  # Sun Raster
        b'\x01\xda\x01\x01\x00\x03': 'RGB',  # SGI Image
        b'\x53\x44\x50\x58': 'DPX',  # Digital Picture Exchange
        b'\x76\x2f\x31\x01': 'EXR',  # OpenEXR
        b'gimp xcf ': 'XCF',  # GIMP native format
        b'\x00\x00\x00\x0c': 'HEIC',  # coarse guess for a 12-byte leading box
        # The 'ftyp....' entries below are kept for reference only: in real
        # files 'ftyp' starts at byte offset 4, so a prefix match on them
        # cannot succeed.
        b'ftypheic': 'HEIC',
        b'ftypmif1': 'HEIF',
        b'ftypavif': 'AVIF',
        b'\x38\x42\x50\x53': 'PSD',  # Photoshop Document

        # Video
        b'\x00\x00\x00\x18ftypmp4': 'MP4',
        b'\x00\x00\x00\x20ftypM4V': 'M4V',
        b'FLV\x01': 'FLV',
        b'\x1aE\xdf\xa3': 'WEBM',
        b'\x00\x00\x01\xba': 'MPEG',
        b'\x00\x00\x01\xb3': 'MPEG',
        # Ogg container: audio vs. video is decided by the codec name found
        # in the first page; 'OGG' is the fallback for short inputs.
        b'OggS': 'OGG',

        # Audio
        b'ID3': 'MP3',
        b'\xff\xfb': 'MP3',
        b'\xff\xf3': 'MP3',
        b'\xff\xf2': 'MP3',
        b'fLaC': 'FLAC',
        b'ftypM4A': 'M4A',
        b'MAC ': 'APE',

        # Other
        b'%PDF': 'PDF',
        b'PK\x03\x04': 'ZIP',
        b'Rar!\x1a\x07\x00': 'RAR',
        b'\x37\x7a\xbc\xaf\x27\x1c': '7Z',
    }

    # Major brand (bytes 8-12 of an ISO base media file) -> type label.
    _FTYP_BRANDS = {
        b'mp41': 'MP4', b'mp42': 'MP4', b'isom': 'MP4', b'avc1': 'MP4',
        b'M4A ': 'M4A',
        b'M4V ': 'M4V',
        b'qt  ': 'MOV',
        b'heic': 'HEIC',
        b'mif1': 'HEIF',
        b'avif': 'AVIF',
    }

    # Lower-case file extension (with leading dot) -> type label.
    _EXTENSION_MAP = {
        # Images - common formats
        '.jpg': 'JPEG', '.jpeg': 'JPEG', '.png': 'PNG', '.gif': 'GIF',
        '.webp': 'WEBP', '.bmp': 'BMP', '.tiff': 'TIFF', '.tif': 'TIFF',
        '.ico': 'ICO', '.svg': 'SVG', '.heic': 'HEIC', '.avif': 'AVIF',

        # Images - professional formats
        '.psd': 'PSD', '.psb': 'PSB',  # Photoshop
        '.ai': 'AI',  # Adobe Illustrator
        '.eps': 'EPS', '.ps': 'PS',  # PostScript
        '.pdf': 'PDF',  # may also be used as an image format

        # Images - camera RAW formats
        '.raw': 'RAW', '.cr2': 'CR2', '.cr3': 'CR3',  # Canon
        '.nef': 'NEF', '.nrw': 'NRW',  # Nikon
        '.arw': 'ARW', '.srf': 'SRF', '.sr2': 'SR2',  # Sony
        '.orf': 'ORF',  # Olympus
        '.rw2': 'RW2',  # Panasonic
        '.dng': 'DNG',  # Adobe Digital Negative
        '.raf': 'RAF',  # Fujifilm
        '.3fr': '3FR',  # Hasselblad
        '.fff': 'FFF',  # Imacon
        '.dcr': 'DCR', '.mrw': 'MRW',  # Minolta
        '.mos': 'MOS',  # Leaf
        '.ptx': 'PTX', '.pef': 'PEF',  # Pentax
        '.x3f': 'X3F',  # Sigma

        # Images - other formats
        '.jp2': 'JP2', '.jpx': 'JPX', '.j2k': 'J2K',  # JPEG 2000
        '.jxr': 'JXR', '.hdp': 'HDP', '.wdp': 'WDP',  # JPEG XR
        '.jxl': 'JXL',  # JPEG XL
        '.heif': 'HEIF',  # High Efficiency Image Format
        '.dds': 'DDS',  # DirectDraw Surface
        '.tga': 'TGA', '.targa': 'TGA',  # Truevision TGA
        '.pcx': 'PCX',  # PC Paintbrush
        '.pbm': 'PBM', '.pgm': 'PGM', '.ppm': 'PPM', '.pnm': 'PNM',  # Netpbm
        '.xbm': 'XBM', '.xpm': 'XPM',  # X11 Bitmap/Pixmap
        '.sgi': 'SGI', '.rgb': 'RGB',  # Silicon Graphics Image
        '.ras': 'RAS', '.sun': 'SUN',  # Sun Raster
        '.iff': 'IFF', '.lbm': 'LBM', '.ilbm': 'ILBM',  # Interchange File Format
        '.mng': 'MNG', '.jng': 'JNG',  # Multiple-image Network Graphics
        '.wbmp': 'WBMP',  # Wireless Bitmap
        '.cur': 'CUR',  # Windows Cursor
        '.ani': 'ANI',  # Windows Animated Cursor
        '.icns': 'ICNS',  # Apple Icon Image
        '.dpx': 'DPX',  # Digital Picture Exchange
        '.exr': 'EXR',  # OpenEXR
        '.hdr': 'HDR', '.rgbe': 'RGBE',  # Radiance HDR
        '.pfm': 'PFM',  # Portable Float Map
        '.xcf': 'XCF',  # GIMP native format
        '.kra': 'KRA',  # Krita Document
        '.ora': 'ORA',  # OpenRaster
        '.clip': 'CLIP',  # Clip Studio Paint
        '.sai': 'SAI', '.sai2': 'SAI2',  # PaintTool SAI
        '.mdp': 'MDP',  # FireAlpaca/MediBang Paint
        '.procreate': 'PROCREATE',  # Procreate

        # Video
        '.mp4': 'MP4', '.avi': 'AVI', '.mov': 'MOV', '.wmv': 'WMV',
        '.flv': 'FLV', '.webm': 'WEBM', '.mkv': 'MKV', '.m4v': 'M4V',
        '.mpg': 'MPEG', '.mpeg': 'MPEG', '.3gp': '3GP', '.ogv': 'OGV',
        '.ts': 'TS', '.mts': 'MTS', '.vob': 'VOB',

        # Audio
        '.mp3': 'MP3', '.wav': 'WAV', '.flac': 'FLAC', '.aac': 'AAC',
        '.ogg': 'OGG', '.wma': 'WMA', '.m4a': 'M4A', '.ape': 'APE',
        '.opus': 'OPUS', '.aiff': 'AIFF', '.au': 'AU',
    }

    # Lower-case MIME type (without parameters) -> type label.
    _MIME_TYPE_MAP = {
        # Images - basic formats
        'image/jpeg': 'JPEG', 'image/png': 'PNG', 'image/gif': 'GIF',
        'image/webp': 'WEBP', 'image/bmp': 'BMP', 'image/tiff': 'TIFF',
        'image/svg+xml': 'SVG', 'image/x-icon': 'ICO',

        # Images - modern formats
        'image/heic': 'HEIC', 'image/heif': 'HEIF', 'image/avif': 'AVIF',
        'image/jxl': 'JXL', 'image/jp2': 'JP2', 'image/jpx': 'JPX',
        'image/jxr': 'JXR', 'image/vnd.ms-photo': 'JXR',

        # Images - other formats
        'image/x-targa': 'TGA', 'image/x-tga': 'TGA',
        'image/x-pcx': 'PCX', 'image/x-portable-bitmap': 'PBM',
        'image/x-portable-graymap': 'PGM', 'image/x-portable-pixmap': 'PPM',
        'image/x-portable-anymap': 'PNM', 'image/x-xbitmap': 'XBM',
        'image/x-xpixmap': 'XPM', 'image/x-sgi': 'SGI',
        'image/x-sun-raster': 'RAS', 'image/x-iff': 'IFF',
        'image/vnd.wap.wbmp': 'WBMP', 'image/x-ms-bmp': 'BMP',
        'image/vnd.adobe.photoshop': 'PSD', 'image/x-photoshop': 'PSD',
        'image/x-exr': 'EXR', 'image/vnd.radiance': 'HDR',
        'image/x-xcf': 'XCF', 'image/openraster': 'ORA',

        # Video
        'video/mp4': 'MP4', 'video/avi': 'AVI', 'video/quicktime': 'MOV',
        'video/x-msvideo': 'AVI', 'video/webm': 'WEBM', 'video/x-flv': 'FLV',
        'video/3gpp': '3GP', 'video/ogg': 'OGV',

        # Audio
        'audio/mpeg': 'MP3', 'audio/wav': 'WAV', 'audio/flac': 'FLAC',
        'audio/aac': 'AAC', 'audio/ogg': 'OGG', 'audio/x-ms-wma': 'WMA',
        'audio/mp4': 'M4A', 'audio/opus': 'OPUS',
    }

    # Category membership.  The image set covers every label that the MIME
    # table above maps from an ``image/*`` type, so that e.g. HEIF or JP2 are
    # not reported as 'other' while HEIC is reported as 'image'.
    _IMAGE_TYPES = frozenset({
        'PNG', 'JPEG', 'GIF', 'WEBP', 'BMP', 'TIFF', 'ICO', 'SVG', 'HEIC', 'AVIF',
        'HEIF', 'JXL', 'JP2', 'JPX', 'JXR', 'TGA', 'PCX', 'PBM', 'PGM', 'PPM',
        'PNM', 'XBM', 'XPM', 'SGI', 'RAS', 'IFF', 'WBMP', 'PSD', 'EXR', 'HDR',
        'XCF', 'ORA',
    })
    _VIDEO_TYPES = frozenset({
        'MP4', 'AVI', 'MOV', 'WMV', 'FLV', 'WEBM', 'MKV', 'M4V', 'MPEG', '3GP',
        'OGV', 'TS', 'MTS', 'VOB',
    })
    _AUDIO_TYPES = frozenset({
        'MP3', 'WAV', 'FLAC', 'AAC', 'OGG', 'WMA', 'M4A', 'APE', 'OPUS', 'AIFF', 'AU',
    })

    def __init__(self):
        """Create per-instance lookup tables and an HTTP session."""
        # Copies keep instances independent if a caller customises a table.
        self.file_signatures = dict(self._FILE_SIGNATURES)
        self.extension_map = dict(self._EXTENSION_MAP)
        self.mime_type_map = dict(self._MIME_TYPE_MAP)

        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def get_file_extension(self, url: str) -> tuple:
        """Return ``(extension, site)`` for *url*.

        ``extension`` is the lower-cased suffix of the last path segment,
        including the leading dot, or ``''`` when that segment has no dot.
        ``site`` is the URL's network location exactly as written.
        """
        parsed = urlparse(url)
        # urlparse has already removed the query string and fragment; only
        # the final segment is inspected so that dots in directory names
        # (e.g. "/v1.2/photo") are not mistaken for an extension.
        filename = parsed.path.lower().rsplit('/', 1)[-1]
        if '.' in filename:
            return '.' + filename.rsplit('.', 1)[-1], parsed.netloc
        return '', parsed.netloc

    def detect_by_extension(self, url: str) -> Optional[str]:
        """Return the type label for the URL's file extension, or ``None``."""
        ext, _site = self.get_file_extension(url)
        return self.extension_map.get(ext)

    def detect_by_mime_type(self, content_type: str) -> Optional[str]:
        """Return the type label for a ``Content-Type`` header value, or ``None``."""
        if not content_type:
            return None

        # Drop parameters such as "; charset=utf-8" before the lookup.
        mime_type = content_type.split(';')[0].strip().lower()
        return self.mime_type_map.get(mime_type)

    def get_partial_content(self, url: str, max_bytes: int = 64) -> Optional[bytes]:
        """Fetch at most *max_bytes* leading bytes of *url*.

        A ``Range`` header is sent, but the body is also streamed and cut off
        locally so that a server ignoring ``Range`` does not cause the whole
        file to be downloaded.

        Returns:
            The leading bytes, or ``None`` when the request fails or the
            status code is neither 200 nor 206.
        """
        try:
            headers = {'Range': f'bytes=0-{max_bytes - 1}'}
            response = self.session.get(url, headers=headers, timeout=10, stream=True)
            try:
                if response.status_code not in (200, 206):
                    return None
                collected = bytearray()
                for piece in response.iter_content(chunk_size=max_bytes):
                    collected += piece
                    if len(collected) >= max_bytes:
                        break
                return bytes(collected[:max_bytes])
            finally:
                response.close()
        except Exception as e:
            # Best effort: detection simply proceeds without signature data.
            logging.warning(f"获取内容失败: {e}")
        return None

    def detect_by_signature(self, data: bytes) -> Optional[str]:
        """Return the type label matching the magic number in *data*, or ``None``.

        Args:
            data: Leading bytes of the file; 64 bytes are enough for every
                check performed here.
        """
        if not data:
            return None

        for signature, file_type in self.file_signatures.items():
            if not data.startswith(signature):
                continue
            if signature == b'RIFF' and len(data) >= 12:
                # The RIFF form type distinguishes WEBP, AVI and WAV.
                form_type = data[8:12]
                if form_type == b'WEBP':
                    return 'WEBP'
                if form_type == b'AVI ':
                    return 'AVI'
                if form_type == b'WAVE':
                    return 'WAV'
                # Unknown RIFF form type: keep scanning the remaining entries.
            elif signature == b'OggS' and len(data) >= 32:
                # The codec name in the first page separates audio from video.
                head = data[:64].lower()
                if b'vorbis' in head:
                    return 'OGG'
                if b'theora' in head:
                    return 'OGV'
                return 'OGG'
            else:
                return file_type

        # ISO base media files: 4-byte box size, 'ftyp', then the major brand.
        if len(data) >= 12 and data[4:8] == b'ftyp':
            return self._FTYP_BRANDS.get(data[8:12])

        return None

    def get_detailed_info(self, url, content_type, data) -> Dict:
        """Combine MIME type, extension and signature into one result dict.

        Args:
            url: URL the file was fetched from.
            content_type: Value of the response's ``Content-Type`` header
                (may be ``None``).
            data: Leading bytes of the file (may be empty or ``None``).

        Returns:
            A dict with the keys ``url``, ``site``, ``detected_type``,
            ``confidence`` (``'unknown'``, ``'medium'``, ``'high'`` or
            ``'very_high'``), ``methods_used``, ``content_type``,
            ``extension`` and ``cate``.
        """
        result = {
            'url': url,
            'site': None,
            'detected_type': None,
            'confidence': 'unknown',
            'methods_used': [],
            'content_type': content_type,
            'extension': None
        }

        # 1. MIME type.  No request is made here; the only expected failure
        #    is a header value that is not a str.
        try:
            mime_detected = self.detect_by_mime_type(content_type)
        except (AttributeError, TypeError) as e:
            logging.warning(f"MIME类型检测失败: {e}")
            mime_detected = None
        if mime_detected:
            result['detected_type'] = mime_detected
            result['confidence'] = 'high'
            result['methods_used'].append('mime_type')

        # 2. File extension.
        ext_detected = self.detect_by_extension(url)
        result['extension'], result['site'] = self.get_file_extension(url)

        if ext_detected:
            if not result['detected_type']:
                result['detected_type'] = ext_detected
                result['confidence'] = 'medium'
            elif result['detected_type'] == ext_detected:
                result['confidence'] = 'very_high'  # MIME type and extension agree
            result['methods_used'].append('extension')

        # 3. Signature, only when the first two hints are not conclusive.
        if data and result['confidence'] in ['unknown', 'medium']:
            signature_detected = self.detect_by_signature(data)
            if signature_detected:
                if result['detected_type'] == signature_detected:
                    result['confidence'] = 'very_high'
                else:
                    # Nothing detected yet, or a conflict: trust the signature.
                    result['detected_type'] = signature_detected
                    result['confidence'] = 'high'
                result['methods_used'].append('file_signature')

        result['cate'] = self.get_file_category(result['detected_type'])
        return result

    def get_file_category(self, file_type: str) -> str:
        """Map a type label to ``'image'``, ``'video'``, ``'audio'`` or ``'other'``.

        Returns ``'Unknown'`` when *file_type* is empty or ``'Unknown'``.
        """
        if not file_type or file_type == 'Unknown':
            return 'Unknown'

        if file_type in self._IMAGE_TYPES:
            return 'image'
        if file_type in self._VIDEO_TYPES:
            return 'video'
        if file_type in self._AUDIO_TYPES:
            return 'audio'
        return 'other'
|
|
344
|
+
|
|
4
345
|
|
|
5
346
|
class Request:
    """HTTP request wrapper offering one interface for fetching and downloading.

    Features:
        - automatic random User-Agent generation
        - flexible request parameter configuration
        - file type detection
        - error handling and status code checking
    """

    # Keyword arguments that are forwarded to the ``requests`` library; any
    # other keyword passed to the constructor becomes an instance attribute.
    _REQUEST_ATTRS: Set[str] = frozenset({
        "params", "headers", "cookies", "data", "json", "files",
        "auth", "timeout", "proxies", "hooks", "stream", "verify",
        "cert", "allow_redirects"
    })

    # Timeout (seconds) applied when the caller supplies none.
    _DEFAULT_TIMEOUT = 30

    # User-Agent template and the inclusive ranges its version fields are drawn from.
    _UA_TEMPLATE = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) "
        "AppleWebKit/{v4}.{v3} (KHTML, like Gecko) "
        "Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}"
    )
    _UA_VERSION_RANGES = {
        'v1': (4, 15), 'v2': (3, 11), 'v3': (1, 16),
        'v4': (533, 605), 'v5': (1000, 6000), 'v6': (10, 80)
    }

    # Number of leading bytes fetched for file type detection.
    _HEAD_BYTES = 64
    # Lower bound for the automatically chosen stream chunk size.
    _MIN_STREAM_CHUNK = 5 * 1024 * 1024

    def __init__(
            self,
            url: str,
            seed: Any = None,
            method: Optional[str] = None,
            random_ua: bool = True,
            check_status_code: bool = True,
            **kwargs
    ):
        """Initialise the request.

        Args:
            url: URL to request.
            seed: Seed object or identifier associated with the request.
            method: HTTP method; inferred from the presence of a body when
                omitted.
            random_ua: Whether to add a random User-Agent header when the
                caller did not provide one.
            check_status_code: Whether to raise on HTTP error status codes.
            **kwargs: ``requests`` options (see ``_REQUEST_ATTRS``); any
                other keyword is stored as an instance attribute.

        Raises:
            ValueError: If *url* cannot be parsed or lacks a scheme or host.
        """
        self.scheme = None
        self.netloc = None
        self.response = None
        self.detector_info = None
        self.content_length = None
        self._validate_url(url)

        self.url = url
        self.seed = seed
        self.check_status_code = check_status_code
        self.request_settings: Dict[str, Any] = {}

        # Split request options from plain instance attributes.
        self._process_kwargs(kwargs)

        self.method = self._determine_method(method)

        if 'timeout' not in self.request_settings:
            self.request_settings['timeout'] = self._DEFAULT_TIMEOUT

        if random_ua:
            self._setup_headers()

    def _validate_url(self, url: str) -> None:
        """Parse *url*, store its scheme and netloc, and reject incomplete URLs."""
        try:
            parsed = urlparse(url)
        except (ValueError, TypeError, AttributeError) as e:
            raise ValueError(f"URL 解析失败: {e}") from e

        self.scheme = parsed.scheme
        self.netloc = parsed.netloc
        if not (self.scheme and self.netloc):
            raise ValueError(f"无效的 URL 格式: {url}")

    def _process_kwargs(self, kwargs: Dict[str, Any]) -> None:
        """Route each keyword to ``request_settings`` or to an instance attribute."""
        for key, value in kwargs.items():
            if key in self._REQUEST_ATTRS:
                self.request_settings[key] = value
            else:
                setattr(self, key, value)

    def _determine_method(self, method: Optional[str]) -> str:
        """Return the upper-cased *method*, or POST/GET depending on a request body."""
        if method:
            return method.upper()

        has_body = any(
            self.request_settings.get(name) for name in ("data", "json", "files")
        )
        return "POST" if has_body else "GET"

    def _generate_random_ua(self) -> str:
        """Build a User-Agent string with randomised version numbers."""
        versions = {
            key: random.randint(low, high)
            for key, (low, high) in self._UA_VERSION_RANGES.items()
        }
        return self._UA_TEMPLATE.format(**versions)

    def _setup_headers(self) -> None:
        """Ensure the request carries a User-Agent, generating one if needed.

        The headers are copied first: a dict the caller shares between
        several requests must not be mutated, otherwise the first generated
        User-Agent would be reused by every later request.
        """
        headers = dict(self.request_settings.get("headers") or {})

        has_user_agent = any(
            str(name).lower() == 'user-agent' and value
            for name, value in headers.items()
        )
        if not has_user_agent:
            headers["User-Agent"] = self._generate_random_ua()
        self.request_settings["headers"] = headers

    def _settings_with_range(self, byte_range: str) -> Dict[str, Any]:
        """Return request options carrying a ``Range`` header.

        Both the options and the headers dict are copied so the ``Range``
        header never leaks into ``self.request_settings``.  Any ``stream``
        option is dropped; callers that need streaming set it themselves.
        """
        settings = self.request_settings.copy()
        settings.pop('stream', None)
        headers = dict(settings.get("headers") or {})
        headers['Range'] = byte_range
        settings["headers"] = headers
        return settings

    def execute(self) -> "requests.Response":
        """Send the request.

        Returns:
            The response, also stored in ``self.response``.

        Raises:
            RequestException: If the request fails; this includes
                ``requests.HTTPError`` for error status codes when
                ``check_status_code`` is true.
        """
        try:
            self.response = requests.request(
                method=self.method,
                url=self.url,
                **self.request_settings
            )

            if self.check_status_code:
                self.response.raise_for_status()

            return self.response

        except RequestException as e:
            logging.error(f"请求执行失败 - URL: {self.url}, 错误: {e}")
            raise

    def download(self) -> "requests.Response":
        """Alias of :meth:`execute`, kept for backward compatibility."""
        return self.execute()

    def normal_download(self, file_type_detect: bool = True) -> bytes:
        """Download the whole body in one request and return it.

        Args:
            file_type_detect: Run file type detection on the first bytes
                unless ``detector_info`` is already set.
        """
        settings = self.request_settings.copy()
        settings.pop('stream', None)

        response = requests.request(
            method=self.method,
            url=self.url,
            **settings
        )
        try:
            if self.check_status_code:
                response.raise_for_status()

            content_type = response.headers.get('content-type')
            result = response.content
        finally:
            response.close()

        if file_type_detect and not self.detector_info:
            self.detector_info = FileTypeDetector().get_detailed_info(
                url=self.url, content_type=content_type, data=result[:self._HEAD_BYTES]
            )

        return result

    def _log_download_progress(self, start_time, downloaded):
        """Log a progress bar at DEBUG level; does nothing without a known total."""
        elapsed_time = time.time() - start_time
        # Without a positive total size or elapsed time the ratios below are undefined.
        if not self.content_length or elapsed_time <= 0:
            return

        elapsed_time_str = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
        progress = downloaded / self.content_length
        downloaded_mb = downloaded / (1024 * 1024)
        total_mb = self.content_length / (1024 * 1024)
        speed = downloaded / elapsed_time / 1024
        filled_length = int(50 * progress)
        bar = '█' * filled_length + '-' * (50 - filled_length)
        logging.debug(
            f"\n\r\rDownloading {self.url}: |{bar}| {progress * 100:.1f}% "
            f"{downloaded_mb:.1f}/{total_mb:.1f} MB [Time:{elapsed_time_str}, Speed {speed:.2f} KB/s]"
        )

    def range_download(self, start: int = 0, chunk_size: int = 1024, file_type_detect: bool = True):
        """Download the body with successive ``Range`` requests, yielding each chunk.

        ``self.content_length`` must be known beforehand (it is set by
        :meth:`detect_accept_ranges`).

        Args:
            start: Byte offset to begin at.
            chunk_size: Number of bytes requested per chunk.
            file_type_detect: Probe the first bytes for file type detection
                unless ``detector_info`` is already set.

        Yields:
            bytes: The chunks, in order.

        Raises:
            ValueError: If the total length is unknown, or a chunk still
                fails after the maximum number of retries.
        """
        if self.content_length is None:
            raise ValueError("content_length 未知,请先调用 detect_accept_ranges()")

        downloaded = start
        retry_count = 0
        max_retries = 3
        start_time = time.time()

        if file_type_detect and not self.detector_info:
            probe = requests.request(
                method=self.method,
                url=self.url,
                **self._settings_with_range(f"bytes=0-{self._HEAD_BYTES - 1}")
            )
            try:
                content_type = probe.headers.get("Content-Type")
                head_data = probe.content[:self._HEAD_BYTES]
            finally:
                probe.close()

            self.detector_info = FileTypeDetector().get_detailed_info(
                url=self.url, content_type=content_type, data=head_data
            )

        while downloaded < self.content_length:
            range_end = min(downloaded + chunk_size - 1, self.content_length - 1)
            settings = self._settings_with_range(f"bytes={downloaded}-{range_end}")

            chunk_data = None
            out_of_range = False
            try:
                self.response = requests.request(
                    method=self.method,
                    url=self.url,
                    **settings
                )
                if self.response.status_code == 206:
                    chunk_data = self.response.content
                elif self.response.status_code == 416:  # Range Not Satisfiable
                    out_of_range = True
            except Exception as e:
                # Any failure of this attempt is logged and retried below.
                logging.exception(f"请求失败 - URL: {self.url}, 错误: {e}, 当前重试次数: {retry_count}")
            finally:
                # The response is None when the request itself raised.
                if self.response is not None:
                    self.response.close()
                    self.response = None

            if out_of_range:
                logging.info("Range请求超出范围")
                break

            if chunk_data:
                yield chunk_data
                downloaded += len(chunk_data)
                retry_count = 0  # a successful chunk resets the retry budget
                self._log_download_progress(
                    start_time=start_time,
                    downloaded=downloaded
                )
                continue

            # Failed attempt: exception, unexpected status, or empty body.
            if retry_count >= max_retries:
                raise ValueError(f"超过当前最大重试次数,请求失败!当前重试次数: {retry_count}")
            time.sleep(0.5 * retry_count)
            retry_count += 1

    def stream_download(self, file_type_detect: bool = True):
        """Stream the body from ``seed.params.start`` onwards, yielding chunks.

        When starting at offset 0 with detection enabled, the first 64 bytes
        are used for file type detection and yielded as a separate piece.
        After each full chunk, ``seed.params.start`` is set to the offset of
        the next byte not yet yielded, so that a download can be resumed.
        If ``seed.params.chunk_size`` is unset, a size is derived from the
        response's Content-Length and stored back on the seed.

        Yields:
            bytes: Pieces of the body, in order.
        """
        downloaded = start = self.seed.params.start or 0
        settings = self._settings_with_range(f"bytes={start}-")
        settings['stream'] = True

        self.response = requests.request(
            method=self.method,
            url=self.url,
            **settings
        )
        try:
            if self.check_status_code:
                self.response.raise_for_status()

            chunk_size = self.seed.params.chunk_size
            if not chunk_size:
                # Integer arithmetic: the value is used as a slice bound.
                announced = int(self.response.headers.get('content-length', 0))
                chunk_size = max(announced // 8000, self._MIN_STREAM_CHUNK)
                self.seed.params.chunk_size = chunk_size

            # bytearray: appending 1 KiB pieces to immutable bytes would copy
            # the whole buffer on every iteration.
            buffer = bytearray()
            start_time = time.time()

            for part_data in self.response.iter_content(1024):
                buffer += part_data
                downloaded += len(part_data)

                if (start == 0 and downloaded > self._HEAD_BYTES
                        and file_type_detect and not self.detector_info):
                    head_data = bytes(buffer[:self._HEAD_BYTES])
                    self._detect_from_stream(head_data)
                    del buffer[:self._HEAD_BYTES]
                    yield head_data

                while len(buffer) >= chunk_size:
                    upload_data = bytes(buffer[:chunk_size])
                    del buffer[:chunk_size]
                    yield upload_data
                    # Bytes still buffered have not been handed out yet.
                    self.seed.params.start = downloaded - len(buffer)

                self._log_download_progress(
                    start_time=start_time,
                    downloaded=downloaded
                )

            # Bodies of at most 64 bytes never reach the in-loop detection.
            if start == 0 and buffer and file_type_detect and not self.detector_info:
                self._detect_from_stream(bytes(buffer[:self._HEAD_BYTES]))

            if buffer:
                yield bytes(buffer)
        finally:
            self.response.close()

    def _detect_from_stream(self, head_data: bytes) -> None:
        """Record Content-Length and file type info from the current response."""
        content_type = self.response.headers.get("Content-Type")
        self.content_length = int(self.response.headers.get('content-length', 0))
        self.detector_info = FileTypeDetector().get_detailed_info(
            url=self.url, content_type=content_type, data=head_data
        )

    def detect_accept_ranges(self) -> bool:
        """Check whether the server honours ``Range`` requests.

        Sends a HEAD request to learn the content length (stored in
        ``self.content_length``), then requests the first 64 bytes.  File
        type detection runs on those bytes unless ``detector_info`` is
        already set.

        Returns:
            ``True`` if the probe was answered with 206 and at most the
            requested 64 bytes, ``False`` otherwise.

        Raises:
            ValueError: If the HEAD request returns a status other than 200
                or 206.
        """
        head_settings = self.request_settings.copy()
        head_settings.pop('stream', None)
        # requests.head() does not follow redirects unless told to; without
        # this a redirecting URL would be rejected here.
        head_settings.setdefault('allow_redirects', True)

        head_response = requests.head(self.url, **head_settings)
        try:
            if head_response.status_code not in (200, 206):
                logging.error(f"HEAD请求失败: {head_response.status_code}")
                raise ValueError("HTTP状态码错误")
            self.content_length = int(head_response.headers.get('content-length', 0))
        finally:
            head_response.close()

        test_response = requests.request(
            method=self.method,
            url=self.url,
            **self._settings_with_range(f"bytes=0-{self._HEAD_BYTES - 1}")
        )
        try:
            head_data = test_response.content
            content_type = test_response.headers.get("Content-Type")
            status = test_response.status_code

            # A file shorter than 64 bytes legitimately returns fewer bytes.
            if status == 206 and 0 < len(head_data) <= self._HEAD_BYTES:
                supports_range = True
            elif status == 200:
                supports_range = False
                self.response = test_response
                head_data = head_data[:self._HEAD_BYTES]
                logging.debug(f"Range请求方式不支持, 实际{len(head_data)}")
            else:
                supports_range = False
                logging.error(f"Range请求失败: {status}")

            if not self.detector_info:
                self.response = test_response
                self.detector_info = FileTypeDetector().get_detailed_info(
                    url=self.url, content_type=content_type, data=head_data
                )
        finally:
            test_response.close()

        return supports_range

    def detect_file_type(self) -> Optional[Dict[str, Any]]:
        """Detect the file type from the first 64 bytes of the resource.

        Returns:
            The file type information dict, or ``None`` if the request
            failed (the failure is logged).
        """
        try:
            response = requests.request(
                method=self.method,
                url=self.url,
                **self._settings_with_range(f"bytes=0-{self._HEAD_BYTES - 1}")
            )
            try:
                content_type = response.headers.get("Content-Type")
                head_data = response.content
            finally:
                response.close()
        except RequestException as e:
            logging.error(f"文件类型检测失败 - URL: {self.url}, 错误: {e}")
            return None

        return FileTypeDetector().get_detailed_info(
            url=self.url,
            content_type=content_type,
            data=head_data
        )

    @property
    def to_dict(self) -> Dict[str, Any]:
        """Public instance attributes plus a copy of ``request_settings``.

        ``url``, ``seed``, ``method``, ``response`` and ``check_status_code``
        are left out, as are attributes whose name starts with an underscore.
        """
        excluded_keys = {"request_settings", "url", "seed", "method", "response", "check_status_code"}
        result = {
            key: value for key, value in self.__dict__.items()
            if not key.startswith('_') and key not in excluded_keys
        }
        result['request_settings'] = self.request_settings.copy()
        return result

    def __repr__(self) -> str:
        return f"Request(method='{self.method}', url='{self.url}')"

    def __str__(self) -> str:
        return f"{self.method} {self.url}"
|