cobweb-launcher 1.0.5__py3-none-any.whl → 3.2.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. cobweb/__init__.py +5 -1
  2. cobweb/base/__init__.py +3 -3
  3. cobweb/base/common_queue.py +37 -16
  4. cobweb/base/item.py +40 -14
  5. cobweb/base/{log.py → logger.py} +3 -3
  6. cobweb/base/request.py +744 -47
  7. cobweb/base/response.py +381 -13
  8. cobweb/base/seed.py +98 -50
  9. cobweb/base/task_queue.py +180 -0
  10. cobweb/base/test.py +257 -0
  11. cobweb/constant.py +39 -2
  12. cobweb/crawlers/__init__.py +1 -2
  13. cobweb/crawlers/crawler.py +27 -0
  14. cobweb/db/__init__.py +1 -0
  15. cobweb/db/api_db.py +83 -0
  16. cobweb/db/redis_db.py +118 -27
  17. cobweb/launchers/__init__.py +3 -1
  18. cobweb/launchers/distributor.py +141 -0
  19. cobweb/launchers/launcher.py +103 -130
  20. cobweb/launchers/uploader.py +68 -0
  21. cobweb/log_dots/__init__.py +2 -0
  22. cobweb/log_dots/dot.py +258 -0
  23. cobweb/log_dots/loghub_dot.py +53 -0
  24. cobweb/pipelines/__init__.py +3 -2
  25. cobweb/pipelines/pipeline.py +19 -0
  26. cobweb/pipelines/pipeline_csv.py +25 -0
  27. cobweb/pipelines/pipeline_loghub.py +54 -0
  28. cobweb/schedulers/__init__.py +1 -0
  29. cobweb/schedulers/scheduler.py +66 -0
  30. cobweb/schedulers/scheduler_with_redis.py +189 -0
  31. cobweb/setting.py +37 -38
  32. cobweb/utils/__init__.py +5 -2
  33. cobweb/utils/bloom.py +58 -0
  34. cobweb/{base → utils}/decorators.py +14 -12
  35. cobweb/utils/dotting.py +300 -0
  36. cobweb/utils/oss.py +113 -86
  37. cobweb/utils/tools.py +3 -15
  38. cobweb_launcher-3.2.18.dist-info/METADATA +193 -0
  39. cobweb_launcher-3.2.18.dist-info/RECORD +44 -0
  40. {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/WHEEL +1 -1
  41. cobweb/crawlers/base_crawler.py +0 -121
  42. cobweb/crawlers/file_crawler.py +0 -181
  43. cobweb/launchers/launcher_pro.py +0 -174
  44. cobweb/pipelines/base_pipeline.py +0 -54
  45. cobweb/pipelines/loghub_pipeline.py +0 -34
  46. cobweb_launcher-1.0.5.dist-info/METADATA +0 -48
  47. cobweb_launcher-1.0.5.dist-info/RECORD +0 -32
  48. {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/LICENSE +0 -0
  49. {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/top_level.txt +0 -0
cobweb/base/request.py CHANGED
@@ -1,72 +1,769 @@
1
1
  import random
2
+ import logging
3
+ import time
4
+
2
5
  import requests
3
6
 
7
+ from urllib.parse import urlparse
8
+ from typing import Any, Set, Dict, Optional
9
+ from requests.exceptions import RequestException
10
+
11
+
12
class FileTypeDetector:
    """Guess a file's type from its MIME type, URL extension and magic bytes.

    The lookup tables are defined once at class level and copied onto each
    instance (``file_signatures``, ``extension_map``, ``mime_type_map``), so
    creating a detector is cheap and per-instance customisation stays isolated.
    The HTTP session used by :meth:`get_partial_content` is created lazily on
    first access of :attr:`session`, because most callers only run the
    offline detection in :meth:`get_detailed_info`.
    """

    # Plain "starts with" magic numbers. Container formats whose real type is
    # stored further into the header (RIFF, Ogg, IFF ``FORM`` and ISO-BMFF
    # ``ftyp``) are handled explicitly in ``detect_by_signature`` instead of
    # here, where duplicate keys would silently override each other.
    _FILE_SIGNATURES = {
        # Images
        b'\x89PNG\r\n\x1a\n': 'PNG',
        b'\xff\xd8\xff': 'JPEG',
        b'GIF87a': 'GIF',
        b'GIF89a': 'GIF',
        b'BM': 'BMP',
        b'II*\x00': 'TIFF',
        b'MM\x00*': 'TIFF',
        b'\x00\x00\x01\x00': 'ICO',
        b'\x00\x00\x02\x00': 'CUR',
        b'\x00\x00\x00\x0cjP\x20\x20\r\n\x87\n': 'JP2',  # JPEG 2000
        b'\xff\x4f\xff\x51': 'JP2',  # JPEG 2000 codestream
        b'\x0a\x05\x01\x08': 'PCX',
        b'P1\n': 'PBM',  # Portable Bitmap
        b'P2\n': 'PGM',  # Portable Graymap
        b'P3\n': 'PPM',  # Portable Pixmap
        b'P4\n': 'PBM',  # Portable Bitmap (binary)
        b'P5\n': 'PGM',  # Portable Graymap (binary)
        b'P6\n': 'PPM',  # Portable Pixmap (binary)
        b'\x59\xa6\x6a\x95': 'RAS',  # Sun Raster
        b'\x01\xda\x01\x01\x00\x03': 'RGB',  # SGI Image
        b'\x53\x44\x50\x58': 'DPX',  # Digital Picture Exchange
        b'\x76\x2f\x31\x01': 'EXR',  # OpenEXR
        b'gimp xcf ': 'XCF',  # GIMP native format
        b'\x38\x42\x50\x53': 'PSD',  # Photoshop Document

        # Video
        b'FLV\x01': 'FLV',
        b'\x1aE\xdf\xa3': 'WEBM',
        b'\x00\x00\x01\xba': 'MPEG',
        b'\x00\x00\x01\xb3': 'MPEG',

        # Audio
        b'ID3': 'MP3',
        b'\xff\xfb': 'MP3',
        b'\xff\xf3': 'MP3',
        b'\xff\xf2': 'MP3',
        b'fLaC': 'FLAC',
        b'MAC ': 'APE',

        # Other
        b'%PDF': 'PDF',
        b'PK\x03\x04': 'ZIP',
        b'Rar!\x1a\x07\x00': 'RAR',
        b'\x37\x7a\xbc\xaf\x27\x1c': '7Z',
    }

    # RIFF form type (bytes 8-12) -> file type.
    _RIFF_FORMS = {b'WEBP': 'WEBP', b'AVI ': 'AVI', b'WAVE': 'WAV'}

    # ISO-BMFF major brand (bytes 8-12 of an ``ftyp`` box) -> file type.
    _FTYP_BRANDS = {
        b'mp41': 'MP4', b'mp42': 'MP4', b'isom': 'MP4', b'iso2': 'MP4',
        b'avc1': 'MP4',
        b'M4A ': 'M4A',
        b'M4V ': 'M4V',
        b'qt\x20\x20': 'MOV',
        b'heic': 'HEIC', b'heix': 'HEIC',
        b'mif1': 'HEIF',
        b'avif': 'AVIF',
    }
    # Fallback for brand families that share a three-byte prefix.
    _FTYP_BRAND_PREFIXES = (
        (b'mp4', 'MP4'), (b'M4V', 'M4V'), (b'M4A', 'M4A'), (b'3gp', '3GP'),
    )

    _EXTENSION_MAP = {
        # Images - common formats
        '.jpg': 'JPEG', '.jpeg': 'JPEG', '.png': 'PNG', '.gif': 'GIF',
        '.webp': 'WEBP', '.bmp': 'BMP', '.tiff': 'TIFF', '.tif': 'TIFF',
        '.ico': 'ICO', '.svg': 'SVG', '.heic': 'HEIC', '.avif': 'AVIF',

        # Images - professional formats
        '.psd': 'PSD', '.psb': 'PSB',  # Photoshop
        '.ai': 'AI',  # Adobe Illustrator
        '.eps': 'EPS', '.ps': 'PS',  # PostScript
        '.pdf': 'PDF',  # can also be used as an image format

        # Images - RAW formats
        '.raw': 'RAW', '.cr2': 'CR2', '.cr3': 'CR3',  # Canon
        '.nef': 'NEF', '.nrw': 'NRW',  # Nikon
        '.arw': 'ARW', '.srf': 'SRF', '.sr2': 'SR2',  # Sony
        '.orf': 'ORF',  # Olympus
        '.rw2': 'RW2',  # Panasonic
        '.dng': 'DNG',  # Adobe Digital Negative
        '.raf': 'RAF',  # Fujifilm
        '.3fr': '3FR',  # Hasselblad
        '.fff': 'FFF',  # Imacon
        '.dcr': 'DCR', '.mrw': 'MRW',  # Minolta
        '.mos': 'MOS',  # Leaf
        '.ptx': 'PTX', '.pef': 'PEF',  # Pentax
        '.x3f': 'X3F',  # Sigma

        # Images - other formats
        '.jp2': 'JP2', '.jpx': 'JPX', '.j2k': 'J2K',  # JPEG 2000
        '.jxr': 'JXR', '.hdp': 'HDP', '.wdp': 'WDP',  # JPEG XR
        '.jxl': 'JXL',  # JPEG XL
        '.heif': 'HEIF',  # High Efficiency Image Format
        '.dds': 'DDS',  # DirectDraw Surface
        '.tga': 'TGA', '.targa': 'TGA',  # Truevision TGA
        '.pcx': 'PCX',  # PC Paintbrush
        '.pbm': 'PBM', '.pgm': 'PGM', '.ppm': 'PPM', '.pnm': 'PNM',  # Netpbm
        '.xbm': 'XBM', '.xpm': 'XPM',  # X11 Bitmap/Pixmap
        '.sgi': 'SGI', '.rgb': 'RGB',  # Silicon Graphics Image
        '.ras': 'RAS', '.sun': 'SUN',  # Sun Raster
        '.iff': 'IFF', '.lbm': 'LBM', '.ilbm': 'ILBM',  # Interchange File Format
        '.mng': 'MNG', '.jng': 'JNG',  # Multiple-image Network Graphics
        '.wbmp': 'WBMP',  # Wireless Bitmap
        '.cur': 'CUR',  # Windows Cursor
        '.ani': 'ANI',  # Windows Animated Cursor
        '.icns': 'ICNS',  # Apple Icon Image
        '.dpx': 'DPX',  # Digital Picture Exchange
        '.exr': 'EXR',  # OpenEXR
        '.hdr': 'HDR', '.rgbe': 'RGBE',  # Radiance HDR
        '.pfm': 'PFM',  # Portable Float Map
        '.xcf': 'XCF',  # GIMP native format
        '.kra': 'KRA',  # Krita Document
        '.ora': 'ORA',  # OpenRaster
        '.clip': 'CLIP',  # Clip Studio Paint
        '.sai': 'SAI', '.sai2': 'SAI2',  # PaintTool SAI
        '.mdp': 'MDP',  # FireAlpaca/MediBang Paint
        '.procreate': 'PROCREATE',  # Procreate

        # Video
        '.mp4': 'MP4', '.avi': 'AVI', '.mov': 'MOV', '.wmv': 'WMV',
        '.flv': 'FLV', '.webm': 'WEBM', '.mkv': 'MKV', '.m4v': 'M4V',
        '.mpg': 'MPEG', '.mpeg': 'MPEG', '.3gp': '3GP', '.ogv': 'OGV',
        '.ts': 'TS', '.mts': 'MTS', '.vob': 'VOB',

        # Audio
        '.mp3': 'MP3', '.wav': 'WAV', '.flac': 'FLAC', '.aac': 'AAC',
        '.ogg': 'OGG', '.wma': 'WMA', '.m4a': 'M4A', '.ape': 'APE',
        '.opus': 'OPUS', '.aiff': 'AIFF', '.au': 'AU',
    }

    _MIME_TYPE_MAP = {
        # Images - basic formats
        'image/jpeg': 'JPEG', 'image/png': 'PNG', 'image/gif': 'GIF',
        'image/webp': 'WEBP', 'image/bmp': 'BMP', 'image/tiff': 'TIFF',
        'image/svg+xml': 'SVG', 'image/x-icon': 'ICO',

        # Images - modern formats
        'image/heic': 'HEIC', 'image/heif': 'HEIF', 'image/avif': 'AVIF',
        'image/jxl': 'JXL', 'image/jp2': 'JP2', 'image/jpx': 'JPX',
        'image/jxr': 'JXR', 'image/vnd.ms-photo': 'JXR',

        # Images - other formats
        'image/x-targa': 'TGA', 'image/x-tga': 'TGA',
        'image/x-pcx': 'PCX', 'image/x-portable-bitmap': 'PBM',
        'image/x-portable-graymap': 'PGM', 'image/x-portable-pixmap': 'PPM',
        'image/x-portable-anymap': 'PNM', 'image/x-xbitmap': 'XBM',
        'image/x-xpixmap': 'XPM', 'image/x-sgi': 'SGI',
        'image/x-sun-raster': 'RAS', 'image/x-iff': 'IFF',
        'image/vnd.wap.wbmp': 'WBMP', 'image/x-ms-bmp': 'BMP',
        'image/vnd.adobe.photoshop': 'PSD', 'image/x-photoshop': 'PSD',
        'image/x-exr': 'EXR', 'image/vnd.radiance': 'HDR',
        'image/x-xcf': 'XCF', 'image/openraster': 'ORA',

        # Video
        'video/mp4': 'MP4', 'video/avi': 'AVI', 'video/quicktime': 'MOV',
        'video/x-msvideo': 'AVI', 'video/webm': 'WEBM', 'video/x-flv': 'FLV',
        'video/3gpp': '3GP', 'video/ogg': 'OGV',

        # Audio
        'audio/mpeg': 'MP3', 'audio/wav': 'WAV', 'audio/flac': 'FLAC',
        'audio/aac': 'AAC', 'audio/ogg': 'OGG', 'audio/x-ms-wma': 'WMA',
        'audio/mp4': 'M4A', 'audio/opus': 'OPUS',
    }

    # Category sets are built once instead of on every get_file_category call.
    # The image set holds the common formats plus every type that
    # _MIME_TYPE_MAP maps from an ``image/*`` MIME type, so the detector does
    # not contradict itself (e.g. ``image/heif`` used to end up in 'other').
    _IMAGE_TYPES = frozenset({
        'PNG', 'JPEG', 'GIF', 'WEBP', 'BMP', 'TIFF', 'ICO', 'SVG', 'HEIC',
        'AVIF', 'HEIF', 'JXL', 'JP2', 'JPX', 'JXR', 'TGA', 'PCX', 'PBM',
        'PGM', 'PPM', 'PNM', 'XBM', 'XPM', 'SGI', 'RAS', 'IFF', 'WBMP',
        'PSD', 'EXR', 'HDR', 'XCF', 'ORA',
    })
    _VIDEO_TYPES = frozenset({
        'MP4', 'AVI', 'MOV', 'WMV', 'FLV', 'WEBM', 'MKV', 'M4V', 'MPEG',
        '3GP', 'OGV', 'TS', 'MTS', 'VOB',
    })
    _AUDIO_TYPES = frozenset({
        'MP3', 'WAV', 'FLAC', 'AAC', 'OGG', 'WMA', 'M4A', 'APE', 'OPUS',
        'AIFF', 'AU',
    })

    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

    def __init__(self):
        """Create a detector with its own copies of the lookup tables."""
        self.file_signatures = dict(self._FILE_SIGNATURES)
        self.extension_map = dict(self._EXTENSION_MAP)
        self.mime_type_map = dict(self._MIME_TYPE_MAP)
        self._session = None

    @property
    def session(self):
        """HTTP session used for partial downloads, created on first use."""
        if self._session is None:
            session = requests.Session()
            session.headers.update({'User-Agent': self._USER_AGENT})
            self._session = session
        return self._session

    @session.setter
    def session(self, value):
        self._session = value

    def get_file_extension(self, url: str) -> tuple:
        """Return ``(extension, site)`` for *url*.

        ``extension`` is the lower-cased suffix of the last path segment
        including the leading dot (``''`` when there is none) and ``site`` is
        the URL's network location. Dots in directory names are ignored, and
        the query string never takes part because ``urlparse`` has already
        split it off.
        """
        parsed = urlparse(url)
        filename = parsed.path.lower().rsplit('/', 1)[-1]
        _, dot, suffix = filename.rpartition('.')
        return ('.' + suffix if dot else ''), parsed.netloc

    def detect_by_extension(self, url: str) -> Optional[str]:
        """Return the file type implied by the URL's extension, if known."""
        extension, _site = self.get_file_extension(url)
        return self.extension_map.get(extension)

    def detect_by_mime_type(self, content_type: str) -> Optional[str]:
        """Return the file type for a ``Content-Type`` header value, if known.

        Parameters after ``;`` (charset, boundary, ...) are ignored and the
        comparison is case-insensitive. Empty or non-string values give None.
        """
        if not content_type or not isinstance(content_type, str):
            return None
        mime_type = content_type.split(';')[0].strip().lower()
        return self.mime_type_map.get(mime_type)

    def get_partial_content(self, url: str, max_bytes: int = 64) -> Optional[bytes]:
        """Fetch at most *max_bytes* bytes from the start of *url*.

        A ``Range`` header is sent, and the body is streamed so that a server
        which ignores the header (plain 200) does not make us download the
        whole file. Returns None on a non-200/206 status or a request error.
        """
        if max_bytes <= 0:
            return None

        headers = {'Range': f'bytes=0-{max_bytes - 1}'}
        try:
            with self.session.get(url, headers=headers, timeout=10, stream=True) as response:
                if response.status_code not in (200, 206):
                    return None
                head = bytearray()
                for piece in response.iter_content(chunk_size=max_bytes):
                    head.extend(piece)
                    if len(head) >= max_bytes:
                        break
                return bytes(head[:max_bytes])
        except RequestException as e:
            logging.warning(f"获取内容失败: {e}")
        return None

    @classmethod
    def _match_ftyp_brand(cls, brand: bytes) -> Optional[str]:
        """Map an ISO-BMFF major brand to a file type, or None if unknown."""
        file_type = cls._FTYP_BRANDS.get(bytes(brand))
        if file_type:
            return file_type
        for prefix, prefixed_type in cls._FTYP_BRAND_PREFIXES:
            if brand.startswith(prefix):
                return prefixed_type
        return None

    def detect_by_signature(self, data: bytes) -> Optional[str]:
        """Return the file type identified by the magic bytes in *data*.

        Container formats are resolved from the sub-type stored inside the
        header; everything else is a prefix match against
        ``self.file_signatures``. Returns None when nothing matches or when a
        RIFF header is too short / carries an unknown form type.
        """
        if not data:
            return None

        # ISO base media files (MP4, MOV, HEIC, AVIF, ...): the first four
        # bytes are a box size, the brand follows the 'ftyp' tag at offset 4.
        if len(data) >= 12 and data[4:8] == b'ftyp':
            return self._match_ftyp_brand(data[8:12])

        if data.startswith(b'RIFF'):
            return self._RIFF_FORMS.get(bytes(data[8:12]))

        if data.startswith(b'OggS'):
            head = bytes(data[:64]).lower()
            if b'vorbis' in head:
                return 'OGG'
            return 'OGV' if b'theora' in head else 'OGG'

        if data.startswith(b'FORM'):
            # AIFF audio is an IFF container whose form type is AIFF/AIFC.
            return 'AIFF' if bytes(data[8:12]) in (b'AIFF', b'AIFC') else 'IFF'

        for signature, file_type in self.file_signatures.items():
            if data.startswith(signature):
                return file_type
        return None

    def get_detailed_info(self, url, content_type, data) -> Dict:
        """Combine MIME type, extension and signature into one verdict.

        Args:
            url: URL the content was fetched from.
            content_type: value of the ``Content-Type`` response header.
            data: leading bytes of the file (may be empty or None).

        Returns:
            A dict with the keys ``url``, ``site``, ``detected_type``,
            ``confidence`` (``unknown``/``medium``/``high``/``very_high``),
            ``methods_used``, ``content_type``, ``extension`` and ``cate``.
        """
        extension, site = self.get_file_extension(url)
        result = {
            'url': url,
            'site': site,
            'detected_type': None,
            'confidence': 'unknown',
            'methods_used': [],
            'content_type': content_type,
            'extension': extension,
        }

        # 1. MIME type reported by the server.
        mime_detected = self.detect_by_mime_type(content_type)
        if mime_detected:
            result['detected_type'] = mime_detected
            result['confidence'] = 'high'
            result['methods_used'].append('mime_type')

        # 2. URL extension: fills the gap, or confirms the MIME type.
        ext_detected = self.extension_map.get(extension)
        if ext_detected:
            if not result['detected_type']:
                result['detected_type'] = ext_detected
                result['confidence'] = 'medium'
            elif result['detected_type'] == ext_detected:
                result['confidence'] = 'very_high'
            result['methods_used'].append('extension')

        # 3. Magic bytes, consulted only while the verdict is still weak.
        if data and result['confidence'] in ('unknown', 'medium'):
            signature_detected = self.detect_by_signature(data)
            if signature_detected:
                if result['detected_type'] == signature_detected:
                    result['confidence'] = 'very_high'
                else:
                    # Nothing detected yet, or a conflict: trust the signature.
                    result['detected_type'] = signature_detected
                    result['confidence'] = 'high'
                result['methods_used'].append('file_signature')

        result['cate'] = self.get_file_category(result['detected_type'])
        return result

    def get_file_category(self, file_type: str) -> str:
        """Return 'image', 'video', 'audio', 'other' or 'Unknown' for a type."""
        if not file_type or file_type == 'Unknown':
            return 'Unknown'
        if file_type in self._IMAGE_TYPES:
            return 'image'
        if file_type in self._VIDEO_TYPES:
            return 'video'
        if file_type in self._AUDIO_TYPES:
            return 'audio'
        return 'other'
344
+
4
345
 
5
346
  class Request:
347
+ """
348
+ HTTP 请求封装类,提供统一的请求接口和相关功能。
6
349
 
7
- __REQUEST_ATTRS__ = {
8
- "params",
9
- "headers",
10
- "cookies",
11
- "data",
12
- "json",
13
- "files",
14
- "auth",
15
- "timeout",
16
- "proxies",
17
- "hooks",
18
- "stream",
19
- "verify",
20
- "cert",
21
- "allow_redirects",
350
+ Features:
351
+ - 自动 User-Agent 生成
352
+ - 灵活的请求参数配置
353
+ - 文件类型检测
354
+ - 错误处理和状态码检查
355
+ """
356
+
357
+ # 支持的 requests 库参数
358
+ _REQUEST_ATTRS: Set[str] = frozenset({
359
+ "params", "headers", "cookies", "data", "json", "files",
360
+ "auth", "timeout", "proxies", "hooks", "stream", "verify",
361
+ "cert", "allow_redirects"
362
+ })
363
+
364
+ # 默认超时时间
365
+ _DEFAULT_TIMEOUT = 30
366
+
367
+ # User-Agent 模板和版本范围
368
+ _UA_TEMPLATE = (
369
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) "
370
+ "AppleWebKit/{v4}.{v3} (KHTML, like Gecko) "
371
+ "Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}"
372
+ )
373
+ _UA_VERSION_RANGES = {
374
+ 'v1': (4, 15), 'v2': (3, 11), 'v3': (1, 16),
375
+ 'v4': (533, 605), 'v5': (1000, 6000), 'v6': (10, 80)
22
376
  }
23
377
 
24
378
  def __init__(
25
379
  self,
26
- url,
27
- seed,
28
- random_ua=True,
29
- check_status_code=True,
380
+ url: str,
381
+ seed: Any = None,
382
+ method: Optional[str] = None,
383
+ random_ua: bool = True,
384
+ check_status_code: bool = True,
30
385
  **kwargs
31
386
  ):
387
+ """
388
+ 初始化请求对象。
389
+
390
+ Args:
391
+ url: 请求的 URL
392
+ seed: 种子对象或标识符
393
+ method: HTTP 方法,如果不指定则自动推断
394
+ random_ua: 是否使用随机 User-Agent
395
+ check_status_code: 是否检查响应状态码
396
+ **kwargs: 其他请求参数
397
+
398
+ Raises:
399
+ ValueError: 当 URL 格式无效时
400
+ """
401
+ self.scheme = None
402
+ self.netloc = None
403
+ self.response = None
404
+ self.detector_info = None
405
+ self.content_length = None
406
+ self._validate_url(url)
407
+
32
408
  self.url = url
33
409
  self.seed = seed
34
410
  self.check_status_code = check_status_code
35
- self.request_setting = {}
411
+ self.request_settings: Dict[str, Any] = {}
412
+
413
+ # 分离请求参数和实例属性
414
+ self._process_kwargs(kwargs)
36
415
 
37
- for k, v in kwargs.items():
38
- if k in self.__class__.__REQUEST_ATTRS__:
39
- self.request_setting[k] = v
40
- continue
41
- self.__setattr__(k, v)
416
+ self.method = self._determine_method(method)
42
417
 
43
- if not getattr(self, "method", None):
44
- self.method = "POST" if self.request_setting.get("data") or self.request_setting.get("json") else "GET"
418
+ # 设置默认超时
419
+ if 'timeout' not in self.request_settings:
420
+ self.request_settings['timeout'] = self._DEFAULT_TIMEOUT
45
421
 
422
+ # 构建请求头
46
423
  if random_ua:
47
- self._build_header()
424
+ self._setup_headers()
48
425
 
49
- @property
50
- def _random_ua(self) -> str:
51
- v1 = random.randint(4, 15)
52
- v2 = random.randint(3, 11)
53
- v3 = random.randint(1, 16)
54
- v4 = random.randint(533, 605)
55
- v5 = random.randint(1000, 6000)
56
- v6 = random.randint(10, 80)
57
- user_agent = (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) AppleWebKit/{v4}.{v3} "
58
- f"(KHTML, like Gecko) Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
59
- return user_agent
60
-
61
- def _build_header(self) -> dict:
62
- if not self.request_setting.get("headers"):
63
- self.request_setting["headers"] = {"accept": "*/*", "user-agent": self._random_ua}
64
- elif "user-agent" not in [key.lower() for key in self.request_setting["headers"].keys()]:
65
- self.request_setting["headers"]["user-agent"] = self._random_ua
426
+ def _validate_url(self, url: str) -> None:
427
+ """验证 URL 格式"""
428
+ try:
429
+ result = urlparse(url)
430
+ self.scheme = result.scheme
431
+ self.netloc = result.netloc
432
+ if not all([self.scheme, self.netloc]):
433
+ raise ValueError(f"无效的 URL 格式: {url}")
434
+ except Exception as e:
435
+ raise ValueError(f"URL 解析失败: {e}")
436
+
437
+ def _process_kwargs(self, kwargs: Dict[str, Any]) -> None:
438
+ """处理关键字参数,分离请求参数和实例属性"""
439
+ for key, value in kwargs.items():
440
+ if key in self._REQUEST_ATTRS:
441
+ self.request_settings[key] = value
442
+ else:
443
+ setattr(self, key, value)
444
+
445
+ def _determine_method(self, method: Optional[str]) -> str:
446
+ if method:
447
+ return method.upper()
448
+
449
+ has_body = bool(
450
+ self.request_settings.get("data") or
451
+ self.request_settings.get("json") or
452
+ self.request_settings.get("files")
453
+ )
454
+ return "POST" if has_body else "GET"
455
+
456
+ def _generate_random_ua(self) -> str:
457
+ """生成随机 User-Agent"""
458
+ versions = {
459
+ key: random.randint(*range_tuple)
460
+ for key, range_tuple in self._UA_VERSION_RANGES.items()
461
+ }
462
+ return self._UA_TEMPLATE.format(**versions)
463
+
464
+ def _setup_headers(self) -> None:
465
+ """设置请求头,包括随机 User-Agent"""
466
+ headers = self.request_settings.setdefault("headers", {})
467
+
468
+ # 使用小写键名进行检查,保持一致性
469
+ ua_keys = ['user-agent', 'User-Agent']
470
+ if not any(headers.get(key) for key in ua_keys):
471
+ headers["User-Agent"] = self._generate_random_ua()
66
472
 
473
+ def execute(self) -> requests.Response:
474
+ """
475
+ 执行 HTTP 请求。
476
+
477
+ Returns:
478
+ requests.Response: 响应对象
479
+
480
+ Raises:
481
+ RequestException: 请求执行失败
482
+ requests.HTTPError: HTTP 状态码错误(当 check_status_code=True 时)
483
+ """
484
+ try:
485
+ self.response = requests.request(
486
+ method=self.method,
487
+ url=self.url,
488
+ **self.request_settings
489
+ )
490
+
491
+ if self.check_status_code:
492
+ self.response.raise_for_status()
493
+
494
+ return self.response
495
+
496
+ except RequestException as e:
497
+ logging.error(f"请求执行失败 - URL: {self.url}, 错误: {e}")
498
+ raise
499
+
500
+ # 保持向后兼容性
67
501
  def download(self) -> requests.Response:
68
- response = requests.request(self.method, self.url, **self.request_setting)
502
+ """下载方法,为了向后兼容性保留"""
503
+ return self.execute()
504
+
505
+ def normal_download(self, file_type_detect: bool = True) -> bytes:
506
+ """普通下载模式"""
507
+ detect_settings = self.request_settings.copy()
508
+ detect_settings.pop('stream', None)
509
+
510
+ response = requests.request(
511
+ method=self.method,
512
+ url=self.url,
513
+ **detect_settings
514
+ )
515
+
69
516
  if self.check_status_code:
70
517
  response.raise_for_status()
71
- return response
72
518
 
519
+ content_type = response.headers.get('content-type')
520
+ result = response.content
521
+ response.close()
522
+
523
+ if file_type_detect and not self.detector_info:
524
+ head_data = result[:64]
525
+ detector = FileTypeDetector()
526
+ self.detector_info = detector.get_detailed_info(
527
+ url=self.url, content_type=content_type, data=head_data
528
+ )
529
+
530
+ return result
531
+
532
+ def _log_download_progress(self, start_time, downloaded):
533
+ try:
534
+ elapsed_time = time.time() - start_time
535
+ elapsed_time_str = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
536
+ progress = downloaded / self.content_length
537
+ downloaded_mb = downloaded / (1024 * 1024)
538
+ total_mb = self.content_length / (1024 * 1024)
539
+ speed = downloaded / elapsed_time / 1024
540
+ filled_length = int(50 * progress)
541
+ bar = '█' * filled_length + '-' * (50 - filled_length)
542
+ logging.debug(
543
+ f"\n\r\rDownloading {self.url}: |{bar}| {progress * 100:.1f}% "
544
+ f"{downloaded_mb:.1f}/{total_mb:.1f} MB [Time:{elapsed_time_str}, Speed {speed:.2f} KB/s]"
545
+ )
546
+ except Exception:
547
+ pass
548
+
549
+ def range_download(self, start: int = 0, chunk_size: int = 1024, file_type_detect: bool = True):
550
+ # 分块下载
551
+ downloaded = start
552
+ retry_count = 0
553
+ max_retries = 3
554
+
555
+ start_time = time.time()
556
+
557
+ detect_settings = self.request_settings.copy()
558
+ detect_settings.pop('stream', None)
559
+
560
+ if file_type_detect and not self.detector_info:
561
+ detect_settings.setdefault("headers", {})['Range'] = "bytes=0-63"
562
+ test_response = requests.request(
563
+ method=self.method,
564
+ url=self.url,
565
+ **detect_settings
566
+ )
567
+ content_type = test_response.headers.get("Content-Type")
568
+ head_data = test_response.content
569
+ test_response.close()
570
+
571
+ detector = FileTypeDetector()
572
+ self.detector_info = detector.get_detailed_info(
573
+ url=self.url, content_type=content_type, data=head_data
574
+ )
575
+
576
+ while downloaded < self.content_length:
577
+ _start = downloaded
578
+ _end = min(downloaded + chunk_size - 1, self.content_length - 1)
579
+ detect_settings.setdefault("headers", {})['Range'] = f"bytes={_start}-{_end}"
580
+ try:
581
+
582
+ self.response = requests.request(
583
+ method=self.method,
584
+ url=self.url,
585
+ **detect_settings
586
+ )
587
+
588
+ if self.response.status_code == 206:
589
+ chunk_data = self.response.content
590
+ yield chunk_data
591
+ downloaded += len(chunk_data)
592
+ retry_count = 0 # 重置重试计数
593
+ self._log_download_progress(
594
+ start_time=start_time,
595
+ downloaded=downloaded
596
+ )
597
+ elif self.response.status_code == 416: # Range Not Satisfiable
598
+ logging.info("Range请求超出范围")
599
+ break
600
+
601
+ except Exception as e:
602
+ logging.exception(f"请求失败 - URL: {self.url}, 错误: {e}, 当前重试次数: {retry_count}")
603
+ finally:
604
+ self.response.close()
605
+ self.response = None
606
+ if retry_count < max_retries:
607
+ time.sleep(0.5 * retry_count)
608
+ retry_count += 1
609
+ continue
610
+ else:
611
+ raise ValueError(f"超过当前最大重试次数,请求失败!当前重试次数: {retry_count}")
612
+
613
+ def stream_download(self, file_type_detect: bool = True):
614
+
615
+ downloaded = start = self.seed.params.start or 0
616
+ detect_settings = self.request_settings.copy()
617
+ detect_settings.setdefault("headers", {})['Range'] = f"bytes={start}-"
618
+ detect_settings['stream'] = True
619
+
620
+ self.response = requests.request(
621
+ method=self.method,
622
+ url=self.url,
623
+ **detect_settings
624
+ )
625
+
626
+ if self.check_status_code:
627
+ self.response.raise_for_status()
628
+
629
+ content = b""
630
+ start_time = time.time()
631
+ chunk_size = self.seed.params.chunk_size
632
+
633
+ for part_data in self.response.iter_content(1024):
634
+ content += part_data
635
+ downloaded += len(part_data)
636
+ if start == 0 and downloaded > 64 and file_type_detect and not self.detector_info:
637
+ detector = FileTypeDetector()
638
+ content_type = self.response.headers.get("Content-Type")
639
+ self.content_length = int(self.response.headers.get('content-length', 0))
640
+ self.detector_info = detector.get_detailed_info(
641
+ url=self.url, content_type=content_type, data=content[:64]
642
+ )
643
+
644
+ if not chunk_size:
645
+ max_size = 5 * 1024 * 1024
646
+ calculated_size = self.content_length / 8000
647
+ chunk_size = max(calculated_size, max_size)
648
+ self.seed.params.chunk_size = chunk_size
649
+
650
+ upload_data = content[:64]
651
+ content = content[64:]
652
+ yield upload_data
653
+
654
+ while len(content) >= chunk_size:
655
+ upload_data = content[:chunk_size]
656
+ content = content[chunk_size:]
657
+
658
+ yield upload_data
659
+ self.seed.params.start = downloaded
660
+
661
+ self._log_download_progress(
662
+ start_time=start_time,
663
+ downloaded=downloaded
664
+ )
665
+
666
+ if content:
667
+ yield content
668
+
669
+ self.response.close()
670
+
671
+ def detect_accept_ranges(self) -> bool:
672
+ detect_settings = self.request_settings.copy()
673
+ detect_settings.pop('stream', None)
674
+
675
+ head_response = requests.head(self.url, **detect_settings)
676
+ if head_response.status_code not in [200, 206]:
677
+ logging.error(f"HEAD请求失败: {head_response.status_code}")
678
+ raise ValueError("HTTP状态码错误")
679
+
680
+ self.content_length = int(head_response.headers.get('content-length', 0))
681
+
682
+ test_range_settings = detect_settings.copy()
683
+ test_range_settings.setdefault("headers", {})['Range'] = "bytes=0-63"
684
+ test_response = requests.request(
685
+ method=self.method,
686
+ url=self.url,
687
+ **test_range_settings
688
+ )
689
+ head_data = test_response.content
690
+ content_type = test_response.headers.get("Content-Type")
691
+
692
+ if test_response.status_code == 206 and len(head_data) == 64:
693
+ supports_range = True
694
+ elif test_response.status_code == 200:
695
+ supports_range = False
696
+ self.response = test_response
697
+ head_data = head_data[:64]
698
+ logging.debug(f"Range请求方式不支持, 实际{len(head_data)}")
699
+ else:
700
+ supports_range = False
701
+ logging.error(f"Range请求失败: {test_response.status_code}")
702
+
703
+ if not self.detector_info:
704
+ self.response = test_response
705
+ detector = FileTypeDetector()
706
+ self.detector_info = detector.get_detailed_info(
707
+ url=self.url, content_type=content_type, data=head_data
708
+ )
709
+
710
+ test_response.close()
711
+ return supports_range
712
+
713
+ def detect_file_type(self) -> Dict[str, Any]:
714
+ """
715
+ 检测文件类型。
716
+
717
+ Returns:
718
+ Dict[str, Any]: 文件类型信息
719
+
720
+ Raises:
721
+ RequestException: 请求执行失败
722
+ ImportError: FileTypeDetector 未找到
723
+ """
724
+ try:
725
+ # 创建检测请求的配置
726
+ detect_settings = self.request_settings.copy()
727
+
728
+ # 设置 Range 头获取文件前64字节
729
+ headers = detect_settings.setdefault("headers", {}).copy()
730
+ headers['Range'] = "bytes=0-63"
731
+ detect_settings["headers"] = headers
732
+
733
+ # 移除 stream 参数避免冲突
734
+ detect_settings.pop('stream', None)
735
+
736
+ # 执行检测请求
737
+ response = requests.request(
738
+ method=self.method,
739
+ url=self.url,
740
+ **detect_settings
741
+ )
742
+
743
+ content_type = response.headers.get("Content-Type")
744
+ detector = FileTypeDetector()
745
+
746
+ return detector.get_detailed_info(
747
+ url=self.url,
748
+ content_type=content_type,
749
+ data=response.content
750
+ )
751
+
752
+ except RequestException as e:
753
+ logging.error(f"文件类型检测失败 - URL: {self.url}, 错误: {e}")
754
+
755
+ @property
756
+ def to_dict(self) -> Dict[str, Any]:
757
+ excluded_keys = {"request_settings", "url", "seed", "method", "response", "check_status_code"}
758
+ result = {
759
+ key: value for key, value in self.__dict__.items()
760
+ if not key.startswith('_') and key not in excluded_keys
761
+ }
762
+ result['request_settings'] = self.request_settings.copy()
763
+ return result
764
+
765
+ def __repr__(self) -> str:
766
+ return f"Request(method='{self.method}', url='{self.url}')"
767
+
768
+ def __str__(self) -> str:
769
+ return f"{self.method} {self.url}"