cobweb-launcher 3.1.25__tar.gz → 3.1.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cobweb-launcher-3.1.25/cobweb_launcher.egg-info → cobweb-launcher-3.1.27}/PKG-INFO +1 -1
- cobweb-launcher-3.1.27/cobweb/base/request.py +467 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/base/response.py +8 -0
- cobweb-launcher-3.1.27/cobweb/base/test.py +257 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27/cobweb_launcher.egg-info}/PKG-INFO +1 -1
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb_launcher.egg-info/SOURCES.txt +1 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/setup.py +1 -1
- cobweb-launcher-3.1.25/cobweb/base/request.py +0 -88
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/LICENSE +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/README.md +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/__init__.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/base/__init__.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/base/common_queue.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/base/item.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/base/logger.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/base/seed.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/base/task_queue.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/constant.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/crawlers/__init__.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/crawlers/crawler.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/db/api_db.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/db/redis_db.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/launchers/__init__.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/launchers/distributor.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/launchers/launcher.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/launchers/uploader.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/pipelines/__init__.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/pipelines/pipeline.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/pipelines/pipeline_csv.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/pipelines/pipeline_loghub.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/schedulers/__init__.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/schedulers/scheduler.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/schedulers/scheduler_with_redis.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/setting.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/utils/__init__.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/utils/bloom.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/utils/decorators.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/utils/dotting.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb/utils/tools.py +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/setup.cfg +0 -0
@@ -0,0 +1,467 @@
|
|
1
|
+
import random
|
2
|
+
import logging
|
3
|
+
import requests
|
4
|
+
|
5
|
+
from urllib.parse import urlparse
|
6
|
+
from typing import Any, Set, Dict, Optional
|
7
|
+
from requests.exceptions import RequestException
|
8
|
+
|
9
|
+
|
10
|
+
class FileTypeDetector:
    """Identify a file's media type from its MIME type, URL extension and leading bytes.

    The three detectors (``detect_by_mime_type``, ``detect_by_extension``,
    ``detect_by_signature``) can be used on their own; ``get_detailed_info``
    combines them and reports how far they agree.
    """

    # Form-type tag stored at bytes 8..12 of a RIFF container.
    _RIFF_FORMS = {b'WEBP': 'WEBP', b'AVI ': 'AVI', b'WAVE': 'WAV'}

    # Major brand stored at bytes 8..12 of an ISO base media ``ftyp`` box.
    # Every brand is exactly four bytes, space padded (QuickTime is ``qt`` + 2 spaces).
    _FTYP_BRANDS = {
        b'mp41': 'MP4', b'mp42': 'MP4', b'isom': 'MP4', b'avc1': 'MP4',
        b'M4A ': 'M4A', b'M4V ': 'M4V', b'qt  ': 'MOV',
    }

    _IMAGE_TYPES = frozenset({
        'PNG', 'JPEG', 'GIF', 'WEBP', 'BMP', 'TIFF', 'ICO', 'SVG', 'HEIC', 'AVIF',
    })
    _VIDEO_TYPES = frozenset({
        'MP4', 'AVI', 'MOV', 'WMV', 'FLV', 'WEBM', 'MKV', 'M4V', 'MPEG', '3GP',
        'OGV', 'TS', 'MTS', 'VOB',
    })
    _AUDIO_TYPES = frozenset({
        'MP3', 'WAV', 'FLAC', 'AAC', 'OGG', 'WMA', 'M4A', 'APE', 'OPUS', 'AIFF', 'AU',
    })

    _DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

    def __init__(self):
        # Magic-byte prefixes, tested in insertion order with ``bytes.startswith``.
        # Each key appears once: a dict literal keeps only the last value of a
        # repeated key, so listing RIFF/OggS once per candidate format does not
        # register alternatives. Those two containers are disambiguated in
        # ``detect_by_signature``; the value here is only the fallback.
        self.file_signatures = {
            # Images
            b'\x89PNG\r\n\x1a\n': 'PNG',
            b'\xff\xd8\xff': 'JPEG',
            b'GIF87a': 'GIF',
            b'GIF89a': 'GIF',
            b'RIFF': 'WAV',  # container: WEBP / AVI / WAV decided by the form type
            b'BM': 'BMP',
            b'II*\x00': 'TIFF',
            b'MM\x00*': 'TIFF',
            b'\x00\x00\x01\x00': 'ICO',
            b'\x00\x00\x02\x00': 'CUR',

            # Video
            b'\x00\x00\x00\x18ftypmp4': 'MP4',
            b'\x00\x00\x00\x20ftypM4V': 'M4V',
            b'FLV\x01': 'FLV',
            b'\x1aE\xdf\xa3': 'WEBM',
            b'\x00\x00\x01\xba': 'MPEG',
            b'\x00\x00\x01\xb3': 'MPEG',
            b'OggS': 'OGG',  # container: OGG audio unless a Theora stream is found

            # Audio
            b'ID3': 'MP3',
            b'\xff\xfb': 'MP3',
            b'\xff\xf3': 'MP3',
            b'\xff\xf2': 'MP3',
            b'fLaC': 'FLAC',
            b'MAC ': 'APE',
            # M4A has no prefix entry: its ``ftyp`` tag sits at offset 4 and is
            # handled by the brand lookup in ``detect_by_signature``.

            # Other
            b'%PDF': 'PDF',
            b'PK\x03\x04': 'ZIP',
            b'Rar!\x1a\x07\x00': 'RAR',
            b'\x37\x7a\xbc\xaf\x27\x1c': '7Z',
        }

        # Lower-case extension (with leading dot) -> type label.
        self.extension_map = {
            # Images
            '.jpg': 'JPEG', '.jpeg': 'JPEG', '.png': 'PNG', '.gif': 'GIF',
            '.webp': 'WEBP', '.bmp': 'BMP', '.tiff': 'TIFF', '.tif': 'TIFF',
            '.ico': 'ICO', '.svg': 'SVG', '.heic': 'HEIC', '.avif': 'AVIF',

            # Video
            '.mp4': 'MP4', '.avi': 'AVI', '.mov': 'MOV', '.wmv': 'WMV',
            '.flv': 'FLV', '.webm': 'WEBM', '.mkv': 'MKV', '.m4v': 'M4V',
            '.mpg': 'MPEG', '.mpeg': 'MPEG', '.3gp': '3GP', '.ogv': 'OGV',
            '.ts': 'TS', '.mts': 'MTS', '.vob': 'VOB',

            # Audio
            '.mp3': 'MP3', '.wav': 'WAV', '.flac': 'FLAC', '.aac': 'AAC',
            '.ogg': 'OGG', '.wma': 'WMA', '.m4a': 'M4A', '.ape': 'APE',
            '.opus': 'OPUS', '.aiff': 'AIFF', '.au': 'AU',
        }

        # Lower-case MIME type (parameters stripped) -> type label.
        self.mime_type_map = {
            # Images
            'image/jpeg': 'JPEG', 'image/png': 'PNG', 'image/gif': 'GIF',
            'image/webp': 'WEBP', 'image/bmp': 'BMP', 'image/tiff': 'TIFF',
            'image/svg+xml': 'SVG', 'image/x-icon': 'ICO',

            # Video
            'video/mp4': 'MP4', 'video/avi': 'AVI', 'video/quicktime': 'MOV',
            'video/x-msvideo': 'AVI', 'video/webm': 'WEBM', 'video/x-flv': 'FLV',
            'video/3gpp': '3GP', 'video/ogg': 'OGV',

            # Audio
            'audio/mpeg': 'MP3', 'audio/wav': 'WAV', 'audio/flac': 'FLAC',
            'audio/aac': 'AAC', 'audio/ogg': 'OGG', 'audio/x-ms-wma': 'WMA',
            'audio/mp4': 'M4A', 'audio/opus': 'OPUS',
        }

        # The HTTP session is only needed by ``get_partial_content``; it is
        # created on first access so detectors that never fetch stay cheap.
        self._session = None

    @property
    def session(self):
        """HTTP session used for fetching, created on first access."""
        if self._session is None:
            created = requests.Session()
            created.headers.update({'User-Agent': self._DEFAULT_USER_AGENT})
            self._session = created
        return self._session

    @session.setter
    def session(self, value):
        self._session = value

    def get_file_extension(self, url: str) -> tuple:
        """Return ``(extension, site)`` for *url*.

        ``extension`` is the lower-cased suffix of the last path segment,
        including the leading dot, or ``''`` when that segment has no dot.
        ``site`` is the URL's network location.
        """
        parsed = urlparse(url)
        # Look at the final segment only, so a dot in a directory name
        # (``/v1.2/download``) is not mistaken for an extension. The query
        # string never reaches ``parsed.path``: urlparse splits it off.
        file_name = parsed.path.lower().rsplit('/', 1)[-1]
        _, dot, suffix = file_name.rpartition('.')
        return (dot + suffix if dot else ''), parsed.netloc

    def detect_by_extension(self, url: str) -> Optional[str]:
        """Return the type label for the URL's extension, or None if unknown."""
        extension, _ = self.get_file_extension(url)
        return self.extension_map.get(extension)

    def detect_by_mime_type(self, content_type: str) -> Optional[str]:
        """Return the type label for a Content-Type header value, or None.

        Parameters such as ``; charset=...`` are ignored and matching is
        case-insensitive. Empty or non-string values yield None.
        """
        if not content_type or not isinstance(content_type, str):
            return None
        mime_type = content_type.split(';')[0].strip().lower()
        return self.mime_type_map.get(mime_type)

    def get_partial_content(self, url: str, max_bytes: int = 64) -> Optional[bytes]:
        """Fetch at most the first *max_bytes* bytes of *url*.

        Returns None when *max_bytes* is not positive, the request fails, or
        the status is neither 200 nor 206.
        """
        if max_bytes <= 0:
            return None
        headers = {'Range': f'bytes=0-{max_bytes - 1}'}
        try:
            # Stream the body: a server that ignores Range answers 200 with the
            # whole file, and only the first few bytes are wanted.
            with self.session.get(url, headers=headers, timeout=10, stream=True) as response:
                if response.status_code not in (200, 206):
                    return None
                head = b''
                for chunk in response.iter_content(chunk_size=max_bytes):
                    head += chunk
                    if len(head) >= max_bytes:
                        break
                return head[:max_bytes]
        except requests.RequestException as e:
            logging.error(f"获取内容失败: {e}")
        return None

    def detect_by_signature(self, data: bytes) -> Optional[str]:
        """Return the type label matching the magic bytes at the start of *data*.

        Returns None for empty input, for an unrecognised prefix, and for a
        RIFF container whose form type is missing or unknown.
        """
        if not data:
            return None

        for signature, file_type in self.file_signatures.items():
            if not data.startswith(signature):
                continue
            if signature == b'RIFF':
                # RIFF is shared by WEBP, AVI and WAV; only the form type at
                # bytes 8..12 tells them apart, so do not guess without it.
                return self._RIFF_FORMS.get(bytes(data[8:12]))
            if signature == b'OggS' and len(data) >= 32:
                head = data[:64].lower()
                if b'vorbis' not in head and b'theora' in head:
                    return 'OGV'
                return 'OGG'
            return file_type

        # ISO base media files: 4-byte box size, then ``ftyp``, then the brand.
        if len(data) >= 12 and data[4:8] == b'ftyp':
            return self._FTYP_BRANDS.get(bytes(data[8:12]))

        return None

    def get_detailed_info(self, url, content_type, data) -> Dict:
        """Combine MIME type, extension and signature detection for one file.

        Args:
            url: URL the file was fetched from.
            content_type: value of the response's Content-Type header, or None.
            data: leading bytes of the body, or None/empty if unavailable.

        Returns:
            A dict with the keys ``url``, ``site``, ``detected_type``,
            ``confidence`` (``'unknown'``, ``'medium'``, ``'high'`` or
            ``'very_high'``), ``methods_used``, ``content_type``,
            ``extension`` and ``cate`` (see ``get_file_category``).
        """
        extension, site = self.get_file_extension(url)
        result = {
            'url': url,
            'site': site,
            'detected_type': None,
            'confidence': 'unknown',
            'methods_used': [],
            'content_type': content_type,
            'extension': extension,
        }

        # 1. MIME type reported by the server.
        mime_detected = self.detect_by_mime_type(content_type)
        if mime_detected:
            result['detected_type'] = mime_detected
            result['confidence'] = 'high'
            result['methods_used'].append('mime_type')

        # 2. URL extension.
        ext_detected = self.extension_map.get(extension)
        if ext_detected:
            if not result['detected_type']:
                result['detected_type'] = ext_detected
                result['confidence'] = 'medium'
            elif result['detected_type'] == ext_detected:
                result['confidence'] = 'very_high'  # MIME type and extension agree
            # NOTE: on disagreement the MIME type is kept at 'high' confidence.
            result['methods_used'].append('extension')

        # 3. Magic bytes, consulted only while the answer is still uncertain.
        if result['confidence'] in ('unknown', 'medium'):
            signature_detected = self.detect_by_signature(data)
            if signature_detected:
                if result['detected_type'] == signature_detected:
                    result['confidence'] = 'very_high'
                else:
                    # Nothing detected yet, or a conflict: trust the file content.
                    result['detected_type'] = signature_detected
                    result['confidence'] = 'high'
                result['methods_used'].append('file_signature')

        result['cate'] = self.get_file_category(result['detected_type'])
        return result

    def get_file_category(self, file_type: str) -> str:
        """Map a type label to ``'image'``, ``'video'``, ``'audio'`` or ``'other'``.

        Returns ``'Unknown'`` when *file_type* is empty, None or ``'Unknown'``.
        """
        if not file_type or file_type == 'Unknown':
            return 'Unknown'
        if file_type in self._IMAGE_TYPES:
            return 'image'
        if file_type in self._VIDEO_TYPES:
            return 'video'
        if file_type in self._AUDIO_TYPES:
            return 'audio'
        return 'other'
|
253
|
+
|
254
|
+
|
255
|
+
class Request:
    """Wrapper around a single ``requests`` call.

    Keyword arguments named in ``_REQUEST_ATTRS`` are forwarded to
    ``requests.request``; every other keyword argument becomes an attribute
    of the instance.

    Features:
        - random User-Agent generation
        - HTTP method inferred from the presence of a request body
        - file type detection from the first bytes of the response
        - optional status-code checking
    """

    # Keyword arguments understood by ``requests.request``.
    _REQUEST_ATTRS: frozenset = frozenset({
        "params", "headers", "cookies", "data", "json", "files",
        "auth", "timeout", "proxies", "hooks", "stream", "verify",
        "cert", "allow_redirects"
    })

    # Timeout in seconds applied when the caller passes none.
    _DEFAULT_TIMEOUT = 30

    # Number of leading body bytes fetched for file type detection.
    _DETECT_BYTES = 64

    # User-Agent template and the inclusive range of each random field.
    _UA_TEMPLATE = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) "
        "AppleWebKit/{v4}.{v3} (KHTML, like Gecko) "
        "Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}"
    )
    _UA_VERSION_RANGES = {
        'v1': (4, 15), 'v2': (3, 11), 'v3': (1, 16),
        'v4': (533, 605), 'v5': (1000, 6000), 'v6': (10, 80)
    }

    def __init__(
            self,
            url: str,
            seed: Any = None,
            method: Optional[str] = None,
            random_ua: bool = True,
            check_status_code: bool = True,
            **kwargs
    ):
        """Build a request description; nothing is sent until ``execute``.

        Args:
            url: target URL; must have both a scheme and a network location.
            seed: seed object or identifier stored on the instance.
            method: HTTP method; inferred from the body arguments when omitted.
            random_ua: add a random User-Agent header unless one is supplied.
            check_status_code: make ``execute`` raise on 4xx/5xx responses.
            **kwargs: ``requests`` options and extra instance attributes.

        Raises:
            ValueError: if *url* cannot be parsed or lacks a scheme or host.
        """
        self.scheme = None
        self.netloc = None
        self._validate_url(url)

        self.url = url
        self.seed = seed
        self.check_status_code = check_status_code
        self.request_settings: Dict[str, Any] = {}

        # Split kwargs into request options and plain attributes.
        self._process_kwargs(kwargs)

        self.method = self._determine_method(method)

        if 'timeout' not in self.request_settings:
            self.request_settings['timeout'] = self._DEFAULT_TIMEOUT

        if random_ua:
            self._setup_headers()

    def _validate_url(self, url: str) -> None:
        """Record the URL's scheme and netloc, raising ValueError if either is empty."""
        try:
            parsed = urlparse(url)
        except (ValueError, TypeError, AttributeError) as e:
            raise ValueError(f"URL 解析失败: {e}") from e

        self.scheme = parsed.scheme
        self.netloc = parsed.netloc
        # Raised outside the ``try`` so the message is not wrapped a second time.
        if not (self.scheme and self.netloc):
            raise ValueError(f"无效的 URL 格式: {url}")

    def _process_kwargs(self, kwargs: Dict[str, Any]) -> None:
        """Route each keyword argument to ``request_settings`` or to an attribute."""
        for key, value in kwargs.items():
            if key in self._REQUEST_ATTRS:
                self.request_settings[key] = value
            else:
                # NOTE: an extra kwarg named like a method (e.g. ``execute``)
                # shadows that method on this instance.
                setattr(self, key, value)

    def _determine_method(self, method: Optional[str]) -> str:
        """Return *method* upper-cased, or POST/GET depending on whether a body is set."""
        if method:
            return method.upper()

        has_body = any(
            self.request_settings.get(name) for name in ("data", "json", "files")
        )
        return "POST" if has_body else "GET"

    def _generate_random_ua(self) -> str:
        """Return a User-Agent string with randomised version numbers."""
        versions = {
            key: random.randint(low, high)
            for key, (low, high) in self._UA_VERSION_RANGES.items()
        }
        return self._UA_TEMPLATE.format(**versions)

    def _setup_headers(self) -> None:
        """Ensure the request headers carry a User-Agent.

        Works on a copy of the caller's headers: writing into the caller's
        dict would pin the first random User-Agent for every later request
        built from the same dict. Header names are compared
        case-insensitively, as HTTP requires.
        """
        headers = dict(self.request_settings.get("headers") or {})

        ua_names = [name for name in headers if str(name).lower() == "user-agent"]
        if not any(headers[name] for name in ua_names):
            # Drop empty placeholders so only one User-Agent header is sent.
            for name in ua_names:
                del headers[name]
            headers["User-Agent"] = self._generate_random_ua()

        self.request_settings["headers"] = headers

    def execute(self) -> "requests.Response":
        """Send the request and return the response.

        Raises:
            RequestException: if the request fails; also covers the
                ``HTTPError`` raised for a 4xx/5xx status when
                ``check_status_code`` is true. The error is logged first.
        """
        try:
            response = requests.request(
                method=self.method,
                url=self.url,
                **self.request_settings
            )

            if self.check_status_code:
                response.raise_for_status()

            return response

        except RequestException as e:
            logging.error(f"请求执行失败 - URL: {self.url}, 错误: {e}")
            raise

    def download(self) -> "requests.Response":
        """Alias of ``execute`` kept for backward compatibility."""
        return self.execute()

    def detect_file_type(self) -> Optional[Dict[str, Any]]:
        """Detect the type of the file behind this request.

        Sends the request with a ``Range`` header for the first bytes, then
        feeds the Content-Type header and those bytes to
        ``FileTypeDetector.get_detailed_info``. The response status code is
        not checked.

        Returns:
            The detector's result dict, or None if the request failed (the
            failure is logged, not raised).
        """
        detect_settings = self.request_settings.copy()

        # Copy the headers so the Range header does not leak into ``execute``.
        headers = dict(detect_settings.get("headers") or {})
        headers['Range'] = f"bytes=0-{self._DETECT_BYTES - 1}"
        detect_settings["headers"] = headers

        # Stream the body: a server that ignores Range answers with the whole
        # file, and only the leading bytes are needed.
        detect_settings["stream"] = True

        try:
            with requests.request(
                    method=self.method,
                    url=self.url,
                    **detect_settings
            ) as response:
                content_type = response.headers.get("Content-Type")
                head = b""
                for chunk in response.iter_content(chunk_size=self._DETECT_BYTES):
                    head += chunk
                    if len(head) >= self._DETECT_BYTES:
                        break
        except RequestException as e:
            logging.error(f"文件类型检测失败 - URL: {self.url}, 错误: {e}")
            return None

        return FileTypeDetector().get_detailed_info(
            url=self.url,
            content_type=content_type,
            data=head[:self._DETECT_BYTES]
        )

    @property
    def to_dict(self) -> Dict[str, Any]:
        """Public instance attributes plus a shallow copy of ``request_settings``."""
        result = {
            key: value for key, value in self.__dict__.items()
            if not key.startswith('_') and key != "request_settings"
        }
        result['request_settings'] = self.request_settings.copy()
        return result

    def __repr__(self) -> str:
        return f"Request(method='{self.method}', url='{self.url}')"

    def __str__(self) -> str:
        return f"{self.method} {self.url}"
|
@@ -1,3 +1,4 @@
|
|
1
|
+
from typing import Any
|
1
2
|
|
2
3
|
|
3
4
|
class Response:
|
@@ -22,3 +23,10 @@ class Response:
|
|
22
23
|
_dict.pop('response')
|
23
24
|
return _dict
|
24
25
|
|
26
|
+
def __getattr__(self, name: str) -> Any:
|
27
|
+
"""动态获取未定义的属性,返回 None"""
|
28
|
+
return None
|
29
|
+
|
30
|
+
def __getitem__(self, key: str) -> Any:
|
31
|
+
"""支持字典式获取属性"""
|
32
|
+
return getattr(self, key, None)
|
@@ -0,0 +1,257 @@
|
|
1
|
+
import requests
|
2
|
+
from urllib.parse import urlparse
|
3
|
+
from typing import Dict, Optional
|
4
|
+
|
5
|
+
|
6
|
+
class FileTypeDetector:
    """Identify a file's media type from its MIME type, URL extension and leading bytes.

    ``detect_file_type`` fetches the start of a URL and returns a type label;
    ``get_detailed_info`` combines the individual detectors on data the
    caller already has and reports how far they agree.
    """

    # Form-type tag stored at bytes 8..12 of a RIFF container.
    _RIFF_FORMS = {b'WEBP': 'WEBP', b'AVI ': 'AVI', b'WAVE': 'WAV'}

    # Major brand stored at bytes 8..12 of an ISO base media ``ftyp`` box.
    # Every brand is exactly four bytes, space padded (QuickTime is ``qt`` + 2 spaces).
    _FTYP_BRANDS = {
        b'mp41': 'MP4', b'mp42': 'MP4', b'isom': 'MP4', b'avc1': 'MP4',
        b'M4A ': 'M4A', b'M4V ': 'M4V', b'qt  ': 'MOV',
    }

    _IMAGE_TYPES = frozenset({
        'PNG', 'JPEG', 'GIF', 'WEBP', 'BMP', 'TIFF', 'ICO', 'SVG', 'HEIC', 'AVIF',
    })
    _VIDEO_TYPES = frozenset({
        'MP4', 'AVI', 'MOV', 'WMV', 'FLV', 'WEBM', 'MKV', 'M4V', 'MPEG', '3GP',
        'OGV', 'TS', 'MTS', 'VOB',
    })
    _AUDIO_TYPES = frozenset({
        'MP3', 'WAV', 'FLAC', 'AAC', 'OGG', 'WMA', 'M4A', 'APE', 'OPUS', 'AIFF', 'AU',
    })

    _DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

    def __init__(self):
        # Magic-byte prefixes, tested in insertion order with ``bytes.startswith``.
        # Each key appears once: a dict literal keeps only the last value of a
        # repeated key. RIFF and OggS are disambiguated in
        # ``detect_by_signature``; the value here is only the fallback.
        self.file_signatures = {
            # Images
            b'\x89PNG\r\n\x1a\n': 'PNG',
            b'\xff\xd8\xff': 'JPEG',
            b'GIF87a': 'GIF',
            b'GIF89a': 'GIF',
            b'RIFF': 'WAV',  # container: WEBP / AVI / WAV decided by the form type
            b'BM': 'BMP',
            b'II*\x00': 'TIFF',
            b'MM\x00*': 'TIFF',
            b'\x00\x00\x01\x00': 'ICO',
            b'\x00\x00\x02\x00': 'CUR',

            # Video
            b'\x00\x00\x00\x18ftypmp4': 'MP4',
            b'\x00\x00\x00\x20ftypM4V': 'M4V',
            b'FLV\x01': 'FLV',
            b'\x1aE\xdf\xa3': 'WEBM',
            b'\x00\x00\x01\xba': 'MPEG',
            b'\x00\x00\x01\xb3': 'MPEG',
            b'OggS': 'OGG',  # container: OGG audio unless a Theora stream is found

            # Audio
            b'ID3': 'MP3',
            b'\xff\xfb': 'MP3',
            b'\xff\xf3': 'MP3',
            b'\xff\xf2': 'MP3',
            b'fLaC': 'FLAC',
            b'MAC ': 'APE',
            # M4A has no prefix entry: its ``ftyp`` tag sits at offset 4 and is
            # handled by the brand lookup in ``detect_by_signature``.

            # Other
            b'%PDF': 'PDF',
            b'PK\x03\x04': 'ZIP',
            b'Rar!\x1a\x07\x00': 'RAR',
            b'\x37\x7a\xbc\xaf\x27\x1c': '7Z',
        }

        # Lower-case extension (with leading dot) -> type label.
        self.extension_map = {
            # Images
            '.jpg': 'JPEG', '.jpeg': 'JPEG', '.png': 'PNG', '.gif': 'GIF',
            '.webp': 'WEBP', '.bmp': 'BMP', '.tiff': 'TIFF', '.tif': 'TIFF',
            '.ico': 'ICO', '.svg': 'SVG', '.heic': 'HEIC', '.avif': 'AVIF',

            # Video
            '.mp4': 'MP4', '.avi': 'AVI', '.mov': 'MOV', '.wmv': 'WMV',
            '.flv': 'FLV', '.webm': 'WEBM', '.mkv': 'MKV', '.m4v': 'M4V',
            '.mpg': 'MPEG', '.mpeg': 'MPEG', '.3gp': '3GP', '.ogv': 'OGV',
            '.ts': 'TS', '.mts': 'MTS', '.vob': 'VOB',

            # Audio
            '.mp3': 'MP3', '.wav': 'WAV', '.flac': 'FLAC', '.aac': 'AAC',
            '.ogg': 'OGG', '.wma': 'WMA', '.m4a': 'M4A', '.ape': 'APE',
            '.opus': 'OPUS', '.aiff': 'AIFF', '.au': 'AU',
        }

        # Lower-case MIME type (parameters stripped) -> type label.
        self.mime_type_map = {
            # Images
            'image/jpeg': 'JPEG', 'image/png': 'PNG', 'image/gif': 'GIF',
            'image/webp': 'WEBP', 'image/bmp': 'BMP', 'image/tiff': 'TIFF',
            'image/svg+xml': 'SVG', 'image/x-icon': 'ICO',

            # Video
            'video/mp4': 'MP4', 'video/avi': 'AVI', 'video/quicktime': 'MOV',
            'video/x-msvideo': 'AVI', 'video/webm': 'WEBM', 'video/x-flv': 'FLV',
            'video/3gpp': '3GP', 'video/ogg': 'OGV',

            # Audio
            'audio/mpeg': 'MP3', 'audio/wav': 'WAV', 'audio/flac': 'FLAC',
            'audio/aac': 'AAC', 'audio/ogg': 'OGG', 'audio/x-ms-wma': 'WMA',
            'audio/mp4': 'M4A', 'audio/opus': 'OPUS',
        }

        # The HTTP session is only needed when fetching; it is created on
        # first access so purely offline use never opens one.
        self._session = None

    @property
    def session(self):
        """HTTP session used for fetching, created on first access."""
        if self._session is None:
            created = requests.Session()
            created.headers.update({'User-Agent': self._DEFAULT_USER_AGENT})
            self._session = created
        return self._session

    @session.setter
    def session(self, value):
        self._session = value

    def get_file_extension(self, url: str) -> tuple:
        """Return ``(extension, site)`` for *url*.

        ``extension`` is the lower-cased suffix of the last path segment,
        including the leading dot, or ``''`` when that segment has no dot.
        ``site`` is the URL's network location.
        """
        parsed = urlparse(url)
        # Look at the final segment only, so a dot in a directory name
        # (``/v1.2/download``) is not mistaken for an extension. The query
        # string never reaches ``parsed.path``: urlparse splits it off.
        file_name = parsed.path.lower().rsplit('/', 1)[-1]
        _, dot, suffix = file_name.rpartition('.')
        return (dot + suffix if dot else ''), parsed.netloc

    def detect_by_extension(self, url: str) -> Optional[str]:
        """Return the type label for the URL's extension, or None if unknown."""
        extension, _ = self.get_file_extension(url)
        return self.extension_map.get(extension)

    def detect_by_mime_type(self, content_type: str) -> Optional[str]:
        """Return the type label for a Content-Type header value, or None.

        Parameters such as ``; charset=...`` are ignored and matching is
        case-insensitive. Empty or non-string values yield None.
        """
        if not content_type or not isinstance(content_type, str):
            return None
        mime_type = content_type.split(';')[0].strip().lower()
        return self.mime_type_map.get(mime_type)

    def _fetch_head(self, url: str, max_bytes: int) -> tuple:
        """Return ``(content_type, first max_bytes of the body)`` for *url*.

        Both elements are None when the request fails or the status is
        neither 200 nor 206.
        """
        import logging

        headers = {'Range': f'bytes=0-{max_bytes - 1}'}
        try:
            # Stream the body: a server that ignores Range answers 200 with the
            # whole file, and only the first few bytes are wanted.
            with self.session.get(url, headers=headers, timeout=10, stream=True) as response:
                if response.status_code not in (200, 206):
                    return None, None
                head = b''
                for chunk in response.iter_content(chunk_size=max_bytes):
                    head += chunk
                    if len(head) >= max_bytes:
                        break
                return response.headers.get('Content-Type'), head[:max_bytes]
        except requests.RequestException as e:
            logging.error(f"获取内容失败: {e}")
        return None, None

    def get_partial_content(self, url: str, max_bytes: int = 64) -> Optional[bytes]:
        """Fetch at most the first *max_bytes* bytes of *url*.

        Returns None when *max_bytes* is not positive, the request fails, or
        the status is neither 200 nor 206.
        """
        if max_bytes <= 0:
            return None
        return self._fetch_head(url, max_bytes)[1]

    def detect_by_signature(self, data: bytes) -> Optional[str]:
        """Return the type label matching the magic bytes at the start of *data*.

        Returns None for empty input, for an unrecognised prefix, and for a
        RIFF container whose form type is missing or unknown.
        """
        if not data:
            return None

        for signature, file_type in self.file_signatures.items():
            if not data.startswith(signature):
                continue
            if signature == b'RIFF':
                # RIFF is shared by WEBP, AVI and WAV; only the form type at
                # bytes 8..12 tells them apart, so do not guess without it.
                return self._RIFF_FORMS.get(bytes(data[8:12]))
            if signature == b'OggS' and len(data) >= 32:
                head = data[:64].lower()
                if b'vorbis' not in head and b'theora' in head:
                    return 'OGV'
                return 'OGG'
            return file_type

        # ISO base media files: 4-byte box size, then ``ftyp``, then the brand.
        if len(data) >= 12 and data[4:8] == b'ftyp':
            return self._FTYP_BRANDS.get(bytes(data[8:12]))

        return None

    def get_detailed_info(self, url, content_type=None, data=None) -> Dict:
        """Combine MIME type, extension and signature detection for one file.

        No network access happens here; pass whatever is already known.

        Args:
            url: URL the file was (or would be) fetched from.
            content_type: value of the response's Content-Type header, if any.
            data: leading bytes of the body, if any.

        Returns:
            A dict with the keys ``url``, ``site``, ``detected_type``,
            ``confidence`` (``'unknown'``, ``'medium'``, ``'high'`` or
            ``'very_high'``), ``methods_used``, ``content_type`` and
            ``extension``.
        """
        extension, site = self.get_file_extension(url)
        result = {
            'url': url,
            'site': site,
            'detected_type': None,
            'confidence': 'unknown',
            'methods_used': [],
            'content_type': content_type,
            'extension': extension,
        }

        # 1. MIME type reported by the server.
        mime_detected = self.detect_by_mime_type(content_type)
        if mime_detected:
            result['detected_type'] = mime_detected
            result['confidence'] = 'high'
            result['methods_used'].append('mime_type')

        # 2. URL extension.
        ext_detected = self.extension_map.get(extension)
        if ext_detected:
            if not result['detected_type']:
                result['detected_type'] = ext_detected
                result['confidence'] = 'medium'
            elif result['detected_type'] == ext_detected:
                result['confidence'] = 'very_high'  # MIME type and extension agree
            # NOTE: on disagreement the MIME type is kept at 'high' confidence.
            result['methods_used'].append('extension')

        # 3. Magic bytes, consulted only while the answer is still uncertain.
        if result['confidence'] in ('unknown', 'medium'):
            signature_detected = self.detect_by_signature(data)
            if signature_detected:
                if result['detected_type'] == signature_detected:
                    result['confidence'] = 'very_high'
                else:
                    # Nothing detected yet, or a conflict: trust the file content.
                    result['detected_type'] = signature_detected
                    result['confidence'] = 'high'
                result['methods_used'].append('file_signature')

        return result

    def detect_file_type(self, url: str) -> str:
        """Fetch the start of *url* and return its type label, or ``'Unknown'``.

        If the fetch fails, detection falls back to the URL's extension.
        """
        content_type, data = self._fetch_head(url, 64)
        info = self.get_detailed_info(url, content_type, data)
        # ``detected_type`` is always present, so a ``.get`` default would
        # never apply; fall back explicitly when it is None.
        return info.get('detected_type') or 'Unknown'

    def get_file_category(self, file_type: str) -> str:
        """Map a type label to ``'Image'``, ``'Video'``, ``'Audio'`` or ``'Other'``.

        Returns ``'Unknown'`` when *file_type* is empty, None or ``'Unknown'``.
        """
        if not file_type or file_type == 'Unknown':
            return 'Unknown'
        if file_type in self._IMAGE_TYPES:
            return 'Image'
        if file_type in self._VIDEO_TYPES:
            return 'Video'
        if file_type in self._AUDIO_TYPES:
            return 'Audio'
        return 'Other'
|
252
|
+
|
253
|
+
|
254
|
+
# if __name__ == "__main__":
|
255
|
+
# detector = FileTypeDetector()
|
256
|
+
# result = detector.get_detailed_info("https://cdn.pixabay.com/user/2024/12/10/12-18-33-812_96x96.jpeg")
|
257
|
+
# print(result)
|
@@ -1,88 +0,0 @@
|
|
1
|
-
import random
|
2
|
-
import requests
|
3
|
-
from typing import Any, Dict
|
4
|
-
|
5
|
-
|
6
|
-
class Request:
    """Wrapper around a single ``requests`` call.

    Keyword arguments named in ``__REQUEST_ATTRS__`` are forwarded to
    ``requests.request``; every other keyword argument (including
    ``method``) becomes an attribute of the instance.
    """

    # Keyword arguments understood by ``requests.request``.
    __REQUEST_ATTRS__ = {
        "params",
        "headers",
        "cookies",
        "data",
        "json",
        "files",
        "auth",
        "timeout",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "allow_redirects",
    }

    def __init__(
            self,
            url: str,
            seed: Any,
            random_ua: bool = True,
            check_status_code: bool = True,
            **kwargs,
    ):
        """Build a request description; nothing is sent until ``download``.

        :param url: target URL.
        :param seed: seed object or identifier stored on the instance.
        :param random_ua: add a random User-Agent header unless one is supplied.
        :param check_status_code: make ``download`` raise on 4xx/5xx responses.
        :param kwargs: ``requests`` options and extra instance attributes.
        """
        self.url = url
        self.seed = seed
        self.check_status_code = check_status_code
        self.request_setting: Dict[str, Any] = {}

        known = self.__class__.__REQUEST_ATTRS__
        for key, value in kwargs.items():
            if key in known:
                self.request_setting[key] = value
            else:
                setattr(self, key, value)

        # An explicit ``method`` kwarg was stored as an attribute by the loop
        # above; otherwise any request body (data, json or files) implies POST.
        has_body = any(
            self.request_setting.get(name) for name in ("data", "json", "files")
        )
        self.method = getattr(self, "method", None) or ("POST" if has_body else "GET")

        if random_ua:
            self._build_header()

    @property
    def _random_ua(self) -> str:
        """A User-Agent string with randomised version numbers."""
        v1 = random.randint(4, 15)
        v2 = random.randint(3, 11)
        v3 = random.randint(1, 16)
        v4 = random.randint(533, 605)
        v5 = random.randint(1000, 6000)
        v6 = random.randint(10, 80)
        return (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) "
                f"AppleWebKit/{v4}.{v3} (KHTML, like Gecko) "
                f"Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")

    def _build_header(self):
        """Ensure the request headers carry a User-Agent.

        Works on a copy so the caller's dict is not modified, and compares
        header names case-insensitively so a caller-supplied ``User-Agent``
        is not joined by a second ``user-agent`` entry.
        """
        headers = dict(self.request_setting.get("headers") or {})
        has_ua = any(
            str(name).lower() == "user-agent" and value
            for name, value in headers.items()
        )
        if not has_ua:
            headers["user-agent"] = self._random_ua
        self.request_setting["headers"] = headers

    def download(self) -> "requests.Response":
        """Send the request; raises ``requests.HTTPError`` on 4xx/5xx if status checking is on."""
        response = requests.request(self.method, self.url, **self.request_setting)
        if self.check_status_code:
            response.raise_for_status()
        return response

    @property
    def to_dict(self) -> Dict[str, Any]:
        """Instance attributes other than url, seed, check_status_code and request_setting."""
        excluded_keys = {"url", "seed", "check_status_code", "request_setting"}
        return {k: v for k, v in self.__dict__.items() if k not in excluded_keys}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{cobweb-launcher-3.1.25 → cobweb-launcher-3.1.27}/cobweb_launcher.egg-info/dependency_links.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|