cobweb-launcher 3.1.25__py3-none-any.whl → 3.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cobweb/base/request.py CHANGED
@@ -1,88 +1,467 @@
1
1
  import random
2
+ import logging
2
3
  import requests
3
- from typing import Any, Dict
4
+
5
+ from urllib.parse import urlparse
6
+ from typing import Any, Set, Dict, Optional
7
+ from requests.exceptions import RequestException
8
+
9
+
10
class FileTypeDetector:
    """Guess the media type of a remote file.

    Evidence is combined from the HTTP ``Content-Type`` header, the extension
    of the URL path and the leading "magic" bytes of the payload. The lookup
    tables are plain instance attributes and may be extended by callers.
    """

    # User-Agent installed on the lazily created session.
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

    # Type names grouped by the category that ``get_file_category`` reports.
    _IMAGE_TYPES = frozenset({
        'PNG', 'JPEG', 'GIF', 'WEBP', 'BMP', 'TIFF', 'ICO', 'SVG', 'HEIC', 'AVIF',
    })
    _VIDEO_TYPES = frozenset({
        'MP4', 'AVI', 'MOV', 'WMV', 'FLV', 'WEBM', 'MKV', 'M4V', 'MPEG', '3GP',
        'OGV', 'TS', 'MTS', 'VOB',
    })
    _AUDIO_TYPES = frozenset({
        'MP3', 'WAV', 'FLAC', 'AAC', 'OGG', 'WMA', 'M4A', 'APE', 'OPUS', 'AIFF', 'AU',
    })

    def __init__(self):
        # Magic-byte prefixes, tested with ``bytes.startswith``. Every key is
        # listed exactly once: a dict literal silently keeps only the last
        # value of a repeated key.
        self.file_signatures = {
            # Images
            b'\x89PNG\r\n\x1a\n': 'PNG',
            b'\xff\xd8\xff': 'JPEG',
            b'GIF87a': 'GIF',
            b'GIF89a': 'GIF',
            b'BM': 'BMP',
            b'II*\x00': 'TIFF',
            b'MM\x00*': 'TIFF',
            b'\x00\x00\x01\x00': 'ICO',
            b'\x00\x00\x02\x00': 'CUR',

            # Containers: the concrete type is decided in detect_by_signature;
            # the value here is only returned when the sample is too short to
            # inspect further.
            b'RIFF': 'WAV',
            b'OggS': 'OGG',

            # Video
            b'\x00\x00\x00\x18ftypmp4': 'MP4',
            b'\x00\x00\x00\x20ftypM4V': 'M4V',
            b'FLV\x01': 'FLV',
            b'\x1aE\xdf\xa3': 'WEBM',
            b'\x00\x00\x01\xba': 'MPEG',
            b'\x00\x00\x01\xb3': 'MPEG',

            # Audio
            b'ID3': 'MP3',
            b'\xff\xfb': 'MP3',
            b'\xff\xf3': 'MP3',
            b'\xff\xf2': 'MP3',
            b'fLaC': 'FLAC',
            # Only matches a sample that begins at the ftyp tag itself; files
            # with the usual 4-byte size prefix are caught by the ftyp brand
            # check in detect_by_signature.
            b'ftypM4A': 'M4A',
            b'MAC ': 'APE',

            # Other
            b'%PDF': 'PDF',
            b'PK\x03\x04': 'ZIP',
            b'Rar!\x1a\x07\x00': 'RAR',
            b'\x37\x7a\xbc\xaf\x27\x1c': '7Z',
        }

        # Lower-case URL extension (with leading dot) -> type name.
        self.extension_map = {
            # Images
            '.jpg': 'JPEG', '.jpeg': 'JPEG', '.png': 'PNG', '.gif': 'GIF',
            '.webp': 'WEBP', '.bmp': 'BMP', '.tiff': 'TIFF', '.tif': 'TIFF',
            '.ico': 'ICO', '.svg': 'SVG', '.heic': 'HEIC', '.avif': 'AVIF',

            # Video
            '.mp4': 'MP4', '.avi': 'AVI', '.mov': 'MOV', '.wmv': 'WMV',
            '.flv': 'FLV', '.webm': 'WEBM', '.mkv': 'MKV', '.m4v': 'M4V',
            '.mpg': 'MPEG', '.mpeg': 'MPEG', '.3gp': '3GP', '.ogv': 'OGV',
            '.ts': 'TS', '.mts': 'MTS', '.vob': 'VOB',

            # Audio
            '.mp3': 'MP3', '.wav': 'WAV', '.flac': 'FLAC', '.aac': 'AAC',
            '.ogg': 'OGG', '.wma': 'WMA', '.m4a': 'M4A', '.ape': 'APE',
            '.opus': 'OPUS', '.aiff': 'AIFF', '.au': 'AU',
        }

        # Lower-case MIME type (without parameters) -> type name.
        self.mime_type_map = {
            # Images
            'image/jpeg': 'JPEG', 'image/png': 'PNG', 'image/gif': 'GIF',
            'image/webp': 'WEBP', 'image/bmp': 'BMP', 'image/tiff': 'TIFF',
            'image/svg+xml': 'SVG', 'image/x-icon': 'ICO',

            # Video
            'video/mp4': 'MP4', 'video/avi': 'AVI', 'video/quicktime': 'MOV',
            'video/x-msvideo': 'AVI', 'video/webm': 'WEBM', 'video/x-flv': 'FLV',
            'video/3gpp': '3GP', 'video/ogg': 'OGV',

            # Audio
            'audio/mpeg': 'MP3', 'audio/wav': 'WAV', 'audio/flac': 'FLAC',
            'audio/aac': 'AAC', 'audio/ogg': 'OGG', 'audio/x-ms-wma': 'WMA',
            'audio/mp4': 'M4A', 'audio/opus': 'OPUS',
        }

        # Created on first access of ``session``: most callers only use the
        # pure lookup methods and never need a connection pool.
        self._session = None

    @property
    def session(self):
        """HTTP session used by ``get_partial_content``; built on first use."""
        if self._session is None:
            session = requests.Session()
            session.headers.update({'User-Agent': self._USER_AGENT})
            self._session = session
        return self._session

    @session.setter
    def session(self, value) -> None:
        self._session = value

    def get_file_extension(self, url: str) -> tuple:
        """Return ``(extension, site)`` for *url*.

        ``extension`` is the lower-cased suffix of the last path segment,
        including the leading dot, or ``''`` when that segment contains no
        dot. ``site`` is the URL's network location as parsed.
        """
        parsed = urlparse(url)
        # urlparse already removes the query string from ``path``. Only the
        # last segment is inspected so a dot in a directory name
        # (``/v1.2/download``) is not mistaken for an extension.
        filename = parsed.path.lower().rsplit('/', 1)[-1]
        if '.' in filename:
            return '.' + filename.rsplit('.', 1)[-1], parsed.netloc
        return '', parsed.netloc

    def detect_by_extension(self, url: str) -> Optional[str]:
        """Return the type mapped to the URL's extension, or ``None``."""
        ext, _site = self.get_file_extension(url)
        return self.extension_map.get(ext)

    def detect_by_mime_type(self, content_type: str) -> Optional[str]:
        """Return the type mapped to a ``Content-Type`` value, or ``None``.

        Parameters after ``;`` are ignored. Missing or non-string values
        yield ``None``.
        """
        if not content_type or not isinstance(content_type, str):
            return None

        mime_type = content_type.split(';')[0].strip().lower()
        return self.mime_type_map.get(mime_type)

    def get_partial_content(self, url: str, max_bytes: int = 64) -> Optional[bytes]:
        """Fetch the first *max_bytes* bytes of *url* with a ranged GET.

        Returns the response body for status 200 or 206, otherwise ``None``.
        Any error is logged and also results in ``None`` (best effort).
        """
        try:
            headers = {'Range': f'bytes=0-{max_bytes - 1}'}
            response = self.session.get(url, headers=headers, timeout=10)

            if response.status_code in (200, 206):
                return response.content
        except Exception as e:
            logging.warning("获取内容失败: %s", e)
        return None

    def detect_by_signature(self, data: bytes) -> Optional[str]:
        """Return the type identified by the leading bytes of *data*, or ``None``."""
        if not data:
            return None

        for signature, file_type in self.file_signatures.items():
            if not data.startswith(signature):
                continue

            if signature == b'RIFF' and len(data) >= 12:
                # The RIFF form type at offset 8 names the real format.
                form_type = data[8:12]
                if form_type == b'WEBP':
                    return 'WEBP'
                if form_type == b'AVI ':
                    return 'AVI'
                if form_type == b'WAVE':
                    return 'WAV'
                # Unknown RIFF form: keep scanning, which ends in ``None``.
            elif signature == b'OggS' and len(data) >= 32:
                # Codec names appear in the first Ogg page; vorbis wins over
                # theora, and anything else is reported as audio.
                head = data[:64].lower()
                if b'vorbis' in head:
                    return 'OGG'
                if b'theora' in head:
                    return 'OGV'
                return 'OGG'
            else:
                return file_type

        # ISO base media files: 4-byte box size, ``ftyp``, 4-byte major brand.
        if len(data) >= 12 and data[4:8] == b'ftyp':
            brand = data[8:12]
            if brand in (b'mp41', b'mp42', b'isom', b'avc1'):
                return 'MP4'
            if brand == b'M4A ':
                return 'M4A'
            if brand == b'M4V ':
                return 'M4V'
            # The QuickTime brand is space-padded to four bytes.
            if brand == b'qt  ':
                return 'MOV'

        return None

    def get_detailed_info(self, url, content_type, data) -> Dict:
        """Combine MIME type, extension and signature evidence for one file.

        Args:
            url: URL the file was fetched from.
            content_type: value of the response ``Content-Type`` header, if any.
            data: leading bytes of the payload, if any.

        Returns:
            Dict with keys ``url``, ``site``, ``detected_type``, ``confidence``
            (``unknown`` / ``medium`` / ``high`` / ``very_high``),
            ``methods_used``, ``content_type``, ``extension`` and ``cate``.
        """
        result = {
            'url': url,
            'site': None,
            'detected_type': None,
            'confidence': 'unknown',
            'methods_used': [],
            'content_type': content_type,
            'extension': None
        }

        # 1. MIME type from the response header.
        mime_detected = self.detect_by_mime_type(content_type)
        if mime_detected:
            result['detected_type'] = mime_detected
            result['confidence'] = 'high'
            result['methods_used'].append('mime_type')

        # 2. URL extension.
        ext_detected = self.detect_by_extension(url)
        result['extension'], result['site'] = self.get_file_extension(url)

        if ext_detected:
            if not result['detected_type']:
                result['detected_type'] = ext_detected
                result['confidence'] = 'medium'
            elif result['detected_type'] == ext_detected:
                # MIME type and extension agree.
                result['confidence'] = 'very_high'
            result['methods_used'].append('extension')

        # 3. Magic bytes, consulted only while the answer is still uncertain.
        if result['confidence'] in ('unknown', 'medium'):
            signature_detected = self.detect_by_signature(data)
            if signature_detected:
                if not result['detected_type']:
                    result['detected_type'] = signature_detected
                    result['confidence'] = 'high'
                elif result['detected_type'] == signature_detected:
                    result['confidence'] = 'very_high'
                else:
                    # On conflict the file signature is trusted.
                    result['detected_type'] = signature_detected
                    result['confidence'] = 'high'
                result['methods_used'].append('file_signature')

        result['cate'] = self.get_file_category(result['detected_type'])
        return result

    def get_file_category(self, file_type: str) -> str:
        """Map a type name to ``image``, ``video``, ``audio``, ``other`` or ``Unknown``."""
        if not file_type or file_type == 'Unknown':
            return 'Unknown'

        if file_type in self._IMAGE_TYPES:
            return 'image'
        if file_type in self._VIDEO_TYPES:
            return 'video'
        if file_type in self._AUDIO_TYPES:
            return 'audio'
        return 'other'
4
253
 
5
254
 
6
255
class Request:
    """
    Wrapper around one HTTP request executed through ``requests``.

    Features:
        - random User-Agent generation
        - keyword arguments split into ``requests`` options and instance attributes
        - file type detection through ``FileTypeDetector``
        - optional status-code checking
    """

    # Keyword arguments forwarded to ``requests.request``. Any other keyword
    # passed to ``__init__`` is stored as an instance attribute instead.
    _REQUEST_ATTRS: frozenset = frozenset({
        "params", "headers", "cookies", "data", "json", "files",
        "auth", "timeout", "proxies", "hooks", "stream", "verify",
        "cert", "allow_redirects"
    })

    # Timeout (seconds) applied when the caller does not pass ``timeout``.
    _DEFAULT_TIMEOUT = 30

    # User-Agent template and the inclusive range each placeholder is drawn from.
    _UA_TEMPLATE = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) "
        "AppleWebKit/{v4}.{v3} (KHTML, like Gecko) "
        "Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}"
    )
    _UA_VERSION_RANGES = {
        'v1': (4, 15), 'v2': (3, 11), 'v3': (1, 16),
        'v4': (533, 605), 'v5': (1000, 6000), 'v6': (10, 80)
    }

    def __init__(
            self,
            url: str,
            seed: Any = None,
            method: Optional[str] = None,
            random_ua: bool = True,
            check_status_code: bool = True,
            **kwargs
    ):
        """
        Initialise the request.

        Args:
            url: URL to request; must have a scheme and a network location.
            seed: seed object or identifier carried along with the request.
            method: HTTP method; inferred from the body arguments when omitted.
            random_ua: add a random User-Agent when none is supplied.
            check_status_code: make ``execute`` raise on HTTP error statuses.
            **kwargs: ``requests`` options (see ``_REQUEST_ATTRS``); any other
                keyword becomes an instance attribute.

        Raises:
            ValueError: if the URL cannot be parsed or lacks scheme/netloc.
        """
        self.scheme = None
        self.netloc = None
        self._validate_url(url)

        self.url = url
        self.seed = seed
        self.check_status_code = check_status_code
        self.request_settings: Dict[str, Any] = {}

        # Split keyword arguments into request options and attributes.
        self._process_kwargs(kwargs)

        self.method = self._determine_method(method)

        if 'timeout' not in self.request_settings:
            self.request_settings['timeout'] = self._DEFAULT_TIMEOUT

        if random_ua:
            self._setup_headers()

    def _validate_url(self, url: str) -> None:
        """Parse *url*, record its scheme and netloc, and reject incomplete URLs."""
        try:
            result = urlparse(url)
        except Exception as e:
            raise ValueError(f"URL 解析失败: {e}") from e

        self.scheme = result.scheme
        self.netloc = result.netloc
        # Raised outside the ``try`` so this error is not caught and wrapped
        # a second time as a parse failure.
        if not (self.scheme and self.netloc):
            raise ValueError(f"无效的 URL 格式: {url}")

    def _process_kwargs(self, kwargs: Dict[str, Any]) -> None:
        """Store ``requests`` options in ``request_settings``; set the rest as attributes."""
        for key, value in kwargs.items():
            if key in self._REQUEST_ATTRS:
                self.request_settings[key] = value
            else:
                setattr(self, key, value)

    def _determine_method(self, method: Optional[str]) -> str:
        """Return the upper-cased explicit method, else POST with a body and GET without."""
        if method:
            return method.upper()

        has_body = bool(
            self.request_settings.get("data") or
            self.request_settings.get("json") or
            self.request_settings.get("files")
        )
        return "POST" if has_body else "GET"

    def _generate_random_ua(self) -> str:
        """Build a User-Agent string with randomly drawn version numbers."""
        versions = {
            key: random.randint(*range_tuple)
            for key, range_tuple in self._UA_VERSION_RANGES.items()
        }
        return self._UA_TEMPLATE.format(**versions)

    def _setup_headers(self) -> None:
        """Ensure the request headers carry a User-Agent.

        The caller's headers mapping is copied rather than mutated, so a dict
        shared between several ``Request`` objects does not pin the first
        random User-Agent for all of them.
        """
        headers = dict(self.request_settings.get("headers") or {})

        # HTTP header names are case-insensitive.
        has_user_agent = any(
            isinstance(key, str) and key.lower() == "user-agent" and value
            for key, value in headers.items()
        )
        if not has_user_agent:
            headers["User-Agent"] = self._generate_random_ua()

        self.request_settings["headers"] = headers

    def execute(self) -> "requests.Response":
        """
        Send the request.

        Returns:
            requests.Response: the response object.

        Raises:
            RequestException: the request failed.
            requests.HTTPError: error status while ``check_status_code`` is true.
        """
        try:
            response = requests.request(
                method=self.method,
                url=self.url,
                **self.request_settings
            )

            if self.check_status_code:
                response.raise_for_status()

            return response

        except RequestException as e:
            logging.error(f"请求执行失败 - URL: {self.url}, 错误: {e}")
            raise

    # Kept for backward compatibility.
    def download(self) -> "requests.Response":
        """Alias of ``execute``."""
        return self.execute()

    def detect_file_type(self) -> Dict[str, Any]:
        """
        Detect the type of the resource behind the URL.

        Issues the request with a ``Range: bytes=0-63`` header and passes the
        response's Content-Type and body to ``FileTypeDetector``.

        Returns:
            Dict[str, Any]: the detector's result dictionary.

        Raises:
            RequestException: the probing request failed.
        """
        try:
            # Work on copies so the stored settings stay untouched.
            detect_settings = self.request_settings.copy()

            headers = dict(detect_settings.get("headers") or {})
            headers['Range'] = "bytes=0-63"
            detect_settings["headers"] = headers

            # The body is read in full below, so streaming is dropped.
            detect_settings.pop('stream', None)

            response = requests.request(
                method=self.method,
                url=self.url,
                **detect_settings
            )

            content_type = response.headers.get("Content-Type")
            detector = FileTypeDetector()

            return detector.get_detailed_info(
                url=self.url,
                content_type=content_type,
                data=response.content
            )

        except RequestException as e:
            logging.error(f"文件类型检测失败 - URL: {self.url}, 错误: {e}")
            # Re-raise as documented instead of silently returning None.
            raise

    @property
    def to_dict(self) -> Dict[str, Any]:
        """Public instance attributes plus a shallow copy of ``request_settings``."""
        excluded_keys = {"request_settings"}
        result = {
            key: value for key, value in self.__dict__.items()
            if not key.startswith('_') and key not in excluded_keys
        }
        result['request_settings'] = self.request_settings.copy()
        return result

    def __repr__(self) -> str:
        return f"Request(method='{self.method}', url='{self.url}')"

    def __str__(self) -> str:
        return f"{self.method} {self.url}"
cobweb/base/response.py CHANGED
@@ -1,3 +1,4 @@
1
+ from typing import Any
1
2
 
2
3
 
3
4
  class Response:
@@ -22,3 +23,10 @@ class Response:
22
23
  _dict.pop('response')
23
24
  return _dict
24
25
 
26
+ def __getattr__(self, name: str) -> Any:
27
+ """动态获取未定义的属性,返回 None"""
28
+ return None
29
+
30
+ def __getitem__(self, key: str) -> Any:
31
+ """支持字典式获取属性"""
32
+ return getattr(self, key, None)
cobweb/base/test.py ADDED
@@ -0,0 +1,257 @@
1
+ import requests
2
+ from urllib.parse import urlparse
3
+ from typing import Dict, Optional
4
+
5
+
6
class FileTypeDetector:
    """Guess the media type of a remote file.

    Evidence is combined from the HTTP ``Content-Type`` header, the extension
    of the URL path and the leading "magic" bytes of the payload. The lookup
    tables are plain instance attributes and may be extended by callers.
    """

    # User-Agent installed on the lazily created session.
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

    # Type names grouped by the category that ``get_file_category`` reports.
    _IMAGE_TYPES = frozenset({
        'PNG', 'JPEG', 'GIF', 'WEBP', 'BMP', 'TIFF', 'ICO', 'SVG', 'HEIC', 'AVIF',
    })
    _VIDEO_TYPES = frozenset({
        'MP4', 'AVI', 'MOV', 'WMV', 'FLV', 'WEBM', 'MKV', 'M4V', 'MPEG', '3GP',
        'OGV', 'TS', 'MTS', 'VOB',
    })
    _AUDIO_TYPES = frozenset({
        'MP3', 'WAV', 'FLAC', 'AAC', 'OGG', 'WMA', 'M4A', 'APE', 'OPUS', 'AIFF', 'AU',
    })

    def __init__(self):
        # Magic-byte prefixes, tested with ``bytes.startswith``. Every key is
        # listed exactly once: a dict literal silently keeps only the last
        # value of a repeated key.
        self.file_signatures = {
            # Images
            b'\x89PNG\r\n\x1a\n': 'PNG',
            b'\xff\xd8\xff': 'JPEG',
            b'GIF87a': 'GIF',
            b'GIF89a': 'GIF',
            b'BM': 'BMP',
            b'II*\x00': 'TIFF',
            b'MM\x00*': 'TIFF',
            b'\x00\x00\x01\x00': 'ICO',
            b'\x00\x00\x02\x00': 'CUR',

            # Containers: the concrete type is decided in detect_by_signature;
            # the value here is only returned when the sample is too short to
            # inspect further.
            b'RIFF': 'WAV',
            b'OggS': 'OGG',

            # Video
            b'\x00\x00\x00\x18ftypmp4': 'MP4',
            b'\x00\x00\x00\x20ftypM4V': 'M4V',
            b'FLV\x01': 'FLV',
            b'\x1aE\xdf\xa3': 'WEBM',
            b'\x00\x00\x01\xba': 'MPEG',
            b'\x00\x00\x01\xb3': 'MPEG',

            # Audio
            b'ID3': 'MP3',
            b'\xff\xfb': 'MP3',
            b'\xff\xf3': 'MP3',
            b'\xff\xf2': 'MP3',
            b'fLaC': 'FLAC',
            # Only matches a sample that begins at the ftyp tag itself; files
            # with the usual 4-byte size prefix are caught by the ftyp brand
            # check in detect_by_signature.
            b'ftypM4A': 'M4A',
            b'MAC ': 'APE',

            # Other
            b'%PDF': 'PDF',
            b'PK\x03\x04': 'ZIP',
            b'Rar!\x1a\x07\x00': 'RAR',
            b'\x37\x7a\xbc\xaf\x27\x1c': '7Z',
        }

        # Lower-case URL extension (with leading dot) -> type name.
        self.extension_map = {
            # Images
            '.jpg': 'JPEG', '.jpeg': 'JPEG', '.png': 'PNG', '.gif': 'GIF',
            '.webp': 'WEBP', '.bmp': 'BMP', '.tiff': 'TIFF', '.tif': 'TIFF',
            '.ico': 'ICO', '.svg': 'SVG', '.heic': 'HEIC', '.avif': 'AVIF',

            # Video
            '.mp4': 'MP4', '.avi': 'AVI', '.mov': 'MOV', '.wmv': 'WMV',
            '.flv': 'FLV', '.webm': 'WEBM', '.mkv': 'MKV', '.m4v': 'M4V',
            '.mpg': 'MPEG', '.mpeg': 'MPEG', '.3gp': '3GP', '.ogv': 'OGV',
            '.ts': 'TS', '.mts': 'MTS', '.vob': 'VOB',

            # Audio
            '.mp3': 'MP3', '.wav': 'WAV', '.flac': 'FLAC', '.aac': 'AAC',
            '.ogg': 'OGG', '.wma': 'WMA', '.m4a': 'M4A', '.ape': 'APE',
            '.opus': 'OPUS', '.aiff': 'AIFF', '.au': 'AU',
        }

        # Lower-case MIME type (without parameters) -> type name.
        self.mime_type_map = {
            # Images
            'image/jpeg': 'JPEG', 'image/png': 'PNG', 'image/gif': 'GIF',
            'image/webp': 'WEBP', 'image/bmp': 'BMP', 'image/tiff': 'TIFF',
            'image/svg+xml': 'SVG', 'image/x-icon': 'ICO',

            # Video
            'video/mp4': 'MP4', 'video/avi': 'AVI', 'video/quicktime': 'MOV',
            'video/x-msvideo': 'AVI', 'video/webm': 'WEBM', 'video/x-flv': 'FLV',
            'video/3gpp': '3GP', 'video/ogg': 'OGV',

            # Audio
            'audio/mpeg': 'MP3', 'audio/wav': 'WAV', 'audio/flac': 'FLAC',
            'audio/aac': 'AAC', 'audio/ogg': 'OGG', 'audio/x-ms-wma': 'WMA',
            'audio/mp4': 'M4A', 'audio/opus': 'OPUS',
        }

        # Created on first access of ``session``: the pure lookup methods
        # never need a connection pool.
        self._session = None

    @property
    def session(self):
        """HTTP session used for network probes; built on first use."""
        if self._session is None:
            session = requests.Session()
            session.headers.update({'User-Agent': self._USER_AGENT})
            self._session = session
        return self._session

    @session.setter
    def session(self, value) -> None:
        self._session = value

    def get_file_extension(self, url: str) -> tuple:
        """Return ``(extension, site)`` for *url*.

        ``extension`` is the lower-cased suffix of the last path segment,
        including the leading dot, or ``''`` when that segment contains no
        dot. ``site`` is the URL's network location as parsed.
        """
        parsed = urlparse(url)
        # urlparse already removes the query string from ``path``. Only the
        # last segment is inspected so a dot in a directory name
        # (``/v1.2/download``) is not mistaken for an extension.
        filename = parsed.path.lower().rsplit('/', 1)[-1]
        if '.' in filename:
            return '.' + filename.rsplit('.', 1)[-1], parsed.netloc
        return '', parsed.netloc

    def detect_by_extension(self, url: str) -> Optional[str]:
        """Return the type mapped to the URL's extension, or ``None``."""
        ext, _site = self.get_file_extension(url)
        return self.extension_map.get(ext)

    def detect_by_mime_type(self, content_type: str) -> Optional[str]:
        """Return the type mapped to a ``Content-Type`` value, or ``None``.

        Parameters after ``;`` are ignored. Missing or non-string values
        yield ``None``.
        """
        if not content_type or not isinstance(content_type, str):
            return None

        mime_type = content_type.split(';')[0].strip().lower()
        return self.mime_type_map.get(mime_type)

    def _probe(self, url: str, max_bytes: int = 64) -> tuple:
        """Ranged GET returning ``(content_type, leading_bytes)``; ``(None, None)`` on failure."""
        try:
            headers = {'Range': f'bytes=0-{max_bytes - 1}'}
            response = self.session.get(url, headers=headers, timeout=10)

            if response.status_code in (200, 206):
                return response.headers.get('Content-Type'), response.content
        except Exception as e:
            # Best effort: a failed probe only means less evidence.
            print(f"获取内容失败: {e}")
        return None, None

    def get_partial_content(self, url: str, max_bytes: int = 64) -> Optional[bytes]:
        """Fetch the first *max_bytes* bytes of *url*; ``None`` when the probe fails."""
        return self._probe(url, max_bytes)[1]

    def detect_by_signature(self, data: bytes) -> Optional[str]:
        """Return the type identified by the leading bytes of *data*, or ``None``."""
        if not data:
            return None

        for signature, file_type in self.file_signatures.items():
            if not data.startswith(signature):
                continue

            if signature == b'RIFF' and len(data) >= 12:
                # The RIFF form type at offset 8 names the real format.
                form_type = data[8:12]
                if form_type == b'WEBP':
                    return 'WEBP'
                if form_type == b'AVI ':
                    return 'AVI'
                if form_type == b'WAVE':
                    return 'WAV'
                # Unknown RIFF form: keep scanning, which ends in ``None``.
            elif signature == b'OggS' and len(data) >= 32:
                # Codec names appear in the first Ogg page; vorbis wins over
                # theora, and anything else is reported as audio.
                head = data[:64].lower()
                if b'vorbis' in head:
                    return 'OGG'
                if b'theora' in head:
                    return 'OGV'
                return 'OGG'
            else:
                return file_type

        # ISO base media files: 4-byte box size, ``ftyp``, 4-byte major brand.
        if len(data) >= 12 and data[4:8] == b'ftyp':
            brand = data[8:12]
            if brand in (b'mp41', b'mp42', b'isom', b'avc1'):
                return 'MP4'
            if brand == b'M4A ':
                return 'M4A'
            if brand == b'M4V ':
                return 'M4V'
            # The QuickTime brand is space-padded to four bytes.
            if brand == b'qt  ':
                return 'MOV'

        return None

    def get_detailed_info(self, url, content_type=None, data=None) -> Dict:
        """Combine MIME type, extension and signature evidence for one file.

        Args:
            url: URL of the file.
            content_type: value of the response ``Content-Type`` header, if known.
            data: leading bytes of the payload, if known.

        Returns:
            Dict with keys ``url``, ``site``, ``detected_type``, ``confidence``
            (``unknown`` / ``medium`` / ``high`` / ``very_high``),
            ``methods_used``, ``content_type`` and ``extension``.
        """
        result = {
            'url': url,
            'site': None,
            'detected_type': None,
            'confidence': 'unknown',
            'methods_used': [],
            'content_type': content_type,
            'extension': None
        }

        # 1. MIME type from the response header.
        mime_detected = self.detect_by_mime_type(content_type)
        if mime_detected:
            result['detected_type'] = mime_detected
            result['confidence'] = 'high'
            result['methods_used'].append('mime_type')

        # 2. URL extension.
        ext_detected = self.detect_by_extension(url)
        result['extension'], result['site'] = self.get_file_extension(url)

        if ext_detected:
            if not result['detected_type']:
                result['detected_type'] = ext_detected
                result['confidence'] = 'medium'
            elif result['detected_type'] == ext_detected:
                # MIME type and extension agree.
                result['confidence'] = 'very_high'
            result['methods_used'].append('extension')

        # 3. Magic bytes, consulted only while the answer is still uncertain.
        if result['confidence'] in ('unknown', 'medium'):
            signature_detected = self.detect_by_signature(data)
            if signature_detected:
                if not result['detected_type']:
                    result['detected_type'] = signature_detected
                    result['confidence'] = 'high'
                elif result['detected_type'] == signature_detected:
                    result['confidence'] = 'very_high'
                else:
                    # On conflict the file signature is trusted.
                    result['detected_type'] = signature_detected
                    result['confidence'] = 'high'
                result['methods_used'].append('file_signature')

        return result

    def detect_file_type(self, url: str) -> str:
        """Probe *url* and return the detected type name, or ``'Unknown'``.

        One ranged GET supplies both the Content-Type header and the leading
        bytes. When the probe fails, detection falls back to the URL
        extension alone.
        """
        content_type, data = self._probe(url)
        info = self.get_detailed_info(url, content_type, data)
        # ``detected_type`` is always present but may be None.
        return info.get('detected_type') or 'Unknown'

    def get_file_category(self, file_type: str) -> str:
        """Map a type name to ``Image``, ``Video``, ``Audio``, ``Other`` or ``Unknown``."""
        if not file_type or file_type == 'Unknown':
            return 'Unknown'

        if file_type in self._IMAGE_TYPES:
            return 'Image'
        if file_type in self._VIDEO_TYPES:
            return 'Video'
        if file_type in self._AUDIO_TYPES:
            return 'Audio'
        return 'Other'
252
+
253
+
254
+ # if __name__ == "__main__":
255
+ # detector = FileTypeDetector()
256
+ # result = detector.get_detailed_info("https://cdn.pixabay.com/user/2024/12/10/12-18-33-812_96x96.jpeg")
257
+ # print(result)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 3.1.25
3
+ Version: 3.1.27
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -5,10 +5,11 @@ cobweb/base/__init__.py,sha256=NanSxJr0WsqjqCNOQAlxlkt-vQEsERHYBzacFC057oI,222
5
5
  cobweb/base/common_queue.py,sha256=hYdaM70KrWjvACuLKaGhkI2VqFCnd87NVvWzmnfIg8Q,1423
6
6
  cobweb/base/item.py,sha256=1bS4U_3vzI2jzSSeoEbLoLT_5CfgLPopWiEYtaahbvw,1674
7
7
  cobweb/base/logger.py,sha256=Vsg1bD4LXW91VgY-ANsmaUu-mD88hU_WS83f7jX3qF8,2011
8
- cobweb/base/request.py,sha256=MBYYjWpbRQRulPG0zPbK0DO3LKmScqQ4tBzFXekYkao,2652
9
- cobweb/base/response.py,sha256=g8e5H0hEiRfqseh3nD7t6a1rhIJYRMV7nI47kqNOd-U,446
8
+ cobweb/base/request.py,sha256=LBI2eCoXtQcUh6XQT813Q3rzhlhUFSm_UBGayYWIHKI,16052
9
+ cobweb/base/response.py,sha256=8gOfUJMorRg7nhN9ge3kQUjQoePXpns2XLWHIEfIui4,720
10
10
  cobweb/base/seed.py,sha256=ddaWCq_KaWwpmPl1CToJlfCxEEnoJ16kjo6azJs9uls,5000
11
11
  cobweb/base/task_queue.py,sha256=2MqGpHGNmK5B-kqv7z420RWyihzB9zgDHJUiLsmtzOI,6402
12
+ cobweb/base/test.py,sha256=N8MDGb94KQeI4pC5rCc2QdohE9_5AgcOyGqKjbMsOEs,9588
12
13
  cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
13
14
  cobweb/crawlers/crawler.py,sha256=ZZVZJ17RWuvzUFGLjqdvyVZpmuq-ynslJwXQzdm_UdQ,709
14
15
  cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
@@ -33,8 +34,8 @@ cobweb/utils/decorators.py,sha256=ZwVQlz-lYHgXgKf9KRCp15EWPzTDdhoikYUNUCIqNeM,11
33
34
  cobweb/utils/dotting.py,sha256=L-jGSApdnFIP4jUWH6p5qIme0aJ1vyDrxAx8wOJWvcs,1960
34
35
  cobweb/utils/oss.py,sha256=wmToIIVNO8nCQVRmreVaZejk01aCWS35e1NV6cr0yGI,4192
35
36
  cobweb/utils/tools.py,sha256=14TCedqt07m4z6bCnFAsITOFixeGr8V3aOKk--L7Cr0,879
36
- cobweb_launcher-3.1.25.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
37
- cobweb_launcher-3.1.25.dist-info/METADATA,sha256=QkLxxH-8qIdxnqsEB6W-dZjX4PtnoYqnCemFTXzgyNg,6051
38
- cobweb_launcher-3.1.25.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
39
- cobweb_launcher-3.1.25.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
40
- cobweb_launcher-3.1.25.dist-info/RECORD,,
37
+ cobweb_launcher-3.1.27.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
38
+ cobweb_launcher-3.1.27.dist-info/METADATA,sha256=WXPF7I2nt5GA7XBQnH7qy5RVWkxfb8kHp0s9axNopCs,6051
39
+ cobweb_launcher-3.1.27.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
40
+ cobweb_launcher-3.1.27.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
41
+ cobweb_launcher-3.1.27.dist-info/RECORD,,