PyPI - cobweb-launcher - Versions diffs - 3.1.24__py3-none-any.whl → 3.1.26__py3-none-any.whl - Mend

cobweb-launcher 3.1.24__py3-none-any.whl → 3.1.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

cobweb/base/request.py CHANGED Viewed

@@ -1,88 +1,477 @@
 import random
+import logging
 import requests
-from typing import Any, Dict
+from urllib.parse import urlparse
+from typing import Any, Set, Dict, Optional
+from requests.exceptions import RequestException
+class FileTypeDetector:
+    def __init__(self):
+        self.file_signatures = {
+            # 图片格式
+            b'\x89PNG\r\n\x1a\n': 'PNG',
+            b'\xff\xd8\xff': 'JPEG',
+            b'GIF87a': 'GIF',
+            b'GIF89a': 'GIF',
+            b'RIFF': 'WEBP',  # 需要进一步检查
+            b'BM': 'BMP',
+            b'II*\x00': 'TIFF',
+            b'MM\x00*': 'TIFF',
+            b'\x00\x00\x01\x00': 'ICO',
+            b'\x00\x00\x02\x00': 'CUR',
+            # 视频格式
+            b'\x00\x00\x00\x18ftypmp4': 'MP4',
+            b'\x00\x00\x00\x20ftypM4V': 'M4V',
+            b'FLV\x01': 'FLV',
+            b'\x1aE\xdf\xa3': 'WEBM',
+            b'RIFF': 'AVI',  # 需要进一步检查
+            b'\x00\x00\x01\xba': 'MPEG',
+            b'\x00\x00\x01\xb3': 'MPEG',
+            b'OggS': 'OGV',
+            # 音频格式
+            b'ID3': 'MP3',
+            b'\xff\xfb': 'MP3',
+            b'\xff\xf3': 'MP3',
+            b'\xff\xf2': 'MP3',
+            b'fLaC': 'FLAC',
+            b'RIFF': 'WAV',  # 需要进一步检查
+            b'OggS': 'OGG',  # 需要进一步检查
+            b'ftypM4A': 'M4A',
+            b'MAC ': 'APE',
+            # 其他格式
+            b'%PDF': 'PDF',
+            b'PK\x03\x04': 'ZIP',
+            b'Rar!\x1a\x07\x00': 'RAR',
+            b'\x37\x7a\xbc\xaf\x27\x1c': '7Z',
+        }
+        # 扩展名映射
+        self.extension_map = {
+            # 图片
+            '.jpg': 'JPEG', '.jpeg': 'JPEG', '.png': 'PNG', '.gif': 'GIF',
+            '.webp': 'WEBP', '.bmp': 'BMP', '.tiff': 'TIFF', '.tif': 'TIFF',
+            '.ico': 'ICO', '.svg': 'SVG', '.heic': 'HEIC', '.avif': 'AVIF',
+            # 视频
+            '.mp4': 'MP4', '.avi': 'AVI', '.mov': 'MOV', '.wmv': 'WMV',
+            '.flv': 'FLV', '.webm': 'WEBM', '.mkv': 'MKV', '.m4v': 'M4V',
+            '.mpg': 'MPEG', '.mpeg': 'MPEG', '.3gp': '3GP', '.ogv': 'OGV',
+            '.ts': 'TS', '.mts': 'MTS', '.vob': 'VOB',
+            # 音频
+            '.mp3': 'MP3', '.wav': 'WAV', '.flac': 'FLAC', '.aac': 'AAC',
+            '.ogg': 'OGG', '.wma': 'WMA', '.m4a': 'M4A', '.ape': 'APE',
+            '.opus': 'OPUS', '.aiff': 'AIFF', '.au': 'AU',
+        }
+        # MIME类型映射
+        self.mime_type_map = {
+            # 图片
+            'image/jpeg': 'JPEG', 'image/png': 'PNG', 'image/gif': 'GIF',
+            'image/webp': 'WEBP', 'image/bmp': 'BMP', 'image/tiff': 'TIFF',
+            'image/svg+xml': 'SVG', 'image/x-icon': 'ICO',
+            # 视频
+            'video/mp4': 'MP4', 'video/avi': 'AVI', 'video/quicktime': 'MOV',
+            'video/x-msvideo': 'AVI', 'video/webm': 'WEBM', 'video/x-flv': 'FLV',
+            'video/3gpp': '3GP', 'video/ogg': 'OGV',
+            # 音频
+            'audio/mpeg': 'MP3', 'audio/wav': 'WAV', 'audio/flac': 'FLAC',
+            'audio/aac': 'AAC', 'audio/ogg': 'OGG', 'audio/x-ms-wma': 'WMA',
+            'audio/mp4': 'M4A', 'audio/opus': 'OPUS',
+        }
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        })
+    def get_file_extension(self, url: str) -> str:
+        """从URL获取文件扩展名"""
+        parsed = urlparse(url)
+        path = parsed.path.lower()
+        site = parsed.netloc
+        # 移除查询参数
+        if '?' in path:
+            path = path.split('?')[0]
+        # 获取扩展名
+        if '.' in path:
+            return '.' + path.split('.')[-1], site
+        return '', site
+    def detect_by_extension(self, url: str) -> Optional[str]:
+        """通过文件扩展名检测类型"""
+        ext, site = self.get_file_extension(url)
+        return self.extension_map.get(ext)
+    def detect_by_mime_type(self, content_type: str) -> Optional[str]:
+        """通过MIME类型检测"""
+        if not content_type:
+            return None
+        # 清理content-type，移除参数
+        mime_type = content_type.split(';')[0].strip().lower()
+        return self.mime_type_map.get(mime_type)
+    def get_partial_content(self, url: str, max_bytes: int = 64) -> Optional[bytes]:
+        """获取文件的前几个字节"""
+        try:
+            headers = {'Range': f'bytes=0-{max_bytes - 1}'}
+            response = self.session.get(url, headers=headers, timeout=10)
+            if response.status_code in [200, 206]:
+                return response.content
+        except Exception as e:
+            print(f"获取内容失败: {e}")
+        return None
+    def detect_by_signature(self, data: bytes) -> Optional[str]:
+        """通过文件签名检测类型"""
+        if not data:
+            return None
+        # 检查各种文件签名
+        for signature, file_type in self.file_signatures.items():
+            if data.startswith(signature):
+                # 特殊处理需要进一步检查的格式
+                if signature == b'RIFF' and len(data) >= 12:
+                    # 检查是WEBP、AVI还是WAV
+                    if data[8:12] == b'WEBP':
+                        return 'WEBP'
+                    elif data[8:12] == b'AVI ':
+                        return 'AVI'
+                    elif data[8:12] == b'WAVE':
+                        return 'WAV'
+                elif signature == b'OggS' and len(data) >= 32:
+                    # 检查是OGG音频还是OGV视频
+                    if b'vorbis' in data[:64].lower():
+                        return 'OGG'
+                    elif b'theora' in data[:64].lower():
+                        return 'OGV'
+                    else:
+                        return 'OGG'
+                else:
+                    return file_type
+        # 检查MP4相关格式
+        if len(data) >= 12 and data[4:8] == b'ftyp':
+            brand = data[8:12]
+            if brand in [b'mp41', b'mp42', b'isom', b'avc1']:
+                return 'MP4'
+            elif brand == b'M4A ':
+                return 'M4A'
+            elif brand == b'M4V ':
+                return 'M4V'
+            elif brand == b'qt  ':
+                return 'MOV'
+        return None
+    def get_detailed_info(self, url, content_type, data) -> Dict:
+        """获取详细的文件信息"""
+        result = {
+            'url': url,
+            'site': None,
+            'detected_type': None,
+            'confidence': 'unknown',
+            'methods_used': [],
+            'content_type': content_type,
+            'extension': None
+        }
+        # 1. 先尝试HEAD请求获取HTTP头信息
+        try:
+            result['content_type'] = content_type
+            # result['file_size'] = content_length
+            # 通过MIME类型检测
+            mime_detected = self.detect_by_mime_type(content_type)
+            if mime_detected:
+                result['detected_type'] = mime_detected
+                result['confidence'] = 'high'
+                result['methods_used'].append('mime_type')
+        except Exception as e:
+            print(f"HEAD请求失败: {e}")
+        # 2. 通过扩展名检测
+        ext_detected = self.detect_by_extension(url)
+        result['extension'], result['site'] = self.get_file_extension(url)
+        if ext_detected:
+            if not result['detected_type']:
+                result['detected_type'] = ext_detected
+                result['confidence'] = 'medium'
+            elif result['detected_type'] == ext_detected:
+                result['confidence'] = 'very_high'  # MIME和扩展名一致
+            result['methods_used'].append('extension')
+        # 3. 如果前两种方法不确定，使用文件签名检测
+        if result['confidence'] in ['unknown', 'medium']:
+            signature_detected = self.detect_by_signature(data)
+            if signature_detected:
+                if not result['detected_type']:
+                    result['detected_type'] = signature_detected
+                    result['confidence'] = 'high'
+                elif result['detected_type'] == signature_detected:
+                    result['confidence'] = 'very_high'
+                else:
+                    # 冲突时，优先相信文件签名
+                    result['detected_type'] = signature_detected
+                    result['confidence'] = 'high'
+                result['methods_used'].append('file_signature')
+        return result
+    def get_file_category(self, file_type: str) -> str:
+        """获取文件类别"""
+        if not file_type or file_type == 'Unknown':
+            return 'Unknown'
+        image_types = {'PNG', 'JPEG', 'GIF', 'WEBP', 'BMP', 'TIFF', 'ICO', 'SVG', 'HEIC', 'AVIF'}
+        video_types = {'MP4', 'AVI', 'MOV', 'WMV', 'FLV', 'WEBM', 'MKV', 'M4V', 'MPEG', '3GP', 'OGV', 'TS', 'MTS',
+                       'VOB'}
+        audio_types = {'MP3', 'WAV', 'FLAC', 'AAC', 'OGG', 'WMA', 'M4A', 'APE', 'OPUS', 'AIFF', 'AU'}
+        if file_type in image_types:
+            return 'Image'
+        elif file_type in video_types:
+            return 'Video'
+        elif file_type in audio_types:
+            return 'Audio'
+        else:
+            return 'Other'
 class Request:
     """
-    请求类，用于封装 HTTP 请求并提供相关功能。
+    HTTP 请求封装类，提供统一的请求接口和相关功能。
+    Features:
+    - 自动 User-Agent 生成
+    - 灵活的请求参数配置
+    - 文件类型检测
+    - 错误处理和状态码检查
     """
-    __REQUEST_ATTRS__ = {
-        "params",
-        "headers",
-        "cookies",
-        "data",
-        "json",
-        "files",
-        "auth",
-        "timeout",
-        "proxies",
-        "hooks",
-        "stream",
-        "verify",
-        "cert",
-        "allow_redirects",
+    # 支持的 requests 库参数
+    _REQUEST_ATTRS: Set[str] = frozenset({
+        "params", "headers", "cookies", "data", "json", "files",
+        "auth", "timeout", "proxies", "hooks", "stream", "verify",
+        "cert", "allow_redirects"
+    })
+    # 默认超时时间
+    _DEFAULT_TIMEOUT = 30
+    # User-Agent 模板和版本范围
+    _UA_TEMPLATE = (
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) "
+        "AppleWebKit/{v4}.{v3} (KHTML, like Gecko) "
+        "Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}"
+    )
+    _UA_VERSION_RANGES = {
+        'v1': (4, 15), 'v2': (3, 11), 'v3': (1, 16),
+        'v4': (533, 605), 'v5': (1000, 6000), 'v6': (10, 80)
     }
     def __init__(
-        self,
-        url: str,
-        seed: Any,
-        random_ua: bool = True,
-        check_status_code: bool = True,
-        **kwargs,
+            self,
+            url: str,
+            seed: Any = None,
+            method: Optional[str] = None,
+            random_ua: bool = True,
+            check_status_code: bool = True,
+            **kwargs
     ):
         """
         初始化请求对象。
-        :param url: 请求的 URL。
-        :param seed: 种子对象或标识符。
-        :param random_ua: 是否使用随机 User-Agent，默认为 True。
-        :param check_status_code: 是否检查响应状态码，默认为 True。
-        :param kwargs: 其他扩展参数。
+        Args:
+            url: 请求的 URL
+            seed: 种子对象或标识符
+            method: HTTP 方法，如果不指定则自动推断
+            random_ua: 是否使用随机 User-Agent
+            check_status_code: 是否检查响应状态码
+            **kwargs: 其他请求参数
+        Raises:
+            ValueError: 当 URL 格式无效时
         """
+        self.scheme = None
+        self.netloc = None
+        self._validate_url(url)
         self.url = url
         self.seed = seed
         self.check_status_code = check_status_code
-        self.request_setting: Dict[str, Any] = {}
+        self.request_settings: Dict[str, Any] = {}
+        # 分离请求参数和实例属性
+        self._process_kwargs(kwargs)
+        self.method = self._determine_method(method)
+        # 设置默认超时
+        if 'timeout' not in self.request_settings:
+            self.request_settings['timeout'] = self._DEFAULT_TIMEOUT
+        # 构建请求头
+        if random_ua:
+            self._setup_headers()
+    def _validate_url(self, url: str) -> None:
+        """验证 URL 格式"""
+        try:
+            result = urlparse(url)
+            self.scheme = result.scheme
+            self.netloc = result.netloc
+            if not all([self.scheme, self.netloc]):
+                raise ValueError(f"无效的 URL 格式: {url}")
+        except Exception as e:
+            raise ValueError(f"URL 解析失败: {e}")
+    def _process_kwargs(self, kwargs: Dict[str, Any]) -> None:
+        """处理关键字参数，分离请求参数和实例属性"""
         for key, value in kwargs.items():
-            if key in self.__class__.__REQUEST_ATTRS__:
-                self.request_setting[key] = value
+            if key in self._REQUEST_ATTRS:
+                self.request_settings[key] = value
             else:
                 setattr(self, key, value)
-        self.method = getattr(self, "method", None) or (
-            "POST" if self.request_setting.get("data") or self.request_setting.get("json") else "GET"
+    def _determine_method(self, method: Optional[str]) -> str:
+        if method:
+            return method.upper()
+        has_body = bool(
+            self.request_settings.get("data") or
+            self.request_settings.get("json") or
+            self.request_settings.get("files")
         )
+        return "POST" if has_body else "GET"
-        if random_ua:
-            self._build_header()
+    def _generate_random_ua(self) -> str:
+        """生成随机 User-Agent"""
+        versions = {
+            key: random.randint(*range_tuple)
+            for key, range_tuple in self._UA_VERSION_RANGES.items()
+        }
+        return self._UA_TEMPLATE.format(**versions)
-    @property
-    def _random_ua(self) -> str:
-        v1 = random.randint(4, 15)
-        v2 = random.randint(3, 11)
-        v3 = random.randint(1, 16)
-        v4 = random.randint(533, 605)
-        v5 = random.randint(1000, 6000)
-        v6 = random.randint(10, 80)
-        return (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) "
-                f"AppleWebKit/{v4}.{v3} (KHTML, like Gecko) "
-                f"Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
-    def _build_header(self):
-        headers = self.request_setting.setdefault("headers", {})
-        if not headers.get("user-agent"):
-            headers["user-agent"] = self._random_ua
+    def _setup_headers(self) -> None:
+        """设置请求头，包括随机 User-Agent"""
+        headers = self.request_settings.setdefault("headers", {})
+        # 使用小写键名进行检查，保持一致性
+        ua_keys = ['user-agent', 'User-Agent']
+        if not any(headers.get(key) for key in ua_keys):
+            headers["User-Agent"] = self._generate_random_ua()
+    def execute(self) -> requests.Response:
+        """
+        执行 HTTP 请求。
+        Returns:
+            requests.Response: 响应对象
+        Raises:
+            RequestException: 请求执行失败
+            requests.HTTPError: HTTP 状态码错误（当 check_status_code=True 时）
+        """
+        try:
+            response = requests.request(
+                method=self.method,
+                url=self.url,
+                **self.request_settings
+            )
+            if self.check_status_code:
+                response.raise_for_status()
+            return response
+        except RequestException as e:
+            logging.error(f"请求执行失败 - URL: {self.url}, 错误: {e}")
+            raise
+    # 保持向后兼容性
     def download(self) -> requests.Response:
-        response = requests.request(self.method, self.url, **self.request_setting)
-        if self.check_status_code:
-            response.raise_for_status()
-        return response
+        """下载方法，为了向后兼容性保留"""
+        return self.execute()
+    def detect_file_type(self) -> Dict[str, Any]:
+        """
+        检测文件类型。
+        Returns:
+            Dict[str, Any]: 文件类型信息
+        Raises:
+            RequestException: 请求执行失败
+            ImportError: FileTypeDetector 未找到
+        """
+        try:
+            # 创建检测请求的配置
+            detect_settings = self.request_settings.copy()
+            # 设置 Range 头获取文件前64字节
+            headers = detect_settings.setdefault("headers", {}).copy()
+            headers['Range'] = "bytes=0-63"
+            detect_settings["headers"] = headers
+            # 移除 stream 参数避免冲突
+            detect_settings.pop('stream', None)
+            # 执行检测请求
+            response = requests.request(
+                method=self.method,
+                url=self.url,
+                **detect_settings
+            )
+            content_type = response.headers.get("Content-Type")
+            detector = FileTypeDetector()
+            return detector.get_detailed_info(
+                url=self.url,
+                content_type=content_type,
+                data=response.content
+            )
+        except RequestException as e:
+            logging.error(f"文件类型检测失败 - URL: {self.url}, 错误: {e}")
     @property
     def to_dict(self) -> Dict[str, Any]:
-        excluded_keys = {"url", "seed", "check_status_code", "request_setting"}
-        return {k: v for k, v in self.__dict__.items() if k not in excluded_keys}
+        """
+        将请求对象转换为字典格式。
+        Returns:
+            Dict[str, Any]: 包含请求信息的字典
+        """
+        excluded_keys = {"request_settings"}
+        result = {
+            key: value for key, value in self.__dict__.items()
+            if not key.startswith('_') and key not in excluded_keys
+        }
+        # 添加请求设置信息
+        result['request_settings'] = self.request_settings.copy()
+        return result
+    def __repr__(self) -> str:
+        """返回对象的字符串表示"""
+        return f"Request(method='{self.method}', url='{self.url}')"
+    def __str__(self) -> str:
+        """返回对象的可读字符串表示"""
+        return f"{self.method} {self.url}"

cobweb/base/task_queue.py CHANGED Viewed

@@ -10,8 +10,8 @@ class Status(Enum):
     PENDING = 0     # 待处理
     PROCESSING = 1  # 处理中
     FINISHED = 2    # 已完成
-    INSERT = 3      # 失败
-    UPLOAD = 4     # 过期
+    INSERT = 3      # 新增
+    UPLOAD = 4     # 上传
 @dataclass
@@ -138,9 +138,10 @@ class TaskQueue:
             if data:
                 task_item.data = data
-            for tid in task_item.children_ids:
-                if self._tasks[tid].status == Status.INSERT:
-                    del self._tasks[tid]
+            if task_item.status != Status.FINISHED:
+                for tid in task_item.children_ids:
+                    if self._tasks[tid].status == Status.INSERT:
+                        del self._tasks[tid]
             task_item.children_ids = []
             self._tasks[task_id] = task_item
@@ -176,4 +177,4 @@ class TaskQueue:
     #                 expired_ids.append(seed_id)
     #         for seed_id in expired_ids:
     #             self._seeds[seed_id] = self._seeds[seed_id]._replace(status=SeedStatus.EXPIRED)
-    #         print(f"清理了 {len(expired_ids)} 个过期种子")
+    #         print(f"清理了 {len(expired_ids)} 个过期种子")

cobweb/base/test.py ADDED Viewed

@@ -0,0 +1,257 @@
+import requests
+from urllib.parse import urlparse
+from typing import Dict, Optional
+class FileTypeDetector:
+    def __init__(self):
+        self.file_signatures = {
+            # 图片格式
+            b'\x89PNG\r\n\x1a\n': 'PNG',
+            b'\xff\xd8\xff': 'JPEG',
+            b'GIF87a': 'GIF',
+            b'GIF89a': 'GIF',
+            b'RIFF': 'WEBP',  # 需要进一步检查
+            b'BM': 'BMP',
+            b'II*\x00': 'TIFF',
+            b'MM\x00*': 'TIFF',
+            b'\x00\x00\x01\x00': 'ICO',
+            b'\x00\x00\x02\x00': 'CUR',
+            # 视频格式
+            b'\x00\x00\x00\x18ftypmp4': 'MP4',
+            b'\x00\x00\x00\x20ftypM4V': 'M4V',
+            b'FLV\x01': 'FLV',
+            b'\x1aE\xdf\xa3': 'WEBM',
+            b'RIFF': 'AVI',  # 需要进一步检查
+            b'\x00\x00\x01\xba': 'MPEG',
+            b'\x00\x00\x01\xb3': 'MPEG',
+            b'OggS': 'OGV',
+            # 音频格式
+            b'ID3': 'MP3',
+            b'\xff\xfb': 'MP3',
+            b'\xff\xf3': 'MP3',
+            b'\xff\xf2': 'MP3',
+            b'fLaC': 'FLAC',
+            b'RIFF': 'WAV',  # 需要进一步检查
+            b'OggS': 'OGG',  # 需要进一步检查
+            b'ftypM4A': 'M4A',
+            b'MAC ': 'APE',
+            # 其他格式
+            b'%PDF': 'PDF',
+            b'PK\x03\x04': 'ZIP',
+            b'Rar!\x1a\x07\x00': 'RAR',
+            b'\x37\x7a\xbc\xaf\x27\x1c': '7Z',
+        }
+        # 扩展名映射
+        self.extension_map = {
+            # 图片
+            '.jpg': 'JPEG', '.jpeg': 'JPEG', '.png': 'PNG', '.gif': 'GIF',
+            '.webp': 'WEBP', '.bmp': 'BMP', '.tiff': 'TIFF', '.tif': 'TIFF',
+            '.ico': 'ICO', '.svg': 'SVG', '.heic': 'HEIC', '.avif': 'AVIF',
+            # 视频
+            '.mp4': 'MP4', '.avi': 'AVI', '.mov': 'MOV', '.wmv': 'WMV',
+            '.flv': 'FLV', '.webm': 'WEBM', '.mkv': 'MKV', '.m4v': 'M4V',
+            '.mpg': 'MPEG', '.mpeg': 'MPEG', '.3gp': '3GP', '.ogv': 'OGV',
+            '.ts': 'TS', '.mts': 'MTS', '.vob': 'VOB',
+            # 音频
+            '.mp3': 'MP3', '.wav': 'WAV', '.flac': 'FLAC', '.aac': 'AAC',
+            '.ogg': 'OGG', '.wma': 'WMA', '.m4a': 'M4A', '.ape': 'APE',
+            '.opus': 'OPUS', '.aiff': 'AIFF', '.au': 'AU',
+        }
+        # MIME类型映射
+        self.mime_type_map = {
+            # 图片
+            'image/jpeg': 'JPEG', 'image/png': 'PNG', 'image/gif': 'GIF',
+            'image/webp': 'WEBP', 'image/bmp': 'BMP', 'image/tiff': 'TIFF',
+            'image/svg+xml': 'SVG', 'image/x-icon': 'ICO',
+            # 视频
+            'video/mp4': 'MP4', 'video/avi': 'AVI', 'video/quicktime': 'MOV',
+            'video/x-msvideo': 'AVI', 'video/webm': 'WEBM', 'video/x-flv': 'FLV',
+            'video/3gpp': '3GP', 'video/ogg': 'OGV',
+            # 音频
+            'audio/mpeg': 'MP3', 'audio/wav': 'WAV', 'audio/flac': 'FLAC',
+            'audio/aac': 'AAC', 'audio/ogg': 'OGG', 'audio/x-ms-wma': 'WMA',
+            'audio/mp4': 'M4A', 'audio/opus': 'OPUS',
+        }
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        })
+    def get_file_extension(self, url: str) -> str:
+        """从URL获取文件扩展名"""
+        parsed = urlparse(url)
+        path = parsed.path.lower()
+        site = parsed.netloc
+        # 移除查询参数
+        if '?' in path:
+            path = path.split('?')[0]
+        # 获取扩展名
+        if '.' in path:
+            return '.' + path.split('.')[-1], site
+        return '', site
+    def detect_by_extension(self, url: str) -> Optional[str]:
+        """通过文件扩展名检测类型"""
+        ext, site = self.get_file_extension(url)
+        return self.extension_map.get(ext)
+    def detect_by_mime_type(self, content_type: str) -> Optional[str]:
+        """通过MIME类型检测"""
+        if not content_type:
+            return None
+        # 清理content-type，移除参数
+        mime_type = content_type.split(';')[0].strip().lower()
+        return self.mime_type_map.get(mime_type)
+    def get_partial_content(self, url: str, max_bytes: int = 64) -> Optional[bytes]:
+        """获取文件的前几个字节"""
+        try:
+            headers = {'Range': f'bytes=0-{max_bytes - 1}'}
+            response = self.session.get(url, headers=headers, timeout=10)
+            if response.status_code in [200, 206]:
+                return response.content
+        except Exception as e:
+            print(f"获取内容失败: {e}")
+        return None
+    def detect_by_signature(self, data: bytes) -> Optional[str]:
+        """通过文件签名检测类型"""
+        if not data:
+            return None
+        # 检查各种文件签名
+        for signature, file_type in self.file_signatures.items():
+            if data.startswith(signature):
+                # 特殊处理需要进一步检查的格式
+                if signature == b'RIFF' and len(data) >= 12:
+                    # 检查是WEBP、AVI还是WAV
+                    if data[8:12] == b'WEBP':
+                        return 'WEBP'
+                    elif data[8:12] == b'AVI ':
+                        return 'AVI'
+                    elif data[8:12] == b'WAVE':
+                        return 'WAV'
+                elif signature == b'OggS' and len(data) >= 32:
+                    # 检查是OGG音频还是OGV视频
+                    if b'vorbis' in data[:64].lower():
+                        return 'OGG'
+                    elif b'theora' in data[:64].lower():
+                        return 'OGV'
+                    else:
+                        return 'OGG'
+                else:
+                    return file_type
+        # 检查MP4相关格式
+        if len(data) >= 12 and data[4:8] == b'ftyp':
+            brand = data[8:12]
+            if brand in [b'mp41', b'mp42', b'isom', b'avc1']:
+                return 'MP4'
+            elif brand == b'M4A ':
+                return 'M4A'
+            elif brand == b'M4V ':
+                return 'M4V'
+            elif brand == b'qt  ':
+                return 'MOV'
+        return None
+    def get_detailed_info(self, url, content_type, data) -> Dict:
+        """获取详细的文件信息"""
+        result = {
+            'url': url,
+            'site': None,
+            'detected_type': None,
+            'confidence': 'unknown',
+            'methods_used': [],
+            'content_type': content_type,
+            'extension': None
+        }
+        # 1. 先尝试HEAD请求获取HTTP头信息
+        try:
+            result['content_type'] = content_type
+            # result['file_size'] = content_length
+            # 通过MIME类型检测
+            mime_detected = self.detect_by_mime_type(content_type)
+            if mime_detected:
+                result['detected_type'] = mime_detected
+                result['confidence'] = 'high'
+                result['methods_used'].append('mime_type')
+        except Exception as e:
+            print(f"HEAD请求失败: {e}")
+        # 2. 通过扩展名检测
+        ext_detected = self.detect_by_extension(url)
+        result['extension'], result['site'] = self.get_file_extension(url)
+        if ext_detected:
+            if not result['detected_type']:
+                result['detected_type'] = ext_detected
+                result['confidence'] = 'medium'
+            elif result['detected_type'] == ext_detected:
+                result['confidence'] = 'very_high'  # MIME和扩展名一致
+            result['methods_used'].append('extension')
+        # 3. 如果前两种方法不确定，使用文件签名检测
+        if result['confidence'] in ['unknown', 'medium']:
+            signature_detected = self.detect_by_signature(data)
+            if signature_detected:
+                if not result['detected_type']:
+                    result['detected_type'] = signature_detected
+                    result['confidence'] = 'high'
+                elif result['detected_type'] == signature_detected:
+                    result['confidence'] = 'very_high'
+                else:
+                    # 冲突时，优先相信文件签名
+                    result['detected_type'] = signature_detected
+                    result['confidence'] = 'high'
+                result['methods_used'].append('file_signature')
+        return result
+    def detect_file_type(self, url: str) -> str:
+        """简单的文件类型检测，返回类型字符串"""
+        info = self.get_detailed_info(url)
+        return info.get('detected_type', 'Unknown')
+    def get_file_category(self, file_type: str) -> str:
+        """获取文件类别"""
+        if not file_type or file_type == 'Unknown':
+            return 'Unknown'
+        image_types = {'PNG', 'JPEG', 'GIF', 'WEBP', 'BMP', 'TIFF', 'ICO', 'SVG', 'HEIC', 'AVIF'}
+        video_types = {'MP4', 'AVI', 'MOV', 'WMV', 'FLV', 'WEBM', 'MKV', 'M4V', 'MPEG', '3GP', 'OGV', 'TS', 'MTS',
+                       'VOB'}
+        audio_types = {'MP3', 'WAV', 'FLAC', 'AAC', 'OGG', 'WMA', 'M4A', 'APE', 'OPUS', 'AIFF', 'AU'}
+        if file_type in image_types:
+            return 'Image'
+        elif file_type in video_types:
+            return 'Video'
+        elif file_type in audio_types:
+            return 'Audio'
+        else:
+            return 'Other'
+# if __name__ == "__main__":
+#     detector = FileTypeDetector()
+#     result = detector.get_detailed_info("https://cdn.pixabay.com/user/2024/12/10/12-18-33-812_96x96.jpeg")
+#     print(result)

cobweb/constant.py CHANGED Viewed

@@ -51,7 +51,7 @@ class LogTemplate:
     launcher_pro_polling = """
 ----------------------- start - 轮训日志: {task} -----------------
         内存队列
-            种子数:  {doing_len}
+            消费中:  {doing_len}
             待消费:  {todo_len}
             已消费:  {done_len}
         redis队列

{cobweb_launcher-3.1.24.dist-info → cobweb_launcher-3.1.26.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cobweb-launcher
-Version: 3.1.24
+Version: 3.1.26
 Summary: spider_hole
 Home-page: https://github.com/Juannie-PP/cobweb
 Author: Juannie-PP
@@ -177,12 +177,16 @@ app.start()
 > &nbsp;&nbsp;&nbsp;&nbsp;upload_item["text"] = item.response.text
 > &nbsp;&nbsp;&nbsp;&nbsp;yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
 ## todo
-- 队列优化完善，使用queue的机制wait()同步各模块执行？
-- 日志功能完善，单机模式调度和保存数据写入文件，结构化输出各任务日志
-- 去重过滤（布隆过滤器等）
-- 单机防丢失
+- [ ] 队列优化完善，使用queue的机制wait()同步各模块执行？
+- [x] 日志功能完善，单机模式调度和保存数据写入文件，结构化输出各任务日志
+- [ ] 去重过滤（布隆过滤器等）
+- [ ] 请求检验
+- [ ] 异常回调
+- [ ] 失败回调
 > 未更新流程图！！！
 ![img.png](https://image-luyuan.oss-cn-hangzhou.aliyuncs.com/image/D2388CDC-B9E5-4CE4-9F2C-7D173763B6A8.png)

{cobweb_launcher-3.1.24.dist-info → cobweb_launcher-3.1.26.dist-info}/RECORD RENAMED Viewed

@@ -1,14 +1,15 @@
 cobweb/__init__.py,sha256=YdBi3uytEFRXan155xU1kKMpiUKUupO2RGeJyXmH0zk,129
-cobweb/constant.py,sha256=zy3XYsc1qp2B76_Fn_hVQ8eGHlPBd3OFlZK2cryE6FY,2839
+cobweb/constant.py,sha256=s6W4Fz3DhH-4RutoWnR2bylL8eU44rc-CeOsovj87I0,2839
 cobweb/setting.py,sha256=rHPQfc4a1xMTbkt3_KXBfUomhYcbTXogsz7ew-QsqHw,1670
 cobweb/base/__init__.py,sha256=NanSxJr0WsqjqCNOQAlxlkt-vQEsERHYBzacFC057oI,222
 cobweb/base/common_queue.py,sha256=hYdaM70KrWjvACuLKaGhkI2VqFCnd87NVvWzmnfIg8Q,1423
 cobweb/base/item.py,sha256=1bS4U_3vzI2jzSSeoEbLoLT_5CfgLPopWiEYtaahbvw,1674
 cobweb/base/logger.py,sha256=Vsg1bD4LXW91VgY-ANsmaUu-mD88hU_WS83f7jX3qF8,2011
-cobweb/base/request.py,sha256=MBYYjWpbRQRulPG0zPbK0DO3LKmScqQ4tBzFXekYkao,2652
+cobweb/base/request.py,sha256=hBbjGfvmOA-GknfH32BwasUJfHxcDOZlmxh2jH_CJYg,16258
 cobweb/base/response.py,sha256=g8e5H0hEiRfqseh3nD7t6a1rhIJYRMV7nI47kqNOd-U,446
 cobweb/base/seed.py,sha256=ddaWCq_KaWwpmPl1CToJlfCxEEnoJ16kjo6azJs9uls,5000
-cobweb/base/task_queue.py,sha256=3ScPKnjlPEuuCzWyG9D2iHiND3L9lLM7fo1LNOkw8CY,6337
+cobweb/base/task_queue.py,sha256=2MqGpHGNmK5B-kqv7z420RWyihzB9zgDHJUiLsmtzOI,6402
+cobweb/base/test.py,sha256=N8MDGb94KQeI4pC5rCc2QdohE9_5AgcOyGqKjbMsOEs,9588
 cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
 cobweb/crawlers/crawler.py,sha256=ZZVZJ17RWuvzUFGLjqdvyVZpmuq-ynslJwXQzdm_UdQ,709
 cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
@@ -33,8 +34,8 @@ cobweb/utils/decorators.py,sha256=ZwVQlz-lYHgXgKf9KRCp15EWPzTDdhoikYUNUCIqNeM,11
 cobweb/utils/dotting.py,sha256=L-jGSApdnFIP4jUWH6p5qIme0aJ1vyDrxAx8wOJWvcs,1960
 cobweb/utils/oss.py,sha256=wmToIIVNO8nCQVRmreVaZejk01aCWS35e1NV6cr0yGI,4192
 cobweb/utils/tools.py,sha256=14TCedqt07m4z6bCnFAsITOFixeGr8V3aOKk--L7Cr0,879
-cobweb_launcher-3.1.24.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
-cobweb_launcher-3.1.24.dist-info/METADATA,sha256=THQpa-rStFvhQNEMWq392PlRrPH_hxDDf1_-c5lFIGA,5998
-cobweb_launcher-3.1.24.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
-cobweb_launcher-3.1.24.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
-cobweb_launcher-3.1.24.dist-info/RECORD,,
+cobweb_launcher-3.1.26.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
+cobweb_launcher-3.1.26.dist-info/METADATA,sha256=BVtA2LQ0U9wc7egAKrP4Enf_YEC391eZd-GtkgY_HVA,6051
+cobweb_launcher-3.1.26.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
+cobweb_launcher-3.1.26.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
+cobweb_launcher-3.1.26.dist-info/RECORD,,

{cobweb_launcher-3.1.24.dist-info → cobweb_launcher-3.1.26.dist-info}/LICENSE RENAMED Viewed

File without changes

{cobweb_launcher-3.1.24.dist-info → cobweb_launcher-3.1.26.dist-info}/WHEEL RENAMED Viewed

File without changes

{cobweb_launcher-3.1.24.dist-info → cobweb_launcher-3.1.26.dist-info}/top_level.txt RENAMED Viewed

File without changes

cobweb-launcher 3.1.24__py3-none-any.whl → 3.1.26__py3-none-any.whl

cobweb-launcher 3.1.24__py3-none-any.whl → 3.1.26__py3-none-any.whl