crawlo 1.4.3-py3-none-any.whl → 1.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (107)
  1. crawlo/__init__.py +11 -15
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/genspider.py +52 -17
  4. crawlo/commands/startproject.py +24 -0
  5. crawlo/core/engine.py +2 -2
  6. crawlo/core/scheduler.py +4 -4
  7. crawlo/crawler.py +13 -6
  8. crawlo/downloader/__init__.py +5 -2
  9. crawlo/extension/__init__.py +2 -2
  10. crawlo/filters/aioredis_filter.py +8 -1
  11. crawlo/filters/memory_filter.py +8 -1
  12. crawlo/initialization/built_in.py +13 -4
  13. crawlo/initialization/core.py +5 -4
  14. crawlo/interfaces.py +24 -0
  15. crawlo/middleware/__init__.py +7 -4
  16. crawlo/middleware/middleware_manager.py +15 -8
  17. crawlo/mode_manager.py +45 -11
  18. crawlo/network/response.py +374 -69
  19. crawlo/pipelines/mysql_pipeline.py +6 -6
  20. crawlo/pipelines/pipeline_manager.py +2 -2
  21. crawlo/project.py +2 -4
  22. crawlo/queue/pqueue.py +2 -6
  23. crawlo/queue/queue_manager.py +1 -2
  24. crawlo/settings/default_settings.py +15 -30
  25. crawlo/task_manager.py +2 -2
  26. crawlo/templates/project/items.py.tmpl +2 -2
  27. crawlo/templates/project/middlewares.py.tmpl +9 -89
  28. crawlo/templates/project/pipelines.py.tmpl +8 -68
  29. crawlo/templates/project/settings.py.tmpl +51 -65
  30. crawlo/templates/project/settings_distributed.py.tmpl +59 -67
  31. crawlo/templates/project/settings_gentle.py.tmpl +45 -40
  32. crawlo/templates/project/settings_high_performance.py.tmpl +45 -40
  33. crawlo/templates/project/settings_minimal.py.tmpl +37 -26
  34. crawlo/templates/project/settings_simple.py.tmpl +45 -40
  35. crawlo/templates/run.py.tmpl +3 -7
  36. crawlo/tools/__init__.py +0 -11
  37. crawlo/utils/__init__.py +17 -1
  38. crawlo/utils/db_helper.py +220 -319
  39. crawlo/utils/error_handler.py +313 -67
  40. crawlo/utils/fingerprint.py +3 -4
  41. crawlo/utils/misc.py +82 -0
  42. crawlo/utils/request.py +55 -66
  43. crawlo/utils/selector_helper.py +138 -0
  44. crawlo/utils/spider_loader.py +185 -45
  45. crawlo/utils/text_helper.py +95 -0
  46. crawlo-1.4.5.dist-info/METADATA +329 -0
  47. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/RECORD +89 -68
  48. tests/bug_check_test.py +251 -0
  49. tests/direct_selector_helper_test.py +97 -0
  50. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  51. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  52. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  53. tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  54. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  55. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  56. tests/ofweek_scrapy/scrapy.cfg +11 -0
  57. tests/performance_comparison.py +4 -5
  58. tests/simple_crawlo_test.py +1 -2
  59. tests/simple_follow_test.py +39 -0
  60. tests/simple_response_selector_test.py +95 -0
  61. tests/simple_selector_helper_test.py +155 -0
  62. tests/simple_selector_test.py +208 -0
  63. tests/simple_url_test.py +74 -0
  64. tests/test_crawler_process_import.py +39 -0
  65. tests/test_crawler_process_spider_modules.py +48 -0
  66. tests/test_edge_cases.py +7 -5
  67. tests/test_encoding_core.py +57 -0
  68. tests/test_encoding_detection.py +127 -0
  69. tests/test_factory_compatibility.py +197 -0
  70. tests/test_multi_directory.py +68 -0
  71. tests/test_multiple_spider_modules.py +81 -0
  72. tests/test_optimized_selector_naming.py +101 -0
  73. tests/test_priority_behavior.py +18 -18
  74. tests/test_response_follow.py +105 -0
  75. tests/test_response_selector_methods.py +93 -0
  76. tests/test_response_url_methods.py +71 -0
  77. tests/test_response_urljoin.py +87 -0
  78. tests/test_scrapy_style_encoding.py +113 -0
  79. tests/test_selector_helper.py +101 -0
  80. tests/test_selector_optimizations.py +147 -0
  81. tests/test_spider_loader.py +50 -0
  82. tests/test_spider_loader_comprehensive.py +70 -0
  83. tests/test_spider_modules.py +85 -0
  84. tests/test_spiders/__init__.py +1 -0
  85. tests/test_spiders/test_spider.py +10 -0
  86. crawlo/tools/anti_crawler.py +0 -269
  87. crawlo/utils/class_loader.py +0 -26
  88. crawlo/utils/enhanced_error_handler.py +0 -357
  89. crawlo-1.4.3.dist-info/METADATA +0 -190
  90. examples/test_project/__init__.py +0 -7
  91. examples/test_project/run.py +0 -35
  92. examples/test_project/test_project/__init__.py +0 -4
  93. examples/test_project/test_project/items.py +0 -18
  94. examples/test_project/test_project/middlewares.py +0 -119
  95. examples/test_project/test_project/pipelines.py +0 -97
  96. examples/test_project/test_project/settings.py +0 -170
  97. examples/test_project/test_project/spiders/__init__.py +0 -10
  98. examples/test_project/test_project/spiders/of_week_dis.py +0 -144
  99. tests/simple_log_test.py +0 -58
  100. tests/simple_test.py +0 -48
  101. tests/test_framework_logger.py +0 -67
  102. tests/test_framework_startup.py +0 -65
  103. tests/test_mode_change.py +0 -73
  104. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/WHEEL +0 -0
  105. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/entry_points.txt +0 -0
  106. {crawlo-1.4.3.dist-info → crawlo-1.4.5.dist-info}/top_level.txt +0 -0
  107. /tests/{final_command_test_report.md → ofweek_scrapy/ofweek_scrapy/__init__.py} +0 -0
crawlo/utils/request.py CHANGED
@@ -79,52 +79,63 @@ def request_fingerprint(
     :param include_headers: header names (str or bytes) to include in the fingerprint calculation
     :return: request fingerprint (hex string)
     """
-    hash_func = hashlib.sha256()
-
-    # Basic fields
-    hash_func.update(to_bytes(request.method))
-    hash_func.update(to_bytes(canonicalize_url(request.url)))
-    hash_func.update(request.body or b'')
-
+    from crawlo.utils.fingerprint import FingerprintGenerator
+
+    # Prepare the request data
+    method = request.method
+    url = request.url
+    body = request.body or b''
+    headers = None
+
     # Handle headers
-    if include_headers:
-        headers = request.headers  # assume headers is a dict- or MultiDict-like structure
+    if include_headers and hasattr(request, 'headers'):
+        headers = {}
         for header_name in include_headers:
-            name_bytes = to_bytes(header_name).lower()  # normalize to lowercase for matching
-            value = b''
+            name_str = str(header_name).lower()  # normalize to lowercase for matching
+            value = ''

             # Support different header access styles (e.g. MultiDict or dict)
-            if hasattr(headers, 'get_all'):
+            if hasattr(request.headers, 'get_all'):
                 # e.g. the get_all method of scrapy.http.Headers
-                values = headers.get_all(name_bytes)
-                value = b';'.join(values) if values else b''
-            elif hasattr(headers, '__getitem__'):
+                values = request.headers.get_all(name_str)
+                value = ';'.join(str(v) for v in values) if values else ''
+            elif hasattr(request.headers, '__getitem__'):
                 # e.g. a plain dict
                 try:
-                    raw_value = headers[name_bytes]
+                    raw_value = request.headers[name_str]
                     if isinstance(raw_value, list):
-                        value = b';'.join(to_bytes(v) for v in raw_value)
+                        value = ';'.join(str(v) for v in raw_value)
                     else:
-                        value = to_bytes(raw_value)
+                        value = str(raw_value)
                 except (KeyError, TypeError):
-                    value = b''
+                    value = ''
             else:
-                value = b''
-
-            hash_func.update(name_bytes + b':' + value)
-
-    return hash_func.hexdigest()
+                value = ''
+
+            headers[name_str] = value
+
+    # Use the unified fingerprint generator
+    return FingerprintGenerator.request_fingerprint(method, url, body, headers)


 def set_request(request: Request, priority: int) -> None:
+    """
+    Set the depth and priority of a request.
+
+    :param request: the Request object
+    :param priority: priority value
+    """
+    # Increase the request depth
     request.meta['depth'] = request.meta.setdefault('depth', 0) + 1
+
+    # Adjust priority by depth: the deeper the request, the lower its priority
     if priority:
         request.priority -= request.meta['depth'] * priority


 def request_to_dict(request: Request, spider=None) -> Dict[str, Any]:
     """
-    Convert a Request object into a JSON-serializable dict.
+    Convert a Request object into a JSON-serializable dict, used to serialize requests for distributed crawling.

     Args:
         request: the Request object to serialize
@@ -146,28 +157,22 @@ def request_to_dict(request: Request, spider=None) -> Dict[str, Any]:

     # 1. Handle the callback
     # Functions cannot be serialized directly, so store their import path
-    if callable(request.callback):
+    if callable(getattr(request, 'callback', None)):
         d['_callback'] = _get_function_path(request.callback)

     # 2. Handle the errback
-    if callable(request.errback):
-        d['_errback'] = _get_function_path(request.errback)
+    if callable(getattr(request, 'err_back', None)):
+        d['_errback'] = _get_function_path(request.err_back)

     # 3. Record the original class name so deserialization creates the right instance
     d['_class'] = request.__class__.__module__ + '.' + request.__class__.__name__

-    # 4. Special-case FormRequest
-    # If this is a FormRequest, its formdata has to be preserved
-    if isinstance(request, Request):
-        if hasattr(request, 'formdata'):
-            d['formdata'] = request.formdata
-
     return d


 def request_from_dict(d: Dict[str, Any], spider=None) -> Request:
     """
-    Rebuild a Request object from a dict.
+    Rebuild a Request object from a dict, used to deserialize requests in distributed crawling.

     Args:
         d: the dict produced by request_to_dict
@@ -193,37 +198,21 @@ def request_from_dict(d: Dict[str, Any], spider=None) -> Request:
     errback_path = d.pop('_errback', None)
     errback = _get_function_from_path(errback_path, spider) if errback_path else None

-    # 4. Extract special fields
-    formdata = d.pop('formdata', None)
-
     # 5. Create the Request instance
-    # Note: body and formdata must not both be present
-    if formdata is not None and cls is FormRequest:
-        # For a FormRequest with formdata, prefer the formdata
-        request = FormRequest(
-            url=d['url'],
-            method=d.get('method', 'GET'),
-            headers=d.get('headers', {}),
-            formdata=formdata,
-            callback=callback,
-            errback=errback,
-            meta=d.get('meta', {}),
-            flags=d.get('flags', []),
-            cb_kwargs=d.get('cb_kwargs', {}),
-        )
-    else:
-        # A plain Request, or no formdata
-        request = cls(
-            url=d['url'],
-            method=d.get('method', 'GET'),
-            headers=d.get('headers', {}),
-            body=d.get('body'),
-            callback=callback,
-            errback=errback,
-            meta=d.get('meta', {}),
-            flags=d.get('flags', []),
-            cb_kwargs=d.get('cb_kwargs', {}),
-        )
+    request = cls(
+        url=d['url'],
+        method=d.get('method', 'GET'),
+        headers=d.get('headers', {}),
+        body=d.get('body'),
+        callback=callback,
+        meta=d.get('meta', {}),
+        flags=d.get('flags', []),
+        cb_kwargs=d.get('cb_kwargs', {}),
+    )
+
+    # Manually set the err_back attribute
+    if errback is not None:
+        request.err_back = errback

     return request

@@ -256,7 +245,7 @@ def _get_function_from_path(path: str, spider=None) -> Optional[callable]:
         func = getattr(func, attr)

     # If a spider is given and func is one of the spider's methods
-    if spider and hasattr(spider, func.__name__):
+    if spider and hasattr(func, '__name__') and hasattr(spider, func.__name__):
         spider_method = getattr(spider, func.__name__)
         if spider_method is func:
             return spider_method  # return the bound method
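
The reworked helpers above are what the distributed queue relies on: callbacks are stored as import-path strings and re-bound against the spider on deserialization, and error callbacks now travel through the err_back attribute. A minimal round-trip sketch, assuming only the names visible in this diff (the URL and meta values are invented):

    from crawlo.network.request import Request
    from crawlo.utils.request import request_to_dict, request_from_dict

    # Build a request on one worker...
    req = Request(url="https://example.com/page/1", method="GET", meta={"job": "demo"})

    # ...turn it into a plain, JSON-serializable dict (e.g. to push onto a Redis queue)...
    payload = request_to_dict(req)

    # ...and rebuild an equivalent Request on another worker.
    restored = request_from_dict(payload)
    print(restored.url, restored.method)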
crawlo/utils/selector_helper.py ADDED
@@ -0,0 +1,138 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+Selector helper module
+======================
+Helper functions for working with parsel selectors, e.g. extracting text and attributes.
+
+The module provides the following main functions:
+- extract_text: extract the text of an element list and join it
+- extract_texts: extract a list of texts from an element list
+- extract_attr: extract the attribute value of a single element from an element list
+- extract_attrs: extract the attribute values of multiple elements from an element list
+- is_xpath: decide whether a query string is XPath
+
+All functions use short, intuitive names that are easy to remember and use.
+"""
+
+from typing import List, Any, Optional
+from parsel import Selector, SelectorList
+
+
+def extract_text(elements: SelectorList, join_str: str = " ") -> str:
+    """
+    Extract text from an element list and join it.
+
+    :param elements: SelectorList of elements
+    :param join_str: separator used when joining text fragments
+    :return: the joined text
+
+    Example:
+        title_elements = selector.css('title')
+        title_text = extract_text(title_elements)
+    """
+    texts = []
+    for element in elements:
+        # Collect all text nodes of the element
+        if hasattr(element, 'xpath'):
+            element_texts = element.xpath('.//text()').getall()
+        else:
+            element_texts = [str(element)]
+        # Strip and keep only non-empty fragments
+        for text in element_texts:
+            cleaned = text.strip()
+            if cleaned:
+                texts.append(cleaned)
+    return join_str.join(texts)
+
+
+def extract_texts(elements: SelectorList, join_str: str = " ") -> List[str]:
+    """
+    Extract a list of texts from an element list.
+
+    :param elements: SelectorList of elements
+    :param join_str: separator used to join the text fragments within a single node
+    :return: list of plain texts (one entry per node)
+
+    Example:
+        li_elements = selector.css('.list li')
+        li_texts = extract_texts(li_elements)
+    """
+    result = []
+    for element in elements:
+        # Extract the text of each element
+        if hasattr(element, 'xpath'):
+            texts = element.xpath('.//text()').getall()
+        else:
+            texts = [str(element)]
+
+        # Strip the fragments and join them
+        clean_texts = [text.strip() for text in texts if text.strip()]
+        if clean_texts:
+            result.append(join_str.join(clean_texts))
+
+    return result
+
+
+def extract_attr(elements: SelectorList, attr_name: str, default: Any = None) -> Any:
+    """
+    Extract an attribute value from a single element of an element list.
+
+    :param elements: SelectorList of elements
+    :param attr_name: attribute name
+    :param default: default return value
+    :return: the attribute value, or the default
+
+    Example:
+        link_elements = selector.css('.link')
+        link_href = extract_attr(link_elements, 'href')
+    """
+    # Use parsel's attrib property to read the attribute of the first matching element
+    if hasattr(elements, 'attrib'):
+        return elements.attrib.get(attr_name, default)
+    # If elements is a SelectorList, read the attribute of its first element
+    elif len(elements) > 0 and hasattr(elements[0], 'attrib'):
+        return elements[0].attrib.get(attr_name, default)
+    return default
+
+
+def extract_attrs(elements: SelectorList, attr_name: str) -> List[Any]:
+    """
+    Extract the attribute values of multiple elements from an element list.
+
+    :param elements: SelectorList of elements
+    :param attr_name: attribute name
+    :return: list of attribute values
+
+    Example:
+        all_links = selector.css('a')
+        all_hrefs = extract_attrs(all_links, 'href')
+    """
+    result = []
+    for element in elements:
+        # Use parsel's attrib property to read the element's attribute
+        if hasattr(element, 'attrib'):
+            attr_value = element.attrib.get(attr_name)
+            if attr_value is not None:
+                result.append(attr_value)
+
+    return result
+
+
+def is_xpath(query: str) -> bool:
+    """
+    Decide whether a query string is an XPath expression.
+
+    :param query: the query string
+    :return: True if the query is XPath
+    """
+    return query.startswith(('/', '//', './'))
+
+
+__all__ = [
+    "extract_text",
+    "extract_texts",
+    "extract_attr",
+    "extract_attrs",
+    "is_xpath"
+]
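
selector_helper.py is new in 1.4.5; a short usage sketch, using only the functions shown above plus parsel's Selector (the HTML snippet is invented):

    from parsel import Selector
    from crawlo.utils.selector_helper import extract_text, extract_texts, extract_attrs, is_xpath

    html = '<ul class="list"><li><a href="/a">First</a></li><li><a href="/b">Second</a></li></ul>'
    sel = Selector(text=html)

    print(extract_text(sel.css('.list li')))          # "First Second"
    print(extract_texts(sel.css('.list li')))         # ['First', 'Second']
    print(extract_attrs(sel.css('a'), 'href'))        # ['/a', '/b']
    print(is_xpath('//ul/li'), is_xpath('.list li'))  # True False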
crawlo/utils/spider_loader.py CHANGED
@@ -1,62 +1,202 @@
 import importlib
+import traceback
+import warnings
+from collections import defaultdict
 from pathlib import Path
-from typing import List, Type, Optional, Dict
+from typing import List, Type, Dict, Any

+from crawlo.interfaces import ISpiderLoader
+from crawlo.settings.setting_manager import SettingManager
 from crawlo.spider import Spider
+from crawlo.network.request import Request
 from crawlo.utils.log import get_logger

 logger = get_logger(__name__)


-class SpiderLoader:
-    """Spider loader, responsible for discovering and loading spiders"""
-
-    def __init__(self, project_package: str):
-        self.project_package = project_package
-        self._spiders: Dict[str, Type[Spider]] = {}
-        self._load_spiders()
-
-    def _load_spiders(self):
-        """Load all spiders"""
-        spiders_dir = Path.cwd() / self.project_package / 'spiders'
-        if not spiders_dir.exists():
-            logger.warning(f"Spiders directory not found: {spiders_dir}")
-            return
-
-        for py_file in spiders_dir.glob("*.py"):
-            if py_file.name.startswith('_'):
-                continue
-
-            module_name = py_file.stem
-            spider_module_path = f"{self.project_package}.spiders.{module_name}"
-
-            try:
-                module = importlib.import_module(spider_module_path)
-            except ImportError as e:
-                logger.debug(f"Skip module {module_name}: {e}")
-                continue
-
-            # Find all Spider subclasses
-            for attr_name in dir(module):
-                attr_value = getattr(module, attr_name)
-                if (isinstance(attr_value, type) and
-                        issubclass(attr_value, Spider) and
-                        attr_value != Spider and
-                        hasattr(attr_value, 'name')):
-
-                    spider_name = getattr(attr_value, 'name')
-                    if spider_name in self._spiders:
-                        logger.warning(f"Duplicate spider name '{spider_name}' found")
-                    self._spiders[spider_name] = attr_value
+class SpiderLoaderProtocol:
+    """Protocol for spider loader"""
+
+    @classmethod
+    def from_settings(cls, settings: SettingManager) -> 'SpiderLoaderProtocol':
+        """Create spider loader from settings"""
+        return cls(settings)
+
+    def load(self, spider_name: str) -> Type[Spider]:
+        """Load a spider by name"""
+        raise NotImplementedError
+
+    def list(self) -> List[str]:
+        """List all available spider names"""
+        raise NotImplementedError
+
+    def find_by_request(self, request: 'Request') -> List[str]:
+        """Find spider names that can handle the given request"""
+        raise NotImplementedError

-    def load(self, spider_name: str) -> Optional[Type[Spider]]:
-        """Load a spider by name"""
-        return self._spiders.get(spider_name)

+class SpiderLoader(ISpiderLoader):
+    """Spider loader, responsible for discovering and loading spiders"""
+
+    def __init__(self, settings: SettingManager = None):
+        # If settings are given, read the configuration from them
+        if settings is not None:
+            self.spider_modules = settings.get('SPIDER_MODULES', [])
+            self.warn_only = settings.get('SPIDER_LOADER_WARN_ONLY', False)
+        else:
+            # Default configuration
+            self.spider_modules = []
+            self.warn_only = False
+
+        self._spiders: Dict[str, Type[Spider]] = {}
+        self._found: Dict[str, List[tuple]] = defaultdict(list)
+        self._load_all_spiders()
+
+    @classmethod
+    def from_settings(cls, settings: SettingManager) -> 'SpiderLoader':
+        """Create a SpiderLoader instance from settings"""
+        return cls(settings)
+
+    def _check_name_duplicates(self) -> None:
+        """Check for duplicate spider names"""
+        dupes = []
+        for name, locations in self._found.items():
+            if len(locations) > 1:
+                dupes.extend([
+                    f" {cls} named {name!r} (in {mod})"
+                    for mod, cls in locations
+                ])
+
+        if dupes:
+            dupes_string = "\n\n".join(dupes)
+            warnings.warn(
+                "There are several spiders with the same name:\n\n"
+                f"{dupes_string}\n\n This can cause unexpected behavior.",
+                category=UserWarning,
+            )
+
+    def _load_spiders(self, module) -> None:
+        """Load all spiders defined in a module"""
+        for attr_name in dir(module):
+            attr_value = getattr(module, attr_name)
+            if (isinstance(attr_value, type) and
+                    issubclass(attr_value, Spider) and
+                    attr_value != Spider and
+                    hasattr(attr_value, 'name')):
+
+                spider_name = getattr(attr_value, 'name')
+                self._found[spider_name].append((module.__name__, attr_value.__name__))
+                self._spiders[spider_name] = attr_value
+
+    def _load_spiders_from_package(self, package_name: str) -> None:
+        """Load spiders from a package"""
+        try:
+            # Try to import the package
+            package = importlib.import_module(package_name)
+
+            # Walk all modules in the package
+            package_path = Path(package.__file__).parent
+            for py_file in package_path.glob("*.py"):
+                if py_file.name.startswith('_'):
+                    continue
+
+                module_name = py_file.stem
+                spider_module_path = f"{package_name}.{module_name}"
+
+                try:
+                    module = importlib.import_module(spider_module_path)
+                    self._load_spiders(module)
+                except ImportError as e:
+                    if self.warn_only:
+                        logger.warning(f"Could not load spiders from module '{spider_module_path}': {e}")
+                        logger.debug(traceback.format_exc())
+                    else:
+                        raise
+        except (ImportError, SyntaxError) as e:
+            if self.warn_only:
+                logger.warning(f"Could not load spiders from package '{package_name}': {e}")
+                logger.debug(traceback.format_exc())
+            else:
+                raise
+
+    def _load_all_spiders(self) -> None:
+        """Load all spiders"""
+        # If SPIDER_MODULES is configured, load from those modules
+        if self.spider_modules:
+            for module_name in self.spider_modules:
+                self._load_spiders_from_package(module_name)
+        else:
+            # Backwards compatibility: without SPIDER_MODULES, fall back to the old behaviour
+            # and assume the default spiders directory layout
+            spiders_dir = Path.cwd() / 'spiders'
+            if not spiders_dir.exists():
+                spiders_dir = Path.cwd() / 'spider'
+                if not spiders_dir.exists():
+                    logger.warning("Spiders directory not found")
+                    return
+
+            for py_file in spiders_dir.glob("*.py"):
+                if py_file.name.startswith('_'):
+                    continue
+
+                module_name = py_file.stem
+                module = None
+                try:
+                    # Try different import paths
+                    spider_module_path = None
+                    for possible_package in ['spiders', 'spider']:
+                        try:
+                            spider_module_path = f"{possible_package}.{module_name}"
+                            module = importlib.import_module(spider_module_path)
+                            break
+                        except ImportError:
+                            continue
+
+                    if module is None:
+                        raise ImportError(f"Could not import {module_name}")
+
+                    self._load_spiders(module)
+                except ImportError as e:
+                    logger.debug(f"Skip module {module_name}: {e}")
+                    continue
+
+        self._check_name_duplicates()
+
+    def load(self, spider_name: str) -> Type[Spider]:
+        """
+        Load a spider by its name.
+
+        Args:
+            spider_name: the spider name
+
+        Returns:
+            The Spider class
+
+        Raises:
+            KeyError: if no spider with the given name is found
+        """
+        if spider_name not in self._spiders:
+            raise KeyError(f"Spider not found: {spider_name}")
+        return self._spiders[spider_name]
+
     def list(self) -> List[str]:
         """List all available spider names"""
         return list(self._spiders.keys())
-
+
+    def find_by_request(self, request: 'Request') -> List[str]:
+        """
+        Find the names of spiders that can handle the given request.
+
+        Args:
+            request: the request object
+
+        Returns:
+            A list of spider names that can handle the request
+        """
+        # More sophisticated matching logic could go here;
+        # for now this simply returns all spider names
+        return list(self._spiders.keys())
+
     def get_all(self) -> Dict[str, Type[Spider]]:
         """Get all spiders"""
         return self._spiders.copy()
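
The loader is now configured from settings instead of a hard-coded project package. A minimal sketch of driving it, assuming only that the settings object exposes get(key, default) as used above; the stub class and the 'myproject.spiders' package are hypothetical stand-ins:

    from crawlo.utils.spider_loader import SpiderLoader

    class StubSettings:
        """Hypothetical stand-in for SettingManager; only get() is needed here."""
        def __init__(self, values):
            self._values = values

        def get(self, key, default=None):
            return self._values.get(key, default)

    settings = StubSettings({
        "SPIDER_MODULES": ["myproject.spiders"],   # packages scanned for Spider subclasses
        "SPIDER_LOADER_WARN_ONLY": True,           # log import failures instead of raising
    })

    loader = SpiderLoader.from_settings(settings)
    print(loader.list())                # names of all discovered spiders
    # loader.load("news") returns the Spider class, or raises KeyError if the name is unknown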
crawlo/utils/text_helper.py ADDED
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+import json
+import re
+from typing import Any, Union, List, Dict, Tuple, Optional
+
+from crawlo.utils.log import get_logger
+
+logger = get_logger(__name__)
+
+# Cache of compiled regular expressions
+_REGEXPS: Dict[str, "re.Pattern"] = {}
+
+
+def extract_text_by_regex(
+    text: Union[str, Any],
+    patterns: Union[str, List[str]],
+    allow_repeat: bool = True,
+    fetch_one: bool = False,
+    join_with: Optional[str] = None,
+) -> Union[str, List[str], Tuple]:
+    """
+    Extract information from text via regular expressions, with multi-pattern fallback.
+
+    Args:
+        text (str): the text, or anything convertible to a string
+        patterns (str or list of str): regex patterns, tried in order
+        allow_repeat (bool): whether duplicate results are allowed
+        fetch_one (bool): only extract the first match (returned as a tuple)
+        join_with (str, optional): if given, join the results into one string with this separator
+
+    Returns:
+        str | list | tuple: the matches, as a string, list or tuple depending on the arguments
+    """
+    if isinstance(patterns, str):
+        patterns = [patterns]
+
+    results = []
+    for pattern in patterns:
+        if not pattern:
+            continue
+
+        if pattern not in _REGEXPS:
+            _REGEXPS[pattern] = re.compile(pattern, re.S)
+
+        if fetch_one:
+            match = _REGEXPS[pattern].search(str(text))
+            results = match.groups() if match else ("",)
+            break
+        else:
+            found = _REGEXPS[pattern].findall(str(text))
+            if found:
+                results = found
+                break
+
+    if fetch_one:
+        return results[0] if len(results) == 1 else results
+
+    if not allow_repeat:
+        results = sorted(set(results), key=results.index)
+
+    return join_with.join(results) if join_with else results
+
+
+def parse_json_safely(json_str: Union[str, Any]) -> Dict:
+    """
+    Safely parse a JSON string, tolerating non-standard formats
+    (single quotes, unquoted keys, and so on).
+
+    Args:
+        json_str (str): the JSON string
+
+    Returns:
+        dict: the parsed dict, or an empty dict on failure
+    """
+    if not json_str:
+        return {}
+
+    try:
+        return json.loads(json_str)
+    except Exception as e1:
+        try:
+            cleaned = json_str.strip().replace("'", '"')
+            # Use the new function name
+            keys = extract_text_by_regex(cleaned, r'(\w+):')
+            for key in keys:
+                cleaned = cleaned.replace(f"{key}:", f'"{key}":')
+            return json.loads(cleaned) if cleaned else {}
+        except Exception as e2:
+            logger.error(
+                f"JSON parsing failed\n"
+                f"original input: {json_str}\n"
+                f"error 1: {e1}\n"
+                f"after repair: {cleaned}\n"
+                f"error 2: {e2}"
+            )
+            return {}
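
A quick sketch of the two new text helpers; the sample strings are invented for illustration:

    from crawlo.utils.text_helper import extract_text_by_regex, parse_json_safely

    html = "<span>price: 12.50</span><span>price: 9.99</span>"

    print(extract_text_by_regex(html, r"price:\s*([\d.]+)"))                  # ['12.50', '9.99']
    print(extract_text_by_regex(html, r"price:\s*([\d.]+)", fetch_one=True))  # '12.50'
    print(extract_text_by_regex(html, r"price:\s*([\d.]+)", join_with=", "))  # '12.50, 9.99'

    # Tolerates the single quotes that json.loads rejects
    print(parse_json_safely("{'name': 'crawlo', 'version': '1.4.5'}"))        # {'name': 'crawlo', 'version': '1.4.5'}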