crawlo 1.4.6-py3-none-any.whl → 1.4.8-py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of crawlo might be problematic (see the registry page for more details).
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/network/response.py
CHANGED
@@ -11,12 +11,9 @@ HTTP Response wrapper module
 - Cookie handling
 """
 import re
-from http.cookies import SimpleCookie
-from typing import Dict, Any, List, Optional, Tuple
-from urllib.parse import urljoin as _urljoin, urlparse as _urlparse, urlsplit as _urlsplit, parse_qs as _parse_qs, \
-    urlencode as _urlencode, quote as _quote, unquote as _unquote, urldefrag as _urldefrag
-
 import ujson
+from typing import Dict, Any, List, Optional
+from urllib.parse import urljoin as _urljoin
 from parsel import Selector, SelectorList

 # Try to import the w3lib encoding-detection functions
@@ -31,6 +28,14 @@ try:
     W3LIB_AVAILABLE = True
 except ImportError:
     W3LIB_AVAILABLE = False
+    # When w3lib is unavailable, import the fallback functions from utils
+    from crawlo.utils import (
+        html_body_declared_encoding,
+        html_to_unicode,
+        http_content_type_encoding,
+        read_bom,
+        resolve_encoding,
+    )

 from crawlo.exceptions import DecodeError
 from crawlo.utils import (
@@ -38,7 +43,11 @@ from crawlo.utils import (
     extract_texts,
     extract_attr,
     extract_attrs,
-    is_xpath
+    is_xpath,
+    parse_cookies,
+    regex_search,
+    regex_findall,
+    get_header_value
 )


@@ -74,10 +83,10 @@ class Response:
         self,
         url: str,
         *,
-        headers: Dict[str, Any] = None,
+        headers: Optional[Dict[str, Any]] = None,
         body: bytes = b"",
         method: str = 'GET',
-        request: 'Request' = None,  # string annotation to avoid a circular import
+        request: Optional['Request'] = None,  # string annotation to avoid a circular import
         status_code: int = 200,
     ):
         # Basic attributes
@@ -106,7 +115,7 @@

     def _determine_encoding(self) -> str:
         """
-
+        Intelligently detect the response encoding

         Encoding detection priority:
         1. Encoding specified on the Request
@@ -184,15 +193,19 @@
     def _bom_encoding(self) -> Optional[str]:
         """BOM (byte order mark) encoding detection"""
         if not W3LIB_AVAILABLE:
-
+            # Use the fallback functions
+            encoding, _ = read_bom(self.body)
+            return encoding
         return read_bom(self.body)[0]

     @memoize_method_noargs
     def _headers_encoding(self) -> Optional[str]:
         """Encoding detection from the HTTP Content-Type header"""
         if not W3LIB_AVAILABLE:
-
-
+            # Use the fallback functions
+            content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+            return http_content_type_encoding(content_type)
+        content_type = self.headers.get("Content-Type", b"") or self.headers.get("content-type", b"")
         if isinstance(content_type, bytes):
             content_type = content_type.decode('latin-1')
         return http_content_type_encoding(content_type)
@@ -201,23 +214,26 @@
     def _body_declared_encoding(self) -> Optional[str]:
         """Encoding detection from HTML meta tag declarations"""
         if not W3LIB_AVAILABLE:
-
+            # Use the fallback functions
+            return html_body_declared_encoding(self.body)
         return html_body_declared_encoding(self.body)

     @memoize_method_noargs
     def _body_inferred_encoding(self) -> str:
         """Automatic encoding detection from the body content"""
         if not W3LIB_AVAILABLE:
-            #
-
-
-
-
-
-
-
-
-
+            # Use the fallback functions
+            content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+            # Use html_to_unicode for encoding detection
+            encoding, _ = html_to_unicode(
+                content_type,
+                self.body,
+                auto_detect_fun=self._auto_detect_fun,
+                default_encoding=self._DEFAULT_ENCODING,
+            )
+            return encoding
+
+        content_type = self.headers.get("Content-Type", b"") or self.headers.get("content-type", b"")
         if isinstance(content_type, bytes):
             content_type = content_type.decode('latin-1')

@@ -233,6 +249,13 @@
     def _auto_detect_fun(self, text: bytes) -> Optional[str]:
         """Callback for automatic encoding detection"""
         if not W3LIB_AVAILABLE:
+            # Use the fallback functions
+            for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
+                try:
+                    text.decode(enc)
+                except UnicodeError:
+                    continue
+                return resolve_encoding(enc)
             return None
         for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
             try:
@@ -255,7 +278,7 @@
         # If available, use w3lib for more accurate decoding
         if W3LIB_AVAILABLE:
             try:
-                content_type = self.headers.get(
+                content_type = self.headers.get("Content-Type", b"") or self.headers.get("content-type", b"")
                 if isinstance(content_type, bytes):
                     content_type = content_type.decode('latin-1')

@@ -269,6 +292,20 @@
             except Exception:
                 # If w3lib decoding fails, fall back to the original method
                 pass
+        else:
+            # Use the fallback functions
+            try:
+                content_type = self.headers.get("content-type", "") or self.headers.get("Content-Type", "")
+                # Use the html_to_unicode function
+                _, self._text_cache = html_to_unicode(
+                    content_type,
+                    self.body,
+                    default_encoding=self.encoding
+                )
+                return self._text_cache
+            except Exception:
+                # If decoding fails, fall back to the original method
+                pass

         # Try multiple encodings
         encodings_to_try = [self.encoding]
@@ -323,12 +360,12 @@
     @property
     def content_type(self) -> str:
         """Get the response Content-Type"""
-        return self.headers
+        return get_header_value(self.headers, 'content-type', '')

     @property
     def content_length(self) -> Optional[int]:
         """Get the response Content-Length"""
-        length = self.headers
+        length = get_header_value(self.headers, 'content-length')
         return int(length) if length else None

     # ==================== JSON handling methods ====================
@@ -352,103 +389,8 @@
         """Join URLs, automatically handling relative paths."""
         return _urljoin(self.url, url)

-    def urlparse(self, url: str = None) -> Tuple:
-        """
-        Parse a URL into a component tuple (scheme, netloc, path, params, query, fragment)
-
-        Args:
-            url (str, optional): URL to parse; defaults to the response URL
-
-        Returns:
-            tuple: URL component tuple
-        """
-        target_url = url if url is not None else self.url
-        return _urlparse(target_url)
-
-    def urlsplit(self, url: str = None) -> Tuple:
-        """
-        Parse a URL into a component tuple (scheme, netloc, path, query, fragment)
-
-        Args:
-            url (str, optional): URL to parse; defaults to the response URL
-
-        Returns:
-            tuple: URL component tuple (without params)
-        """
-        target_url = url if url is not None else self.url
-        return _urlsplit(target_url)
-
-    def parse_qs(self, query_string: str = None, keep_blank_values: bool = False) -> Dict[str, List[str]]:
-        """
-        Parse a query string into a dict
-
-        Args:
-            query_string (str, optional): query string; extracted from the URL by default
-            keep_blank_values (bool): whether to keep blank values
-
-        Returns:
-            dict: dict of query parameters
-        """
-        if query_string is None:
-            # Extract the query string from the URL
-            parsed = _urlparse(self.url)
-            query_string = parsed.query
-
-        return _parse_qs(query_string, keep_blank_values=keep_blank_values)
-
-    def urlencode(self, query: Dict[str, Any]) -> str:
-        """
-        Encode a dict into a query string
-
-        Args:
-            query (dict): query parameters to encode
-
-        Returns:
-            str: encoded query string
-        """
-        return _urlencode(query)
-
-    def quote(self, string: str, safe: str = '/') -> str:
-        """
-        URL-encode a string
-
-        Args:
-            string (str): string to encode
-            safe (str): characters left unencoded, '/' by default
-
-        Returns:
-            str: URL-encoded string
-        """
-        return _quote(string, safe=safe)
-
-    def unquote(self, string: str) -> str:
-        """
-        URL-decode a string
-
-        Args:
-            string (str): string to decode
-
-        Returns:
-            str: URL-decoded string
-        """
-        return _unquote(string)
-
-    def urldefrag(self, url: str = None) -> Tuple[str, str]:
-        """
-        Remove the fragment identifier from a URL
-
-        Args:
-            url (str, optional): URL to process; defaults to the response URL
-
-        Returns:
-            tuple: (URL without fragment, fragment)
-        """
-        target_url = url if url is not None else self.url
-        defrag_result = _urldefrag(target_url)
-        return (defrag_result.url, defrag_result.fragment)
-
     # ==================== Selector-related methods ====================
-
+
     @property
     def _selector(self) -> Selector:
         """Lazily load the Selector instance"""
@@ -507,7 +449,7 @@
         except Exception:
             return default

-    def extract_texts(self, xpath_or_css: str, join_str: str = " ", default: List[str] = None) -> List[str]:
+    def extract_texts(self, xpath_or_css: str, join_str: str = " ", default: Optional[List[str]] = None) -> List[str]:
         """
         Extract the text content of multiple elements as a list; supports CSS and XPath selectors

@@ -570,7 +512,7 @@
         except Exception:
             return default

-    def extract_attrs(self, xpath_or_css: str, attr_name: str, default: List[Any] = None) -> List[Any]:
+    def extract_attrs(self, xpath_or_css: str, attr_name: str, default: Optional[List[Any]] = None) -> List[Any]:
         """
         Extract an attribute value from multiple elements as a list; supports CSS and XPath selectors

@@ -608,26 +550,18 @@

     def re_search(self, pattern: str, flags: int = re.DOTALL) -> Optional[re.Match]:
         """Run a regular-expression search over the response text."""
-
-            raise TypeError("Pattern must be a string")
-        return re.search(pattern, self.text, flags=flags)
+        return regex_search(pattern, self.text, flags)

     def re_findall(self, pattern: str, flags: int = re.DOTALL) -> List[Any]:
         """Run a regular-expression findall over the response text."""
-
-            raise TypeError("Pattern must be a string")
-        return re.findall(pattern, self.text, flags=flags)
+        return regex_findall(pattern, self.text, flags)

     # ==================== Cookie handling methods ====================

     def get_cookies(self) -> Dict[str, str]:
         """Parse and return cookies from the response headers."""
         cookie_header = self.headers.get("Set-Cookie", "")
-
-            cookie_header = ", ".join(cookie_header)
-        cookies = SimpleCookie()
-        cookies.load(cookie_header)
-        return {key: morsel.value for key, morsel in cookies.items()}
+        return parse_cookies(cookie_header)

     # ==================== Request-related methods ====================

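Migration note: the URL helper methods removed above (urlparse, urlsplit, parse_qs, urlencode, quote, unquote, urldefrag) were thin wrappers around urllib.parse, and Response.urljoin() is still available. A minimal sketch of the equivalent standard-library calls, assuming `response` is a crawlo Response inside a spider callback:

    from urllib.parse import urlparse, parse_qs

    # formerly response.urlparse() / response.parse_qs()
    parsed = urlparse(response.url)
    query = parse_qs(parsed.query, keep_blank_values=False)

    # still provided by the Response object
    next_url = response.urljoin("/page/2")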
crawlo/pipelines/__init__.py
CHANGED
@@ -1,17 +1,36 @@
 #!/usr/bin/python
 # -*- coding:UTF-8 -*-
-
+"""
+Pipeline Module
+===============

+Pipeline hierarchy:
+- BasePipeline: base abstract class that defines the pipeline interface
+- ResourceManagedPipeline: adds resource management (recommended)
+- FileBasedPipeline/DatabasePipeline/CacheBasedPipeline: specialized base classes for specific scenarios

-
+Built-in dedup pipelines:
+- MemoryDedupPipeline: in-memory deduplication
+- RedisDedupPipeline: Redis-based distributed deduplication
+- BloomDedupPipeline: efficient Bloom-filter-based deduplication
+- DatabaseDedupPipeline: database-backed deduplication

-
-
-
-
-
-
+Usage example:
+    # Configure in settings.py
+    PIPELINES = [
+        'crawlo.pipelines.RedisDedupPipeline',
+        'your_project.pipelines.MongoPipeline',
+    ]
+"""

+# Import all base classes (from base_pipeline.py)
+from .base_pipeline import (
+    BasePipeline,
+    ResourceManagedPipeline,
+    FileBasedPipeline,
+    DatabasePipeline,
+    CacheBasedPipeline
+)

 # Export the dedup pipelines
 from .memory_dedup_pipeline import MemoryDedupPipeline
@@ -19,4 +38,16 @@ from .redis_dedup_pipeline import RedisDedupPipeline
 from .bloom_dedup_pipeline import BloomDedupPipeline
 from .database_dedup_pipeline import DatabaseDedupPipeline

-__all__ = [
+__all__ = [
+    # Base classes
+    'BasePipeline',
+    'ResourceManagedPipeline',
+    'FileBasedPipeline',
+    'DatabasePipeline',
+    'CacheBasedPipeline',
+    # Dedup pipelines
+    'MemoryDedupPipeline',
+    'RedisDedupPipeline',
+    'BloomDedupPipeline',
+    'DatabaseDedupPipeline'
+]