PyPI - crawlo - Versions diffs - 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl - Mend

crawlo 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crawlo might be problematic. Click here for more details.

Files changed (113)

crawlo/__init__.py +34 -34
crawlo/__version__.py +1 -1
crawlo/cli.py +40 -40
crawlo/commands/__init__.py +13 -13
crawlo/commands/check.py +594 -594
crawlo/commands/genspider.py +151 -151
crawlo/commands/list.py +155 -155
crawlo/commands/run.py +285 -285
crawlo/commands/startproject.py +196 -196
crawlo/commands/stats.py +188 -188
crawlo/commands/utils.py +186 -186
crawlo/config.py +279 -279
crawlo/core/__init__.py +2 -2
crawlo/core/engine.py +171 -171
crawlo/core/enhanced_engine.py +189 -189
crawlo/core/processor.py +40 -40
crawlo/core/scheduler.py +166 -162
crawlo/crawler.py +1027 -1027
crawlo/downloader/__init__.py +242 -242
crawlo/downloader/aiohttp_downloader.py +212 -212
crawlo/downloader/cffi_downloader.py +251 -251
crawlo/downloader/httpx_downloader.py +259 -257
crawlo/event.py +11 -11
crawlo/exceptions.py +82 -78
crawlo/extension/__init__.py +31 -31
crawlo/extension/log_interval.py +49 -49
crawlo/extension/log_stats.py +44 -44
crawlo/extension/logging_extension.py +34 -34
crawlo/filters/__init__.py +154 -154
crawlo/filters/aioredis_filter.py +242 -242
crawlo/filters/memory_filter.py +269 -269
crawlo/items/__init__.py +23 -23
crawlo/items/base.py +21 -21
crawlo/items/fields.py +53 -53
crawlo/items/items.py +104 -104
crawlo/middleware/__init__.py +21 -21
crawlo/middleware/default_header.py +32 -32
crawlo/middleware/download_delay.py +28 -28
crawlo/middleware/middleware_manager.py +135 -135
crawlo/middleware/proxy.py +248 -248
crawlo/middleware/request_ignore.py +30 -30
crawlo/middleware/response_code.py +18 -18
crawlo/middleware/response_filter.py +26 -26
crawlo/middleware/retry.py +125 -125
crawlo/mode_manager.py +200 -200
crawlo/network/__init__.py +21 -21
crawlo/network/request.py +311 -311
crawlo/network/response.py +271 -269
crawlo/pipelines/__init__.py +22 -13
crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
crawlo/pipelines/console_pipeline.py +39 -39
crawlo/pipelines/csv_pipeline.py +316 -316
crawlo/pipelines/database_dedup_pipeline.py +225 -0
crawlo/pipelines/json_pipeline.py +218 -218
crawlo/pipelines/memory_dedup_pipeline.py +116 -0
crawlo/pipelines/mongo_pipeline.py +116 -116
crawlo/pipelines/mysql_pipeline.py +195 -195
crawlo/pipelines/pipeline_manager.py +56 -56
crawlo/pipelines/redis_dedup_pipeline.py +163 -0
crawlo/project.py +153 -153
crawlo/queue/pqueue.py +37 -37
crawlo/queue/queue_manager.py +307 -303
crawlo/queue/redis_priority_queue.py +208 -191
crawlo/settings/__init__.py +7 -7
crawlo/settings/default_settings.py +245 -226
crawlo/settings/setting_manager.py +99 -99
crawlo/spider/__init__.py +639 -639
crawlo/stats_collector.py +59 -59
crawlo/subscriber.py +106 -106
crawlo/task_manager.py +30 -30
crawlo/templates/crawlo.cfg.tmpl +10 -10
crawlo/templates/project/__init__.py.tmpl +3 -3
crawlo/templates/project/items.py.tmpl +17 -17
crawlo/templates/project/middlewares.py.tmpl +86 -86
crawlo/templates/project/pipelines.py.tmpl +341 -335
crawlo/templates/project/run.py.tmpl +251 -238
crawlo/templates/project/settings.py.tmpl +250 -247
crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
crawlo/templates/spider/spider.py.tmpl +177 -177
crawlo/utils/__init__.py +7 -7
crawlo/utils/controlled_spider_mixin.py +439 -335
crawlo/utils/date_tools.py +233 -233
crawlo/utils/db_helper.py +343 -343
crawlo/utils/func_tools.py +82 -82
crawlo/utils/large_scale_config.py +286 -286
crawlo/utils/large_scale_helper.py +343 -343
crawlo/utils/log.py +128 -128
crawlo/utils/queue_helper.py +175 -175
crawlo/utils/request.py +267 -267
crawlo/utils/request_serializer.py +219 -219
crawlo/utils/spider_loader.py +62 -62
crawlo/utils/system.py +11 -11
crawlo/utils/tools.py +4 -4
crawlo/utils/url.py +39 -39
{crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/METADATA +635 -567
crawlo-1.1.3.dist-info/RECORD +113 -0
examples/__init__.py +7 -7
examples/controlled_spider_example.py +205 -0
tests/__init__.py +7 -7
tests/test_final_validation.py +153 -153
tests/test_proxy_health_check.py +32 -32
tests/test_proxy_middleware_integration.py +136 -136
tests/test_proxy_providers.py +56 -56
tests/test_proxy_stats.py +19 -19
tests/test_proxy_strategies.py +59 -59
tests/test_redis_config.py +28 -28
tests/test_redis_queue.py +224 -224
tests/test_request_serialization.py +70 -70
tests/test_scheduler.py +241 -241
crawlo-1.1.2.dist-info/RECORD +0 -108
{crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/WHEEL +0 -0
{crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/entry_points.txt +0 -0
{crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/top_level.txt +0 -0

crawlo/commands/utils.py CHANGED Viewed

@@ -1,187 +1,187 @@
-#!/usr/bin/python
-# -*- coding: UTF-8 -*-
-"""
-命令行工具公共模块
-提供命令行工具的公共函数和工具
-"""
-import sys
-import configparser
-from pathlib import Path
-from importlib import import_module
-from typing import Optional, Tuple
-from rich.console import Console
-from rich.panel import Panel
-from rich.text import Text
-console = Console()
-def get_project_root() -> Optional[Path]:
-    """
-    自动检测项目根目录：从当前目录向上查找 crawlo.cfg
-    Returns:
-        Path: 项目根目录路径，如果未找到返回 None
-    """
-    current = Path.cwd()
-    for _ in range(10):  # 最多向上查找10层
-        cfg_file = current / "crawlo.cfg"
-        if cfg_file.exists():
-            return current
-        if current == current.parent:
-            break
-        current = current.parent
-    return None
-def validate_project_environment() -> Tuple[bool, Optional[str], Optional[str]]:
-    """
-    验证项目环境，确保在正确的 Crawlo 项目中
-    Returns:
-        Tuple[bool, Optional[str], Optional[str]]:
-        (是否有效, 项目包名, 错误信息)
-    """
-    # 1. 查找项目根目录
-    project_root = get_project_root()
-    if not project_root:
-        return False, None, "Cannot find 'crawlo.cfg'. Run this command inside your project directory."
-    # 2. 将项目根加入 Python 路径
-    project_root_str = str(project_root)
-    if project_root_str not in sys.path:
-        sys.path.insert(0, project_root_str)
-    # 3. 读取配置文件
-    cfg_file = project_root / "crawlo.cfg"
-    config = configparser.ConfigParser()
-    try:
-        config.read(cfg_file, encoding="utf-8")
-    except Exception as e:
-        return False, None, f"Failed to read crawlo.cfg: {e}"
-    if not config.has_section("settings") or not config.has_option("settings", "default"):
-        return False, None, "Invalid crawlo.cfg: missing [settings] section or 'default' option"
-    # 4. 获取项目包名
-    settings_module = config.get("settings", "default")
-    project_package = settings_module.split(".")[0]
-    # 5. 验证项目包是否可导入
-    try:
-        import_module(project_package)
-    except ImportError as e:
-        return False, None, f"Failed to import project package '{project_package}': {e}"
-    return True, project_package, None
-def show_error_panel(title: str, message: str, show_json: bool = False) -> None:
-    """
-    显示错误面板或JSON格式错误
-    Args:
-        title: 错误标题
-        message: 错误消息
-        show_json: 是否以JSON格式输出
-    """
-    if show_json:
-        console.print_json(data={"success": False, "error": message})
-    else:
-        console.print(Panel(
-            Text.from_markup(f":cross_mark: [bold red]{message}[/bold red]"),
-            title=f"❌ {title}",
-            border_style="red",
-            padding=(1, 2)
-        ))
-def show_success_panel(title: str, message: str, show_json: bool = False, data: dict = None) -> None:
-    """
-    显示成功面板或JSON格式结果
-    Args:
-        title: 成功标题
-        message: 成功消息
-        show_json: 是否以JSON格式输出
-        data: JSON数据（当show_json=True时）
-    """
-    if show_json:
-        result = {"success": True, "message": message}
-        if data:
-            result.update(data)
-        console.print_json(data=result)
-    else:
-        console.print(Panel(
-            Text.from_markup(f":white_check_mark: [bold green]{message}[/bold green]"),
-            title=f"✅ {title}",
-            border_style="green",
-            padding=(1, 2)
-        ))
-def validate_spider_name(spider_name: str) -> bool:
-    """
-    验证爬虫名称是否符合规范
-    Args:
-        spider_name: 爬虫名称
-    Returns:
-        bool: 是否有效
-    """
-    import re
-    # 爬虫名称应该是有效的Python标识符
-    return spider_name.isidentifier() and re.match(r'^[a-z][a-z0-9_]*$', spider_name)
-def format_file_size(size_bytes: int) -> str:
-    """
-    格式化文件大小
-    Args:
-        size_bytes: 字节数
-    Returns:
-        str: 格式化后的大小字符串
-    """
-    for unit in ['B', 'KB', 'MB', 'GB']:
-        if size_bytes < 1024.0:
-            return f"{size_bytes:.1f} {unit}"
-        size_bytes /= 1024.0
-    return f"{size_bytes:.1f} TB"
-def truncate_text(text: str, max_length: int = 80) -> str:
-    """
-    截断过长的文本
-    Args:
-        text: 原始文本
-        max_length: 最大长度
-    Returns:
-        str: 截断后的文本
-    """
-    if len(text) <= max_length:
-        return text
-    return text[:max_length-3] + "..."
-def is_valid_domain(domain: str) -> bool:
-    """
-    验证域名格式是否正确
-    Args:
-        domain: 域名
-    Returns:
-        bool: 是否有效
-    """
-    import re
-    pattern = re.compile(
-        r'^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)*[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$'
-    )
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+命令行工具公共模块
+提供命令行工具的公共函数和工具
+"""
+import sys
+import configparser
+from pathlib import Path
+from importlib import import_module
+from typing import Optional, Tuple
+from rich.console import Console
+from rich.panel import Panel
+from rich.text import Text
+console = Console()
+def get_project_root() -> Optional[Path]:
+    """
+    自动检测项目根目录：从当前目录向上查找 crawlo.cfg
+    Returns:
+        Path: 项目根目录路径，如果未找到返回 None
+    """
+    current = Path.cwd()
+    for _ in range(10):  # 最多向上查找10层
+        cfg_file = current / "crawlo.cfg"
+        if cfg_file.exists():
+            return current
+        if current == current.parent:
+            break
+        current = current.parent
+    return None
+def validate_project_environment() -> Tuple[bool, Optional[str], Optional[str]]:
+    """
+    验证项目环境，确保在正确的 Crawlo 项目中
+    Returns:
+        Tuple[bool, Optional[str], Optional[str]]:
+        (是否有效, 项目包名, 错误信息)
+    """
+    # 1. 查找项目根目录
+    project_root = get_project_root()
+    if not project_root:
+        return False, None, "Cannot find 'crawlo.cfg'. Run this command inside your project directory."
+    # 2. 将项目根加入 Python 路径
+    project_root_str = str(project_root)
+    if project_root_str not in sys.path:
+        sys.path.insert(0, project_root_str)
+    # 3. 读取配置文件
+    cfg_file = project_root / "crawlo.cfg"
+    config = configparser.ConfigParser()
+    try:
+        config.read(cfg_file, encoding="utf-8")
+    except Exception as e:
+        return False, None, f"Failed to read crawlo.cfg: {e}"
+    if not config.has_section("settings") or not config.has_option("settings", "default"):
+        return False, None, "Invalid crawlo.cfg: missing [settings] section or 'default' option"
+    # 4. 获取项目包名
+    settings_module = config.get("settings", "default")
+    project_package = settings_module.split(".")[0]
+    # 5. 验证项目包是否可导入
+    try:
+        import_module(project_package)
+    except ImportError as e:
+        return False, None, f"Failed to import project package '{project_package}': {e}"
+    return True, project_package, None
+def show_error_panel(title: str, message: str, show_json: bool = False) -> None:
+    """
+    显示错误面板或JSON格式错误
+    Args:
+        title: 错误标题
+        message: 错误消息
+        show_json: 是否以JSON格式输出
+    """
+    if show_json:
+        console.print_json(data={"success": False, "error": message})
+    else:
+        console.print(Panel(
+            Text.from_markup(f":cross_mark: [bold red]{message}[/bold red]"),
+            title=f"❌ {title}",
+            border_style="red",
+            padding=(1, 2)
+        ))
+def show_success_panel(title: str, message: str, show_json: bool = False, data: dict = None) -> None:
+    """
+    显示成功面板或JSON格式结果
+    Args:
+        title: 成功标题
+        message: 成功消息
+        show_json: 是否以JSON格式输出
+        data: JSON数据（当show_json=True时）
+    """
+    if show_json:
+        result = {"success": True, "message": message}
+        if data:
+            result.update(data)
+        console.print_json(data=result)
+    else:
+        console.print(Panel(
+            Text.from_markup(f":white_check_mark: [bold green]{message}[/bold green]"),
+            title=f"✅ {title}",
+            border_style="green",
+            padding=(1, 2)
+        ))
+def validate_spider_name(spider_name: str) -> bool:
+    """
+    验证爬虫名称是否符合规范
+    Args:
+        spider_name: 爬虫名称
+    Returns:
+        bool: 是否有效
+    """
+    import re
+    # 爬虫名称应该是有效的Python标识符
+    return spider_name.isidentifier() and re.match(r'^[a-z][a-z0-9_]*$', spider_name)
+def format_file_size(size_bytes: int) -> str:
+    """
+    格式化文件大小
+    Args:
+        size_bytes: 字节数
+    Returns:
+        str: 格式化后的大小字符串
+    """
+    for unit in ['B', 'KB', 'MB', 'GB']:
+        if size_bytes < 1024.0:
+            return f"{size_bytes:.1f} {unit}"
+        size_bytes /= 1024.0
+    return f"{size_bytes:.1f} TB"
+def truncate_text(text: str, max_length: int = 80) -> str:
+    """
+    截断过长的文本
+    Args:
+        text: 原始文本
+        max_length: 最大长度
+    Returns:
+        str: 截断后的文本
+    """
+    if len(text) <= max_length:
+        return text
+    return text[:max_length-3] + "..."
+def is_valid_domain(domain: str) -> bool:
+    """
+    验证域名格式是否正确
+    Args:
+        domain: 域名
+    Returns:
+        bool: 是否有效
+    """
+    import re
+    pattern = re.compile(
+        r'^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)*[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$'
+    )
     return bool(pattern.match(domain))

crawlo 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl

Potentially problematic release.

crawlo 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl