crawlo 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of crawlo has been flagged as potentially problematic in its registry.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/commands/genspider.py +68 -42
- crawlo/commands/list.py +102 -93
- crawlo/commands/startproject.py +89 -4
- crawlo/commands/utils.py +187 -0
- crawlo/config.py +280 -0
- crawlo/core/engine.py +16 -3
- crawlo/core/enhanced_engine.py +190 -0
- crawlo/core/scheduler.py +113 -8
- crawlo/crawler.py +840 -307
- crawlo/downloader/__init__.py +181 -17
- crawlo/downloader/aiohttp_downloader.py +15 -2
- crawlo/downloader/cffi_downloader.py +11 -1
- crawlo/downloader/httpx_downloader.py +14 -3
- crawlo/filters/__init__.py +122 -5
- crawlo/filters/aioredis_filter.py +128 -36
- crawlo/filters/memory_filter.py +99 -32
- crawlo/middleware/proxy.py +11 -8
- crawlo/middleware/retry.py +40 -5
- crawlo/mode_manager.py +201 -0
- crawlo/network/__init__.py +17 -3
- crawlo/network/request.py +118 -10
- crawlo/network/response.py +131 -28
- crawlo/pipelines/__init__.py +1 -1
- crawlo/pipelines/csv_pipeline.py +317 -0
- crawlo/pipelines/json_pipeline.py +219 -0
- crawlo/queue/__init__.py +0 -0
- crawlo/queue/pqueue.py +37 -0
- crawlo/queue/queue_manager.py +304 -0
- crawlo/queue/redis_priority_queue.py +192 -0
- crawlo/settings/default_settings.py +68 -9
- crawlo/spider/__init__.py +576 -66
- crawlo/task_manager.py +4 -1
- crawlo/templates/project/middlewares.py.tmpl +56 -45
- crawlo/templates/project/pipelines.py.tmpl +308 -36
- crawlo/templates/project/run.py.tmpl +239 -0
- crawlo/templates/project/settings.py.tmpl +211 -17
- crawlo/templates/spider/spider.py.tmpl +153 -7
- crawlo/utils/controlled_spider_mixin.py +336 -0
- crawlo/utils/large_scale_config.py +287 -0
- crawlo/utils/large_scale_helper.py +344 -0
- crawlo/utils/queue_helper.py +176 -0
- crawlo/utils/request_serializer.py +220 -0
- crawlo-1.1.2.dist-info/METADATA +567 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/RECORD +54 -46
- tests/test_final_validation.py +154 -0
- tests/test_redis_config.py +29 -0
- tests/test_redis_queue.py +225 -0
- tests/test_request_serialization.py +71 -0
- tests/test_scheduler.py +242 -0
- crawlo/pipelines/mysql_batch_pipline.py +0 -273
- crawlo/utils/pqueue.py +0 -174
- crawlo-1.1.1.dist-info/METADATA +0 -220
- examples/baidu_spider/__init__.py +0 -7
- examples/baidu_spider/demo.py +0 -94
- examples/baidu_spider/items.py +0 -46
- examples/baidu_spider/middleware.py +0 -49
- examples/baidu_spider/pipeline.py +0 -55
- examples/baidu_spider/run.py +0 -27
- examples/baidu_spider/settings.py +0 -121
- examples/baidu_spider/spiders/__init__.py +0 -7
- examples/baidu_spider/spiders/bai_du.py +0 -61
- examples/baidu_spider/spiders/miit.py +0 -159
- examples/baidu_spider/spiders/sina.py +0 -79
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/WHEEL +0 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/top_level.txt +0 -0
crawlo/__init__.py
CHANGED

```diff
@@ -4,7 +4,7 @@
 Crawlo - an asynchronous crawler framework
 """
 from crawlo.spider import Spider
-from crawlo.items import Item
+from crawlo.items import Item, Field
 from crawlo.network.request import Request
 from crawlo.network.response import Response
 from crawlo.downloader import DownloaderBase
@@ -26,6 +26,7 @@ except Exception:
 __all__ = [
     'Spider',
     'Item',
+    'Field',
     'Request',
     'Response',
     'DownloaderBase',
```
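With `Field` now re-exported at the package top level, item definitions can import everything from `crawlo` directly. A minimal sketch of the new import style; the `ArticleItem` class and the declarative `Field()` usage are illustrative assumptions, since this diff only confirms that `Item` and `Field` are exported:

```python
# Illustrative only: assumes crawlo's Item/Field follow the familiar
# declarative-item pattern; the diff confirms the exports, not the semantics.
from crawlo import Item, Field


class ArticleItem(Item):
    title = Field()
    url = Field()
    published_at = Field()
```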
crawlo/__version__.py
CHANGED

```diff
@@ -1 +1 @@
-__version__ = "1.1.1"
+__version__ = "1.1.2"
```
crawlo/commands/genspider.py
CHANGED

```diff
@@ -11,6 +11,15 @@ import configparser
 import importlib
 from rich.console import Console
 
+from .utils import (
+    get_project_root,
+    validate_project_environment,
+    show_error_panel,
+    show_success_panel,
+    validate_spider_name,
+    is_valid_domain
+)
+
 # Initialize the rich console
 console = Console()
 
@@ -29,41 +38,42 @@ def _render_template(tmpl_path, context):
 def main(args):
     if len(args) < 2:
         console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo genspider[/blue] <spider_name> <domain>")
+        console.print("💡 Examples:")
+        console.print("   [blue]crawlo genspider[/blue] news_spider news.example.com")
+        console.print("   [blue]crawlo genspider[/blue] product_spider shop.example.com")
         return 1
 
     spider_name = args[0]
     domain = args[1]
-
-    # … (removed validation block, truncated in the source diff view)
-    if not …
+
+    # Validate the spider name
+    if not validate_spider_name(spider_name):
+        show_error_panel(
+            "Invalid Spider Name",
+            f"Spider name '[cyan]{spider_name}[/cyan]' is invalid.\n"
+            "💡 Spider name should:\n"
+            "   • Start with lowercase letter\n"
+            "   • Contain only lowercase letters, numbers, and underscores\n"
+            "   • Be a valid Python identifier"
+        )
+        return 1
+
+    # Validate the domain format
+    if not is_valid_domain(domain):
+        show_error_panel(
+            "Invalid Domain",
+            f"Domain '[cyan]{domain}[/cyan]' format is invalid.\n"
+            "💡 Please provide a valid domain name like 'example.com'"
+        )
         return 1
 
-    # … (truncated in the source diff view)
-    # Read the settings module from crawlo.cfg to get the project package name
-    config = configparser.ConfigParser()
-    try:
-        config.read(cfg_file, encoding='utf-8')
-        settings_module = config.get('settings', 'default')
-        project_package = settings_module.split('.')[0]  # e.g., myproject.settings -> myproject
-    except Exception as e:
-        console.print(f"[bold red]:cross_mark: Error reading crawlo.cfg:[/bold red] {e}")
+    # Validate the project environment
+    is_valid, project_package, error_msg = validate_project_environment()
+    if not is_valid:
+        show_error_panel("Not a Crawlo Project", error_msg)
         return 1
+
+    project_root = get_project_root()
 
     # Determine the path of the items module
     items_module_path = f"{project_package}.items"
@@ -93,17 +103,23 @@ def main(args):
 
     spider_file = spiders_dir / f'{spider_name}.py'
     if spider_file.exists():
-        # … (truncated in the source diff view)
+        show_error_panel(
+            "Spider Already Exists",
+            f"Spider '[cyan]{spider_name}[/cyan]' already exists at\n[green]{spider_file}[/green]"
+        )
         return 1
 
     # Template path
     tmpl_path = TEMPLATES_DIR / 'spider' / 'spider.py.tmpl'
     if not tmpl_path.exists():
-        # … (truncated in the source diff view)
+        show_error_panel(
+            "Template Not Found",
+            f"Template file not found at [cyan]{tmpl_path}[/cyan]"
+        )
         return 1
 
     # Generate the class name
-    class_name = f"{spider_name.capitalize()}Spider"
+    class_name = f"{spider_name.replace('_', '').capitalize()}Spider"
 
     context = {
         'spider_name': spider_name,
@@ -113,14 +129,24 @@ def main(args):
         'class_name': class_name
     }
 
-    # … (removed write-and-report block, truncated in the source diff view)
+    try:
+        content = _render_template(tmpl_path, context)
+        with open(spider_file, 'w', encoding='utf-8') as f:
+            f.write(content)
+
+        console.print(f":white_check_mark: [green]Spider '[bold]{spider_name}[/bold]' created successfully![/green]")
+        console.print(f"   → Location: [cyan]{spider_file}[/cyan]")
+        console.print(f"   → Class: [yellow]{class_name}[/yellow]")
+        console.print(f"   → Domain: [blue]{domain}[/blue]")
+        console.print("\n[bold]Next steps:[/bold]")
+        console.print(f"   [blue]crawlo run[/blue] {spider_name}")
+        console.print(f"   [blue]crawlo check[/blue] {spider_name}")
+
+        return 0
+
+    except Exception as e:
+        show_error_panel(
+            "Creation Failed",
+            f"Failed to create spider: {e}"
+        )
+        return 1
```
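`validate_spider_name` and `is_valid_domain` are imported from the new `crawlo/commands/utils.py` (+187 lines), whose body this diff does not show. A plausible sketch, consistent with the rules quoted in the error messages above:

```python
# Hypothetical stand-ins for the validators in crawlo.commands.utils;
# the real implementations are not visible in this diff.
import re


def validate_spider_name(name: str) -> bool:
    # Lowercase letter first, then only lowercase letters, digits, and
    # underscores — which also makes the name a valid Python identifier.
    return bool(re.match(r'^[a-z][a-z0-9_]*$', name))


def is_valid_domain(domain: str) -> bool:
    # Dot-separated labels plus an alphabetic TLD, e.g. 'example.com'.
    return bool(re.match(
        r'^(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}$',
        domain.lower()
    ))
```

Note that the revised `class_name` expression strips underscores before calling `capitalize()`, so `news_spider` yields `NewsspiderSpider` rather than `NewsSpiderSpider`, because `capitalize()` uppercases only the first character.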
crawlo/commands/list.py
CHANGED

```diff
@@ -6,7 +6,6 @@
 # @Desc : CLI entry point: `crawlo list`, lists all registered spiders
 """
 import sys
-import configparser
 from pathlib import Path
 from importlib import import_module
 
@@ -18,110 +17,96 @@ from rich import box
 
 from crawlo.crawler import CrawlerProcess
 from crawlo.utils.log import get_logger
+from .utils import validate_project_environment, show_error_panel
 
 logger = get_logger(__name__)
 console = Console()
 
 
-def get_project_root():
-    """
-    Auto-detect the project root: walk upward from the current directory
-    looking for crawlo.cfg (at most 10 levels) and return that directory
-    path as a string.
-    """
-    current = Path.cwd()
-    for _ in range(10):
-        cfg = current / "crawlo.cfg"
-        if cfg.exists():
-            return str(current)
-        if current == current.parent:
-            break
-        current = current.parent
-    return None  # not found
-
-
 def main(args):
     """
     Main function: list all available spiders.
-    Usage: crawlo list
+    Usage: crawlo list [--json]
     """
-    # … (removed argument check, truncated in the source diff view)
+    show_json = "--json" in args
+
+    # After filtering out flags, check for extra positional arguments
+    filtered_args = [arg for arg in args if not arg.startswith('--')]
+    if filtered_args:
+        if show_json:
+            console.print_json(data={"success": False, "error": "Usage: crawlo list [--json]"})
+        else:
+            console.print("[bold red]❌ Error:[/bold red] Usage: [blue]crawlo list[/blue] [--json]")
         return 1
 
     try:
-        # 1. … (removed project-root detection, truncated in the source diff view)
-        if not …
-            "🚀 Or create a new project with:\n"
-            "   [blue]crawlo startproject myproject[/blue]"
-            ),
-            title="❌ Not in a Crawlo Project",
-            border_style="red",
-            padding=(1, 2)
-        ))
-        return 1
-
-        project_root_path = Path(project_root)
-        project_root_str = str(project_root_path)
-
-        # 2. Add the project root to the Python path
-        if project_root_str not in sys.path:
-            sys.path.insert(0, project_root_str)
-
-        # 3. Read crawlo.cfg to get the settings module
-        cfg_file = project_root_path / "crawlo.cfg"
-        config = configparser.ConfigParser()
-        config.read(cfg_file, encoding="utf-8")
-
-        if not config.has_section("settings") or not config.has_option("settings", "default"):
-            console.print(Panel(
-                ":cross_mark: [bold red]Invalid crawlo.cfg[/bold red]\n"
-                "Missing [settings] section or 'default' option.",
-                title="❌ Config Error",
-                border_style="red"
-            ))
-            return 1
-
-        settings_module = config.get("settings", "default")
-        project_package = settings_module.split(".")[0]
-
-        # 4. Make sure the project package is importable
-        try:
-            import_module(project_package)
-        except ImportError as e:
-            console.print(Panel(
-                f":cross_mark: Failed to import project package '[cyan]{project_package}[/cyan]':\n{e}",
-                title="❌ Import Error",
-                border_style="red"
-            ))
+        # Validate the project environment
+        is_valid, project_package, error_msg = validate_project_environment()
+        if not is_valid:
+            if show_json:
+                console.print_json(data={"success": False, "error": error_msg})
+            else:
+                show_error_panel("Not a Crawlo Project", error_msg)
             return 1
 
-        # … (truncated in the source diff view)
+        # Initialize CrawlerProcess and load the spider modules
         spider_modules = [f"{project_package}.spiders"]
         process = CrawlerProcess(spider_modules=spider_modules)
 
-        # … (truncated in the source diff view)
+        # Get all spider names
         spider_names = process.get_spider_names()
         if not spider_names:
-            # … (removed "no spiders" panel, truncated in the source diff view)
+            if show_json:
+                console.print_json(data={
+                    "success": True,
+                    "spiders": [],
+                    "message": "No spiders found in project"
+                })
+            else:
+                console.print(Panel(
+                    Text.from_markup(
+                        ":envelope_with_arrow: [bold]No spiders found[/bold] in '[cyan]spiders/[/cyan]' directory.\n\n"
+                        "[bold]💡 Make sure:[/bold]\n"
+                        "   • Spider classes inherit from [blue]`crawlo.spider.Spider`[/blue]\n"
+                        "   • Each spider has a [green]`name`[/green] attribute\n"
+                        "   • Spiders are imported in [cyan]`spiders/__init__.py`[/cyan] (if using package)"
+                    ),
+                    title="📭 No Spiders Found",
+                    border_style="yellow",
+                    padding=(1, 2)
+                ))
+            return 0
+
+        # Gather spider info
+        spider_info = []
+        for name in sorted(spider_names):
+            spider_cls = process.get_spider_class(name)
+            module_name = spider_cls.__module__.replace(f"{project_package}.", "")
+
+            # Extra metadata
+            start_urls_count = len(getattr(spider_cls, 'start_urls', []))
+            allowed_domains = getattr(spider_cls, 'allowed_domains', [])
+            custom_settings = getattr(spider_cls, 'custom_settings', {})
+
+            spider_info.append({
+                "name": name,
+                "class": spider_cls.__name__,
+                "module": module_name,
+                "start_urls_count": start_urls_count,
+                "allowed_domains": allowed_domains,
+                "has_custom_settings": bool(custom_settings)
+            })
+
+        # JSON output
+        if show_json:
+            console.print_json(data={
+                "success": True,
+                "count": len(spider_info),
+                "spiders": spider_info
+            })
+            return 0
+
+        # Table output
         table = Table(
             title=f"📋 Found {len(spider_names)} spider(s)",
             box=box.ROUNDED,
@@ -132,16 +117,40 @@ def main(args):
         table.add_column("Name", style="cyan", no_wrap=True)
         table.add_column("Class", style="green")
         table.add_column("Module", style="dim")
-        # … (removed column definitions, truncated in the source diff view)
+        table.add_column("URLs", style="blue", justify="center")
+        table.add_column("Domains", style="yellow")
+        table.add_column("Custom Settings", style="magenta", justify="center")
+
+        for info in spider_info:
+            domains_display = ", ".join(info["allowed_domains"][:2])  # show the first 2 domains
+            if len(info["allowed_domains"]) > 2:
+                domains_display += f" (+{len(info['allowed_domains'])-2})"
+            elif not domains_display:
+                domains_display = "-"
+
+            table.add_row(
+                info["name"],
+                info["class"],
+                info["module"],
+                str(info["start_urls_count"]),
+                domains_display,
+                "✓" if info["has_custom_settings"] else "-"
+            )
 
         console.print(table)
+
+        # Show usage hints
+        console.print("\n[bold]🚀 Next steps:[/bold]")
+        console.print("   [blue]crawlo run[/blue] <spider_name>    # Run a specific spider")
+        console.print("   [blue]crawlo run[/blue] all              # Run all spiders")
+        console.print("   [blue]crawlo check[/blue] <spider_name>  # Check spider validity")
 
         return 0
 
     except Exception as e:
-        # … (removed error print, truncated in the source diff view)
+        if show_json:
+            console.print_json(data={"success": False, "error": str(e)})
+        else:
+            console.print(f"[bold red]❌ Unexpected error:[/bold red] {e}")
         logger.exception("Exception during 'crawlo list'")
-        return 1
+        return 1
```
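The new `--json` flag makes the command scriptable. A hypothetical consumer (the spider values are illustrative; the keys mirror the `spider_info` dict built above, and rich's `print_json` should emit plain, uncolored JSON when output is piped):

```python
# Hypothetical script driving `crawlo list --json`; key names follow
# the spider_info dict assembled in list.py above.
import json
import subprocess

result = subprocess.run(
    ["crawlo", "list", "--json"],
    capture_output=True, text=True
)
data = json.loads(result.stdout)
if data.get("success"):
    for spider in data["spiders"]:
        print(f'{spider["name"]}: {spider["class"]} ({spider["module"]})')
else:
    print("error:", data.get("error"))
```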
crawlo/commands/startproject.py
CHANGED

```diff
@@ -6,11 +6,14 @@
 # @Desc : CLI entry point: `crawlo startproject baidu`, creates a project.
 """
 import shutil
+import re
 from pathlib import Path
 from rich.console import Console
 from rich.panel import Panel
 from rich.text import Text
 
+from .utils import show_error_panel, show_success_panel
+
 # Initialize the rich console
 console = Console()
 
@@ -51,16 +54,86 @@ def _copytree_with_templates(src, dst, context):
         shutil.copy2(item, dst_item)
 
 
+def validate_project_name(project_name: str) -> tuple[bool, str]:
+    """
+    Validate whether the project name is acceptable.
+
+    Returns:
+        tuple[bool, str]: (is_valid, error_message)
+    """
+    # Reject empty names
+    if not project_name or not project_name.strip():
+        return False, "Project name cannot be empty"
+
+    project_name = project_name.strip()
+
+    # Length check
+    if len(project_name) > 50:
+        return False, "Project name too long (max 50 characters)"
+
+    # Reject Python keywords
+    python_keywords = {
+        'False', 'None', 'True', 'and', 'as', 'assert', 'break', 'class',
+        'continue', 'def', 'del', 'elif', 'else', 'except', 'finally',
+        'for', 'from', 'global', 'if', 'import', 'in', 'is', 'lambda',
+        'nonlocal', 'not', 'or', 'pass', 'raise', 'return', 'try',
+        'while', 'with', 'yield'
+    }
+    if project_name in python_keywords:
+        return False, f"'{project_name}' is a Python keyword and cannot be used as project name"
+
+    # Must be a valid Python identifier
+    if not project_name.isidentifier():
+        return False, "Project name must be a valid Python identifier"
+
+    # Enforce snake_case format
+    if not re.match(r'^[a-z][a-z0-9_]*$', project_name):
+        return False, (
+            "Project name should start with lowercase letter and "
+            "contain only lowercase letters, numbers, and underscores"
+        )
+
+    # Disallow trailing digits (discouraged)
+    if project_name[-1].isdigit():
+        return False, "Project name should not end with a number"
+
+    return True, ""
+
+
 def main(args):
     if len(args) != 1:
-        console.print("[bold red]Error:[/bold red] Usage: crawlo startproject <project_name>")
+        console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo startproject[/blue] <project_name>")
+        console.print("💡 Examples:")
+        console.print("   [blue]crawlo startproject[/blue] my_spider_project")
+        console.print("   [blue]crawlo startproject[/blue] news_crawler")
+        console.print("   [blue]crawlo startproject[/blue] ecommerce_spider")
         return 1
 
     project_name = args[0]
+
+    # Validate the project name
+    is_valid, error_msg = validate_project_name(project_name)
+    if not is_valid:
+        show_error_panel(
+            "Invalid Project Name",
+            f"[cyan]{project_name}[/cyan] is not a valid project name.\n"
+            f"❌ {error_msg}\n\n"
+            "💡 Project name should:\n"
+            "   • Start with lowercase letter\n"
+            "   • Contain only lowercase letters, numbers, and underscores\n"
+            "   • Be a valid Python identifier\n"
+            "   • Not be a Python keyword"
+        )
+        return 1
+
     project_dir = Path(project_name)
 
     if project_dir.exists():
-        # … (truncated in the source diff view)
+        show_error_panel(
+            "Directory Exists",
+            f"Directory '[cyan]{project_dir}[/cyan]' already exists.\n"
+            "💡 Choose a different project name or remove the existing directory."
+        )
         return 1
 
     context = {'project_name': project_name}
@@ -87,6 +160,10 @@ def main(args):
         # 4. Create the logs directory
         (project_dir / 'logs').mkdir(exist_ok=True)
         console.print(":white_check_mark: Created logs directory")
+
+        # 5. Create the output directory (for data export)
+        (project_dir / 'output').mkdir(exist_ok=True)
+        console.print(":white_check_mark: Created output directory")
 
         # Success panel
         success_text = Text.from_markup(f"Project '[bold cyan]{project_name}[/bold cyan]' created successfully!")
@@ -94,17 +171,25 @@ def main(args):
 
         # Next-step hints (aligned + syntax-highlighted)
         next_steps = f"""
-[bold]Next steps:[/bold]
+[bold]🚀 Next steps:[/bold]
   [blue]cd[/blue] {project_name}
   [blue]crawlo genspider[/blue] example example.com
   [blue]crawlo run[/blue] example
+
+[bold]📚 Learn more:[/bold]
+  [blue]crawlo list[/blue]            # List all spiders
+  [blue]crawlo check[/blue] example   # Check spider validity
+  [blue]crawlo stats[/blue]           # View statistics
 """.strip()
         console.print(next_steps)
 
         return 0
 
     except Exception as e:
-        # … (removed error print, truncated in the source diff view)
+        show_error_panel(
+            "Creation Failed",
+            f"Failed to create project: {e}"
+        )
         if project_dir.exists():
             shutil.rmtree(project_dir, ignore_errors=True)
             console.print("[red]:cross_mark: Cleaned up partially created project.[/red]")
```
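The checks in `validate_project_name` run in a fixed order (empty, length, keyword, identifier, snake_case, trailing digit), so the first failing rule determines the message. Expected results for a few inputs, traced by hand from the code above:

```python
# Expected outcomes, derived from validate_project_name in the diff above.
from crawlo.commands.startproject import validate_project_name

validate_project_name("news_crawler")  # (True, "")
validate_project_name("class")         # (False, "'class' is a Python keyword ...")
validate_project_name("2fast")         # (False, "... must be a valid Python identifier")
validate_project_name("MyProject")     # (False, "... should start with lowercase letter ...")
validate_project_name("spider2")       # (False, "... should not end with a number")
```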