crawlo 1.0.6__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release: this version of crawlo might be problematic.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.0.6"
+ __version__ = "1.0.8"
crawlo/commands/check.py ADDED
@@ -0,0 +1,107 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-08-31 22:35
+ # @Author : crawl-coder
+ # @Desc : Command-line entry point: crawlo check; verifies that all spider definitions are compliant.
+ """
+ import sys
+ import configparser
+
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.utils.project import get_settings
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+
+ def main(args):
+     if args:
+         print("Usage: crawlo check")
+         return 1
+
+     try:
+         project_root = get_settings().get('PROJECT_ROOT')
+         if not project_root:
+             print("❌ Error: Cannot determine project root.")
+             return 1
+
+         if str(project_root) not in sys.path:
+             sys.path.insert(0, str(project_root))
+
+         cfg_file = project_root / 'crawlo.cfg'
+         if not cfg_file.exists():
+             print(f"❌ Error: crawlo.cfg not found in {project_root}")
+             return 1
+
+         config = configparser.ConfigParser()
+         config.read(cfg_file, encoding='utf-8')
+
+         if not config.has_section('settings') or not config.has_option('settings', 'default'):
+             print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+             return 1
+
+         settings_module = config.get('settings', 'default')
+         project_package = settings_module.split('.')[0]
+
+         # Create a CrawlerProcess and discover spiders
+         process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
+         spider_names = process.get_spider_names()
+
+         if not spider_names:
+             print("📭 No spiders found.")
+             return 1
+
+         print(f"🔍 Checking {len(spider_names)} spider(s)...")
+         print("-" * 60)
+
+         issues_found = False
+         for name in sorted(spider_names):
+             cls = process.get_spider_class(name)
+             issues = []
+
+             if not hasattr(cls, 'name') or not cls.name:
+                 issues.append("missing or empty 'name' attribute")
+             elif not isinstance(cls.name, str):
+                 issues.append("'name' is not a string")
+
+             if not callable(getattr(cls, 'start_requests', None)):
+                 issues.append("missing or non-callable 'start_requests' method")
+
+             if hasattr(cls, 'start_urls') and isinstance(cls.start_urls, str):
+                 issues.append("'start_urls' is a string, should be list/tuple")
+
+             # Instantiation check (lightweight)
+             try:
+                 spider = cls.create_instance(None)
+                 if not callable(getattr(spider, 'parse', None)):
+                     issues.append("no 'parse' method defined (optional but recommended)")
+             except Exception as e:
+                 issues.append(f"failed to create instance: {e}")
+
+             if issues:
+                 print(f"❌ {name:<20} {cls.__name__}")
+                 for issue in issues:
+                     print(f" • {issue}")
+                 issues_found = True
+             else:
+                 print(f"✅ {name:<20} {cls.__name__} (OK)")
+
+         print("-" * 60)
+         if issues_found:
+             print("⚠️ Some spiders have issues. Please fix them.")
+             return 1
+         else:
+             print("🎉 All spiders are compliant!")
+             return 0
+
+     except Exception as e:
+         print(f"❌ Error during check: {e}")
+         import traceback
+         traceback.print_exc()
+         return 1
+
+
+ if __name__ == '__main__':
+     sys.exit(main(sys.argv[1:]))
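
For reference, a minimal spider that passes every rule the new check command enforces might look like the sketch below. The class name, spider name, and URL are placeholders, and it assumes the base crawlo Spider supplies a default start_requests built from start_urls (as the callable check implies) and the create_instance constructor used above.

    from crawlo.spider import Spider

    class ExampleSpider(Spider):
        # non-empty string -> passes the 'name' checks
        name = "example"
        # list/tuple rather than a bare string -> passes the 'start_urls' check
        start_urls = ["https://example.com"]

        def parse(self, response):
            # defining parse avoids the "optional but recommended" warning
            ...
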
crawlo/commands/list.py ADDED
@@ -0,0 +1,92 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-08-31 22:33
+ # @Author : crawl-coder
+ # @Desc : Command-line entry point: crawlo list; lists all registered spiders.
+ """
+ import sys
+ import configparser
+
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.utils.project import get_settings
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+
+ def main(args):
+     """
+     List all available spiders.
+     Usage: crawlo list
+     """
+     if args:
+         print("Usage: crawlo list")
+         return 1
+
+     try:
+         # 1. Get the project root directory
+         project_root = get_settings().get('PROJECT_ROOT')
+         if not project_root:
+             print("❌ Error: Cannot determine project root.")
+             return 1
+
+         # Add the project root to sys.path
+         project_root_str = str(project_root)
+         if project_root_str not in sys.path:
+             sys.path.insert(0, project_root_str)
+
+         # 2. Read crawlo.cfg to get the project package name
+         cfg_file = project_root / 'crawlo.cfg'
+         if not cfg_file.exists():
+             print(f"❌ Error: crawlo.cfg not found in {project_root}")
+             return 1
+
+         config = configparser.ConfigParser()
+         config.read(cfg_file, encoding='utf-8')
+
+         if not config.has_section('settings') or not config.has_option('settings', 'default'):
+             print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+             return 1
+
+         settings_module = config.get('settings', 'default')
+         project_package = settings_module.split('.')[0]
+
+         # 3. Create a CrawlerProcess and auto-discover spiders
+         spider_modules = [f"{project_package}.spiders"]
+         process = CrawlerProcess(spider_modules=spider_modules)
+
+         # 4. Collect all spider names
+         spider_names = process.get_spider_names()
+         if not spider_names:
+             print("📭 No spiders found.")
+             print("💡 Make sure:")
+             print(" - Your spider classes inherit from `Spider`")
+             print(" - They define a `name` attribute")
+             print(" - The modules are imported (e.g. via __init__.py)")
+             return 1
+
+         # 5. Print the spider list
+         print(f"📋 Found {len(spider_names)} spider(s):")
+         print("-" * 50)
+         for name in sorted(spider_names):
+             cls = process.get_spider_class(name)
+             module = cls.__module__.replace(project_package + ".", "")  # shorten the module name
+             print(f"🕷️ {name:<20} {cls.__name__:<25} ({module})")
+         print("-" * 50)
+         return 0
+
+     except Exception as e:
+         print(f"❌ Error listing spiders: {e}")
+         import traceback
+         traceback.print_exc()
+         return 1
+
+
+ if __name__ == '__main__':
+     """
+     Allows running directly:
+         python -m crawlo.commands.list
+     """
+     sys.exit(main(sys.argv[1:]))
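
Both new commands discover spiders the same way: they read crawlo.cfg for a [settings] section with a default = <package>.settings entry, then hand <package>.spiders to CrawlerProcess. A minimal programmatic equivalent of the list command, using the placeholder package name "myproject", could look like this sketch:

    from crawlo.crawler import CrawlerProcess

    # "myproject" stands in for the package name normally derived from crawlo.cfg
    process = CrawlerProcess(spider_modules=["myproject.spiders"])
    for name in sorted(process.get_spider_names()):
        cls = process.get_spider_class(name)
        print(f"{name:<20} {cls.__name__:<25} ({cls.__module__})")
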
crawlo/commands/run.py CHANGED
@@ -1,14 +1,15 @@
- # crawlo/commands/run.py
+ """
+ Command-line entry point: crawlo run <spider_name>
+ Runs the spider with the given name.
+ """
+
  import asyncio
- import importlib
- import sys
  from pathlib import Path
  import configparser

  from crawlo.crawler import CrawlerProcess
  from crawlo.utils.project import get_settings
  from crawlo.utils.log import get_logger
- from crawlo.utils.spider_loader import SpiderLoader


  logger = get_logger(__name__)

@@ -16,24 +17,30 @@ logger = get_logger(__name__)
  def main(args):
      """
      Main function for running the specified spider
-     Usage: crawlo run <spider_name>
+     Usage:
+         crawlo run <spider_name>
+         crawlo run all
      """
      if len(args) < 1:
-         print("Usage: crawlo run <spider_name>")
-         print("Example: crawlo run baidu")
+         print("Usage: crawlo run <spider_name>|all")
+         print("Examples:")
+         print(" crawlo run baidu")
+         print(" crawlo run all")
          return 1

-     spider_name = args[0]
+     spider_arg = args[0]

      try:
          # 1. Get the project root directory
-         project_root = get_settings()
+         project_root = get_settings().get('PROJECT_ROOT')
+         if not project_root:
+             print("❌ Error: Cannot determine project root.")
+             return 1

-         # Add the project root to the Python path
          if str(project_root) not in sys.path:
              sys.path.insert(0, str(project_root))

-         # 2. Read the config file to get the project package name
+         # 2. Read crawlo.cfg to get the project package name
          cfg_file = project_root / 'crawlo.cfg'
          if not cfg_file.exists():
              print(f"❌ Error: crawlo.cfg not found in {project_root}")
@@ -49,27 +56,60 @@ def main(args):
          settings_module = config.get('settings', 'default')
          project_package = settings_module.split('.')[0]

-         # 3. Find and load the Spider with the given name
-         spider_class = find_spider_by_name(project_package, spider_name)
-         if spider_class is None:
+         # 3. Create a CrawlerProcess and auto-discover spider modules
+         spider_modules = [f"{project_package}.spiders"]
+         settings = get_settings()
+         process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
+
+         # === New: support 'all' ===
+         if spider_arg.lower() == "all":
+             spider_names = process.get_spider_names()
+             if not spider_names:
+                 print("❌ No spiders found. Make sure spiders are defined and imported.")
+                 return 1
+
+             print(f"🚀 Starting ALL {len(spider_names)} spiders:")
+             for name in sorted(spider_names):
+                 cls = process.get_spider_class(name)
+                 print(f" 🕷️ {name} ({cls.__name__})")
+             print("-" * 50)
+
+             # Start all spiders
+             asyncio.run(process.crawl(spider_names))
+             return 0
+
+         # === Existing: start a single spider ===
+         spider_name = spider_arg
+         if not process.is_spider_registered(spider_name):
+             print(f"❌ Error: Spider with name '{spider_name}' not found.")
+             available_names = process.get_spider_names()
+             if available_names:
+                 print("💡 Available spiders:")
+                 for name in sorted(available_names):
+                     cls = process.get_spider_class(name)
+                     print(f" - {name} (class: {cls.__name__})")
+             else:
+                 print("💡 No spiders found. Make sure your spider classes are defined and imported.")
              return 1

-         # 4. Create a CrawlerProcess and run a single spider
-         settings = get_settings()
-         process = CrawlerProcess(settings)
+         spider_class = process.get_spider_class(spider_name)

-         print(f"🚀 Starting spider: {spider_class.name}")
+         # Print startup info
+         print(f"🚀 Starting spider: {spider_name}")
          print(f"📁 Project: {project_package}")
          print(f"🕷️ Class: {spider_class.__name__}")
          print("-" * 50)

-         # Run a single spider
-         asyncio.run(process.crawl(spider_class))
+         # Start the spider
+         asyncio.run(process.crawl(spider_name))

          print("-" * 50)
          print("✅ Spider completed successfully!")
          return 0

+     except KeyboardInterrupt:
+         print("\n⚠️ Spider interrupted by user.")
+         return 1
      except Exception as e:
          print(f"❌ Error running spider: {e}")
          import traceback
@@ -77,73 +117,65 @@ def main(args):
          return 1


- def find_spider_by_name(project_package: str, target_spider_name: str):
-     """Find a spider using SpiderLoader"""
-     loader = SpiderLoader(project_package)
-     spider_class = loader.load(target_spider_name)
-
-     if spider_class is None:
-         print(f"❌ Error: Spider with name '{target_spider_name}' not found")
-         print("💡 Available spiders:")
-         available_spiders = loader.list()
-         for spider_name in available_spiders:
-             print(f" - {spider_name}")
-         return None
-
-     return spider_class
-
-
  def list_available_spiders(project_package: str):
      """
-     List all available spiders
+     List all available spiders in the given project package (for debugging or CLI extensions)
      """
-     spiders_dir = Path.cwd() / project_package / 'spiders'
-     if not spiders_dir.exists():
-         print(" No spiders directory found")
-         return
-
-     spider_count = 0
-     for py_file in spiders_dir.glob("*.py"):
-         if py_file.name.startswith('_'):
-             continue
-
-         module_name = py_file.stem
-         spider_module_path = f"{project_package}.spiders.{module_name}"
-
-         try:
-             module = importlib.import_module(spider_module_path)
-         except ImportError:
-             continue
-
-         # Find all Spider subclasses in the module
-         from crawlo.spider import Spider
-         for attr_name in dir(module):
-             attr_value = getattr(module, attr_name)
-             if (isinstance(attr_value, type) and
-                     issubclass(attr_value, Spider) and
-                     attr_value != Spider and
-                     hasattr(attr_value, 'name')):
-                 print(f" - {attr_value.name} (class: {attr_value.__name__}, module: {module_name})")
-                 spider_count += 1
-
-     if spider_count == 0:
-         print(" No spiders found")
-
-
- def run_spider_by_name(spider_name: str, project_root: Path = None):
+     try:
+         # Temporarily create a CrawlerProcess to discover spiders
+         process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
+         available_names = process.get_spider_names()
+
+         if not available_names:
+             print(" No spiders found. Make sure:")
+             print(" - the spiders/ directory exists")
+             print(" - spider classes inherit from Spider and define a name")
+             print(" - the modules are imported (can be triggered via __init__.py)")
+             return
+
+         print(f"Found {len(available_names)} spider(s):")
+         for name in sorted(available_names):
+             cls = process.get_spider_class(name)
+             module = cls.__module__.replace(project_package + ".", "")
+             print(f" - {name} ({cls.__name__} @ {module})")
+     except Exception as e:
+         print(f"❌ Failed to list spiders: {e}")
+         import traceback
+         traceback.print_exc()
+
+
+ def run_spider_by_name(spider_name: str, project_package: str = None):
      """
-     Run a spider by name directly from code
+     Run a spider directly from code (project_package must be provided)
      """
-     if project_root:
-         if str(project_root) not in sys.path:
-             sys.path.insert(0, str(project_root))
+     if project_package is None:
+         # Try to read it from the config file
+         cfg_file = Path('crawlo.cfg')
+         if cfg_file.exists():
+             config = configparser.ConfigParser()
+             config.read(cfg_file, encoding='utf-8')
+             if config.has_option('settings', 'default'):
+                 project_package = config.get('settings', 'default').split('.')[0]
+
+     if not project_package:
+         print("❌ Error: project_package is required.")
+         return 1
+
+     # Add the project path
+     project_root = get_settings().get('PROJECT_ROOT')
+     if project_root and str(project_root) not in sys.path:
+         sys.path.insert(0, str(project_root))

+     # Reuse the logic in main()
      args = [spider_name]
      return main(args)


  if __name__ == '__main__':
-     # Allows running directly: python -m crawlo.commands.run <spider_name>
+     """
+     Allows running directly:
+         python -m crawlo.commands.run <spider_name>
+     """
      import sys

-     sys.exit(main(sys.argv[1:]))
+     sys.exit(main(sys.argv[1:]))
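
The reworked run command now passes spider names, not classes, to process.crawl() inside asyncio.run(). A hedged sketch of the same flow from user code, assuming the project root is already on sys.path and using the placeholder names "myproject" and "baidu":

    import asyncio

    from crawlo.crawler import CrawlerProcess
    from crawlo.utils.project import get_settings

    process = CrawlerProcess(settings=get_settings(),
                             spider_modules=["myproject.spiders"])

    # single spider, as in `crawlo run baidu`
    asyncio.run(process.crawl("baidu"))
    # or every registered spider, as in `crawlo run all`:
    # asyncio.run(process.crawl(process.get_spider_names()))
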
crawlo/commands/stats.py ADDED
@@ -0,0 +1,59 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-08-31 22:36
+ # @Author : crawl-coder
+ # @Desc : Command-line entry point: crawlo stats; shows statistics from recent spider runs.
+ """
+ import sys
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+ # Stats recorded for recently run spiders (example)
+ _LAST_RUN_STATS = {}
+
+
+ def record_stats(crawler):
+     """Record stats after the spider closes (must be called from close)"""
+     if crawler.stats and crawler.spider:
+         _LAST_RUN_STATS[crawler.spider.name] = crawler.stats.get_stats()
+
+
+ def main(args):
+     if len(args) == 0:
+         # Show all recorded stats
+         if not _LAST_RUN_STATS:
+             print("📊 No stats available. Run a spider first.")
+             return 0
+
+         print("📊 Recent Spider Statistics:")
+         print("-" * 60)
+         for spider_name, stats in _LAST_RUN_STATS.items():
+             print(f"🕷️ {spider_name}")
+             for k, v in stats.items():
+                 print(f" {k:<30} {v}")
+             print()
+         return 0
+
+     elif len(args) == 1:
+         spider_name = args[0]
+         if spider_name not in _LAST_RUN_STATS:
+             print(f"📊 No stats found for spider '{spider_name}'")
+             return 1
+
+         stats = _LAST_RUN_STATS[spider_name]
+         print(f"📊 Stats for '{spider_name}':")
+         print("-" * 60)
+         for k, v in stats.items():
+             print(f" {k:<30} {v}")
+         return 0
+
+     else:
+         print("Usage: crawlo stats [spider_name]")
+         return 1
+
+
+ if __name__ == '__main__':
+     sys.exit(main(sys.argv[1:]))
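
Note that _LAST_RUN_STATS lives only in this module's memory, so stats are visible only to the process that called record_stats. A self-contained sketch of that flow is below; the stand-in objects merely mimic the .spider.name and .stats.get_stats() attributes the command reads, and the crawlo.commands.stats module path is assumed from the sibling commands:

    from types import SimpleNamespace

    from crawlo.commands import stats as stats_cmd

    fake_crawler = SimpleNamespace(
        spider=SimpleNamespace(name="example"),
        stats=SimpleNamespace(get_stats=lambda: {"requests": 42, "items_scraped": 10}),
    )
    stats_cmd.record_stats(fake_crawler)  # normally called when the crawler closes
    stats_cmd.main(["example"])           # prints the recorded stats for 'example'
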