crawlo 1.0.8__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

crawlo/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.0.8"
+ __version__ = "1.1.0"
crawlo/commands/__init__.py CHANGED
@@ -1,9 +1,13 @@
- # crawlo/commands/__init__.py
- # Define the available commands
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+
  _commands = {
  'startproject': 'crawlo.commands.startproject',
  'genspider': 'crawlo.commands.genspider',
  'run': 'crawlo.commands.run',
+ 'check': 'crawlo.commands.check',
+ 'list': 'crawlo.commands.list',
+ 'stats': 'crawlo.commands.stats'
  }
 
  def get_commands():
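The new check, list, and stats entries extend this registry, which maps sub-command names to module paths. A minimal sketch of how such a registry is typically resolved is shown below; the dispatch() helper is a hypothetical illustration, not crawlo's actual CLI wiring:

from importlib import import_module

_commands = {
    'startproject': 'crawlo.commands.startproject',
    'genspider': 'crawlo.commands.genspider',
    'run': 'crawlo.commands.run',
    'check': 'crawlo.commands.check',
    'list': 'crawlo.commands.list',
    'stats': 'crawlo.commands.stats',
}

def dispatch(cmd_name, argv):
    # Resolve the registered module path and call its main(args) entry point.
    module_path = _commands.get(cmd_name)
    if module_path is None:
        raise SystemExit(f"Unknown command: {cmd_name!r}. Known: {', '.join(sorted(_commands))}")
    return import_module(module_path).main(argv)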
crawlo/commands/check.py CHANGED
@@ -1,85 +1,130 @@
  #!/usr/bin/python
- # -*- coding:UTF-8 -*-
+ # -*- coding: UTF-8 -*-
  """
- # @Time : 2025-08-31 22:35
- # @Author : crawl-coder
- # @Desc : CLI entry point: crawlo check, verifies that all spider definitions are compliant.
+ # @Time : 2025-08-31 22:35
+ # @Author : crawl-coder
+ # @Desc : CLI entry point: crawlo check, verifies that all spider definitions are compliant.
  """
+
  import sys
  import configparser
+ from pathlib import Path
+ from importlib import import_module
 
  from crawlo.crawler import CrawlerProcess
- from crawlo.utils.project import get_settings
  from crawlo.utils.log import get_logger
 
 
  logger = get_logger(__name__)
 
 
+ def get_project_root():
+ """
+ Walk upwards from the current directory looking for crawlo.cfg to determine the project root.
+ """
+ current = Path.cwd()
+
+ for _ in range(10):
+ cfg = current / "crawlo.cfg"
+ if cfg.exists():
+ return current
+
+ if current == current.parent:
+ break
+ current = current.parent
+
+ return None
+
+
  def main(args):
+ """
+ Main entry point: check all spider definitions for compliance.
+ Usage: crawlo check
+ """
  if args:
- print("Usage: crawlo check")
+ print("Usage: crawlo check")
  return 1
 
  try:
- project_root = get_settings().get('PROJECT_ROOT')
+ # 1. Locate the project root
+ project_root = get_project_root()
  if not project_root:
- print("❌ Error: Cannot determine project root.")
+ print("❌ Error: Cannot find 'crawlo.cfg'. Are you in a crawlo project?")
+ print("💡 Tip: Run this command inside your project directory.")
  return 1
 
- if str(project_root) not in sys.path:
- sys.path.insert(0, str(project_root))
+ project_root_str = str(project_root)
+ if project_root_str not in sys.path:
+ sys.path.insert(0, project_root_str)
 
- cfg_file = project_root / 'crawlo.cfg'
+ # 2. Read crawlo.cfg
+ cfg_file = project_root / "crawlo.cfg"
  if not cfg_file.exists():
- print(f"❌ Error: crawlo.cfg not found in {project_root}")
+ print(f"❌ Error: Expected config file not found: {cfg_file}")
  return 1
 
  config = configparser.ConfigParser()
- config.read(cfg_file, encoding='utf-8')
+ config.read(cfg_file, encoding="utf-8")
 
- if not config.has_section('settings') or not config.has_option('settings', 'default'):
+ if not config.has_section("settings") or not config.has_option("settings", "default"):
  print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
  return 1
 
- settings_module = config.get('settings', 'default')
- project_package = settings_module.split('.')[0]
+ settings_module = config.get("settings", "default")
+ project_package = settings_module.split(".")[0]
+
+ # 3. Make sure the project package is importable
+ try:
+ import_module(project_package)
+ except ImportError as e:
+ print(f"❌ Failed to import project package '{project_package}': {e}")
+ return 1
 
- # Create a CrawlerProcess and discover spiders
- process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
+ # 4. Load the spiders
+ spider_modules = [f"{project_package}.spiders"]
+ process = CrawlerProcess(spider_modules=spider_modules)
  spider_names = process.get_spider_names()
 
  if not spider_names:
  print("📭 No spiders found.")
+ print("💡 Make sure:")
+ print(" • Spiders are defined in the 'spiders' module")
+ print(" • They have a `name` attribute")
+ print(" • Modules are properly imported")
  return 1
 
  print(f"🔍 Checking {len(spider_names)} spider(s)...")
  print("-" * 60)
 
  issues_found = False
+
  for name in sorted(spider_names):
  cls = process.get_spider_class(name)
  issues = []
 
- if not hasattr(cls, 'name') or not cls.name:
+ # Check the name attribute
+ if not getattr(cls, "name", None):
  issues.append("missing or empty 'name' attribute")
  elif not isinstance(cls.name, str):
  issues.append("'name' is not a string")
 
- if not callable(getattr(cls, 'start_requests', None)):
+ # Check that start_requests is callable
+ if not callable(getattr(cls, "start_requests", None)):
  issues.append("missing or non-callable 'start_requests' method")
 
- if hasattr(cls, 'start_urls') and isinstance(cls.start_urls, str):
- issues.append("'start_urls' is a string, should be list/tuple")
+ # Check the type of start_urls (must not be a string)
+ if hasattr(cls, "start_urls") and isinstance(cls.start_urls, str):
+ issues.append("'start_urls' is a string; should be list or tuple")
 
- # Lightweight instantiation check
+ # Instantiate and check the parse method (optional but recommended)
  try:
  spider = cls.create_instance(None)
- if not callable(getattr(spider, 'parse', None)):
- issues.append("no 'parse' method defined (optional but recommended)")
+ if not callable(getattr(spider, "parse", None)):
+ issues.append("no 'parse' method defined (recommended)")
  except Exception as e:
- issues.append(f"failed to create instance: {e}")
+ issues.append(f"failed to instantiate spider: {e}")
 
+ # Report the results
  if issues:
  print(f"❌ {name:<20} {cls.__name__}")
  for issue in issues:
@@ -89,19 +134,23 @@ def main(args):
  print(f"✅ {name:<20} {cls.__name__} (OK)")
 
  print("-" * 60)
+
  if issues_found:
  print("⚠️ Some spiders have issues. Please fix them.")
  return 1
  else:
- print("🎉 All spiders are compliant!")
+ print("🎉 All spiders are compliant and well-defined!")
  return 0
 
  except Exception as e:
- print(f"❌ Error during check: {e}")
- import traceback
- traceback.print_exc()
+ print(f"❌ Unexpected error during check: {e}")
+ logger.exception("Exception in 'crawlo check'")
  return 1
 
 
- if __name__ == '__main__':
+ if __name__ == "__main__":
+ """
+ Supports direct execution:
+ python -m crawlo.commands.check
+ """
  sys.exit(main(sys.argv[1:]))
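For reference, a spider shaped like the sketch below would satisfy every check performed above. It is a hypothetical example: the Spider base-class import path is assumed from the hint printed by the new 'crawlo list' command, and how requests are actually built is framework specific, so the sketch defers to the base implementation:

from crawlo.spider import Spider  # base-class path assumed from the hint printed by 'crawlo list'


class ExampleSpider(Spider):
    """A spider shaped to satisfy every check performed by 'crawlo check'."""

    name = "example"                      # required: non-empty string
    start_urls = ["https://example.com"]  # must be a list/tuple, never a bare string

    def start_requests(self):
        # must exist and be callable; request construction is framework specific,
        # so this sketch simply defers to the base-class behaviour
        return super().start_requests()

    def parse(self, response):
        # optional, but 'crawlo check' flags its absence as a recommendation
        yield {"url": response.url}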
crawlo/commands/list.py CHANGED
@@ -1,92 +1,119 @@
  #!/usr/bin/python
- # -*- coding:UTF-8 -*-
+ # -*- coding: UTF-8 -*-
  """
- # @Time : 2025-08-31 22:33
- # @Author : crawl-coder
- # @Desc : CLI entry point: crawlo list, lists all registered spiders
+ # @Time : 2025-08-31 22:33
+ # @Author : crawl-coder
+ # @Desc : CLI entry point: crawlo list, lists all registered spiders
  """
+
  import sys
  import configparser
+ from pathlib import Path
+ from importlib import import_module
 
  from crawlo.crawler import CrawlerProcess
- from crawlo.utils.project import get_settings
  from crawlo.utils.log import get_logger
 
 
  logger = get_logger(__name__)
 
 
+ def get_project_root():
+ """
+ Auto-detect the project root: walk upwards from the current directory looking for crawlo.cfg.
+ Returns the directory path (as a string) once found; searches at most 10 levels up.
+ """
+ current = Path.cwd()
+
+ for _ in range(10):
+ cfg = current / "crawlo.cfg"
+ if cfg.exists():
+ return str(current)
+
+ # Reached the filesystem root
+ if current == current.parent:
+ break
+ current = current.parent
+
+ return None # not found
+
+
  def main(args):
  """
- List all available spiders
+ Main entry point: list all available spiders
  Usage: crawlo list
  """
  if args:
- print("Usage: crawlo list")
+ print("Usage: crawlo list")
  return 1
 
  try:
- # 1. Get the project root
- project_root = get_settings().get('PROJECT_ROOT')
+ # 1. Locate the project root
+ project_root = get_project_root()
  if not project_root:
- print("❌ Error: Cannot determine project root.")
+ print("❌ Error: Cannot find 'crawlo.cfg'. Are you in a crawlo project?")
+ print("💡 Tip: Run this command inside your project directory, or create a project with 'crawlo startproject'.")
  return 1
 
- # Add the project root to sys.path
- project_root_str = str(project_root)
+ project_root_path = Path(project_root)
+ project_root_str = str(project_root_path)
+
+ # 2. Add the project root to the Python path so project modules can be imported
  if project_root_str not in sys.path:
  sys.path.insert(0, project_root_str)
 
- # 2. Read crawlo.cfg to get the project package name
- cfg_file = project_root / 'crawlo.cfg'
- if not cfg_file.exists():
- print(f"❌ Error: crawlo.cfg not found in {project_root}")
- return 1
-
+ # 3. Read crawlo.cfg to get the settings module
+ cfg_file = project_root_path / "crawlo.cfg"
  config = configparser.ConfigParser()
- config.read(cfg_file, encoding='utf-8')
+ config.read(cfg_file, encoding="utf-8")
 
- if not config.has_section('settings') or not config.has_option('settings', 'default'):
- print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+ if not config.has_section("settings") or not config.has_option("settings", "default"):
+ print("❌ Error: Invalid crawlo.cfg — missing [settings] or 'default' option.")
  return 1
 
- settings_module = config.get('settings', 'default')
- project_package = settings_module.split('.')[0]
+ settings_module = config.get("settings", "default")
+ project_package = settings_module.split(".")[0]
+
+ # 4. Make sure the project package is importable (optional: import it to surface errors early)
+ try:
+ import_module(project_package)
+ except ImportError as e:
+ print(f"❌ Failed to import project package '{project_package}': {e}")
+ return 1
 
- # 3. Create a CrawlerProcess and auto-discover spiders
+ # 5. Initialize CrawlerProcess and load the spider modules
  spider_modules = [f"{project_package}.spiders"]
  process = CrawlerProcess(spider_modules=spider_modules)
 
- # 4. Collect information about all spiders
+ # 6. Get all spider names
  spider_names = process.get_spider_names()
  if not spider_names:
- print("📭 No spiders found.")
+ print("📭 No spiders found in 'spiders/' directory.")
  print("💡 Make sure:")
- print(" - Your spider classes inherit from `Spider`")
- print(" - They define a `name` attribute")
- print(" - The modules are imported (e.g. via __init__.py)")
+ print(" Spider classes inherit from `crawlo.spider.Spider`")
+ print(" Each spider has a `name` attribute")
+ print(" Spiders are imported in `spiders/__init__.py` (if using package)")
  return 1
 
- # 5. Print the spider list
+ # 7. Print the spider list
  print(f"📋 Found {len(spider_names)} spider(s):")
- print("-" * 50)
+ print("-" * 60)
  for name in sorted(spider_names):
- cls = process.get_spider_class(name)
- module = cls.__module__.replace(project_package + ".", "") # shorten the module name
- print(f"🕷️ {name:<20} {cls.__name__:<25} ({module})")
- print("-" * 50)
+ spider_cls = process.get_spider_class(name)
+ module_name = spider_cls.__module__.replace(f"{project_package}.", "")
+ print(f"🕷️ {name:<20} {spider_cls.__name__:<25} ({module_name})")
+ print("-" * 60)
  return 0
 
  except Exception as e:
- print(f"❌ Error listing spiders: {e}")
- import traceback
- traceback.print_exc()
+ print(f"❌ Unexpected error: {e}")
+ logger.exception("Exception during 'crawlo list'")
  return 1
 
 
- if __name__ == '__main__':
+ if __name__ == "__main__":
  """
- Can be run directly:
+ Supports direct execution:
  python -m crawlo.commands.list
  """
  sys.exit(main(sys.argv[1:]))
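The new commands all rely on the same project discovery: walk up the directory tree until crawlo.cfg is found, then read its [settings] section and import the package named there. A project that satisfies this lookup would be laid out roughly as follows (the package name myproject is a placeholder, not something defined in this diff):

# crawlo.cfg, located at the project root
[settings]
default = myproject.settings

# matching package layout assumed by the commands above:
# myproject/
#     __init__.py
#     settings.py
#     spiders/
#         __init__.py
#         example.py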
crawlo/commands/run.py CHANGED
@@ -1,106 +1,153 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
  """
- CLI entry point: crawlo run <spider_name>
- Runs the spider with the given name.
+ # @Time : 2025-08-31 22:36
+ # @Author : crawl-coder
+ # @Desc : CLI entry point: crawlo run <spider_name>|all, runs the specified spider(s).
  """
-
+ import sys
  import asyncio
- from pathlib import Path
  import configparser
+ from pathlib import Path
+ from importlib import import_module
 
  from crawlo.crawler import CrawlerProcess
- from crawlo.utils.project import get_settings
  from crawlo.utils.log import get_logger
+ from crawlo.utils.project import get_settings
+ from crawlo.commands.stats import record_stats # automatically record stats
 
  logger = get_logger(__name__)
 
 
+ def get_project_root():
+ """
+ Walk upwards looking for crawlo.cfg to determine the project root.
+ """
+ current = Path.cwd()
+
+ for _ in range(10):
+ cfg = current / "crawlo.cfg"
+ if cfg.exists():
+ return current
+
+ if current == current.parent:
+ break
+ current = current.parent
+
+ return None
+
+
  def main(args):
  """
- Main entry point for running the specified spider
+ Main entry point: run the specified spider
  Usage:
  crawlo run <spider_name>
  crawlo run all
  """
  if len(args) < 1:
- print("Usage: crawlo run <spider_name>|all")
- print("Examples:")
- print(" crawlo run baidu")
- print(" crawlo run all")
+ print("Usage: crawlo run <spider_name>|all")
+ print("💡 Examples:")
+ print(" crawlo run baidu")
+ print(" crawlo run all")
  return 1
 
  spider_arg = args[0]
 
  try:
- # 1. Get the project root
- project_root = get_settings().get('PROJECT_ROOT')
+ # 1. Locate the project root
+ project_root = get_project_root()
  if not project_root:
- print("❌ Error: Cannot determine project root.")
+ print("❌ Error: Cannot find 'crawlo.cfg'. Are you in a crawlo project?")
+ print("💡 Tip: Run this command inside your project directory.")
  return 1
 
- if str(project_root) not in sys.path:
- sys.path.insert(0, str(project_root))
+ project_root_str = str(project_root)
+ if project_root_str not in sys.path:
+ sys.path.insert(0, project_root_str)
 
- # 2. Read crawlo.cfg to get the project package name
- cfg_file = project_root / 'crawlo.cfg'
+ # 2. Read crawlo.cfg to get the settings module
+ cfg_file = project_root / "crawlo.cfg"
  if not cfg_file.exists():
  print(f"❌ Error: crawlo.cfg not found in {project_root}")
  return 1
 
  config = configparser.ConfigParser()
- config.read(cfg_file, encoding='utf-8')
+ config.read(cfg_file, encoding="utf-8")
 
- if not config.has_section('settings') or not config.has_option('settings', 'default'):
+ if not config.has_section("settings") or not config.has_option("settings", "default"):
  print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
  return 1
 
- settings_module = config.get('settings', 'default')
- project_package = settings_module.split('.')[0]
+ settings_module = config.get("settings", "default")
+ project_package = settings_module.split(".")[0]
 
- # 3. Create a CrawlerProcess and auto-discover the spider modules
+ # 3. Make sure the project package is importable
+ try:
+ import_module(project_package)
+ except ImportError as e:
+ print(f"❌ Failed to import project package '{project_package}': {e}")
+ return 1
+
+ # 4. Load settings and the spider modules
+ settings = get_settings() # safe at this point
  spider_modules = [f"{project_package}.spiders"]
- settings = get_settings()
  process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
 
- # === New: support 'all' ===
+ # === Case 1: run all spiders ===
  if spider_arg.lower() == "all":
  spider_names = process.get_spider_names()
  if not spider_names:
- print("❌ No spiders found. Make sure spiders are defined and imported.")
+ print("❌ No spiders found.")
+ print("💡 Make sure:")
+ print(" • Spiders are defined in 'spiders/'")
+ print(" • They have a `name` attribute")
+ print(" • Modules are imported (e.g. via __init__.py)")
  return 1
 
- print(f"🚀 Starting ALL {len(spider_names)} spiders:")
+ print(f"🚀 Starting ALL {len(spider_names)} spider(s):")
+ print("-" * 60)
  for name in sorted(spider_names):
  cls = process.get_spider_class(name)
- print(f" 🕷️ {name} ({cls.__name__})")
- print("-" * 50)
+ print(f"🕷️ {name:<20} {cls.__name__}")
+ print("-" * 60)
+
+ # Register stats recording (saved when each spider finishes)
+ for crawler in process.crawlers:
+ crawler.signals.connect(record_stats, signal="spider_closed")
 
- # Start all spiders
+ # Run all spiders in parallel (switch to sequential with: for name in ... await process.crawl(name))
  asyncio.run(process.crawl(spider_names))
+ print("✅ All spiders completed.")
  return 0
 
- # === Existing: start a single spider ===
+ # === Case 2: run a single spider ===
  spider_name = spider_arg
  if not process.is_spider_registered(spider_name):
- print(f"❌ Error: Spider with name '{spider_name}' not found.")
- available_names = process.get_spider_names()
- if available_names:
+ print(f"❌ Spider '{spider_name}' not found.")
+ available = process.get_spider_names()
+ if available:
  print("💡 Available spiders:")
- for name in sorted(available_names):
+ for name in sorted(available):
  cls = process.get_spider_class(name)
- print(f" - {name} (class: {cls.__name__})")
+ print(f" {name} ({cls.__name__})")
  else:
- print("💡 No spiders found. Make sure your spider classes are defined and imported.")
+ print("💡 No spiders found. Check your spiders module.")
  return 1
 
  spider_class = process.get_spider_class(spider_name)
 
  # Print startup info
  print(f"🚀 Starting spider: {spider_name}")
- print(f"📁 Project: {project_package}")
- print(f"🕷️ Class: {spider_class.__name__}")
+ print(f"📦 Project: {project_package}")
+ print(f"CppClass: {spider_class.__name__}")
+ print(f"📄 Module: {spider_class.__module__}")
  print("-" * 50)
 
- # Start the spider
+ # Register stats recording
+ for crawler in process.crawlers:
+ crawler.signals.connect(record_stats, signal="spider_closed")
+
+ # Run the spider
  asyncio.run(process.crawl(spider_name))
 
  print("-" * 50)
@@ -111,71 +158,14 @@ def main(args):
  print("\n⚠️ Spider interrupted by user.")
  return 1
  except Exception as e:
- print(f"❌ Error running spider: {e}")
- import traceback
- traceback.print_exc()
+ print(f"❌ Unexpected error: {e}")
+ logger.exception("Exception during 'crawlo run'")
  return 1
 
 
- def list_available_spiders(project_package: str):
- """
- List all available spiders in the given project package (for debugging or CLI extensions)
- """
- try:
- # Temporarily create a CrawlerProcess to discover spiders
- process = CrawlerProcess(spider_modules=[f"{project_package}.spiders"])
- available_names = process.get_spider_names()
-
- if not available_names:
- print(" No spiders found. Make sure:")
- print(" - the spiders/ directory exists")
- print(" - spider classes inherit from Spider and define a name")
- print(" - modules are imported (can be triggered via __init__.py)")
- return
-
- print(f"Found {len(available_names)} spider(s):")
- for name in sorted(available_names):
- cls = process.get_spider_class(name)
- module = cls.__module__.replace(project_package + ".", "")
- print(f" - {name} ({cls.__name__} @ {module})")
- except Exception as e:
- print(f"❌ Failed to list spiders: {e}")
- import traceback
- traceback.print_exc()
-
-
- def run_spider_by_name(spider_name: str, project_package: str = None):
+ if __name__ == "__main__":
  """
- Run a spider directly from code (project_package must be provided)
+ Supports direct execution:
+ python -m crawlo.commands.run spider_name
  """
- if project_package is None:
- # Try to read it from the config
- cfg_file = Path('crawlo.cfg')
- if cfg_file.exists():
- config = configparser.ConfigParser()
- config.read(cfg_file, encoding='utf-8')
- if config.has_option('settings', 'default'):
- project_package = config.get('settings', 'default').split('.')[0]
-
- if not project_package:
- print("❌ Error: project_package is required.")
- return 1
-
- # Add the project path
- project_root = get_settings().get('PROJECT_ROOT')
- if project_root and str(project_root) not in sys.path:
- sys.path.insert(0, str(project_root))
-
- # Reuse the main() logic
- args = [spider_name]
- return main(args)
-
-
- if __name__ == '__main__':
- """
- Can be run directly:
- python -m crawlo.commands.run <spider_name>
- """
- import sys
-
- sys.exit(main(sys.argv[1:]))
+ sys.exit(main(sys.argv[1:]))
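check.py, list.py, and run.py now each carry their own copy of get_project_root() (the list command returns a string, the other two return a Path). If that duplication were ever factored out, a shared helper could look like the sketch below; the module location, name, and signature are assumptions for illustration, not something this release ships:

from pathlib import Path
from typing import Optional


def find_project_root(start: Optional[Path] = None, max_levels: int = 10) -> Optional[Path]:
    """Walk upwards from 'start' (default: cwd) looking for crawlo.cfg; return its directory or None."""
    current = (start or Path.cwd()).resolve()
    for _ in range(max_levels):
        if (current / "crawlo.cfg").exists():
            return current
        if current == current.parent:  # reached the filesystem root
            return None
        current = current.parent
    return None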
crawlo/commands/stats.py CHANGED
@@ -1,59 +1,167 @@
  #!/usr/bin/python
- # -*- coding:UTF-8 -*-
+ # -*- coding: UTF-8 -*-
  """
- # @Time : 2025-08-31 22:36
- # @Author : crawl-coder
- # @Desc : CLI entry point: crawlo stats, view statistics from recent spider runs.
+ # @Time : 2025-08-31 22:36
+ # @Author : crawl-coder
+ # @Desc : CLI entry point: crawlo stats, view statistics from recent spider runs.
  """
+
  import sys
+ import json
+ from pathlib import Path
+ from datetime import datetime
+ from typing import Dict, Any
+
  from crawlo.utils.log import get_logger
 
 
  logger = get_logger(__name__)
 
- # Stats from the most recently run spiders (example)
- _LAST_RUN_STATS = {}
+ # Default storage directory (relative to the project root)
+ STATS_DIR = "logs/stats"
+
+
+ def get_stats_dir() -> Path:
+ """
+ Get the directory where stats files are stored, preferring logs/stats/ under the project root.
+ Falls back to the current directory when not inside a project.
+ """
+ # Try to locate the project root (via crawlo.cfg)
+ current = Path.cwd()
+ for _ in range(10):
+ if (current / "crawlo.cfg").exists():
+ return current / STATS_DIR
+ if current == current.parent:
+ break
+ current = current.parent
+
+ # Fallback: use logs/stats under the current directory
+ return Path.cwd() / STATS_DIR
 
 
  def record_stats(crawler):
- """Record stats after the spider closes (must be called from close)."""
- if crawler.stats and crawler.spider:
- _LAST_RUN_STATS[crawler.spider.name] = crawler.stats.get_stats()
+ """
+ [Called from the running crawler] Write the spider's final stats to a JSON file.
+ Must be called from the Crawler's closed callback.
+ """
+ spider_name = getattr(crawler.spider, "name", "unknown")
+ stats = crawler.stats.get_stats() if crawler.stats else {}
+
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ stats_dir = Path(get_stats_dir())
+ stats_dir.mkdir(parents=True, exist_ok=True)
+
+ filename = stats_dir / f"{spider_name}_{timestamp}.json"
+ try:
+ with open(filename, "w", encoding="utf-8") as f:
+ json.dump({
+ "spider": spider_name,
+ "timestamp": datetime.now().isoformat(),
+ "stats": stats
+ }, f, ensure_ascii=False, indent=2, default=str)
+ logger.info(f"📊 Stats saved for spider '{spider_name}' → {filename}")
+ except Exception as e:
+ logger.error(f"Failed to save stats for '{spider_name}': {e}")
+
+
+ def load_all_stats() -> Dict[str, list]:
+ """
+ Load all saved stats files, grouped by spider name.
+ Returns: {spider_name: [stats_record, ...]}
+ """
+ stats_dir = get_stats_dir()
+ if not stats_dir.exists():
+ return {}
+
+ result = {}
+ json_files = sorted(stats_dir.glob("*.json"), key=lambda x: x.stat().st_mtime, reverse=True)
+
+ for file in json_files:
+ try:
+ with open(file, "r", encoding="utf-8") as f:
+ data = json.load(f)
+ spider_name = data.get("spider", "unknown")
+ result.setdefault(spider_name, []).append(data)
+ except Exception as e:
+ logger.warning(f"Failed to load stats file {file}: {e}")
+ return result
+
+
+ def format_value(v: Any) -> str:
+ """Format a value so it is not overly long or unprintable."""
+ if isinstance(v, float):
+ return f"{v:.4f}"
+ return str(v)
 
 
  def main(args):
- if len(args) == 0:
- # Show all historical stats
- if not _LAST_RUN_STATS:
- print("📊 No stats available. Run a spider first.")
- return 0
+ """
+ Main entry point: view statistics
+ Usage:
+ crawlo stats                  → show the last run of every spider
+ crawlo stats myspider         → show the last run of the given spider
+ crawlo stats myspider --all   → show every recorded run of the given spider
+ """
+ if len(args) > 2:
+ print("Usage: crawlo stats [spider_name] [--all]")
+ return 1
+
+ spider_name = None
+ show_all = False
+
+ if args:
+ spider_name = args[0]
+ show_all = "--all" in args or "-a" in args
+
+ # Load all stats
+ all_stats = load_all_stats()
+ if not all_stats:
+ print("📊 No stats found. Run a spider first.")
+ print(f"💡 Stats are saved in: {get_stats_dir()}")
+ return 0
 
- print("📊 Recent Spider Statistics:")
+ if not spider_name:
+ # Show the last run of each spider
+ print("📊 Recent Spider Statistics (last run):")
  print("-" * 60)
- for spider_name, stats in _LAST_RUN_STATS.items():
- print(f"🕷️ {spider_name}")
- for k, v in stats.items():
- print(f" {k:<30} {v}")
+ for name, runs in all_stats.items():
+ latest = runs[0]
+ print(f"🕷️ {name} ({latest['timestamp'][:19]})")
+ stats = latest["stats"]
+ for k in sorted(stats.keys()):
+ print(f" {k:<30} {format_value(stats[k])}")
  print()
  return 0
 
- elif len(args) == 1:
- spider_name = args[0]
- if spider_name not in _LAST_RUN_STATS:
+ else:
+ # Show the specified spider
+ if spider_name not in all_stats:
  print(f"📊 No stats found for spider '{spider_name}'")
+ available = ', '.join(all_stats.keys())
+ if available:
+ print(f"💡 Available spiders: {available}")
  return 1
 
- stats = _LAST_RUN_STATS[spider_name]
- print(f"📊 Stats for '{spider_name}':")
+ runs = all_stats[spider_name]
+ if show_all:
+ print(f"📊 All runs for '{spider_name}' ({len(runs)} runs):")
+ else:
+ runs = runs[:1]
+ print(f"📊 Last run for '{spider_name}':")
+
  print("-" * 60)
- for k, v in stats.items():
- print(f" {k:<30} {v}")
+ for run in runs:
+ print(f"⏱️ Timestamp: {run['timestamp']}")
+ stats = run["stats"]
+ for k in sorted(stats.keys()):
+ print(f" {k:<30} {format_value(stats[k])}")
+ print("─" * 60)
  return 0
 
- else:
- print("Usage: crawlo stats [spider_name]")
- return 1
-
 
- if __name__ == '__main__':
+ if __name__ == "__main__":
+ """
+ Supports direct execution:
+ python -m crawlo.commands.stats
+ """
  sys.exit(main(sys.argv[1:]))
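Based on the json.dump call in record_stats(), each run produces one file named <spider>_<YYYYmmdd_HHMMSS>.json under logs/stats/ with the shape below. The individual stat keys depend on crawlo's stats collector and are only illustrative here:

{
  "spider": "example",
  "timestamp": "2025-08-31T22:40:12.345678",
  "stats": {
    "downloader/request_count": 120,
    "item_scraped_count": 118,
    "elapsed_time_seconds": 42.7
  }
}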
crawlo/core/engine.py CHANGED
@@ -42,7 +42,7 @@ class Engine(object):
  def engine_start(self):
  self.running = True
  self.logger.info(
- f"Crawlo (version {self.settings.get_int('VERSION')}) started. "
+ f"Crawlo (version {self.settings.get_float('VERSION')}) started. "
  f"(project name : {self.settings.get('PROJECT_NAME')})"
  )
 
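The only engine change is that the startup banner now reads the VERSION setting with get_float() instead of get_int(). A minimal sketch of the difference, assuming these settings getters simply wrap int() and float() (their real implementations are not part of this diff):

class SettingsLike(dict):
    """Stand-in for a settings object, assuming plain int()/float() coercion."""

    def get_int(self, key, default=0):
        value = self.get(key, default)
        return int(value) if value is not None else default

    def get_float(self, key, default=0.0):
        value = self.get(key, default)
        return float(value) if value is not None else default


settings = SettingsLike(VERSION=1.1, PROJECT_NAME="demo")
print(settings.get_int("VERSION"))    # 1   (truncates the minor part)
print(settings.get_float("VERSION"))  # 1.1 (keeps it, matching the new log line)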
crawlo-1.0.8.dist-info/METADATA → crawlo-1.1.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlo
- Version: 1.0.8
+ Version: 1.1.0
  Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
  Home-page: https://github.com/crawl-coder/Crawlo.git
  Author: crawl-coder
crawlo-1.0.8.dist-info/RECORD → crawlo-1.1.0.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
  crawlo/__init__.py,sha256=xpiIAZbSG3CzneJuDLPCbwfRcvw2wyHYl2kJjaNfNGY,584
- crawlo/__version__.py,sha256=uyL3a6o1xccXPZ2OS65zqIN_lbEMT7PcCxErq7cuWwA,23
+ crawlo/__version__.py,sha256=Zrv57EzpjdsuSPqsYvFkVsQKKRUOHFG7yURCf7qN-Tk,23
  crawlo/cli.py,sha256=hjAJKx9pba375sATvvcy-dtZyBIgXj8fRBq9RFIZHA4,1206
  crawlo/crawler.py,sha256=AyKxUyJvCwb1u4d3Zn3vFmjH28ExWKIygfTICps-3yY,20026
  crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
@@ -7,15 +7,15 @@ crawlo/exceptions.py,sha256=xdyZkvVcLEJ-19sWMHvn9IJsu30-hAY2jJhA2kYIims,1207
  crawlo/stats_collector.py,sha256=v4jC9BAe-23w93hWzbeMCCgQ9VuFPyxw5JV9ItbGH8w,1636
  crawlo/subscriber.py,sha256=udlHeTR0ymGQhCDxVUGwUzeeeR4TYCEJrJwFnkgr0cU,3836
  crawlo/task_manager.py,sha256=D9m-nqnGj-FZPtGk4CdwZX3Gw7IWyYvTS7CHpRGWc_w,748
- crawlo/commands/__init__.py,sha256=dRu3ipuhDM7M1eTb6zJtQZ_u7N_tZumGfH5_I92xno8,252
- crawlo/commands/check.py,sha256=Q8wFjIo43XW0wP93TTlM7HSShgytJsbSWHIlmkcNxz0,3585
+ crawlo/commands/__init__.py,sha256=kZ3qATqDPmMUCNUQSFfBfIA8fp_1dgBwIAWbmFN3_To,355
+ crawlo/commands/check.py,sha256=He5Dmpn8M0gYEfiXRW801I6ULypWKMvT5Iwjg_4cUYE,5070
  crawlo/commands/genspider.py,sha256=kSHYsAGHRoxU6Qf_MGpR_VS-Ua5NUGY2KGm_Wapn0sw,3529
- crawlo/commands/list.py,sha256=itR05muZlZs8FbRh88kOhcRbZc77OXiR6A86UnVhSMY,2974
- crawlo/commands/run.py,sha256=s6JJC8HNa-tBgPDB2BPUmj26D7PMckhlx4AOEz57ESY,6197
+ crawlo/commands/list.py,sha256=iwd1piFYa7cr4WkRTD0ndCZEN0xoZX0vvlWTU1FbSYE,3972
+ crawlo/commands/run.py,sha256=ppgEUNVNuhpQFiBkgB6ZFAKeOJiLybd68gGHcAJgF4w,5813
  crawlo/commands/startproject.py,sha256=1KOq_CALy01oklr0dAUYhGFzu4f7w45q2H0O3qafLX4,3494
- crawlo/commands/stats.py,sha256=rH0TlD0o-xUr9RxtvNYgnSjHHoRyma3rvx9Q9nIGDNg,1659
+ crawlo/commands/stats.py,sha256=siuCv2PGhr0_eqAaER2YYwI_IHmOlFbgIHWmX3-EWs4,5246
  crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
- crawlo/core/engine.py,sha256=JFHooPp-5cfHSyxEh87nOOR5NMaPLVDfNSqAsbtx4PM,6030
+ crawlo/core/engine.py,sha256=SoTVS3F2EI1G_zQVe9UbeUz8cBhyVFlxJ-HuhPD3ock,6032
  crawlo/core/processor.py,sha256=oHLs-cno0bJGTNc9NGD2S7_2-grI3ruvggO0SY2mf3Q,1180
  crawlo/core/scheduler.py,sha256=ZMPs4LSs69FsFfDTvaOMJKqpSQQGvIEE9pMyYVVAA64,1948
  crawlo/downloader/__init__.py,sha256=72u2Hef4HaMfs9VCqEjbMtiaRXbaXmgNiJn6qy09LHs,2384
@@ -81,7 +81,7 @@ examples/gxb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  examples/gxb/items.py,sha256=3-1Lxpi7EqMzheDJoO0MPyHky5nHG_nqQGgKlm8y6mQ,989
  examples/gxb/run.py,sha256=9kJlR8f-tZ3BqP5PW7sCLTw6PAFWo3x4cG5lc-6GWqI,333
  examples/gxb/settings.py,sha256=_nbXj9HV2e0F6liUzK0ueygLcaMM_IUlkuwL6mJqUfc,2345
- examples/gxb/spider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ examples/gxb/spider/__init__.py,sha256=E5bYTAuqcy2KBgnZnZ7OoW7mE6YUIy2w748zCrE68nI,92
  examples/gxb/spider/miit_spider.py,sha256=tcQnuyUHfu-Re1QbKKSI9DXW3Sp1vyBW8qBzKLf_RC4,6666
  examples/gxb/spider/telecom_device.py,sha256=58iG6BQtQjjDHOF7-DXH0u5_XnppP5AJTQwaVJVyBEo,4929
  tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
@@ -90,8 +90,8 @@ tests/test_proxy_middleware_integration.py,sha256=mTPK_XvbmLCV_QoVZzA3ybWOOX6149
  tests/test_proxy_providers.py,sha256=u_R2fhab90vqvQEaOAztpAOe9tJXvUMIdoDxmStmXJ4,1749
  tests/test_proxy_stats.py,sha256=ES00CEoDITYPFBGPk8pecFzD3ItYIv6NSpcqNd8-kvo,526
  tests/test_proxy_strategies.py,sha256=9Z1pXmTNyw-eIhGXlf2abZbJx6igLohYq-_3hldQ5uE,1868
- crawlo-1.0.8.dist-info/METADATA,sha256=ia-nA0g0Rl76iHFIlvaRbvUnjd88KEKoxIrJKcjtCyw,1825
- crawlo-1.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- crawlo-1.0.8.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
- crawlo-1.0.8.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
- crawlo-1.0.8.dist-info/RECORD,,
+ crawlo-1.1.0.dist-info/METADATA,sha256=WTcM-8FqMpTLvIAPGvruLzPCBZnY3ODYklhnv7eVS70,1825
+ crawlo-1.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ crawlo-1.1.0.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+ crawlo-1.1.0.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+ crawlo-1.1.0.dist-info/RECORD,,
examples/gxb/spider/__init__.py CHANGED
@@ -0,0 +1,2 @@
+ from .miit_spider import MiitSpider
+ from .telecom_device import TelecomDeviceLicensesSpider