crawlo 1.0.8.tar.gz → 1.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (112)
  1. {crawlo-1.0.8/crawlo.egg-info → crawlo-1.1.0}/PKG-INFO +1 -1
  2. crawlo-1.1.0/crawlo/__version__.py +1 -0
  3. crawlo-1.1.0/crawlo/commands/__init__.py +14 -0
  4. crawlo-1.1.0/crawlo/commands/check.py +156 -0
  5. crawlo-1.1.0/crawlo/commands/list.py +119 -0
  6. crawlo-1.1.0/crawlo/commands/run.py +171 -0
  7. crawlo-1.1.0/crawlo/commands/stats.py +167 -0
  8. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/core/engine.py +1 -1
  9. {crawlo-1.0.8 → crawlo-1.1.0/crawlo.egg-info}/PKG-INFO +1 -1
  10. crawlo-1.1.0/examples/gxb/spider/__init__.py +2 -0
  11. crawlo-1.0.8/crawlo/__version__.py +0 -1
  12. crawlo-1.0.8/crawlo/commands/__init__.py +0 -10
  13. crawlo-1.0.8/crawlo/commands/check.py +0 -107
  14. crawlo-1.0.8/crawlo/commands/list.py +0 -92
  15. crawlo-1.0.8/crawlo/commands/run.py +0 -181
  16. crawlo-1.0.8/crawlo/commands/stats.py +0 -59
  17. crawlo-1.0.8/examples/gxb/spider/__init__.py +0 -0
  18. {crawlo-1.0.8 → crawlo-1.1.0}/LICENSE +0 -0
  19. {crawlo-1.0.8 → crawlo-1.1.0}/MANIFEST.in +0 -0
  20. {crawlo-1.0.8 → crawlo-1.1.0}/README.md +0 -0
  21. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/__init__.py +0 -0
  22. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/cli.py +0 -0
  23. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/commands/genspider.py +0 -0
  24. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/commands/startproject.py +0 -0
  25. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/core/__init__.py +0 -0
  26. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/core/processor.py +0 -0
  27. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/core/scheduler.py +0 -0
  28. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/crawler.py +0 -0
  29. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/downloader/__init__.py +0 -0
  30. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/downloader/aiohttp_downloader.py +0 -0
  31. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/downloader/cffi_downloader.py +0 -0
  32. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/downloader/httpx_downloader.py +0 -0
  33. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/event.py +0 -0
  34. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/exceptions.py +0 -0
  35. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/extension/__init__.py +0 -0
  36. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/extension/log_interval.py +0 -0
  37. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/extension/log_stats.py +0 -0
  38. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/extension/logging_extension.py +0 -0
  39. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/filters/__init__.py +0 -0
  40. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/filters/aioredis_filter.py +0 -0
  41. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/filters/memory_filter.py +0 -0
  42. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/items/__init__.py +0 -0
  43. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/items/base.py +0 -0
  44. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/items/fields.py +0 -0
  45. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/items/items.py +0 -0
  46. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/__init__.py +0 -0
  47. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/default_header.py +0 -0
  48. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/download_delay.py +0 -0
  49. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/middleware_manager.py +0 -0
  50. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/proxy.py +0 -0
  51. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/request_ignore.py +0 -0
  52. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/response_code.py +0 -0
  53. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/response_filter.py +0 -0
  54. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/middleware/retry.py +0 -0
  55. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/network/__init__.py +0 -0
  56. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/network/request.py +0 -0
  57. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/network/response.py +0 -0
  58. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/pipelines/__init__.py +0 -0
  59. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/pipelines/console_pipeline.py +0 -0
  60. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/pipelines/mongo_pipeline.py +0 -0
  61. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/pipelines/mysql_batch_pipline.py +0 -0
  62. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/pipelines/mysql_pipeline.py +0 -0
  63. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/pipelines/pipeline_manager.py +0 -0
  64. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/settings/__init__.py +0 -0
  65. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/settings/default_settings.py +0 -0
  66. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/settings/setting_manager.py +0 -0
  67. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/spider/__init__.py +0 -0
  68. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/stats_collector.py +0 -0
  69. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/subscriber.py +0 -0
  70. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/task_manager.py +0 -0
  71. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  72. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/project/__init__.py.tmpl +0 -0
  73. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/project/items.py.tmpl +0 -0
  74. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/project/middlewares.py.tmpl +0 -0
  75. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/project/pipelines.py.tmpl +0 -0
  76. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/project/settings.py.tmpl +0 -0
  77. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
  78. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/templates/spider/spider.py.tmpl +0 -0
  79. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/__init__.py +0 -0
  80. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/concurrency_manager.py +0 -0
  81. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/date_tools.py +0 -0
  82. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/db_helper.py +0 -0
  83. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/func_tools.py +0 -0
  84. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/log.py +0 -0
  85. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/pqueue.py +0 -0
  86. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/project.py +0 -0
  87. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/request.py +0 -0
  88. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/spider_loader.py +0 -0
  89. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/system.py +0 -0
  90. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/tools.py +0 -0
  91. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo/utils/url.py +0 -0
  92. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo.egg-info/SOURCES.txt +0 -0
  93. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo.egg-info/dependency_links.txt +0 -0
  94. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo.egg-info/entry_points.txt +0 -0
  95. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo.egg-info/requires.txt +0 -0
  96. {crawlo-1.0.8 → crawlo-1.1.0}/crawlo.egg-info/top_level.txt +0 -0
  97. {crawlo-1.0.8 → crawlo-1.1.0}/examples/__init__.py +0 -0
  98. {crawlo-1.0.8 → crawlo-1.1.0}/examples/gxb/__init__.py +0 -0
  99. {crawlo-1.0.8 → crawlo-1.1.0}/examples/gxb/items.py +0 -0
  100. {crawlo-1.0.8 → crawlo-1.1.0}/examples/gxb/run.py +0 -0
  101. {crawlo-1.0.8 → crawlo-1.1.0}/examples/gxb/settings.py +0 -0
  102. {crawlo-1.0.8 → crawlo-1.1.0}/examples/gxb/spider/miit_spider.py +0 -0
  103. {crawlo-1.0.8 → crawlo-1.1.0}/examples/gxb/spider/telecom_device.py +0 -0
  104. {crawlo-1.0.8 → crawlo-1.1.0}/pyproject.toml +0 -0
  105. {crawlo-1.0.8 → crawlo-1.1.0}/requirements.txt +0 -0
  106. {crawlo-1.0.8 → crawlo-1.1.0}/setup.cfg +0 -0
  107. {crawlo-1.0.8 → crawlo-1.1.0}/tests/__init__.py +0 -0
  108. {crawlo-1.0.8 → crawlo-1.1.0}/tests/test_proxy_health_check.py +0 -0
  109. {crawlo-1.0.8 → crawlo-1.1.0}/tests/test_proxy_middleware_integration.py +0 -0
  110. {crawlo-1.0.8 → crawlo-1.1.0}/tests/test_proxy_providers.py +0 -0
  111. {crawlo-1.0.8 → crawlo-1.1.0}/tests/test_proxy_stats.py +0 -0
  112. {crawlo-1.0.8 → crawlo-1.1.0}/tests/test_proxy_strategies.py +0 -0

{crawlo-1.0.8/crawlo.egg-info → crawlo-1.1.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.0.8
+Version: 1.1.0
 Summary: Crawlo is a high-performance Python crawler framework based on asynchronous I/O, with support for distributed crawling.
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder

crawlo-1.1.0/crawlo/__version__.py
@@ -0,0 +1 @@
+__version__ = "1.1.0"

crawlo-1.1.0/crawlo/commands/__init__.py
@@ -0,0 +1,14 @@
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+
+_commands = {
+    'startproject': 'crawlo.commands.startproject',
+    'genspider': 'crawlo.commands.genspider',
+    'run': 'crawlo.commands.run',
+    'check': 'crawlo.commands.check',
+    'list': 'crawlo.commands.list',
+    'stats': 'crawlo.commands.stats'
+}
+
+def get_commands():
+    return _commands
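
The registry above only maps sub-command names to module paths. A dispatcher such as crawlo/cli.py can then resolve a sub-command lazily and call its main(args); the snippet below is a minimal sketch of that pattern, assuming nothing beyond the get_commands() helper above and the main(args) functions defined by the new command modules. It is not the actual contents of crawlo/cli.py, which is unchanged in this release.

import sys
from importlib import import_module

from crawlo.commands import get_commands


def dispatch(argv):
    # Look the sub-command up in the registry, import its module and run main().
    commands = get_commands()
    if not argv or argv[0] not in commands:
        print(f"Usage: crawlo <{'|'.join(sorted(commands))}> [args]")
        return 1
    module = import_module(commands[argv[0]])  # e.g. 'crawlo.commands.run'
    return module.main(argv[1:])


if __name__ == "__main__":
    sys.exit(dispatch(sys.argv[1:]))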

crawlo-1.1.0/crawlo/commands/check.py
@@ -0,0 +1,156 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+# @Time : 2025-08-31 22:35
+# @Author : crawl-coder
+# @Desc : Command-line entry point: crawlo check, verifies that all spider definitions are well-formed.
+"""
+
+import sys
+import configparser
+from pathlib import Path
+from importlib import import_module
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.utils.log import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def get_project_root():
+    """
+    Walk upward from the current directory looking for crawlo.cfg to determine the project root.
+    """
+    current = Path.cwd()
+
+    for _ in range(10):
+        cfg = current / "crawlo.cfg"
+        if cfg.exists():
+            return current
+
+        if current == current.parent:
+            break
+        current = current.parent
+
+    return None
+
+
+def main(args):
+    """
+    Main function: check that all spider definitions are compliant.
+    Usage: crawlo check
+    """
+    if args:
+        print("❌ Usage: crawlo check")
+        return 1
+
+    try:
+        # 1. Locate the project root directory
+        project_root = get_project_root()
+        if not project_root:
+            print("❌ Error: Cannot find 'crawlo.cfg'. Are you in a crawlo project?")
+            print("💡 Tip: Run this command inside your project directory.")
+            return 1
+
+        project_root_str = str(project_root)
+        if project_root_str not in sys.path:
+            sys.path.insert(0, project_root_str)
+
+        # 2. Read crawlo.cfg
+        cfg_file = project_root / "crawlo.cfg"
+        if not cfg_file.exists():
+            print(f"❌ Error: Expected config file not found: {cfg_file}")
+            return 1
+
+        config = configparser.ConfigParser()
+        config.read(cfg_file, encoding="utf-8")
+
+        if not config.has_section("settings") or not config.has_option("settings", "default"):
+            print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+            return 1
+
+        settings_module = config.get("settings", "default")
+        project_package = settings_module.split(".")[0]
+
+        # 3. Make sure the project package is importable
+        try:
+            import_module(project_package)
+        except ImportError as e:
+            print(f"❌ Failed to import project package '{project_package}': {e}")
+            return 1
+
+        # 4. Load the spiders
+        spider_modules = [f"{project_package}.spiders"]
+        process = CrawlerProcess(spider_modules=spider_modules)
+        spider_names = process.get_spider_names()
+
+        if not spider_names:
+            print("📭 No spiders found.")
+            print("💡 Make sure:")
+            print(" • Spiders are defined in the 'spiders' module")
+            print(" • They have a `name` attribute")
+            print(" • Modules are properly imported")
+            return 1
+
+        print(f"🔍 Checking {len(spider_names)} spider(s)...")
+        print("-" * 60)
+
+        issues_found = False
+
+        for name in sorted(spider_names):
+            cls = process.get_spider_class(name)
+            issues = []
+
+            # Check the name attribute
+            if not getattr(cls, "name", None):
+                issues.append("missing or empty 'name' attribute")
+            elif not isinstance(cls.name, str):
+                issues.append("'name' is not a string")
+
+            # Check that start_requests is callable
+            if not callable(getattr(cls, "start_requests", None)):
+                issues.append("missing or non-callable 'start_requests' method")
+
+            # Check the type of start_urls (it should not be a string)
+            if hasattr(cls, "start_urls") and isinstance(cls.start_urls, str):
+                issues.append("'start_urls' is a string; should be list or tuple")
+
+            # Instantiate the spider and check the parse method (recommended, not required)
+            try:
+                spider = cls.create_instance(None)
+                if not callable(getattr(spider, "parse", None)):
+                    issues.append("no 'parse' method defined (recommended)")
+            except Exception as e:
+                issues.append(f"failed to instantiate spider: {e}")
+
+            # Report the result
+            if issues:
+                print(f"❌ {name:<20} {cls.__name__}")
+                for issue in issues:
+                    print(f" • {issue}")
+                issues_found = True
+            else:
+                print(f"✅ {name:<20} {cls.__name__} (OK)")
+
+        print("-" * 60)
+
+        if issues_found:
+            print("⚠️ Some spiders have issues. Please fix them.")
+            return 1
+        else:
+            print("🎉 All spiders are compliant and well-defined!")
+            return 0
+
+    except Exception as e:
+        print(f"❌ Unexpected error during check: {e}")
+        logger.exception("Exception in 'crawlo check'")
+        return 1
+
+
+if __name__ == "__main__":
+    """
+    Supports running directly:
+    python -m crawlo.commands.check
+    """
+    sys.exit(main(sys.argv[1:]))
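
The check command above, like the list and run commands below, locates the project by walking up to a crawlo.cfg and then reads its [settings] section for the default settings module, whose first dotted component is treated as the project package. A minimal crawlo.cfg that satisfies this lookup would look like the sketch below, where myproject is a hypothetical package name (the real file is generated by crawlo startproject):

[settings]
default = myproject.settings

With that file in place, settings_module is "myproject.settings" and project_package resolves to "myproject".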

crawlo-1.1.0/crawlo/commands/list.py
@@ -0,0 +1,119 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+# @Time : 2025-08-31 22:33
+# @Author : crawl-coder
+# @Desc : Command-line entry point: crawlo list, lists all registered spiders.
+"""
+
+import sys
+import configparser
+from pathlib import Path
+from importlib import import_module
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.utils.log import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def get_project_root():
+    """
+    Auto-detect the project root: walk upward from the current directory looking for crawlo.cfg.
+    Returns the directory path (as a string) when found; searches at most 10 levels up.
+    """
+    current = Path.cwd()
+
+    for _ in range(10):
+        cfg = current / "crawlo.cfg"
+        if cfg.exists():
+            return str(current)
+
+        # Reached the filesystem root
+        if current == current.parent:
+            break
+        current = current.parent
+
+    return None  # Not found
+
+
+def main(args):
+    """
+    Main function: list all available spiders.
+    Usage: crawlo list
+    """
+    if args:
+        print("❌ Usage: crawlo list")
+        return 1
+
+    try:
+        # 1. Locate the project root directory
+        project_root = get_project_root()
+        if not project_root:
+            print("❌ Error: Cannot find 'crawlo.cfg'. Are you in a crawlo project?")
+            print("💡 Tip: Run this command inside your project directory, or create a project with 'crawlo startproject'.")
+            return 1
+
+        project_root_path = Path(project_root)
+        project_root_str = str(project_root_path)
+
+        # 2. Add the project root to the Python path so project modules can be imported
+        if project_root_str not in sys.path:
+            sys.path.insert(0, project_root_str)
+
+        # 3. Read crawlo.cfg to find the settings module
+        cfg_file = project_root_path / "crawlo.cfg"
+        config = configparser.ConfigParser()
+        config.read(cfg_file, encoding="utf-8")
+
+        if not config.has_section("settings") or not config.has_option("settings", "default"):
+            print("❌ Error: Invalid crawlo.cfg — missing [settings] or 'default' option.")
+            return 1
+
+        settings_module = config.get("settings", "default")
+        project_package = settings_module.split(".")[0]
+
+        # 4. Make sure the project package is importable (optional: import it so failures surface here)
+        try:
+            import_module(project_package)
+        except ImportError as e:
+            print(f"❌ Failed to import project package '{project_package}': {e}")
+            return 1
+
+        # 5. Initialise CrawlerProcess and load the spider modules
+        spider_modules = [f"{project_package}.spiders"]
+        process = CrawlerProcess(spider_modules=spider_modules)
+
+        # 6. Collect all spider names
+        spider_names = process.get_spider_names()
+        if not spider_names:
+            print("📭 No spiders found in 'spiders/' directory.")
+            print("💡 Make sure:")
+            print(" • Spider classes inherit from `crawlo.spider.Spider`")
+            print(" • Each spider has a `name` attribute")
+            print(" • Spiders are imported in `spiders/__init__.py` (if using package)")
+            return 1
+
+        # 7. Print the spider list
+        print(f"📋 Found {len(spider_names)} spider(s):")
+        print("-" * 60)
+        for name in sorted(spider_names):
+            spider_cls = process.get_spider_class(name)
+            module_name = spider_cls.__module__.replace(f"{project_package}.", "")
+            print(f"🕷️ {name:<20} {spider_cls.__name__:<25} ({module_name})")
+        print("-" * 60)
+        return 0
+
+    except Exception as e:
+        print(f"❌ Unexpected error: {e}")
+        logger.exception("Exception during 'crawlo list'")
+        return 1
+
+
+if __name__ == "__main__":
+    """
+    Supports running directly:
+    python -m crawlo.commands.list
+    """
+    sys.exit(main(sys.argv[1:]))
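
As a purely hypothetical illustration of the output format produced by the loop above, a project with two spiders named baidu and news (placeholder names) would print something along these lines:

📋 Found 2 spider(s):
------------------------------------------------------------
🕷️ baidu                BaiduSpider               (spiders.baidu)
🕷️ news                 NewsSpider                (spiders.news)
------------------------------------------------------------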

crawlo-1.1.0/crawlo/commands/run.py
@@ -0,0 +1,171 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+# @Time : 2025-08-31 22:36
+# @Author : crawl-coder
+# @Desc : Command-line entry point: crawlo run <spider_name>|all, runs the specified spider(s).
+"""
+import sys
+import asyncio
+import configparser
+from pathlib import Path
+from importlib import import_module
+
+from crawlo.crawler import CrawlerProcess
+from crawlo.utils.log import get_logger
+from crawlo.utils.project import get_settings
+from crawlo.commands.stats import record_stats  # automatically record stats
+
+logger = get_logger(__name__)
+
+
+def get_project_root():
+    """
+    Walk upward looking for crawlo.cfg to determine the project root.
+    """
+    current = Path.cwd()
+
+    for _ in range(10):
+        cfg = current / "crawlo.cfg"
+        if cfg.exists():
+            return current
+
+        if current == current.parent:
+            break
+        current = current.parent
+
+    return None
+
+
+def main(args):
+    """
+    Main function: run the specified spider(s).
+    Usage:
+        crawlo run <spider_name>
+        crawlo run all
+    """
+    if len(args) < 1:
+        print("❌ Usage: crawlo run <spider_name>|all")
+        print("💡 Examples:")
+        print(" crawlo run baidu")
+        print(" crawlo run all")
+        return 1
+
+    spider_arg = args[0]
+
+    try:
+        # 1. Locate the project root directory
+        project_root = get_project_root()
+        if not project_root:
+            print("❌ Error: Cannot find 'crawlo.cfg'. Are you in a crawlo project?")
+            print("💡 Tip: Run this command inside your project directory.")
+            return 1
+
+        project_root_str = str(project_root)
+        if project_root_str not in sys.path:
+            sys.path.insert(0, project_root_str)
+
+        # 2. Read crawlo.cfg to find the settings module
+        cfg_file = project_root / "crawlo.cfg"
+        if not cfg_file.exists():
+            print(f"❌ Error: crawlo.cfg not found in {project_root}")
+            return 1
+
+        config = configparser.ConfigParser()
+        config.read(cfg_file, encoding="utf-8")
+
+        if not config.has_section("settings") or not config.has_option("settings", "default"):
+            print("❌ Error: Missing [settings] section or 'default' option in crawlo.cfg")
+            return 1
+
+        settings_module = config.get("settings", "default")
+        project_package = settings_module.split(".")[0]
+
+        # 3. Make sure the project package is importable
+        try:
+            import_module(project_package)
+        except ImportError as e:
+            print(f"❌ Failed to import project package '{project_package}': {e}")
+            return 1
+
+        # 4. Load the settings and spider modules
+        settings = get_settings()  # safe to call at this point
+        spider_modules = [f"{project_package}.spiders"]
+        process = CrawlerProcess(settings=settings, spider_modules=spider_modules)
+
+        # === Case 1: run all spiders ===
+        if spider_arg.lower() == "all":
+            spider_names = process.get_spider_names()
+            if not spider_names:
+                print("❌ No spiders found.")
+                print("💡 Make sure:")
+                print(" • Spiders are defined in 'spiders/'")
+                print(" • They have a `name` attribute")
+                print(" • Modules are imported (e.g. via __init__.py)")
+                return 1
+
+            print(f"🚀 Starting ALL {len(spider_names)} spider(s):")
+            print("-" * 60)
+            for name in sorted(spider_names):
+                cls = process.get_spider_class(name)
+                print(f"🕷️ {name:<20} {cls.__name__}")
+            print("-" * 60)
+
+            # Register stats recording (saved when each spider closes)
+            for crawler in process.crawlers:
+                crawler.signals.connect(record_stats, signal="spider_closed")
+
+            # Run all spiders in parallel (can be made sequential: for name in ... await process.crawl(name))
+            asyncio.run(process.crawl(spider_names))
+            print("✅ All spiders completed.")
+            return 0
+
+        # === Case 2: run a single spider ===
+        spider_name = spider_arg
+        if not process.is_spider_registered(spider_name):
+            print(f"❌ Spider '{spider_name}' not found.")
+            available = process.get_spider_names()
+            if available:
+                print("💡 Available spiders:")
+                for name in sorted(available):
+                    cls = process.get_spider_class(name)
+                    print(f" • {name} ({cls.__name__})")
+            else:
+                print("💡 No spiders found. Check your spiders module.")
+            return 1
+
+        spider_class = process.get_spider_class(spider_name)
+
+        # Print startup info
+        print(f"🚀 Starting spider: {spider_name}")
+        print(f"📦 Project: {project_package}")
+        print(f"CppClass: {spider_class.__name__}")
+        print(f"📄 Module: {spider_class.__module__}")
+        print("-" * 50)
+
+        # Register stats recording
+        for crawler in process.crawlers:
+            crawler.signals.connect(record_stats, signal="spider_closed")
+
+        # Run the spider
+        asyncio.run(process.crawl(spider_name))
+
+        print("-" * 50)
+        print("✅ Spider completed successfully!")
+        return 0
+
+    except KeyboardInterrupt:
+        print("\n⚠️ Spider interrupted by user.")
+        return 1
+    except Exception as e:
+        print(f"❌ Unexpected error: {e}")
+        logger.exception("Exception during 'crawlo run'")
+        return 1
+
+
+if __name__ == "__main__":
+    """
+    Supports running directly:
+    python -m crawlo.commands.run spider_name
+    """
+    sys.exit(main(sys.argv[1:]))
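
The run command is a thin wrapper around CrawlerProcess, so the same flow can be driven programmatically with the calls used above. The sketch below assumes a hypothetical project package named myproject with a spider named myspider, run from the project root so that get_settings() can locate the project settings; it is illustrative only and not part of the released code:

import asyncio

from crawlo.crawler import CrawlerProcess
from crawlo.utils.project import get_settings

settings = get_settings()  # load the project settings (run this from the project root)
process = CrawlerProcess(settings=settings, spider_modules=["myproject.spiders"])

if process.is_spider_registered("myspider"):
    asyncio.run(process.crawl("myspider"))  # run one spider to completion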

crawlo-1.1.0/crawlo/commands/stats.py
@@ -0,0 +1,167 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+"""
+# @Time : 2025-08-31 22:36
+# @Author : crawl-coder
+# @Desc : Command-line entry point: crawlo stats, shows statistics from recent spider runs.
+"""
+
+import sys
+import json
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, Any
+
+from crawlo.utils.log import get_logger
+
+
+logger = get_logger(__name__)
+
+# Default storage directory (relative to the project root)
+STATS_DIR = "logs/stats"
+
+
+def get_stats_dir() -> Path:
+    """
+    Return the directory where stats files are stored, preferring logs/stats/ under the project root.
+    Falls back to the current directory when not inside a project.
+    """
+    # Try to find the project root (via crawlo.cfg)
+    current = Path.cwd()
+    for _ in range(10):
+        if (current / "crawlo.cfg").exists():
+            return current / STATS_DIR
+        if current == current.parent:
+            break
+        current = current.parent
+
+    # Fallback: use logs/stats under the current directory
+    return Path.cwd() / STATS_DIR
+
+
+def record_stats(crawler):
+    """
+    [Called while the crawler runs] Record a spider's final statistics to a JSON file.
+    Should be invoked from the Crawler's closed callback.
+    """
+    spider_name = getattr(crawler.spider, "name", "unknown")
+    stats = crawler.stats.get_stats() if crawler.stats else {}
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    stats_dir = Path(get_stats_dir())
+    stats_dir.mkdir(parents=True, exist_ok=True)
+
+    filename = stats_dir / f"{spider_name}_{timestamp}.json"
+    try:
+        with open(filename, "w", encoding="utf-8") as f:
+            json.dump({
+                "spider": spider_name,
+                "timestamp": datetime.now().isoformat(),
+                "stats": stats
+            }, f, ensure_ascii=False, indent=2, default=str)
+        logger.info(f"📊 Stats saved for spider '{spider_name}' → {filename}")
+    except Exception as e:
+        logger.error(f"Failed to save stats for '{spider_name}': {e}")
+
+
+def load_all_stats() -> Dict[str, list]:
+    """
+    Load all saved stats files, grouped by spider name.
+    Returns: {spider_name: [stats_record, ...]}
+    """
+    stats_dir = get_stats_dir()
+    if not stats_dir.exists():
+        return {}
+
+    result = {}
+    json_files = sorted(stats_dir.glob("*.json"), key=lambda x: x.stat().st_mtime, reverse=True)
+
+    for file in json_files:
+        try:
+            with open(file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+                spider_name = data.get("spider", "unknown")
+                result.setdefault(spider_name, []).append(data)
+        except Exception as e:
+            logger.warning(f"Failed to load stats file {file}: {e}")
+    return result
+
+
+def format_value(v: Any) -> str:
+    """Format a value so it stays short and printable."""
+    if isinstance(v, float):
+        return f"{v:.4f}"
+    return str(v)
+
+
+def main(args):
+    """
+    Main function: view statistics.
+    Usage:
+        crawlo stats → show the most recent run of every spider
+        crawlo stats myspider → show all recorded runs for the given spider
+        crawlo stats myspider --all → show the full history (same as above)
+    """
+    if len(args) > 2:
+        print("Usage: crawlo stats [spider_name] [--all]")
+        return 1
+
+    spider_name = None
+    show_all = False
+
+    if args:
+        spider_name = args[0]
+        show_all = "--all" in args or "-a" in args
+
+    # Load all stats
+    all_stats = load_all_stats()
+    if not all_stats:
+        print("📊 No stats found. Run a spider first.")
+        print(f"💡 Stats are saved in: {get_stats_dir()}")
+        return 0
+
+    if not spider_name:
+        # Show the most recent run of every spider
+        print("📊 Recent Spider Statistics (last run):")
+        print("-" * 60)
+        for name, runs in all_stats.items():
+            latest = runs[0]
+            print(f"🕷️ {name} ({latest['timestamp'][:19]})")
+            stats = latest["stats"]
+            for k in sorted(stats.keys()):
+                print(f" {k:<30} {format_value(stats[k])}")
+            print()
+        return 0
+
+    else:
+        # Show a specific spider
+        if spider_name not in all_stats:
+            print(f"📊 No stats found for spider '{spider_name}'")
+            available = ', '.join(all_stats.keys())
+            if available:
+                print(f"💡 Available spiders: {available}")
+            return 1
+
+        runs = all_stats[spider_name]
+        if show_all:
+            print(f"📊 All runs for '{spider_name}' ({len(runs)} runs):")
+        else:
+            runs = runs[:1]
+            print(f"📊 Last run for '{spider_name}':")
+
+        print("-" * 60)
+        for run in runs:
+            print(f"⏱️ Timestamp: {run['timestamp']}")
+            stats = run["stats"]
+            for k in sorted(stats.keys()):
+                print(f" {k:<30} {format_value(stats[k])}")
+            print("─" * 60)
+        return 0
+
+
+if __name__ == "__main__":
+    """
+    Supports running directly:
+    python -m crawlo.commands.stats
+    """
+    sys.exit(main(sys.argv[1:]))
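
record_stats writes one JSON document per run to logs/stats/<spider>_<timestamp>.json, which is exactly what load_all_stats() and the stats command read back. As a rough sketch, a saved file might look like the following; the spider name and the individual stat keys are placeholders, since the real keys depend on the framework's stats collector:

{
  "spider": "myspider",
  "timestamp": "2025-09-01T10:15:30.123456",
  "stats": {
    "request_count": 120,
    "response_count": 118,
    "item_count": 95
  }
}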

{crawlo-1.0.8 → crawlo-1.1.0}/crawlo/core/engine.py
@@ -42,7 +42,7 @@ class Engine(object):
     def engine_start(self):
         self.running = True
         self.logger.info(
-            f"Crawlo (version {self.settings.get_int('VERSION')}) started. "
+            f"Crawlo (version {self.settings.get_float('VERSION')}) started. "
             f"(project name : {self.settings.get('PROJECT_NAME')})"
         )


{crawlo-1.0.8 → crawlo-1.1.0/crawlo.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlo
-Version: 1.0.8
+Version: 1.1.0
 Summary: Crawlo is a high-performance Python crawler framework based on asynchronous I/O, with support for distributed crawling.
 Home-page: https://github.com/crawl-coder/Crawlo.git
 Author: crawl-coder

crawlo-1.1.0/examples/gxb/spider/__init__.py
@@ -0,0 +1,2 @@
+from .miit_spider import MiitSpider
+from .telecom_device import TelecomDeviceLicensesSpider

crawlo-1.0.8/crawlo/__version__.py
@@ -1 +0,0 @@
-__version__ = "1.0.8"

crawlo-1.0.8/crawlo/commands/__init__.py
@@ -1,10 +0,0 @@
-# crawlo/commands/__init__.py
-# Define the available commands
-_commands = {
-    'startproject': 'crawlo.commands.startproject',
-    'genspider': 'crawlo.commands.genspider',
-    'run': 'crawlo.commands.run',
-}
-
-def get_commands():
-    return _commands