crawlo-1.1.1-py3-none-any.whl → crawlo-1.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/genspider.py +68 -42
  4. crawlo/commands/list.py +102 -93
  5. crawlo/commands/startproject.py +89 -4
  6. crawlo/commands/utils.py +187 -0
  7. crawlo/config.py +280 -0
  8. crawlo/core/engine.py +16 -3
  9. crawlo/core/enhanced_engine.py +190 -0
  10. crawlo/core/scheduler.py +113 -8
  11. crawlo/crawler.py +840 -307
  12. crawlo/downloader/__init__.py +181 -17
  13. crawlo/downloader/aiohttp_downloader.py +15 -2
  14. crawlo/downloader/cffi_downloader.py +11 -1
  15. crawlo/downloader/httpx_downloader.py +14 -3
  16. crawlo/filters/__init__.py +122 -5
  17. crawlo/filters/aioredis_filter.py +128 -36
  18. crawlo/filters/memory_filter.py +99 -32
  19. crawlo/middleware/proxy.py +11 -8
  20. crawlo/middleware/retry.py +40 -5
  21. crawlo/mode_manager.py +201 -0
  22. crawlo/network/__init__.py +17 -3
  23. crawlo/network/request.py +118 -10
  24. crawlo/network/response.py +131 -28
  25. crawlo/pipelines/__init__.py +1 -1
  26. crawlo/pipelines/csv_pipeline.py +317 -0
  27. crawlo/pipelines/json_pipeline.py +219 -0
  28. crawlo/queue/__init__.py +0 -0
  29. crawlo/queue/pqueue.py +37 -0
  30. crawlo/queue/queue_manager.py +304 -0
  31. crawlo/queue/redis_priority_queue.py +192 -0
  32. crawlo/settings/default_settings.py +68 -9
  33. crawlo/spider/__init__.py +576 -66
  34. crawlo/task_manager.py +4 -1
  35. crawlo/templates/project/middlewares.py.tmpl +56 -45
  36. crawlo/templates/project/pipelines.py.tmpl +308 -36
  37. crawlo/templates/project/run.py.tmpl +239 -0
  38. crawlo/templates/project/settings.py.tmpl +211 -17
  39. crawlo/templates/spider/spider.py.tmpl +153 -7
  40. crawlo/utils/controlled_spider_mixin.py +336 -0
  41. crawlo/utils/large_scale_config.py +287 -0
  42. crawlo/utils/large_scale_helper.py +344 -0
  43. crawlo/utils/queue_helper.py +176 -0
  44. crawlo/utils/request_serializer.py +220 -0
  45. crawlo-1.1.2.dist-info/METADATA +567 -0
  46. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/RECORD +54 -46
  47. tests/test_final_validation.py +154 -0
  48. tests/test_redis_config.py +29 -0
  49. tests/test_redis_queue.py +225 -0
  50. tests/test_request_serialization.py +71 -0
  51. tests/test_scheduler.py +242 -0
  52. crawlo/pipelines/mysql_batch_pipline.py +0 -273
  53. crawlo/utils/pqueue.py +0 -174
  54. crawlo-1.1.1.dist-info/METADATA +0 -220
  55. examples/baidu_spider/__init__.py +0 -7
  56. examples/baidu_spider/demo.py +0 -94
  57. examples/baidu_spider/items.py +0 -46
  58. examples/baidu_spider/middleware.py +0 -49
  59. examples/baidu_spider/pipeline.py +0 -55
  60. examples/baidu_spider/run.py +0 -27
  61. examples/baidu_spider/settings.py +0 -121
  62. examples/baidu_spider/spiders/__init__.py +0 -7
  63. examples/baidu_spider/spiders/bai_du.py +0 -61
  64. examples/baidu_spider/spiders/miit.py +0 -159
  65. examples/baidu_spider/spiders/sina.py +0 -79
  66. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/WHEEL +0 -0
  67. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/entry_points.txt +0 -0
  68. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/top_level.txt +0 -0
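
The new modules in this release (crawlo/config.py, crawlo/mode_manager.py, the crawlo/queue/ package) back the configuration factory that the regenerated project templates below rely on. As a minimal sketch of how the templates wire it together, reusing only calls that appear in this diff (CrawloConfig.standalone / CrawloConfig.distributed, config.to_dict(), CrawlerProcess, process.crawl); the spider name is a placeholder and the exact method signatures are not shown here:

import asyncio

from crawlo.config import CrawloConfig
from crawlo.crawler import CrawlerProcess


async def main():
    # Standalone mode: in-memory queue, no external services (per the settings template below).
    config = CrawloConfig.standalone(concurrency=8, download_delay=1.0)

    # Distributed mode would swap in a Redis-backed queue instead, e.g.:
    # config = CrawloConfig.distributed(redis_host='127.0.0.1', project_name='my_project')

    # The templates hand the configuration to CrawlerProcess as a plain dict.
    process = CrawlerProcess(settings=config.to_dict())
    await process.crawl(['example_spider'])  # hypothetical spider name


if __name__ == '__main__':
    asyncio.run(main())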
crawlo/templates/project/run.py.tmpl
@@ -0,0 +1,239 @@
+ #!/usr/bin/env python
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}} project run script
+ ============================
+ Smart spider launcher built on the Crawlo framework.
+ Supports standalone/distributed modes, flexible configuration, works out of the box.
+
+ 🎯 Quick usage:
+     python run.py spider_name                    # standalone mode
+     python run.py spider_name --distributed      # distributed mode
+     python run.py spider_name --env production   # preset configuration
+
+ 🔧 Advanced options:
+     python run.py spider_name --dry-run          # dry run (no actual crawling)
+     python run.py spider_name --concurrency 16   # custom concurrency
+     python run.py spider_name --mode gentle      # gentle mode (low load)
+     python run.py spider1 spider2 --distributed  # multiple spiders, distributed
+
+ 📦 Configuration modes:
+     --standalone   standalone mode (default) - in-memory queue, no external dependencies
+     --distributed  distributed mode - Redis queue, supports multiple nodes
+     --auto         auto mode - detects Redis availability automatically
+
+ 🎛️ Preset configurations:
+     --env development   development environment (debug-friendly)
+     --env production    production environment (high performance)
+     --env large-scale   large-scale crawling (memory-optimized)
+     --env gentle        gentle mode (low load)
+ """
+
+ import os
+ import sys
+ import asyncio
+ import argparse
+ from pathlib import Path
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.config import CrawloConfig
+ from crawlo.mode_manager import standalone_mode, distributed_mode, auto_mode
+
+
+ def create_parser():
+     """Create the command-line argument parser."""
+     parser = argparse.ArgumentParser(
+         description='{{project_name}} spider launcher - built on the Crawlo framework',
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+     python run.py my_spider                  # default standalone mode
+     python run.py my_spider --distributed    # distributed mode
+     python run.py my_spider --env production # production configuration
+     python run.py spider1 spider2            # run multiple spiders
+     python run.py my_spider --dry-run        # test mode
+         """
+     )
+
+     # Spider names (positional arguments)
+     parser.add_argument(
+         'spiders',
+         nargs='*',
+         help='name(s) of the spider(s) to run (several may be given)'
+     )
+
+     # Run mode selection
+     mode_group = parser.add_mutually_exclusive_group()
+     mode_group.add_argument(
+         '--standalone',
+         action='store_true',
+         help='standalone mode (default) - in-memory queue, no external dependencies'
+     )
+     mode_group.add_argument(
+         '--distributed',
+         action='store_true',
+         help='distributed mode - Redis queue, supports multi-node crawling'
+     )
+     mode_group.add_argument(
+         '--auto',
+         action='store_true',
+         help='auto mode - detect Redis availability to choose the queue type'
+     )
+
+     # Preset environment configuration
+     parser.add_argument(
+         '--env',
+         choices=['development', 'production', 'large-scale', 'gentle'],
+         help='preset environment configuration (takes precedence over mode flags)'
+     )
+
+     # Performance tuning options
+     parser.add_argument(
+         '--concurrency',
+         type=int,
+         help='number of concurrent requests (overrides the default)'
+     )
+
+     parser.add_argument(
+         '--delay',
+         type=float,
+         help='delay between requests (seconds)'
+     )
+
+     # Feature options
+     parser.add_argument(
+         '--dry-run',
+         action='store_true',
+         help='dry-run mode - parse pages without performing actual crawling'
+     )
+
+     parser.add_argument(
+         '--debug',
+         action='store_true',
+         help='enable debug mode - verbose log output'
+     )
+
+     parser.add_argument(
+         '--config-file',
+         type=str,
+         help='path to a custom configuration file'
+     )
+
+     # Environment variable support
+     parser.add_argument(
+         '--from-env',
+         action='store_true',
+         help='load configuration from environment variables (CRAWLO_*)'
+     )
+
+     return parser
+
+
+ def build_config(args):
+     """Build the configuration from the command-line arguments."""
+     config = None
+
+     # 1. Environment variable configuration takes priority
+     if args.from_env:
+         config = CrawloConfig.from_env()
+         print("📋 Using configuration from environment variables")
+
+     # 2. Preset environment configuration
+     elif args.env:
+         presets = {
+             'development': CrawloConfig.presets().development(),
+             'production': CrawloConfig.presets().production(),
+             'large-scale': CrawloConfig.presets().large_scale(),
+             'gentle': CrawloConfig.presets().gentle()
+         }
+         config = presets[args.env]
+         print(f"🎛️ Using preset configuration: {args.env}")
+
+     # 3. Mode-based configuration
+     elif args.distributed:
+         config = CrawloConfig.distributed()
+         print("🌐 Distributed mode enabled")
+     elif args.auto:
+         config = CrawloConfig.auto()
+         print("🤖 Auto-detection mode enabled")
+     else:
+         # Standalone mode by default
+         config = CrawloConfig.standalone()
+         print("💻 Using standalone mode (default)")
+
+     # 4. Apply command-line overrides
+     if args.concurrency:
+         config.set('CONCURRENCY', args.concurrency)
+         print(f"⚡ Concurrency set to: {args.concurrency}")
+
+     if args.delay:
+         config.set('DOWNLOAD_DELAY', args.delay)
+         print(f"⏱️ Download delay set to: {args.delay}s")
+
+     if args.debug:
+         config.set('LOG_LEVEL', 'DEBUG')
+         print("🐛 Debug mode enabled")
+
+     if args.dry_run:
+         # Dry-run settings (adjust as needed)
+         config.set('DOWNLOAD_DELAY', 0.1)  # speed things up
+         config.set('CONCURRENCY', 1)  # keep concurrency low
+         print("🧪 Dry-run mode enabled")
+
+     return config
+
+
+ async def main():
+     """Entry point: parse arguments, build the configuration, start the spiders."""
+
+     # Parse command-line arguments
+     parser = create_parser()
+     args = parser.parse_args()
+
+     # Make sure at least one spider was specified
+     if not args.spiders:
+         print("❌ Please specify the name of the spider to run")
+         print("\nAvailable spiders:")
+         print("  # TODO: list your spiders here")
+         print("  # from {{project_name}}.spiders import MySpider")
+         print("\nUsage: python run.py <spider_name>")
+         parser.print_help()
+         return
+
+     # Build the configuration
+     config = build_config(args)
+
+     # Create the crawler process
+     print(f"\n🚀 Starting spider(s): {', '.join(args.spiders)}")
+
+     if args.dry_run:
+         print("   🧪 [dry-run mode] pages will be parsed but not actually crawled")
+
+     try:
+         # Apply the configuration and start
+         process = CrawlerProcess(settings=config.to_dict())
+
+         # TODO: add your spider imports here
+         # from {{project_name}}.spiders.example_spider import ExampleSpider
+         # spider_classes = {'example_spider': ExampleSpider}
+
+         # Run the requested spiders
+         await process.crawl(args.spiders)
+
+         print("\n✅ All spiders finished")
+
+     except ImportError as e:
+         print(f"❌ Failed to import spider: {e}")
+         print("   Check that the spider files exist and update the imports in run.py")
+     except Exception as e:
+         print(f"❌ Runtime error: {e}")
+         raise
+
+
+ if __name__ == '__main__':
+     try:
+         asyncio.run(main())
+     except KeyboardInterrupt:
+         print("\n⏹️ Spider execution interrupted by user")
+     except Exception as e:
+         print(f"❌ Runtime error: {e}")
+         sys.exit(1)
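
One point worth noting in build_config(): --from-env takes precedence over --env, which in turn takes precedence over the --standalone/--distributed/--auto flags, and the tuning flags are then applied on top of whichever base configuration was selected. A rough equivalent of `python run.py my_spider --env production --concurrency 16 --debug`, using only calls that appear in the template above (their signatures are assumed from that code):

from crawlo.config import CrawloConfig
from crawlo.crawler import CrawlerProcess

config = CrawloConfig.presets().production()  # base chosen by --env production
config.set('CONCURRENCY', 16)                 # --concurrency 16 override
config.set('LOG_LEVEL', 'DEBUG')              # --debug override
process = CrawlerProcess(settings=config.to_dict())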
crawlo/templates/project/settings.py.tmpl
@@ -1,54 +1,248 @@
  # -*- coding: UTF-8 -*-
- """Auto-generated settings.py file"""
+ """
+ {{project_name}} project configuration file
+ =============================
+ Spider project configuration based on the Crawlo framework.

+ 🎯 Quick start:
+
+     # Option 1: default standalone mode (recommended)
+     from crawlo.crawler import CrawlerProcess
+     process = CrawlerProcess()  # no configuration needed
+
+     # Option 2: configuration factory
+     from crawlo.config import CrawloConfig
+     config = CrawloConfig.standalone()  # standalone mode
+     config = CrawloConfig.distributed(redis_host='192.168.1.100')  # distributed mode
+     process = CrawlerProcess(settings=config.to_dict())
+
+     # Option 3: environment variables
+     from crawlo.config import CrawloConfig
+     config = CrawloConfig.from_env()  # read from environment variables
+ """
+ import os
+ from crawlo.config import CrawloConfig
+
+ # ============================== Project info ==============================
  PROJECT_NAME = '{{project_name}}'
- VERSION = '1.0'
+ VERSION = '1.0.0'
+
+ # ============================== Run mode selection ==============================
+
+ # 🎯 Pick one configuration style:
+
+ # Option 1: configuration factory (recommended)
+ # Standalone mode (default)
+ CONFIG = CrawloConfig.standalone(
+     concurrency=8,
+     download_delay=1.0
+ )
+
+ # Distributed mode (uncomment and adjust the Redis address)
+ # CONFIG = CrawloConfig.distributed(
+ #     redis_host='127.0.0.1',
+ #     redis_password='your_password',  # if a password is set
+ #     project_name='{{project_name}}',
+ #     concurrency=16,
+ #     download_delay=1.0
+ # )
+
+ # Auto-detection mode
+ # CONFIG = CrawloConfig.auto(concurrency=12)
+
+ # Option 2: read from environment variables (good for deployments)
+ # CONFIG = CrawloConfig.from_env()
+
+ # Option 3: preset configurations
+ # from crawlo.config import Presets
+ # CONFIG = Presets.development()  # development environment
+ # CONFIG = Presets.production()   # production environment
+
+ # Materialize the final configuration
+ locals().update(CONFIG.to_dict())

  # ============================== Network request settings ==============================
- DOWNLOADER = "crawlo.downloader.aiohttp_downloader.AioHttpDownloader"
- DOWNLOAD_TIMEOUT = 60
+
+ # Downloader selection (CurlCffi is recommended; it supports browser fingerprint emulation)
+ DOWNLOADER = "crawlo.downloader.cffi_downloader.CurlCffiDownloader"  # browser fingerprints
+ # DOWNLOADER = "crawlo.downloader.aiohttp_downloader.AioHttpDownloader"  # lightweight option
+ # DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"  # HTTP/2 support
+
+ # Request timeout and security
+ DOWNLOAD_TIMEOUT = 30
  VERIFY_SSL = True
  USE_SESSION = True

+ # Request delay control (anti-bot friendliness)
  DOWNLOAD_DELAY = 1.0
+ RANDOM_RANGE = (0.8, 1.2)
  RANDOMNESS = True

+ # Retry policy
  MAX_RETRY_TIMES = 3
+ RETRY_PRIORITY = -1
  RETRY_HTTP_CODES = [408, 429, 500, 502, 503, 504, 522, 524]
  IGNORE_HTTP_CODES = [403, 404]
+ ALLOWED_CODES = []

- CONNECTION_POOL_LIMIT = 100
+ # Connection pool settings
+ CONNECTION_POOL_LIMIT = 50
+ DOWNLOAD_MAXSIZE = 10 * 1024 * 1024  # 10MB
+ DOWNLOAD_WARN_SIZE = 1024 * 1024  # 1MB

- # ============================== Concurrency & scheduling ==============================
+ # ============================== Concurrency & scheduling settings ==============================
  CONCURRENCY = 8
+ INTERVAL = 5
+ DEPTH_PRIORITY = 1
  MAX_RUNNING_SPIDERS = 3

- # ============================== Data storage ==============================
- MYSQL_HOST = '127.0.0.1'
- MYSQL_PORT = 3306
- MYSQL_USER = 'root'
- MYSQL_PASSWORD = '123456'
- MYSQL_DB = '{{project_name}}'
- MYSQL_TABLE = 'crawled_data'
+ # ============================== Queue settings (distributed-ready) ==============================
+
+ # Queue type: 'auto' (auto-select), 'memory' (in-memory queue), 'redis' (distributed queue)
+ QUEUE_TYPE = 'auto'
+ SCHEDULER_MAX_QUEUE_SIZE = 2000
+ SCHEDULER_QUEUE_NAME = f'{{project_name}}:requests'
+ QUEUE_MAX_RETRIES = 3
+ QUEUE_TIMEOUT = 300
+
+ # Large-scale crawling optimizations
+ LARGE_SCALE_BATCH_SIZE = 1000
+ LARGE_SCALE_CHECKPOINT_INTERVAL = 5000
+ LARGE_SCALE_MAX_MEMORY_USAGE = 500
+
+ # ============================== Data storage settings ==============================
+
+ # --- MySQL ---
+ MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
+ MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
+ MYSQL_USER = os.getenv('MYSQL_USER', 'root')
+ MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
+ MYSQL_DB = os.getenv('MYSQL_DB', '{{project_name}}')
+ MYSQL_TABLE = '{{project_name}}_data'
+ MYSQL_BATCH_SIZE = 100
+
+ # MySQL connection pool
+ MYSQL_FLUSH_INTERVAL = 5
+ MYSQL_POOL_MIN = 5
+ MYSQL_POOL_MAX = 20
+ MYSQL_ECHO = False
+
+ # --- MongoDB ---
+ MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
+ MONGO_DATABASE = f'{{project_name}}_db'
+ MONGO_COLLECTION = '{{project_name}}_items'
+ MONGO_MAX_POOL_SIZE = 200
+ MONGO_MIN_POOL_SIZE = 20
+
+ # ============================== Deduplication filter settings ==============================

- # ============================== Deduplication filter ==============================
+ REQUEST_DIR = '.'
+
+ # Deduplication filter (use the Redis filter for distributed projects)
  FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+ # FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'  # distributed deduplication
+
+ # --- Redis (used for distributed deduplication and queues) ---
+ REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
+ REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
+ REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', '')
+
+ # Build the URL depending on whether a password is set
+ if REDIS_PASSWORD:
+     REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/0'
+ else:
+     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/0'
+
+ REDIS_KEY = f'{{project_name}}:fingerprint'
+ REDIS_TTL = 0
+ CLEANUP_FP = 0
+ FILTER_DEBUG = True
+ DECODE_RESPONSES = True
+
+ # ============================== Middleware settings ==============================

- # ============================== Middleware & pipelines ==============================
  MIDDLEWARES = [
+     # === Request pre-processing stage ===
      'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
      'crawlo.middleware.download_delay.DownloadDelayMiddleware',
      'crawlo.middleware.default_header.DefaultHeaderMiddleware',
      'crawlo.middleware.proxy.ProxyMiddleware',
+
+     # === Response processing stage ===
      'crawlo.middleware.retry.RetryMiddleware',
      'crawlo.middleware.response_code.ResponseCodeMiddleware',
      'crawlo.middleware.response_filter.ResponseFilterMiddleware',
  ]

+ # ============================== Data pipeline settings ==============================
+
  PIPELINES = [
      'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # '{{project_name}}.pipelines.DatabasePipeline',  # custom database pipeline
+     # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage
+     # 'crawlo.pipelines.mongo_pipeline.MongoPipeline',  # MongoDB storage
  ]

- # ============================== Logging ==============================
+ # ============================== Extensions ==============================
+
+ EXTENSIONS = [
+     'crawlo.extension.log_interval.LogIntervalExtension',
+     'crawlo.extension.log_stats.LogStats',
+     'crawlo.extension.logging_extension.CustomLoggerExtension',
+ ]
+
+ # ============================== Logging settings ==============================
+
  LOG_LEVEL = 'INFO'
- LOG_FILE = f'logs/{{{project_name}}}.log'
+ STATS_DUMP = True
+ LOG_FILE = f'logs/{{project_name}}.log'
+ LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
+ LOG_ENCODING = 'utf-8'
+
+ # ============================== Proxy settings ==============================
+
+ PROXY_ENABLED = False
+ PROXY_API_URL = ""  # fill in a real proxy API address
+ PROXY_EXTRACTOR = "proxy"
+ PROXY_REFRESH_INTERVAL = 60
+ PROXY_API_TIMEOUT = 10
+
+ # ============================== Browser fingerprint settings ==============================
+
+ # Settings specific to the CurlCffi downloader
+ CURL_BROWSER_TYPE = "chrome"
+ CURL_BROWSER_VERSION_MAP = {
+     "chrome": "chrome136",
+     "edge": "edge101",
+     "safari": "safari184",
+     "firefox": "firefox135",
+ }
+
+ # Default request headers
+ DEFAULT_REQUEST_HEADERS = {
+     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+                   '(KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
+     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+     'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+     'Accept-Encoding': 'gzip, deflate, br',
+     'Connection': 'keep-alive',
+     'Upgrade-Insecure-Requests': '1',
+ }
+
+ # ============================== Development & debugging ==============================
+
+ # Development-mode flags
+ DEBUG = False
+ TESTING = False
+
+ # Performance monitoring
+ ENABLE_PERFORMANCE_MONITORING = True
+ MEMORY_USAGE_WARNING_THRESHOLD = 500  # MB
+
+ # ============================== Custom configuration area ==============================
+ # Add project-specific settings here
+
+ # Example: target-site-specific settings
+ # TARGET_DOMAIN = '{{domain}}'
+ # MAX_PAGES_PER_DOMAIN = 10000
+ # CUSTOM_RATE_LIMIT = 1.5
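
Because the template reads its MySQL and Redis credentials through os.getenv, a deployment can repoint the generated project at a remote Redis without editing settings.py. The sketch below repeats the template's own URL-building branch with placeholder values to show what REDIS_URL resolves to (host and password are hypothetical):

import os

# Placeholder values a deployment might export before starting the crawler.
os.environ.setdefault('REDIS_HOST', '10.0.0.5')
os.environ.setdefault('REDIS_PASSWORD', 's3cret')

REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', '')

# Same branch as the template: embed the password only when one is set.
if REDIS_PASSWORD:
    REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/0'
else:
    REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/0'

print(REDIS_URL)  # -> redis://:s3cret@10.0.0.5:6379/0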
crawlo/templates/spider/spider.py.tmpl
@@ -3,30 +3,176 @@
  {{project_name}}.spiders.{{spider_name}}
  =======================================
  Spider generated by the `crawlo genspider` command.
+ Built on the Crawlo framework; supports async concurrency, distributed crawling, and more.
+
+ Usage example:
+     crawlo crawl {{spider_name}}
  """

  from crawlo.spider import Spider
+ from crawlo import Request
+ # from {{project_name}}.items import {{item_class}}  # optional: import the item class


  class {{class_name}}(Spider):
      """
      Spider: {{spider_name}}
+
+     Features:
+     - concurrent crawling
+     - automatic request deduplication
+     - retry on errors
+     - data pipeline processing
      """
      name = '{{spider_name}}'
      allowed_domains = ['{{domain}}']
      start_urls = ['https://{{domain}}/']
+
+     # Advanced settings (optional)
+     # custom_settings = {
+     #     'DOWNLOAD_DELAY': 2.0,
+     #     'CONCURRENCY': 4,
+     #     'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
+     # }
+
+     def start_requests(self):
+         """
+         Generate the initial requests.
+
+         Custom headers, proxies, priorities, etc. are supported.
+         """
+         headers = {
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+             'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
+         }
+
+         for url in self.start_urls:
+             yield Request(
+                 url=url,
+                 callback=self.parse,
+                 headers=headers,
+                 # meta={'proxy': 'http://proxy.example.com:8080'},  # custom proxy
+                 # priority=10,  # request priority (higher number = higher priority)
+             )

      def parse(self, response):
          """
          Main response parsing method.
+
+         Args:
+             response: the response object, containing page content and metadata
+
+         Yields:
+             Request: new request objects (for deeper crawling)
+             Item: item objects (for data storage)
          """
-         # TODO: write your parsing logic here
-
-         # Example: extract data
+         self.logger.info(f'Parsing page: {response.url}')
+
+         # ================== Data extraction examples ==================
+
+         # Extract data and build an Item
          # item = {{item_class}}()
-         # item['title'] = response.xpath('//title/text()').get()
+         # item['title'] = response.xpath('//title/text()').get(default='')
+         # item['url'] = response.url
+         # item['content'] = response.xpath('//div[@class="content"]//text()').getall()
          # yield item
+
+         # Yield a plain dict (for simple data)
+         yield {
+             'title': response.xpath('//title/text()').get(default=''),
+             'url': response.url,
+             'status_code': response.status_code,
+             # 'description': response.xpath('//meta[@name="description"]/@content').get(),
+             # 'keywords': response.xpath('//meta[@name="keywords"]/@content').get(),
+         }
+
+         # ================== Link extraction examples ==================
+
+         # Extract and follow links
+         # links = response.xpath('//a/@href').getall()
+         # for link in links:
+         #     # keep only usable links
+         #     if link and not link.startswith(('javascript:', 'mailto:', '#')):
+         #         yield response.follow(
+         #             link,
+         #             callback=self.parse_detail,  # or self.parse to keep recursing
+         #             meta={'parent_url': response.url}  # pass the parent page along
+         #         )
+
+         # Extract links with CSS selectors
+         # for link in response.css('a.item-link::attr(href)').getall():
+         #     yield response.follow(link, callback=self.parse_detail)
+
+         # ================== Pagination examples ==================
+
+         # Follow a "next" link
+         # next_page = response.xpath('//a[@class="next"]/@href').get()
+         # if next_page:
+         #     yield response.follow(next_page, callback=self.parse)
+
+         # Numeric pagination
+         # current_page = int(response.meta.get('page', 1))
+         # max_pages = 100  # maximum number of pages
+         # if current_page < max_pages:
+         #     next_url = f'https://{{domain}}/page/{current_page + 1}'
+         #     yield Request(
+         #         url=next_url,
+         #         callback=self.parse,
+         #         meta={'page': current_page + 1}
+         #     )
+
+     def parse_detail(self, response):
+         """
+         Parse a detail page (optional).
+
+         Handles detail pages reached from list pages.
+         """
+         self.logger.info(f'Parsing detail page: {response.url}')
+
+         # parent_url = response.meta.get('parent_url', '')
+         #
+         # yield {
+         #     'title': response.xpath('//h1/text()').get(default=''),
+         #     'content': '\n'.join(response.xpath('//div[@class="content"]//text()').getall()),
+         #     'url': response.url,
+         #     'parent_url': parent_url,
+         #     'publish_time': response.xpath('//time/@datetime').get(),
+         # }
+
+         pass

-         # Example: extract links and follow them
-         # for href in response.xpath('//a/@href').getall():
-         #     yield response.follow(href, callback=self.parse)
+     def parse_error(self, failure):
+         """
+         Handle failed requests (optional).
+
+         Called when a request fails.
+         """
+         self.logger.error(f'Request failed: {failure.request.url} - {failure.value}')
+
+         # Optionally retry or record the failure
+         # yield {
+         #     'error_url': failure.request.url,
+         #     'error_message': str(failure.value),
+         #     'error_type': failure.type.__name__,
+         # }
+
+     def spider_opened(self, spider):
+         """
+         Callback invoked when the spider starts (optional).
+         """
+         self.logger.info(f'Spider {spider.name} opened')
+
+         # Initialization, e.g. connecting to a database or loading configuration
+         # self.database = self.connect_database()
+         # self.cookies = self.load_cookies()
+
+     def spider_closed(self, spider, reason):
+         """
+         Callback invoked when the spider closes (optional).
+         """
+         self.logger.info(f'Spider {spider.name} closed, reason: {reason}')
+
+         # Cleanup, e.g. closing database connections or saving state
+         # if hasattr(self, 'database'):
+         #     self.database.close()
+         # self.save_cookies()
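
Filled in for a concrete site, the generated spider reduces to something like the sketch below. The domain and selectors are hypothetical; only the Spider base class, the response.xpath/response.follow calls, and the class attributes shown in the template above are assumed:

from crawlo.spider import Spider


class QuotesSpider(Spider):
    """Hypothetical spider derived from the generated template."""
    name = 'quotes'
    allowed_domains = ['quotes.example.com']
    start_urls = ['https://quotes.example.com/']

    def parse(self, response):
        # Emit one plain-dict item per page, as the template's default parse() does.
        yield {
            'title': response.xpath('//title/text()').get(default=''),
            'url': response.url,
            'status_code': response.status_code,
        }

        # Follow pagination the same way the template's commented example does.
        next_page = response.xpath('//a[@class="next"]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)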