crawlo 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (128)
  1. crawlo/__init__.py +34 -33
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +152 -126
  7. crawlo/commands/list.py +156 -147
  8. crawlo/commands/run.py +285 -285
  9. crawlo/commands/startproject.py +196 -111
  10. crawlo/commands/stats.py +188 -188
  11. crawlo/commands/utils.py +187 -0
  12. crawlo/config.py +280 -0
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -158
  15. crawlo/core/enhanced_engine.py +190 -0
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +166 -57
  18. crawlo/crawler.py +1028 -495
  19. crawlo/downloader/__init__.py +242 -78
  20. crawlo/downloader/aiohttp_downloader.py +212 -199
  21. crawlo/downloader/cffi_downloader.py +251 -241
  22. crawlo/downloader/httpx_downloader.py +259 -246
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +82 -78
  25. crawlo/extension/__init__.py +31 -31
  26. crawlo/extension/log_interval.py +49 -49
  27. crawlo/extension/log_stats.py +44 -44
  28. crawlo/extension/logging_extension.py +34 -34
  29. crawlo/filters/__init__.py +154 -37
  30. crawlo/filters/aioredis_filter.py +242 -150
  31. crawlo/filters/memory_filter.py +269 -202
  32. crawlo/items/__init__.py +23 -23
  33. crawlo/items/base.py +21 -21
  34. crawlo/items/fields.py +53 -53
  35. crawlo/items/items.py +104 -104
  36. crawlo/middleware/__init__.py +21 -21
  37. crawlo/middleware/default_header.py +32 -32
  38. crawlo/middleware/download_delay.py +28 -28
  39. crawlo/middleware/middleware_manager.py +135 -135
  40. crawlo/middleware/proxy.py +248 -245
  41. crawlo/middleware/request_ignore.py +30 -30
  42. crawlo/middleware/response_code.py +18 -18
  43. crawlo/middleware/response_filter.py +26 -26
  44. crawlo/middleware/retry.py +125 -90
  45. crawlo/mode_manager.py +201 -0
  46. crawlo/network/__init__.py +21 -7
  47. crawlo/network/request.py +311 -203
  48. crawlo/network/response.py +271 -166
  49. crawlo/pipelines/__init__.py +22 -13
  50. crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
  51. crawlo/pipelines/console_pipeline.py +39 -39
  52. crawlo/pipelines/csv_pipeline.py +317 -0
  53. crawlo/pipelines/database_dedup_pipeline.py +225 -0
  54. crawlo/pipelines/json_pipeline.py +219 -0
  55. crawlo/pipelines/memory_dedup_pipeline.py +116 -0
  56. crawlo/pipelines/mongo_pipeline.py +116 -116
  57. crawlo/pipelines/mysql_pipeline.py +195 -195
  58. crawlo/pipelines/pipeline_manager.py +56 -56
  59. crawlo/pipelines/redis_dedup_pipeline.py +163 -0
  60. crawlo/project.py +153 -153
  61. crawlo/queue/__init__.py +0 -0
  62. crawlo/queue/pqueue.py +37 -0
  63. crawlo/queue/queue_manager.py +308 -0
  64. crawlo/queue/redis_priority_queue.py +209 -0
  65. crawlo/settings/__init__.py +7 -7
  66. crawlo/settings/default_settings.py +245 -167
  67. crawlo/settings/setting_manager.py +99 -99
  68. crawlo/spider/__init__.py +639 -129
  69. crawlo/stats_collector.py +59 -59
  70. crawlo/subscriber.py +106 -106
  71. crawlo/task_manager.py +30 -27
  72. crawlo/templates/crawlo.cfg.tmpl +10 -10
  73. crawlo/templates/project/__init__.py.tmpl +3 -3
  74. crawlo/templates/project/items.py.tmpl +17 -17
  75. crawlo/templates/project/middlewares.py.tmpl +87 -76
  76. crawlo/templates/project/pipelines.py.tmpl +342 -64
  77. crawlo/templates/project/run.py.tmpl +252 -0
  78. crawlo/templates/project/settings.py.tmpl +251 -54
  79. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  80. crawlo/templates/spider/spider.py.tmpl +178 -32
  81. crawlo/utils/__init__.py +7 -7
  82. crawlo/utils/controlled_spider_mixin.py +440 -0
  83. crawlo/utils/date_tools.py +233 -233
  84. crawlo/utils/db_helper.py +343 -343
  85. crawlo/utils/func_tools.py +82 -82
  86. crawlo/utils/large_scale_config.py +287 -0
  87. crawlo/utils/large_scale_helper.py +344 -0
  88. crawlo/utils/log.py +128 -128
  89. crawlo/utils/queue_helper.py +176 -0
  90. crawlo/utils/request.py +267 -267
  91. crawlo/utils/request_serializer.py +220 -0
  92. crawlo/utils/spider_loader.py +62 -62
  93. crawlo/utils/system.py +11 -11
  94. crawlo/utils/tools.py +4 -4
  95. crawlo/utils/url.py +39 -39
  96. crawlo-1.1.3.dist-info/METADATA +635 -0
  97. crawlo-1.1.3.dist-info/RECORD +113 -0
  98. examples/__init__.py +7 -7
  99. examples/controlled_spider_example.py +205 -0
  100. tests/__init__.py +7 -7
  101. tests/test_final_validation.py +154 -0
  102. tests/test_proxy_health_check.py +32 -32
  103. tests/test_proxy_middleware_integration.py +136 -136
  104. tests/test_proxy_providers.py +56 -56
  105. tests/test_proxy_stats.py +19 -19
  106. tests/test_proxy_strategies.py +59 -59
  107. tests/test_redis_config.py +29 -0
  108. tests/test_redis_queue.py +225 -0
  109. tests/test_request_serialization.py +71 -0
  110. tests/test_scheduler.py +242 -0
  111. crawlo/pipelines/mysql_batch_pipline.py +0 -273
  112. crawlo/utils/pqueue.py +0 -174
  113. crawlo-1.1.1.dist-info/METADATA +0 -220
  114. crawlo-1.1.1.dist-info/RECORD +0 -100
  115. examples/baidu_spider/__init__.py +0 -7
  116. examples/baidu_spider/demo.py +0 -94
  117. examples/baidu_spider/items.py +0 -46
  118. examples/baidu_spider/middleware.py +0 -49
  119. examples/baidu_spider/pipeline.py +0 -55
  120. examples/baidu_spider/run.py +0 -27
  121. examples/baidu_spider/settings.py +0 -121
  122. examples/baidu_spider/spiders/__init__.py +0 -7
  123. examples/baidu_spider/spiders/bai_du.py +0 -61
  124. examples/baidu_spider/spiders/miit.py +0 -159
  125. examples/baidu_spider/spiders/sina.py +0 -79
  126. {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/WHEEL +0 -0
  127. {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/entry_points.txt +0 -0
  128. {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/top_level.txt +0 -0
crawlo/templates/project/settings.py.tmpl CHANGED
@@ -1,54 +1,251 @@
- # -*- coding: UTF-8 -*-
- """Auto-generated settings.py file"""
-
- PROJECT_NAME = '{{project_name}}'
- VERSION = '1.0'
-
- # ============================== Network request configuration ==============================
- DOWNLOADER = "crawlo.downloader.aiohttp_downloader.AioHttpDownloader"
- DOWNLOAD_TIMEOUT = 60
- VERIFY_SSL = True
- USE_SESSION = True
-
- DOWNLOAD_DELAY = 1.0
- RANDOMNESS = True
-
- MAX_RETRY_TIMES = 3
- RETRY_HTTP_CODES = [408, 429, 500, 502, 503, 504, 522, 524]
- IGNORE_HTTP_CODES = [403, 404]
-
- CONNECTION_POOL_LIMIT = 100
-
- # ============================== Concurrency & scheduling ==============================
- CONCURRENCY = 8
- MAX_RUNNING_SPIDERS = 3
-
- # ============================== Data storage ==============================
- MYSQL_HOST = '127.0.0.1'
- MYSQL_PORT = 3306
- MYSQL_USER = 'root'
- MYSQL_PASSWORD = '123456'
- MYSQL_DB = '{{project_name}}'
- MYSQL_TABLE = 'crawled_data'
-
- # ============================== Deduplication ==============================
- FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
-
- # ============================== Middleware & pipelines ==============================
- MIDDLEWARES = [
-     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
-     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
-     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
-     'crawlo.middleware.proxy.ProxyMiddleware',
-     'crawlo.middleware.retry.RetryMiddleware',
-     'crawlo.middleware.response_code.ResponseCodeMiddleware',
-     'crawlo.middleware.response_filter.ResponseFilterMiddleware',
- ]
-
- PIPELINES = [
-     'crawlo.pipelines.console_pipeline.ConsolePipeline',
- ]
-
- # ============================== Logging ==============================
- LOG_LEVEL = 'INFO'
- LOG_FILE = f'logs/{{{project_name}}}.log'
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}} project configuration file
+ =============================
+ Spider project configuration based on the Crawlo framework.
+
+ 🎯 Quick start:
+
+ # Option 1: use the default standalone mode (recommended)
+ from crawlo.crawler import CrawlerProcess
+ process = CrawlerProcess()  # no configuration needed
+
+ # Option 2: use the configuration factory
+ from crawlo.config import CrawloConfig
+ config = CrawloConfig.standalone()  # standalone mode
+ config = CrawloConfig.distributed(redis_host='192.168.1.100')  # distributed mode
+ process = CrawlerProcess(settings=config.to_dict())
+
+ # Option 3: use environment variables
+ from crawlo.config import CrawloConfig
+ config = CrawloConfig.from_env()  # read from environment variables
+ """
+ import os
+ from crawlo.config import CrawloConfig
+
+ # ============================== Project info ==============================
+ PROJECT_NAME = '{{project_name}}'
+ VERSION = '1.0.0'
+
+ # ============================== Run mode selection ==============================
+
+ # 🎯 Pick one configuration style:
+
+ # Option 1: configuration factory (recommended)
+ # Standalone mode (default)
+ CONFIG = CrawloConfig.standalone(
+     concurrency=8,
+     download_delay=1.0
+ )
+
+ # Distributed mode (uncomment and set the Redis address)
+ # CONFIG = CrawloConfig.distributed(
+ #     redis_host='127.0.0.1',
+ #     redis_password='your_password',  # if a password is set
+ #     project_name='{{project_name}}',
+ #     concurrency=16,
+ #     download_delay=1.0
+ # )
+
+ # Auto-detect mode
+ # CONFIG = CrawloConfig.auto(concurrency=12)
+
+ # Option 2: read from environment variables (good for deployment)
+ # CONFIG = CrawloConfig.from_env()
+
+ # Option 3: use preset configurations
+ # from crawlo.config import Presets
+ # CONFIG = Presets.development()  # development environment
+ # CONFIG = Presets.production()   # production environment
+
+ # Apply the final configuration
+ locals().update(CONFIG.to_dict())
+
+ # ============================== Network request configuration ==============================
+
+ # Downloader selection (CurlCffi recommended; supports browser fingerprint emulation)
+ DOWNLOADER = "crawlo.downloader.cffi_downloader.CurlCffiDownloader"  # browser fingerprinting
+ # DOWNLOADER = "crawlo.downloader.aiohttp_downloader.AioHttpDownloader"  # lightweight option
+ # DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"  # HTTP/2 support
+
+ # Request timeout and security
+ DOWNLOAD_TIMEOUT = 30
+ VERIFY_SSL = True
+ USE_SESSION = True
+
+ # Request delay control (anti-scraping countermeasures)
+ DOWNLOAD_DELAY = 1.0
+ RANDOM_RANGE = (0.8, 1.2)
+ RANDOMNESS = True
+
+ # Retry policy
+ MAX_RETRY_TIMES = 3
+ RETRY_PRIORITY = -1
+ RETRY_HTTP_CODES = [408, 429, 500, 502, 503, 504, 522, 524]
+ IGNORE_HTTP_CODES = [403, 404]
+ ALLOWED_CODES = []
+
+ # Connection pool configuration
+ CONNECTION_POOL_LIMIT = 50
+ DOWNLOAD_MAXSIZE = 10 * 1024 * 1024  # 10MB
+ DOWNLOAD_WARN_SIZE = 1024 * 1024  # 1MB
+
+ # ============================== Concurrency & scheduling ==============================
+ CONCURRENCY = 8
+ INTERVAL = 5
+ DEPTH_PRIORITY = 1
+ MAX_RUNNING_SPIDERS = 3
+
+ # ============================== Queue configuration (distributed-capable) ==============================
+
+ # Queue type: 'auto' (auto-select), 'memory' (in-memory queue), 'redis' (distributed queue)
+ QUEUE_TYPE = 'auto'
+ SCHEDULER_MAX_QUEUE_SIZE = 2000
+ SCHEDULER_QUEUE_NAME = f'{{project_name}}:requests'
+ QUEUE_MAX_RETRIES = 3
+ QUEUE_TIMEOUT = 300
+
+ # Large-scale crawling tuning
+ LARGE_SCALE_BATCH_SIZE = 1000
+ LARGE_SCALE_CHECKPOINT_INTERVAL = 5000
+ LARGE_SCALE_MAX_MEMORY_USAGE = 500
+
+ # ============================== Data storage configuration ==============================
+
+ # --- MySQL ---
+ MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
+ MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
+ MYSQL_USER = os.getenv('MYSQL_USER', 'root')
+ MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
+ MYSQL_DB = os.getenv('MYSQL_DB', '{{project_name}}')
+ MYSQL_TABLE = '{{project_name}}_data'
+ MYSQL_BATCH_SIZE = 100
+
+ # MySQL connection pool
+ MYSQL_FLUSH_INTERVAL = 5
+ MYSQL_POOL_MIN = 5
+ MYSQL_POOL_MAX = 20
+ MYSQL_ECHO = False
+
+ # --- MongoDB ---
+ MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
+ MONGO_DATABASE = f'{{project_name}}_db'
+ MONGO_COLLECTION = '{{project_name}}_items'
+ MONGO_MAX_POOL_SIZE = 200
+ MONGO_MIN_POOL_SIZE = 20
+
+ # ============================== Deduplication filter configuration ==============================
+
+ REQUEST_DIR = '.'
+
+ # Dedup filter (the Redis filter is recommended for distributed projects)
+ FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFilter'
+ # FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'  # distributed dedup
+
+ # --- Redis (used for distributed dedup and queues) ---
+ REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
+ REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
+ REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', '')
+
+ # Build the URL depending on whether a password is set
+ if REDIS_PASSWORD:
+     REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/0'
+ else:
+     REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/0'
+
+ REDIS_KEY = f'{{project_name}}:fingerprint'
+ REDIS_TTL = 0
+ CLEANUP_FP = 0
+ FILTER_DEBUG = True
+ DECODE_RESPONSES = True
+
+ # ============================== Middleware configuration ==============================
+
+ MIDDLEWARES = [
+     # === Request pre-processing stage ===
+     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
+     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
+     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
+     'crawlo.middleware.proxy.ProxyMiddleware',
+
+     # === Response handling stage ===
+     'crawlo.middleware.retry.RetryMiddleware',
+     'crawlo.middleware.response_code.ResponseCodeMiddleware',
+     'crawlo.middleware.response_filter.ResponseFilterMiddleware',
+ ]
+
+ # ============================== Item pipeline configuration ==============================
+
+ PIPELINES = [
+     # The default dedup pipeline is selected automatically by run mode:
+     #   standalone:  crawlo.pipelines.MemoryDedupPipeline
+     #   distributed: crawlo.pipelines.RedisDedupPipeline
+     'crawlo.pipelines.console_pipeline.ConsolePipeline',
+     # '{{project_name}}.pipelines.DatabasePipeline',  # custom database pipeline
+     # 'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage
+     # 'crawlo.pipelines.mongo_pipeline.MongoPipeline',  # MongoDB storage
+ ]
+
+ # ============================== Extensions ==============================
+
+ EXTENSIONS = [
+     'crawlo.extension.log_interval.LogIntervalExtension',
+     'crawlo.extension.log_stats.LogStats',
+     'crawlo.extension.logging_extension.CustomLoggerExtension',
+ ]
+
+ # ============================== Logging configuration ==============================
+
+ LOG_LEVEL = 'INFO'
+ STATS_DUMP = True
+ LOG_FILE = f'logs/{{project_name}}.log'
+ LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
+ LOG_ENCODING = 'utf-8'
+
+ # ============================== Proxy configuration ==============================
+
+ PROXY_ENABLED = False
+ PROXY_API_URL = ""  # fill in a real proxy API endpoint
+ PROXY_EXTRACTOR = "proxy"
+ PROXY_REFRESH_INTERVAL = 60
+ PROXY_API_TIMEOUT = 10
+
+ # ============================== Browser fingerprint configuration ==============================
+
+ # Settings specific to the CurlCffi downloader
+ CURL_BROWSER_TYPE = "chrome"
+ CURL_BROWSER_VERSION_MAP = {
+     "chrome": "chrome136",
+     "edge": "edge101",
+     "safari": "safari184",
+     "firefox": "firefox135",
+ }
+
+ # Default request headers
+ DEFAULT_REQUEST_HEADERS = {
+     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+                   '(KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
+     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+     'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+     'Accept-Encoding': 'gzip, deflate, br',
+     'Connection': 'keep-alive',
+     'Upgrade-Insecure-Requests': '1',
+ }
+
+ # ============================== Development & debugging ==============================
+
+ # Development mode flags
+ DEBUG = False
+ TESTING = False
+
+ # Performance monitoring
+ ENABLE_PERFORMANCE_MONITORING = True
+ MEMORY_USAGE_WARNING_THRESHOLD = 500  # MB
+
+ # ============================== Custom configuration area ==============================
+ # Add project-specific settings here
+
+ # Example: target-site-specific settings
+ # TARGET_DOMAIN = '{{domain}}'
+ # MAX_PAGES_PER_DOMAIN = 10000
+ # CUSTOM_RATE_LIMIT = 1.5
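The new settings template is built around the CrawloConfig factory instead of a flat list of constants. As a rough illustration of how a rendered project might switch modes at deploy time, the sketch below sticks to the calls shown in the template docstring above (CrawloConfig.standalone, CrawloConfig.distributed, config.to_dict(), CrawlerProcess(settings=...)); the CRAWLO_MODE environment variable and the "myproject" name are assumptions for the example, not part of crawlo's API.

```python
# Minimal sketch for a hypothetical rendered project called "myproject".
import os

from crawlo.config import CrawloConfig
from crawlo.crawler import CrawlerProcess


def build_process() -> CrawlerProcess:
    """Pick a run mode from an (assumed) CRAWLO_MODE environment variable."""
    mode = os.getenv("CRAWLO_MODE", "standalone")

    if mode == "distributed":
        # Distributed mode: requests and fingerprints are shared through Redis.
        config = CrawloConfig.distributed(
            redis_host=os.getenv("REDIS_HOST", "127.0.0.1"),
            redis_password=os.getenv("REDIS_PASSWORD", ""),
            project_name="myproject",
            concurrency=16,
            download_delay=1.0,
        )
    else:
        # Standalone mode: in-memory queue and filter, no external services.
        config = CrawloConfig.standalone(concurrency=8, download_delay=1.0)

    # CrawlerProcess accepts the factory output as a plain settings dict.
    return CrawlerProcess(settings=config.to_dict())


process = build_process()
```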
crawlo/templates/project/spiders/__init__.py.tmpl CHANGED
@@ -1,6 +1,6 @@
- # -*- coding: UTF-8 -*-
- """
- {{project_name}}.spiders
- ========================
- All spiders are stored here.
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}}.spiders
+ ========================
+ All spiders are stored here.
  """
crawlo/templates/spider/spider.py.tmpl CHANGED
@@ -1,32 +1,178 @@
- # -*- coding: UTF-8 -*-
- """
- {{project_name}}.spiders.{{spider_name}}
- =======================================
- Spider generated by the `crawlo genspider` command.
- """
-
- from crawlo.spider import Spider
-
-
- class {{class_name}}(Spider):
-     """
-     Spider: {{spider_name}}
-     """
-     name = '{{spider_name}}'
-     allowed_domains = ['{{domain}}']
-     start_urls = ['https://{{domain}}/']
-
-     def parse(self, response):
-         """
-         Main method for parsing responses.
-         """
-         # TODO: write your parsing logic here
-
-         # Example: extract data
-         # item = {{item_class}}()
-         # item['title'] = response.xpath('//title/text()').get()
-         # yield item
-
-         # Example: extract links and follow them
-         # for href in response.xpath('//a/@href').getall():
-         #     yield response.follow(href, callback=self.parse)
+ # -*- coding: UTF-8 -*-
+ """
+ {{project_name}}.spiders.{{spider_name}}
+ =======================================
+ Spider generated by the `crawlo genspider` command.
+ Built on the Crawlo framework; supports async concurrency, distributed crawling, and more.
+
+ Usage:
+     crawlo crawl {{spider_name}}
+ """
+
+ from crawlo.spider import Spider
+ from crawlo import Request
+ # from {{project_name}}.items import {{item_class}}  # optional: import the item class
+
+
+ class {{class_name}}(Spider):
+     """
+     Spider: {{spider_name}}
+
+     Features:
+     - concurrent crawling
+     - automatic deduplication
+     - error retry mechanism
+     - item pipeline processing
+     """
+     name = '{{spider_name}}'
+     allowed_domains = ['{{domain}}']
+     start_urls = ['https://{{domain}}/']
+
+     # Advanced settings (optional)
+     # custom_settings = {
+     #     'DOWNLOAD_DELAY': 2.0,
+     #     'CONCURRENCY': 4,
+     #     'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
+     # }
+
+     def start_requests(self):
+         """
+         Generate the initial requests.
+
+         Custom headers, proxies, priorities, and more are supported.
+         """
+         headers = {
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+             'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
+         }
+
+         for url in self.start_urls:
+             yield Request(
+                 url=url,
+                 callback=self.parse,
+                 headers=headers,
+                 # meta={'proxy': 'http://proxy.example.com:8080'},  # custom proxy
+                 # priority=10,  # request priority (larger numbers run first)
+             )
+
+     def parse(self, response):
+         """
+         Main method for parsing responses.
+
+         Args:
+             response: response object containing the page content and metadata
+
+         Yields:
+             Request: new request objects (for deeper crawling)
+             Item: item objects (for data storage)
+         """
+         self.logger.info(f'Parsing page: {response.url}')
+
+         # ================== Data extraction examples ==================
+
+         # Extract data and create an Item
+         # item = {{item_class}}()
+         # item['title'] = response.xpath('//title/text()').get(default='')
+         # item['url'] = response.url
+         # item['content'] = response.xpath('//div[@class="content"]//text()').getall()
+         # yield item
+
+         # Yield a plain dict (simple data)
+         yield {
+             'title': response.xpath('//title/text()').get(default=''),
+             'url': response.url,
+             'status_code': response.status_code,
+             # 'description': response.xpath('//meta[@name="description"]/@content').get(),
+             # 'keywords': response.xpath('//meta[@name="keywords"]/@content').get(),
+         }
+
+         # ================== Link extraction examples ==================
+
+         # Extract and follow links
+         # links = response.xpath('//a/@href').getall()
+         # for link in links:
+         #     # keep only valid links
+         #     if link and not link.startswith(('javascript:', 'mailto:', '#')):
+         #         yield response.follow(
+         #             link,
+         #             callback=self.parse_detail,  # or self.parse to keep recursing
+         #             meta={'parent_url': response.url}  # pass along the parent page
+         #         )
+
+         # Extract links with CSS selectors
+         # for link in response.css('a.item-link::attr(href)').getall():
+         #     yield response.follow(link, callback=self.parse_detail)
+
+         # ================== Pagination examples ==================
+
+         # Follow a "next page" link
+         # next_page = response.xpath('//a[@class="next"]/@href').get()
+         # if next_page:
+         #     yield response.follow(next_page, callback=self.parse)
+
+         # Numeric pagination
+         # current_page = int(response.meta.get('page', 1))
+         # max_pages = 100  # maximum number of pages
+         # if current_page < max_pages:
+         #     next_url = f'https://{{domain}}/page/{current_page + 1}'
+         #     yield Request(
+         #         url=next_url,
+         #         callback=self.parse,
+         #         meta={'page': current_page + 1}
+         #     )
+
+     def parse_detail(self, response):
+         """
+         Parse a detail page (optional).
+
+         Handles detail pages reached from a listing page.
+         """
+         self.logger.info(f'Parsing detail page: {response.url}')
+
+         # parent_url = response.meta.get('parent_url', '')
+         #
+         # yield {
+         #     'title': response.xpath('//h1/text()').get(default=''),
+         #     'content': '\n'.join(response.xpath('//div[@class="content"]//text()').getall()),
+         #     'url': response.url,
+         #     'parent_url': parent_url,
+         #     'publish_time': response.xpath('//time/@datetime').get(),
+         # }
+
+         pass
+
+     def parse_error(self, failure):
+         """
+         Handle a failed request (optional).
+
+         Called when a request fails.
+         """
+         self.logger.error(f'Request failed: {failure.request.url} - {failure.value}')
+
+         # Optionally retry or record the failure
+         # yield {
+         #     'error_url': failure.request.url,
+         #     'error_message': str(failure.value),
+         #     'error_type': failure.type.__name__,
+         # }
+
+     def spider_opened(self, spider):
+         """
+         Callback invoked when the spider starts (optional).
+         """
+         self.logger.info(f'Spider {spider.name} started')
+
+         # Initialisation, e.g. connect to a database, load configuration
+         # self.database = self.connect_database()
+         # self.cookies = self.load_cookies()
+
+     def spider_closed(self, spider, reason):
+         """
+         Callback invoked when the spider closes (optional).
+         """
+         self.logger.info(f'Spider {spider.name} closed, reason: {reason}')
+
+         # Cleanup, e.g. close database connections, persist state
+         # if hasattr(self, 'database'):
+         #     self.database.close()
+         # self.save_cookies()
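For concreteness, this is roughly what a generated file looks like once `crawlo genspider` fills in the placeholders. The project, spider, and domain names below are made up for illustration, and the sketch keeps to the calls that appear in the template itself (Spider, Request, response.xpath, self.logger).

```python
# Hypothetical rendering of spider.py.tmpl with spider_name="quotes"
# and domain="quotes.toscrape.com".
from crawlo.spider import Spider
from crawlo import Request


class QuotesSpider(Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['https://quotes.toscrape.com/']

    def start_requests(self):
        # Same shape as the template: explicit Request objects with headers.
        headers = {'Accept-Language': 'en-US,en;q=0.5'}
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse, headers=headers)

    def parse(self, response):
        self.logger.info(f'Parsing page: {response.url}')
        # Minimal item: the same dict the template yields by default.
        yield {
            'title': response.xpath('//title/text()').get(default=''),
            'url': response.url,
            'status_code': response.status_code,
        }
```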
crawlo/utils/__init__.py CHANGED
@@ -1,7 +1,7 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time : 2025-02-05 13:57
- # @Author : oscar
- # @Desc : None
- """
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-02-05 13:57
+ # @Author : oscar
+ # @Desc : None
+ """