PyPI - crawlo - Versions diffs - 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl - Mend

crawlo 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crawlo might be problematic. Click here for more details.

Files changed (41)

crawlo/__version__.py +1 -1
crawlo/core/scheduler.py +20 -16
crawlo/downloader/httpx_downloader.py +14 -12
crawlo/exceptions.py +4 -0
crawlo/extension/__init__.py +17 -10
crawlo/extension/health_check.py +142 -0
crawlo/extension/log_interval.py +27 -18
crawlo/extension/log_stats.py +62 -24
crawlo/extension/logging_extension.py +18 -9
crawlo/extension/memory_monitor.py +89 -0
crawlo/extension/performance_profiler.py +118 -0
crawlo/extension/request_recorder.py +108 -0
crawlo/filters/aioredis_filter.py +2 -2
crawlo/middleware/retry.py +3 -3
crawlo/network/request.py +2 -2
crawlo/network/response.py +25 -23
crawlo/pipelines/__init__.py +9 -0
crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
crawlo/pipelines/database_dedup_pipeline.py +225 -0
crawlo/pipelines/memory_dedup_pipeline.py +116 -0
crawlo/pipelines/mongo_pipeline.py +81 -66
crawlo/pipelines/mysql_pipeline.py +165 -43
crawlo/pipelines/redis_dedup_pipeline.py +163 -0
crawlo/queue/queue_manager.py +4 -0
crawlo/queue/redis_priority_queue.py +20 -3
crawlo/settings/default_settings.py +119 -66
crawlo/subscriber.py +62 -37
crawlo/templates/project/items.py.tmpl +1 -1
crawlo/templates/project/middlewares.py.tmpl +73 -49
crawlo/templates/project/pipelines.py.tmpl +52 -290
crawlo/templates/project/run.py.tmpl +20 -7
crawlo/templates/project/settings.py.tmpl +35 -3
crawlo/templates/spider/spider.py.tmpl +1 -37
crawlo/utils/controlled_spider_mixin.py +109 -5
crawlo-1.1.4.dist-info/METADATA +403 -0
{crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/RECORD +40 -31
examples/controlled_spider_example.py +205 -0
crawlo-1.1.2.dist-info/METADATA +0 -567
{crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
{crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
{crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0

crawlo/utils/controlled_spider_mixin.py CHANGED Viewed

@@ -221,6 +221,59 @@ class AsyncControlledRequestMixin:
         self._monitoring_task = None
         self._stop_generation = False
+    def _original_start_requests(self) -> Generator[Request, None, None]:
+        """
+        子类应该实现这个方法，提供原始的请求生成逻辑
+        示例：
+        def _original_start_requests(self):
+            for i in range(50000):  # 5万个请求
+                yield Request(url=f"https://example.com/page/{i}")
+        """
+        raise NotImplementedError(
+            "子类必须实现 _original_start_requests() 方法，"
+            "或者确保原始的 start_requests() 方法存在"
+        )
+    def _get_original_requests(self) -> Generator[Request, None, None]:
+        """尝试获取原始请求（向后兼容）"""
+        # 这里可以尝试调用父类的 start_requests 或其他方式
+        # 具体实现取决于你的需求
+        return iter([])  # 默认返回空生成器
+    def _should_pause_generation(self) -> bool:
+        """判断是否应该暂停请求生成"""
+        # 检查队列大小（如果可以访问scheduler的话）
+        if hasattr(self, 'crawler') and self.crawler:
+            engine = getattr(self.crawler, 'engine', None)
+            if engine and engine.scheduler:
+                queue_size = len(engine.scheduler)
+                if queue_size > 200:  # 背压阈值
+                    return True
+        # 检查任务管理器负载
+        if hasattr(self, 'crawler') and self.crawler:
+            engine = getattr(self.crawler, 'engine', None)
+            if engine and engine.task_manager:
+                current_tasks = len(engine.task_manager.current_task)
+                concurrency = getattr(engine.task_manager, 'semaphore', None)
+                if concurrency and hasattr(concurrency, '_initial_value'):
+                    max_concurrency = concurrency._initial_value
+                    # 如果当前任务数接近最大并发数，暂停生成
+                    if current_tasks >= max_concurrency * 0.8:  # 80% 阈值
+                        return True
+        return False
+    def _process_request_before_yield(self, request: Request) -> Optional[Request]:
+        """
+        在 yield 请求前进行处理
+        子类可以重写这个方法来添加自定义逻辑
+        返回 None 表示跳过这个请求
+        """
+        return request
     async def start_requests_async(self) -> Generator[Request, None, None]:
         """异步版本的受控请求生成"""
         # 初始化信号量
@@ -239,12 +292,14 @@ class AsyncControlledRequestMixin:
                 batch.append(request)
                 if len(batch) >= 50:  # 批次大小
-                    yield from await self._process_async_batch(batch)
+                    async for request in self._process_async_batch(batch):
+                        yield request
                     batch = []
             # 处理剩余请求
             if batch:
-                yield from await self._process_async_batch(batch)
+                async for request in self._process_async_batch(batch):
+                    yield request
         finally:
             # 清理
@@ -298,7 +353,7 @@ class AsyncControlledRequestMixin:
 # 使用示例和文档
 USAGE_EXAMPLE = '''
-# 使用示例：
+# 同步版本使用示例：
 class MyControlledSpider(Spider, ControlledRequestMixin):
     name = 'controlled_spider'
@@ -326,11 +381,60 @@ class MyControlledSpider(Spider, ControlledRequestMixin):
         # 解析逻辑
         yield {"url": response.url}
+# 异步版本使用示例：
+class MyAsyncControlledSpider(Spider, AsyncControlledRequestMixin):
+    name = 'async_controlled_spider'
+    def __init__(self):
+        Spider.__init__(self)
+        AsyncControlledRequestMixin.__init__(self)
+        # 配置异步控制参数
+        self.max_concurrent_generations = 15
+        self.queue_monitor_interval = 0.5
+    def _original_start_requests(self):
+        """提供原始的大量请求"""
+        categories = ['tech', 'finance', 'sports']
+        for category in categories:
+            for page in range(1, 10000):  # 每个分类1万页
+                yield Request(
+                    url=f"https://news-site.com/{category}?page={page}",
+                    meta={'category': category}
+                )
+    def _process_request_before_yield(self, request):
+        """异步版本的请求预处理"""
+        # 根据分类设置优先级
+        category = request.meta.get('category', '')
+        if category == 'tech':
+            request.priority = 10
+        return request
+    async def parse(self, response):
+        # 异步解析逻辑
+        yield {
+            "url": response.url,
+            "category": response.meta['category']
+        }
 # 使用时：
 from crawlo.crawler import CrawlerProcess
+from crawlo.config import CrawloConfig
-process = CrawlerProcess()
-process.settings.set('CONCURRENCY', 16)  # 设置并发数
+# 同步版本
+config = CrawloConfig.standalone(concurrency=16)
+process = CrawlerProcess(config)
 process.crawl(MyControlledSpider)
 process.start()
+# 异步版本
+async_config = CrawloConfig.standalone(
+    concurrency=30,
+    downloader='httpx'  # 推荐使用支持异步的下载器
+)
+async_process = CrawlerProcess(async_config)
+async_process.crawl(MyAsyncControlledSpider)
+async_process.start()
 '''

crawlo-1.1.4.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,403 @@
+Metadata-Version: 2.4
+Name: crawlo
+Version: 1.1.4
+Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架，支持分布式抓取。
+Home-page: https://github.com/crawl-coder/Crawlo.git
+Author: crawl-coder
+Author-email: crawlo@qq.com
+License: MIT
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.12.14
+Requires-Dist: aiomysql>=0.2.0
+Requires-Dist: aioredis>=2.0.1
+Requires-Dist: asyncmy>=0.2.10
+Requires-Dist: cssselect>=1.2.0
+Requires-Dist: dateparser>=1.2.2
+Requires-Dist: httpx[http2]>=0.27.0
+Requires-Dist: curl-cffi>=0.13.0
+Requires-Dist: lxml>=5.2.1
+Requires-Dist: motor>=3.7.0
+Requires-Dist: parsel>=1.9.1
+Requires-Dist: pydantic>=2.11.7
+Requires-Dist: pymongo>=4.11
+Requires-Dist: PyMySQL>=1.1.1
+Requires-Dist: python-dateutil>=2.9.0.post0
+Requires-Dist: redis>=6.2.0
+Requires-Dist: requests>=2.32.4
+Requires-Dist: six>=1.17.0
+Requires-Dist: ujson>=5.9.0
+Requires-Dist: urllib3>=2.5.0
+Requires-Dist: w3lib>=2.1.2
+Requires-Dist: rich>=14.1.0
+Requires-Dist: astor>=0.8.1
+Requires-Dist: watchdog>=6.0.0
+Provides-Extra: render
+Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
+Requires-Dist: playwright; extra == "render"
+Requires-Dist: selenium>=3.141.0; extra == "render"
+Provides-Extra: all
+Requires-Dist: bitarray>=1.5.3; extra == "all"
+Requires-Dist: PyExecJS>=1.5.1; extra == "all"
+Requires-Dist: pymongo>=3.10.1; extra == "all"
+Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
+Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
+Requires-Dist: playwright; extra == "all"
+Requires-Dist: selenium>=3.141.0; extra == "all"
+# Crawlo - 异步分布式爬虫框架
+<div align="center">
+[![Python Version](https://img.shields.io/badge/python-3.8%2B-blue)](https://www.python.org/downloads/)
+[![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
+[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen)](https://crawlo.readthedocs.io/)
+一个基于 asyncio 的高性能异步分布式爬虫框架，支持单机和分布式部署。
+</div>
+## 🌟 特性
+- **异步高性能**: 基于 asyncio 实现，充分利用现代 CPU 多核性能
+- **分布式支持**: 内置 Redis 队列，轻松实现分布式部署
+- **模块化设计**: 中间件、管道、扩展组件系统，易于定制和扩展
+- **智能去重**: 多种去重策略（内存、Redis、Bloom Filter）
+- **灵活配置**: 支持多种配置方式，适应不同场景需求
+- **丰富文档**: 完整的中英文双语文档和示例项目
+## 🚀 快速开始
+### 安装
+```bash
+pip install crawlo
+```
+### 创建项目
+```bash
+crawlo startproject myproject
+cd myproject
+```
+### 编写爬虫
+```python
+from crawlo import Spider, Request, Item
+class MyItem(Item):
+    title = ''
+    url = ''
+class MySpider(Spider):
+    name = 'myspider'
+    async def start_requests(self):
+        yield Request('https://httpbin.org/get', callback=self.parse)
+    async def parse(self, response):
+        yield MyItem(
+            title='Example Title',
+            url=response.url
+        )
+```
+### 运行爬虫
+```bash
+crawlo crawl myspider
+```
+## 🏗️ 架构设计
+### 组件交互图
+```
+┌─────────────────────────────────────────────────────────────────────┐
+│                            Crawler                                  │
+│  ┌──────────────┐  ┌──────────────┐  ┌──────────────────────────┐  │
+│  │   Spider     │  │   Engine     │  │      ExtensionManager     │  │
+│  │              │  │              │  │                          │  │
+│  │ start_urls   │  │  Scheduler ◄─┼──┼──► StatsCollector         │  │
+│  │ parse()      │  │              │  │                          │  │
+│  │              │  │ Downloader ◄─┼──┼──► MiddlewareManager     │  │
+│  │              │  │              │  │                          │  │
+│  │              │  │ Processor  ◄─┼──┼──► PipelineManager       │  │
+│  └──────────────┘  └──────┬───────┘  └──────────────────────────┘  │
+└──────────────────────────┼─────────────────────────────────────────┘
+                           │
+        ┌──────────────────▼──────────────────┐
+        │         Scheduler                   │
+        │  ┌──────────────────────────────┐   │
+        │  │       QueueManager           │   │
+        │  │  ┌─────────┐  ┌────────────┐ │   │
+        │  │  │ Memory  │  │   Redis    │ │   │
+        │  │  │ Queue   │  │  Queue     │ │   │
+        │  │  └─────────┘  └────────────┘ │   │
+        │  └──────────────────────────────┘   │
+        │  ┌──────────────────────────────┐   │
+        │  │        Filter                │   │
+        │  │  ┌─────────┐  ┌────────────┐ │   │
+        │  │  │ Memory  │  │   Redis    │ │   │
+        │  │  │ Filter  │  │  Filter    │ │   │
+        │  │  └─────────┘  └────────────┘ │   │
+        │  └──────────────────────────────┘   │
+        └─────────────────────────────────────┘
+                           │
+        ┌──────────────────▼──────────────────┐
+        │         Downloader                  │
+        │  ┌──────────────────────────────┐   │
+        │  │    MiddlewareManager         │   │
+        │  │                              │   │
+        │  │ RequestMiddleware ◄────────┐ │   │
+        │  │ ResponseMiddleware        │ │   │
+        │  │ ExceptionMiddleware       │ │   │
+        │  │                          ╱  │   │
+        │  └─────────────────────────╱───┘   │
+        │                           ╱        │
+        │  ┌───────────────────────▼──┐      │
+        │  │  Download Implementations │      │
+        │  │  - AioHttpDownloader   │      │
+        │  │  - HttpXDownloader     │      │
+        │  │  - CurlCffiDownloader  │      │
+        │  └──────────────────────────┘      │
+        └─────────────────────────────────────┘
+                           │
+        ┌──────────────────▼──────────────────┐
+        │          Processor                  │
+        │  ┌──────────────────────────────┐   │
+        │  │    PipelineManager           │   │
+        │  │  ┌─────────────────────────┐ │   │
+        │  │  │   Pipeline Stages       │ │   │
+        │  │  │ - ValidationPipeline    │ │   │
+        │  │  │ - ProcessingPipeline    │ │   │
+        │  │  │ - StoragePipeline       │ │   │
+        │  │  └─────────────────────────┘ │   │
+        │  └──────────────────────────────┘   │
+        └─────────────────────────────────────┘
+```
+### 运行模式切换图
+```
+                    ┌─────────────────────┐
+                    │   ModeManager       │
+                    │  (运行模式管理器)    │
+                    └─────────┬───────────┘
+                              │
+        ┌─────────────────────┼─────────────────────┐
+        │                     │                     │
+        ▼                     ▼                     ▼
+┌───────────────┐    ┌─────────────────┐   ┌─────────────────┐
+│  Standalone   │    │   Distributed   │   │      Auto       │
+│   (单机模式)   │    │   (分布式模式)   │   │   (自动检测模式)  │
+└───────┬───────┘    └─────────┬───────┘   └─────────┬───────┘
+        │                      │                     │
+        ▼                      ▼                     ▼
+┌───────────────┐    ┌─────────────────┐   ┌─────────────────┐
+│ Memory Queue  │    │   Redis Queue   │   │  Auto Select    │
+│ Memory Filter │    │  Redis Filter   │   │ Memory/Redis    │
+└───────────────┘    └─────────────────┘   └─────────────────┘
+```
+### 数据流向图
+```
+┌─────────────┐    1.生成初始请求     ┌──────────────┐
+│   Spider    ├─────────────────────►│  Scheduler   │
+└─────────────┘                      └──────┬───────┘
+                                            │ 2.去重检查
+                                            ▼
+                                  ┌─────────────────┐
+                                  │     Filter      │
+                                  └─────────┬───────┘
+                                            │ 3.入队
+                                            ▼
+                                  ┌─────────────────┐
+                                  │      Queue      │
+                                  └─────────┬───────┘
+                                            │ 4.获取请求
+                                            ▼
+                                  ┌─────────────────┐    5.下载请求
+                                  │   Downloader    ├──────────────────┐
+                                  └─────────────────┘                  │
+                                            │ 6.解析响应              │
+                                            ▼                         ▼
+                                  ┌─────────────────┐    7.生成数据    ┌─────────────┐
+                                  │   Processor     ├────────────────►│   Pipeline  │
+                                  └─────────────────┘                 └─────────────┘
+                                            │ 8.存储数据
+                                            ▼
+                                  ┌─────────────────┐
+                                  │     Items       │
+                                  └─────────────────┘
+```
+### 模块层次结构图
+```
+crawlo/
+├── cli.py                          # 命令行接口
+├── crawler.py                      # 爬虫运行实例
+├── project.py                      # 项目管理
+├── config.py                       # 配置管理
+├── mode_manager.py                 # 运行模式管理器
+├── stats_collector.py              # 统计收集器
+├── subscriber.py                   # 事件订阅器
+├── task_manager.py                 # 任务管理器
+├── event.py                        # 事件定义
+├── exceptions.py                   # 异常定义
+│
+├── core/                           # 核心组件
+│   ├── engine.py                   # 引擎
+│   ├── scheduler.py                # 调度器
+│   └── processor.py                # 处理器
+│
+├── spider/                         # 爬虫基类
+│   └── __init__.py                 # 爬虫元类和基类
+│
+├── network/                        # 网络相关
+│   ├── request.py                  # 请求对象
+│   └── response.py                 # 响应对象
+│
+├── downloader/                     # 下载器
+│   ├── __init__.py                 # 下载器基类
+│   ├── aiohttp_downloader.py      # AioHttp实现
+│   ├── httpx_downloader.py        # HttpX实现
+│   └── cffi_downloader.py         # CurlCffi实现
+│
+├── queue/                          # 队列管理
+│   ├── __init__.py
+│   ├── queue_manager.py           # 队列管理器
+│   ├── pqueue.py                  # 内存优先队列
+│   └── redis_priority_queue.py    # Redis优先队列
+│
+├── filters/                        # 过滤器
+│   ├── __init__.py
+│   ├── base_filter.py             # 过滤器基类
+│   ├── memory_filter.py           # 内存过滤器
+│   └── aioredis_filter.py         # Redis过滤器
+│
+├── middleware/                     # 中间件
+│   ├── __init__.py
+│   ├── middleware_manager.py      # 中间件管理器
+│   ├── default_header.py          # 默认请求头
+│   ├── download_delay.py          # 下载延迟
+│   ├── proxy.py                   # 代理支持
+│   ├── request_ignore.py          # 请求忽略
+│   ├── response_code.py           # 响应码处理
+│   ├── response_filter.py         # 响应过滤
+│   └── retry.py                   # 重试机制
+│
+├── pipelines/                      # 数据管道
+│   ├── __init__.py
+│   ├── pipeline_manager.py        # 管道管理器
+│   ├── base_pipeline.py           # 管道基类
+│   ├── console_pipeline.py        # 控制台输出管道
+│   └── mysql_pipeline.py          # MySQL存储管道
+│
+├── extension/                      # 扩展组件
+│   ├── __init__.py
+│   ├── log_interval.py            # 定时日志
+│   ├── log_stats.py               # 统计日志
+│   ├── logging_extension.py       # 日志扩展
+│   ├── memory_monitor.py          # 内存监控
+│   └── performance_profiler.py    # 性能分析
+│
+├── settings/                       # 配置系统
+│   ├── __init__.py
+│   ├── default_settings.py        # 默认配置
+│   └── setting_manager.py         # 配置管理器
+│
+├── utils/                          # 工具库
+│   ├── __init__.py
+│   ├── log.py                     # 日志工具
+│   ├── request.py                 # 请求工具
+│   ├── request_serializer.py      # 请求序列化
+│   └── func_tools.py              # 函数工具
+│
+└── templates/                      # 模板文件
+    ├── project/
+    └── spider/
+```
+### 组件说明
+- **Crawler**: 爬虫运行实例，管理Spider与引擎的生命周期
+- **Engine**: 引擎组件，协调Scheduler、Downloader、Processor
+- **Scheduler**: 调度器，管理请求队列和去重过滤
+- **Downloader**: 下载器，负责网络请求，支持多种实现(aiohttp, httpx, curl-cffi)
+- **Processor**: 处理器，处理响应数据和管道
+- **QueueManager**: 统一的队列管理器，支持内存队列和Redis队列的自动切换
+- **Filter**: 请求去重过滤器，支持内存和Redis两种实现
+- **Middleware**: 中间件系统，处理请求/响应的预处理和后处理
+- **Pipeline**: 数据处理管道，支持多种存储方式(控制台、数据库等)
+- **Spider**: 爬虫基类，定义爬取逻辑
+### 运行模式
+Crawlo支持三种运行模式：
+- **standalone**: 单机模式，使用内存队列和内存过滤器
+- **distributed**: 分布式模式，使用Redis队列和Redis过滤器
+- **auto**: 自动检测模式，根据环境自动选择最佳运行方式
+## 🎛️ 配置系统
+### 传统配置方式
+```python
+# settings.py
+PROJECT_NAME = 'myproject'
+CONCURRENCY = 16
+DOWNLOAD_DELAY = 1.0
+QUEUE_TYPE = 'memory'  # 单机模式
+# QUEUE_TYPE = 'redis'   # 分布式模式
+```
+### 命令行配置
+```bash
+crawlo crawl myspider --concurrency=32 --delay=0.5
+```
+## 🧩 核心组件
+### 中间件系统
+灵活的中间件系统，支持请求预处理、响应处理和异常处理。
+### 管道系统
+可扩展的数据处理管道，支持多种存储方式（控制台、数据库等）。
+### 扩展组件
+功能增强扩展，包括日志、监控、性能分析等。
+### 过滤系统
+智能去重过滤，支持多种去重策略（内存、Redis、Bloom Filter）。
+## 📦 示例项目
+- [API数据采集](examples/api_data_collection/) - 简单的API数据采集示例
+- [电信设备许可证](examples/telecom_licenses_distributed/) - 分布式爬取示例
+## 📚 文档
+完整的文档请访问 [Crawlo Documentation](https://crawlo.readthedocs.io/)
+- [快速开始指南](docs/quick_start.md)
+- [框架文档](docs/crawlo_framework_documentation.md)
+- [API参考](docs/api_reference.md)
+- [分布式爬取教程](docs/distributed_crawling_tutorial.md)
+- [配置最佳实践](docs/configuration_best_practices.md)
+- [扩展组件](docs/extensions.md)
+## 🤝 贡献
+欢迎提交 Issue 和 Pull Request 来帮助改进 Crawlo！
+## 📄 许可证
+本项目采用 MIT 许可证，详情请见 [LICENSE](LICENSE) 文件。

crawlo 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl

Potentially problematic release.

crawlo 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl