crawlo 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (115)
  1. crawlo/__init__.py +28 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +61 -0
  4. crawlo/cleaners/data_formatter.py +226 -0
  5. crawlo/cleaners/encoding_converter.py +126 -0
  6. crawlo/cleaners/text_cleaner.py +233 -0
  7. crawlo/commands/startproject.py +117 -13
  8. crawlo/config.py +30 -0
  9. crawlo/config_validator.py +253 -0
  10. crawlo/core/engine.py +185 -11
  11. crawlo/core/scheduler.py +49 -78
  12. crawlo/crawler.py +6 -6
  13. crawlo/downloader/__init__.py +24 -0
  14. crawlo/downloader/aiohttp_downloader.py +8 -0
  15. crawlo/downloader/cffi_downloader.py +5 -0
  16. crawlo/downloader/hybrid_downloader.py +214 -0
  17. crawlo/downloader/playwright_downloader.py +403 -0
  18. crawlo/downloader/selenium_downloader.py +473 -0
  19. crawlo/extension/__init__.py +17 -10
  20. crawlo/extension/health_check.py +142 -0
  21. crawlo/extension/log_interval.py +27 -18
  22. crawlo/extension/log_stats.py +62 -24
  23. crawlo/extension/logging_extension.py +18 -9
  24. crawlo/extension/memory_monitor.py +105 -0
  25. crawlo/extension/performance_profiler.py +134 -0
  26. crawlo/extension/request_recorder.py +108 -0
  27. crawlo/filters/aioredis_filter.py +50 -12
  28. crawlo/middleware/proxy.py +26 -2
  29. crawlo/mode_manager.py +24 -19
  30. crawlo/network/request.py +30 -3
  31. crawlo/network/response.py +114 -25
  32. crawlo/pipelines/mongo_pipeline.py +81 -66
  33. crawlo/pipelines/mysql_pipeline.py +165 -43
  34. crawlo/pipelines/redis_dedup_pipeline.py +7 -3
  35. crawlo/queue/queue_manager.py +15 -2
  36. crawlo/queue/redis_priority_queue.py +144 -76
  37. crawlo/settings/default_settings.py +93 -121
  38. crawlo/subscriber.py +62 -37
  39. crawlo/templates/project/items.py.tmpl +1 -1
  40. crawlo/templates/project/middlewares.py.tmpl +73 -49
  41. crawlo/templates/project/pipelines.py.tmpl +51 -295
  42. crawlo/templates/project/settings.py.tmpl +93 -17
  43. crawlo/templates/project/settings_distributed.py.tmpl +120 -0
  44. crawlo/templates/project/settings_gentle.py.tmpl +95 -0
  45. crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
  46. crawlo/templates/project/settings_simple.py.tmpl +69 -0
  47. crawlo/templates/spider/spider.py.tmpl +2 -38
  48. crawlo/tools/__init__.py +183 -0
  49. crawlo/tools/anti_crawler.py +269 -0
  50. crawlo/tools/authenticated_proxy.py +241 -0
  51. crawlo/tools/data_validator.py +181 -0
  52. crawlo/tools/date_tools.py +36 -0
  53. crawlo/tools/distributed_coordinator.py +387 -0
  54. crawlo/tools/retry_mechanism.py +221 -0
  55. crawlo/tools/scenario_adapter.py +263 -0
  56. crawlo/utils/__init__.py +29 -1
  57. crawlo/utils/batch_processor.py +261 -0
  58. crawlo/utils/date_tools.py +58 -1
  59. crawlo/utils/enhanced_error_handler.py +360 -0
  60. crawlo/utils/env_config.py +106 -0
  61. crawlo/utils/error_handler.py +126 -0
  62. crawlo/utils/performance_monitor.py +285 -0
  63. crawlo/utils/redis_connection_pool.py +335 -0
  64. crawlo/utils/redis_key_validator.py +200 -0
  65. crawlo-1.1.5.dist-info/METADATA +401 -0
  66. crawlo-1.1.5.dist-info/RECORD +185 -0
  67. tests/advanced_tools_example.py +276 -0
  68. tests/authenticated_proxy_example.py +237 -0
  69. tests/cleaners_example.py +161 -0
  70. tests/config_validation_demo.py +103 -0
  71. tests/date_tools_example.py +181 -0
  72. tests/dynamic_loading_example.py +524 -0
  73. tests/dynamic_loading_test.py +105 -0
  74. tests/env_config_example.py +134 -0
  75. tests/error_handling_example.py +172 -0
  76. tests/redis_key_validation_demo.py +131 -0
  77. tests/response_improvements_example.py +145 -0
  78. tests/test_advanced_tools.py +149 -0
  79. tests/test_all_redis_key_configs.py +146 -0
  80. tests/test_authenticated_proxy.py +142 -0
  81. tests/test_cleaners.py +55 -0
  82. tests/test_comprehensive.py +147 -0
  83. tests/test_config_validator.py +194 -0
  84. tests/test_date_tools.py +124 -0
  85. tests/test_dynamic_downloaders_proxy.py +125 -0
  86. tests/test_dynamic_proxy.py +93 -0
  87. tests/test_dynamic_proxy_config.py +147 -0
  88. tests/test_dynamic_proxy_real.py +110 -0
  89. tests/test_edge_cases.py +304 -0
  90. tests/test_enhanced_error_handler.py +271 -0
  91. tests/test_env_config.py +122 -0
  92. tests/test_error_handler_compatibility.py +113 -0
  93. tests/test_framework_env_usage.py +104 -0
  94. tests/test_integration.py +357 -0
  95. tests/test_item_dedup_redis_key.py +123 -0
  96. tests/test_parsel.py +30 -0
  97. tests/test_performance.py +328 -0
  98. tests/test_queue_manager_redis_key.py +177 -0
  99. tests/test_redis_connection_pool.py +295 -0
  100. tests/test_redis_key_naming.py +182 -0
  101. tests/test_redis_key_validator.py +124 -0
  102. tests/test_response_improvements.py +153 -0
  103. tests/test_simple_response.py +62 -0
  104. tests/test_telecom_spider_redis_key.py +206 -0
  105. tests/test_template_content.py +88 -0
  106. tests/test_template_redis_key.py +135 -0
  107. tests/test_tools.py +154 -0
  108. tests/tools_example.py +258 -0
  109. crawlo/core/enhanced_engine.py +0 -190
  110. crawlo-1.1.3.dist-info/METADATA +0 -635
  111. crawlo-1.1.3.dist-info/RECORD +0 -113
  112. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/WHEEL +0 -0
  113. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/entry_points.txt +0 -0
  114. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/top_level.txt +0 -0
  115. {examples → tests}/controlled_spider_example.py +0 -0
@@ -1,635 +0,0 @@
- Metadata-Version: 2.4
- Name: crawlo
- Version: 1.1.3
- Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
- Home-page: https://github.com/crawl-coder/Crawlo.git
- Author: crawl-coder
- Author-email: crawlo@qq.com
- License: MIT
- Classifier: Programming Language :: Python :: 3
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: OS Independent
- Requires-Python: >=3.6
- Description-Content-Type: text/markdown
- Requires-Dist: aiohttp>=3.12.14
- Requires-Dist: aiomysql>=0.2.0
- Requires-Dist: aioredis>=2.0.1
- Requires-Dist: asyncmy>=0.2.10
- Requires-Dist: cssselect>=1.2.0
- Requires-Dist: dateparser>=1.2.2
- Requires-Dist: httpx[http2]>=0.27.0
- Requires-Dist: curl-cffi>=0.13.0
- Requires-Dist: lxml>=5.2.1
- Requires-Dist: motor>=3.7.0
- Requires-Dist: parsel>=1.9.1
- Requires-Dist: pydantic>=2.11.7
- Requires-Dist: pymongo>=4.11
- Requires-Dist: PyMySQL>=1.1.1
- Requires-Dist: python-dateutil>=2.9.0.post0
- Requires-Dist: redis>=6.2.0
- Requires-Dist: requests>=2.32.4
- Requires-Dist: six>=1.17.0
- Requires-Dist: ujson>=5.9.0
- Requires-Dist: urllib3>=2.5.0
- Requires-Dist: w3lib>=2.1.2
- Requires-Dist: rich>=14.1.0
- Requires-Dist: astor>=0.8.1
- Requires-Dist: watchdog>=6.0.0
- Provides-Extra: render
- Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
- Requires-Dist: playwright; extra == "render"
- Requires-Dist: selenium>=3.141.0; extra == "render"
- Provides-Extra: all
- Requires-Dist: bitarray>=1.5.3; extra == "all"
- Requires-Dist: PyExecJS>=1.5.1; extra == "all"
- Requires-Dist: pymongo>=3.10.1; extra == "all"
- Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
- Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
- Requires-Dist: playwright; extra == "all"
- Requires-Dist: selenium>=3.141.0; extra == "all"
-
- # 🕷️ Crawlo - Intelligent Async Crawler Framework
-
- > A modern, high-performance Python async crawler framework that supports standalone and distributed modes, ready to use out of the box.
-
- 🚀 **Highlights**: standalone mode by default, distributed with a single switch, elegant configuration, flexible extension.
-
- ---
-
- ## ✨ Core Features
-
- ### 🎯 Run Modes
- - **Standalone mode** (default): zero-config startup, suited to development and small-to-medium crawls
- - **Distributed mode**: Redis queue with multi-node coordination, suited to large-scale production environments
- - **Auto mode**: detects Redis availability and picks the best run mode automatically
-
- ### 🛠️ Developer Friendly
- - ✅ **CLI driven**: `crawlo startproject`, `crawlo genspider`, `crawlo run`
- - ✅ **Spider auto-discovery**: no manual registration, the `spiders/` module is loaded automatically
- - ✅ **Smart configuration system**: config factory + chained calls + preset configurations
- - ✅ **Flexible run options**: `--env`, `--concurrency`, `--debug`, `--distributed`
-
- ### ⚡ High-Performance Architecture
- - ✅ **Async core**: high-concurrency fetching built on `asyncio`
- - ✅ **Multiple downloaders**: aiohttp, httpx, curl-cffi (browser fingerprinting)
- - ✅ **Smart middleware**: request deduplication, delay control, retries, proxy support
- - ✅ **Distributed deduplication**: Redis-backed dedup to avoid re-crawling
-
- ### 📊 Monitoring & Management
- - ✅ **Real-time statistics**: crawl progress, success rate, error counts
- - ✅ **Logging**: structured log output to file and console
- - ✅ **Health checks**: `crawlo check` validates that spider definitions are well-formed
- - ✅ **Performance analysis**: `crawlo stats` shows historical run metrics
-
- ---
-
- ## 🌐 语言选择 / Language
-
- - [Chinese documentation (default)](#中文文档)
- - [English Documentation](#english-documentation)
-
- ---
-
- ## 📚 Chinese Documentation (中文文档)
-
- Detailed framework documentation is available in both Chinese and English, with Chinese as the default:
-
- ### Getting Started
- - [Quick Start Guide](docs/quick_start_guide_zh.md) - get up to speed with Crawlo quickly
- - [Full Framework Documentation](docs/crawlo_framework_documentation_zh.md) - comprehensive guide to all framework features
- - [API Reference](docs/api_reference_zh.md) - detailed documentation of all classes and methods
-
- ### Advanced Topics
- - [Distributed Crawling Tutorial](docs/distributed_crawling_tutorial_zh.md) - complete guide to distributed crawling
- - [Configuration Best Practices](docs/configuration_best_practices_zh.md) - guidelines for configuring Crawlo projects
- - [Deduplication Pipelines Guide](docs/deduplication_pipelines_guide.md) - detailed guide to all deduplication pipelines
- - [Deduplication Configuration](docs/deduplication_configuration_zh.md) - how to configure deduplication for different modes
- - [Example Projects](examples/) - complete examples based on real projects
-
- ---
-
- ## 📚 English Documentation
-
- Comprehensive framework documentation is now available in both Chinese and English, with Chinese as the default:
-
- ### Getting Started
- - [Quick Start Guide](docs/quick_start_guide.md) - Get up and running with Crawlo quickly
- - [Framework Documentation](docs/crawlo_framework_documentation.md) - Comprehensive guide to all framework features
- - [API Reference](docs/api_reference.md) - Detailed documentation of all classes and methods
-
- ### Advanced Topics
- - [Distributed Crawling Tutorial](docs/distributed_crawling_tutorial.md) - Complete guide to setting up distributed crawling
- - [Configuration Best Practices](docs/configuration_best_practices.md) - Guidelines for configuring Crawlo projects
- - [Deduplication Pipelines Guide](docs/deduplication_pipelines_guide.md) - Detailed guide to all deduplication pipelines
- - [Deduplication Configuration](docs/deduplication_configuration.md) - How to configure deduplication for different modes
-
- ---
-
- ## 🚀 Quick Start
-
- ### 1. Install the Framework
-
- ```bash
- # Install from source (recommended)
- git clone https://github.com/crawl-coder/Crawlo.git
- cd crawlo
- pip install -e .
-
- # Or install directly (in development)
- pip install crawlo
- ```
-
- ### 2. Create a Project
-
- ```bash
- # Create a new project
- crawlo startproject myproject
- cd myproject
-
- # Project layout
- # myproject/
- # ├── crawlo.cfg          # project configuration
- # ├── myproject/
- # │   ├── __init__.py
- # │   ├── settings.py     # settings file
- # │   ├── items.py        # item definitions
- # │   └── spiders/        # spider directory
- # └── run.py              # run script
- ```
-
- ### 3. Generate a Spider
-
- ```bash
- # Generate a spider template
- crawlo genspider example example.com
- ```
-
- The generated spider code:
-
- ```python
- from crawlo import Spider, Request
- from myproject.items import ExampleItem
-
- class ExampleSpider(Spider):
-     name = "example"
-     allowed_domains = ["example.com"]
-     start_urls = ["https://example.com"]
-
-     def parse(self, response):
-         # Extract data
-         item = ExampleItem()
-         item['title'] = response.css('title::text').get()
-         item['url'] = response.url
-         yield item
-
-         # Follow links
-         for link in response.css('a::attr(href)').getall():
-             yield Request(url=response.urljoin(link), callback=self.parse)
- ```
-
- ### 4. Run the Spider
-
- ```bash
- # 🏠 Standalone mode (default)
- python run.py example
-
- # 🌐 Distributed mode
- python run.py example --distributed
-
- # 🛠️ Development environment
- python run.py example --env development --debug
-
- # ⚡ Custom concurrency
- python run.py example --concurrency 20 --delay 0.5
-
- # 🔄 Use a preset configuration
- python run.py example --env production
- ```
-
- ---
-
- ## 🎛️ Configuration System
-
- ### Traditional Configuration
-
- ```python
- # settings.py
- PROJECT_NAME = 'myproject'
- CONCURRENCY = 16
- DOWNLOAD_DELAY = 1.0
- QUEUE_TYPE = 'memory'   # standalone mode
- # QUEUE_TYPE = 'redis'  # distributed mode
- ```
-
- ### 🆕 Smart Configuration Factory
-
- ```python
- from crawlo.config import CrawloConfig
-
- # Standalone mode
- config = CrawloConfig.standalone().set_concurrency(16)
-
- # Distributed mode
- config = CrawloConfig.distributed(redis_host='192.168.1.100')
-
- # Preset configuration
- config = CrawloConfig.presets().production()
-
- # Chained calls
- config = (CrawloConfig.standalone()
-           .set_concurrency(20)
-           .set_delay(1.5)
-           .enable_debug()
-           .enable_mysql())
-
- # Configuration from environment variables
- config = CrawloConfig.from_env()
- ```
-
- ### 🎯 Preset Configurations
-
- | Preset | Use case | Characteristics |
- |------|----------|------|
- | `development()` | Development and debugging | Low concurrency, verbose logging, debug friendly |
- | `production()` | Production | High performance, auto mode, stable and reliable |
- | `large_scale()` | Large-scale crawling | Distributed, memory-optimized, batch processing |
- | `gentle()` | Gentle mode | Low load, friendly to target servers |
-
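The presets above pair naturally with the chained setters from the configuration factory. A minimal sketch, assuming the preset builders return the same chainable config object as `CrawloConfig.standalone()` shown earlier:

```python
from crawlo.config import CrawloConfig

# Start from the production preset, then override individual knobs.
# Chaining setters onto a preset is an assumption based on the factory API above.
config = (CrawloConfig.presets().production()
          .set_concurrency(32)   # more concurrency on a larger node
          .set_delay(0.5))       # keep a small politeness delay
```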
- ---
-
- ## 🌐 Distributed Architecture
-
- ```
- ┌─────────────┐    ┌─────────────┐    ┌─────────────┐
- │   Node A    │    │   Node B    │    │   Node N    │
- │  (spider)   │    │  (spider)   │    │  (spider)   │
- └──────┬──────┘    └──────┬──────┘    └──────┬──────┘
-        │                  │                  │
-        └──────────────────┼──────────────────┘
-                           │
-               ┌───────────▼────────────┐
-               │     Redis cluster      │
-               │ ┌─────────────────────┐│
-               │ │ Task queue (Queue)  ││
-               │ │ Dedup set (Filter)  ││
-               │ │ Monitoring (Stats)  ││
-               │ └─────────────────────┘│
-               └───────────┬────────────┘
-                           │
-               ┌───────────▼────────────┐
-               │   Shared data store    │
-               │    MySQL / MongoDB     │
-               └────────────────────────┘
- ```
-
- ### Distributed Features
-
- - **🔄 Automatic load balancing**: tasks are distributed across nodes automatically
- - **🛡️ Distributed deduplication**: avoids crawling the same pages twice
- - **📈 Horizontal scaling**: add or remove nodes dynamically
- - **🔧 Failure recovery**: a failed node does not affect the overall run
-
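In practice, each node in the diagram runs the same project and points at the shared Redis instance; only per-machine knobs such as concurrency differ. A minimal per-node sketch using the configuration factory shown earlier (chaining `set_concurrency` onto the distributed builder is an assumption):

```python
from crawlo.config import CrawloConfig

# Same settings on every worker node; Redis coordinates the queue and dedup set.
config = CrawloConfig.distributed(redis_host='192.168.1.100').set_concurrency(16)
```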
- ---
-
- ## 🛠️ Command-Line Tools
-
- | Command | Purpose | Example |
- |------|------|------|
- | `startproject` | Create a new project | `crawlo startproject myproject` |
- | `genspider` | Generate a spider | `crawlo genspider news news.com` |
- | `list` | List all spiders | `crawlo list` |
- | `check` | Check spider definitions | `crawlo check` |
- | `run` | Run a spider | `crawlo run news --distributed` |
- | `stats` | Show statistics | `crawlo stats news` |
-
- ---
-
- ## 📖 Complete Examples
-
- Complete examples based on real projects are provided to help you get started quickly:
-
- ### 🏠 Standalone Example
-
- ```bash
- # Enter the standalone example
- cd examples/telecom_licenses_standalone
-
- # Zero-config run (uses the default httpx downloader)
- python run.py telecom_device
-
- # Development-environment configuration
- python run.py telecom_device --env development --concurrency 4
-
- # Debug mode (verbose logging)
- python run.py telecom_device --debug
-
- # Custom downloader (configured in the project settings.py)
- # DOWNLOADER_TYPE = 'aiohttp'    # high-performance downloader
- # DOWNLOADER_TYPE = 'curl_cffi'  # browser fingerprint emulation
- ```
-
- **Highlights**:
- - ✅ Zero-config startup, works out of the box
- - ✅ In-memory queue, fast
- - ✅ Suited to development, debugging, and small-to-medium crawls
-
- ### 🌐 Distributed Example
-
- ```bash
- # Enter the distributed example
- cd examples/telecom_licenses_distributed
-
- # Start Redis
- redis-server
-
- # Start the distributed spider (uses the default aiohttp downloader)
- python run.py telecom_device --distributed
-
- # High-concurrency distributed mode
- python run.py telecom_device --distributed --concurrency 30
-
- # Multi-node deployment
- # Machine A (Redis server)
- python run.py telecom_device
-
- # Machine B
- python run.py telecom_device --redis-host 192.168.1.100 --concurrency 16
-
- # Machine C
- python run.py telecom_device --redis-host 192.168.1.100 --concurrency 24
-
- # Configuration via environment variables
- NODE_ID=node-1 REDIS_HOST=192.168.1.100 python run.py telecom_device --distributed
- ```
-
- **Highlights**:
- - ✅ Multi-node coordination, high concurrency
- - ✅ Redis queue and deduplication
- - ✅ Suited to large-scale production environments
-
- ### 📚 Detailed Tutorials
-
- - **[Standalone example guide](examples/telecom_licenses_standalone/)**: the full workflow from project creation to running
- - **[Distributed example guide](examples/telecom_licenses_distributed/)**: distributed architecture and deployment options
- - **[examples/README.md](examples/README.md)**: overview of all examples
-
- ---
-
- ## 🎯 Choosing Between Modes
-
- | Aspect | Standalone | Distributed |
- |-----|--------|----------|
- | **Configuration effort** | Zero config | Requires Redis |
- | **External dependencies** | None | Redis + database |
- | **Concurrency** | Moderate | High |
- | **Scalability** | Limited | Horizontal scaling |
- | **Typical use** | Development/testing, small-to-medium crawls | Production, large scale |
- | **Learning curve** | Easy | Moderate |
-
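The switch between the two columns ultimately comes down to a single setting. A minimal `settings.py` sketch based on the traditional configuration shown earlier (Redis connection details are omitted and depend on your deployment):

```python
# settings.py: standalone vs. distributed in one line
QUEUE_TYPE = 'memory'   # standalone: in-process queue, no external services
# QUEUE_TYPE = 'redis'  # distributed: shared Redis queue + Redis-based dedup

CONCURRENCY = 16
DOWNLOAD_DELAY = 1.0
```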
- ---
-
- ## 🔧 Advanced Features
-
- ### Multiple Downloaders
-
- ```python
- # Option 1: simplified names (recommended)
- DOWNLOADER_TYPE = 'aiohttp'    # high-performance default
- DOWNLOADER_TYPE = 'httpx'      # HTTP/2 support
- DOWNLOADER_TYPE = 'curl_cffi'  # browser fingerprint emulation
-
- # Option 2: full class paths (backwards compatible)
- DOWNLOADER = "crawlo.downloader.aiohttp_downloader.AioHttpDownloader"
- DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
- DOWNLOADER = "crawlo.downloader.cffi_downloader.CurlCffiDownloader"
-
- # Option 3: choose dynamically per spider
- class MySpider(Spider):
-     custom_settings = {
-         'DOWNLOADER_TYPE': 'curl_cffi',  # when browser fingerprinting is needed
-         'CURL_BROWSER_TYPE': 'chrome136'
-     }
-
- # Downloader-specific settings
- CURL_BROWSER_TYPE = "chrome136"       # browser emulated by curl-cffi
- HTTPX_HTTP2 = True                    # enable HTTP/2 for httpx
- CONNECTION_POOL_LIMIT_PER_HOST = 20   # connection pool tuning
- ```
-
- ### Smart Middleware
-
- ```python
- MIDDLEWARES = [
-     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',   # request filtering
-     'crawlo.middleware.download_delay.DownloadDelayMiddleware',   # delay control
-     'crawlo.middleware.default_header.DefaultHeaderMiddleware',   # default request headers
-     'crawlo.middleware.proxy.ProxyMiddleware',                    # proxy support
-     'crawlo.middleware.retry.RetryMiddleware',                    # retry mechanism
-     'crawlo.middleware.response_code.ResponseCodeMiddleware',     # status-code handling
- ]
- ```
-
- ### Data Pipelines
-
- ```python
- PIPELINES = [
-     'crawlo.pipelines.console_pipeline.ConsolePipeline',       # console output
-     'crawlo.pipelines.json_pipeline.JsonPipeline',             # JSON file (one item per line)
-     'crawlo.pipelines.json_pipeline.JsonLinesPipeline',        # JSON Lines format
-     'crawlo.pipelines.json_pipeline.JsonArrayPipeline',        # JSON array format
-     'crawlo.pipelines.csv_pipeline.CsvPipeline',               # CSV file
-     'crawlo.pipelines.csv_pipeline.CsvDictPipeline',           # CSV dict format
-     'crawlo.pipelines.csv_pipeline.CsvBatchPipeline',          # CSV batched writes
-     'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',    # MySQL (recommended)
-     'crawlo.pipelines.mysql_pipeline.AiomysqlMySQLPipeline',   # MySQL (alternative)
-     'crawlo.pipelines.mongo_pipeline.MongoPipeline',           # MongoDB
-     'crawlo.pipelines.mongo_pipeline.MongoPoolPipeline',       # MongoDB with connection pooling
- ]
- ```
-
- ### Smart Deduplication Configuration
-
- Crawlo automatically selects the appropriate deduplication pipeline for the run mode:
-
- - **Standalone mode**: uses the in-memory dedup pipeline by default ([MemoryDedupPipeline](file://d:/dowell/projects/Crawlo/crawlo/pipelines/memory_dedup_pipeline.py#L25-L115))
- - **Distributed mode**: uses the Redis dedup pipeline by default ([RedisDedupPipeline](file://d:/dowell/projects/Crawlo/crawlo/pipelines/redis_dedup_pipeline.py#L33-L162))
-
- You can also specify a different dedup pipeline manually:
-
- ```python
- # settings.py
- ITEM_PIPELINES = {
-     'crawlo.pipelines.BloomDedupPipeline': 100,  # Bloom-filter deduplication
-     'crawlo.pipelines.ConsolePipeline': 300,
- }
- ```
-
- For more on deduplication configuration, see:
- - [Deduplication Pipelines Guide](docs/deduplication_pipelines_guide.md)
- - [Deduplication Configuration](docs/deduplication_configuration_zh.md)
-
- ---
-
473
- ## 📊 监控与运维
474
-
475
- ### 实时监控
476
-
477
- ```bash
478
- # 查看运行统计
479
- crawlo stats
480
-
481
- # 查看特定爬虫
482
- crawlo stats my_spider
483
-
484
- # Redis 队列监控
485
- redis-cli llen crawlo:requests
486
- redis-cli scard crawlo:fingerprint
487
- ```
488
-
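The same queue and fingerprint keys can be polled from Python instead of `redis-cli`, for example to feed a small dashboard. A sketch using the `redis` package; the key names follow the `crawlo:requests` / `crawlo:fingerprint` examples above and may differ per project:

```python
import redis

# Connect to the Redis instance backing the distributed queue.
r = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)

pending = r.llen('crawlo:requests')      # requests waiting in the queue
seen = r.scard('crawlo:fingerprint')     # fingerprints in the dedup set
print(f'pending={pending} deduped={seen}')
```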
- ### Logging
-
- ```python
- # Logging configuration
- LOG_LEVEL = 'INFO'
- LOG_FILE = 'logs/crawlo.log'
- LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
- ```
-
- ### Performance Tuning
-
- ```python
- # Concurrency control
- CONCURRENCY = 16                      # concurrent requests
- DOWNLOAD_DELAY = 1.0                  # download delay
- CONNECTION_POOL_LIMIT = 100           # global connection pool size
- CONNECTION_POOL_LIMIT_PER_HOST = 30   # connections per host
-
- # Retry strategy
- MAX_RETRY_TIMES = 3                   # maximum retries
- RETRY_HTTP_CODES = [500, 502, 503]    # status codes to retry
-
- # Statistics and monitoring (new)
- DOWNLOADER_STATS = True               # enable downloader statistics
- DOWNLOAD_STATS = True                 # record download time and size
- DOWNLOADER_HEALTH_CHECK = True        # downloader health checks
- REQUEST_STATS_ENABLED = True          # request statistics
- ```
-
- ---
-
- ## 🚀 Best Practices
-
- ### 1. Development
- ```bash
- # Use the development configuration: low concurrency, verbose logging
- python run.py my_spider --env development --debug
- ```
-
- ### 2. Testing
- ```bash
- # Dry-run mode to validate the logic
- python run.py my_spider --dry-run
- ```
-
- ### 3. Production
- ```bash
- # Use the production configuration or distributed mode
- python run.py my_spider --env production
- python run.py my_spider --distributed --concurrency 50
- ```
-
- ### 4. Large-Scale Crawling
- ```bash
- # Use the large-scale configuration with distributed mode enabled
- python run.py my_spider --env large-scale
- ```
-
- ### 5. Choosing a Downloader
- ```python
- # Development/testing - use httpx (stable, good compatibility)
- DOWNLOADER_TYPE = 'httpx'
-
- # Production - use aiohttp (high performance)
- DOWNLOADER_TYPE = 'aiohttp'
-
- # Anti-bot scenarios - use curl_cffi (browser fingerprinting)
- DOWNLOADER_TYPE = 'curl_cffi'
- CURL_BROWSER_TYPE = 'chrome136'
- ```
-
- ---
-
- ## 💡 Key Advantages
-
- ### 🎯 Works Out of the Box
- - **Zero-config startup**: standalone mode by default, no complex configuration required
- - **Smart detection**: spiders are discovered automatically and the run mode is selected intelligently
- - **Preset configurations**: built-in best-practice configurations for common scenarios
-
- ### 🔧 Flexible Configuration
- - **Configuration factory**: chained calls, configuration as code
- - **Multiple downloaders**: simplified configuration for aiohttp, httpx, curl_cffi
- - **Environment variables**: suited to containerized deployments
- - **Multiple modes**: standalone, distributed, and auto
-
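For containerized deployments, the environment-variable route mentioned above keeps the image generic and moves all node-specific values into the runtime. A minimal sketch combining `CrawloConfig.from_env()` with the variables used in the distributed example; exactly which variables the framework reads is not specified here, so treat the names as illustrative:

```python
import os
from crawlo.config import CrawloConfig

# Values normally injected by the container runtime (names follow the
# NODE_ID / REDIS_HOST example earlier in this README).
os.environ.setdefault('REDIS_HOST', '192.168.1.100')
os.environ.setdefault('NODE_ID', 'node-1')

config = CrawloConfig.from_env()  # build the configuration from the environment
```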
- ### ⚡ High Performance
- - **Async architecture**: high-concurrency design built on asyncio
- - **Multiple downloaders**: choose freely between aiohttp, httpx, and curl_cffi
- - **Smart deduplication**: in-memory or Redis-based distributed dedup
- - **Load balancing**: automatic task distribution across nodes
- - **Performance monitoring**: real-time statistics and health checks
-
- ### 🛡️ Production Ready
- - **Fault tolerance**: automatic recovery from node failures
- - **Monitoring**: comprehensive statistics and monitoring
- - **Scalability**: horizontal scaling, add or remove nodes as needed
-
- ---
-
589
- ## 🆚 与其他框架对比
590
-
591
- | 特性 | Crawlo | Scrapy | 其他框架 |
592
- |------|--------|--------|---------|
593
- | **学习曲线** | 简单 | 中等 | 复杂 |
594
- | **配置方式** | 智能配置工厂 | 传统配置 | 手动配置 |
595
- | **分布式** | 一键切换 | 需要 Scrapyd | 复杂 |
596
- | **默认模式** | 单机零配置 | 单机 | 各异 |
597
- | **运行方式** | 多种灵活选项 | 命令行 | 各异 |
598
- | **现代化** | 现代 Python | 传统 | 各异 |
599
-
600
- ---
601
-
- ## 📞 Support & Contributing
-
- ### 🐛 Reporting Issues
- - **GitHub Issues**: [open an issue](https://github.com/yourname/crawlo/issues)
- - **Documentation**: see [examples/README.md](examples/README.md) for more examples
-
- ### 🤝 Contributing
- - **Fork the project**: pull requests are welcome
- - **Improve the docs**: help polish the documentation and examples
- - **Share your experience**: share usage tips and best practices
-
- ### 📋 Roadmap
- - [ ] Graphical management UI
- - [ ] More data-storage backends
- - [ ] Cloud-native deployment
- - [ ] Smarter anti-bot countermeasures
- - [ ] Visual monitoring dashboard
-
- ---
-
- ## 📄 License
-
- MIT License - free to use, business friendly
-
- ---
-
- **🎉 Start your crawling journey now!**
-
- ```bash
- git clone https://github.com/yourname/crawlo.git
- cd crawlo
- pip install -e .
- crawlo startproject my_first_spider
- ```