crawlo-1.1.1-py3-none-any.whl → crawlo-1.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/commands/genspider.py +68 -42
- crawlo/commands/list.py +102 -93
- crawlo/commands/startproject.py +89 -4
- crawlo/commands/utils.py +187 -0
- crawlo/config.py +280 -0
- crawlo/core/engine.py +16 -3
- crawlo/core/enhanced_engine.py +190 -0
- crawlo/core/scheduler.py +113 -8
- crawlo/crawler.py +840 -307
- crawlo/downloader/__init__.py +181 -17
- crawlo/downloader/aiohttp_downloader.py +15 -2
- crawlo/downloader/cffi_downloader.py +11 -1
- crawlo/downloader/httpx_downloader.py +14 -3
- crawlo/filters/__init__.py +122 -5
- crawlo/filters/aioredis_filter.py +128 -36
- crawlo/filters/memory_filter.py +99 -32
- crawlo/middleware/proxy.py +11 -8
- crawlo/middleware/retry.py +40 -5
- crawlo/mode_manager.py +201 -0
- crawlo/network/__init__.py +17 -3
- crawlo/network/request.py +118 -10
- crawlo/network/response.py +131 -28
- crawlo/pipelines/__init__.py +1 -1
- crawlo/pipelines/csv_pipeline.py +317 -0
- crawlo/pipelines/json_pipeline.py +219 -0
- crawlo/queue/__init__.py +0 -0
- crawlo/queue/pqueue.py +37 -0
- crawlo/queue/queue_manager.py +304 -0
- crawlo/queue/redis_priority_queue.py +192 -0
- crawlo/settings/default_settings.py +68 -9
- crawlo/spider/__init__.py +576 -66
- crawlo/task_manager.py +4 -1
- crawlo/templates/project/middlewares.py.tmpl +56 -45
- crawlo/templates/project/pipelines.py.tmpl +308 -36
- crawlo/templates/project/run.py.tmpl +239 -0
- crawlo/templates/project/settings.py.tmpl +211 -17
- crawlo/templates/spider/spider.py.tmpl +153 -7
- crawlo/utils/controlled_spider_mixin.py +336 -0
- crawlo/utils/large_scale_config.py +287 -0
- crawlo/utils/large_scale_helper.py +344 -0
- crawlo/utils/queue_helper.py +176 -0
- crawlo/utils/request_serializer.py +220 -0
- crawlo-1.1.2.dist-info/METADATA +567 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/RECORD +54 -46
- tests/test_final_validation.py +154 -0
- tests/test_redis_config.py +29 -0
- tests/test_redis_queue.py +225 -0
- tests/test_request_serialization.py +71 -0
- tests/test_scheduler.py +242 -0
- crawlo/pipelines/mysql_batch_pipline.py +0 -273
- crawlo/utils/pqueue.py +0 -174
- crawlo-1.1.1.dist-info/METADATA +0 -220
- examples/baidu_spider/__init__.py +0 -7
- examples/baidu_spider/demo.py +0 -94
- examples/baidu_spider/items.py +0 -46
- examples/baidu_spider/middleware.py +0 -49
- examples/baidu_spider/pipeline.py +0 -55
- examples/baidu_spider/run.py +0 -27
- examples/baidu_spider/settings.py +0 -121
- examples/baidu_spider/spiders/__init__.py +0 -7
- examples/baidu_spider/spiders/bai_du.py +0 -61
- examples/baidu_spider/spiders/miit.py +0 -159
- examples/baidu_spider/spiders/sina.py +0 -79
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/WHEEL +0 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/top_level.txt +0 -0
crawlo-1.1.2.dist-info/METADATA (new file)

@@ -0,0 +1,567 @@
Metadata-Version: 2.4
Name: crawlo
Version: 1.1.2
Summary: Crawlo is a high-performance Python crawler framework built on asynchronous I/O, with support for distributed crawling.
Home-page: https://github.com/crawl-coder/Crawlo.git
Author: crawl-coder
Author-email: crawlo@qq.com
License: MIT
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.6
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.12.14
Requires-Dist: aiomysql>=0.2.0
Requires-Dist: aioredis>=2.0.1
Requires-Dist: asyncmy>=0.2.10
Requires-Dist: cssselect>=1.2.0
Requires-Dist: dateparser>=1.2.2
Requires-Dist: httpx[http2]>=0.27.0
Requires-Dist: curl-cffi>=0.13.0
Requires-Dist: lxml>=5.2.1
Requires-Dist: motor>=3.7.0
Requires-Dist: parsel>=1.9.1
Requires-Dist: pydantic>=2.11.7
Requires-Dist: pymongo>=4.11
Requires-Dist: PyMySQL>=1.1.1
Requires-Dist: python-dateutil>=2.9.0.post0
Requires-Dist: redis>=6.2.0
Requires-Dist: requests>=2.32.4
Requires-Dist: six>=1.17.0
Requires-Dist: ujson>=5.9.0
Requires-Dist: urllib3>=2.5.0
Requires-Dist: w3lib>=2.1.2
Requires-Dist: rich>=14.1.0
Requires-Dist: astor>=0.8.1
Requires-Dist: watchdog>=6.0.0
Provides-Extra: render
Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
Requires-Dist: playwright; extra == "render"
Requires-Dist: selenium>=3.141.0; extra == "render"
Provides-Extra: all
Requires-Dist: bitarray>=1.5.3; extra == "all"
Requires-Dist: PyExecJS>=1.5.1; extra == "all"
Requires-Dist: pymongo>=3.10.1; extra == "all"
Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
Requires-Dist: playwright; extra == "all"
Requires-Dist: selenium>=3.141.0; extra == "all"
# 🕷️ Crawlo - An Intelligent Async Crawler Framework

> A modern, high-performance asynchronous Python crawler framework that supports both standalone and distributed modes and works out of the box.

🚀 **Highlights**: standalone mode by default, one-flag distributed mode, elegant configuration, flexible extension.

---

## ✨ Core Features

### 🎯 Run Modes
- **Standalone mode** (default): zero-config startup, suited to development and small-to-medium crawls
- **Distributed mode**: Redis-backed queue with multi-node coordination, suited to large-scale production
- **Auto mode**: detects Redis availability and picks the best run mode automatically

### 🛠️ Developer Friendly
- ✅ **Command-line driven**: `crawlo startproject`, `crawlo genspider`, `crawlo run`
- ✅ **Spider auto-discovery**: no manual registration; the `spiders/` package is loaded automatically
- ✅ **Smart configuration system**: config factory + chainable calls + preset configurations
- ✅ **Flexible run options**: `--env`, `--concurrency`, `--debug`, `--distributed`

### ⚡ High-Performance Architecture
- ✅ **Async core**: high-concurrency fetching built on `asyncio`
- ✅ **Multiple downloaders**: aiohttp, httpx, curl-cffi (browser fingerprinting)
- ✅ **Smart middleware**: request deduplication, delay control, retries, proxy support
- ✅ **Distributed deduplication**: Redis-based dedup avoids fetching the same URL twice

### 📊 Monitoring & Management
- ✅ **Real-time statistics**: crawl progress, success rate, error counts
- ✅ **Logging system**: structured log output to file and console
- ✅ **Health checks**: `crawlo check` validates that spider definitions are well formed
- ✅ **Performance analysis**: `crawlo stats` shows historical run metrics

---

## 🚀 Quick Start

### 1. Install the framework

```bash
# Install from source (recommended)
git clone https://github.com/crawl-coder/Crawlo.git
cd crawlo
pip install -e .

# Or install directly (still in development)
pip install crawlo
```

### 2. Create a project

```bash
# Create a new project
crawlo startproject myproject
cd myproject

# Project layout
# myproject/
# ├── crawlo.cfg          # project configuration
# ├── myproject/
# │   ├── __init__.py
# │   ├── settings.py     # settings
# │   ├── items.py        # item definitions
# │   └── spiders/        # spider modules
# └── run.py              # run script
```

### 3. Generate a spider

```bash
# Generate a spider from the template
crawlo genspider example example.com
```

The generated spider code:

```python
from crawlo import Spider, Request
from myproject.items import ExampleItem

class ExampleSpider(Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com"]

    def parse(self, response):
        # Extract data
        item = ExampleItem()
        item['title'] = response.css('title::text').get()
        item['url'] = response.url
        yield item

        # Follow links
        for link in response.css('a::attr(href)').getall():
            yield Request(url=response.urljoin(link), callback=self.parse)
```

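The spider above imports `ExampleItem` from `myproject.items` (the `items.py` file in the project layout). That file is not shown in this diff, so the following is only a minimal sketch of what it might contain, assuming crawlo exposes a Scrapy-style `Item`/`Field` declaration; the actual import path and API may differ.

```python
# myproject/items.py -- hypothetical sketch only.
# Assumes crawlo provides Scrapy-style Item/Field classes; the real module
# path and class names are not confirmed by this diff.
from crawlo.items import Item, Field

class ExampleItem(Item):
    title = Field()  # page <title> text, filled in parse()
    url = Field()    # page URL, filled in parse()
```
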
### 4. Run the spider

```bash
# 🏠 Standalone mode (default)
python run.py example

# 🌐 Distributed mode
python run.py example --distributed

# 🛠️ Development environment
python run.py example --env development --debug

# ⚡ Custom concurrency
python run.py example --concurrency 20 --delay 0.5

# 🔄 Use a preset configuration
python run.py example --env production
```

---

## 🎛️ Configuration System

### Traditional configuration

```python
# settings.py
PROJECT_NAME = 'myproject'
CONCURRENCY = 16
DOWNLOAD_DELAY = 1.0
QUEUE_TYPE = 'memory'   # standalone mode
# QUEUE_TYPE = 'redis'  # distributed mode
```

### 🆕 Smart configuration factory

```python
from crawlo.config import CrawloConfig

# Standalone mode
config = CrawloConfig.standalone().set_concurrency(16)

# Distributed mode
config = CrawloConfig.distributed(redis_host='192.168.1.100')

# Preset configuration
config = CrawloConfig.presets().production()

# Chained calls
config = (CrawloConfig.standalone()
          .set_concurrency(20)
          .set_delay(1.5)
          .enable_debug()
          .enable_mysql())

# Configuration from environment variables
config = CrawloConfig.from_env()
```

### 🎯 Preset configurations

| Preset | Use case | Characteristics |
|--------|----------|-----------------|
| `development()` | Development and debugging | Low concurrency, verbose logs, debug friendly |
| `production()` | Production | High performance, auto mode, stable and reliable |
| `large_scale()` | Large-scale crawls | Distributed, memory-optimized, batch processing |
| `gentle()` | Gentle mode | Low load, friendly to target servers |
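Presets and the chainable setters shown above can presumably be combined; a small sketch (assuming a preset returns the same chainable config object used earlier in this section, which this diff does not confirm):

```python
from crawlo.config import CrawloConfig

# Sketch: start from the production preset, then override a few knobs with
# the chainable setters shown above.
config = (CrawloConfig.presets().production()
          .set_concurrency(32)   # more parallel requests on a larger node
          .set_delay(0.5))       # shorter politeness delay
```
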
---

## 🌐 Distributed Architecture

```
┌─────────────┐    ┌─────────────┐    ┌─────────────┐
│   Node A    │    │   Node B    │    │   Node N    │
│  (spider)   │    │  (spider)   │    │  (spider)   │
└──────┬──────┘    └──────┬──────┘    └──────┬──────┘
       │                  │                  │
       └──────────────────┼──────────────────┘
                          │
             ┌────────────▼───────────┐
             │      Redis cluster     │
             │ ┌─────────────────────┐│
             │ │ Task queue (Queue)  ││
             │ │ Dedup set (Filter)  ││
             │ │ Monitoring (Stats)  ││
             │ └─────────────────────┘│
             └────────────┬───────────┘
                          │
             ┌────────────▼───────────┐
             │   Shared data store    │
             │    MySQL / MongoDB     │
             └────────────────────────┘
```

### Distributed features

- **🔄 Automatic load balancing**: tasks are distributed across nodes automatically
- **🛡️ Distributed deduplication**: the same URL is never crawled twice
- **📈 Horizontal scaling**: add or remove nodes on demand
- **🔧 Fault tolerance**: a failed node does not stop the whole crawl

---

## 🛠️ Command-Line Tools

| Command | Purpose | Example |
|---------|---------|---------|
| `startproject` | Create a new project | `crawlo startproject myproject` |
| `genspider` | Generate a spider | `crawlo genspider news news.com` |
| `list` | List all spiders | `crawlo list` |
| `check` | Validate spider definitions | `crawlo check` |
| `run` | Run a spider | `crawlo run news --distributed` |
| `stats` | Show statistics | `crawlo stats news` |

---

## 📖 Complete Examples

Complete examples based on real projects are provided to help you get started quickly:

### 🏠 Standalone example

```bash
# Enter the standalone example
cd examples/miit_spider_standalone

# Run with zero configuration (uses the default httpx downloader)
python run.py miit_device

# Development configuration
python run.py miit_device --env development --concurrency 4

# Debug mode (verbose logs)
python run.py miit_device --debug

# Custom downloader (configured in the project's settings.py)
# DOWNLOADER_TYPE = 'aiohttp'    # high-performance downloader
# DOWNLOADER_TYPE = 'curl_cffi'  # browser fingerprint emulation
```

**Highlights**:
- ✅ Zero-config startup, works out of the box
- ✅ In-memory queue, fast
- ✅ Suited to development, debugging, and small-to-medium crawls

### 🌐 Distributed example

```bash
# Enter the distributed example
cd examples/miit_spider_distributed

# Start Redis
redis-server

# Start the distributed crawl (uses the default aiohttp downloader)
python run.py miit_device --distributed

# High-concurrency distributed mode
python run.py miit_device --distributed --concurrency 30

# Multi-node deployment
cd examples/miit_spider_distributed
./deploy_distributed.sh node-1 20
./deploy_distributed.sh node-2 25

# Configuration via environment variables
NODE_ID=node-1 REDIS_HOST=192.168.1.100 python run.py miit_device --distributed
```

**Highlights**:
- ✅ Multi-node coordination, high concurrency
- ✅ Redis-backed queue and deduplication
- ✅ Suited to large-scale production environments

### 📚 Detailed tutorials

- **[Standalone crawler tutorial](examples/tutorials/单机版爬虫教程.md)**: the complete workflow from project creation to running
- **[Distributed crawler tutorial](examples/tutorials/分布式爬虫教程.md)**: distributed architecture and deployment options
- **[examples/README.md](examples/README.md)**: full example documentation

---

## 🎯 Use-Case Comparison

| Aspect | Standalone | Distributed |
|--------|------------|-------------|
| **Configuration effort** | Zero config | Requires Redis |
| **External dependencies** | None | Redis + database |
| **Concurrency** | Moderate | High |
| **Scalability** | Limited | Horizontal scaling |
| **Typical use** | Development/testing, small-to-medium crawls | Production, large scale |
| **Learning curve** | Easy | Moderate |

---

## 🔧 Advanced Features

### Multiple downloaders

```python
# Option 1: short names (recommended)
DOWNLOADER_TYPE = 'aiohttp'    # high-performance default
DOWNLOADER_TYPE = 'httpx'      # HTTP/2 support
DOWNLOADER_TYPE = 'curl_cffi'  # browser fingerprint emulation

# Option 2: full class paths (backwards compatible)
DOWNLOADER = "crawlo.downloader.aiohttp_downloader.AioHttpDownloader"
DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
DOWNLOADER = "crawlo.downloader.cffi_downloader.CurlCffiDownloader"

# Option 3: choose per spider
class MySpider(Spider):
    custom_settings = {
        'DOWNLOADER_TYPE': 'curl_cffi',   # when browser fingerprinting is needed
        'CURL_BROWSER_TYPE': 'chrome136'
    }

# Downloader-specific settings
CURL_BROWSER_TYPE = "chrome136"        # browser emulated by curl-cffi
HTTPX_HTTP2 = True                     # enable HTTP/2 for httpx
CONNECTION_POOL_LIMIT_PER_HOST = 20    # connection pool tuning
```

### Smart middleware

```python
MIDDLEWARES = [
    'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',   # request filtering
    'crawlo.middleware.download_delay.DownloadDelayMiddleware',   # delay control
    'crawlo.middleware.default_header.DefaultHeaderMiddleware',   # default headers
    'crawlo.middleware.proxy.ProxyMiddleware',                    # proxy support
    'crawlo.middleware.retry.RetryMiddleware',                    # retries
    'crawlo.middleware.response_code.ResponseCodeMiddleware',     # status-code handling
]
```

### Data pipelines

```python
PIPELINES = [
    'crawlo.pipelines.console_pipeline.ConsolePipeline',       # console output
    'crawlo.pipelines.json_pipeline.JsonPipeline',             # JSON file (one object per line)
    'crawlo.pipelines.json_pipeline.JsonLinesPipeline',        # JSON Lines format
    'crawlo.pipelines.json_pipeline.JsonArrayPipeline',        # JSON array format
    'crawlo.pipelines.csv_pipeline.CsvPipeline',               # CSV file
    'crawlo.pipelines.csv_pipeline.CsvDictPipeline',           # CSV from dicts
    'crawlo.pipelines.csv_pipeline.CsvBatchPipeline',          # batched CSV writes
    'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',    # MySQL (recommended)
    'crawlo.pipelines.mysql_pipeline.AiomysqlMySQLPipeline',   # MySQL (alternative)
    'crawlo.pipelines.mongo_pipeline.MongoPipeline',           # MongoDB
    'crawlo.pipelines.mongo_pipeline.MongoPoolPipeline',       # MongoDB with connection pooling
]
```
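Built-in pipelines are enabled by listing their import paths as above, and a project-local pipeline would be added the same way. A rough sketch of such a pipeline, assuming a Scrapy-style `process_item(item, spider)` hook and dict-like item access; the exact crawlo pipeline interface is not confirmed by this diff:

```python
# myproject/pipelines.py -- hypothetical sketch only.
# Assumes a Scrapy-style process_item(item, spider) hook and dict-like items.
class StripTitlePipeline:
    def process_item(self, item, spider):
        # Normalize whitespace in the extracted title before it is stored.
        if item.get('title'):
            item['title'] = item['title'].strip()
        return item

# Enabled by appending its import path to PIPELINES in settings.py:
# PIPELINES = [..., 'myproject.pipelines.StripTitlePipeline']
```
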
---

## 📊 Monitoring & Operations

### Real-time monitoring

```bash
# Show run statistics
crawlo stats

# Show a specific spider
crawlo stats my_spider

# Monitor the Redis queue
redis-cli llen crawlo:requests
redis-cli scard crawlo:fingerprint
```
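The same two keys can be polled from Python with the `redis` client that is already a dependency; a small sketch (the host, port, and `decode_responses` flag are illustrative choices, not taken from this diff):

```python
import redis

# Poll the queue length and dedup-set size used by the redis-cli commands above.
r = redis.Redis(host="127.0.0.1", port=6379, decode_responses=True)
print("pending requests :", r.llen("crawlo:requests"))
print("seen fingerprints:", r.scard("crawlo:fingerprint"))
```
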
### Logging

```python
# Logging configuration
LOG_LEVEL = 'INFO'
LOG_FILE = 'logs/crawlo.log'
LOG_FORMAT = '%(asctime)s - [%(name)s] - %(levelname)s: %(message)s'
```

### Performance tuning

```python
# Concurrency control
CONCURRENCY = 16                       # concurrent requests
DOWNLOAD_DELAY = 1.0                   # download delay
CONNECTION_POOL_LIMIT = 100            # global connection pool size
CONNECTION_POOL_LIMIT_PER_HOST = 30    # connections per host

# Retry policy
MAX_RETRY_TIMES = 3                    # maximum retries
RETRY_HTTP_CODES = [500, 502, 503]     # status codes that trigger a retry

# Statistics and monitoring (new in this release)
DOWNLOADER_STATS = True                # enable downloader statistics
DOWNLOAD_STATS = True                  # record download time and size
DOWNLOADER_HEALTH_CHECK = True         # downloader health checks
REQUEST_STATS_ENABLED = True           # request statistics
```

---

## 🚀 Best Practices

### 1. Development
```bash
# Development config: low concurrency, verbose logs
python run.py my_spider --env development --debug
```

### 2. Testing
```bash
# Dry-run mode to validate the crawl logic
python run.py my_spider --dry-run
```

### 3. Production
```bash
# Use the production config or distributed mode
python run.py my_spider --env production
python run.py my_spider --distributed --concurrency 50
```

### 4. Large-scale crawls
```bash
# Large-scale config with distributed mode enabled
python run.py my_spider --env large-scale
```

### 5. Choosing a downloader
```python
# Development/testing - httpx (stable, broadly compatible)
DOWNLOADER_TYPE = 'httpx'

# Production - aiohttp (high performance)
DOWNLOADER_TYPE = 'aiohttp'

# Anti-bot scenarios - curl_cffi (browser fingerprinting)
DOWNLOADER_TYPE = 'curl_cffi'
CURL_BROWSER_TYPE = 'chrome136'
```

---

## 💡 Core Strengths

### 🎯 Works out of the box
- **Zero-config startup**: standalone mode by default, no elaborate setup
- **Smart detection**: spiders are auto-discovered and the run mode is chosen automatically
- **Presets**: built-in best-practice configurations for common scenarios

### 🔧 Flexible configuration
- **Config factory**: chainable calls, configuration as code
- **Multiple downloaders**: simplified configuration for aiohttp, httpx, and curl_cffi
- **Environment variables**: friendly to containerized deployments
- **Multiple modes**: standalone, distributed, and auto

### ⚡ High performance
- **Async architecture**: high-concurrency design built on asyncio
- **Multiple downloaders**: choose between aiohttp, httpx, and curl_cffi
- **Smart deduplication**: in-memory or Redis-based distributed dedup
- **Load balancing**: automatic task distribution across nodes
- **Performance monitoring**: real-time statistics and health checks

### 🛡️ Production ready
- **Fault tolerance**: automatic recovery from node failures
- **Monitoring**: comprehensive statistics and monitoring
- **Scalability**: horizontal scaling, add or remove nodes on demand

---

## 🆚 Comparison with Other Frameworks

| Aspect | Crawlo | Scrapy | Other frameworks |
|--------|--------|--------|------------------|
| **Learning curve** | Easy | Moderate | Complex |
| **Configuration** | Smart config factory | Traditional settings | Manual |
| **Distributed crawling** | One-flag switch | Requires Scrapyd | Complex |
| **Default mode** | Standalone, zero config | Standalone | Varies |
| **Ways to run** | Several flexible options | Command line | Varies |
| **Modernity** | Modern Python | Traditional | Varies |

---

## 📞 Support & Contributing

### 🐛 Reporting issues
- **GitHub Issues**: [file an issue](https://github.com/yourname/crawlo/issues)
- **Documentation**: see [examples/README.md](examples/README.md) for more examples

### 🤝 Contributing
- **Fork the project**: pull requests are welcome
- **Improve the docs**: help polish the documentation and examples
- **Share your experience**: share usage tips and best practices

### 📋 Roadmap
- [ ] Web-based management UI
- [ ] More data-store backends
- [ ] Cloud-native deployment options
- [ ] Smarter anti-bot countermeasures
- [ ] Visual monitoring dashboard

---

## 📄 License

MIT License - free to use, business friendly

---

**🎉 Start your crawling journey now!**

```bash
git clone https://github.com/yourname/crawlo.git
cd crawlo
pip install -e .
crawlo startproject my_first_spider
```