crawlo 1.1.2__tar.gz → 1.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo-1.1.4/PKG-INFO +403 -0
- crawlo-1.1.4/README.md +353 -0
- crawlo-1.1.4/crawlo/__version__.py +1 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/core/scheduler.py +20 -16
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/downloader/httpx_downloader.py +14 -12
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/exceptions.py +4 -0
- crawlo-1.1.4/crawlo/extension/__init__.py +38 -0
- crawlo-1.1.4/crawlo/extension/health_check.py +142 -0
- crawlo-1.1.4/crawlo/extension/log_interval.py +58 -0
- crawlo-1.1.4/crawlo/extension/log_stats.py +82 -0
- crawlo-1.1.4/crawlo/extension/logging_extension.py +44 -0
- crawlo-1.1.4/crawlo/extension/memory_monitor.py +89 -0
- crawlo-1.1.4/crawlo/extension/performance_profiler.py +118 -0
- crawlo-1.1.4/crawlo/extension/request_recorder.py +108 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/filters/aioredis_filter.py +2 -2
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/middleware/retry.py +3 -3
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/network/request.py +2 -2
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/network/response.py +25 -23
- crawlo-1.1.4/crawlo/pipelines/__init__.py +22 -0
- crawlo-1.1.4/crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
- crawlo-1.1.4/crawlo/pipelines/database_dedup_pipeline.py +225 -0
- crawlo-1.1.4/crawlo/pipelines/memory_dedup_pipeline.py +116 -0
- crawlo-1.1.4/crawlo/pipelines/mongo_pipeline.py +132 -0
- crawlo-1.1.4/crawlo/pipelines/mysql_pipeline.py +317 -0
- crawlo-1.1.4/crawlo/pipelines/redis_dedup_pipeline.py +163 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/queue/queue_manager.py +4 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/queue/redis_priority_queue.py +20 -3
- crawlo-1.1.4/crawlo/settings/default_settings.py +279 -0
- crawlo-1.1.4/crawlo/subscriber.py +131 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/templates/project/items.py.tmpl +1 -1
- crawlo-1.1.4/crawlo/templates/project/middlewares.py.tmpl +111 -0
- crawlo-1.1.4/crawlo/templates/project/pipelines.py.tmpl +98 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/templates/project/run.py.tmpl +20 -7
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/templates/project/settings.py.tmpl +35 -3
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/templates/spider/spider.py.tmpl +1 -37
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/controlled_spider_mixin.py +109 -5
- crawlo-1.1.4/crawlo.egg-info/PKG-INFO +403 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo.egg-info/SOURCES.txt +9 -0
- crawlo-1.1.4/examples/controlled_spider_example.py +205 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/requirements.txt +4 -5
- crawlo-1.1.2/PKG-INFO +0 -567
- crawlo-1.1.2/README.md +0 -517
- crawlo-1.1.2/crawlo/__version__.py +0 -1
- crawlo-1.1.2/crawlo/extension/__init__.py +0 -31
- crawlo-1.1.2/crawlo/extension/log_interval.py +0 -49
- crawlo-1.1.2/crawlo/extension/log_stats.py +0 -44
- crawlo-1.1.2/crawlo/extension/logging_extension.py +0 -35
- crawlo-1.1.2/crawlo/pipelines/__init__.py +0 -13
- crawlo-1.1.2/crawlo/pipelines/mongo_pipeline.py +0 -117
- crawlo-1.1.2/crawlo/pipelines/mysql_pipeline.py +0 -195
- crawlo-1.1.2/crawlo/settings/default_settings.py +0 -226
- crawlo-1.1.2/crawlo/subscriber.py +0 -106
- crawlo-1.1.2/crawlo/templates/project/middlewares.py.tmpl +0 -87
- crawlo-1.1.2/crawlo/templates/project/pipelines.py.tmpl +0 -336
- crawlo-1.1.2/crawlo.egg-info/PKG-INFO +0 -567
- {crawlo-1.1.2 → crawlo-1.1.4}/LICENSE +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/MANIFEST.in +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/cli.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/commands/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/commands/check.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/commands/genspider.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/commands/list.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/commands/run.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/commands/startproject.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/commands/stats.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/commands/utils.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/config.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/core/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/core/engine.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/core/enhanced_engine.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/core/processor.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/crawler.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/downloader/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/downloader/aiohttp_downloader.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/downloader/cffi_downloader.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/event.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/filters/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/filters/memory_filter.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/items/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/items/base.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/items/fields.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/items/items.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/middleware/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/middleware/default_header.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/middleware/download_delay.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/middleware/middleware_manager.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/middleware/proxy.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/middleware/request_ignore.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/middleware/response_code.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/middleware/response_filter.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/mode_manager.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/network/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/pipelines/console_pipeline.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/pipelines/csv_pipeline.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/pipelines/json_pipeline.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/pipelines/pipeline_manager.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/project.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/queue/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/queue/pqueue.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/settings/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/settings/setting_manager.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/spider/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/stats_collector.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/task_manager.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/templates/crawlo.cfg.tmpl +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/templates/project/__init__.py.tmpl +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/date_tools.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/db_helper.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/func_tools.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/large_scale_config.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/large_scale_helper.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/log.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/queue_helper.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/request.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/request_serializer.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/spider_loader.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/system.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/tools.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo/utils/url.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo.egg-info/dependency_links.txt +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo.egg-info/entry_points.txt +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo.egg-info/requires.txt +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/crawlo.egg-info/top_level.txt +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/examples/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/pyproject.toml +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/setup.cfg +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/tests/__init__.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/tests/test_final_validation.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/tests/test_proxy_health_check.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/tests/test_proxy_middleware_integration.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/tests/test_proxy_providers.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/tests/test_proxy_stats.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/tests/test_proxy_strategies.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/tests/test_redis_config.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/tests/test_redis_queue.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/tests/test_request_serialization.py +0 -0
- {crawlo-1.1.2 → crawlo-1.1.4}/tests/test_scheduler.py +0 -0
crawlo-1.1.4/PKG-INFO
ADDED
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crawlo
|
|
3
|
+
Version: 1.1.4
|
|
4
|
+
Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
|
|
5
|
+
Home-page: https://github.com/crawl-coder/Crawlo.git
|
|
6
|
+
Author: crawl-coder
|
|
7
|
+
Author-email: crawlo@qq.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.6
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: aiohttp>=3.12.14
|
|
15
|
+
Requires-Dist: aiomysql>=0.2.0
|
|
16
|
+
Requires-Dist: aioredis>=2.0.1
|
|
17
|
+
Requires-Dist: asyncmy>=0.2.10
|
|
18
|
+
Requires-Dist: cssselect>=1.2.0
|
|
19
|
+
Requires-Dist: dateparser>=1.2.2
|
|
20
|
+
Requires-Dist: httpx[http2]>=0.27.0
|
|
21
|
+
Requires-Dist: curl-cffi>=0.13.0
|
|
22
|
+
Requires-Dist: lxml>=5.2.1
|
|
23
|
+
Requires-Dist: motor>=3.7.0
|
|
24
|
+
Requires-Dist: parsel>=1.9.1
|
|
25
|
+
Requires-Dist: pydantic>=2.11.7
|
|
26
|
+
Requires-Dist: pymongo>=4.11
|
|
27
|
+
Requires-Dist: PyMySQL>=1.1.1
|
|
28
|
+
Requires-Dist: python-dateutil>=2.9.0.post0
|
|
29
|
+
Requires-Dist: redis>=6.2.0
|
|
30
|
+
Requires-Dist: requests>=2.32.4
|
|
31
|
+
Requires-Dist: six>=1.17.0
|
|
32
|
+
Requires-Dist: ujson>=5.9.0
|
|
33
|
+
Requires-Dist: urllib3>=2.5.0
|
|
34
|
+
Requires-Dist: w3lib>=2.1.2
|
|
35
|
+
Requires-Dist: rich>=14.1.0
|
|
36
|
+
Requires-Dist: astor>=0.8.1
|
|
37
|
+
Requires-Dist: watchdog>=6.0.0
|
|
38
|
+
Provides-Extra: render
|
|
39
|
+
Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
|
|
40
|
+
Requires-Dist: playwright; extra == "render"
|
|
41
|
+
Requires-Dist: selenium>=3.141.0; extra == "render"
|
|
42
|
+
Provides-Extra: all
|
|
43
|
+
Requires-Dist: bitarray>=1.5.3; extra == "all"
|
|
44
|
+
Requires-Dist: PyExecJS>=1.5.1; extra == "all"
|
|
45
|
+
Requires-Dist: pymongo>=3.10.1; extra == "all"
|
|
46
|
+
Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
|
|
47
|
+
Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
|
|
48
|
+
Requires-Dist: playwright; extra == "all"
|
|
49
|
+
Requires-Dist: selenium>=3.141.0; extra == "all"
|
|
50
|
+
|
|
51
|
+
# Crawlo - 异步分布式爬虫框架
|
|
52
|
+
|
|
53
|
+
<div align="center">
|
|
54
|
+
|
|
55
|
+
[Python](https://www.python.org/downloads/)
|
|
56
|
+
[License](LICENSE)
|
|
57
|
+
[Documentation](https://crawlo.readthedocs.io/)
|
|
58
|
+
|
|
59
|
+
一个基于 asyncio 的高性能异步分布式爬虫框架,支持单机和分布式部署。
|
|
60
|
+
|
|
61
|
+
</div>
|
|
62
|
+
|
|
63
|
+
## 🌟 特性
|
|
64
|
+
|
|
65
|
+
- **异步高性能**: 基于 asyncio 实现,通过单线程事件循环高效处理大量并发 I/O 请求
|
|
66
|
+
- **分布式支持**: 内置 Redis 队列,轻松实现分布式部署
|
|
67
|
+
- **模块化设计**: 中间件、管道、扩展组件系统,易于定制和扩展
|
|
68
|
+
- **智能去重**: 多种去重策略(内存、Redis、Bloom Filter)
|
|
69
|
+
- **灵活配置**: 支持多种配置方式,适应不同场景需求
|
|
70
|
+
- **丰富文档**: 完整的中英文双语文档和示例项目
|
|
71
|
+
|
|
72
|
+
## 🚀 快速开始
|
|
73
|
+
|
|
74
|
+
### 安装
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install crawlo
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### 创建项目
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
crawlo startproject myproject
|
|
84
|
+
cd myproject
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### 编写爬虫
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from crawlo import Spider, Request, Item
|
|
91
|
+
|
|
92
|
+
class MyItem(Item):
|
|
93
|
+
    title = ''
|
|
94
|
+
    url = ''
|
|
95
|
+
|
|
96
|
+
class MySpider(Spider):
|
|
97
|
+
    name = 'myspider'
|
|
98
|
+
|
|
99
|
+
    async def start_requests(self):
|
|
100
|
+
        yield Request('https://httpbin.org/get', callback=self.parse)
|
|
101
|
+
|
|
102
|
+
    async def parse(self, response):
|
|
103
|
+
        yield MyItem(
|
|
104
|
+
            title='Example Title',
|
|
105
|
+
            url=response.url
|
|
106
|
+
        )
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### 运行爬虫
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
crawlo run myspider
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## 🏗️ 架构设计
|
|
116
|
+
|
|
117
|
+
### 组件交互图
|
|
118
|
+
|
|
119
|
+
```
|
|
120
|
+
┌─────────────────────────────────────────────────────────────────────┐
|
|
121
|
+
│ Crawler │
|
|
122
|
+
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────────┐ │
|
|
123
|
+
│ │ Spider │ │ Engine │ │ ExtensionManager │ │
|
|
124
|
+
│ │ │ │ │ │ │ │
|
|
125
|
+
│ │ start_urls │ │ Scheduler ◄─┼──┼──► StatsCollector │ │
|
|
126
|
+
│ │ parse() │ │ │ │ │ │
|
|
127
|
+
│ │ │ │ Downloader ◄─┼──┼──► MiddlewareManager │ │
|
|
128
|
+
│ │ │ │ │ │ │ │
|
|
129
|
+
│ │ │ │ Processor ◄─┼──┼──► PipelineManager │ │
|
|
130
|
+
│ └──────────────┘ └──────┬───────┘ └──────────────────────────┘ │
|
|
131
|
+
└──────────────────────────┼─────────────────────────────────────────┘
|
|
132
|
+
│
|
|
133
|
+
┌──────────────────▼──────────────────┐
|
|
134
|
+
│ Scheduler │
|
|
135
|
+
│ ┌──────────────────────────────┐ │
|
|
136
|
+
│ │ QueueManager │ │
|
|
137
|
+
│ │ ┌─────────┐ ┌────────────┐ │ │
|
|
138
|
+
│ │ │ Memory │ │ Redis │ │ │
|
|
139
|
+
│ │ │ Queue │ │ Queue │ │ │
|
|
140
|
+
│ │ └─────────┘ └────────────┘ │ │
|
|
141
|
+
│ └──────────────────────────────┘ │
|
|
142
|
+
│ ┌──────────────────────────────┐ │
|
|
143
|
+
│ │ Filter │ │
|
|
144
|
+
│ │ ┌─────────┐ ┌────────────┐ │ │
|
|
145
|
+
│ │ │ Memory │ │ Redis │ │ │
|
|
146
|
+
│ │ │ Filter │ │ Filter │ │ │
|
|
147
|
+
│ │ └─────────┘ └────────────┘ │ │
|
|
148
|
+
│ └──────────────────────────────┘ │
|
|
149
|
+
└─────────────────────────────────────┘
|
|
150
|
+
│
|
|
151
|
+
┌──────────────────▼──────────────────┐
|
|
152
|
+
│ Downloader │
|
|
153
|
+
│ ┌──────────────────────────────┐ │
|
|
154
|
+
│ │ MiddlewareManager │ │
|
|
155
|
+
│ │ │ │
|
|
156
|
+
│ │ RequestMiddleware ◄────────┐ │ │
|
|
157
|
+
│ │ ResponseMiddleware │ │ │
|
|
158
|
+
│ │ ExceptionMiddleware │ │ │
|
|
159
|
+
│ │ ╱ │ │
|
|
160
|
+
│ └─────────────────────────╱───┘ │
|
|
161
|
+
│ ╱ │
|
|
162
|
+
│ ┌───────────────────────▼──┐ │
|
|
163
|
+
│ │ Download Implementations │ │
|
|
164
|
+
│ │ - AioHttpDownloader │ │
|
|
165
|
+
│ │ - HttpXDownloader │ │
|
|
166
|
+
│ │ - CurlCffiDownloader │ │
|
|
167
|
+
│ └──────────────────────────┘ │
|
|
168
|
+
└─────────────────────────────────────┘
|
|
169
|
+
│
|
|
170
|
+
┌──────────────────▼──────────────────┐
|
|
171
|
+
│ Processor │
|
|
172
|
+
│ ┌──────────────────────────────┐ │
|
|
173
|
+
│ │ PipelineManager │ │
|
|
174
|
+
│ │ ┌─────────────────────────┐ │ │
|
|
175
|
+
│ │ │ Pipeline Stages │ │ │
|
|
176
|
+
│ │ │ - ValidationPipeline │ │ │
|
|
177
|
+
│ │ │ - ProcessingPipeline │ │ │
|
|
178
|
+
│ │ │ - StoragePipeline │ │ │
|
|
179
|
+
│ │ └─────────────────────────┘ │ │
|
|
180
|
+
│ └──────────────────────────────┘ │
|
|
181
|
+
└─────────────────────────────────────┘
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### 运行模式切换图
|
|
185
|
+
|
|
186
|
+
```
|
|
187
|
+
┌─────────────────────┐
|
|
188
|
+
│ ModeManager │
|
|
189
|
+
│ (运行模式管理器) │
|
|
190
|
+
└─────────┬───────────┘
|
|
191
|
+
│
|
|
192
|
+
┌─────────────────────┼─────────────────────┐
|
|
193
|
+
│ │ │
|
|
194
|
+
▼ ▼ ▼
|
|
195
|
+
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
196
|
+
│ Standalone │ │ Distributed │ │ Auto │
|
|
197
|
+
│ (单机模式) │ │ (分布式模式) │ │ (自动检测模式) │
|
|
198
|
+
└───────┬───────┘ └─────────┬───────┘ └─────────┬───────┘
|
|
199
|
+
│ │ │
|
|
200
|
+
▼ ▼ ▼
|
|
201
|
+
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
202
|
+
│ Memory Queue │ │ Redis Queue │ │ Auto Select │
|
|
203
|
+
│ Memory Filter │ │ Redis Filter │ │ Memory/Redis │
|
|
204
|
+
└───────────────┘ └─────────────────┘ └─────────────────┘
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
### 数据流向图
|
|
208
|
+
|
|
209
|
+
```
|
|
210
|
+
┌─────────────┐ 1.生成初始请求 ┌──────────────┐
|
|
211
|
+
│ Spider ├─────────────────────►│ Scheduler │
|
|
212
|
+
└─────────────┘ └──────┬───────┘
|
|
213
|
+
│ 2.去重检查
|
|
214
|
+
▼
|
|
215
|
+
┌─────────────────┐
|
|
216
|
+
│ Filter │
|
|
217
|
+
└─────────┬───────┘
|
|
218
|
+
│ 3.入队
|
|
219
|
+
▼
|
|
220
|
+
┌─────────────────┐
|
|
221
|
+
│ Queue │
|
|
222
|
+
└─────────┬───────┘
|
|
223
|
+
│ 4.获取请求
|
|
224
|
+
▼
|
|
225
|
+
┌─────────────────┐ 5.下载请求
|
|
226
|
+
│ Downloader ├──────────────────┐
|
|
227
|
+
└─────────────────┘ │
|
|
228
|
+
│ 6.解析响应 │
|
|
229
|
+
▼ ▼
|
|
230
|
+
┌─────────────────┐ 7.生成数据 ┌─────────────┐
|
|
231
|
+
│ Processor ├────────────────►│ Pipeline │
|
|
232
|
+
└─────────────────┘ └─────────────┘
|
|
233
|
+
│ 8.存储数据
|
|
234
|
+
▼
|
|
235
|
+
┌─────────────────┐
|
|
236
|
+
│ Items │
|
|
237
|
+
└─────────────────┘
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### 模块层次结构图
|
|
241
|
+
|
|
242
|
+
```
|
|
243
|
+
crawlo/
|
|
244
|
+
├── cli.py # 命令行接口
|
|
245
|
+
├── crawler.py # 爬虫运行实例
|
|
246
|
+
├── project.py # 项目管理
|
|
247
|
+
├── config.py # 配置管理
|
|
248
|
+
├── mode_manager.py # 运行模式管理器
|
|
249
|
+
├── stats_collector.py # 统计收集器
|
|
250
|
+
├── subscriber.py # 事件订阅器
|
|
251
|
+
├── task_manager.py # 任务管理器
|
|
252
|
+
├── event.py # 事件定义
|
|
253
|
+
├── exceptions.py # 异常定义
|
|
254
|
+
│
|
|
255
|
+
├── core/ # 核心组件
|
|
256
|
+
│ ├── engine.py # 引擎
|
|
257
|
+
│ ├── scheduler.py # 调度器
|
|
258
|
+
│ └── processor.py # 处理器
|
|
259
|
+
│
|
|
260
|
+
├── spider/ # 爬虫基类
|
|
261
|
+
│ └── __init__.py # 爬虫元类和基类
|
|
262
|
+
│
|
|
263
|
+
├── network/ # 网络相关
|
|
264
|
+
│ ├── request.py # 请求对象
|
|
265
|
+
│ └── response.py # 响应对象
|
|
266
|
+
│
|
|
267
|
+
├── downloader/ # 下载器
|
|
268
|
+
│ ├── __init__.py # 下载器基类
|
|
269
|
+
│ ├── aiohttp_downloader.py # AioHttp实现
|
|
270
|
+
│ ├── httpx_downloader.py # HttpX实现
|
|
271
|
+
│ └── cffi_downloader.py # CurlCffi实现
|
|
272
|
+
│
|
|
273
|
+
├── queue/ # 队列管理
|
|
274
|
+
│ ├── __init__.py
|
|
275
|
+
│ ├── queue_manager.py # 队列管理器
|
|
276
|
+
│ ├── pqueue.py # 内存优先队列
|
|
277
|
+
│ └── redis_priority_queue.py # Redis优先队列
|
|
278
|
+
│
|
|
279
|
+
├── filters/ # 过滤器
|
|
280
|
+
│ ├── __init__.py
|
|
281
|
+
│ ├── base_filter.py # 过滤器基类
|
|
282
|
+
│ ├── memory_filter.py # 内存过滤器
|
|
283
|
+
│ └── aioredis_filter.py # Redis过滤器
|
|
284
|
+
│
|
|
285
|
+
├── middleware/ # 中间件
|
|
286
|
+
│ ├── __init__.py
|
|
287
|
+
│ ├── middleware_manager.py # 中间件管理器
|
|
288
|
+
│ ├── default_header.py # 默认请求头
|
|
289
|
+
│ ├── download_delay.py # 下载延迟
|
|
290
|
+
│ ├── proxy.py # 代理支持
|
|
291
|
+
│ ├── request_ignore.py # 请求忽略
|
|
292
|
+
│ ├── response_code.py # 响应码处理
|
|
293
|
+
│ ├── response_filter.py # 响应过滤
|
|
294
|
+
│ └── retry.py # 重试机制
|
|
295
|
+
│
|
|
296
|
+
├── pipelines/ # 数据管道
|
|
297
|
+
│ ├── __init__.py
|
|
298
|
+
│ ├── pipeline_manager.py # 管道管理器
|
|
299
|
+
│ ├── base_pipeline.py # 管道基类
|
|
300
|
+
│ ├── console_pipeline.py # 控制台输出管道
|
|
301
|
+
│ └── mysql_pipeline.py # MySQL存储管道
|
|
302
|
+
│
|
|
303
|
+
├── extension/ # 扩展组件
|
|
304
|
+
│ ├── __init__.py
|
|
305
|
+
│ ├── log_interval.py # 定时日志
|
|
306
|
+
│ ├── log_stats.py # 统计日志
|
|
307
|
+
│ ├── logging_extension.py # 日志扩展
|
|
308
|
+
│ ├── memory_monitor.py # 内存监控
|
|
309
|
+
│ └── performance_profiler.py # 性能分析
|
|
310
|
+
│
|
|
311
|
+
├── settings/ # 配置系统
|
|
312
|
+
│ ├── __init__.py
|
|
313
|
+
│ ├── default_settings.py # 默认配置
|
|
314
|
+
│ └── setting_manager.py # 配置管理器
|
|
315
|
+
│
|
|
316
|
+
├── utils/ # 工具库
|
|
317
|
+
│ ├── __init__.py
|
|
318
|
+
│ ├── log.py # 日志工具
|
|
319
|
+
│ ├── request.py # 请求工具
|
|
320
|
+
│ ├── request_serializer.py # 请求序列化
|
|
321
|
+
│ └── func_tools.py # 函数工具
|
|
322
|
+
│
|
|
323
|
+
└── templates/ # 模板文件
|
|
324
|
+
├── project/
|
|
325
|
+
└── spider/
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
### 组件说明
|
|
329
|
+
|
|
330
|
+
- **Crawler**: 爬虫运行实例,管理Spider与引擎的生命周期
|
|
331
|
+
- **Engine**: 引擎组件,协调Scheduler、Downloader、Processor
|
|
332
|
+
- **Scheduler**: 调度器,管理请求队列和去重过滤
|
|
333
|
+
- **Downloader**: 下载器,负责网络请求,支持多种实现(aiohttp, httpx, curl-cffi)
|
|
334
|
+
- **Processor**: 处理器,处理响应数据和管道
|
|
335
|
+
- **QueueManager**: 统一的队列管理器,支持内存队列和Redis队列的自动切换
|
|
336
|
+
- **Filter**: 请求去重过滤器,支持内存和Redis两种实现
|
|
337
|
+
- **Middleware**: 中间件系统,处理请求/响应的预处理和后处理
|
|
338
|
+
- **Pipeline**: 数据处理管道,支持多种存储方式(控制台、数据库等)
|
|
339
|
+
- **Spider**: 爬虫基类,定义爬取逻辑
|
|
340
|
+
|
|
341
|
+
### 运行模式
|
|
342
|
+
|
|
343
|
+
Crawlo支持三种运行模式:
|
|
344
|
+
- **standalone**: 单机模式,使用内存队列和内存过滤器
|
|
345
|
+
- **distributed**: 分布式模式,使用Redis队列和Redis过滤器
|
|
346
|
+
- **auto**: 自动检测模式,根据环境自动选择最佳运行方式
|
|
347
|
+
|
|
348
|
+
## 🎛️ 配置系统
|
|
349
|
+
|
|
350
|
+
### 传统配置方式
|
|
351
|
+
|
|
352
|
+
```python
|
|
353
|
+
# settings.py
|
|
354
|
+
PROJECT_NAME = 'myproject'
|
|
355
|
+
CONCURRENCY = 16
|
|
356
|
+
DOWNLOAD_DELAY = 1.0
|
|
357
|
+
QUEUE_TYPE = 'memory' # 单机模式
|
|
358
|
+
# QUEUE_TYPE = 'redis' # 分布式模式
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
### 命令行配置
|
|
362
|
+
|
|
363
|
+
```bash
|
|
364
|
+
crawlo run myspider --concurrency=32 --delay=0.5
|
|
365
|
+
```
|
|
366
|
+
|
|
367
|
+
## 🧩 核心组件
|
|
368
|
+
|
|
369
|
+
### 中间件系统
|
|
370
|
+
灵活的中间件系统,支持请求预处理、响应处理和异常处理。
|
|
371
|
+
|
|
372
|
+
### 管道系统
|
|
373
|
+
可扩展的数据处理管道,支持多种存储方式(控制台、数据库等)。
|
|
374
|
+
|
|
375
|
+
### 扩展组件
|
|
376
|
+
功能增强扩展,包括日志、监控、性能分析等。
|
|
377
|
+
|
|
378
|
+
### 过滤系统
|
|
379
|
+
智能去重过滤,支持多种去重策略(内存、Redis、Bloom Filter)。
|
|
380
|
+
|
|
381
|
+
## 📦 示例项目
|
|
382
|
+
|
|
383
|
+
- [API数据采集](examples/api_data_collection/) - 简单的API数据采集示例
|
|
384
|
+
- [电信设备许可证](examples/telecom_licenses_distributed/) - 分布式爬取示例
|
|
385
|
+
|
|
386
|
+
## 📚 文档
|
|
387
|
+
|
|
388
|
+
完整的文档请访问 [Crawlo Documentation](https://crawlo.readthedocs.io/)
|
|
389
|
+
|
|
390
|
+
- [快速开始指南](docs/quick_start.md)
|
|
391
|
+
- [框架文档](docs/crawlo_framework_documentation.md)
|
|
392
|
+
- [API参考](docs/api_reference.md)
|
|
393
|
+
- [分布式爬取教程](docs/distributed_crawling_tutorial.md)
|
|
394
|
+
- [配置最佳实践](docs/configuration_best_practices.md)
|
|
395
|
+
- [扩展组件](docs/extensions.md)
|
|
396
|
+
|
|
397
|
+
## 🤝 贡献
|
|
398
|
+
|
|
399
|
+
欢迎提交 Issue 和 Pull Request 来帮助改进 Crawlo!
|
|
400
|
+
|
|
401
|
+
## 📄 许可证
|
|
402
|
+
|
|
403
|
+
本项目采用 MIT 许可证,详情请见 [LICENSE](LICENSE) 文件。
|