crawlo-1.1.3-py3-none-any.whl → crawlo-1.1.4-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (118)
  1. crawlo/__init__.py +34 -34
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/list.py +155 -155
  8. crawlo/commands/run.py +285 -285
  9. crawlo/commands/startproject.py +196 -196
  10. crawlo/commands/stats.py +188 -188
  11. crawlo/commands/utils.py +186 -186
  12. crawlo/config.py +279 -279
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -171
  15. crawlo/core/enhanced_engine.py +189 -189
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +165 -165
  18. crawlo/crawler.py +1027 -1027
  19. crawlo/downloader/__init__.py +242 -242
  20. crawlo/downloader/aiohttp_downloader.py +212 -212
  21. crawlo/downloader/cffi_downloader.py +251 -251
  22. crawlo/downloader/httpx_downloader.py +259 -259
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +81 -81
  25. crawlo/extension/__init__.py +38 -31
  26. crawlo/extension/health_check.py +142 -0
  27. crawlo/extension/log_interval.py +58 -49
  28. crawlo/extension/log_stats.py +82 -44
  29. crawlo/extension/logging_extension.py +44 -35
  30. crawlo/extension/memory_monitor.py +89 -0
  31. crawlo/extension/performance_profiler.py +118 -0
  32. crawlo/extension/request_recorder.py +108 -0
  33. crawlo/filters/__init__.py +154 -154
  34. crawlo/filters/aioredis_filter.py +241 -241
  35. crawlo/filters/memory_filter.py +269 -269
  36. crawlo/items/__init__.py +23 -23
  37. crawlo/items/base.py +21 -21
  38. crawlo/items/fields.py +53 -53
  39. crawlo/items/items.py +104 -104
  40. crawlo/middleware/__init__.py +21 -21
  41. crawlo/middleware/default_header.py +32 -32
  42. crawlo/middleware/download_delay.py +28 -28
  43. crawlo/middleware/middleware_manager.py +135 -135
  44. crawlo/middleware/proxy.py +248 -248
  45. crawlo/middleware/request_ignore.py +30 -30
  46. crawlo/middleware/response_code.py +18 -18
  47. crawlo/middleware/response_filter.py +26 -26
  48. crawlo/middleware/retry.py +124 -124
  49. crawlo/mode_manager.py +200 -200
  50. crawlo/network/__init__.py +21 -21
  51. crawlo/network/request.py +311 -311
  52. crawlo/network/response.py +271 -271
  53. crawlo/pipelines/__init__.py +21 -21
  54. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  55. crawlo/pipelines/console_pipeline.py +39 -39
  56. crawlo/pipelines/csv_pipeline.py +316 -316
  57. crawlo/pipelines/database_dedup_pipeline.py +224 -224
  58. crawlo/pipelines/json_pipeline.py +218 -218
  59. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  60. crawlo/pipelines/mongo_pipeline.py +132 -117
  61. crawlo/pipelines/mysql_pipeline.py +317 -195
  62. crawlo/pipelines/pipeline_manager.py +56 -56
  63. crawlo/pipelines/redis_dedup_pipeline.py +162 -162
  64. crawlo/project.py +153 -153
  65. crawlo/queue/pqueue.py +37 -37
  66. crawlo/queue/queue_manager.py +307 -307
  67. crawlo/queue/redis_priority_queue.py +208 -208
  68. crawlo/settings/__init__.py +7 -7
  69. crawlo/settings/default_settings.py +278 -244
  70. crawlo/settings/setting_manager.py +99 -99
  71. crawlo/spider/__init__.py +639 -639
  72. crawlo/stats_collector.py +59 -59
  73. crawlo/subscriber.py +131 -106
  74. crawlo/task_manager.py +30 -30
  75. crawlo/templates/crawlo.cfg.tmpl +10 -10
  76. crawlo/templates/project/__init__.py.tmpl +3 -3
  77. crawlo/templates/project/items.py.tmpl +17 -17
  78. crawlo/templates/project/middlewares.py.tmpl +111 -87
  79. crawlo/templates/project/pipelines.py.tmpl +97 -341
  80. crawlo/templates/project/run.py.tmpl +251 -251
  81. crawlo/templates/project/settings.py.tmpl +279 -250
  82. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  83. crawlo/templates/spider/spider.py.tmpl +142 -178
  84. crawlo/utils/__init__.py +7 -7
  85. crawlo/utils/controlled_spider_mixin.py +439 -439
  86. crawlo/utils/date_tools.py +233 -233
  87. crawlo/utils/db_helper.py +343 -343
  88. crawlo/utils/func_tools.py +82 -82
  89. crawlo/utils/large_scale_config.py +286 -286
  90. crawlo/utils/large_scale_helper.py +343 -343
  91. crawlo/utils/log.py +128 -128
  92. crawlo/utils/queue_helper.py +175 -175
  93. crawlo/utils/request.py +267 -267
  94. crawlo/utils/request_serializer.py +219 -219
  95. crawlo/utils/spider_loader.py +62 -62
  96. crawlo/utils/system.py +11 -11
  97. crawlo/utils/tools.py +4 -4
  98. crawlo/utils/url.py +39 -39
  99. crawlo-1.1.4.dist-info/METADATA +403 -0
  100. crawlo-1.1.4.dist-info/RECORD +117 -0
  101. examples/__init__.py +7 -7
  102. examples/controlled_spider_example.py +205 -205
  103. tests/__init__.py +7 -7
  104. tests/test_final_validation.py +153 -153
  105. tests/test_proxy_health_check.py +32 -32
  106. tests/test_proxy_middleware_integration.py +136 -136
  107. tests/test_proxy_providers.py +56 -56
  108. tests/test_proxy_stats.py +19 -19
  109. tests/test_proxy_strategies.py +59 -59
  110. tests/test_redis_config.py +28 -28
  111. tests/test_redis_queue.py +224 -224
  112. tests/test_request_serialization.py +70 -70
  113. tests/test_scheduler.py +241 -241
  114. crawlo-1.1.3.dist-info/METADATA +0 -635
  115. crawlo-1.1.3.dist-info/RECORD +0 -113
  116. {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
  117. {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
  118. {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0
crawlo/utils/url.py CHANGED
@@ -1,40 +1,40 @@
-from urllib.parse import urldefrag
-from w3lib.url import add_or_replace_parameter
-
-
-def escape_ajax(url: str) -> str:
-    """
-    Convert a URL according to Google's AJAX crawling scheme (handles #! hash fragments):
-    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
-
-    Rules:
-    1. Only convert URLs that contain `#!` (marking an AJAX-crawlable page)
-    2. Rewrite `#!key=value` as `?_escaped_fragment_=key%3Dvalue`
-    3. Preserve any existing query parameters
-
-    Examples:
-    >>> escape_ajax("www.example.com/ajax.html#!key=value")
-    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
-    >>> escape_ajax("www.example.com/ajax.html?k1=v1#!key=value")
-    'www.example.com/ajax.html?k1=v1&_escaped_fragment_=key%3Dvalue'
-    >>> escape_ajax("www.example.com/ajax.html#!")
-    'www.example.com/ajax.html?_escaped_fragment_='
-
-    URLs that are not AJAX-crawlable (no #!) are returned unchanged:
-    >>> escape_ajax("www.example.com/ajax.html#normal")
-    'www.example.com/ajax.html#normal'
-    """
-    # Split the URL into its base and its hash fragment
-    de_frag, frag = urldefrag(url)
-
-    # Only handle fragments that start with "!" (per the Google scheme)
-    if not frag.startswith("!"):
-        return url  # Return unchanged when the rule does not apply
-
-    # Add the `_escaped_fragment_` parameter via the w3lib helper
-    return add_or_replace_parameter(de_frag, "_escaped_fragment_", frag[1:])
-
-
-if __name__ == '__main__':
-    f = escape_ajax('http://example.com/page#!')
+from urllib.parse import urldefrag
+from w3lib.url import add_or_replace_parameter
+
+
+def escape_ajax(url: str) -> str:
+    """
+    Convert a URL according to Google's AJAX crawling scheme (handles #! hash fragments):
+    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
+
+    Rules:
+    1. Only convert URLs that contain `#!` (marking an AJAX-crawlable page)
+    2. Rewrite `#!key=value` as `?_escaped_fragment_=key%3Dvalue`
+    3. Preserve any existing query parameters
+
+    Examples:
+    >>> escape_ajax("www.example.com/ajax.html#!key=value")
+    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
+    >>> escape_ajax("www.example.com/ajax.html?k1=v1#!key=value")
+    'www.example.com/ajax.html?k1=v1&_escaped_fragment_=key%3Dvalue'
+    >>> escape_ajax("www.example.com/ajax.html#!")
+    'www.example.com/ajax.html?_escaped_fragment_='
+
+    URLs that are not AJAX-crawlable (no #!) are returned unchanged:
+    >>> escape_ajax("www.example.com/ajax.html#normal")
+    'www.example.com/ajax.html#normal'
+    """
+    # Split the URL into its base and its hash fragment
+    de_frag, frag = urldefrag(url)
+
+    # Only handle fragments that start with "!" (per the Google scheme)
+    if not frag.startswith("!"):
+        return url  # Return unchanged when the rule does not apply
+
+    # Add the `_escaped_fragment_` parameter via the w3lib helper
+    return add_or_replace_parameter(de_frag, "_escaped_fragment_", frag[1:])
+
+
+if __name__ == '__main__':
+    f = escape_ajax('http://example.com/page#!')
     print(f)
crawlo-1.1.4.dist-info/METADATA ADDED
@@ -0,0 +1,403 @@
Metadata-Version: 2.4
Name: crawlo
Version: 1.1.4
Summary: Crawlo is a high-performance Python crawler framework built on asynchronous I/O, with support for distributed crawling.
Home-page: https://github.com/crawl-coder/Crawlo.git
Author: crawl-coder
Author-email: crawlo@qq.com
License: MIT
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.6
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.12.14
Requires-Dist: aiomysql>=0.2.0
Requires-Dist: aioredis>=2.0.1
Requires-Dist: asyncmy>=0.2.10
Requires-Dist: cssselect>=1.2.0
Requires-Dist: dateparser>=1.2.2
Requires-Dist: httpx[http2]>=0.27.0
Requires-Dist: curl-cffi>=0.13.0
Requires-Dist: lxml>=5.2.1
Requires-Dist: motor>=3.7.0
Requires-Dist: parsel>=1.9.1
Requires-Dist: pydantic>=2.11.7
Requires-Dist: pymongo>=4.11
Requires-Dist: PyMySQL>=1.1.1
Requires-Dist: python-dateutil>=2.9.0.post0
Requires-Dist: redis>=6.2.0
Requires-Dist: requests>=2.32.4
Requires-Dist: six>=1.17.0
Requires-Dist: ujson>=5.9.0
Requires-Dist: urllib3>=2.5.0
Requires-Dist: w3lib>=2.1.2
Requires-Dist: rich>=14.1.0
Requires-Dist: astor>=0.8.1
Requires-Dist: watchdog>=6.0.0
Provides-Extra: render
Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
Requires-Dist: playwright; extra == "render"
Requires-Dist: selenium>=3.141.0; extra == "render"
Provides-Extra: all
Requires-Dist: bitarray>=1.5.3; extra == "all"
Requires-Dist: PyExecJS>=1.5.1; extra == "all"
Requires-Dist: pymongo>=3.10.1; extra == "all"
Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
Requires-Dist: playwright; extra == "all"
Requires-Dist: selenium>=3.141.0; extra == "all"

# Crawlo - Asynchronous Distributed Crawler Framework

<div align="center">

[![Python Version](https://img.shields.io/badge/python-3.8%2B-blue)](https://www.python.org/downloads/)
[![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen)](https://crawlo.readthedocs.io/)

A high-performance asynchronous crawler framework built on asyncio, supporting both standalone and distributed deployment.

</div>

## 🌟 Features

- **Async and fast**: built on asyncio to drive large numbers of concurrent requests
- **Distributed support**: built-in Redis queue makes distributed deployment straightforward
- **Modular design**: middleware, pipeline, and extension systems that are easy to customize and extend
- **Smart deduplication**: multiple dedup strategies (memory, Redis, Bloom filter)
- **Flexible configuration**: several configuration styles to fit different scenarios
- **Rich documentation**: complete bilingual (Chinese/English) docs and example projects

## 🚀 Quick Start

### Installation

```bash
pip install crawlo
```

### Create a project

```bash
crawlo startproject myproject
cd myproject
```

### Write a spider

```python
from crawlo import Spider, Request, Item

class MyItem(Item):
    title = ''
    url = ''

class MySpider(Spider):
    name = 'myspider'

    async def start_requests(self):
        yield Request('https://httpbin.org/get', callback=self.parse)

    async def parse(self, response):
        yield MyItem(
            title='Example Title',
            url=response.url
        )
```

### Run the spider

```bash
crawlo crawl myspider
```

## 🏗️ Architecture

### Component interaction

```
┌─────────────────────────────────────────────────────────────────────┐
│                               Crawler                               │
│  ┌──────────────┐   ┌──────────────┐   ┌──────────────────────────┐ │
│  │    Spider    │   │    Engine    │   │     ExtensionManager     │ │
│  │              │   │              │   │                          │ │
│  │  start_urls  │   │  Scheduler ◄─┼───┼──► StatsCollector        │ │
│  │  parse()     │   │              │   │                          │ │
│  │              │   │ Downloader ◄─┼───┼──► MiddlewareManager     │ │
│  │              │   │              │   │                          │ │
│  │              │   │  Processor ◄─┼───┼──► PipelineManager       │ │
│  └──────────────┘   └──────┬───────┘   └──────────────────────────┘ │
└────────────────────────────┼────────────────────────────────────────┘
                             │
          ┌──────────────────▼──────────────────┐
          │              Scheduler              │
          │   ┌──────────────────────────────┐  │
          │   │         QueueManager         │  │
          │   │  ┌─────────┐  ┌────────────┐ │  │
          │   │  │ Memory  │  │   Redis    │ │  │
          │   │  │  Queue  │  │   Queue    │ │  │
          │   │  └─────────┘  └────────────┘ │  │
          │   └──────────────────────────────┘  │
          │   ┌──────────────────────────────┐  │
          │   │            Filter            │  │
          │   │  ┌─────────┐  ┌────────────┐ │  │
          │   │  │ Memory  │  │   Redis    │ │  │
          │   │  │ Filter  │  │   Filter   │ │  │
          │   │  └─────────┘  └────────────┘ │  │
          │   └──────────────────────────────┘  │
          └──────────────────┬──────────────────┘
                             │
          ┌──────────────────▼──────────────────┐
          │              Downloader             │
          │   ┌──────────────────────────────┐  │
          │   │      MiddlewareManager       │  │
          │   │   RequestMiddleware          │  │
          │   │   ResponseMiddleware         │  │
          │   │   ExceptionMiddleware        │  │
          │   └──────────────┬───────────────┘  │
          │   ┌──────────────▼───────────────┐  │
          │   │  Download Implementations    │  │
          │   │  - AioHttpDownloader         │  │
          │   │  - HttpXDownloader           │  │
          │   │  - CurlCffiDownloader        │  │
          │   └──────────────────────────────┘  │
          └──────────────────┬──────────────────┘
                             │
          ┌──────────────────▼──────────────────┐
          │              Processor              │
          │   ┌──────────────────────────────┐  │
          │   │       PipelineManager        │  │
          │   │  ┌─────────────────────────┐ │  │
          │   │  │     Pipeline Stages     │ │  │
          │   │  │  - ValidationPipeline   │ │  │
          │   │  │  - ProcessingPipeline   │ │  │
          │   │  │  - StoragePipeline      │ │  │
          │   │  └─────────────────────────┘ │  │
          │   └──────────────────────────────┘  │
          └─────────────────────────────────────┘
```

### Run-mode switching

```
                     ┌─────────────────────┐
                     │     ModeManager     │
                     │ (run-mode manager)  │
                     └─────────┬───────────┘
                               │
         ┌─────────────────────┼─────────────────────┐
         │                     │                     │
         ▼                     ▼                     ▼
 ┌───────────────┐   ┌─────────────────┐   ┌─────────────────┐
 │  Standalone   │   │   Distributed   │   │      Auto       │
 │ (standalone   │   │ (distributed    │   │ (auto-detect    │
 │    mode)      │   │    mode)        │   │    mode)        │
 └───────┬───────┘   └─────────┬───────┘   └─────────┬───────┘
         │                     │                     │
         ▼                     ▼                     ▼
 ┌───────────────┐   ┌─────────────────┐   ┌─────────────────┐
 │ Memory Queue  │   │   Redis Queue   │   │   Auto Select   │
 │ Memory Filter │   │   Redis Filter  │   │  Memory/Redis   │
 └───────────────┘   └─────────────────┘   └─────────────────┘
```

### Data flow

```
┌─────────────┐  1. seed requests   ┌──────────────┐
│   Spider    ├────────────────────►│  Scheduler   │
└─────────────┘                     └──────┬───────┘
                                           │ 2. dedup check
                                           ▼
                                 ┌─────────────────┐
                                 │     Filter      │
                                 └─────────┬───────┘
                                           │ 3. enqueue
                                           ▼
                                 ┌─────────────────┐
                                 │      Queue      │
                                 └─────────┬───────┘
                                           │ 4. next request
                                           ▼
                                 ┌─────────────────┐
                                 │   Downloader    │ 5. download
                                 └─────────┬───────┘
                                           │ 6. parse response
                                           ▼
                                 ┌─────────────────┐  7. yield items  ┌─────────────┐
                                 │    Processor    ├─────────────────►│  Pipeline   │
                                 └─────────────────┘                  └──────┬──────┘
                                                                             │ 8. store items
                                                                             ▼
                                                                      ┌─────────────────┐
                                                                      │      Items      │
                                                                      └─────────────────┘
```

### Module hierarchy

```
crawlo/
├── cli.py                      # command-line interface
├── crawler.py                  # crawler runtime
├── project.py                  # project management
├── config.py                   # configuration management
├── mode_manager.py             # run-mode manager
├── stats_collector.py          # stats collector
├── subscriber.py               # event subscriber
├── task_manager.py             # task manager
├── event.py                    # event definitions
├── exceptions.py               # exception definitions
│
├── core/                       # core components
│   ├── engine.py               # engine
│   ├── scheduler.py            # scheduler
│   └── processor.py            # processor
│
├── spider/                     # spider base classes
│   └── __init__.py             # spider metaclass and base class
│
├── network/                    # networking
│   ├── request.py              # request object
│   └── response.py             # response object
│
├── downloader/                 # downloaders
│   ├── __init__.py             # downloader base class
│   ├── aiohttp_downloader.py   # aiohttp implementation
│   ├── httpx_downloader.py     # httpx implementation
│   └── cffi_downloader.py      # curl-cffi implementation
│
├── queue/                      # queue management
│   ├── __init__.py
│   ├── queue_manager.py        # queue manager
│   ├── pqueue.py               # in-memory priority queue
│   └── redis_priority_queue.py # Redis priority queue
│
├── filters/                    # dedup filters
│   ├── __init__.py
│   ├── base_filter.py          # filter base class
│   ├── memory_filter.py        # in-memory filter
│   └── aioredis_filter.py      # Redis filter
│
├── middleware/                 # middleware
│   ├── __init__.py
│   ├── middleware_manager.py   # middleware manager
│   ├── default_header.py       # default request headers
│   ├── download_delay.py       # download delay
│   ├── proxy.py                # proxy support
│   ├── request_ignore.py       # request ignoring
│   ├── response_code.py        # response-code handling
│   ├── response_filter.py      # response filtering
│   └── retry.py                # retry logic
│
├── pipelines/                  # item pipelines
│   ├── __init__.py
│   ├── pipeline_manager.py     # pipeline manager
│   ├── base_pipeline.py        # pipeline base class
│   ├── console_pipeline.py     # console output pipeline
│   └── mysql_pipeline.py       # MySQL storage pipeline
│
├── extension/                  # extensions
│   ├── __init__.py
│   ├── log_interval.py         # interval logging
│   ├── log_stats.py            # stats logging
│   ├── logging_extension.py    # logging extension
│   ├── memory_monitor.py       # memory monitoring
│   └── performance_profiler.py # performance profiling
│
├── settings/                   # settings system
│   ├── __init__.py
│   ├── default_settings.py     # default settings
│   └── setting_manager.py      # settings manager
│
├── utils/                      # utilities
│   ├── __init__.py
│   ├── log.py                  # logging utilities
│   ├── request.py              # request utilities
│   ├── request_serializer.py   # request serialization
│   └── func_tools.py           # function utilities
│
└── templates/                  # template files
    ├── project/
    └── spider/
```

### Component overview

- **Crawler**: the crawl run instance; manages the lifecycle of the Spider and the engine
- **Engine**: coordinates the Scheduler, Downloader, and Processor (see the loop sketch below)
- **Scheduler**: manages the request queue and dedup filtering
- **Downloader**: performs network requests, with multiple implementations (aiohttp, httpx, curl-cffi)
- **Processor**: handles response data and drives the pipelines
- **QueueManager**: unified queue manager that switches automatically between in-memory and Redis queues
- **Filter**: request dedup filter, with in-memory and Redis implementations
- **Middleware**: middleware system for request/response pre- and post-processing
- **Pipeline**: item-processing pipeline supporting multiple storage backends (console, databases, etc.)
- **Spider**: spider base class that defines the crawling logic

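Reduced to its essence, the engine's coordination amounts to a loop like the one below. This is a deliberately simplified sketch: the method names are illustrative stand-ins, not the actual API of `crawlo/core/engine.py`, which also handles concurrency limits, events, and error recovery.

```python
# Simplified coordination loop; method names are illustrative stand-ins
# for what crawlo/core/engine.py actually exposes.
async def engine_loop(scheduler, downloader, processor):
    while True:
        request = await scheduler.next_request()    # pull a deduplicated request
        if request is None:
            break                                   # queue drained: crawl is done
        response = await downloader.fetch(request)  # middleware chain + download
        await processor.process(response)           # parse and feed the pipelines
```
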
### Run modes

Crawlo supports three run modes:
- **standalone**: single-machine mode using the in-memory queue and filter
- **distributed**: distributed mode using the Redis queue and filter
- **auto**: auto-detect mode that picks the most suitable setup for the environment (see the detection sketch after this list)

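The detection itself can be as simple as probing Redis and falling back to the in-memory backend. The sketch below is illustrative only: `pick_queue_type` is a hypothetical helper, not Crawlo's API; only the `QUEUE_TYPE` values (`'memory'`/`'redis'`) come from the settings shown in the configuration section.

```python
# Hypothetical auto-detection sketch; pick_queue_type is not a Crawlo API.
from typing import Optional

import redis  # redis-py, already one of crawlo's dependencies


def pick_queue_type(redis_url: Optional[str]) -> str:
    """Return 'redis' when a Redis server is reachable, else 'memory'."""
    if not redis_url:
        return "memory"
    try:
        redis.Redis.from_url(redis_url, socket_connect_timeout=1).ping()
        return "redis"
    except redis.RedisError:
        return "memory"


QUEUE_TYPE = pick_queue_type("redis://localhost:6379/0")
```
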
## 🎛️ Configuration

### Traditional configuration

```python
# settings.py
PROJECT_NAME = 'myproject'
CONCURRENCY = 16
DOWNLOAD_DELAY = 1.0
QUEUE_TYPE = 'memory'   # standalone mode
# QUEUE_TYPE = 'redis'  # distributed mode
```

### Command-line configuration

```bash
crawlo crawl myspider --concurrency=32 --delay=0.5
```

## 🧩 Core Components

### Middleware system
A flexible middleware system with hooks for request preprocessing, response processing, and exception handling.

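As an illustration, a custom middleware could look like the sketch below. The hook names are assumed to follow the common Scrapy-style convention; `crawlo/middleware/middleware_manager.py` defines the actual contract.

```python
# Assumed Scrapy-style hook names; verify against
# crawlo/middleware/middleware_manager.py before relying on them.
class CustomHeaderMiddleware:
    def process_request(self, request, spider):
        # Request preprocessing: attach a header before the download happens.
        request.headers.setdefault("X-Crawler", spider.name)
        return None  # None means: continue down the middleware chain

    def process_response(self, request, response, spider):
        # Response processing: pass the response along unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # Exception handling: None defers to other middlewares / retry logic.
        return None
```
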
### Pipeline system
An extensible item-processing pipeline with multiple storage backends (console, databases, and so on).

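A minimal pipeline sketch under the same caveat: `process_item` is the conventional hook name in Scrapy-like frameworks and an assumption here, as is dict-style item access; the shipped pipelines under `crawlo/pipelines/` show the real interface.

```python
# Assumed process_item hook and dict-style item access; see
# crawlo/pipelines/console_pipeline.py for the framework's real interface.
class RequireUrlPipeline:
    def process_item(self, item, spider):
        # Validation stage: reject items with no URL, pass the rest on.
        if not item.get("url"):
            raise ValueError(f"{spider.name}: item missing required field 'url'")
        return item
```
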
### Extensions
Optional add-ons for logging, monitoring, performance profiling, and similar concerns.

### Filter system
Smart dedup filtering with multiple strategies (memory, Redis, Bloom filter).

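All of these strategies boil down to the same idea: reduce each request to a fingerprint and test set membership. The generic sketch below uses `hashlib` and a Redis set; it is not Crawlo's internal implementation (see `crawlo/filters/aioredis_filter.py` for that).

```python
# Generic fingerprint dedup; not Crawlo's internal code.
import hashlib

import redis

r = redis.Redis()


def seen_before(url: str, key: str = "myproject:fingerprints") -> bool:
    """Record the URL's fingerprint and report whether it was already known."""
    fp = hashlib.sha256(url.encode("utf-8")).hexdigest()
    # SADD returns 1 when the member is new, 0 when it already existed.
    return r.sadd(key, fp) == 0
```
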
## 📦 Example Projects

- [API data collection](examples/api_data_collection/) - a simple API scraping example
- [Telecom equipment licenses](examples/telecom_licenses_distributed/) - a distributed crawling example

## 📚 Documentation

Full documentation is available at [Crawlo Documentation](https://crawlo.readthedocs.io/)

- [Quick start guide](docs/quick_start.md)
- [Framework documentation](docs/crawlo_framework_documentation.md)
- [API reference](docs/api_reference.md)
- [Distributed crawling tutorial](docs/distributed_crawling_tutorial.md)
- [Configuration best practices](docs/configuration_best_practices.md)
- [Extensions](docs/extensions.md)

## 🤝 Contributing

Issues and pull requests are welcome to help improve Crawlo!

## 📄 License

This project is released under the MIT License; see the [LICENSE](LICENSE) file for details.
crawlo-1.1.4.dist-info/RECORD ADDED
@@ -0,0 +1,117 @@
crawlo/__init__.py,sha256=esOolburYDjtF43D5N9Kh6TSQW2yKcz888ilhBSinBc,825
crawlo/__version__.py,sha256=XxXhu8-QnuD9hA8Ah0WX5rgpt_DwOQmAwcK-FtpngyQ,22
crawlo/cli.py,sha256=CtR2Pfa7SyRxEKPaXqt-6E6K5Vq5z3rfdAI95UO4cbU,1166
crawlo/config.py,sha256=i0Amz6wNPgv_aVcdCBRRlcwuZLSa87cH9OEmTQvB97Q,8329
crawlo/crawler.py,sha256=v6i5tjgSOtbMoqiw1qdgKx1cY4kcVcd5l5bUTWtJNNU,36461
crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
crawlo/mode_manager.py,sha256=WIxrq9S3EAH0D71LH1AxvcqXomeABqoXgtUN4A--DKY,6702
crawlo/project.py,sha256=xWN2eTAjf_Pza-wWvvV4JjScQRWxe9hXlztX81ccUMc,5182
crawlo/stats_collector.py,sha256=NkO09CB-220qz5rxFcD_dedGfr2VPFrDo4hya0Zh8Qc,1577
crawlo/subscriber.py,sha256=gioTIqRdEwVG-bwIiQonbk1vWWAqTh9hzVkrqZ1AfP0,5006
crawlo/task_manager.py,sha256=19InAxS9oJ9EMj20Aw2urN-v6BeC22dkgPuW-B9-4UI,819
crawlo/commands/__init__.py,sha256=AMYjXG7ulE8dPVmgWVo0uqXsaCYUUZYmmu2-7kFzH1M,342
crawlo/commands/check.py,sha256=172OiAxnX5wwSlszUsyPgMZwAoIbGDTdfhtRz309ilc,22843
crawlo/commands/genspider.py,sha256=-jGJdfXLsefX_H1ydQ2wirdu6p6wmhClzVXY_0L-1aE,5050
crawlo/commands/list.py,sha256=yByqQeZBgvjewOKxpnOobpeJ7Hnbs-CWsoyITqZu2ZY,5781
crawlo/commands/run.py,sha256=8Qngjsl8Q4RBdO39a__wKGsheY2PFuPit2hds_jwEbM,10524
crawlo/commands/startproject.py,sha256=bzNgpkKzUEggY2m7Iw810mSPe8wOPFBqSCO0jZX3z_g,7138
crawlo/commands/stats.py,sha256=6pAgkEi8MBnCer2rWmKpaTYr1jaM6HeMG9owAvEzJyY,6064
crawlo/commands/utils.py,sha256=nohMvUU2zLvX0XzXk6KeCNxP0EvSWj9DiVLxM_7tD5o,5106
crawlo/core/__init__.py,sha256=PnFyJdVNHBoPmV1sW0AHQXijeoSTQ8cMYrbNM1JK8kA,41
crawlo/core/engine.py,sha256=8Dcew1XyxChW5Fz1wFEWpJlPrQb2hKDWKul8e61S-Q0,6662
crawlo/core/enhanced_engine.py,sha256=9I9Uxdy2oAz8zDGTzEiytuKu__VDVmIN8zwZKfrD8bw,6254
crawlo/core/processor.py,sha256=qmCqAeqhwYu-UE86evYesaGt9qpuSIfH-ZIZKcXFCZc,1140
crawlo/core/scheduler.py,sha256=fiU-Q-lzyC3B6ih8NSWqjP1Xw_ryNVb_4dLUARtWRBE,5804
crawlo/downloader/__init__.py,sha256=tl0mE54reR-PuJYSsXsKP2VY5uzvq4lITxZwKKjNzPs,7663
crawlo/downloader/aiohttp_downloader.py,sha256=UKupGYPOWrscAVsjhFgKYElTa9tbEeltqV7nuWqjIeE,8005
crawlo/downloader/cffi_downloader.py,sha256=-GVfSIhi1Ip56suSiGf8jnUE2EBF1P56vw0uxLh_T6I,10440
crawlo/downloader/httpx_downloader.py,sha256=7jfQfvAtfk8yD_mvwUbWLhYOxMM7r1nudiU7m_Jl9wc,12037
crawlo/extension/__init__.py,sha256=Sg588p6UhyrwFNTiD2wqGW-i3xgLX6HlLuQPKT7mayE,1526
crawlo/extension/health_check.py,sha256=IVaaVo_0CcZtf1LoCAYXIBvs3wZ7hdmT6U4-NYWAgP0,5527
crawlo/extension/log_interval.py,sha256=VCIeNqXcWDnxj4m6l77cjqgRzV8LfsPMb22X0Xc1Vwc,2417
crawlo/extension/log_stats.py,sha256=Ssxz6R1YpWIj5WJvQ2cJ9F5oR7FUFdj-ITc9lV92SSU,2908
crawlo/extension/logging_extension.py,sha256=ET6VAu1J2qNMz4NnG1G3zQLRhbsvV7l6xRIuQLE6DaE,1626
crawlo/extension/memory_monitor.py,sha256=gg-GK5RD9XhnrN_zp3KTmPKyWDmKLMv_OTY-HxSxBNI,3664
crawlo/extension/performance_profiler.py,sha256=NvQuuvE83dXJ-1URpN8OF9za9J1l7xhVbV22JynPQpA,4235
crawlo/extension/request_recorder.py,sha256=RC23yzXClnVv9j2ljQvjBkUfWznfnDHsrQejKhE9y5E,4074
crawlo/filters/__init__.py,sha256=XC_Q4ykZtSNYizYlAcehVwBBNO3lZ2zuWwafzXiuWyQ,4241
crawlo/filters/aioredis_filter.py,sha256=WhkFZcVAym9wLSUa8WTVctYfEibjxG42umtmacO1IY0,8370
crawlo/filters/memory_filter.py,sha256=VJO0UFRYGxmV8dj4G1subsQ-FtvPcGLbvd7IVtqXnOs,9260
crawlo/items/__init__.py,sha256=bqekZrRlDhxfWie0UbCs656TptYseoe9QJ67I4E7Elk,386
crawlo/items/base.py,sha256=tAYrPJgblp3ZEihDXvappdYc6pGdim6x2_9QSmMKI2o,577
crawlo/items/fields.py,sha256=wMlakQTsEwyrlLzMt1gI4pScLQZMqd3E1xcfH4dbSqk,1801
crawlo/items/items.py,sha256=e-3nXI9ckD64vcDxxQiAU6ufbtJMs09gbZQcYjxgwHY,3374
crawlo/middleware/__init__.py,sha256=ldaGFNbiJnK9Fx12Vdf9fDNfzXxoETtShp5r-vodtw0,549
crawlo/middleware/default_header.py,sha256=i_Uj07JObyeZFxL7ZAZmvZsHvA1HGtkNab1sA0d-nWI,1067
crawlo/middleware/download_delay.py,sha256=2M-TchDA7MwyTfYy0Hzh_bW9wlHlpiP-oQlys7crTj0,966
crawlo/middleware/middleware_manager.py,sha256=j1hkWRFB5rnC5SnB7oXWE5eUNv8blS9krDIDM5fIDs8,6213
crawlo/middleware/proxy.py,sha256=m2ZZ50En9hUtgrqSqA6hItGT74xMqccHFPhZshutIco,9811
crawlo/middleware/request_ignore.py,sha256=QI2z4fUnJ-4xvPTZAmsL-GqR4RFHS1xq9iDr5KFrMco,997
crawlo/middleware/response_code.py,sha256=tmef2QVl3JCiTMii6VQkASlOY2OyqmOPoOfNxIK1eF8,659
crawlo/middleware/response_filter.py,sha256=ep8ZxDlfIefi9YqK8dPASEp5TTDRo9QEY_jMceC411s,837
crawlo/middleware/retry.py,sha256=-7zpRURugiTTm4QYUSUlbnURD5mcT2Ji0yHvCgY1wGc,4124
crawlo/network/__init__.py,sha256=BLPERYPo22g1BXrW--wUnlolrdFUmOPjgOB8XQQJlck,397
crawlo/network/request.py,sha256=tPAiOVJyF3Kk-midqydTGXgv5M5tsYJRtwUUJTrUsrE,11075
crawlo/network/response.py,sha256=cUvdjsB2cQ-qWEKHNGIkwWGgCg-EnQ81xTrjrUOVno0,9738
crawlo/pipelines/__init__.py,sha256=lrdVDjeHLNkA4_MAwI1auk_I9xfeU1SlBWXiammb6lc,616
crawlo/pipelines/bloom_dedup_pipeline.py,sha256=QQxGFGEoMHN4Vx2kq7G_i1o9pmuXp8clZebilOar3fk,5642
crawlo/pipelines/console_pipeline.py,sha256=KABkR3J-rqO0Awox7lizxKR2XuHfVhWPiVRgIybwwu4,1248
crawlo/pipelines/csv_pipeline.py,sha256=6FBT2AoU6iNU-5NfgWRq7-JpF9dK2nBokjxx-y4jIas,12174
crawlo/pipelines/database_dedup_pipeline.py,sha256=wVBXEGArFR3uxoN7yfJSOarBmtGrJpOqowAqa7OUs98,8000
crawlo/pipelines/json_pipeline.py,sha256=vlu1nqbD2mtqtExt9cL5nibx1CwJM1RNqd4WGjZRHAY,8367
crawlo/pipelines/memory_dedup_pipeline.py,sha256=5jeL2jEq7sioYmXlzfkx-LNSbWyChrXeWx8d15YEZOA,3839
crawlo/pipelines/mongo_pipeline.py,sha256=k7gNqAO-g2MtIfArphC6z5ZzkKVRkBKcv-2ImziPFA0,5706
crawlo/pipelines/mysql_pipeline.py,sha256=cwgJvRORTRea_Eep2coBaMf3G8PQVTQA1qrnIlDZApc,13480
crawlo/pipelines/pipeline_manager.py,sha256=VrbebOYiqrobtKhp5II18w-odCICdWkmRg5WPK0Emz4,2112
crawlo/pipelines/redis_dedup_pipeline.py,sha256=TaokJ4wP5-Cxf-ueFJdh4SX58hchT0QzZ5RBDXHDN64,6003
crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
crawlo/queue/pqueue.py,sha256=yzF900ko2RReVNZtfk_VV3PzSXEUALI6SHf97geuu6s,1058
crawlo/queue/queue_manager.py,sha256=NMR0Fo8XFBg6_g7htq4D80cS6Ilo0EKt5QtyF-KxNuM,11467
crawlo/queue/redis_priority_queue.py,sha256=boJCKqcKxRw9XCCzaHy5qmrX9DvzPiQBzOkBHI5odfE,8116
crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
crawlo/settings/default_settings.py,sha256=B4_61tNJvqzVvyqt9AtRV7Iq5q8G4pJOExcN0ve7S_A,11559
crawlo/settings/setting_manager.py,sha256=SxKB1aCWh4OySM_bH9cYng9I3PAmrSP-Q8XOZEWEwbI,2899
crawlo/spider/__init__.py,sha256=Z_rK23l5yt-DuwJPg8bcqodM_FIs4-iHLaKOimGumcE,20452
crawlo/templates/crawlo.cfg.tmpl,sha256=9BAmwEibS5Tvy6HIcGXPb0BGeuesmibebmTW0iAEkmo,230
crawlo/templates/project/__init__.py.tmpl,sha256=f3ETIXw_O6K-lkL6lXM5znMPJW1FZYGFrwDs2BnHcnQ,58
crawlo/templates/project/items.py.tmpl,sha256=mt1Mm--H2Ouos3r7JPkYh0r33rgYJf1YOMz0OZy8TYs,297
crawlo/templates/project/middlewares.py.tmpl,sha256=jpmj7b7Zb7d3nVyxcaVNdp4KqSts6l2cPSqn_oJUSrM,3775
crawlo/templates/project/pipelines.py.tmpl,sha256=k_4MJnwZ6GPqVwJSEDURUlTxWybmts4vHrF0de2vgAk,2620
crawlo/templates/project/run.py.tmpl,sha256=ktkYOslcCh9mpklg6yE5VqfATx3Frj_jNT5z5gHjQ4o,8177
crawlo/templates/project/settings.py.tmpl,sha256=O_teIARjzRD3aMvPnuIgjaDHdjwW-3beyzfo1QH-Hr8,9580
crawlo/templates/project/spiders/__init__.py.tmpl,sha256=j_YKsw6HQMJyqlk3WUouP3bsr-XVxshRoSNakHBc00g,106
crawlo/templates/spider/spider.py.tmpl,sha256=a8S9j43z5gE4auMhf_OnnuVHSZN3JbMDu8Bczu8zIZY,4944
crawlo/utils/__init__.py,sha256=BDORpyjMN7VGPKImnCDKSkprS-petgD7ezc9rMlBvb0,123
crawlo/utils/controlled_spider_mixin.py,sha256=VjT30pNW_YIgmTD0nb7DDl2D3HvpnAYFzgSgV3fxFN0,16475
crawlo/utils/date_tools.py,sha256=0yG0tzGb1VFgWDJJ_cow2LJfz3kj_w2MqSjmfKKESl8,6961
crawlo/utils/db_helper.py,sha256=3ib5-agrlwf2t5S_QtLRYH75wvJDlYbRqRmDEbpH5Bo,10559
crawlo/utils/func_tools.py,sha256=WUZEGpWMuDDX7g-QySM7iaiC74erW2SSkZoUvDw1NjM,2369
crawlo/utils/large_scale_config.py,sha256=j7wQ5ty7pQlBRygw2vhRJ7OI19RYBZKPfYMP3WeF2WI,8154
crawlo/utils/large_scale_helper.py,sha256=JJqcGSI6VaVe3MSL6IWjmCp8XQIu6T4U-BvBLSttr_s,12157
crawlo/utils/log.py,sha256=A3lPyhD8kD88cV23KOL-_eT8g69xGQ5L1toDB2AO0mc,4005
crawlo/utils/queue_helper.py,sha256=xpUUTOqlU1xz2Pb9NKAVGo3AfAO-7Xvx8Lm1q65Dgck,4743
crawlo/utils/request.py,sha256=yoLB2rY8d78vgPjIWpdhY5SalIKjyLIvTG_UH6EMdVI,8798
crawlo/utils/request_serializer.py,sha256=bPoSQqE2ksiMyP3WiPB3w3UqZs4f_LgkAw4Pj0qyBDo,8565
crawlo/utils/spider_loader.py,sha256=pEDUsYOTGjszA6KgjiMlYN4GS5fP4uakkhcp3JTFFQY,2187
crawlo/utils/system.py,sha256=HvWV1acxou0Rn0L7pNq4CnV_GWFeU0Tmjy3_nLD8M64,237
crawlo/utils/tools.py,sha256=5Uv25Wy4m_ndZY0-n-eX-t3PxvaZ6wR3-Wvx-o7_Vrs,271
crawlo/utils/url.py,sha256=rlgX2VlJv6JvLmCDTsbxzMSXE6R5ZL_0dLEqprsA-JU,1482
examples/__init__.py,sha256=6i631BPnS_TR_BWUjtjB5CBO-zv9kRkwQTQvSya2wHE,123
examples/controlled_spider_example.py,sha256=SP_k4mdKPvD1JCPs9UCm68jcy2Frg84vvXv9-14RC6I,7776
tests/__init__.py,sha256=scL1IPVT1iucth7v8ffrjRdeW7QrC_Y7AMmFVMdTY1Y,129
tests/test_final_validation.py,sha256=fBxf_6YcAEa_HyV_oGAXmmVHY4i6FdA4J6klCmc36hQ,4925
tests/test_proxy_health_check.py,sha256=xo3QMP1YNw9hu7JDHZOYCUZmFFKLJpHSh4SbxXhCRPQ,1091
tests/test_proxy_middleware_integration.py,sha256=zcl7fR9Toc-I-stSUTzKZPwcfh3kgrpjI5SbkZ6AVmE,4305
tests/test_proxy_providers.py,sha256=XwWZCywTYguSsUxSm6fsbaoH1p9dKjqSIx9-sqKZehA,1693
tests/test_proxy_stats.py,sha256=Til_yksrRz2yBVw-yJi5-36LhNW3vTwpXTm4BdR9PUM,507
tests/test_proxy_strategies.py,sha256=ZkziozkvZd3KWOQnpHQ8Upd3WpyoX7gN0qFGluNm348,1809
tests/test_redis_config.py,sha256=TqzFRojc6esGXjGhUCvSLYQDUTAgEJsty9vRVuNraMU,893
tests/test_redis_queue.py,sha256=o6xViXxJcdx-1eMcG3vhAQEIm8h346HnZb7JXs7ZjwM,6622
tests/test_request_serialization.py,sha256=8sVdppAsohJ5u-m1WvablCndwL-M_36YPLdGKwgeznM,2289
tests/test_scheduler.py,sha256=-FOkTWzaMdr6yfO1Msu74hI_GgSfD7iRxO-cFA-9Iyk,7442
crawlo-1.1.4.dist-info/METADATA,sha256=2I2NA0BR-MWoPZmRUkWrUQYMjuPiUi9mrogIYPWpASU,19781
crawlo-1.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
crawlo-1.1.4.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
crawlo-1.1.4.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
crawlo-1.1.4.dist-info/RECORD,,
examples/__init__.py CHANGED
@@ -1,7 +1,7 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time : 2025-02-05 12:36
-# @Author : oscar
-# @Desc : None
-"""
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time : 2025-02-05 12:36
+# @Author : oscar
+# @Desc : None
+"""