crawlo-1.1.3-py3-none-any.whl → crawlo-1.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- crawlo/__init__.py +34 -34
- crawlo/__version__.py +1 -1
- crawlo/cli.py +40 -40
- crawlo/commands/__init__.py +13 -13
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +285 -285
- crawlo/commands/startproject.py +196 -196
- crawlo/commands/stats.py +188 -188
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +279 -279
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +171 -171
- crawlo/core/enhanced_engine.py +189 -189
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +165 -165
- crawlo/crawler.py +1027 -1027
- crawlo/downloader/__init__.py +242 -242
- crawlo/downloader/aiohttp_downloader.py +212 -212
- crawlo/downloader/cffi_downloader.py +251 -251
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +38 -31
- crawlo/extension/health_check.py +142 -0
- crawlo/extension/log_interval.py +58 -49
- crawlo/extension/log_stats.py +82 -44
- crawlo/extension/logging_extension.py +44 -35
- crawlo/extension/memory_monitor.py +89 -0
- crawlo/extension/performance_profiler.py +118 -0
- crawlo/extension/request_recorder.py +108 -0
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +241 -241
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +53 -53
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/proxy.py +248 -248
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +200 -200
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +311 -311
- crawlo/network/response.py +271 -271
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +224 -224
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +132 -117
- crawlo/pipelines/mysql_pipeline.py +317 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/pipelines/redis_dedup_pipeline.py +162 -162
- crawlo/project.py +153 -153
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +307 -307
- crawlo/queue/redis_priority_queue.py +208 -208
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +278 -244
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +131 -106
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +111 -87
- crawlo/templates/project/pipelines.py.tmpl +97 -341
- crawlo/templates/project/run.py.tmpl +251 -251
- crawlo/templates/project/settings.py.tmpl +279 -250
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +142 -178
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/date_tools.py +233 -233
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +343 -343
- crawlo/utils/log.py +128 -128
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +219 -219
- crawlo/utils/spider_loader.py +62 -62
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.1.4.dist-info/METADATA +403 -0
- crawlo-1.1.4.dist-info/RECORD +117 -0
- examples/__init__.py +7 -7
- examples/controlled_spider_example.py +205 -205
- tests/__init__.py +7 -7
- tests/test_final_validation.py +153 -153
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_redis_config.py +28 -28
- tests/test_redis_queue.py +224 -224
- tests/test_request_serialization.py +70 -70
- tests/test_scheduler.py +241 -241
- crawlo-1.1.3.dist-info/METADATA +0 -635
- crawlo-1.1.3.dist-info/RECORD +0 -113
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0
crawlo/utils/url.py
CHANGED

@@ -1,40 +1,40 @@

Every line except the final `print(f)` is removed and re-added with textually identical content (an apparent whitespace- or line-ending-level rewrite). The resulting file:

```python
from urllib.parse import urldefrag
from w3lib.url import add_or_replace_parameter


def escape_ajax(url: str) -> str:
    """
    Convert a URL according to Google's AJAX crawling specification
    (handles #! hash fragments):
    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

    Rules:
    1. Convert only when the URL contains `#!` (marking the page as AJAX-crawlable)
    2. Rewrite `#!key=value` as `?_escaped_fragment_=key%3Dvalue`
    3. Preserve any original query parameters

    Examples:
    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1#!key=value")
    'www.example.com/ajax.html?k1=v1&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not AJAX-crawlable (no #!) are returned unchanged:
    >>> escape_ajax("www.example.com/ajax.html#normal")
    'www.example.com/ajax.html#normal'
    """
    # Split the URL into its base part and its hash fragment
    de_frag, frag = urldefrag(url)

    # Only handle fragments that start with "!" (per the Google spec)
    if not frag.startswith("!"):
        return url  # return unchanged when the rule does not apply

    # Add the `_escaped_fragment_` parameter via the w3lib helper
    return add_or_replace_parameter(de_frag, "_escaped_fragment_", frag[1:])


if __name__ == '__main__':
    f = escape_ajax('http://example.com/page#!')
    print(f)
```
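The transformation is easy to sanity-check against the doctests above; a quick REPL-style check using the module path from this wheel:

```python
from crawlo.utils.url import escape_ajax

# A #! fragment is folded into an _escaped_fragment_ query parameter
print(escape_ajax("http://example.com/page#!key=value"))
# -> http://example.com/page?_escaped_fragment_=key%3Dvalue

# Ordinary fragments pass through unchanged
print(escape_ajax("http://example.com/page#normal"))
# -> http://example.com/page#normal
```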
crawlo-1.1.4.dist-info/METADATA
ADDED

@@ -0,0 +1,403 @@

Metadata-Version: 2.4
Name: crawlo
Version: 1.1.4
Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
Home-page: https://github.com/crawl-coder/Crawlo.git
Author: crawl-coder
Author-email: crawlo@qq.com
License: MIT
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.6
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.12.14
Requires-Dist: aiomysql>=0.2.0
Requires-Dist: aioredis>=2.0.1
Requires-Dist: asyncmy>=0.2.10
Requires-Dist: cssselect>=1.2.0
Requires-Dist: dateparser>=1.2.2
Requires-Dist: httpx[http2]>=0.27.0
Requires-Dist: curl-cffi>=0.13.0
Requires-Dist: lxml>=5.2.1
Requires-Dist: motor>=3.7.0
Requires-Dist: parsel>=1.9.1
Requires-Dist: pydantic>=2.11.7
Requires-Dist: pymongo>=4.11
Requires-Dist: PyMySQL>=1.1.1
Requires-Dist: python-dateutil>=2.9.0.post0
Requires-Dist: redis>=6.2.0
Requires-Dist: requests>=2.32.4
Requires-Dist: six>=1.17.0
Requires-Dist: ujson>=5.9.0
Requires-Dist: urllib3>=2.5.0
Requires-Dist: w3lib>=2.1.2
Requires-Dist: rich>=14.1.0
Requires-Dist: astor>=0.8.1
Requires-Dist: watchdog>=6.0.0
Provides-Extra: render
Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
Requires-Dist: playwright; extra == "render"
Requires-Dist: selenium>=3.141.0; extra == "render"
Provides-Extra: all
Requires-Dist: bitarray>=1.5.3; extra == "all"
Requires-Dist: PyExecJS>=1.5.1; extra == "all"
Requires-Dist: pymongo>=3.10.1; extra == "all"
Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
Requires-Dist: playwright; extra == "all"
Requires-Dist: selenium>=3.141.0; extra == "all"

# Crawlo - Asynchronous Distributed Crawler Framework

<div align="center">

[](https://www.python.org/downloads/)
[](LICENSE)
[](https://crawlo.readthedocs.io/)

A high-performance asynchronous distributed crawler framework built on asyncio, supporting both standalone and distributed deployment.

</div>

## 🌟 Features

- **Asynchronous and fast**: implemented on asyncio for high-throughput crawling on modern hardware
- **Distributed support**: built-in Redis queue for easy distributed deployment
- **Modular design**: middleware, pipeline, and extension component systems that are easy to customize and extend
- **Smart deduplication**: multiple dedup strategies (memory, Redis, Bloom Filter)
- **Flexible configuration**: multiple configuration styles to fit different scenarios
- **Rich documentation**: complete bilingual (Chinese/English) docs and example projects

## 🚀 Quick Start

### Installation

```bash
pip install crawlo
```

### Create a project

```bash
crawlo startproject myproject
cd myproject
```

### Write a spider

```python
from crawlo import Spider, Request, Item

class MyItem(Item):
    title = ''
    url = ''

class MySpider(Spider):
    name = 'myspider'

    async def start_requests(self):
        yield Request('https://httpbin.org/get', callback=self.parse)

    async def parse(self, response):
        yield MyItem(
            title='Example Title',
            url=response.url
        )
```

### Run the spider

```bash
crawlo crawl myspider
```

## 🏗️ Architecture

### Component interaction diagram

```
┌────────────────────────────────────────────────────────────────────┐
│                              Crawler                               │
│  ┌──────────────┐  ┌──────────────┐  ┌──────────────────────────┐  │
│  │    Spider    │  │    Engine    │  │     ExtensionManager     │  │
│  │              │  │              │  │                          │  │
│  │ start_urls   │  │ Scheduler  ◄─┼──┼──► StatsCollector        │  │
│  │ parse()      │  │              │  │                          │  │
│  │              │  │ Downloader ◄─┼──┼──► MiddlewareManager     │  │
│  │              │  │              │  │                          │  │
│  │              │  │ Processor  ◄─┼──┼──► PipelineManager       │  │
│  └──────────────┘  └──────┬───────┘  └──────────────────────────┘  │
└──────────────────────────┼─────────────────────────────────────────┘
                           │
        ┌──────────────────▼──────────────────┐
        │              Scheduler              │
        │  ┌──────────────────────────────┐   │
        │  │         QueueManager         │   │
        │  │  ┌─────────┐  ┌────────────┐ │   │
        │  │  │ Memory  │  │   Redis    │ │   │
        │  │  │ Queue   │  │   Queue    │ │   │
        │  │  └─────────┘  └────────────┘ │   │
        │  └──────────────────────────────┘   │
        │  ┌──────────────────────────────┐   │
        │  │            Filter            │   │
        │  │  ┌─────────┐  ┌────────────┐ │   │
        │  │  │ Memory  │  │   Redis    │ │   │
        │  │  │ Filter  │  │   Filter   │ │   │
        │  │  └─────────┘  └────────────┘ │   │
        │  └──────────────────────────────┘   │
        └─────────────────┬───────────────────┘
                          │
        ┌─────────────────▼───────────────────┐
        │             Downloader              │
        │  ┌──────────────────────────────┐   │
        │  │      MiddlewareManager       │   │
        │  │                              │   │
        │  │  RequestMiddleware   ◄─────┐ │   │
        │  │  ResponseMiddleware        │ │   │
        │  │  ExceptionMiddleware       │ │   │
        │  └────────────────────────────┼─┘   │
        │                               │     │
        │  ┌────────────────────────────▼─┐   │
        │  │   Download Implementations   │   │
        │  │   - AioHttpDownloader        │   │
        │  │   - HttpXDownloader          │   │
        │  │   - CurlCffiDownloader       │   │
        │  └──────────────────────────────┘   │
        └─────────────────┬───────────────────┘
                          │
        ┌─────────────────▼───────────────────┐
        │              Processor              │
        │  ┌──────────────────────────────┐   │
        │  │       PipelineManager        │   │
        │  │  ┌─────────────────────────┐ │   │
        │  │  │     Pipeline Stages     │ │   │
        │  │  │  - ValidationPipeline   │ │   │
        │  │  │  - ProcessingPipeline   │ │   │
        │  │  │  - StoragePipeline      │ │   │
        │  │  └─────────────────────────┘ │   │
        │  └──────────────────────────────┘   │
        └─────────────────────────────────────┘
```

### Run-mode switching diagram

```
                    ┌─────────────────────┐
                    │     ModeManager     │
                    │ (run-mode manager)  │
                    └──────────┬──────────┘
                               │
         ┌─────────────────────┼─────────────────────┐
         │                     │                     │
         ▼                     ▼                     ▼
┌───────────────┐    ┌─────────────────┐    ┌─────────────────┐
│  Standalone   │    │   Distributed   │    │      Auto       │
│ (single node) │    │  (multi node)   │    │  (auto-detect)  │
└───────┬───────┘    └────────┬────────┘    └────────┬────────┘
        │                     │                      │
        ▼                     ▼                      ▼
┌───────────────┐    ┌─────────────────┐    ┌─────────────────┐
│ Memory Queue  │    │   Redis Queue   │    │   Auto Select   │
│ Memory Filter │    │   Redis Filter  │    │  Memory/Redis   │
└───────────────┘    └─────────────────┘    └─────────────────┘
```

### Data flow diagram

```
┌─────────────┐   1. initial requests   ┌──────────────┐
│   Spider    ├────────────────────────►│  Scheduler   │
└─────────────┘                         └──────┬───────┘
                                               │ 2. dedup check
                                               ▼
                                      ┌─────────────────┐
                                      │     Filter      │
                                      └────────┬────────┘
                                               │ 3. enqueue
                                               ▼
                                      ┌─────────────────┐
                                      │      Queue      │
                                      └────────┬────────┘
                                               │ 4. next request
                                               ▼
                                      ┌─────────────────┐   5. download
                                      │   Downloader    ├──────────────────┐
                                      └────────┬────────┘                  │
                                               │ 6. parse response         │
                                               ▼                           ▼
                                      ┌─────────────────┐  7. yield items  ┌─────────────┐
                                      │    Processor    ├─────────────────►│  Pipeline   │
                                      └─────────────────┘                  └──────┬──────┘
                                                                                  │ 8. store data
                                                                                  ▼
                                                                           ┌─────────────┐
                                                                           │    Items    │
                                                                           └─────────────┘
```

### Module hierarchy

```
crawlo/
├── cli.py                     # command-line interface
├── crawler.py                 # crawler runtime instance
├── project.py                 # project management
├── config.py                  # configuration management
├── mode_manager.py            # run-mode manager
├── stats_collector.py         # stats collector
├── subscriber.py              # event subscriber
├── task_manager.py            # task manager
├── event.py                   # event definitions
├── exceptions.py              # exception definitions
│
├── core/                      # core components
│   ├── engine.py              # engine
│   ├── scheduler.py           # scheduler
│   ├── processor.py           # processor
│
├── spider/                    # spider base classes
│   └── __init__.py            # spider metaclass and base class
│
├── network/                   # networking
│   ├── request.py             # request object
│   └── response.py            # response object
│
├── downloader/                # downloaders
│   ├── __init__.py            # downloader base classes
│   ├── aiohttp_downloader.py  # aiohttp implementation
│   ├── httpx_downloader.py    # httpx implementation
│   └── cffi_downloader.py     # curl-cffi implementation
│
├── queue/                     # queue management
│   ├── __init__.py
│   ├── queue_manager.py       # queue manager
│   ├── pqueue.py              # in-memory priority queue
│   └── redis_priority_queue.py  # Redis priority queue
│
├── filters/                   # filters
│   ├── __init__.py
│   ├── base_filter.py         # filter base class
│   ├── memory_filter.py       # memory filter
│   └── aioredis_filter.py     # Redis filter
│
├── middleware/                # middleware
│   ├── __init__.py
│   ├── middleware_manager.py  # middleware manager
│   ├── default_header.py      # default request headers
│   ├── download_delay.py      # download delay
│   ├── proxy.py               # proxy support
│   ├── request_ignore.py      # request ignoring
│   ├── response_code.py       # response-code handling
│   ├── response_filter.py     # response filtering
│   └── retry.py               # retry mechanism
│
├── pipelines/                 # item pipelines
│   ├── __init__.py
│   ├── pipeline_manager.py    # pipeline manager
│   ├── base_pipeline.py       # pipeline base class
│   ├── console_pipeline.py    # console output pipeline
│   └── mysql_pipeline.py      # MySQL storage pipeline
│
├── extension/                 # extensions
│   ├── __init__.py
│   ├── log_interval.py        # interval logging
│   ├── log_stats.py           # stats logging
│   ├── logging_extension.py   # logging extension
│   ├── memory_monitor.py      # memory monitoring
│   └── performance_profiler.py  # performance profiling
│
├── settings/                  # settings system
│   ├── __init__.py
│   ├── default_settings.py    # default settings
│   └── setting_manager.py     # settings manager
│
├── utils/                     # utilities
│   ├── __init__.py
│   ├── log.py                 # logging utilities
│   ├── request.py             # request utilities
│   ├── request_serializer.py  # request serialization
│   └── func_tools.py          # function utilities
│
└── templates/                 # template files
    ├── project/
    └── spider/
```

### Component overview

- **Crawler**: the crawl runtime instance; manages the lifecycle of the Spider and the engine
- **Engine**: coordinates the Scheduler, Downloader, and Processor
- **Scheduler**: manages the request queue and dedup filtering
- **Downloader**: performs network requests, with multiple implementations (aiohttp, httpx, curl-cffi); a selection sketch follows this list
- **Processor**: processes response data and drives the pipelines
- **QueueManager**: unified queue manager that switches automatically between memory and Redis queues
- **Filter**: request dedup filter, with memory and Redis implementations
- **Middleware**: middleware system for request/response pre- and post-processing
- **Pipeline**: data-processing pipelines supporting multiple storage backends (console, databases, etc.)
- **Spider**: spider base class that defines the crawl logic
### Run modes

Crawlo supports three run modes:
- **standalone**: single-node mode, using the in-memory queue and memory filter
- **distributed**: distributed mode, using the Redis queue and Redis filter
- **auto**: auto-detect mode, choosing the best setup for the current environment

## 🎛️ Configuration System

### Settings file

```
# settings.py
PROJECT_NAME = 'myproject'
CONCURRENCY = 16
DOWNLOAD_DELAY = 1.0
QUEUE_TYPE = 'memory'   # standalone mode
# QUEUE_TYPE = 'redis'  # distributed mode
```

### Command-line configuration

```
crawlo crawl myspider --concurrency=32 --delay=0.5
```

## 🧩 Core Components

### Middleware system
A flexible middleware system supporting request preprocessing, response handling, and exception handling; see the sketch below.
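A minimal custom-middleware sketch. The `process_request`/`process_response` hook names assume a Scrapy-style contract suggested by `middleware/middleware_manager.py` in this wheel; they are illustrative assumptions, not confirmed by this diff:

```python
import random


class RandomUserAgentMiddleware:
    """Rotate the User-Agent header on outgoing requests (sketch).

    Hook names assume a Scrapy-style middleware contract and are
    NOT confirmed by this diff.
    """

    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
    ]

    def process_request(self, request, spider):
        # Pick a User-Agent at random for each request
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
        return None  # None: continue down the middleware chain

    def process_response(self, request, response, spider):
        return response  # pass the response through unchanged
```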
### Pipeline system
Extensible data-processing pipelines supporting multiple storage backends (console, databases, etc.); a sketch follows.
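A minimal custom-pipeline sketch. The `open_spider`/`process_item`/`close_spider` hook names assume a Scrapy-style contract suggested by `pipelines/pipeline_manager.py` in this wheel; they are illustrative assumptions, not confirmed by this diff:

```python
import json


class JsonLinesPipeline:
    """Append each scraped item to a .jsonl file (sketch).

    Hook names assume a Scrapy-style pipeline contract and are
    NOT confirmed by this diff.
    """

    def open_spider(self, spider):
        self.file = open(f'{spider.name}.jsonl', 'a', encoding='utf-8')

    async def process_item(self, item, spider):
        # dict(item) assumes the Item type supports dict conversion
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item  # hand the item to the next pipeline stage

    def close_spider(self, spider):
        self.file.close()
```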
### Extension components
Feature-enhancing extensions, including logging, monitoring, and performance profiling.

### Filter system
Smart dedup filtering with multiple strategies (memory, Redis, Bloom Filter); a configuration sketch follows.
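Switching dedup strategies is a configuration choice. A sketch based on the pipeline modules shipped in this wheel (`memory_dedup_pipeline`, `redis_dedup_pipeline`, `bloom_dedup_pipeline`); the `PIPELINES` setting name and the class names are assumptions for illustration, only the module paths appear in this diff:

```python
# settings.py (sketch): choose a dedup strategy. NOTE: the PIPELINES
# setting name and class names are assumptions; only the module paths
# are confirmed by this wheel's file list.
PIPELINES = [
    # In-memory dedup: fine for standalone runs
    'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline',
    # Redis-backed dedup: shared state for distributed runs
    # 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline',
    # Bloom-filter dedup: lower memory use at very large scale
    # 'crawlo.pipelines.bloom_dedup_pipeline.BloomDedupPipeline',
]
```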
## 📦 Example Projects

- [API data collection](examples/api_data_collection/) - a simple API data collection example
- [Telecom equipment licenses](examples/telecom_licenses_distributed/) - a distributed crawling example

## 📚 Documentation

For full documentation, visit [Crawlo Documentation](https://crawlo.readthedocs.io/)

- [Quick start guide](docs/quick_start.md)
- [Framework documentation](docs/crawlo_framework_documentation.md)
- [API reference](docs/api_reference.md)
- [Distributed crawling tutorial](docs/distributed_crawling_tutorial.md)
- [Configuration best practices](docs/configuration_best_practices.md)
- [Extension components](docs/extensions.md)

## 🤝 Contributing

Issues and pull requests to improve Crawlo are welcome!

## 📄 License

This project is licensed under the MIT License; see the [LICENSE](LICENSE) file for details.
crawlo-1.1.4.dist-info/RECORD
ADDED

@@ -0,0 +1,117 @@

crawlo/__init__.py,sha256=esOolburYDjtF43D5N9Kh6TSQW2yKcz888ilhBSinBc,825
crawlo/__version__.py,sha256=XxXhu8-QnuD9hA8Ah0WX5rgpt_DwOQmAwcK-FtpngyQ,22
crawlo/cli.py,sha256=CtR2Pfa7SyRxEKPaXqt-6E6K5Vq5z3rfdAI95UO4cbU,1166
crawlo/config.py,sha256=i0Amz6wNPgv_aVcdCBRRlcwuZLSa87cH9OEmTQvB97Q,8329
crawlo/crawler.py,sha256=v6i5tjgSOtbMoqiw1qdgKx1cY4kcVcd5l5bUTWtJNNU,36461
crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
crawlo/mode_manager.py,sha256=WIxrq9S3EAH0D71LH1AxvcqXomeABqoXgtUN4A--DKY,6702
crawlo/project.py,sha256=xWN2eTAjf_Pza-wWvvV4JjScQRWxe9hXlztX81ccUMc,5182
crawlo/stats_collector.py,sha256=NkO09CB-220qz5rxFcD_dedGfr2VPFrDo4hya0Zh8Qc,1577
crawlo/subscriber.py,sha256=gioTIqRdEwVG-bwIiQonbk1vWWAqTh9hzVkrqZ1AfP0,5006
crawlo/task_manager.py,sha256=19InAxS9oJ9EMj20Aw2urN-v6BeC22dkgPuW-B9-4UI,819
crawlo/commands/__init__.py,sha256=AMYjXG7ulE8dPVmgWVo0uqXsaCYUUZYmmu2-7kFzH1M,342
crawlo/commands/check.py,sha256=172OiAxnX5wwSlszUsyPgMZwAoIbGDTdfhtRz309ilc,22843
crawlo/commands/genspider.py,sha256=-jGJdfXLsefX_H1ydQ2wirdu6p6wmhClzVXY_0L-1aE,5050
crawlo/commands/list.py,sha256=yByqQeZBgvjewOKxpnOobpeJ7Hnbs-CWsoyITqZu2ZY,5781
crawlo/commands/run.py,sha256=8Qngjsl8Q4RBdO39a__wKGsheY2PFuPit2hds_jwEbM,10524
crawlo/commands/startproject.py,sha256=bzNgpkKzUEggY2m7Iw810mSPe8wOPFBqSCO0jZX3z_g,7138
crawlo/commands/stats.py,sha256=6pAgkEi8MBnCer2rWmKpaTYr1jaM6HeMG9owAvEzJyY,6064
crawlo/commands/utils.py,sha256=nohMvUU2zLvX0XzXk6KeCNxP0EvSWj9DiVLxM_7tD5o,5106
crawlo/core/__init__.py,sha256=PnFyJdVNHBoPmV1sW0AHQXijeoSTQ8cMYrbNM1JK8kA,41
crawlo/core/engine.py,sha256=8Dcew1XyxChW5Fz1wFEWpJlPrQb2hKDWKul8e61S-Q0,6662
crawlo/core/enhanced_engine.py,sha256=9I9Uxdy2oAz8zDGTzEiytuKu__VDVmIN8zwZKfrD8bw,6254
crawlo/core/processor.py,sha256=qmCqAeqhwYu-UE86evYesaGt9qpuSIfH-ZIZKcXFCZc,1140
crawlo/core/scheduler.py,sha256=fiU-Q-lzyC3B6ih8NSWqjP1Xw_ryNVb_4dLUARtWRBE,5804
crawlo/downloader/__init__.py,sha256=tl0mE54reR-PuJYSsXsKP2VY5uzvq4lITxZwKKjNzPs,7663
crawlo/downloader/aiohttp_downloader.py,sha256=UKupGYPOWrscAVsjhFgKYElTa9tbEeltqV7nuWqjIeE,8005
crawlo/downloader/cffi_downloader.py,sha256=-GVfSIhi1Ip56suSiGf8jnUE2EBF1P56vw0uxLh_T6I,10440
crawlo/downloader/httpx_downloader.py,sha256=7jfQfvAtfk8yD_mvwUbWLhYOxMM7r1nudiU7m_Jl9wc,12037
crawlo/extension/__init__.py,sha256=Sg588p6UhyrwFNTiD2wqGW-i3xgLX6HlLuQPKT7mayE,1526
crawlo/extension/health_check.py,sha256=IVaaVo_0CcZtf1LoCAYXIBvs3wZ7hdmT6U4-NYWAgP0,5527
crawlo/extension/log_interval.py,sha256=VCIeNqXcWDnxj4m6l77cjqgRzV8LfsPMb22X0Xc1Vwc,2417
crawlo/extension/log_stats.py,sha256=Ssxz6R1YpWIj5WJvQ2cJ9F5oR7FUFdj-ITc9lV92SSU,2908
crawlo/extension/logging_extension.py,sha256=ET6VAu1J2qNMz4NnG1G3zQLRhbsvV7l6xRIuQLE6DaE,1626
crawlo/extension/memory_monitor.py,sha256=gg-GK5RD9XhnrN_zp3KTmPKyWDmKLMv_OTY-HxSxBNI,3664
crawlo/extension/performance_profiler.py,sha256=NvQuuvE83dXJ-1URpN8OF9za9J1l7xhVbV22JynPQpA,4235
crawlo/extension/request_recorder.py,sha256=RC23yzXClnVv9j2ljQvjBkUfWznfnDHsrQejKhE9y5E,4074
crawlo/filters/__init__.py,sha256=XC_Q4ykZtSNYizYlAcehVwBBNO3lZ2zuWwafzXiuWyQ,4241
crawlo/filters/aioredis_filter.py,sha256=WhkFZcVAym9wLSUa8WTVctYfEibjxG42umtmacO1IY0,8370
crawlo/filters/memory_filter.py,sha256=VJO0UFRYGxmV8dj4G1subsQ-FtvPcGLbvd7IVtqXnOs,9260
crawlo/items/__init__.py,sha256=bqekZrRlDhxfWie0UbCs656TptYseoe9QJ67I4E7Elk,386
crawlo/items/base.py,sha256=tAYrPJgblp3ZEihDXvappdYc6pGdim6x2_9QSmMKI2o,577
crawlo/items/fields.py,sha256=wMlakQTsEwyrlLzMt1gI4pScLQZMqd3E1xcfH4dbSqk,1801
crawlo/items/items.py,sha256=e-3nXI9ckD64vcDxxQiAU6ufbtJMs09gbZQcYjxgwHY,3374
crawlo/middleware/__init__.py,sha256=ldaGFNbiJnK9Fx12Vdf9fDNfzXxoETtShp5r-vodtw0,549
crawlo/middleware/default_header.py,sha256=i_Uj07JObyeZFxL7ZAZmvZsHvA1HGtkNab1sA0d-nWI,1067
crawlo/middleware/download_delay.py,sha256=2M-TchDA7MwyTfYy0Hzh_bW9wlHlpiP-oQlys7crTj0,966
crawlo/middleware/middleware_manager.py,sha256=j1hkWRFB5rnC5SnB7oXWE5eUNv8blS9krDIDM5fIDs8,6213
crawlo/middleware/proxy.py,sha256=m2ZZ50En9hUtgrqSqA6hItGT74xMqccHFPhZshutIco,9811
crawlo/middleware/request_ignore.py,sha256=QI2z4fUnJ-4xvPTZAmsL-GqR4RFHS1xq9iDr5KFrMco,997
crawlo/middleware/response_code.py,sha256=tmef2QVl3JCiTMii6VQkASlOY2OyqmOPoOfNxIK1eF8,659
crawlo/middleware/response_filter.py,sha256=ep8ZxDlfIefi9YqK8dPASEp5TTDRo9QEY_jMceC411s,837
crawlo/middleware/retry.py,sha256=-7zpRURugiTTm4QYUSUlbnURD5mcT2Ji0yHvCgY1wGc,4124
crawlo/network/__init__.py,sha256=BLPERYPo22g1BXrW--wUnlolrdFUmOPjgOB8XQQJlck,397
crawlo/network/request.py,sha256=tPAiOVJyF3Kk-midqydTGXgv5M5tsYJRtwUUJTrUsrE,11075
crawlo/network/response.py,sha256=cUvdjsB2cQ-qWEKHNGIkwWGgCg-EnQ81xTrjrUOVno0,9738
crawlo/pipelines/__init__.py,sha256=lrdVDjeHLNkA4_MAwI1auk_I9xfeU1SlBWXiammb6lc,616
crawlo/pipelines/bloom_dedup_pipeline.py,sha256=QQxGFGEoMHN4Vx2kq7G_i1o9pmuXp8clZebilOar3fk,5642
crawlo/pipelines/console_pipeline.py,sha256=KABkR3J-rqO0Awox7lizxKR2XuHfVhWPiVRgIybwwu4,1248
crawlo/pipelines/csv_pipeline.py,sha256=6FBT2AoU6iNU-5NfgWRq7-JpF9dK2nBokjxx-y4jIas,12174
crawlo/pipelines/database_dedup_pipeline.py,sha256=wVBXEGArFR3uxoN7yfJSOarBmtGrJpOqowAqa7OUs98,8000
crawlo/pipelines/json_pipeline.py,sha256=vlu1nqbD2mtqtExt9cL5nibx1CwJM1RNqd4WGjZRHAY,8367
crawlo/pipelines/memory_dedup_pipeline.py,sha256=5jeL2jEq7sioYmXlzfkx-LNSbWyChrXeWx8d15YEZOA,3839
crawlo/pipelines/mongo_pipeline.py,sha256=k7gNqAO-g2MtIfArphC6z5ZzkKVRkBKcv-2ImziPFA0,5706
crawlo/pipelines/mysql_pipeline.py,sha256=cwgJvRORTRea_Eep2coBaMf3G8PQVTQA1qrnIlDZApc,13480
crawlo/pipelines/pipeline_manager.py,sha256=VrbebOYiqrobtKhp5II18w-odCICdWkmRg5WPK0Emz4,2112
crawlo/pipelines/redis_dedup_pipeline.py,sha256=TaokJ4wP5-Cxf-ueFJdh4SX58hchT0QzZ5RBDXHDN64,6003
crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
crawlo/queue/pqueue.py,sha256=yzF900ko2RReVNZtfk_VV3PzSXEUALI6SHf97geuu6s,1058
crawlo/queue/queue_manager.py,sha256=NMR0Fo8XFBg6_g7htq4D80cS6Ilo0EKt5QtyF-KxNuM,11467
crawlo/queue/redis_priority_queue.py,sha256=boJCKqcKxRw9XCCzaHy5qmrX9DvzPiQBzOkBHI5odfE,8116
crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
crawlo/settings/default_settings.py,sha256=B4_61tNJvqzVvyqt9AtRV7Iq5q8G4pJOExcN0ve7S_A,11559
crawlo/settings/setting_manager.py,sha256=SxKB1aCWh4OySM_bH9cYng9I3PAmrSP-Q8XOZEWEwbI,2899
crawlo/spider/__init__.py,sha256=Z_rK23l5yt-DuwJPg8bcqodM_FIs4-iHLaKOimGumcE,20452
crawlo/templates/crawlo.cfg.tmpl,sha256=9BAmwEibS5Tvy6HIcGXPb0BGeuesmibebmTW0iAEkmo,230
crawlo/templates/project/__init__.py.tmpl,sha256=f3ETIXw_O6K-lkL6lXM5znMPJW1FZYGFrwDs2BnHcnQ,58
crawlo/templates/project/items.py.tmpl,sha256=mt1Mm--H2Ouos3r7JPkYh0r33rgYJf1YOMz0OZy8TYs,297
crawlo/templates/project/middlewares.py.tmpl,sha256=jpmj7b7Zb7d3nVyxcaVNdp4KqSts6l2cPSqn_oJUSrM,3775
crawlo/templates/project/pipelines.py.tmpl,sha256=k_4MJnwZ6GPqVwJSEDURUlTxWybmts4vHrF0de2vgAk,2620
crawlo/templates/project/run.py.tmpl,sha256=ktkYOslcCh9mpklg6yE5VqfATx3Frj_jNT5z5gHjQ4o,8177
crawlo/templates/project/settings.py.tmpl,sha256=O_teIARjzRD3aMvPnuIgjaDHdjwW-3beyzfo1QH-Hr8,9580
crawlo/templates/project/spiders/__init__.py.tmpl,sha256=j_YKsw6HQMJyqlk3WUouP3bsr-XVxshRoSNakHBc00g,106
crawlo/templates/spider/spider.py.tmpl,sha256=a8S9j43z5gE4auMhf_OnnuVHSZN3JbMDu8Bczu8zIZY,4944
crawlo/utils/__init__.py,sha256=BDORpyjMN7VGPKImnCDKSkprS-petgD7ezc9rMlBvb0,123
crawlo/utils/controlled_spider_mixin.py,sha256=VjT30pNW_YIgmTD0nb7DDl2D3HvpnAYFzgSgV3fxFN0,16475
crawlo/utils/date_tools.py,sha256=0yG0tzGb1VFgWDJJ_cow2LJfz3kj_w2MqSjmfKKESl8,6961
crawlo/utils/db_helper.py,sha256=3ib5-agrlwf2t5S_QtLRYH75wvJDlYbRqRmDEbpH5Bo,10559
crawlo/utils/func_tools.py,sha256=WUZEGpWMuDDX7g-QySM7iaiC74erW2SSkZoUvDw1NjM,2369
crawlo/utils/large_scale_config.py,sha256=j7wQ5ty7pQlBRygw2vhRJ7OI19RYBZKPfYMP3WeF2WI,8154
crawlo/utils/large_scale_helper.py,sha256=JJqcGSI6VaVe3MSL6IWjmCp8XQIu6T4U-BvBLSttr_s,12157
crawlo/utils/log.py,sha256=A3lPyhD8kD88cV23KOL-_eT8g69xGQ5L1toDB2AO0mc,4005
crawlo/utils/queue_helper.py,sha256=xpUUTOqlU1xz2Pb9NKAVGo3AfAO-7Xvx8Lm1q65Dgck,4743
crawlo/utils/request.py,sha256=yoLB2rY8d78vgPjIWpdhY5SalIKjyLIvTG_UH6EMdVI,8798
crawlo/utils/request_serializer.py,sha256=bPoSQqE2ksiMyP3WiPB3w3UqZs4f_LgkAw4Pj0qyBDo,8565
crawlo/utils/spider_loader.py,sha256=pEDUsYOTGjszA6KgjiMlYN4GS5fP4uakkhcp3JTFFQY,2187
crawlo/utils/system.py,sha256=HvWV1acxou0Rn0L7pNq4CnV_GWFeU0Tmjy3_nLD8M64,237
crawlo/utils/tools.py,sha256=5Uv25Wy4m_ndZY0-n-eX-t3PxvaZ6wR3-Wvx-o7_Vrs,271
crawlo/utils/url.py,sha256=rlgX2VlJv6JvLmCDTsbxzMSXE6R5ZL_0dLEqprsA-JU,1482
examples/__init__.py,sha256=6i631BPnS_TR_BWUjtjB5CBO-zv9kRkwQTQvSya2wHE,123
examples/controlled_spider_example.py,sha256=SP_k4mdKPvD1JCPs9UCm68jcy2Frg84vvXv9-14RC6I,7776
tests/__init__.py,sha256=scL1IPVT1iucth7v8ffrjRdeW7QrC_Y7AMmFVMdTY1Y,129
tests/test_final_validation.py,sha256=fBxf_6YcAEa_HyV_oGAXmmVHY4i6FdA4J6klCmc36hQ,4925
tests/test_proxy_health_check.py,sha256=xo3QMP1YNw9hu7JDHZOYCUZmFFKLJpHSh4SbxXhCRPQ,1091
tests/test_proxy_middleware_integration.py,sha256=zcl7fR9Toc-I-stSUTzKZPwcfh3kgrpjI5SbkZ6AVmE,4305
tests/test_proxy_providers.py,sha256=XwWZCywTYguSsUxSm6fsbaoH1p9dKjqSIx9-sqKZehA,1693
tests/test_proxy_stats.py,sha256=Til_yksrRz2yBVw-yJi5-36LhNW3vTwpXTm4BdR9PUM,507
tests/test_proxy_strategies.py,sha256=ZkziozkvZd3KWOQnpHQ8Upd3WpyoX7gN0qFGluNm348,1809
tests/test_redis_config.py,sha256=TqzFRojc6esGXjGhUCvSLYQDUTAgEJsty9vRVuNraMU,893
tests/test_redis_queue.py,sha256=o6xViXxJcdx-1eMcG3vhAQEIm8h346HnZb7JXs7ZjwM,6622
tests/test_request_serialization.py,sha256=8sVdppAsohJ5u-m1WvablCndwL-M_36YPLdGKwgeznM,2289
tests/test_scheduler.py,sha256=-FOkTWzaMdr6yfO1Msu74hI_GgSfD7iRxO-cFA-9Iyk,7442
crawlo-1.1.4.dist-info/METADATA,sha256=2I2NA0BR-MWoPZmRUkWrUQYMjuPiUi9mrogIYPWpASU,19781
crawlo-1.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
crawlo-1.1.4.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
crawlo-1.1.4.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
crawlo-1.1.4.dist-info/RECORD,,
examples/__init__.py
CHANGED

@@ -1,7 +1,7 @@

All 7 lines are removed and re-added with textually identical content (an apparent whitespace- or line-ending-level rewrite):

```python
#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
# @Time : 2025-02-05 12:36
# @Author : oscar
# @Desc : None
"""
```