crawlo-1.2.3-py3-none-any.whl → crawlo-1.2.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +61 -61
- crawlo/__version__.py +1 -1
- crawlo/cleaners/__init__.py +60 -60
- crawlo/cleaners/data_formatter.py +225 -225
- crawlo/cleaners/encoding_converter.py +125 -125
- crawlo/cleaners/text_cleaner.py +232 -232
- crawlo/cli.py +88 -81
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +144 -142
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +323 -292
- crawlo/commands/startproject.py +436 -417
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +312 -312
- crawlo/config_validator.py +251 -251
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +354 -354
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +143 -143
- crawlo/crawler.py +1110 -1027
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +107 -107
- crawlo/downloader/__init__.py +266 -266
- crawlo/downloader/aiohttp_downloader.py +220 -220
- crawlo/downloader/cffi_downloader.py +256 -256
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +37 -37
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +43 -43
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +280 -280
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +131 -131
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +114 -114
- crawlo/middleware/proxy.py +367 -367
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +211 -211
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +338 -338
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +317 -317
- crawlo/pipelines/pipeline_manager.py +61 -61
- crawlo/pipelines/redis_dedup_pipeline.py +165 -165
- crawlo/project.py +279 -187
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +337 -337
- crawlo/queue/redis_priority_queue.py +298 -298
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +217 -226
- crawlo/settings/setting_manager.py +122 -122
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +324 -325
- crawlo/templates/project/settings_distributed.py.tmpl +154 -121
- crawlo/templates/project/settings_gentle.py.tmpl +127 -94
- crawlo/templates/project/settings_high_performance.py.tmpl +149 -151
- crawlo/templates/project/settings_simple.py.tmpl +102 -68
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/{project/run.py.tmpl → run.py.tmpl} +47 -45
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/tools/__init__.py +182 -182
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +35 -35
- crawlo/tools/distributed_coordinator.py +386 -386
- crawlo/tools/retry_mechanism.py +220 -220
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/utils/__init__.py +35 -35
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/date_tools.py +290 -290
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +105 -105
- crawlo/utils/error_handler.py +123 -123
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +128 -128
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +334 -334
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +218 -218
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.2.3.dist-info → crawlo-1.2.5.dist-info}/METADATA +764 -692
- crawlo-1.2.5.dist-info/RECORD +206 -0
- examples/__init__.py +7 -7
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +236 -236
- tests/cleaners_example.py +160 -160
- tests/config_validation_demo.py +102 -102
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/redis_key_validation_demo.py +130 -130
- tests/response_improvements_example.py +144 -144
- tests/test_advanced_tools.py +148 -148
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_cleaners.py +54 -54
- tests/test_comprehensive.py +146 -146
- tests/test_config_validator.py +193 -193
- tests/test_crawlo_proxy_integration.py +172 -172
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_integration.py +356 -356
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +241 -241
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +153 -153
- tests/tools_example.py +257 -257
- crawlo-1.2.3.dist-info/RECORD +0 -222
- examples/aiohttp_settings.py +0 -42
- examples/curl_cffi_settings.py +0 -41
- examples/default_header_middleware_example.py +0 -107
- examples/default_header_spider_example.py +0 -129
- examples/download_delay_middleware_example.py +0 -160
- examples/httpx_settings.py +0 -42
- examples/multi_downloader_proxy_example.py +0 -81
- examples/offsite_middleware_example.py +0 -55
- examples/offsite_spider_example.py +0 -107
- examples/proxy_spider_example.py +0 -166
- examples/request_ignore_middleware_example.py +0 -51
- examples/request_ignore_spider_example.py +0 -99
- examples/response_code_middleware_example.py +0 -52
- examples/response_filter_middleware_example.py +0 -67
- examples/tong_hua_shun_settings.py +0 -62
- examples/tong_hua_shun_spider.py +0 -170
- {crawlo-1.2.3.dist-info → crawlo-1.2.5.dist-info}/WHEEL +0 -0
- {crawlo-1.2.3.dist-info → crawlo-1.2.5.dist-info}/entry_points.txt +0 -0
- {crawlo-1.2.3.dist-info → crawlo-1.2.5.dist-info}/top_level.txt +0 -0
@@ -1,173 +1,173 @@
(All 172 lines of the file are removed and re-added with identical content; only the final context line, 173, is untouched. The file body is shown once below.)

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Crawlo framework proxy integration test
=======================================
Shows how to integrate and use a given proxy API in the Crawlo framework.
"""

import asyncio
import sys
import os

# Add the project root directory to the Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo import Spider, Request
from crawlo.middleware.proxy import ProxyMiddleware
from crawlo.settings.setting_manager import SettingManager


class TestProxySpider(Spider):
    """Example spider for testing proxies"""
    name = 'test_proxy_spider'

    def __init__(self):
        super().__init__()
        self.test_urls = [
            'https://httpbin.org/ip',  # shows the outgoing IP address
            'https://httpbin.org/headers',  # shows the request headers
            'https://stock.10jqka.com.cn/20240315/c655957791.shtml',  # target link under test
        ]

    def start_requests(self):
        """Generate the initial requests"""
        for url in self.test_urls:
            request = Request(url=url, callback=self.parse)
            yield request

    def parse(self, response):
        """Parse the response"""
        print(f"\n=== Response details ===")
        print(f"URL: {response.url}")
        print(f"Status code: {response.status_code}")
        print(f"Response headers: {dict(response.headers)}")

        # For httpbin.org/ip, show the IP information
        if 'httpbin.org/ip' in response.url:
            print(f"IP info: {response.text[:200]}")

        # For httpbin.org/headers, show the request-header information
        elif 'httpbin.org/headers' in response.url:
            print(f"Request headers: {response.text[:200]}")

        # For the target link, show part of the content
        else:
            # Show only the first 200 characters
            content_preview = response.text[:200] if response.text else ""
            print(f"Content preview: {content_preview}")

        # Return a simple item
        return {
            'url': response.url,
            'status_code': response.status_code,
            'title': response.css('title::text').get() if response.text else None
        }


def create_proxy_settings():
    """Create the proxy configuration"""
    settings = SettingManager()

    # Base configuration
    settings.set("LOG_LEVEL", "INFO")
    settings.set("CONCURRENCY", 1)  # concurrency of 1 for the test

    # Proxy configuration
    settings.set("PROXY_ENABLED", True)
    settings.set("PROXY_API_URL", "http://test.proxy.api:8080/proxy/getitem/")
    settings.set("PROXY_EXTRACTOR", "proxy")  # adjust to match the API response structure
    settings.set("PROXY_REFRESH_INTERVAL", 30)  # refresh every 30 seconds
    settings.set("PROXY_API_TIMEOUT", 10)  # 10-second timeout
    settings.set("PROXY_POOL_SIZE", 3)  # proxy pool size
    settings.set("PROXY_HEALTH_CHECK_THRESHOLD", 0.5)  # health-check threshold

    return settings


async def test_proxy_middleware_integration():
    """Test the proxy middleware integration"""
    print("=== Testing Crawlo proxy middleware integration ===")

    # Create the configuration
    settings = create_proxy_settings()

    # Create the proxy middleware instance
    proxy_middleware = ProxyMiddleware(settings, "INFO")

    # Inspect the proxy API settings
    print(f"Proxy API URL: {proxy_middleware.api_url}")
    print(f"Proxy refresh interval: {proxy_middleware.refresh_interval}s")
    print(f"Proxy pool size: {proxy_middleware.proxy_pool_size}")

    # Test fetching a proxy
    print("\n--- Testing proxy fetching ---")
    try:
        # Test the API connection directly rather than a full proxy-pool update
        proxy_data = await proxy_middleware._get_proxy_from_api()
        if proxy_data:
            print(f"✅ Fetched proxy info from the API: {proxy_data}")
        else:
            print("❌ Could not fetch proxy info from the API")
    except Exception as e:
        print(f"❌ Error while fetching a proxy: {e}")

    print("\n=== Proxy middleware integration test finished ===")


def show_proxy_configuration_example():
    """Show a proxy configuration example"""
    print("\n=== Proxy configuration example ===")
    print("""
How to configure proxies in a Crawlo project:

1. Add the following settings to settings.py:

```python
# Proxy configuration
PROXY_ENABLED = True
PROXY_API_URL = 'http://test.proxy.api:8080/proxy/getitem/'
PROXY_EXTRACTOR = 'proxy'
PROXY_REFRESH_INTERVAL = 30
PROXY_API_TIMEOUT = 10
PROXY_POOL_SIZE = 5
PROXY_HEALTH_CHECK_THRESHOLD = 0.5
```

2. Make sure the proxy middleware is in the MIDDLEWARES list:

```python
MIDDLEWARES = [
    'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
    'crawlo.middleware.download_delay.DownloadDelayMiddleware',
    'crawlo.middleware.default_header.DefaultHeaderMiddleware',
    'crawlo.middleware.proxy.ProxyMiddleware',  # proxy middleware
    'crawlo.middleware.retry.RetryMiddleware',
    'crawlo.middleware.response_code.ResponseCodeMiddleware',
    'crawlo.middleware.response_filter.ResponseFilterMiddleware',
]
```

3. Once the spider starts, the proxy middleware automatically:
- fetches proxies from the API at regular intervals
- maintains the proxy pool
- assigns a proxy to each request
- monitors proxy health
""")


async def main():
    """Main entry point"""
    print("Starting the Crawlo proxy integration test...\n")

    # 1. Test the proxy middleware integration
    await test_proxy_middleware_integration()

    # 2. Show the configuration example
    show_proxy_configuration_example()

    print("\nAll tests finished!")


if __name__ == "__main__":
    asyncio.run(main())
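The PROXY_EXTRACTOR value above appears to name the field of the proxy API's JSON response that holds the proxy address (the code only says to "adjust to match the API response structure"). A minimal sketch of that contract, using a hypothetical payload whose field names and values are illustrative and not taken from the package:

    # Hypothetical proxy API payload; PROXY_EXTRACTOR = 'proxy' would select
    # the "proxy" field. Adjust the field name to match the real API response.
    sample_response = {
        "code": 0,
        "proxy": "http://user:pass@203.0.113.7:8080",
    }

    extractor_field = "proxy"  # mirrors settings.set("PROXY_EXTRACTOR", "proxy")
    print(sample_response.get(extractor_field))  # the proxy URL to attach to requests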
tests/test_date_tools.py CHANGED
@@ -1,124 +1,124 @@
(All 123 lines of the file are removed and re-added with identical content; only the final context line, 124, is untouched. The file body is shown once below.)

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Date tools tests
"""
import unittest
from crawlo.tools import (
    TimeUtils,
    parse_time,
    format_time,
    time_diff,
    to_timestamp,
    to_datetime,
    now,
    to_timezone,
    to_utc,
    to_local
)


class TestDateTools(unittest.TestCase):
    """Test suite for the date tools"""

    def test_date_parsing(self):
        """Test date parsing"""
        # Standard format
        dt = parse_time("2025-09-10 14:30:00")
        self.assertIsNotNone(dt)
        self.assertEqual(dt.year, 2025)
        self.assertEqual(dt.month, 9)
        self.assertEqual(dt.day, 10)
        self.assertEqual(dt.hour, 14)
        self.assertEqual(dt.minute, 30)
        self.assertEqual(dt.second, 0)

        # A different format
        dt2 = parse_time("September 10, 2025 2:30 PM")
        self.assertIsNotNone(dt2)
        self.assertEqual(dt2.year, 2025)
        self.assertEqual(dt2.month, 9)
        self.assertEqual(dt2.day, 10)
        self.assertEqual(dt2.hour, 14)
        self.assertEqual(dt2.minute, 30)

    def test_date_formatting(self):
        """Test date formatting"""
        dt = parse_time("2025-09-10 14:30:00")

        # Standard format
        formatted = format_time(dt, "%Y-%m-%d")
        self.assertEqual(formatted, "2025-09-10")

        # Chinese date format
        formatted_cn = format_time(dt, "%Y年%m月%d日")
        self.assertEqual(formatted_cn, "2025年09月10日")

    def test_time_difference(self):
        """Test time-difference calculations"""
        start_time = "2025-09-10 10:00:00"
        end_time = "2025-09-10 14:30:45"

        # Difference in seconds
        diff_seconds = time_diff(start_time, end_time, "seconds")
        self.assertEqual(diff_seconds, 16245)  # 4 h 30 min 45 s = 16245 seconds

        # Difference in minutes
        diff_minutes = time_diff(start_time, end_time, "minutes")
        self.assertEqual(diff_minutes, 270)  # 270 minutes

        # Difference in hours
        diff_hours = time_diff(start_time, end_time, "hours")
        self.assertEqual(diff_hours, 4)  # 4 hours

    def test_timestamp_conversion(self):
        """Test timestamp conversion"""
        # Convert to a timestamp
        dt = parse_time("2025-09-10 14:30:00")
        timestamp = to_timestamp(dt)
        self.assertIsInstance(timestamp, float)

        # Convert back from the timestamp
        dt_from_ts = to_datetime(timestamp)
        self.assertEqual(dt.year, dt_from_ts.year)
        self.assertEqual(dt.month, dt_from_ts.month)
        self.assertEqual(dt.day, dt_from_ts.day)
        self.assertEqual(dt.hour, dt_from_ts.hour)
        self.assertEqual(dt.minute, dt_from_ts.minute)
        self.assertEqual(dt.second, dt_from_ts.second)

    def test_timezone_conversion(self):
        """Test timezone conversion"""
        dt = parse_time("2025-09-10 14:30:00")

        # Conversion to UTC
        utc_time = to_utc(dt)
        self.assertIsNotNone(utc_time)

        # Conversion to the local timezone
        local_time = to_local(dt)
        self.assertIsNotNone(local_time)

        # Conversion to a named timezone
        ny_time = to_timezone(dt, "America/New_York")
        self.assertIsNotNone(ny_time)

    def test_time_utils_class(self):
        """Test the TimeUtils class methods"""
        # Adding days
        base_date = "2025-09-10"
        plus_30_days = TimeUtils.add_days(base_date, 30)
        self.assertEqual(plus_30_days.month, 10)
        self.assertEqual(plus_30_days.day, 10)

        # Adding months
        plus_3_months = TimeUtils.add_months(base_date, 3)
        self.assertEqual(plus_3_months.month, 12)

        # Leap-year check
        self.assertTrue(TimeUtils.is_leap_year(2024))
        self.assertFalse(TimeUtils.is_leap_year(2025))


if __name__ == '__main__':
    unittest.main()
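Taken together, the assertions above pin down the expected behavior of these helpers. A condensed usage sketch, inferred from the test's assertions rather than from crawlo documentation:

    from crawlo.tools import parse_time, format_time, time_diff, TimeUtils

    dt = parse_time("2025-09-10 14:30:00")            # flexible string parsing
    print(format_time(dt, "%Y-%m-%d"))                # 2025-09-10
    print(time_diff("2025-09-10 10:00:00",
                    "2025-09-10 14:30:45", "hours"))  # 4 (whole hours)
    print(TimeUtils.add_days("2025-09-10", 30))       # a datetime on 2025-10-10
    print(TimeUtils.is_leap_year(2024))               # True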