crawlo 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +61 -61
- crawlo/__version__.py +1 -1
- crawlo/cleaners/__init__.py +60 -60
- crawlo/cleaners/data_formatter.py +225 -225
- crawlo/cleaners/encoding_converter.py +125 -125
- crawlo/cleaners/text_cleaner.py +232 -232
- crawlo/cli.py +81 -65
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +143 -133
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +292 -292
- crawlo/commands/startproject.py +418 -418
- crawlo/commands/stats.py +188 -188
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +312 -312
- crawlo/config_validator.py +252 -252
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +354 -354
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +143 -143
- crawlo/crawler.py +1027 -1027
- crawlo/downloader/__init__.py +266 -266
- crawlo/downloader/aiohttp_downloader.py +220 -220
- crawlo/downloader/cffi_downloader.py +256 -256
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +213 -213
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +37 -37
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +43 -43
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +280 -280
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +53 -53
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -32
- crawlo/middleware/download_delay.py +105 -28
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +116 -0
- crawlo/middleware/proxy.py +366 -272
- crawlo/middleware/request_ignore.py +88 -30
- crawlo/middleware/response_code.py +164 -18
- crawlo/middleware/response_filter.py +138 -26
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +211 -211
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +338 -338
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +224 -224
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +316 -316
- crawlo/pipelines/pipeline_manager.py +61 -61
- crawlo/pipelines/redis_dedup_pipeline.py +167 -167
- crawlo/project.py +187 -187
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +337 -337
- crawlo/queue/redis_priority_queue.py +298 -298
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +226 -219
- crawlo/settings/setting_manager.py +122 -122
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +130 -130
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -109
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/run.py.tmpl +45 -45
- crawlo/templates/project/settings.py.tmpl +327 -326
- crawlo/templates/project/settings_distributed.py.tmpl +119 -119
- crawlo/templates/project/settings_gentle.py.tmpl +94 -94
- crawlo/templates/project/settings_high_performance.py.tmpl +151 -151
- crawlo/templates/project/settings_simple.py.tmpl +68 -68
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +143 -141
- crawlo/tools/__init__.py +182 -182
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +35 -35
- crawlo/tools/distributed_coordinator.py +386 -386
- crawlo/tools/retry_mechanism.py +220 -220
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/utils/__init__.py +35 -35
- crawlo/utils/batch_processor.py +260 -260
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/date_tools.py +290 -290
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +359 -359
- crawlo/utils/env_config.py +105 -105
- crawlo/utils/error_handler.py +125 -125
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +343 -343
- crawlo/utils/log.py +128 -128
- crawlo/utils/performance_monitor.py +284 -284
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +334 -334
- crawlo/utils/redis_key_validator.py +199 -199
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +219 -219
- crawlo/utils/spider_loader.py +62 -62
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/METADATA +692 -697
- crawlo-1.2.2.dist-info/RECORD +220 -0
- examples/__init__.py +7 -7
- examples/aiohttp_settings.py +42 -0
- examples/curl_cffi_settings.py +41 -0
- examples/default_header_middleware_example.py +107 -0
- examples/default_header_spider_example.py +129 -0
- examples/download_delay_middleware_example.py +160 -0
- examples/httpx_settings.py +42 -0
- examples/multi_downloader_proxy_example.py +81 -0
- examples/offsite_middleware_example.py +55 -0
- examples/offsite_spider_example.py +107 -0
- examples/proxy_spider_example.py +166 -0
- examples/request_ignore_middleware_example.py +51 -0
- examples/request_ignore_spider_example.py +99 -0
- examples/response_code_middleware_example.py +52 -0
- examples/response_filter_middleware_example.py +67 -0
- examples/tong_hua_shun_settings.py +62 -0
- examples/tong_hua_shun_spider.py +170 -0
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +236 -236
- tests/cleaners_example.py +160 -160
- tests/config_validation_demo.py +102 -102
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/redis_key_validation_demo.py +130 -130
- tests/response_improvements_example.py +144 -144
- tests/test_advanced_tools.py +148 -148
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_cleaners.py +54 -54
- tests/test_comprehensive.py +146 -146
- tests/test_config_validator.py +193 -193
- tests/test_crawlo_proxy_integration.py +173 -0
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +159 -0
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +222 -0
- tests/test_downloader_proxy_compatibility.py +269 -0
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_integration.py +356 -356
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_offsite_middleware.py +222 -0
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_proxy_api.py +265 -0
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +122 -0
- tests/test_proxy_middleware_enhanced.py +217 -0
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_real_scenario_proxy.py +196 -0
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +183 -0
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +350 -0
- tests/test_response_filter_middleware.py +428 -0
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +242 -0
- tests/test_scheduler.py +241 -241
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +153 -153
- tests/tools_example.py +257 -257
- crawlo-1.2.0.dist-info/RECORD +0 -190
- {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/WHEEL +0 -0
- {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/entry_points.txt +0 -0
- {crawlo-1.2.0.dist-info → crawlo-1.2.2.dist-info}/top_level.txt +0 -0
tests/test_proxy_api.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
代理API测试脚本
|
|
5
|
+
================
|
|
6
|
+
测试指定的代理API接口是否能正常工作
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import aiohttp
|
|
11
|
+
import sys
|
|
12
|
+
import os
|
|
13
|
+
from urllib.parse import urlparse
|
|
14
|
+
|
|
15
|
+
# 添加项目根目录到Python路径
|
|
16
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
17
|
+
|
|
18
|
+
from crawlo.middleware.proxy import ProxyMiddleware
|
|
19
|
+
from crawlo.network.request import Request
|
|
20
|
+
from crawlo.settings.setting_manager import SettingManager
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
async def test_proxy_api(proxy_api_url):
    """Fetch the proxy API endpoint and return its decoded payload.

    The body is read once as text and then decoded with ``json.loads``.
    ``aiohttp``'s ``response.json()`` rejects responses whose Content-Type
    is not JSON, so an API that serves JSON as ``text/plain`` would
    otherwise come back as an unparsed string.

    Args:
        proxy_api_url: URL of the proxy API to query.

    Returns:
        The decoded JSON value when the body is valid JSON, the raw body
        text when it is not, or ``None`` when the request fails, times out,
        or the body cannot be read.
    """
    import json  # local import so the file-level import block stays untouched

    print("=== 测试代理API接口 ===")
    print(f"API地址: {proxy_api_url}")

    try:
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(proxy_api_url) as response:
                print(f"状态码: {response.status}")
                print(f"响应头: {response.headers.get('content-type', 'Unknown')}")

                # Read the body exactly once; both branches below reuse it.
                try:
                    text = await response.text()
                except Exception as exc:
                    print(f"无法解析响应内容: {exc}")
                    return None

                try:
                    data = json.loads(text)
                except ValueError:
                    # Not JSON: hand back the raw text (truncated when printed).
                    print(f"响应文本: {text[:200]}{'...' if len(text) > 200 else ''}")
                    return text

                print(f"响应数据: {data}")
                return data

    except asyncio.TimeoutError:
        print("请求超时")
        return None
    except Exception as exc:
        print(f"请求失败: {exc}")
        return None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def extract_proxy_url(proxy_data):
    """Extract a proxy URL from a proxy-API payload.

    Lookup order for a ``dict`` payload:

    1. If ``status == 0``, read the ``proxy`` field (a string, or a mapping
       with ``https``/``http`` entries, HTTPS preferred).
    2. Otherwise try the common keys ``proxy``, ``data``, ``url``, ``http``
       and ``https`` in that order, moving on to the next key when one
       yields nothing usable.
    3. If nothing was found, scan the top level and one nested level for any
       string that starts with ``http://`` or ``https://``.

    A ``str`` payload is accepted when, after stripping surrounding
    whitespace, it starts with ``http://`` or ``https://``.

    Args:
        proxy_data: Decoded API response (``dict``, ``str`` or anything else).

    Returns:
        The proxy URL with surrounding whitespace removed, or ``None`` when
        no candidate was found.
    """
    schemes = ('http://', 'https://')

    def from_mapping(mapping):
        # Prefer the HTTPS entry, fall back to the HTTP one.
        return mapping.get('https') or mapping.get('http')

    def looks_like_url(value):
        return isinstance(value, str) and value.strip().startswith(schemes)

    if isinstance(proxy_data, str):
        # Plain-text APIs commonly terminate the line with a newline.
        return proxy_data.strip() if looks_like_url(proxy_data) else None

    if not isinstance(proxy_data, dict):
        return None

    proxy_url = None

    if proxy_data.get('status') == 0:
        # "status == 0" is treated as the success marker of the API.
        proxy_info = proxy_data.get('proxy', {})
        if isinstance(proxy_info, dict):
            proxy_url = from_mapping(proxy_info)
        elif isinstance(proxy_info, str):
            proxy_url = proxy_info
    else:
        for key in ('proxy', 'data', 'url', 'http', 'https'):
            value = proxy_data.get(key)
            if isinstance(value, str):
                proxy_url = value
            elif isinstance(value, dict):
                proxy_url = from_mapping(value)
            # Only stop once a key actually produced something usable.
            if proxy_url:
                break

    if not proxy_url:
        # Last resort: any URL-looking string at the top level or one level down.
        for value in proxy_data.values():
            if looks_like_url(value):
                proxy_url = value
                break
            if isinstance(value, dict):
                proxy_url = next(
                    (nested for nested in value.values() if looks_like_url(nested)),
                    None,
                )
                if proxy_url:
                    break

    if isinstance(proxy_url, str):
        proxy_url = proxy_url.strip()
    return proxy_url or None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
async def test_target_url_without_proxy(target_url):
    """Request ``target_url`` directly, without any proxy.

    Only the status line and Content-Type header are inspected; the body is
    never read or decoded.

    Args:
        target_url: URL to fetch.

    Returns:
        ``True`` when the response status is 200, ``False`` on any other
        status, on timeout, or on any request error.
    """
    print("\n=== 直接访问目标URL(不使用代理) ===")
    print(f"目标URL: {target_url}")

    try:
        client_timeout = aiohttp.ClientTimeout(total=15)
        async with aiohttp.ClientSession(timeout=client_timeout) as session:
            # A browser-like User-Agent, sent to avoid anti-crawler blocking.
            browser_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
            }
            async with session.get(target_url, headers=browser_headers) as reply:
                content_type = reply.headers.get('content-type', 'Unknown')
                print(f"状态码: {reply.status}")
                print(f"响应头: {content_type}")
                is_ok = reply.status == 200
                return is_ok

    except asyncio.TimeoutError:
        print("请求超时")
        return False
    except Exception as exc:
        print(f"请求失败: {exc}")
        return False
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _resolve_proxy(proxy_url):
    """Return ``(label, proxy, auth)`` describing how to route through ``proxy_url``.

    When the URL embeds both a user name and a password, they are moved into
    an ``aiohttp.BasicAuth`` object and stripped from the URL; otherwise the
    URL is passed through unchanged with ``auth`` set to ``None``.
    """
    if isinstance(proxy_url, str) and "@" in proxy_url and "://" in proxy_url:
        parsed = urlparse(proxy_url)
        if parsed.username and parsed.password:
            auth = aiohttp.BasicAuth(parsed.username, parsed.password)
            clean_proxy = f"{parsed.scheme}://{parsed.hostname}"
            if parsed.port:
                clean_proxy += f":{parsed.port}"
            return "使用带认证的代理", clean_proxy, auth
        # "@" present but no usable credentials: use the URL as given.
        return "使用普通代理", proxy_url, None
    return "使用代理", proxy_url, None


def _describe_request_error(exc):
    """Map a request exception to the message printed for it."""
    # Checked in the same order as the original handler chain.
    if isinstance(exc, asyncio.TimeoutError):
        return "请求超时"
    if isinstance(exc, aiohttp.ClientConnectorError):
        return f"连接错误: {exc}"
    if isinstance(exc, aiohttp.ClientHttpProxyError):
        return f"代理HTTP错误: {exc}"
    if isinstance(exc, aiohttp.ServerDisconnectedError):
        return f"服务器断开连接: {exc}"
    return f"请求失败: {exc}"


async def test_target_url_with_proxy(proxy_url, target_url, max_retries=3):
    """Request ``target_url`` through ``proxy_url``, retrying on errors.

    Each attempt opens a fresh client session. After a failed attempt the
    function waits 2 seconds before the next one (no wait after the last).
    A response with a non-200 status ends the attempts immediately.

    Args:
        proxy_url: Proxy URL, optionally with embedded ``user:password@``.
        target_url: URL to fetch through the proxy.
        max_retries: Maximum number of attempts.

    Returns:
        ``True`` when a response with status 200 is received; ``False`` when
        a response has another status or every attempt raised.
    """
    print("\n=== 使用代理测试访问目标URL ===")
    print(f"代理地址: {proxy_url}")
    print(f"目标URL: {target_url}")

    # A browser-like User-Agent, sent to avoid anti-crawler blocking.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
    }

    for attempt in range(max_retries):
        if attempt > 0:
            print(f"\n第 {attempt + 1} 次重试...")

        try:
            timeout = aiohttp.ClientTimeout(total=15)
            async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
                # Resolved inside the try so a malformed proxy URL is reported
                # like any other request failure.
                label, proxy, auth = _resolve_proxy(proxy_url)
                print(f"{label}: {proxy}")
                async with session.get(target_url, proxy=proxy, proxy_auth=auth) as response:
                    print(f"状态码: {response.status}")
                    print(f"响应头: {response.headers.get('content-type', 'Unknown')}")
                    return response.status == 200

        except Exception as exc:
            print(_describe_request_error(exc))
            if attempt < max_retries - 1:
                await asyncio.sleep(2)  # back off before the next attempt

    return False
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
async def main():
    """Run the whole check: query the proxy API, extract a proxy, then fetch
    the target URL both directly and through the proxy, and print a summary.

    Returns early (after printing a message) when the API yields no data or
    no proxy URL can be extracted from it.
    """
    # Fixed proxy API endpoint and target page used by this script.
    proxy_api = 'http://test.proxy.api:8080/proxy/getitem/'
    target_url = 'https://stock.10jqka.com.cn/20240315/c655957791.shtml'

    def outcome(flag):
        # Label used in the summary lines.
        return '成功' if flag else '失败'

    print("开始测试代理接口和目标链接访问...\n")

    # Step 1: query the proxy API.
    proxy_data = await test_proxy_api(proxy_api)
    if not proxy_data:
        print("代理API测试失败,无法获取代理信息")
        return

    # Step 2: pull the proxy URL out of the API response.
    proxy_url = extract_proxy_url(proxy_data)
    if not proxy_url:
        print("无法从API响应中提取代理URL")
        print(f"API响应内容: {proxy_data}")
        return

    print(f"\n提取到的代理URL: {proxy_url}")

    # Step 3: direct access first, to learn whether the target is reachable at all.
    print("\n=== 测试直接访问目标URL ===")
    direct_success = await test_target_url_without_proxy(target_url)
    print("✅ 直接访问目标URL成功" if direct_success else "❌ 直接访问目标URL失败")

    # Step 4: the same request through the proxy.
    print("\n=== 测试使用代理访问目标URL ===")
    proxy_success = await test_target_url_with_proxy(proxy_url, target_url)
    if not proxy_success:
        print(f"❌ 代理测试失败!代理 {proxy_url} 无法访问目标链接")
    else:
        print(f"✅ 代理测试成功!代理 {proxy_url} 可以正常访问目标链接")

    # Step 5: summary.
    print("\n=== 测试总结 ===")
    print(f"代理API访问: {outcome(proxy_data)}")
    print(f"代理提取: {outcome(proxy_url)}")
    print(f"直接访问: {outcome(direct_success)}")
    print(f"代理访问: {outcome(proxy_success)}")


if __name__ == "__main__":
    asyncio.run(main())
|
tests/test_proxy_health_check.py
CHANGED
|
@@ -1,33 +1,33 @@
|
|
|
1
|
-
# tests/test_proxy_health_check.py
|
|
2
|
-
import pytest
|
|
3
|
-
from unittest.mock import AsyncMock, patch
|
|
4
|
-
from crawlo.proxy.health_check import check_single_proxy
|
|
5
|
-
import httpx
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
@pytest.mark.asyncio
|
|
9
|
-
@patch('httpx.AsyncClient')
|
|
10
|
-
async def test_health_check_success(mock_client_class):
|
|
11
|
-
"""测试健康检查:成功"""
|
|
12
|
-
mock_resp = AsyncMock()
|
|
13
|
-
mock_resp.status_code = 200
|
|
14
|
-
mock_client_class.return_value.__aenter__.return_value.get.return_value = mock_resp
|
|
15
|
-
|
|
16
|
-
proxy_info = {'url': 'http://good:8080', 'healthy': False}
|
|
17
|
-
await check_single_proxy(proxy_info)
|
|
18
|
-
|
|
19
|
-
assert proxy_info['healthy'] is True
|
|
20
|
-
assert proxy_info['failures'] == 0
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@pytest.mark.asyncio
|
|
24
|
-
@patch('httpx.AsyncClient')
|
|
25
|
-
async def test_health_check_failure(mock_client_class):
|
|
26
|
-
"""测试健康检查:失败"""
|
|
27
|
-
mock_client_class.return_value.__aenter__.return_value.get.side_effect = httpx.ConnectError("Failed")
|
|
28
|
-
|
|
29
|
-
proxy_info = {'url': 'http://bad:8080', 'healthy': True, 'failures': 0}
|
|
30
|
-
await check_single_proxy(proxy_info)
|
|
31
|
-
|
|
32
|
-
assert proxy_info['healthy'] is False
|
|
1
|
+
# tests/test_proxy_health_check.py
|
|
2
|
+
import pytest
|
|
3
|
+
from unittest.mock import AsyncMock, patch
|
|
4
|
+
from crawlo.proxy.health_check import check_single_proxy
|
|
5
|
+
import httpx
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.mark.asyncio
@patch('httpx.AsyncClient')
async def test_health_check_success(mock_client_class):
    """Health check, success path: a 200 response is expected to leave the
    proxy marked healthy with a failure count of 0."""
    ok_response = AsyncMock()
    ok_response.status_code = 200

    # The object yielded by ``async with httpx.AsyncClient(...)``.
    client = mock_client_class.return_value.__aenter__.return_value
    client.get.return_value = ok_response

    proxy_info = dict(url='http://good:8080', healthy=False)
    await check_single_proxy(proxy_info)

    assert proxy_info['healthy'] is True
    assert proxy_info['failures'] == 0
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.mark.asyncio
@patch('httpx.AsyncClient')
async def test_health_check_failure(mock_client_class):
    """Health check, failure path: a connection error is expected to mark the
    proxy unhealthy and raise its failure count to 1."""
    # The object yielded by ``async with httpx.AsyncClient(...)``.
    client = mock_client_class.return_value.__aenter__.return_value
    client.get.side_effect = httpx.ConnectError("Failed")

    proxy_info = dict(url='http://bad:8080', healthy=True, failures=0)
    await check_single_proxy(proxy_info)

    assert proxy_info['healthy'] is False
    assert proxy_info['failures'] == 1
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding:UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
ProxyMiddleware 测试文件
|
|
5
|
+
用于测试代理中间件的功能
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import unittest
|
|
10
|
+
from unittest.mock import Mock, patch
|
|
11
|
+
|
|
12
|
+
from crawlo.middleware.proxy import ProxyMiddleware
|
|
13
|
+
from crawlo.exceptions import NotConfiguredError
|
|
14
|
+
from crawlo.settings.setting_manager import SettingManager
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class MockLogger:
    """In-memory logger stand-in that records calls instead of emitting them.

    Every call is appended to ``logs`` as a ``(level, message)`` tuple. The
    methods accept the same call shapes as ``logging.Logger`` (``%``-style
    positional arguments and keyword arguments such as ``exc_info``), so code
    under test that logs with arguments does not fail with ``TypeError``.
    """

    def __init__(self, name, level=None):
        self.name = name
        self.level = level
        # (level, message) tuples in call order.
        self.logs = []

    def _record(self, level_name, msg, args):
        """Append one entry, applying ``%``-formatting when args are given."""
        if args:
            try:
                msg = msg % args
            except (TypeError, ValueError):
                # Keep the raw message: a bad format string must not make the
                # code under test fail inside its logging call.
                pass
        self.logs.append((level_name, msg))

    def debug(self, msg, *args, **kwargs):
        self._record('debug', msg, args)

    def info(self, msg, *args, **kwargs):
        self._record('info', msg, args)

    def warning(self, msg, *args, **kwargs):
        self._record('warning', msg, args)

    def error(self, msg, *args, **kwargs):
        self._record('error', msg, args)

    def exception(self, msg, *args, **kwargs):
        # Recorded under 'error', the level logging.Logger.exception uses.
        self._record('error', msg, args)

    def critical(self, msg, *args, **kwargs):
        self._record('critical', msg, args)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class TestProxyMiddleware(unittest.TestCase):
    """Unit tests for ProxyMiddleware construction and its ``_is_https`` check."""

    # NOTE(review): the tests patch ``get_logger`` where it is defined. If
    # crawlo.middleware.proxy binds it with ``from ... import get_logger``,
    # the patch would not reach the middleware — confirm against that module.

    def setUp(self):
        """Create a fresh SettingManager and a mock crawler exposing it."""
        self.settings = SettingManager()
        self.crawler = Mock(settings=self.settings)

    def _configure(self, mock_get_logger, **values):
        """Apply settings in the given order and install a MockLogger."""
        for key, value in values.items():
            self.settings.set(key, value)
        mock_get_logger.return_value = MockLogger('ProxyMiddleware')

    def _bare_middleware(self):
        """Build a middleware directly from the current settings."""
        return ProxyMiddleware(settings=self.settings, log_level='INFO')

    @staticmethod
    def _request_for(url):
        """Return a mock request carrying only a ``url`` attribute."""
        request = Mock()
        request.url = url
        return request

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization_without_api_url(self, mock_get_logger):
        """Enabled proxy with no API URL must raise NotConfiguredError."""
        self._configure(
            mock_get_logger,
            PROXY_ENABLED=True,
            PROXY_API_URL=None,
            LOG_LEVEL='INFO',
        )

        self.assertRaises(
            NotConfiguredError, ProxyMiddleware.create_instance, self.crawler
        )

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization_with_disabled_proxy(self, mock_get_logger):
        """A disabled proxy still yields an instance, flagged as not enabled."""
        self._configure(mock_get_logger, PROXY_ENABLED=False, LOG_LEVEL='INFO')

        instance = ProxyMiddleware.create_instance(self.crawler)

        self.assertIsInstance(instance, ProxyMiddleware)
        self.assertFalse(instance.enabled)

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization_with_api_url(self, mock_get_logger):
        """Enabled proxy with an API URL yields an enabled instance holding that URL."""
        self._configure(
            mock_get_logger,
            PROXY_ENABLED=True,
            PROXY_API_URL='http://proxy-api.example.com',
            LOG_LEVEL='INFO',
        )

        instance = ProxyMiddleware.create_instance(self.crawler)

        self.assertIsInstance(instance, ProxyMiddleware)
        self.assertTrue(instance.enabled)
        self.assertEqual(instance.api_url, 'http://proxy-api.example.com')

    def test_is_https_with_https_url(self):
        """``_is_https`` is truthy for an https:// request URL."""
        instance = self._bare_middleware()
        request = self._request_for('https://example.com/page')

        self.assertTrue(instance._is_https(request))

    def test_is_https_with_http_url(self):
        """``_is_https`` is falsy for an http:// request URL."""
        instance = self._bare_middleware()
        request = self._request_for('http://example.com/page')

        self.assertFalse(instance._is_https(request))


if __name__ == '__main__':
    unittest.main()
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
增强版代理中间件测试
|
|
5
|
+
==================
|
|
6
|
+
测试ProxyMiddleware的代理池和健康检查功能
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import json
|
|
11
|
+
import sys
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
# 添加项目根目录到Python路径
|
|
15
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
16
|
+
|
|
17
|
+
from unittest.mock import AsyncMock, Mock, patch
|
|
18
|
+
|
|
19
|
+
from crawlo.middleware.proxy import ProxyMiddleware, Proxy
|
|
20
|
+
from crawlo.network.request import Request
|
|
21
|
+
from crawlo.network.response import Response
|
|
22
|
+
from crawlo.settings.setting_manager import SettingManager
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_proxy_class():
    """Drive a Proxy through success and failure marks, printing its state.

    This is a print-only walkthrough; it makes no assertions.
    """
    print("=== 测试Proxy类 ===")

    # Fresh proxy object and its initial state.
    candidate = Proxy("http://127.0.0.1:8080")
    print(f"初始代理: {candidate.proxy_str}")
    print(f"初始成功率: {candidate.success_rate}")
    print(f"是否健康: {candidate.is_healthy}")

    # One success.
    candidate.mark_success()
    print(f"标记成功后 - 成功率: {candidate.success_rate}, 成功次数: {candidate.success_count}")

    # One failure.
    candidate.mark_failure()
    print(f"标记失败后 - 成功率: {candidate.success_rate}, 失败次数: {candidate.failure_count}")
    print(f"是否健康: {candidate.is_healthy}")

    # Five more failures, then look at the health flag again.
    remaining = 5
    while remaining:
        candidate.mark_failure()
        remaining -= 1
    print(f"多次失败后 - 成功率: {candidate.success_rate}, 是否健康: {candidate.is_healthy}")

    print("Proxy类测试完成\n")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def create_mock_settings():
    """Return a SettingManager populated with the proxy settings these tests use."""
    # Applied in this order, one ``set`` call per entry.
    overrides = (
        ("PROXY_ENABLED", True),
        ("PROXY_API_URL", "http://test.proxy.api/get"),
        ("PROXY_EXTRACTOR", "proxy"),
        ("PROXY_REFRESH_INTERVAL", 10),
        ("PROXY_POOL_SIZE", 3),
        ("PROXY_HEALTH_CHECK_THRESHOLD", 0.5),
        ("LOG_LEVEL", "DEBUG"),
    )

    manager = SettingManager()
    for option, value in overrides:
        manager.set(option, value)
    return manager
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
async def test_proxy_middleware_initialization():
    """Build a ProxyMiddleware from the mock settings and print its configuration.

    Print-only; makes no assertions.
    """
    print("=== 测试代理中间件初始化 ===")

    middleware = ProxyMiddleware(create_mock_settings(), "DEBUG")

    # (printed label, attribute read from the middleware), in output order.
    report = (
        ("代理中间件已启用", "enabled"),
        ("API URL", "api_url"),
        ("代理池大小", "proxy_pool_size"),
        ("健康检查阈值", "health_check_threshold"),
        ("刷新间隔", "refresh_interval"),
    )
    for label, attribute in report:
        print(f"{label}: {getattr(middleware, attribute)}")

    print("代理中间件初始化测试完成\n")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
async def test_proxy_pool_management():
    """Update the proxy pool from a mocked API and fetch a healthy proxy.

    ``_get_proxy_from_api`` stays patched for the whole exercise, covering
    both the pool update and the healthy-proxy lookup, so no real API call
    can be made. Print-only; makes no assertions.
    """
    print("=== 测试代理池管理 ===")

    settings = create_mock_settings()
    middleware = ProxyMiddleware(settings, "DEBUG")

    # The single proxy the mocked API hands out on every call.
    mock_proxy = "http://proxy1.example.com:8080"

    with patch.object(middleware, '_get_proxy_from_api', AsyncMock(return_value=mock_proxy)):
        # Refresh the pool from the mocked API.
        await middleware._update_proxy_pool()
        print(f"代理池大小: {len(middleware._proxy_pool)}")
        if middleware._proxy_pool:
            print(f"第一个代理: {middleware._proxy_pool[0].proxy_str}")

        # Fetch a healthy proxy while the patch is still active.
        healthy_proxy = await middleware._get_healthy_proxy()
        if healthy_proxy:
            print(f"获取到健康代理: {healthy_proxy.proxy_str}")
        else:
            print("未获取到健康代理")

    print("代理池管理测试完成\n")
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
async def test_process_request():
    """Run ``process_request`` against a pre-filled proxy pool and print the outcome.

    Print-only; makes no assertions.
    """
    print("=== 测试请求处理 ===")

    middleware = ProxyMiddleware(create_mock_settings(), "DEBUG")

    # Request to be routed through a proxy.
    outgoing = Request(url="http://example.com")

    # Spider stand-in whose crawler settings always answer "aiohttp".
    spider = Mock()
    spider.crawler.settings.get.return_value = "aiohttp"

    # Seed the pool with two proxies.
    pool_urls = ("http://proxy1.example.com:8080", "http://proxy2.example.com:8080")
    middleware._proxy_pool = [Proxy(address) for address in pool_urls]

    outcome = await middleware.process_request(outgoing, spider)
    print(f"处理结果: {outcome}")
    print(f"请求代理: {outgoing.proxy}")
    if "_used_proxy" in outgoing.meta:
        used = outgoing.meta['_used_proxy']
        print(f"使用的代理对象: {used.proxy_str}")

    print("请求处理测试完成\n")
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def test_process_response():
    """Feed a 200 response to ``process_response`` and print the proxy's counters
    before and after.

    Print-only; makes no assertions.
    """
    print("=== 测试响应处理 ===")

    middleware = ProxyMiddleware(create_mock_settings(), "DEBUG")

    # Request tagged with the proxy it was (supposedly) sent through.
    tracked = Proxy("http://proxy1.example.com:8080")
    outgoing = Request(url="http://example.com")
    outgoing.meta["_used_proxy"] = tracked

    reply = Response(
        url="http://example.com",
        status_code=200,
        body=b"test response",
        request=outgoing,
    )

    print(f"处理前 - 代理成功次数: {tracked.success_count}, 失败次数: {tracked.failure_count}")

    # NOTE(review): called without ``await``; assumes process_response is a
    # plain (non-async) method — confirm against ProxyMiddleware.
    middleware.process_response(outgoing, reply, None)

    print(f"处理后 - 代理成功次数: {tracked.success_count}, 失败次数: {tracked.failure_count}")
    print(f"成功率: {tracked.success_rate}")

    print("响应处理测试完成\n")
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def test_process_exception():
    """Feed an exception to ``process_exception`` and print the proxy's counters
    and health flag before and after.

    Print-only; makes no assertions.
    """
    print("=== 测试异常处理 ===")

    middleware = ProxyMiddleware(create_mock_settings(), "DEBUG")

    # Request tagged with the proxy it was (supposedly) sent through.
    tracked = Proxy("http://proxy1.example.com:8080")
    outgoing = Request(url="http://example.com")
    outgoing.meta["_used_proxy"] = tracked

    print(f"处理前 - 代理成功次数: {tracked.success_count}, 失败次数: {tracked.failure_count}")

    # NOTE(review): called without ``await``; assumes process_exception is a
    # plain (non-async) method — confirm against ProxyMiddleware.
    failure = Exception("Test error")
    middleware.process_exception(outgoing, failure, None)

    print(f"处理后 - 代理成功次数: {tracked.success_count}, 失败次数: {tracked.failure_count}")
    print(f"成功率: {tracked.success_rate}")
    print(f"是否健康: {tracked.is_healthy}")

    print("异常处理测试完成\n")
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
async def main():
    """Run every walkthrough in this file, in order, awaiting the async ones."""
    print("开始测试增强版代理中间件...\n")

    # Executed strictly in this sequence.
    steps = (
        test_proxy_class,
        test_proxy_middleware_initialization,
        test_proxy_pool_management,
        test_process_request,
        test_process_response,
        test_process_exception,
    )
    for step in steps:
        pending = step()
        # Coroutine functions hand back a coroutine that still has to be awaited.
        if asyncio.iscoroutine(pending):
            await pending

    print("所有测试完成!")


if __name__ == "__main__":
    asyncio.run(main())
|