crawlo 1.4.7__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +90 -90
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -140
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -379
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -320
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -451
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -290
- crawlo/crawler.py +698 -698
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -280
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -250
- crawlo/downloader/httpx_downloader.py +265 -265
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -425
- crawlo/downloader/selenium_downloader.py +486 -486
- crawlo/event.py +45 -45
- crawlo/exceptions.py +214 -214
- crawlo/extension/__init__.py +64 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -53
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -104
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +134 -134
- crawlo/filters/__init__.py +170 -170
- crawlo/filters/aioredis_filter.py +347 -347
- crawlo/filters/memory_filter.py +261 -261
- crawlo/framework.py +306 -306
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -391
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -240
- crawlo/initialization/phases.py +229 -229
- crawlo/initialization/registry.py +143 -143
- crawlo/initialization/utils.py +48 -48
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -42
- crawlo/logging/config.py +280 -276
- crawlo/logging/factory.py +175 -175
- crawlo/logging/manager.py +104 -104
- crawlo/middleware/__init__.py +87 -87
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -287
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +408 -376
- crawlo/network/response.py +598 -569
- crawlo/pipelines/__init__.py +52 -52
- crawlo/pipelines/base_pipeline.py +452 -452
- crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +196 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +104 -105
- crawlo/pipelines/mongo_pipeline.py +140 -139
- crawlo/pipelines/mysql_pipeline.py +468 -469
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -155
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +9 -9
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -591
- crawlo/queue/redis_priority_queue.py +518 -518
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +287 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +658 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +1 -1
- crawlo/templates/project/items.py.tmpl +13 -13
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -35
- crawlo/templates/project/settings.py.tmpl +113 -109
- crawlo/templates/project/settings_distributed.py.tmpl +160 -156
- crawlo/templates/project/settings_gentle.py.tmpl +174 -170
- crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
- crawlo/templates/project/settings_minimal.py.tmpl +102 -98
- crawlo/templates/project/settings_simple.py.tmpl +172 -168
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -32
- crawlo/templates/spiders_init.py.tmpl +4 -4
- crawlo/tools/__init__.py +86 -86
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +74 -50
- crawlo/utils/batch_processor.py +276 -276
- crawlo/utils/config_manager.py +442 -442
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -335
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -157
- crawlo/utils/mysql_connection_pool.py +197 -197
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +90 -90
- crawlo/utils/redis_connection_pool.py +578 -578
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -278
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -337
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +138 -137
- crawlo/utils/singleton.py +69 -69
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
- crawlo-1.4.8.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -217
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -467
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -72
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +54 -54
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +118 -118
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/utils/log.py +0 -80
- crawlo/utils/url_utils.py +0 -40
- crawlo-1.4.7.dist-info/RECORD +0 -347
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
|
@@ -1,213 +1,213 @@
|
|
|
1
|
-
#!/usr/bin/python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
增强版代理中间件测试
|
|
5
|
-
==================
|
|
6
|
-
测试ProxyMiddleware的代理池和健康检查功能
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import asyncio
|
|
10
|
-
import json
|
|
11
|
-
import sys
|
|
12
|
-
import os
|
|
13
|
-
|
|
14
|
-
# 添加项目根目录到Python路径
|
|
15
|
-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
16
|
-
|
|
17
|
-
from unittest.mock import AsyncMock, Mock, patch
|
|
18
|
-
|
|
19
|
-
from crawlo.middleware.proxy import ProxyMiddleware, Proxy
|
|
20
|
-
from crawlo.network.request import Request
|
|
21
|
-
from crawlo.network.response import Response
|
|
22
|
-
from crawlo.settings.setting_manager import SettingManager
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def test_proxy_class():
|
|
26
|
-
"""测试Proxy类的基本功能"""
|
|
27
|
-
print("=== 测试Proxy类 ===")
|
|
28
|
-
|
|
29
|
-
# 创建代理对象
|
|
30
|
-
proxy = Proxy("http://127.0.0.1:8080")
|
|
31
|
-
print(f"初始代理: {proxy.proxy_str}")
|
|
32
|
-
print(f"初始成功率: {proxy.success_rate}")
|
|
33
|
-
print(f"是否健康: {proxy.is_healthy}")
|
|
34
|
-
|
|
35
|
-
# 测试成功标记
|
|
36
|
-
proxy.mark_success()
|
|
37
|
-
print(f"标记成功后 - 成功率: {proxy.success_rate}, 成功次数: {proxy.success_count}")
|
|
38
|
-
|
|
39
|
-
# 测试失败标记
|
|
40
|
-
proxy.mark_failure()
|
|
41
|
-
print(f"标记失败后 - 成功率: {proxy.success_rate}, 失败次数: {proxy.failure_count}")
|
|
42
|
-
print(f"是否健康: {proxy.is_healthy}")
|
|
43
|
-
|
|
44
|
-
# 测试多次失败后健康状态
|
|
45
|
-
for _ in range(5):
|
|
46
|
-
proxy.mark_failure()
|
|
47
|
-
print(f"多次失败后 - 成功率: {proxy.success_rate}, 是否健康: {proxy.is_healthy}")
|
|
48
|
-
|
|
49
|
-
print("Proxy类测试完成\n")
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def create_mock_settings():
|
|
53
|
-
"""创建模拟设置"""
|
|
54
|
-
settings = SettingManager()
|
|
55
|
-
# 不再需要显式设置 PROXY_ENABLED,只要配置了 PROXY_API_URL 就会启用
|
|
56
|
-
settings.set("PROXY_API_URL", "http://test.proxy.api/get")
|
|
57
|
-
settings.set("LOG_LEVEL", "DEBUG")
|
|
58
|
-
return settings
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
async def test_proxy_middleware_initialization():
|
|
62
|
-
"""测试代理中间件初始化"""
|
|
63
|
-
print("=== 测试代理中间件初始化 ===")
|
|
64
|
-
|
|
65
|
-
settings = create_mock_settings()
|
|
66
|
-
middleware = ProxyMiddleware(settings, "DEBUG")
|
|
67
|
-
|
|
68
|
-
print(f"代理中间件已启用: {middleware.enabled}")
|
|
69
|
-
print(f"API URL: {middleware.api_url}")
|
|
70
|
-
print(f"代理池大小: {middleware.proxy_pool_size}")
|
|
71
|
-
print(f"健康检查阈值: {middleware.health_check_threshold}")
|
|
72
|
-
print(f"刷新间隔: {middleware.refresh_interval}")
|
|
73
|
-
|
|
74
|
-
print("代理中间件初始化测试完成\n")
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
async def test_proxy_pool_management():
|
|
78
|
-
"""测试代理池管理功能"""
|
|
79
|
-
print("=== 测试代理池管理 ===")
|
|
80
|
-
|
|
81
|
-
settings = create_mock_settings()
|
|
82
|
-
middleware = ProxyMiddleware(settings, "DEBUG")
|
|
83
|
-
|
|
84
|
-
# 模拟API响应
|
|
85
|
-
mock_proxies = [
|
|
86
|
-
"http://proxy1.example.com:8080",
|
|
87
|
-
"http://proxy2.example.com:8080",
|
|
88
|
-
"http://proxy3.example.com:8080"
|
|
89
|
-
]
|
|
90
|
-
|
|
91
|
-
# 测试更新代理池
|
|
92
|
-
with patch.object(middleware, '_get_proxy_from_api', AsyncMock(return_value=mock_proxies[0])):
|
|
93
|
-
await middleware._update_proxy_pool()
|
|
94
|
-
print(f"代理池大小: {len(middleware._proxy_pool)}")
|
|
95
|
-
if middleware._proxy_pool:
|
|
96
|
-
print(f"第一个代理: {middleware._proxy_pool[0].proxy_str}")
|
|
97
|
-
|
|
98
|
-
# 测试获取健康代理
|
|
99
|
-
healthy_proxy = await middleware._get_healthy_proxy()
|
|
100
|
-
if healthy_proxy:
|
|
101
|
-
print(f"获取到健康代理: {healthy_proxy.proxy_str}")
|
|
102
|
-
else:
|
|
103
|
-
print("未获取到健康代理")
|
|
104
|
-
|
|
105
|
-
print("代理池管理测试完成\n")
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
async def test_process_request():
|
|
109
|
-
"""测试请求处理"""
|
|
110
|
-
print("=== 测试请求处理 ===")
|
|
111
|
-
|
|
112
|
-
settings = create_mock_settings()
|
|
113
|
-
middleware = ProxyMiddleware(settings, "DEBUG")
|
|
114
|
-
|
|
115
|
-
# 创建模拟请求
|
|
116
|
-
request = Request(url="http://example.com")
|
|
117
|
-
|
|
118
|
-
# 创建模拟爬虫对象
|
|
119
|
-
mock_spider = Mock()
|
|
120
|
-
mock_spider.crawler.settings.get.return_value = "aiohttp"
|
|
121
|
-
|
|
122
|
-
# 添加一些测试代理到池中
|
|
123
|
-
middleware._proxy_pool = [
|
|
124
|
-
Proxy("http://proxy1.example.com:8080"),
|
|
125
|
-
Proxy("http://proxy2.example.com:8080")
|
|
126
|
-
]
|
|
127
|
-
|
|
128
|
-
# 处理请求
|
|
129
|
-
result = await middleware.process_request(request, mock_spider)
|
|
130
|
-
print(f"处理结果: {result}")
|
|
131
|
-
print(f"请求代理: {request.proxy}")
|
|
132
|
-
if "_used_proxy" in request.meta:
|
|
133
|
-
print(f"使用的代理对象: {request.meta['_used_proxy'].proxy_str}")
|
|
134
|
-
|
|
135
|
-
print("请求处理测试完成\n")
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
def test_process_response():
|
|
139
|
-
"""测试响应处理"""
|
|
140
|
-
print("=== 测试响应处理 ===")
|
|
141
|
-
|
|
142
|
-
settings = create_mock_settings()
|
|
143
|
-
middleware = ProxyMiddleware(settings, "DEBUG")
|
|
144
|
-
|
|
145
|
-
# 创建带代理信息的请求
|
|
146
|
-
request = Request(url="http://example.com")
|
|
147
|
-
proxy_obj = Proxy("http://proxy1.example.com:8080")
|
|
148
|
-
request.meta["_used_proxy"] = proxy_obj
|
|
149
|
-
|
|
150
|
-
# 创建响应
|
|
151
|
-
response = Response(
|
|
152
|
-
url="http://example.com",
|
|
153
|
-
status_code=200,
|
|
154
|
-
body=b"test response",
|
|
155
|
-
request=request
|
|
156
|
-
)
|
|
157
|
-
|
|
158
|
-
# 处理响应前
|
|
159
|
-
print(f"处理前 - 代理成功次数: {proxy_obj.success_count}, 失败次数: {proxy_obj.failure_count}")
|
|
160
|
-
|
|
161
|
-
# 处理响应
|
|
162
|
-
result = middleware.process_response(request, response, None)
|
|
163
|
-
|
|
164
|
-
# 处理后
|
|
165
|
-
print(f"处理后 - 代理成功次数: {proxy_obj.success_count}, 失败次数: {proxy_obj.failure_count}")
|
|
166
|
-
print(f"成功率: {proxy_obj.success_rate}")
|
|
167
|
-
|
|
168
|
-
print("响应处理测试完成\n")
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
def test_process_exception():
|
|
172
|
-
"""测试异常处理"""
|
|
173
|
-
print("=== 测试异常处理 ===")
|
|
174
|
-
|
|
175
|
-
settings = create_mock_settings()
|
|
176
|
-
middleware = ProxyMiddleware(settings, "DEBUG")
|
|
177
|
-
|
|
178
|
-
# 创建带代理信息的请求
|
|
179
|
-
request = Request(url="http://example.com")
|
|
180
|
-
proxy_obj = Proxy("http://proxy1.example.com:8080")
|
|
181
|
-
request.meta["_used_proxy"] = proxy_obj
|
|
182
|
-
|
|
183
|
-
# 处理异常前
|
|
184
|
-
print(f"处理前 - 代理成功次数: {proxy_obj.success_count}, 失败次数: {proxy_obj.failure_count}")
|
|
185
|
-
|
|
186
|
-
# 处理异常
|
|
187
|
-
result = middleware.process_exception(request, Exception("Test error"), None)
|
|
188
|
-
|
|
189
|
-
# 处理后
|
|
190
|
-
print(f"处理后 - 代理成功次数: {proxy_obj.success_count}, 失败次数: {proxy_obj.failure_count}")
|
|
191
|
-
print(f"成功率: {proxy_obj.success_rate}")
|
|
192
|
-
print(f"是否健康: {proxy_obj.is_healthy}")
|
|
193
|
-
|
|
194
|
-
print("异常处理测试完成\n")
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
async def main():
|
|
198
|
-
"""主测试函数"""
|
|
199
|
-
print("开始测试增强版代理中间件...\n")
|
|
200
|
-
|
|
201
|
-
# 运行各个测试
|
|
202
|
-
test_proxy_class()
|
|
203
|
-
await test_proxy_middleware_initialization()
|
|
204
|
-
await test_proxy_pool_management()
|
|
205
|
-
await test_process_request()
|
|
206
|
-
test_process_response()
|
|
207
|
-
test_process_exception()
|
|
208
|
-
|
|
209
|
-
print("所有测试完成!")
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
if __name__ == "__main__":
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
增强版代理中间件测试
|
|
5
|
+
==================
|
|
6
|
+
测试ProxyMiddleware的代理池和健康检查功能
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import json
|
|
11
|
+
import sys
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
# 添加项目根目录到Python路径
|
|
15
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
16
|
+
|
|
17
|
+
from unittest.mock import AsyncMock, Mock, patch
|
|
18
|
+
|
|
19
|
+
from crawlo.middleware.proxy import ProxyMiddleware, Proxy
|
|
20
|
+
from crawlo.network.request import Request
|
|
21
|
+
from crawlo.network.response import Response
|
|
22
|
+
from crawlo.settings.setting_manager import SettingManager
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_proxy_class():
|
|
26
|
+
"""测试Proxy类的基本功能"""
|
|
27
|
+
print("=== 测试Proxy类 ===")
|
|
28
|
+
|
|
29
|
+
# 创建代理对象
|
|
30
|
+
proxy = Proxy("http://127.0.0.1:8080")
|
|
31
|
+
print(f"初始代理: {proxy.proxy_str}")
|
|
32
|
+
print(f"初始成功率: {proxy.success_rate}")
|
|
33
|
+
print(f"是否健康: {proxy.is_healthy}")
|
|
34
|
+
|
|
35
|
+
# 测试成功标记
|
|
36
|
+
proxy.mark_success()
|
|
37
|
+
print(f"标记成功后 - 成功率: {proxy.success_rate}, 成功次数: {proxy.success_count}")
|
|
38
|
+
|
|
39
|
+
# 测试失败标记
|
|
40
|
+
proxy.mark_failure()
|
|
41
|
+
print(f"标记失败后 - 成功率: {proxy.success_rate}, 失败次数: {proxy.failure_count}")
|
|
42
|
+
print(f"是否健康: {proxy.is_healthy}")
|
|
43
|
+
|
|
44
|
+
# 测试多次失败后健康状态
|
|
45
|
+
for _ in range(5):
|
|
46
|
+
proxy.mark_failure()
|
|
47
|
+
print(f"多次失败后 - 成功率: {proxy.success_rate}, 是否健康: {proxy.is_healthy}")
|
|
48
|
+
|
|
49
|
+
print("Proxy类测试完成\n")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def create_mock_settings():
|
|
53
|
+
"""创建模拟设置"""
|
|
54
|
+
settings = SettingManager()
|
|
55
|
+
# 不再需要显式设置 PROXY_ENABLED,只要配置了 PROXY_API_URL 就会启用
|
|
56
|
+
settings.set("PROXY_API_URL", "http://test.proxy.api/get")
|
|
57
|
+
settings.set("LOG_LEVEL", "DEBUG")
|
|
58
|
+
return settings
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
async def test_proxy_middleware_initialization():
|
|
62
|
+
"""测试代理中间件初始化"""
|
|
63
|
+
print("=== 测试代理中间件初始化 ===")
|
|
64
|
+
|
|
65
|
+
settings = create_mock_settings()
|
|
66
|
+
middleware = ProxyMiddleware(settings, "DEBUG")
|
|
67
|
+
|
|
68
|
+
print(f"代理中间件已启用: {middleware.enabled}")
|
|
69
|
+
print(f"API URL: {middleware.api_url}")
|
|
70
|
+
print(f"代理池大小: {middleware.proxy_pool_size}")
|
|
71
|
+
print(f"健康检查阈值: {middleware.health_check_threshold}")
|
|
72
|
+
print(f"刷新间隔: {middleware.refresh_interval}")
|
|
73
|
+
|
|
74
|
+
print("代理中间件初始化测试完成\n")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
async def test_proxy_pool_management():
|
|
78
|
+
"""测试代理池管理功能"""
|
|
79
|
+
print("=== 测试代理池管理 ===")
|
|
80
|
+
|
|
81
|
+
settings = create_mock_settings()
|
|
82
|
+
middleware = ProxyMiddleware(settings, "DEBUG")
|
|
83
|
+
|
|
84
|
+
# 模拟API响应
|
|
85
|
+
mock_proxies = [
|
|
86
|
+
"http://proxy1.example.com:8080",
|
|
87
|
+
"http://proxy2.example.com:8080",
|
|
88
|
+
"http://proxy3.example.com:8080"
|
|
89
|
+
]
|
|
90
|
+
|
|
91
|
+
# 测试更新代理池
|
|
92
|
+
with patch.object(middleware, '_get_proxy_from_api', AsyncMock(return_value=mock_proxies[0])):
|
|
93
|
+
await middleware._update_proxy_pool()
|
|
94
|
+
print(f"代理池大小: {len(middleware._proxy_pool)}")
|
|
95
|
+
if middleware._proxy_pool:
|
|
96
|
+
print(f"第一个代理: {middleware._proxy_pool[0].proxy_str}")
|
|
97
|
+
|
|
98
|
+
# 测试获取健康代理
|
|
99
|
+
healthy_proxy = await middleware._get_healthy_proxy()
|
|
100
|
+
if healthy_proxy:
|
|
101
|
+
print(f"获取到健康代理: {healthy_proxy.proxy_str}")
|
|
102
|
+
else:
|
|
103
|
+
print("未获取到健康代理")
|
|
104
|
+
|
|
105
|
+
print("代理池管理测试完成\n")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
async def test_process_request():
|
|
109
|
+
"""测试请求处理"""
|
|
110
|
+
print("=== 测试请求处理 ===")
|
|
111
|
+
|
|
112
|
+
settings = create_mock_settings()
|
|
113
|
+
middleware = ProxyMiddleware(settings, "DEBUG")
|
|
114
|
+
|
|
115
|
+
# 创建模拟请求
|
|
116
|
+
request = Request(url="http://example.com")
|
|
117
|
+
|
|
118
|
+
# 创建模拟爬虫对象
|
|
119
|
+
mock_spider = Mock()
|
|
120
|
+
mock_spider.crawler.settings.get.return_value = "aiohttp"
|
|
121
|
+
|
|
122
|
+
# 添加一些测试代理到池中
|
|
123
|
+
middleware._proxy_pool = [
|
|
124
|
+
Proxy("http://proxy1.example.com:8080"),
|
|
125
|
+
Proxy("http://proxy2.example.com:8080")
|
|
126
|
+
]
|
|
127
|
+
|
|
128
|
+
# 处理请求
|
|
129
|
+
result = await middleware.process_request(request, mock_spider)
|
|
130
|
+
print(f"处理结果: {result}")
|
|
131
|
+
print(f"请求代理: {request.proxy}")
|
|
132
|
+
if "_used_proxy" in request.meta:
|
|
133
|
+
print(f"使用的代理对象: {request.meta['_used_proxy'].proxy_str}")
|
|
134
|
+
|
|
135
|
+
print("请求处理测试完成\n")
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def test_process_response():
|
|
139
|
+
"""测试响应处理"""
|
|
140
|
+
print("=== 测试响应处理 ===")
|
|
141
|
+
|
|
142
|
+
settings = create_mock_settings()
|
|
143
|
+
middleware = ProxyMiddleware(settings, "DEBUG")
|
|
144
|
+
|
|
145
|
+
# 创建带代理信息的请求
|
|
146
|
+
request = Request(url="http://example.com")
|
|
147
|
+
proxy_obj = Proxy("http://proxy1.example.com:8080")
|
|
148
|
+
request.meta["_used_proxy"] = proxy_obj
|
|
149
|
+
|
|
150
|
+
# 创建响应
|
|
151
|
+
response = Response(
|
|
152
|
+
url="http://example.com",
|
|
153
|
+
status_code=200,
|
|
154
|
+
body=b"test response",
|
|
155
|
+
request=request
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
# 处理响应前
|
|
159
|
+
print(f"处理前 - 代理成功次数: {proxy_obj.success_count}, 失败次数: {proxy_obj.failure_count}")
|
|
160
|
+
|
|
161
|
+
# 处理响应
|
|
162
|
+
result = middleware.process_response(request, response, None)
|
|
163
|
+
|
|
164
|
+
# 处理后
|
|
165
|
+
print(f"处理后 - 代理成功次数: {proxy_obj.success_count}, 失败次数: {proxy_obj.failure_count}")
|
|
166
|
+
print(f"成功率: {proxy_obj.success_rate}")
|
|
167
|
+
|
|
168
|
+
print("响应处理测试完成\n")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def test_process_exception():
|
|
172
|
+
"""测试异常处理"""
|
|
173
|
+
print("=== 测试异常处理 ===")
|
|
174
|
+
|
|
175
|
+
settings = create_mock_settings()
|
|
176
|
+
middleware = ProxyMiddleware(settings, "DEBUG")
|
|
177
|
+
|
|
178
|
+
# 创建带代理信息的请求
|
|
179
|
+
request = Request(url="http://example.com")
|
|
180
|
+
proxy_obj = Proxy("http://proxy1.example.com:8080")
|
|
181
|
+
request.meta["_used_proxy"] = proxy_obj
|
|
182
|
+
|
|
183
|
+
# 处理异常前
|
|
184
|
+
print(f"处理前 - 代理成功次数: {proxy_obj.success_count}, 失败次数: {proxy_obj.failure_count}")
|
|
185
|
+
|
|
186
|
+
# 处理异常
|
|
187
|
+
result = middleware.process_exception(request, Exception("Test error"), None)
|
|
188
|
+
|
|
189
|
+
# 处理后
|
|
190
|
+
print(f"处理后 - 代理成功次数: {proxy_obj.success_count}, 失败次数: {proxy_obj.failure_count}")
|
|
191
|
+
print(f"成功率: {proxy_obj.success_rate}")
|
|
192
|
+
print(f"是否健康: {proxy_obj.is_healthy}")
|
|
193
|
+
|
|
194
|
+
print("异常处理测试完成\n")
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
async def main():
|
|
198
|
+
"""主测试函数"""
|
|
199
|
+
print("开始测试增强版代理中间件...\n")
|
|
200
|
+
|
|
201
|
+
# 运行各个测试
|
|
202
|
+
test_proxy_class()
|
|
203
|
+
await test_proxy_middleware_initialization()
|
|
204
|
+
await test_proxy_pool_management()
|
|
205
|
+
await test_process_request()
|
|
206
|
+
test_process_response()
|
|
207
|
+
test_process_exception()
|
|
208
|
+
|
|
209
|
+
print("所有测试完成!")
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
if __name__ == "__main__":
|
|
213
213
|
asyncio.run(main())
|