crawlo 1.4.6__py3-none-any.whl → 1.4.7__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +90 -89
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -341
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -438
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -291
- crawlo/crawler.py +698 -657
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -276
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -247
- crawlo/downloader/httpx_downloader.py +265 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -402
- crawlo/downloader/selenium_downloader.py +486 -472
- crawlo/event.py +45 -11
- crawlo/exceptions.py +215 -82
- crawlo/extension/__init__.py +65 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -103
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +170 -153
- crawlo/filters/aioredis_filter.py +348 -264
- crawlo/filters/memory_filter.py +261 -276
- crawlo/framework.py +306 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -434
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -194
- crawlo/initialization/phases.py +230 -149
- crawlo/initialization/registry.py +143 -145
- crawlo/initialization/utils.py +49 -0
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -46
- crawlo/logging/config.py +277 -197
- crawlo/logging/factory.py +175 -171
- crawlo/logging/manager.py +104 -112
- crawlo/middleware/__init__.py +87 -24
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -253
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +375 -379
- crawlo/network/response.py +569 -664
- crawlo/pipelines/__init__.py +53 -22
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +140 -132
- crawlo/pipelines/mysql_pipeline.py +469 -476
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -156
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -525
- crawlo/queue/redis_priority_queue.py +519 -370
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -277
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +2 -4
- crawlo/templates/project/items.py.tmpl +13 -17
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -36
- crawlo/templates/project/settings.py.tmpl +109 -111
- crawlo/templates/project/settings_distributed.py.tmpl +156 -159
- crawlo/templates/project/settings_gentle.py.tmpl +170 -176
- crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
- crawlo/templates/project/settings_minimal.py.tmpl +98 -100
- crawlo/templates/project/settings_simple.py.tmpl +168 -174
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -40
- crawlo/templates/spiders_init.py.tmpl +5 -10
- crawlo/tools/__init__.py +86 -189
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +50 -50
- crawlo/utils/batch_processor.py +276 -259
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/log.py +79 -79
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +578 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -256
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/selector_helper.py +137 -137
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- crawlo/utils/{url.py → url_utils.py} +39 -39
- crawlo-1.4.7.dist-info/METADATA +689 -0
- crawlo-1.4.7.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -275
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -0
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +55 -0
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +119 -0
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo-1.4.6.dist-info/METADATA +0 -329
- crawlo-1.4.6.dist-info/RECORD +0 -361
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
tests/test_proxy_middleware_refactored.py
CHANGED

@@ -1,208 +1,208 @@

Lines 1-207 are removed and re-added with identical text; only line 208 (unittest.main()) is marked unchanged. The file content appears once below:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
Tests for the refactored ProxyMiddleware.
Exercises the refactored proxy middleware, in particular the previously duplicated logic that was fixed.
"""

import asyncio
import unittest
from unittest.mock import Mock, patch

from crawlo.middleware.proxy import ProxyMiddleware, Proxy
from crawlo.exceptions import NotConfiguredError
from crawlo.settings.setting_manager import SettingManager


class TestProxyMiddlewareRefactored(unittest.TestCase):
    """Test cases for the refactored ProxyMiddleware."""

    def setUp(self):
        """Prepare test fixtures."""
        # Create the settings manager
        self.settings = SettingManager()

        # Create a mock crawler
        self.crawler = Mock()
        self.crawler.settings = self.settings

    def test_parse_proxy_data_with_string(self):
        """Parse proxy data supplied as a string."""
        middleware = ProxyMiddleware(
            settings=self.settings,
            log_level='INFO'
        )

        # A valid HTTP proxy URL
        result = middleware._parse_proxy_data("http://proxy.example.com:8080")
        self.assertEqual(result, ["http://proxy.example.com:8080"])

        # A valid HTTPS proxy URL
        result = middleware._parse_proxy_data("https://proxy.example.com:8080")
        self.assertEqual(result, ["https://proxy.example.com:8080"])

        # An invalid proxy URL
        result = middleware._parse_proxy_data("invalid-proxy")
        self.assertEqual(result, [])

    def test_parse_proxy_data_with_dict(self):
        """Parse proxy data supplied as a dict."""
        middleware = ProxyMiddleware(
            settings=self.settings,
            log_level='INFO'
        )

        # A dict carrying a single proxy string
        proxy_data = {
            "proxy": "http://proxy1.example.com:8080"
        }
        result = middleware._parse_proxy_data(proxy_data)
        self.assertEqual(result, ["http://proxy1.example.com:8080"])

        # A dict carrying a list of proxies
        proxy_data = {
            "proxies": [
                "http://proxy1.example.com:8080",
                "https://proxy2.example.com:8080"
            ]
        }
        result = middleware._parse_proxy_data(proxy_data)
        self.assertEqual(result, [
            "http://proxy1.example.com:8080",
            "https://proxy2.example.com:8080"
        ])

        # Mixed data
        proxy_data = {
            "proxy": "http://proxy1.example.com:8080",
            "proxies": [
                "https://proxy2.example.com:8080",
                "invalid-proxy"
            ]
        }
        result = middleware._parse_proxy_data(proxy_data)
        self.assertEqual(result, [
            "http://proxy1.example.com:8080",
            "https://proxy2.example.com:8080"
        ])

    def test_get_healthy_proxies(self):
        """Collect the healthy proxies from the pool."""
        middleware = ProxyMiddleware(
            settings=self.settings,
            log_level='INFO'
        )

        # Create test proxies
        proxy1 = Proxy("http://proxy1.example.com:8080")
        proxy2 = Proxy("http://proxy2.example.com:8080")
        proxy3 = Proxy("http://proxy3.example.com:8080")

        # Populate the proxy pool
        middleware._proxy_pool = [proxy1, proxy2, proxy3]

        # All proxies are healthy
        healthy_proxies = middleware._get_healthy_proxies()
        self.assertEqual(len(healthy_proxies), 3)

        # Mark one proxy as unhealthy
        proxy2.is_healthy = False
        healthy_proxies = middleware._get_healthy_proxies()
        self.assertEqual(len(healthy_proxies), 2)
        self.assertIn(proxy1, healthy_proxies)
        self.assertNotIn(proxy2, healthy_proxies)
        self.assertIn(proxy3, healthy_proxies)

        # Push one proxy's success rate below the threshold
        proxy3.mark_failure()
        proxy3.mark_failure()
        proxy3.mark_failure()
        proxy3.mark_failure()  # 4 failures, 0 successes: success rate 0 < 0.5 (the default threshold)
        healthy_proxies = middleware._get_healthy_proxies()
        self.assertEqual(len(healthy_proxies), 1)
        self.assertIn(proxy1, healthy_proxies)
        self.assertNotIn(proxy3, healthy_proxies)

    @patch('crawlo.utils.log.get_logger')
    def test_update_proxy_pool_with_parsed_data(self, mock_get_logger):
        """Update the proxy pool from parsed proxy data."""
        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL is enough to enable the middleware
        self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
        self.settings.set('LOG_LEVEL', 'INFO')

        mock_get_logger.return_value = Mock()

        middleware = ProxyMiddleware.create_instance(self.crawler)

        # Parse proxy data supplied as a string
        new_proxies = middleware._parse_proxy_data("http://proxy1.example.com:8080")
        self.assertEqual(new_proxies, ["http://proxy1.example.com:8080"])

        # Parse proxy data supplied as a dict
        proxy_data = {
            "proxies": [
                "http://proxy1.example.com:8080",
                "https://proxy2.example.com:8080",
                "http://proxy3.example.com:8080"
            ]
        }
        new_proxies = middleware._parse_proxy_data(proxy_data)
        self.assertEqual(new_proxies, [
            "http://proxy1.example.com:8080",
            "https://proxy2.example.com:8080",
            "http://proxy3.example.com:8080"
        ])

    def test_get_healthy_proxy_with_refactored_logic(self):
        """Fetch a healthy proxy via the refactored logic."""
        middleware = ProxyMiddleware(
            settings=self.settings,
            log_level='INFO'
        )

        # Create test proxies
        proxy1 = Proxy("http://proxy1.example.com:8080")
        proxy2 = Proxy("http://proxy2.example.com:8080")

        # Populate the proxy pool
        middleware._proxy_pool = [proxy1, proxy2]
        middleware._current_proxy_index = 0

        # First pick (the round-robin advances first, so the first call returns the proxy at index 1)
        healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
        self.assertEqual(healthy_proxy.proxy_str, proxy2.proxy_str)

        # Second pick (rotation wraps back to index 0)
        healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
        self.assertEqual(healthy_proxy.proxy_str, proxy1.proxy_str)

        # Third pick (rotation moves to index 1 again)
        healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
        self.assertEqual(healthy_proxy.proxy_str, proxy2.proxy_str)

    def test_proxy_middleware_initialization(self):
        """Initialize the proxy middleware."""
        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL is enough to enable the middleware
        self.settings.set('PROXY_API_URL', 'http://test-proxy-api.com')
        middleware = ProxyMiddleware(self.settings, "DEBUG")
        self.assertIsInstance(middleware, ProxyMiddleware)
        self.assertTrue(middleware.enabled)
        self.assertEqual(middleware.api_url, 'http://test-proxy-api.com')

    def test_proxy_middleware_enabled_with_api_url(self):
        """The middleware is enabled when a proxy API URL is configured."""
        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL is enough to enable the middleware
        self.settings.set('PROXY_API_URL', 'http://test-proxy-api.com')
        middleware = ProxyMiddleware(self.settings, "DEBUG")
        self.assertTrue(middleware.enabled)
        self.assertEqual(middleware.api_url, 'http://test-proxy-api.com')

    def test_proxy_middleware_disabled_without_api_url(self):
        """The middleware is disabled when no proxy API URL is configured."""
        # PROXY_ENABLED is no longer needed; leaving PROXY_API_URL unset disables the middleware
        self.settings.set('PROXY_API_URL', None)
        middleware = ProxyMiddleware(self.settings, "DEBUG")
        self.assertFalse(middleware.enabled)


if __name__ == '__main__':
    unittest.main()
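The initialization tests above pin down the middleware's enable-by-configuration contract: setting PROXY_API_URL alone turns it on, with no separate PROXY_ENABLED flag. A minimal usage sketch, restricted to the calls the tests themselves make (the API URL is a placeholder):

from crawlo.middleware.proxy import ProxyMiddleware
from crawlo.settings.setting_manager import SettingManager

settings = SettingManager()
# Setting PROXY_API_URL is the on switch; no PROXY_ENABLED flag is consulted.
settings.set('PROXY_API_URL', 'http://proxy-api.example.com')  # placeholder URL

middleware = ProxyMiddleware(settings, 'INFO')
print(middleware.enabled)   # True
print(middleware.api_url)   # http://proxy-api.example.com

# With no API URL configured, the middleware reports enabled == False.
settings.set('PROXY_API_URL', None)
print(ProxyMiddleware(settings, 'INFO').enabled)  # False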
tests/test_proxy_only.py
CHANGED

@@ -1,84 +1,84 @@

Lines 1-83 are removed and re-added with identical text; only line 84 (asyncio.run(main())) is marked unchanged. The file content appears once below:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Proxy middleware test script.
Exercises proxy fetching against the configured proxy API URL.
"""

import asyncio
import sys
import os

# Add the project root to the Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo.settings.setting_manager import SettingManager
from crawlo.middleware.proxy import ProxyMiddleware
from crawlo.network import Request


async def test_proxy_middleware():
    """Exercise the proxy middleware."""
    print("=== Testing the proxy middleware ===")

    # Create the settings manager
    settings_manager = SettingManager()
    settings = settings_manager  # the SettingManager instance is itself the settings object

    # Configure the proxy
    settings.set('PROXY_API_URL', 'http://proxy-api.example.com/proxy/getitem/')
    settings.set('LOG_LEVEL', 'DEBUG')

    # Create the proxy middleware
    proxy_middleware = ProxyMiddleware(settings, "DEBUG")

    print("Proxy middleware created")
    print(f"Mode: {proxy_middleware.mode}")
    print(f"Enabled: {proxy_middleware.enabled}")

    if proxy_middleware.enabled and proxy_middleware.mode == "dynamic":
        # Fetch a proxy from the API
        print("\nFetching a proxy from the API...")
        proxy = await proxy_middleware._fetch_proxy_from_api()
        print(f"Fetched proxy: {proxy}")

        # Validate the fetched proxy
        if proxy:
            print(f"Proxy format valid: {proxy.startswith('http://') or proxy.startswith('https://')}")

            # Process a request
            print("\nProcessing a request...")
            request = Request(url="https://httpbin.org/ip")

            class MockSpider:
                def __init__(self):
                    self.name = "test_spider"

            spider = MockSpider()

            await proxy_middleware.process_request(request, spider)

            if request.proxy:
                print(f"Request proxy set: {request.proxy}")
            else:
                print("Failed to set the request proxy")
        else:
            print("Could not fetch a valid proxy from the API")
    else:
        print("Proxy middleware not enabled or wrong mode")

    return proxy_middleware


async def main():
    """Main test entry point."""
    print("Starting the proxy middleware test...")

    # Run the proxy middleware test
    await test_proxy_middleware()

    print("\nTest finished")


if __name__ == "__main__":
    asyncio.run(main())
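The rotation asserted in test_get_healthy_proxy_with_refactored_logic (the index starts at 0, yet the first pick is the proxy at index 1) is consistent with an advance-then-select round-robin. The helper below is a hypothetical, self-contained stand-in that reproduces that selection order; it is not the middleware's actual implementation:

from dataclasses import dataclass

@dataclass
class FakeProxy:
    proxy_str: str
    is_healthy: bool = True

def next_healthy(pool, state):
    """Advance the cursor first, then return the next healthy proxy (None if all are unhealthy)."""
    for _ in range(len(pool)):
        state['index'] = (state['index'] + 1) % len(pool)
        candidate = pool[state['index']]
        if candidate.is_healthy:
            return candidate
    return None

pool = [FakeProxy("http://proxy1.example.com:8080"),
        FakeProxy("http://proxy2.example.com:8080")]
state = {'index': 0}
print(next_healthy(pool, state).proxy_str)  # proxy2: the cursor advances before the pick
print(next_healthy(pool, state).proxy_str)  # proxy1: wraps back to index 0
print(next_healthy(pool, state).proxy_str)  # proxy2 again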