crawlo 1.4.1 → 1.4.3 (py3-none-any.whl)
This diff compares the contents of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in those registries.
Note: this release of crawlo has been marked as potentially problematic.
- crawlo/__init__.py +93 -93
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -341
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +438 -439
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +291 -257
- crawlo/crawler.py +650 -650
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +233 -228
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +63 -63
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +61 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +103 -103
- crawlo/factories/registry.py +84 -84
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -257
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +425 -425
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +193 -193
- crawlo/initialization/phases.py +148 -148
- crawlo/initialization/registry.py +145 -145
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +45 -37
- crawlo/logging/async_handler.py +181 -0
- crawlo/logging/config.py +196 -96
- crawlo/logging/factory.py +171 -128
- crawlo/logging/manager.py +111 -111
- crawlo/logging/monitor.py +153 -0
- crawlo/logging/sampler.py +167 -0
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +219 -219
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +325 -325
- crawlo/pipelines/pipeline_manager.py +100 -84
- crawlo/pipelines/redis_dedup_pipeline.py +156 -156
- crawlo/project.py +349 -338
- crawlo/queue/pqueue.py +42 -42
- crawlo/queue/queue_manager.py +526 -522
- crawlo/queue/redis_priority_queue.py +370 -367
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +73 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +170 -170
- crawlo/templates/project/settings_distributed.py.tmpl +169 -169
- crawlo/templates/project/settings_gentle.py.tmpl +166 -166
- crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
- crawlo/templates/project/settings_minimal.py.tmpl +65 -65
- crawlo/templates/project/settings_simple.py.tmpl +164 -164
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +34 -34
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +9 -9
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +364 -364
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +25 -25
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -165
- crawlo/utils/fingerprint.py +122 -122
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +79 -79
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.4.3.dist-info/METADATA +190 -0
- crawlo-1.4.3.dist-info/RECORD +326 -0
- examples/__init__.py +7 -7
- examples/test_project/__init__.py +7 -7
- examples/test_project/run.py +34 -34
- examples/test_project/test_project/__init__.py +3 -3
- examples/test_project/test_project/items.py +17 -17
- examples/test_project/test_project/middlewares.py +118 -118
- examples/test_project/test_project/pipelines.py +96 -96
- examples/test_project/test_project/settings.py +169 -169
- examples/test_project/test_project/spiders/__init__.py +9 -9
- examples/test_project/test_project/spiders/of_week_dis.py +143 -143
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +106 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +245 -245
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +127 -127
- tests/simple_log_test.py +57 -57
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_spider_test.py +49 -49
- tests/simple_test.py +47 -47
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +174 -174
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +125 -0
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +66 -66
- tests/test_framework_startup.py +64 -64
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +112 -112
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +375 -0
- tests/test_logging_final.py +185 -0
- tests/test_logging_integration.py +313 -0
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +142 -0
- tests/test_mode_change.py +72 -72
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +212 -0
- tests/test_priority_consistency.py +152 -0
- tests/test_priority_consistency_fixed.py +250 -0
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +131 -0
- tests/test_random_headers_default.py +323 -0
- tests/test_random_headers_necessity.py +309 -0
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +130 -0
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +334 -242
- tests/test_retry_middleware_realistic.py +274 -0
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +177 -0
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- crawlo-1.4.1.dist-info/METADATA +0 -1199
- crawlo-1.4.1.dist-info/RECORD +0 -309
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/WHEEL +0 -0
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/top_level.txt +0 -0
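The listing above is the per-file summary produced by the diff viewer. For reference, a comparable wheel-to-wheel comparison can be reproduced locally with pip and the standard library; the sketch below is illustrative only (it assumes both releases are still downloadable from PyPI, and the helper names are ours, not part of crawlo).

# Minimal sketch (assumption: pip is importable as "python -m pip" and crawlo 1.4.1 / 1.4.3 are still on PyPI).
# Downloads both wheels, then prints a unified diff of every text member they contain.
import difflib
import subprocess
import sys
import tempfile
import zipfile
from pathlib import Path

def fetch_wheel(version: str, dest: Path) -> Path:
    # --no-deps / --only-binary keep the download to a single .whl file
    subprocess.run(
        [sys.executable, "-m", "pip", "download", f"crawlo=={version}",
         "--no-deps", "--only-binary", ":all:", "-d", str(dest)],
        check=True,
    )
    return next(dest.glob(f"crawlo-{version}-*.whl"))

def text_members(wheel: Path) -> dict:
    # Map archive member name -> decoded lines, skipping binary members
    out = {}
    with zipfile.ZipFile(wheel) as zf:
        for name in zf.namelist():
            try:
                out[name] = zf.read(name).decode("utf-8").splitlines(keepends=True)
            except UnicodeDecodeError:
                continue
    return out

with tempfile.TemporaryDirectory() as tmp:
    tmp_dir = Path(tmp)
    old = text_members(fetch_wheel("1.4.1", tmp_dir))
    new = text_members(fetch_wheel("1.4.3", tmp_dir))
    for name in sorted(set(old) | set(new)):
        sys.stdout.writelines(difflib.unified_diff(
            old.get(name, []), new.get(name, []),
            fromfile=f"1.4.1/{name}", tofile=f"1.4.3/{name}",
        ))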
tests/test_proxy_middleware_integration.py
@@ -1,137 +1,137 @@
-# tests/test_proxy_middleware_integration.py
-import pytest
-import asyncio
-import time
-from unittest.mock import Mock, AsyncMock, patch
-from crawlo import Request, Response, Spider
-from crawlo.proxy.middleware import ProxyMiddleware
-from crawlo.proxy.stats import ProxyStats
-
-
-@pytest.fixture
-def crawler():
-    class MockSettings:
-        def get(self, key, default=None):
-            defaults = {
-                'PROXY_ENABLED': True,
-                'PROXIES': ['http://p1:8080', 'http://p2:8080'],
-                'PROXY_SELECTION_STRATEGY': 'random',
-                'PROXY_REQUEST_DELAY_ENABLED': False,
-                'PROXY_MAX_RETRY_COUNT': 1,
-            }
-            return defaults.get(key, default)
-
-        def get_bool(self, key, default=None):
-            return self.get(key, default)
-
-        def get_int(self, key, default=None):
-            return self.get(key, default)
-
-        def get_float(self, key, default=None):
-            return self.get(key, default)
-
-        def get_list(self, key, default=None):
-            return self.get(key, default)
-
-    class MockCrawler:
-        def __init__(self):
-            self.settings = MockSettings()
-
-    return MockCrawler()
-
-
-@pytest.fixture
-def middleware(crawler):
-    mw = ProxyMiddleware.create_instance(crawler)
-    mw._load_providers = Mock()
-    mw._update_proxy_pool = AsyncMock()
-    mw._health_check = AsyncMock()
-    mw.scheduler = None
-
-    mw.proxies = [
-        {
-            'url': 'http://p1:8080',
-            'healthy': True,
-            'failures': 0,
-            'last_health_check': 0,
-            'unhealthy_since': 0
-        },
-        {
-            'url': 'http://p2:8080',
-            'healthy': True,
-            'failures': 0,
-            'last_health_check': 0,
-            'unhealthy_since': 0
-        },
-    ]
-    mw.stats = ProxyStats()
-    for p in mw.proxies:
-        mw.stats.record(p['url'], 'total')
-
-    asyncio.get_event_loop().run_until_complete(mw._initial_setup())
-    return mw
-
-
-@pytest.fixture
-def spider():
-    return Mock(spec=Spider, logger=Mock())
-
-
-def test_process_request_sets_proxy(middleware, spider):
-    request = Request("https://example.com")
-    result = asyncio.get_event_loop().run_until_complete(
-        middleware.process_request(request, spider)
-    )
-    assert result is None
-    assert hasattr(request, 'proxy')
-    assert request.proxy in ['http://p1:8080', 'http://p2:8080']
-
-
-def test_process_response_records_success(middleware, spider):
-    request = Request("https://example.com")
-    request.proxy = 'http://p1:8080'
-    response = Response("https://example.com", body=b"ok", headers={})
-    middleware.stats.record(request.proxy, 'total')
-    middleware.process_response(request, response, spider)
-    assert middleware.stats.get(request.proxy)['success'] == 1
-
-
-def test_process_exception_switches_proxy(middleware, spider):
-    request = Request("https://example.com")
-    request.proxy = 'http://p1:8080'
-    request.meta['proxy_retry_count'] = 0
-
-    result = middleware.process_exception(request, Exception("Timeout"), spider)
-    assert result is not None
-    assert result.proxy != 'http://p1:8080'
-    assert result.meta['proxy_retry_count'] == 1
-
-    final = middleware.process_exception(result, Exception("Timeout"), spider)
-    assert final is None
-
-
-def test_mark_failure_disables_proxy(middleware):
-    proxy_url = 'http://p1:8080'
-    p = next(p for p in middleware.proxies if p['url'] == proxy_url)
-    p['failures'] = 2
-
-    middleware._mark_failure(proxy_url)
-    assert p['failures'] == 3
-    assert p['healthy'] is False
-    assert p['unhealthy_since'] > 0
-
-
-@pytest.mark.asyncio
-async def test_request_delay(middleware, spider):
-    """Test the request delay feature: verify that asyncio.sleep is called"""
-    with patch("crawlo.proxy.middleware.asyncio.sleep", new_callable=AsyncMock) as mock_sleep:
-        middleware.delay_enabled = True  # note: the attribute is delay_enabled, not request_delay_enabled
-        middleware.request_delay = 0.1
-        middleware._last_req_time = time.time() - 0.05  # 50 ms ago
-
-        request = Request("https://a.com")
-        await middleware.process_request(request, spider)
-
-        mock_sleep.assert_called_once()
-        delay = mock_sleep.call_args[0][0]
+# tests/test_proxy_middleware_integration.py
+import pytest
+import asyncio
+import time
+from unittest.mock import Mock, AsyncMock, patch
+from crawlo import Request, Response, Spider
+from crawlo.proxy.middleware import ProxyMiddleware
+from crawlo.proxy.stats import ProxyStats
+
+
+@pytest.fixture
+def crawler():
+    class MockSettings:
+        def get(self, key, default=None):
+            defaults = {
+                'PROXY_ENABLED': True,
+                'PROXIES': ['http://p1:8080', 'http://p2:8080'],
+                'PROXY_SELECTION_STRATEGY': 'random',
+                'PROXY_REQUEST_DELAY_ENABLED': False,
+                'PROXY_MAX_RETRY_COUNT': 1,
+            }
+            return defaults.get(key, default)
+
+        def get_bool(self, key, default=None):
+            return self.get(key, default)
+
+        def get_int(self, key, default=None):
+            return self.get(key, default)
+
+        def get_float(self, key, default=None):
+            return self.get(key, default)
+
+        def get_list(self, key, default=None):
+            return self.get(key, default)
+
+    class MockCrawler:
+        def __init__(self):
+            self.settings = MockSettings()
+
+    return MockCrawler()
+
+
+@pytest.fixture
+def middleware(crawler):
+    mw = ProxyMiddleware.create_instance(crawler)
+    mw._load_providers = Mock()
+    mw._update_proxy_pool = AsyncMock()
+    mw._health_check = AsyncMock()
+    mw.scheduler = None
+
+    mw.proxies = [
+        {
+            'url': 'http://p1:8080',
+            'healthy': True,
+            'failures': 0,
+            'last_health_check': 0,
+            'unhealthy_since': 0
+        },
+        {
+            'url': 'http://p2:8080',
+            'healthy': True,
+            'failures': 0,
+            'last_health_check': 0,
+            'unhealthy_since': 0
+        },
+    ]
+    mw.stats = ProxyStats()
+    for p in mw.proxies:
+        mw.stats.record(p['url'], 'total')
+
+    asyncio.get_event_loop().run_until_complete(mw._initial_setup())
+    return mw
+
+
+@pytest.fixture
+def spider():
+    return Mock(spec=Spider, logger=Mock())
+
+
+def test_process_request_sets_proxy(middleware, spider):
+    request = Request("https://example.com")
+    result = asyncio.get_event_loop().run_until_complete(
+        middleware.process_request(request, spider)
+    )
+    assert result is None
+    assert hasattr(request, 'proxy')
+    assert request.proxy in ['http://p1:8080', 'http://p2:8080']
+
+
+def test_process_response_records_success(middleware, spider):
+    request = Request("https://example.com")
+    request.proxy = 'http://p1:8080'
+    response = Response("https://example.com", body=b"ok", headers={})
+    middleware.stats.record(request.proxy, 'total')
+    middleware.process_response(request, response, spider)
+    assert middleware.stats.get(request.proxy)['success'] == 1
+
+
+def test_process_exception_switches_proxy(middleware, spider):
+    request = Request("https://example.com")
+    request.proxy = 'http://p1:8080'
+    request.meta['proxy_retry_count'] = 0
+
+    result = middleware.process_exception(request, Exception("Timeout"), spider)
+    assert result is not None
+    assert result.proxy != 'http://p1:8080'
+    assert result.meta['proxy_retry_count'] == 1
+
+    final = middleware.process_exception(result, Exception("Timeout"), spider)
+    assert final is None
+
+
+def test_mark_failure_disables_proxy(middleware):
+    proxy_url = 'http://p1:8080'
+    p = next(p for p in middleware.proxies if p['url'] == proxy_url)
+    p['failures'] = 2
+
+    middleware._mark_failure(proxy_url)
+    assert p['failures'] == 3
+    assert p['healthy'] is False
+    assert p['unhealthy_since'] > 0
+
+
+@pytest.mark.asyncio
+async def test_request_delay(middleware, spider):
+    """Test the request delay feature: verify that asyncio.sleep is called"""
+    with patch("crawlo.proxy.middleware.asyncio.sleep", new_callable=AsyncMock) as mock_sleep:
+        middleware.delay_enabled = True  # note: the attribute is delay_enabled, not request_delay_enabled
+        middleware.request_delay = 0.1
+        middleware._last_req_time = time.time() - 0.05  # 50 ms ago
+
+        request = Request("https://a.com")
+        await middleware.process_request(request, spider)
+
+        mock_sleep.assert_called_once()
+        delay = mock_sleep.call_args[0][0]
         assert 0.04 <= delay <= 0.06
tests/test_proxy_middleware_refactored.py
@@ -1,185 +1,185 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-ProxyMiddleware refactoring test file
-Tests the refactored proxy middleware, in particular the duplicated logic that was fixed
-"""
-
-import asyncio
-import unittest
-from unittest.mock import Mock, patch
-
-from crawlo.middleware.proxy import ProxyMiddleware, Proxy
-from crawlo.exceptions import NotConfiguredError
-from crawlo.settings.setting_manager import SettingManager
-
-
-class TestProxyMiddlewareRefactored(unittest.TestCase):
-    """Test class for the refactored ProxyMiddleware"""
-
-    def setUp(self):
-        """Set up before each test"""
-        # Create the settings manager
-        self.settings = SettingManager()
-
-        # Create a mock crawler object
-        self.crawler = Mock()
-        self.crawler.settings = self.settings
-
-    def test_parse_proxy_data_with_string(self):
-        """Test parsing proxy data given as a string"""
-        middleware = ProxyMiddleware(
-            settings=self.settings,
-            log_level='INFO'
-        )
-
-        # Valid HTTP proxy URL
-        result = middleware._parse_proxy_data("http://proxy.example.com:8080")
-        self.assertEqual(result, ["http://proxy.example.com:8080"])
-
-        # Valid HTTPS proxy URL
-        result = middleware._parse_proxy_data("https://proxy.example.com:8080")
-        self.assertEqual(result, ["https://proxy.example.com:8080"])
-
-        # Invalid proxy URL
-        result = middleware._parse_proxy_data("invalid-proxy")
-        self.assertEqual(result, [])
-
-    def test_parse_proxy_data_with_dict(self):
-        """Test parsing proxy data given as a dict"""
-        middleware = ProxyMiddleware(
-            settings=self.settings,
-            log_level='INFO'
-        )
-
-        # Dict containing a proxy string
-        proxy_data = {
-            "proxy": "http://proxy1.example.com:8080"
-        }
-        result = middleware._parse_proxy_data(proxy_data)
-        self.assertEqual(result, ["http://proxy1.example.com:8080"])
-
-        # Dict containing a list of proxies
-        proxy_data = {
-            "proxies": [
-                "http://proxy1.example.com:8080",
-                "https://proxy2.example.com:8080"
-            ]
-        }
-        result = middleware._parse_proxy_data(proxy_data)
-        self.assertEqual(result, [
-            "http://proxy1.example.com:8080",
-            "https://proxy2.example.com:8080"
-        ])
-
-        # Mixed data
-        proxy_data = {
-            "proxy": "http://proxy1.example.com:8080",
-            "proxies": [
-                "https://proxy2.example.com:8080",
-                "invalid-proxy"
-            ]
-        }
-        result = middleware._parse_proxy_data(proxy_data)
-        self.assertEqual(result, [
-            "http://proxy1.example.com:8080",
-            "https://proxy2.example.com:8080"
-        ])
-
-    def test_get_healthy_proxies(self):
-        """Test fetching healthy proxies"""
-        middleware = ProxyMiddleware(
-            settings=self.settings,
-            log_level='INFO'
-        )
-
-        # Create test proxies
-        proxy1 = Proxy("http://proxy1.example.com:8080")
-        proxy2 = Proxy("http://proxy2.example.com:8080")
-        proxy3 = Proxy("http://proxy3.example.com:8080")
-
-        # Set up the proxy pool
-        middleware._proxy_pool = [proxy1, proxy2, proxy3]
-
-        # All proxies are healthy
-        healthy_proxies = middleware._get_healthy_proxies()
-        self.assertEqual(len(healthy_proxies), 3)
-
-        # Mark one proxy as unhealthy
-        proxy2.is_healthy = False
-        healthy_proxies = middleware._get_healthy_proxies()
-        self.assertEqual(len(healthy_proxies), 2)
-        self.assertIn(proxy1, healthy_proxies)
-        self.assertNotIn(proxy2, healthy_proxies)
-        self.assertIn(proxy3, healthy_proxies)
-
-        # Push one proxy's success rate below the threshold
-        proxy3.mark_failure()
-        proxy3.mark_failure()
-        proxy3.mark_failure()
-        proxy3.mark_failure()  # 4 failures, 0 successes, success rate = 0 < 0.5 (default threshold)
-        healthy_proxies = middleware._get_healthy_proxies()
-        self.assertEqual(len(healthy_proxies), 1)
-        self.assertIn(proxy1, healthy_proxies)
-        self.assertNotIn(proxy3, healthy_proxies)
-
-    @patch('crawlo.utils.log.get_logger')
-    def test_update_proxy_pool_with_parsed_data(self, mock_get_logger):
-        """Test updating the proxy pool with parsed proxy data"""
-        self.settings.set('PROXY_ENABLED', True)
-        self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
-        self.settings.set('PROXY_POOL_SIZE', 2)
-        self.settings.set('LOG_LEVEL', 'INFO')
-
-        mock_get_logger.return_value = Mock()
-
-        middleware = ProxyMiddleware.create_instance(self.crawler)
-
-        # Parse string proxy data
-        new_proxies = middleware._parse_proxy_data("http://proxy1.example.com:8080")
-        self.assertEqual(new_proxies, ["http://proxy1.example.com:8080"])
-
-        # Parse dict proxy data
-        proxy_data = {
-            "proxies": [
-                "http://proxy1.example.com:8080",
-                "https://proxy2.example.com:8080",
-                "http://proxy3.example.com:8080"
-            ]
-        }
-        new_proxies = middleware._parse_proxy_data(proxy_data)
-        self.assertEqual(new_proxies, [
-            "http://proxy1.example.com:8080",
-            "https://proxy2.example.com:8080",
-            "http://proxy3.example.com:8080"
-        ])
-
-    def test_get_healthy_proxy_with_refactored_logic(self):
-        """Test fetching a healthy proxy with the refactored logic"""
-        middleware = ProxyMiddleware(
-            settings=self.settings,
-            log_level='INFO'
-        )
-
-        # Create test proxies
-        proxy1 = Proxy("http://proxy1.example.com:8080")
-        proxy2 = Proxy("http://proxy2.example.com:8080")
-
-        # Set up the proxy pool
-        middleware._proxy_pool = [proxy1, proxy2]
-        middleware._current_proxy_index = 0
-
-        # First healthy proxy (round-robin: the first call returns the proxy at index 1)
-        healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
-        self.assertEqual(healthy_proxy.proxy_str, proxy2.proxy_str)
-
-        # Second healthy proxy (round-robin wraps back to index 0)
-        healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
-        self.assertEqual(healthy_proxy.proxy_str, proxy1.proxy_str)
-
-        # Next healthy proxy (round-robin moves to index 1 again)
-        healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
-        self.assertEqual(healthy_proxy.proxy_str, proxy2.proxy_str)
-
-if __name__ == '__main__':
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+ProxyMiddleware refactoring test file
+Tests the refactored proxy middleware, in particular the duplicated logic that was fixed
+"""
+
+import asyncio
+import unittest
+from unittest.mock import Mock, patch
+
+from crawlo.middleware.proxy import ProxyMiddleware, Proxy
+from crawlo.exceptions import NotConfiguredError
+from crawlo.settings.setting_manager import SettingManager
+
+
+class TestProxyMiddlewareRefactored(unittest.TestCase):
+    """Test class for the refactored ProxyMiddleware"""
+
+    def setUp(self):
+        """Set up before each test"""
+        # Create the settings manager
+        self.settings = SettingManager()
+
+        # Create a mock crawler object
+        self.crawler = Mock()
+        self.crawler.settings = self.settings
+
+    def test_parse_proxy_data_with_string(self):
+        """Test parsing proxy data given as a string"""
+        middleware = ProxyMiddleware(
+            settings=self.settings,
+            log_level='INFO'
+        )
+
+        # Valid HTTP proxy URL
+        result = middleware._parse_proxy_data("http://proxy.example.com:8080")
+        self.assertEqual(result, ["http://proxy.example.com:8080"])
+
+        # Valid HTTPS proxy URL
+        result = middleware._parse_proxy_data("https://proxy.example.com:8080")
+        self.assertEqual(result, ["https://proxy.example.com:8080"])
+
+        # Invalid proxy URL
+        result = middleware._parse_proxy_data("invalid-proxy")
+        self.assertEqual(result, [])
+
+    def test_parse_proxy_data_with_dict(self):
+        """Test parsing proxy data given as a dict"""
+        middleware = ProxyMiddleware(
+            settings=self.settings,
+            log_level='INFO'
+        )
+
+        # Dict containing a proxy string
+        proxy_data = {
+            "proxy": "http://proxy1.example.com:8080"
+        }
+        result = middleware._parse_proxy_data(proxy_data)
+        self.assertEqual(result, ["http://proxy1.example.com:8080"])
+
+        # Dict containing a list of proxies
+        proxy_data = {
+            "proxies": [
+                "http://proxy1.example.com:8080",
+                "https://proxy2.example.com:8080"
+            ]
+        }
+        result = middleware._parse_proxy_data(proxy_data)
+        self.assertEqual(result, [
+            "http://proxy1.example.com:8080",
+            "https://proxy2.example.com:8080"
+        ])
+
+        # Mixed data
+        proxy_data = {
+            "proxy": "http://proxy1.example.com:8080",
+            "proxies": [
+                "https://proxy2.example.com:8080",
+                "invalid-proxy"
+            ]
+        }
+        result = middleware._parse_proxy_data(proxy_data)
+        self.assertEqual(result, [
+            "http://proxy1.example.com:8080",
+            "https://proxy2.example.com:8080"
+        ])
+
+    def test_get_healthy_proxies(self):
+        """Test fetching healthy proxies"""
+        middleware = ProxyMiddleware(
+            settings=self.settings,
+            log_level='INFO'
+        )
+
+        # Create test proxies
+        proxy1 = Proxy("http://proxy1.example.com:8080")
+        proxy2 = Proxy("http://proxy2.example.com:8080")
+        proxy3 = Proxy("http://proxy3.example.com:8080")
+
+        # Set up the proxy pool
+        middleware._proxy_pool = [proxy1, proxy2, proxy3]
+
+        # All proxies are healthy
+        healthy_proxies = middleware._get_healthy_proxies()
+        self.assertEqual(len(healthy_proxies), 3)
+
+        # Mark one proxy as unhealthy
+        proxy2.is_healthy = False
+        healthy_proxies = middleware._get_healthy_proxies()
+        self.assertEqual(len(healthy_proxies), 2)
+        self.assertIn(proxy1, healthy_proxies)
+        self.assertNotIn(proxy2, healthy_proxies)
+        self.assertIn(proxy3, healthy_proxies)
+
+        # Push one proxy's success rate below the threshold
+        proxy3.mark_failure()
+        proxy3.mark_failure()
+        proxy3.mark_failure()
+        proxy3.mark_failure()  # 4 failures, 0 successes, success rate = 0 < 0.5 (default threshold)
+        healthy_proxies = middleware._get_healthy_proxies()
+        self.assertEqual(len(healthy_proxies), 1)
+        self.assertIn(proxy1, healthy_proxies)
+        self.assertNotIn(proxy3, healthy_proxies)
+
+    @patch('crawlo.utils.log.get_logger')
+    def test_update_proxy_pool_with_parsed_data(self, mock_get_logger):
+        """Test updating the proxy pool with parsed proxy data"""
+        self.settings.set('PROXY_ENABLED', True)
+        self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
+        self.settings.set('PROXY_POOL_SIZE', 2)
+        self.settings.set('LOG_LEVEL', 'INFO')
+
+        mock_get_logger.return_value = Mock()
+
+        middleware = ProxyMiddleware.create_instance(self.crawler)
+
+        # Parse string proxy data
+        new_proxies = middleware._parse_proxy_data("http://proxy1.example.com:8080")
+        self.assertEqual(new_proxies, ["http://proxy1.example.com:8080"])
+
+        # Parse dict proxy data
+        proxy_data = {
+            "proxies": [
+                "http://proxy1.example.com:8080",
+                "https://proxy2.example.com:8080",
+                "http://proxy3.example.com:8080"
+            ]
+        }
+        new_proxies = middleware._parse_proxy_data(proxy_data)
+        self.assertEqual(new_proxies, [
+            "http://proxy1.example.com:8080",
+            "https://proxy2.example.com:8080",
+            "http://proxy3.example.com:8080"
+        ])
+
+    def test_get_healthy_proxy_with_refactored_logic(self):
+        """Test fetching a healthy proxy with the refactored logic"""
+        middleware = ProxyMiddleware(
+            settings=self.settings,
+            log_level='INFO'
+        )
+
+        # Create test proxies
+        proxy1 = Proxy("http://proxy1.example.com:8080")
+        proxy2 = Proxy("http://proxy2.example.com:8080")
+
+        # Set up the proxy pool
+        middleware._proxy_pool = [proxy1, proxy2]
+        middleware._current_proxy_index = 0
+
+        # First healthy proxy (round-robin: the first call returns the proxy at index 1)
+        healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
+        self.assertEqual(healthy_proxy.proxy_str, proxy2.proxy_str)
+
+        # Second healthy proxy (round-robin wraps back to index 0)
+        healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
+        self.assertEqual(healthy_proxy.proxy_str, proxy1.proxy_str)
+
+        # Next healthy proxy (round-robin moves to index 1 again)
+        healthy_proxy = asyncio.run(middleware._get_healthy_proxy())
+        self.assertEqual(healthy_proxy.proxy_str, proxy2.proxy_str)
+
+if __name__ == '__main__':
     unittest.main()