crawlo 1.4.6-py3-none-any.whl → 1.4.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +90 -89
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -341
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -438
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -291
- crawlo/crawler.py +698 -657
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -276
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -247
- crawlo/downloader/httpx_downloader.py +265 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -402
- crawlo/downloader/selenium_downloader.py +486 -472
- crawlo/event.py +45 -11
- crawlo/exceptions.py +215 -82
- crawlo/extension/__init__.py +65 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -103
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +170 -153
- crawlo/filters/aioredis_filter.py +348 -264
- crawlo/filters/memory_filter.py +261 -276
- crawlo/framework.py +306 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -434
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -194
- crawlo/initialization/phases.py +230 -149
- crawlo/initialization/registry.py +143 -145
- crawlo/initialization/utils.py +49 -0
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -46
- crawlo/logging/config.py +277 -197
- crawlo/logging/factory.py +175 -171
- crawlo/logging/manager.py +104 -112
- crawlo/middleware/__init__.py +87 -24
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -253
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +375 -379
- crawlo/network/response.py +569 -664
- crawlo/pipelines/__init__.py +53 -22
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +140 -132
- crawlo/pipelines/mysql_pipeline.py +469 -476
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -156
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -525
- crawlo/queue/redis_priority_queue.py +519 -370
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -277
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +2 -4
- crawlo/templates/project/items.py.tmpl +13 -17
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -36
- crawlo/templates/project/settings.py.tmpl +109 -111
- crawlo/templates/project/settings_distributed.py.tmpl +156 -159
- crawlo/templates/project/settings_gentle.py.tmpl +170 -176
- crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
- crawlo/templates/project/settings_minimal.py.tmpl +98 -100
- crawlo/templates/project/settings_simple.py.tmpl +168 -174
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -40
- crawlo/templates/spiders_init.py.tmpl +5 -10
- crawlo/tools/__init__.py +86 -189
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +50 -50
- crawlo/utils/batch_processor.py +276 -259
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/log.py +79 -79
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +578 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -256
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/selector_helper.py +137 -137
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- crawlo/utils/{url.py → url_utils.py} +39 -39
- crawlo-1.4.7.dist-info/METADATA +689 -0
- crawlo-1.4.7.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -275
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -0
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +55 -0
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +119 -0
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo-1.4.6.dist-info/METADATA +0 -329
- crawlo-1.4.6.dist-info/RECORD +0 -361
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
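Since a wheel is just a zip archive, a file-level comparison like the table above can be reproduced locally. Below is a minimal sketch, assuming both wheels have already been fetched into the working directory (for example with `pip download crawlo==1.4.6 --no-deps` and `pip download crawlo==1.4.7 --no-deps`); the two filenames are assumptions about where pip places them:

```python
import zipfile

# Assumed local filenames; adjust to wherever pip download placed the wheels.
OLD = "crawlo-1.4.6-py3-none-any.whl"
NEW = "crawlo-1.4.7-py3-none-any.whl"


def members(path):
    """Read every member of the wheel (wheels are plain zip archives)."""
    with zipfile.ZipFile(path) as zf:
        return {name: zf.read(name) for name in zf.namelist()}


old, new = members(OLD), members(NEW)
for name in sorted(set(old) | set(new)):
    if name not in old:
        print(f"added   {name} (+{len(new[name].splitlines())} lines)")
    elif name not in new:
        print(f"removed {name} (-{len(old[name].splitlines())} lines)")
    elif old[name] != new[name]:
        print(f"changed {name}")
```

This only reports which members were added, removed, or changed; per-file +/- line counts as shown above would additionally need a line diff (e.g. `difflib.unified_diff`) over each changed member.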
tests/test_retry_middleware.py
CHANGED
@@ -1,334 +1,334 @@

Every line of this file is marked as removed and then re-added, but the removed and re-added contents are identical in this extract, so the underlying change is most likely whitespace- or line-ending-only. The file is shown once below.

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Tests for the retry middleware.
"""

import sys
import os
import asyncio
from unittest.mock import Mock, patch

# Add the project root directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawlo.middleware.retry import RetryMiddleware
from crawlo.stats_collector import StatsCollector
from crawlo.settings.setting_manager import SettingManager


class MockRequest:
    def __init__(self, url="http://example.com", meta=None):
        self.url = url
        self.meta = meta or {}
        self.priority = 0

    def __str__(self):
        return f"<Request {self.url}>"


class MockResponse:
    def __init__(self, status_code=200):
        self.status_code = status_code


class MockSpider:
    def __init__(self, name="test_spider"):
        self.name = name

    def __str__(self):
        return self.name


def test_retry_middleware_creation():
    """Test creating the retry middleware."""
    print("=== Testing retry middleware creation ===")

    # Create the settings manager
    settings = SettingManager()
    settings.set('RETRY_HTTP_CODES', [500, 502, 503, 504])
    settings.set('IGNORE_HTTP_CODES', [404])
    settings.set('MAX_RETRY_TIMES', 3)
    settings.set('RETRY_EXCEPTIONS', [])
    settings.set('RETRY_PRIORITY', -1)

    # Create the stats collector
    class MockCrawler:
        def __init__(self):
            self.settings = settings

    crawler = MockCrawler()
    stats = StatsCollector(crawler)
    crawler.stats = stats

    # Create the crawler object
    class MockCrawlerWithStats:
        def __init__(self):
            self.settings = settings
            self.stats = stats

    crawler_with_stats = MockCrawlerWithStats()

    # Create the retry middleware
    middleware = RetryMiddleware.create_instance(crawler_with_stats)

    # Verify the configuration
    assert middleware.retry_http_codes == [500, 502, 503, 504]
    assert middleware.ignore_http_codes == [404]
    assert middleware.max_retry_times == 3
    assert middleware.retry_priority == -1

    print("✅ Retry middleware creation test passed")
    return middleware


def test_retry_http_codes():
    """Test retries for HTTP status codes."""
    print("\n=== Testing HTTP status code retries ===")

    # Create the retry middleware
    settings = SettingManager()
    settings.set('RETRY_HTTP_CODES', [500, 502, 503, 504])
    settings.set('IGNORE_HTTP_CODES', [404])
    settings.set('MAX_RETRY_TIMES', 3)
    settings.set('RETRY_EXCEPTIONS', [])
    settings.set('RETRY_PRIORITY', -1)

    class MockCrawler:
        def __init__(self):
            self.settings = settings

    crawler = MockCrawler()
    stats = StatsCollector(crawler)
    crawler.stats = stats

    class MockCrawlerWithStats:
        def __init__(self):
            self.settings = settings
            self.stats = stats

    crawler_with_stats = MockCrawlerWithStats()
    middleware = RetryMiddleware.create_instance(crawler_with_stats)

    # Create a request and a spider
    request = MockRequest()
    spider = MockSpider()

    # Status codes that should trigger a retry
    for status_code in [500, 502, 503, 504]:
        # Create a fresh request instance for each case
        test_request = MockRequest()
        response = MockResponse(status_code)
        original_retry_times = test_request.meta.get('retry_times', 0)
        result = middleware.process_response(test_request, response, spider)

        # A retry request should be returned
        assert result is not None
        # The middleware mutates and returns the original request object,
        # so result and test_request are the same object
        assert result is test_request
        assert result.meta.get('retry_times', 0) == original_retry_times + 1
        assert result.meta.get('dont_retry', False) is True
        print(f"  ✅ Status code {status_code} retry test passed")

    # Status codes that should be ignored
    test_request = MockRequest()
    response = MockResponse(404)
    result = middleware.process_response(test_request, response, spider)

    # The original response should be returned
    assert result == response
    print("  ✅ Ignored status code 404 test passed")

    # Normal status codes
    test_request = MockRequest()
    response = MockResponse(200)
    result = middleware.process_response(test_request, response, spider)

    # The original response should be returned
    assert result == response
    print("  ✅ Normal status code 200 test passed")


def test_retry_max_times():
    """Test the maximum retry count limit."""
    print("\n=== Testing the maximum retry limit ===")

    # Create the retry middleware
    settings = SettingManager()
    settings.set('RETRY_HTTP_CODES', [500])
    settings.set('IGNORE_HTTP_CODES', [])
    settings.set('MAX_RETRY_TIMES', 2)
    settings.set('RETRY_EXCEPTIONS', [])
    settings.set('RETRY_PRIORITY', -1)

    class MockCrawler:
        def __init__(self):
            self.settings = settings

    crawler = MockCrawler()
    stats = StatsCollector(crawler)
    crawler.stats = stats

    class MockCrawlerWithStats:
        def __init__(self):
            self.settings = settings
            self.stats = stats

    crawler_with_stats = MockCrawlerWithStats()
    middleware = RetryMiddleware.create_instance(crawler_with_stats)

    # Create a request and a spider
    request = MockRequest()
    spider = MockSpider()

    # First retry
    response = MockResponse(500)
    result = middleware.process_response(request, response, spider)
    print(f"  First retry result: {result}, type: {type(result)}")
    assert result is not None
    # The middleware mutates and returns the original request object,
    # so result and request are the same object
    assert result is request
    assert result.meta.get('retry_times', 0) == 1
    print("  ✅ First retry test passed")

    # Second retry (this is the last retry; after this it should give up)
    result = middleware.process_response(request, response, spider)
    print(f"  Second retry result: {result}, type: {type(result)}")
    # Once the maximum retry count is reached, the middleware returns
    # the original response instead of a retry request
    assert result is response
    print("  ✅ Second retry test passed (maximum retries reached)")

    # Third retry (should give up and return the original response)
    # To keep the test accurate, create a brand-new request
    new_request = MockRequest()
    result = middleware.process_response(new_request, response, spider)
    print(f"  Third retry result: {result}, type: {type(result)}")
    # The new request has no retry history, so it gets retried once
    assert result is not None
    assert result is new_request
    assert result.meta.get('retry_times', 0) == 1
    print("  ✅ Third retry test passed")


def test_retry_exceptions():
    """Test retries on exceptions."""
    print("\n=== Testing exception retries ===")

    # Create the retry middleware
    settings = SettingManager()
    settings.set('RETRY_HTTP_CODES', [])
    settings.set('IGNORE_HTTP_CODES', [])
    settings.set('MAX_RETRY_TIMES', 3)
    settings.set('RETRY_EXCEPTIONS', [])
    settings.set('RETRY_PRIORITY', -1)

    class MockCrawler:
        def __init__(self):
            self.settings = settings

    crawler = MockCrawler()
    stats = StatsCollector(crawler)
    crawler.stats = stats

    class MockCrawlerWithStats:
        def __init__(self):
            self.settings = settings
            self.stats = stats

    crawler_with_stats = MockCrawlerWithStats()
    middleware = RetryMiddleware.create_instance(crawler_with_stats)

    # Create a request and a spider
    request = MockRequest()
    spider = MockSpider()

    # Test a connection-error exception
    try:
        from aiohttp.client_exceptions import ClientConnectorError
        import socket
        # Create a simulated socket error
        sock_error = socket.gaierror("test error")
        exc = ClientConnectorError(None, sock_error)
        result = middleware.process_exception(request, exc, spider)

        # A retry request should be returned
        assert result is not None
        assert result.meta.get('retry_times', 0) == 1
        assert result.meta.get('dont_retry', False) is True
        print("  ✅ ClientConnectorError exception retry test passed")
    except ImportError:
        print("  ⚠️ aiohttp not installed, skipping ClientConnectorError test")
    except Exception as e:
        print(f"  ⚠️ ClientConnectorError test raised an exception: {e}")

    # Test a timeout exception (with a fresh request object)
    new_request = MockRequest()  # create a fresh request object
    exc = asyncio.TimeoutError()
    result = middleware.process_exception(new_request, exc, spider)
    print(f"  TimeoutError test result: {result}, type: {type(result)}")

    # A retry request should be returned
    assert result is not None
    assert result.meta.get('retry_times', 0) == 1
    assert result.meta.get('dont_retry', False) is True
    print("  ✅ TimeoutError exception retry test passed")


def test_dont_retry_flag():
    """Test the dont_retry flag."""
    print("\n=== Testing the dont_retry flag ===")

    # Create the retry middleware
    settings = SettingManager()
    settings.set('RETRY_HTTP_CODES', [500])
    settings.set('IGNORE_HTTP_CODES', [])
    settings.set('MAX_RETRY_TIMES', 3)
    settings.set('RETRY_EXCEPTIONS', [])
    settings.set('RETRY_PRIORITY', -1)

    class MockCrawler:
        def __init__(self):
            self.settings = settings

    crawler = MockCrawler()
    stats = StatsCollector(crawler)
    crawler.stats = stats

    class MockCrawlerWithStats:
        def __init__(self):
            self.settings = settings
            self.stats = stats

    crawler_with_stats = MockCrawlerWithStats()
    middleware = RetryMiddleware.create_instance(crawler_with_stats)

    # Create a request carrying the dont_retry flag, plus a spider
    request = MockRequest(meta={'dont_retry': True})
    spider = MockSpider()

    # Process a response while the dont_retry flag is set
    response = MockResponse(500)
    result = middleware.process_response(request, response, spider)

    # The original response should be returned; no retry happens
    assert result == response
    print("  ✅ dont_retry flag test passed")


if __name__ == "__main__":
    print("Starting retry middleware tests...")

    try:
        # Run all tests
        middleware = test_retry_middleware_creation()
        test_retry_http_codes()
        test_retry_max_times()
        test_retry_exceptions()
        test_dont_retry_flag()

        print("\n🎉 All tests passed! The retry middleware works correctly.")

    except Exception as e:
        print(f"\n❌ Test failed: {e}")
        import traceback
        traceback.print_exc()
```
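The script is self-contained: run directly (`python tests/test_retry_middleware.py` from the repository root), it executes all five test functions and prints per-case results. Because the functions follow the `test_*` naming convention, pytest should also collect them, though note that `test_retry_middleware_creation` returns the middleware instance, which recent pytest versions flag with a return-value warning.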