crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +90 -89
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -341
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -438
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -291
- crawlo/crawler.py +698 -657
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -276
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -247
- crawlo/downloader/httpx_downloader.py +265 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -402
- crawlo/downloader/selenium_downloader.py +486 -472
- crawlo/event.py +45 -11
- crawlo/exceptions.py +215 -82
- crawlo/extension/__init__.py +65 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -103
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +170 -153
- crawlo/filters/aioredis_filter.py +348 -264
- crawlo/filters/memory_filter.py +261 -276
- crawlo/framework.py +306 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -434
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -194
- crawlo/initialization/phases.py +230 -149
- crawlo/initialization/registry.py +143 -145
- crawlo/initialization/utils.py +49 -0
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -46
- crawlo/logging/config.py +277 -197
- crawlo/logging/factory.py +175 -171
- crawlo/logging/manager.py +104 -112
- crawlo/middleware/__init__.py +87 -24
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -253
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +375 -379
- crawlo/network/response.py +569 -664
- crawlo/pipelines/__init__.py +53 -22
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +140 -132
- crawlo/pipelines/mysql_pipeline.py +469 -476
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -156
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -525
- crawlo/queue/redis_priority_queue.py +519 -370
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -277
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +2 -4
- crawlo/templates/project/items.py.tmpl +13 -17
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -36
- crawlo/templates/project/settings.py.tmpl +109 -111
- crawlo/templates/project/settings_distributed.py.tmpl +156 -159
- crawlo/templates/project/settings_gentle.py.tmpl +170 -176
- crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
- crawlo/templates/project/settings_minimal.py.tmpl +98 -100
- crawlo/templates/project/settings_simple.py.tmpl +168 -174
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -40
- crawlo/templates/spiders_init.py.tmpl +5 -10
- crawlo/tools/__init__.py +86 -189
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +50 -50
- crawlo/utils/batch_processor.py +276 -259
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/log.py +79 -79
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +578 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -256
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/selector_helper.py +137 -137
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- crawlo/utils/{url.py → url_utils.py} +39 -39
- crawlo-1.4.7.dist-info/METADATA +689 -0
- crawlo-1.4.7.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -275
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -0
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +55 -0
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +119 -0
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo-1.4.6.dist-info/METADATA +0 -329
- crawlo-1.4.6.dist-info/RECORD +0 -361
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
tests/test_proxy_middleware.py
CHANGED
@@ -1,218 +1,218 @@

Every line of the file (1-217) is removed and re-added in this hunk, and the removed and added lines render identically; only the final line 218 (unittest.main()) appears as unchanged context. The file after the change:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
ProxyMiddleware test file
Used to test the functionality of the proxy middleware
"""

import asyncio
import unittest
from unittest.mock import Mock, patch

from crawlo.middleware.proxy import ProxyMiddleware
from crawlo.exceptions import NotConfiguredError
from crawlo.settings.setting_manager import SettingManager


class MockLogger:
    """Mock logger class used to capture log output in tests"""
    def __init__(self, name, level=None):
        self.name = name
        self.level = level
        self.logs = []

    def debug(self, msg):
        self.logs.append(('debug', msg))

    def info(self, msg):
        self.logs.append(('info', msg))

    def warning(self, msg):
        self.logs.append(('warning', msg))

    def error(self, msg):
        self.logs.append(('error', msg))


class TestProxyMiddleware(unittest.TestCase):
    """ProxyMiddleware test case"""

    def setUp(self):
        """Prepare test fixtures"""
        # Create the settings manager
        self.settings = SettingManager()

        # Create a mock crawler object
        self.crawler = Mock()
        self.crawler.settings = self.settings

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization_without_api_url(self, mock_get_logger):
        """Middleware initialization when no API URL is configured"""
        # PROXY_ENABLED is no longer needed; leaving PROXY_API_URL unset disables the middleware
        self.settings.set('PROXY_API_URL', None)
        self.settings.set('LOG_LEVEL', 'INFO')

        mock_get_logger.return_value = MockLogger('ProxyMiddleware')

        # The instance should still be created, but disabled
        middleware = ProxyMiddleware.create_instance(self.crawler)
        self.assertIsInstance(middleware, ProxyMiddleware)
        self.assertFalse(middleware.enabled)

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization_with_disabled_proxy(self, mock_get_logger):
        """Middleware initialization when the proxy is disabled"""
        # PROXY_ENABLED is no longer needed; leaving PROXY_API_URL unset disables the middleware
        self.settings.set('PROXY_API_URL', None)
        self.settings.set('LOG_LEVEL', 'INFO')

        mock_get_logger.return_value = MockLogger('ProxyMiddleware')

        # The instance should still be created, but disabled
        middleware = ProxyMiddleware.create_instance(self.crawler)
        self.assertIsInstance(middleware, ProxyMiddleware)
        self.assertFalse(middleware.enabled)

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization_with_api_url(self, mock_get_logger):
        """Middleware initialization when an API URL is configured"""
        # PROXY_ENABLED is no longer needed; configuring PROXY_API_URL enables the middleware
        self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
        self.settings.set('LOG_LEVEL', 'INFO')

        mock_get_logger.return_value = MockLogger('ProxyMiddleware')

        # The instance should be created and enabled
        middleware = ProxyMiddleware.create_instance(self.crawler)
        self.assertIsInstance(middleware, ProxyMiddleware)
        self.assertTrue(middleware.enabled)
        self.assertEqual(middleware.api_url, 'http://proxy-api.example.com')

    def test_middleware_initialization(self):
        """Basic middleware initialization"""
        # Configure the proxy API URL so the middleware is enabled
        self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
        middleware = ProxyMiddleware(self.settings, "DEBUG")
        self.assertIsInstance(middleware, ProxyMiddleware)
        self.assertTrue(middleware.enabled)
        self.assertEqual(middleware.api_url, 'http://proxy-api.example.com')

    def test_middleware_enabled_with_api_url(self):
        """Middleware is enabled when a proxy API URL is configured"""
        self.settings.set('PROXY_API_URL', 'http://proxy-api.example.com')
        # Explicitly setting PROXY_ENABLED = True is no longer required
        middleware = ProxyMiddleware(self.settings, "DEBUG")
        self.assertTrue(middleware.enabled)
        self.assertEqual(middleware.api_url, 'http://proxy-api.example.com')

    def test_middleware_disabled_without_api_url(self):
        """Middleware is disabled when no proxy API URL is configured"""
        # PROXY_API_URL is unset or empty
        self.settings.set('PROXY_API_URL', '')
        middleware = ProxyMiddleware(self.settings, "DEBUG")
        self.assertFalse(middleware.enabled)

    def test_middleware_disabled_explicitly(self):
        """Middleware disabled explicitly (by not configuring the API URL)"""
        # PROXY_API_URL is not configured
        middleware = ProxyMiddleware(self.settings, "DEBUG")
        self.assertFalse(middleware.enabled)

    def test_is_https_with_https_url(self):
        """HTTPS URL detection"""
        # Create the middleware instance
        middleware = ProxyMiddleware(
            settings=self.settings,
            log_level='INFO'
        )

        # Create a request object
        request = Mock()
        request.url = 'https://example.com/page'

        # Should return True
        self.assertTrue(middleware._is_https(request))

    def test_is_https_with_http_url(self):
        """HTTP URL detection"""
        # Create the middleware instance
        middleware = ProxyMiddleware(
            settings=self.settings,
            log_level='INFO'
        )

        # Create a request object
        request = Mock()
        request.url = 'http://example.com/page'

        # Should return False
        self.assertFalse(middleware._is_https(request))

    def test_proxy_extractor_field(self):
        """Extraction by field name"""
        self.settings.set('PROXY_API_URL', 'http://test.api/proxy')
        self.settings.set('PROXY_EXTRACTOR', 'data')  # extract from the 'data' field

        middleware = ProxyMiddleware(self.settings, "DEBUG")
        self.assertEqual(middleware.proxy_extractor, 'data')

        # Test the extraction logic
        data = {'data': 'http://proxy-from-data:8080'}
        proxy = middleware._extract_proxy_from_data(data)
        self.assertEqual(proxy, 'http://proxy-from-data:8080')

    def test_proxy_extractor_dict_field(self):
        """Extraction via a dict-style field specification"""
        self.settings.set('PROXY_API_URL', 'http://test.api/proxy')
        self.settings.set('PROXY_EXTRACTOR', {'type': 'field', 'value': 'result'})

        middleware = ProxyMiddleware(self.settings, "DEBUG")
        self.assertEqual(middleware.proxy_extractor['type'], 'field')
        self.assertEqual(middleware.proxy_extractor['value'], 'result')

        # Test the extraction logic
        data = {'result': 'http://proxy-from-result:8080'}
        proxy = middleware._extract_proxy_from_data(data)
        self.assertEqual(proxy, 'http://proxy-from-result:8080')

    def test_proxy_extractor_custom_function(self):
        """Extraction via a custom function"""
        def custom_extractor(data):
            return data.get('custom_proxy')

        self.settings.set('PROXY_API_URL', 'http://test.api/proxy')
        self.settings.set('PROXY_EXTRACTOR', {'type': 'custom', 'function': custom_extractor})

        middleware = ProxyMiddleware(self.settings, "DEBUG")

        # Test the extraction logic
        data = {'custom_proxy': 'http://proxy-from-custom:8080'}
        proxy = middleware._extract_proxy_from_data(data)
        self.assertEqual(proxy, 'http://proxy-from-custom:8080')

    def test_proxy_extractor_callable(self):
        """Extraction via a callable passed directly"""
        def direct_extractor(data):
            return data.get('direct_proxy')

        self.settings.set('PROXY_API_URL', 'http://test.api/proxy')
        self.settings.set('PROXY_EXTRACTOR', direct_extractor)

        middleware = ProxyMiddleware(self.settings, "DEBUG")

        # Test the extraction logic
        data = {'direct_proxy': 'http://proxy-from-direct:8080'}
        proxy = middleware._extract_proxy_from_data(data)
        self.assertEqual(proxy, 'http://proxy-from-direct:8080')

    def test_middleware_disabled_without_list(self):
        """Middleware is disabled when no proxy list is configured"""
        # PROXY_LIST is unset or set to an empty list
        self.settings.set('PROXY_LIST', [])
        from crawlo.middleware.proxy import ProxyMiddleware
        middleware = ProxyMiddleware(self.settings, "DEBUG")
        self.assertFalse(middleware.enabled)


if __name__ == '__main__':
    unittest.main()
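The tests above document how ProxyMiddleware is now switched on and configured: the middleware is enabled whenever PROXY_API_URL is set (no separate PROXY_ENABLED flag), and PROXY_EXTRACTOR controls how the proxy address is pulled out of the API response, either as a field name, a {'type': ..., 'value': ...} dict, or a callable. As an illustrative sketch only, a project settings module might wire this up as follows; the setting names come from the tests above, while the endpoint URL and the 'proxy' response field are placeholder assumptions, not part of the crawlo distribution.

# settings.py -- hedged sketch based on the test file above
# Setting PROXY_API_URL enables ProxyMiddleware; the URL itself is a placeholder.
PROXY_API_URL = 'http://proxy-api.example.com/get'

# Assumes the proxy API answers with JSON such as {"proxy": "http://1.2.3.4:8080"}.
# PROXY_EXTRACTOR names the field to read; the tests also show dict and callable forms.
PROXY_EXTRACTOR = 'proxy'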