crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.7-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry; it is provided for informational purposes only.
This version of crawlo has been flagged as a potentially problematic release.
- crawlo/__init__.py +90 -89
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -341
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -438
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -291
- crawlo/crawler.py +698 -657
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -276
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -247
- crawlo/downloader/httpx_downloader.py +265 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -402
- crawlo/downloader/selenium_downloader.py +486 -472
- crawlo/event.py +45 -11
- crawlo/exceptions.py +215 -82
- crawlo/extension/__init__.py +65 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -103
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +170 -153
- crawlo/filters/aioredis_filter.py +348 -264
- crawlo/filters/memory_filter.py +261 -276
- crawlo/framework.py +306 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -434
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -194
- crawlo/initialization/phases.py +230 -149
- crawlo/initialization/registry.py +143 -145
- crawlo/initialization/utils.py +49 -0
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -46
- crawlo/logging/config.py +277 -197
- crawlo/logging/factory.py +175 -171
- crawlo/logging/manager.py +104 -112
- crawlo/middleware/__init__.py +87 -24
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -253
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +375 -379
- crawlo/network/response.py +569 -664
- crawlo/pipelines/__init__.py +53 -22
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +140 -132
- crawlo/pipelines/mysql_pipeline.py +469 -476
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -156
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -525
- crawlo/queue/redis_priority_queue.py +519 -370
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -277
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +2 -4
- crawlo/templates/project/items.py.tmpl +13 -17
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -36
- crawlo/templates/project/settings.py.tmpl +109 -111
- crawlo/templates/project/settings_distributed.py.tmpl +156 -159
- crawlo/templates/project/settings_gentle.py.tmpl +170 -176
- crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
- crawlo/templates/project/settings_minimal.py.tmpl +98 -100
- crawlo/templates/project/settings_simple.py.tmpl +168 -174
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -40
- crawlo/templates/spiders_init.py.tmpl +5 -10
- crawlo/tools/__init__.py +86 -189
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +50 -50
- crawlo/utils/batch_processor.py +276 -259
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/log.py +79 -79
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +578 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -256
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/selector_helper.py +137 -137
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- crawlo/utils/{url.py → url_utils.py} +39 -39
- crawlo-1.4.7.dist-info/METADATA +689 -0
- crawlo-1.4.7.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -275
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -0
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +55 -0
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +119 -0
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo-1.4.6.dist-info/METADATA +0 -329
- crawlo-1.4.6.dist-info/RECORD +0 -361
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
tests/test_default_header_middleware.py
@@ -1,314 +1,314 @@
(The old and new sides of this hunk are identical: lines 1-313 were removed and re-added unchanged, with line 314 as shared context. The re-added lines are shown once below.)
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+DefaultHeaderMiddleware test file
+Tests the default request-header middleware, including the random header rotation feature.
+"""
+
+import unittest
+from unittest.mock import Mock, patch
+
+from crawlo.middleware.default_header import DefaultHeaderMiddleware
+from crawlo.settings.setting_manager import SettingManager
+from crawlo.exceptions import NotConfiguredError
+
+
+class MockLogger:
+    """Mock logger used to capture log output in tests"""
+    def __init__(self, name, level=None):
+        self.name = name
+        self.level = level
+        self.logs = []
+
+    def debug(self, msg):
+        self.logs.append(('debug', msg))
+
+    def info(self, msg):
+        self.logs.append(('info', msg))
+
+    def warning(self, msg):
+        self.logs.append(('warning', msg))
+
+    def error(self, msg):
+        self.logs.append(('error', msg))
+
+    def isEnabledFor(self, level):
+        return True
+
+
+class TestDefaultHeaderMiddleware(unittest.TestCase):
+    """Test suite for DefaultHeaderMiddleware"""
+
+    def setUp(self):
+        """Per-test setup"""
+        # Create the settings manager
+        self.settings = SettingManager()
+
+    def test_middleware_initialization_without_config(self):
+        """Initialization fails when nothing is configured"""
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            # Should raise NotConfiguredError
+            with self.assertRaises(NotConfiguredError) as context:
+                DefaultHeaderMiddleware.create_instance(crawler)
+
+            self.assertIn("未配置DEFAULT_REQUEST_HEADERS、USER_AGENT或随机头部配置,DefaultHeaderMiddleware已禁用", str(context.exception))
+
+    def test_middleware_initialization_with_default_headers(self):
+        """Initialization with DEFAULT_REQUEST_HEADERS configured"""
+        # Set the default request headers
+        self.settings.set('DEFAULT_REQUEST_HEADERS', {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate',
+        })
+        self.settings.set('LOG_LEVEL', 'DEBUG')
+
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            # The instance should be created successfully
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            self.assertIsInstance(middleware, DefaultHeaderMiddleware)
+            self.assertEqual(len(middleware.headers), 3)
+            self.assertIn('Accept', middleware.headers)
+            self.assertIn('Accept-Language', middleware.headers)
+            self.assertIn('Accept-Encoding', middleware.headers)
+
+    def test_middleware_initialization_with_user_agent(self):
+        """Initialization with USER_AGENT configured"""
+        # Set the User-Agent
+        self.settings.set('USER_AGENT', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
+        self.settings.set('LOG_LEVEL', 'DEBUG')
+
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            # The instance should be created successfully
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            self.assertIsInstance(middleware, DefaultHeaderMiddleware)
+            self.assertIn('User-Agent', middleware.headers)
+            self.assertEqual(middleware.headers['User-Agent'], 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
+
+    def test_middleware_initialization_with_random_user_agent_enabled(self):
+        """Initialization with random User-Agent enabled"""
+        # Enable random User-Agent and provide one agent
+        self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
+        self.settings.set('USER_AGENTS', ['Test-Agent/1.0'])  # one User-Agent so the init check passes
+        self.settings.set('LOG_LEVEL', 'DEBUG')
+
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            # The instance should be created successfully, using the built-in User-Agent list
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            self.assertIsInstance(middleware, DefaultHeaderMiddleware)
+            self.assertTrue(middleware.random_user_agent_enabled)
+            # Note: user_agents is overridden by get_user_agents here, so its length may not be 1
+
+    def test_middleware_initialization_with_custom_user_agents(self):
+        """Initialization with a custom User-Agent list"""
+        # Set a custom User-Agent list
+        custom_user_agents = [
+            'Custom-Agent/1.0',
+            'Custom-Agent/2.0',
+            'Custom-Agent/3.0'
+        ]
+        self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
+        self.settings.set('USER_AGENTS', custom_user_agents)
+        self.settings.set('LOG_LEVEL', 'DEBUG')
+
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            # The instance should be created successfully, using the custom User-Agent list
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            self.assertIsInstance(middleware, DefaultHeaderMiddleware)
+            self.assertTrue(middleware.random_user_agent_enabled)
+            self.assertEqual(middleware.user_agents, custom_user_agents)
+
+    def test_process_request_with_default_headers(self):
+        """process_request adds the default request headers"""
+        # Set the default request headers
+        self.settings.set('DEFAULT_REQUEST_HEADERS', {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+        })
+        self.settings.set('LOG_LEVEL', 'DEBUG')
+
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            # Create a request
+            request = Mock()
+            request.headers = {}
+            request.url = 'https://example.com'
+
+            spider = Mock()
+
+            # Process the request
+            middleware.process_request(request, spider)
+
+            # Check that the default headers were added
+            self.assertIn('Accept', request.headers)
+            self.assertIn('Accept-Language', request.headers)
+            self.assertEqual(request.headers['Accept'], 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
+            self.assertEqual(request.headers['Accept-Language'], 'en-US,en;q=0.5')
+
+    def test_process_request_with_existing_headers(self):
+        """process_request leaves pre-existing headers untouched"""
+        # Set the default request headers
+        self.settings.set('DEFAULT_REQUEST_HEADERS', {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+        })
+        self.settings.set('LOG_LEVEL', 'DEBUG')
+
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            # Create a request that already carries a header
+            request = Mock()
+            request.headers = {
+                'Accept': 'application/json',  # pre-existing header
+            }
+            request.url = 'https://example.com'
+
+            spider = Mock()
+
+            # Process the request
+            middleware.process_request(request, spider)
+
+            # Existing headers are not overwritten; missing ones are added
+            self.assertEqual(request.headers['Accept'], 'application/json')  # original value kept
+            self.assertIn('Accept-Language', request.headers)  # newly added header
+
+    def test_process_request_with_random_user_agent(self):
+        """process_request adds a random User-Agent"""
+        # Enable random User-Agent and set a custom list
+        custom_user_agents = [
+            'Custom-Agent/1.0',
+            'Custom-Agent/2.0',
+            'Custom-Agent/3.0'
+        ]
+        self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
+        self.settings.set('USER_AGENTS', custom_user_agents)
+        self.settings.set('LOG_LEVEL', 'DEBUG')
+
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            # Create a request without a User-Agent
+            request = Mock()
+            request.headers = {}
+            request.url = 'https://example.com'
+
+            spider = Mock()
+
+            # Process the request
+            middleware.process_request(request, spider)
+
+            # Check that a random User-Agent was added
+            self.assertIn('User-Agent', request.headers)
+            self.assertIn(request.headers['User-Agent'], custom_user_agents)
+
+    def test_process_request_with_existing_user_agent(self):
+        """process_request keeps an existing User-Agent"""
+        # Enable random User-Agent and set a custom list
+        custom_user_agents = [
+            'Custom-Agent/1.0',
+            'Custom-Agent/2.0',
+            'Custom-Agent/3.0'
+        ]
+        self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
+        self.settings.set('USER_AGENTS', custom_user_agents)
+        self.settings.set('LOG_LEVEL', 'DEBUG')
+
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            # Create a request that already has a User-Agent
+            existing_ua = 'Existing-Agent/1.0'
+            request = Mock()
+            request.headers = {
+                'User-Agent': existing_ua,
+            }
+            request.url = 'https://example.com'
+
+            spider = Mock()
+
+            # Process the request
+            middleware.process_request(request, spider)
+
+            # The existing User-Agent must not be overwritten
+            self.assertEqual(request.headers['User-Agent'], existing_ua)
+
+    def test_get_random_user_agent(self):
+        """_get_random_user_agent returns an agent from the configured list"""
+        # Set a custom User-Agent list
+        custom_user_agents = [
+            'Custom-Agent/1.0',
+            'Custom-Agent/2.0',
+            'Custom-Agent/3.0'
+        ]
+        self.settings.set('RANDOM_USER_AGENT_ENABLED', True)
+        self.settings.set('USER_AGENTS', custom_user_agents)
+        self.settings.set('LOG_LEVEL', 'DEBUG')
+
+        # Create a mock crawler object
+        crawler = Mock()
+        crawler.settings = self.settings
+
+        logger = MockLogger('DefaultHeaderMiddleware')
+        with patch('crawlo.middleware.default_header.get_logger', return_value=logger):
+            middleware = DefaultHeaderMiddleware.create_instance(crawler)
+
+            # Get a random User-Agent
+            random_ua = middleware._get_random_user_agent()
+
+            # Check that the returned User-Agent is in the list
+            self.assertIn(random_ua, custom_user_agents)
+
+
+if __name__ == '__main__':
     unittest.main()
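Read as documentation, these tests pin down the middleware's contract: DefaultHeaderMiddleware.create_instance(crawler) raises NotConfiguredError unless at least one of DEFAULT_REQUEST_HEADERS, USER_AGENT, or the random-User-Agent settings is configured, and process_request(request, spider) only fills in headers the request does not already carry. A minimal configuration sketch built from just the setting keys exercised above; the header values and agent names are illustrative, not taken from the package:

    # Hypothetical project settings fragment driving DefaultHeaderMiddleware.
    # Only the keys exercised by the tests above are used; values are illustrative.
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    RANDOM_USER_AGENT_ENABLED = True   # rotate the User-Agent header per request
    USER_AGENTS = [                    # pool sampled by _get_random_user_agent()
        'Custom-Agent/1.0',
        'Custom-Agent/2.0',
    ]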