crawlo-1.2.6-py3-none-any.whl → crawlo-1.2.8-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__init__.py +61 -61
- crawlo/__version__.py +1 -1
- crawlo/cleaners/__init__.py +60 -60
- crawlo/cleaners/data_formatter.py +225 -225
- crawlo/cleaners/encoding_converter.py +125 -125
- crawlo/cleaners/text_cleaner.py +232 -232
- crawlo/cli.py +75 -88
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -144
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +323 -323
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +312 -312
- crawlo/config_validator.py +251 -251
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +365 -356
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +251 -239
- crawlo/crawler.py +1099 -1110
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +107 -107
- crawlo/downloader/__init__.py +266 -266
- crawlo/downloader/aiohttp_downloader.py +228 -221
- crawlo/downloader/cffi_downloader.py +256 -256
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +39 -38
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +43 -43
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +234 -234
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +131 -131
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +136 -135
- crawlo/middleware/offsite.py +114 -114
- crawlo/middleware/proxy.py +367 -367
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +211 -211
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +338 -338
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +317 -317
- crawlo/pipelines/pipeline_manager.py +62 -61
- crawlo/pipelines/redis_dedup_pipeline.py +166 -165
- crawlo/project.py +314 -279
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +377 -376
- crawlo/queue/redis_priority_queue.py +306 -306
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +219 -215
- crawlo/settings/setting_manager.py +122 -122
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +288 -288
- crawlo/templates/project/settings_distributed.py.tmpl +157 -157
- crawlo/templates/project/settings_gentle.py.tmpl +100 -100
- crawlo/templates/project/settings_high_performance.py.tmpl +134 -134
- crawlo/templates/project/settings_simple.py.tmpl +98 -98
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/run.py.tmpl +47 -45
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/tools/__init__.py +182 -182
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +35 -35
- crawlo/tools/distributed_coordinator.py +386 -386
- crawlo/tools/retry_mechanism.py +220 -220
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/utils/__init__.py +35 -35
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/date_tools.py +290 -290
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +143 -106
- crawlo/utils/error_handler.py +123 -123
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +128 -128
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +351 -351
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +218 -218
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.2.6.dist-info → crawlo-1.2.8.dist-info}/METADATA +764 -764
- crawlo-1.2.8.dist-info/RECORD +209 -0
- examples/__init__.py +7 -7
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +236 -236
- tests/cleaners_example.py +160 -160
- tests/config_validation_demo.py +102 -102
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/redis_key_validation_demo.py +130 -130
- tests/response_improvements_example.py +144 -144
- tests/test_advanced_tools.py +148 -148
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_cleaners.py +54 -54
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +81 -0
- tests/test_config_validator.py +193 -193
- tests/test_crawlo_proxy_integration.py +172 -172
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_integration.py +356 -356
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_mode_consistency.py +52 -0
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +252 -241
- tests/test_scheduler_config_update.py +134 -0
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +153 -153
- tests/tools_example.py +257 -257
- crawlo-1.2.6.dist-info/RECORD +0 -206
- {crawlo-1.2.6.dist-info → crawlo-1.2.8.dist-info}/WHEEL +0 -0
- {crawlo-1.2.6.dist-info → crawlo-1.2.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.2.6.dist-info → crawlo-1.2.8.dist-info}/top_level.txt +0 -0
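The file-level comparison above can be reproduced locally by downloading both wheels and diffing their archive listings, since wheels are plain zip archives. A minimal sketch, assuming pip is on PATH and both releases are still downloadable from PyPI:

```python
# Sketch: reproduce the added/removed-file portion of this diff locally.
# Assumes `pip` is available and crawlo 1.2.6 / 1.2.8 remain on PyPI.
import pathlib
import subprocess
import tempfile
import zipfile


def wheel_names(version: str, dest: pathlib.Path) -> set[str]:
    """Download one crawlo wheel and return the set of file paths inside it."""
    subprocess.run(
        ['pip', 'download', f'crawlo=={version}', '--no-deps', '-d', str(dest)],
        check=True,
    )
    wheel = next(dest.glob(f'crawlo-{version}-*.whl'))
    return set(zipfile.ZipFile(wheel).namelist())


with tempfile.TemporaryDirectory() as tmp:
    base = pathlib.Path(tmp)
    old = wheel_names('1.2.6', base / 'old')
    new = wheel_names('1.2.8', base / 'new')
    print('added:', sorted(new - old))      # e.g. new test modules
    print('removed:', sorted(old - new))    # e.g. the old RECORD
```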
@@ -1,159 +1,159 @@

This hunk removes and re-adds lines 1-158 of tests/test_default_header_middleware.py with identical visible content (only line 159, unittest.main(), appears as unchanged context), indicating a whitespace- or line-ending-only rewrite. The file, shown once:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
DefaultHeaderMiddleware test file
Tests the functionality of the default request-header middleware
"""

import unittest
from unittest.mock import Mock, patch

from crawlo.middleware.default_header import DefaultHeaderMiddleware
from crawlo.exceptions import NotConfiguredError
from crawlo.settings.setting_manager import SettingManager


class MockLogger:
    """Mock logger class used to capture log output in tests"""
    def __init__(self, name, level=None):
        self.name = name
        self.level = level
        self.logs = []

    def debug(self, msg):
        self.logs.append(('debug', msg))

    def info(self, msg):
        self.logs.append(('info', msg))

    def warning(self, msg):
        self.logs.append(('warning', msg))

    def error(self, msg):
        self.logs.append(('error', msg))


class TestDefaultHeaderMiddleware(unittest.TestCase):
    """Test suite for DefaultHeaderMiddleware"""

    def setUp(self):
        """Prepare test fixtures"""
        # Create the settings manager
        self.settings = SettingManager()

        # Create a mock crawler object
        self.crawler = Mock()
        self.crawler.settings = self.settings

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization_without_config(self, mock_get_logger):
        """Middleware initialization with no configuration (defaults cleared)"""
        # Clear the default header configuration
        self.settings.set('DEFAULT_REQUEST_HEADERS', {})
        self.settings.set('USER_AGENT', None)
        self.settings.set('USER_AGENTS', [])
        self.settings.set('RANDOM_HEADERS', {})
        self.settings.set('LOG_LEVEL', 'INFO')

        mock_get_logger.return_value = MockLogger('DefaultHeaderMiddleware')

        # Should raise NotConfiguredError
        with self.assertRaises(NotConfiguredError):
            DefaultHeaderMiddleware.create_instance(self.crawler)

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization_with_default_headers(self, mock_get_logger):
        """Middleware initialization with default request headers configured"""
        # Set default request headers
        self.settings.set('DEFAULT_REQUEST_HEADERS', {
            'User-Agent': 'Test-Agent',
            'Accept': 'text/html'
        })
        self.settings.set('LOG_LEVEL', 'INFO')

        mock_get_logger.return_value = MockLogger('DefaultHeaderMiddleware')

        # The instance should be created normally
        middleware = DefaultHeaderMiddleware.create_instance(self.crawler)
        self.assertIsInstance(middleware, DefaultHeaderMiddleware)
        self.assertIn('User-Agent', middleware.headers)
        self.assertIn('Accept', middleware.headers)

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization_with_user_agent(self, mock_get_logger):
        """Middleware initialization with USER_AGENT configured"""
        # Clear the default header configuration
        self.settings.set('DEFAULT_REQUEST_HEADERS', {})
        # Set the User-Agent
        self.settings.set('USER_AGENT', 'Custom-Agent')
        self.settings.set('LOG_LEVEL', 'INFO')

        mock_get_logger.return_value = MockLogger('DefaultHeaderMiddleware')

        # The instance should be created normally
        middleware = DefaultHeaderMiddleware.create_instance(self.crawler)
        self.assertIsInstance(middleware, DefaultHeaderMiddleware)
        self.assertIn('User-Agent', middleware.headers)
        self.assertEqual(middleware.headers['User-Agent'], 'Custom-Agent')

    @patch('crawlo.utils.log.get_logger')
    def test_process_request_with_default_headers(self, mock_get_logger):
        """process_request adds the default request headers"""
        # Set default request headers
        self.settings.set('DEFAULT_REQUEST_HEADERS', {
            'User-Agent': 'Test-Agent',
            'Accept': 'text/html'
        })
        self.settings.set('LOG_LEVEL', 'DEBUG')

        mock_logger = MockLogger('DefaultHeaderMiddleware')
        mock_get_logger.return_value = mock_logger

        middleware = DefaultHeaderMiddleware.create_instance(self.crawler)

        # Create a request object
        request = Mock()
        request.headers = {}
        request.url = 'http://example.com'

        # Process the request
        middleware.process_request(request, Mock())

        # Verify the headers were added
        self.assertIn('User-Agent', request.headers)
        self.assertEqual(request.headers['User-Agent'], 'Test-Agent')
        self.assertIn('Accept', request.headers)
        self.assertEqual(request.headers['Accept'], 'text/html')

    @patch('crawlo.utils.log.get_logger')
    def test_process_request_without_overwriting_existing_headers(self, mock_get_logger):
        """process_request does not overwrite headers already on the request"""
        # Set default request headers
        self.settings.set('DEFAULT_REQUEST_HEADERS', {
            'User-Agent': 'Test-Agent',
            'Accept': 'text/html'
        })
        self.settings.set('LOG_LEVEL', 'DEBUG')

        mock_logger = MockLogger('DefaultHeaderMiddleware')
        mock_get_logger.return_value = mock_logger

        middleware = DefaultHeaderMiddleware.create_instance(self.crawler)

        # Create a request that already carries a User-Agent
        request = Mock()
        request.headers = {'User-Agent': 'Existing-Agent'}
        request.url = 'http://example.com'

        # Process the request
        middleware.process_request(request, Mock())

        # Verify the existing header was not overwritten
        self.assertEqual(request.headers['User-Agent'], 'Existing-Agent')
        # Verify the other headers were added
        self.assertIn('Accept', request.headers)
        self.assertEqual(request.headers['Accept'], 'text/html')


if __name__ == '__main__':
    unittest.main()
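The behavior these tests pin down is a non-destructive header merge: values from DEFAULT_REQUEST_HEADERS are applied only where the request does not already set the key, which is why 'Existing-Agent' survives while 'Accept' is filled in. A minimal self-contained sketch of that merge rule, assuming plain dict headers (apply_default_headers is an illustrative helper, not crawlo's API):

```python
# Sketch of the merge rule the tests above assert, assuming plain dict
# headers. apply_default_headers is a hypothetical helper, not part of
# crawlo's API.
DEFAULT_REQUEST_HEADERS = {'User-Agent': 'Test-Agent', 'Accept': 'text/html'}


def apply_default_headers(headers: dict, defaults: dict) -> dict:
    """Copy defaults into headers without overwriting keys already set."""
    for key, value in defaults.items():
        headers.setdefault(key, value)
    return headers


# A request that already sets User-Agent keeps it; missing keys get defaults.
request_headers = {'User-Agent': 'Existing-Agent'}
apply_default_headers(request_headers, DEFAULT_REQUEST_HEADERS)
assert request_headers == {'User-Agent': 'Existing-Agent', 'Accept': 'text/html'}
```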