crawlo-1.3.6-py3-none-any.whl → crawlo-1.3.7-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Potentially problematic release: this version of crawlo may be problematic.
- crawlo/__init__.py +87 -87
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -341
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +45 -45
- crawlo/core/engine.py +439 -439
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +257 -257
- crawlo/crawler.py +638 -638
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +228 -228
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +39 -39
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +61 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +103 -103
- crawlo/factories/registry.py +84 -84
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -257
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -292
- crawlo/initialization/__init__.py +39 -39
- crawlo/initialization/built_in.py +425 -425
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +193 -193
- crawlo/initialization/phases.py +148 -148
- crawlo/initialization/registry.py +145 -145
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +37 -37
- crawlo/logging/config.py +96 -96
- crawlo/logging/factory.py +128 -128
- crawlo/logging/manager.py +111 -111
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +212 -212
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +325 -325
- crawlo/pipelines/pipeline_manager.py +76 -76
- crawlo/pipelines/redis_dedup_pipeline.py +166 -166
- crawlo/project.py +327 -327
- crawlo/queue/pqueue.py +42 -42
- crawlo/queue/queue_manager.py +522 -503
- crawlo/queue/redis_priority_queue.py +367 -326
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +321 -321
- crawlo/settings/setting_manager.py +214 -214
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +73 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +170 -167
- crawlo/templates/project/settings_distributed.py.tmpl +169 -166
- crawlo/templates/project/settings_gentle.py.tmpl +166 -166
- crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
- crawlo/templates/project/settings_minimal.py.tmpl +65 -65
- crawlo/templates/project/settings_simple.py.tmpl +164 -164
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +34 -34
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +9 -9
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +388 -388
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +364 -364
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +25 -25
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -165
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +79 -79
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.3.6.dist-info → crawlo-1.3.7.dist-info}/METADATA +1199 -1126
- crawlo-1.3.7.dist-info/RECORD +292 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +106 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +245 -245
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +127 -127
- tests/simple_log_test.py +57 -57
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_spider_test.py +49 -49
- tests/simple_test.py +47 -47
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +174 -174
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +66 -66
- tests/test_framework_startup.py +64 -64
- tests/test_get_component_logger.py +83 -83
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +112 -112
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_system.py +282 -282
- tests/test_mode_change.py +72 -72
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -176
- tests/test_queue_naming.py +155 -0
- tests/test_queue_type.py +106 -106
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +176 -0
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- crawlo-1.3.6.dist-info/RECORD +0 -290
- {crawlo-1.3.6.dist-info → crawlo-1.3.7.dist-info}/WHEEL +0 -0
- {crawlo-1.3.6.dist-info → crawlo-1.3.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.6.dist-info → crawlo-1.3.7.dist-info}/top_level.txt +0 -0
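Judging from the counts above, almost every file shows equal numbers of removed and added lines, which is consistent with a whole-file rewrite (for example, a line-ending or packaging change) rather than a code change. The substantive edits are concentrated in crawlo/queue/queue_manager.py (+522 -503), crawlo/queue/redis_priority_queue.py (+367 -326), two settings templates, a handful of queue-related tests, and two new test files: tests/test_queue_naming.py and tests/test_redis_queue_name_fix.py. The sketch below is a minimal way to confirm a no-op rewrite locally; it assumes both wheels have been downloaded into the working directory and uses only the standard library (the member path is taken from the listing above):

import zipfile

OLD_WHEEL = "crawlo-1.3.6-py3-none-any.whl"
NEW_WHEEL = "crawlo-1.3.7-py3-none-any.whl"
MEMBER = "tests/test_distributed.py"  # any file listed with equal +/- counts

with zipfile.ZipFile(OLD_WHEEL) as old, zipfile.ZipFile(NEW_WHEEL) as new:
    before = old.read(MEMBER)
    after = new.read(MEMBER)

print("byte-identical:", before == after)

# If only line endings differ, normalizing CRLF to LF makes the two match.
def normalize(data):
    return data.replace(b"\r\n", b"\n")

print("identical after EOL normalization:", normalize(before) == normalize(after))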
tests/test_default_header_middleware.py CHANGED

@@ -1,159 +1,159 @@

Lines 1-158 were removed and re-added with identical content (a whole-file rewrite with no textual change; only line 159, unittest.main(), is shared context). The file, with its Chinese comments and docstrings translated:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
Test file for DefaultHeaderMiddleware
Exercises the default request header middleware
"""

import unittest
from unittest.mock import Mock, patch

from crawlo.middleware.default_header import DefaultHeaderMiddleware
from crawlo.exceptions import NotConfiguredError
from crawlo.settings.setting_manager import SettingManager


class MockLogger:
    """Mock logger class used to capture log output in tests"""
    def __init__(self, name, level=None):
        self.name = name
        self.level = level
        self.logs = []

    def debug(self, msg):
        self.logs.append(('debug', msg))

    def info(self, msg):
        self.logs.append(('info', msg))

    def warning(self, msg):
        self.logs.append(('warning', msg))

    def error(self, msg):
        self.logs.append(('error', msg))


class TestDefaultHeaderMiddleware(unittest.TestCase):
    """Test class for DefaultHeaderMiddleware"""

    def setUp(self):
        """Set up test fixtures"""
        # Create the settings manager
        self.settings = SettingManager()

        # Create a mock crawler object
        self.crawler = Mock()
        self.crawler.settings = self.settings

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization_without_config(self, mock_get_logger):
        """Middleware initialization with no configuration (defaults cleared)"""
        # Clear the default header configuration
        self.settings.set('DEFAULT_REQUEST_HEADERS', {})
        self.settings.set('USER_AGENT', None)
        self.settings.set('USER_AGENTS', [])
        self.settings.set('RANDOM_HEADERS', {})
        self.settings.set('LOG_LEVEL', 'INFO')

        mock_get_logger.return_value = MockLogger('DefaultHeaderMiddleware')

        # Should raise NotConfiguredError
        with self.assertRaises(NotConfiguredError):
            DefaultHeaderMiddleware.create_instance(self.crawler)

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization_with_default_headers(self, mock_get_logger):
        """Middleware initialization with default request headers configured"""
        # Set the default request headers
        self.settings.set('DEFAULT_REQUEST_HEADERS', {
            'User-Agent': 'Test-Agent',
            'Accept': 'text/html'
        })
        self.settings.set('LOG_LEVEL', 'INFO')

        mock_get_logger.return_value = MockLogger('DefaultHeaderMiddleware')

        # The instance should be created successfully
        middleware = DefaultHeaderMiddleware.create_instance(self.crawler)
        self.assertIsInstance(middleware, DefaultHeaderMiddleware)
        self.assertIn('User-Agent', middleware.headers)
        self.assertIn('Accept', middleware.headers)

    @patch('crawlo.utils.log.get_logger')
    def test_middleware_initialization_with_user_agent(self, mock_get_logger):
        """Middleware initialization with USER_AGENT configured"""
        # Clear the default header configuration
        self.settings.set('DEFAULT_REQUEST_HEADERS', {})
        # Set the User-Agent
        self.settings.set('USER_AGENT', 'Custom-Agent')
        self.settings.set('LOG_LEVEL', 'INFO')

        mock_get_logger.return_value = MockLogger('DefaultHeaderMiddleware')

        # The instance should be created successfully
        middleware = DefaultHeaderMiddleware.create_instance(self.crawler)
        self.assertIsInstance(middleware, DefaultHeaderMiddleware)
        self.assertIn('User-Agent', middleware.headers)
        self.assertEqual(middleware.headers['User-Agent'], 'Custom-Agent')

    @patch('crawlo.utils.log.get_logger')
    def test_process_request_with_default_headers(self, mock_get_logger):
        """Default headers are added when processing a request"""
        # Set the default request headers
        self.settings.set('DEFAULT_REQUEST_HEADERS', {
            'User-Agent': 'Test-Agent',
            'Accept': 'text/html'
        })
        self.settings.set('LOG_LEVEL', 'DEBUG')

        mock_logger = MockLogger('DefaultHeaderMiddleware')
        mock_get_logger.return_value = mock_logger

        middleware = DefaultHeaderMiddleware.create_instance(self.crawler)

        # Create a request object
        request = Mock()
        request.headers = {}
        request.url = 'http://example.com'

        # Process the request
        middleware.process_request(request, Mock())

        # Verify the headers were added
        self.assertIn('User-Agent', request.headers)
        self.assertEqual(request.headers['User-Agent'], 'Test-Agent')
        self.assertIn('Accept', request.headers)
        self.assertEqual(request.headers['Accept'], 'text/html')

    @patch('crawlo.utils.log.get_logger')
    def test_process_request_without_overwriting_existing_headers(self, mock_get_logger):
        """Existing request headers are not overwritten"""
        # Set the default request headers
        self.settings.set('DEFAULT_REQUEST_HEADERS', {
            'User-Agent': 'Test-Agent',
            'Accept': 'text/html'
        })
        self.settings.set('LOG_LEVEL', 'DEBUG')

        mock_logger = MockLogger('DefaultHeaderMiddleware')
        mock_get_logger.return_value = mock_logger

        middleware = DefaultHeaderMiddleware.create_instance(self.crawler)

        # Create a request object that already carries a User-Agent
        request = Mock()
        request.headers = {'User-Agent': 'Existing-Agent'}
        request.url = 'http://example.com'

        # Process the request
        middleware.process_request(request, Mock())

        # Verify the existing header was not overwritten
        self.assertEqual(request.headers['User-Agent'], 'Existing-Agent')
        # Verify the other headers were added
        self.assertIn('Accept', request.headers)
        self.assertEqual(request.headers['Accept'], 'text/html')


if __name__ == '__main__':
    unittest.main()
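The test above drives DefaultHeaderMiddleware through three configuration paths: no configuration at all (create_instance raises NotConfiguredError), a DEFAULT_REQUEST_HEADERS dict, and a bare USER_AGENT string. The snippet below is a minimal sketch of the corresponding project settings; the setting names and values are taken verbatim from the test, while their placement in a settings module is an assumption, not something this diff shows:

# Setting names and values are taken from the test above; everything else is assumed.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Test-Agent',  # applied only if the request has no User-Agent yet
    'Accept': 'text/html',
}

# Alternative path the test also covers: with DEFAULT_REQUEST_HEADERS empty,
# a single USER_AGENT string is enough for the middleware to load.
USER_AGENT = 'Custom-Agent'

With neither of these set (and USER_AGENTS and RANDOM_HEADERS cleared as well), the test expects middleware creation to fail with NotConfiguredError.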
tests/test_distributed.py CHANGED

@@ -1,65 +1,65 @@

All 65 lines were removed and re-added with identical content (another whole-file rewrite with no textual change). The file, with its Chinese comments and docstrings translated:

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Distributed crawling test script
Verifies that distributed crawling works correctly
"""

import asyncio
import sys
import os
import time

# Add the project root to the Python path
project_root = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, project_root)

# Change to the project root directory
os.chdir(project_root)

from crawlo.crawler import CrawlerProcess
from crawlo.utils.log import get_logger

logger = get_logger(__name__)


async def test_distributed_crawling():
    """Test the distributed crawling functionality"""
    logger.info("Starting the distributed crawling test...")

    # Create the crawler process and apply the configuration
    try:
        # Make sure the spider module is imported correctly
        spider_modules = ['ofweek_distributed.spiders']
        process = CrawlerProcess(spider_modules=spider_modules)
        logger.info("Crawler process initialized successfully")

        # Run the specified spider, using its registered name
        result = await process.crawl('of_week_distributed')
        logger.info(f"Spider run finished, result: {result}")

    except Exception as e:
        logger.error(f"Run failed: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


def main():
    """Main entry point"""
    start_time = time.time()
    logger.info("Starting the distributed crawling test")

    try:
        asyncio.run(test_distributed_crawling())
        end_time = time.time()
        logger.info(f"Distributed crawling test finished in {end_time - start_time:.2f} s")
    except KeyboardInterrupt:
        logger.info("Test interrupted by user")
    except Exception as e:
        logger.error(f"Error during the test: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()
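The distributed path this script exercises is also where the real 1.3.7 changes sit: crawlo/queue/queue_manager.py, crawlo/queue/redis_priority_queue.py, and the new tests tests/test_queue_naming.py and tests/test_redis_queue_name_fix.py all touch Redis queue naming, and the test names test_double_crawlo_fix and test_queue_manager_double_crawlo suggest the bug was a doubled 'crawlo' prefix in generated keys. The following is a hypothetical diagnostic for a running deployment, not code from this release; it assumes redis-py, a local Redis instance, and the 'crawlo:' key prefix implied by those test names:

import redis  # redis-py, assumed installed

r = redis.Redis(host="localhost", port=6379, db=0)

# Look for keys whose prefix was applied twice, e.g. "crawlo:crawlo:queue:...".
doubled = [key.decode() for key in r.scan_iter(match="crawlo:crawlo:*")]
if doubled:
    print(f"{len(doubled)} keys carry a doubled prefix:")
    for key in doubled:
        print(" ", key)
else:
    print("No doubled 'crawlo:' prefixes found.")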