crawlo-1.4.6-py3-none-any.whl → crawlo-1.4.7-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Potentially problematic release: this version of crawlo has been flagged as potentially problematic.
- crawlo/__init__.py +90 -89
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +186 -186
- crawlo/commands/help.py +140 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +379 -341
- crawlo/commands/startproject.py +460 -460
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +320 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +451 -438
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +290 -291
- crawlo/crawler.py +698 -657
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +280 -276
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +250 -247
- crawlo/downloader/httpx_downloader.py +265 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +425 -402
- crawlo/downloader/selenium_downloader.py +486 -472
- crawlo/event.py +45 -11
- crawlo/exceptions.py +215 -82
- crawlo/extension/__init__.py +65 -64
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +53 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +104 -103
- crawlo/factories/registry.py +84 -84
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +170 -153
- crawlo/filters/aioredis_filter.py +348 -264
- crawlo/filters/memory_filter.py +261 -276
- crawlo/framework.py +306 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +391 -434
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +240 -194
- crawlo/initialization/phases.py +230 -149
- crawlo/initialization/registry.py +143 -145
- crawlo/initialization/utils.py +49 -0
- crawlo/interfaces.py +23 -23
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +42 -46
- crawlo/logging/config.py +277 -197
- crawlo/logging/factory.py +175 -171
- crawlo/logging/manager.py +104 -112
- crawlo/middleware/__init__.py +87 -24
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +142 -142
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +209 -209
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/mode_manager.py +287 -253
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +375 -379
- crawlo/network/response.py +569 -664
- crawlo/pipelines/__init__.py +53 -22
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +140 -132
- crawlo/pipelines/mysql_pipeline.py +469 -476
- crawlo/pipelines/pipeline_manager.py +100 -100
- crawlo/pipelines/redis_dedup_pipeline.py +155 -156
- crawlo/project.py +347 -347
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/pqueue.py +38 -38
- crawlo/queue/queue_manager.py +591 -525
- crawlo/queue/redis_priority_queue.py +519 -370
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -277
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +81 -81
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +2 -4
- crawlo/templates/project/items.py.tmpl +13 -17
- crawlo/templates/project/middlewares.py.tmpl +38 -38
- crawlo/templates/project/pipelines.py.tmpl +35 -36
- crawlo/templates/project/settings.py.tmpl +109 -111
- crawlo/templates/project/settings_distributed.py.tmpl +156 -159
- crawlo/templates/project/settings_gentle.py.tmpl +170 -176
- crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
- crawlo/templates/project/settings_minimal.py.tmpl +98 -100
- crawlo/templates/project/settings_simple.py.tmpl +168 -174
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +23 -23
- crawlo/templates/spider/spider.py.tmpl +32 -40
- crawlo/templates/spiders_init.py.tmpl +5 -10
- crawlo/tools/__init__.py +86 -189
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +50 -50
- crawlo/utils/batch_processor.py +276 -259
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +250 -250
- crawlo/utils/error_handler.py +410 -410
- crawlo/utils/fingerprint.py +121 -121
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/log.py +79 -79
- crawlo/utils/misc.py +81 -81
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +578 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +278 -256
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/selector_helper.py +137 -137
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +201 -201
- crawlo/utils/text_helper.py +94 -94
- crawlo/utils/{url.py → url_utils.py} +39 -39
- crawlo-1.4.7.dist-info/METADATA +689 -0
- crawlo-1.4.7.dist-info/RECORD +347 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +217 -275
- tests/authenticated_proxy_example.py +110 -110
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/bug_check_test.py +250 -250
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/direct_selector_helper_test.py +96 -96
- tests/distributed_dedup_test.py +467 -0
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/error_handling_example.py +171 -171
- tests/explain_mysql_update_behavior.py +76 -76
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
- tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
- tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
- tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
- tests/ofweek_scrapy/scrapy.cfg +11 -11
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +244 -244
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_cli_test.py +55 -0
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +126 -126
- tests/simple_follow_test.py +38 -38
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_response_selector_test.py +94 -94
- tests/simple_selector_helper_test.py +154 -154
- tests/simple_selector_test.py +207 -207
- tests/simple_spider_test.py +49 -49
- tests/simple_url_test.py +73 -73
- tests/simulate_mysql_update_test.py +139 -139
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_asyncmy_usage.py +56 -56
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_cli_arguments.py +119 -0
- tests/test_component_factory.py +174 -174
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawler_process_import.py +38 -38
- tests/test_crawler_process_spider_modules.py +47 -47
- tests/test_crawlo_proxy_integration.py +114 -114
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +124 -124
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +272 -272
- tests/test_edge_cases.py +305 -305
- tests/test_encoding_core.py +56 -56
- tests/test_encoding_detection.py +126 -126
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_factory_compatibility.py +196 -196
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +374 -374
- tests/test_logging_final.py +184 -184
- tests/test_logging_integration.py +312 -312
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +141 -141
- tests/test_mode_consistency.py +51 -51
- tests/test_multi_directory.py +67 -67
- tests/test_multiple_spider_modules.py +80 -80
- tests/test_mysql_pipeline_config.py +164 -164
- tests/test_mysql_pipeline_error.py +98 -98
- tests/test_mysql_pipeline_init_log.py +82 -82
- tests/test_mysql_pipeline_integration.py +132 -132
- tests/test_mysql_pipeline_refactor.py +143 -143
- tests/test_mysql_pipeline_refactor_simple.py +85 -85
- tests/test_mysql_pipeline_robustness.py +195 -195
- tests/test_mysql_pipeline_types.py +88 -88
- tests/test_mysql_update_columns.py +93 -93
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_optimized_selector_naming.py +100 -100
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +211 -211
- tests/test_priority_consistency.py +151 -151
- tests/test_priority_consistency_fixed.py +249 -249
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +217 -217
- tests/test_proxy_middleware_enhanced.py +212 -212
- tests/test_proxy_middleware_integration.py +142 -142
- tests/test_proxy_middleware_refactored.py +207 -207
- tests/test_proxy_only.py +83 -83
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_proxy_with_downloader.py +152 -152
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +130 -130
- tests/test_random_headers_default.py +322 -322
- tests/test_random_headers_necessity.py +308 -308
- tests/test_random_user_agent.py +72 -72
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +129 -129
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_follow.py +104 -104
- tests/test_response_improvements.py +152 -152
- tests/test_response_selector_methods.py +92 -92
- tests/test_response_url_methods.py +70 -70
- tests/test_response_urljoin.py +86 -86
- tests/test_retry_middleware.py +333 -333
- tests/test_retry_middleware_realistic.py +273 -273
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_scrapy_style_encoding.py +112 -112
- tests/test_selector_helper.py +100 -100
- tests/test_selector_optimizations.py +146 -146
- tests/test_simple_response.py +61 -61
- tests/test_spider_loader.py +49 -49
- tests/test_spider_loader_comprehensive.py +69 -69
- tests/test_spider_modules.py +84 -84
- tests/test_spiders/test_spider.py +9 -9
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +176 -176
- tests/test_user_agents.py +96 -96
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- tests/verify_mysql_warnings.py +109 -109
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo-1.4.6.dist-info/METADATA +0 -329
- crawlo-1.4.6.dist-info/RECORD +0 -361
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
tests/test_offsite_middleware.py
CHANGED
@@ -1,245 +1,245 @@

Lines 1-244 are removed and re-added with textually identical content, so the recorded change is most likely whitespace-only (e.g. line endings); only line 245, unittest.main(), is marked unchanged. The file content, common to both sides (Chinese string literals in the assertions are kept verbatim, since they must match the error messages the middleware raises):

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
Tests for OffsiteMiddleware.
Exercises the site-filtering middleware, in particular the multiple-domain case.
"""

import unittest
from unittest.mock import Mock, patch

from crawlo.middleware.offsite import OffsiteMiddleware
from crawlo.settings.setting_manager import SettingManager
from crawlo.exceptions import IgnoreRequestError, NotConfiguredError


class MockStats:
    """Mock stats collector used to verify the recorded statistics"""
    def __init__(self):
        self.stats = {}

    def inc_value(self, key, value=1):
        if key in self.stats:
            self.stats[key] += value
        else:
            self.stats[key] = value


class MockLogger:
    """Mock logger used to capture log output"""
    def __init__(self, name, level=None):
        self.name = name
        self.level = level
        self.logs = []

    def debug(self, msg):
        self.logs.append(('debug', msg))

    def info(self, msg):
        self.logs.append(('info', msg))

    def warning(self, msg):
        self.logs.append(('warning', msg))

    def error(self, msg):
        self.logs.append(('error', msg))


class TestOffsiteMiddleware(unittest.TestCase):
    """Test cases for OffsiteMiddleware"""

    def setUp(self):
        """Per-test setup"""
        # Create the settings manager
        self.settings = SettingManager()

        # Create the mock crawler
        self.crawler = Mock()
        self.crawler.settings = self.settings
        self.crawler.stats = MockStats()

    def test_middleware_initialization_without_domains(self):
        """Middleware initialization when ALLOWED_DOMAINS is not configured"""
        # ALLOWED_DOMAINS is deliberately left unset
        logger = MockLogger('OffsiteMiddleware')
        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
            # A NotConfiguredError should be raised
            with self.assertRaises(NotConfiguredError) as context:
                OffsiteMiddleware.create_instance(self.crawler)

            # The middleware's message (Chinese): "ALLOWED_DOMAINS not configured, OffsiteMiddleware disabled"
            self.assertIn("未配置ALLOWED_DOMAINS,OffsiteMiddleware已禁用", str(context.exception))

    def test_middleware_initialization_with_global_domains(self):
        """Middleware initialization from the global ALLOWED_DOMAINS setting"""
        # Set the global ALLOWED_DOMAINS
        self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
        self.settings.set('LOG_LEVEL', 'DEBUG')

        logger = MockLogger('OffsiteMiddleware')
        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
            # The instance should be created normally
            middleware = OffsiteMiddleware.create_instance(self.crawler)
            self.assertIsInstance(middleware, OffsiteMiddleware)
            self.assertEqual(len(middleware.allowed_domains), 2)
            self.assertIn('ee.ofweek.com', middleware.allowed_domains)
            self.assertIn('www.baidu.com', middleware.allowed_domains)

    def test_middleware_initialization_with_spider_domains(self):
        """Middleware initialization from the spider instance's allowed_domains attribute"""
        # Set allowed_domains on the spider instance
        spider = Mock()
        spider.allowed_domains = ['ee.ofweek.com', 'www.baidu.com']

        self.crawler.spider = spider
        self.settings.set('LOG_LEVEL', 'DEBUG')

        logger = MockLogger('OffsiteMiddleware')
        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
            # The instance should be created normally, using the spider's allowed_domains
            middleware = OffsiteMiddleware.create_instance(self.crawler)
            self.assertIsInstance(middleware, OffsiteMiddleware)
            self.assertEqual(len(middleware.allowed_domains), 2)
            self.assertIn('ee.ofweek.com', middleware.allowed_domains)
            self.assertIn('www.baidu.com', middleware.allowed_domains)

    def test_is_offsite_request_with_allowed_domains(self):
        """Requests within the allowed domains"""
        # Set ALLOWED_DOMAINS
        self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
        self.settings.set('LOG_LEVEL', 'DEBUG')

        logger = MockLogger('OffsiteMiddleware')
        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
            middleware = OffsiteMiddleware.create_instance(self.crawler)

            # Create requests to allowed domains
            request1 = Mock()
            request1.url = 'https://ee.ofweek.com/news/article1.html'

            request2 = Mock()
            request2.url = 'https://www.baidu.com/s?wd=test'

            # These requests must not be classified as offsite
            self.assertFalse(middleware._is_offsite_request(request1))
            self.assertFalse(middleware._is_offsite_request(request2))

    def test_is_offsite_request_with_subdomains(self):
        """Requests to subdomains"""
        # Set ALLOWED_DOMAINS to the root domains
        self.settings.set('ALLOWED_DOMAINS', ['ofweek.com', 'baidu.com'])
        self.settings.set('LOG_LEVEL', 'DEBUG')

        logger = MockLogger('OffsiteMiddleware')
        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
            middleware = OffsiteMiddleware.create_instance(self.crawler)

            # Create requests to subdomains
            request1 = Mock()
            request1.url = 'https://news.ofweek.com/article1.html'

            request2 = Mock()
            request2.url = 'https://map.baidu.com/location'

            # These requests must not be classified as offsite (the root domains are allowed)
            self.assertFalse(middleware._is_offsite_request(request1))
            self.assertFalse(middleware._is_offsite_request(request2))

    def test_is_offsite_request_with_disallowed_domains(self):
        """Requests to disallowed domains"""
        # Set ALLOWED_DOMAINS
        self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
        self.settings.set('LOG_LEVEL', 'DEBUG')

        logger = MockLogger('OffsiteMiddleware')
        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
            middleware = OffsiteMiddleware.create_instance(self.crawler)

            # Create requests to disallowed domains
            request1 = Mock()
            request1.url = 'https://www.google.com/search?q=test'

            request2 = Mock()
            request2.url = 'https://github.com/user/repo'

            # These requests must be classified as offsite
            self.assertTrue(middleware._is_offsite_request(request1))
            self.assertTrue(middleware._is_offsite_request(request2))

    def test_process_request_with_allowed_domain(self):
        """Processing a request within an allowed domain"""
        # Set ALLOWED_DOMAINS
        self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
        self.settings.set('LOG_LEVEL', 'DEBUG')

        logger = MockLogger('OffsiteMiddleware')
        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
            middleware = OffsiteMiddleware.create_instance(self.crawler)

            # Create an allowed request
            request = Mock()
            request.url = 'https://ee.ofweek.com/news/article1.html'
            spider = Mock()

            # Processing the request must not raise
            result = middleware.process_request(request, spider)
            self.assertIsNone(result)  # None means the request is allowed through

            # The offsite counter must not have been incremented
            self.assertNotIn('offsite_request_count', self.crawler.stats.stats)

    def test_process_request_with_disallowed_domain(self):
        """Processing a request to a disallowed domain"""
        # Set ALLOWED_DOMAINS
        self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
        self.settings.set('LOG_LEVEL', 'DEBUG')

        logger = MockLogger('OffsiteMiddleware')
        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
            middleware = OffsiteMiddleware.create_instance(self.crawler)

            # Create a disallowed request
            request = Mock()
            request.url = 'https://www.google.com/search?q=test'
            spider = Mock()

            # Processing the request must raise IgnoreRequestError
            with self.assertRaises(IgnoreRequestError) as context:
                middleware.process_request(request, spider)

            # The middleware's message (Chinese): "offsite request filtered"
            self.assertIn("站外请求被过滤", str(context.exception))

            # The offsite counters must have been incremented
            self.assertIn('offsite_request_count', self.crawler.stats.stats)
            self.assertEqual(self.crawler.stats.stats['offsite_request_count'], 1)
            self.assertIn('offsite_request_count/www.google.com', self.crawler.stats.stats)

    def test_process_request_with_invalid_url(self):
        """Processing a request with an invalid URL"""
        # Set ALLOWED_DOMAINS
        self.settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com', 'www.baidu.com'])
        self.settings.set('LOG_LEVEL', 'DEBUG')

        logger = MockLogger('OffsiteMiddleware')
        with patch('crawlo.middleware.offsite.get_logger', return_value=logger):
            middleware = OffsiteMiddleware.create_instance(self.crawler)

            # Create a request with an invalid URL
            request = Mock()
            request.url = 'not_a_valid_url'
            spider = Mock()

            # Processing the request must raise IgnoreRequestError
            with self.assertRaises(IgnoreRequestError) as context:
                middleware.process_request(request, spider)

            self.assertIn("站外请求被过滤", str(context.exception))

            # The offsite counters must have been incremented
            self.assertIn('offsite_request_count', self.crawler.stats.stats)
            self.assertEqual(self.crawler.stats.stats['offsite_request_count'], 1)
            self.assertIn('offsite_request_count/invalid_url', self.crawler.stats.stats)


if __name__ == '__main__':
    # Create an OffsiteMiddleware instance directly for testing, bypassing the
    # more involved create_instance logic
    unittest.main()
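For context, a minimal sketch of the flow these tests exercise: configure ALLOWED_DOMAINS, build the middleware via create_instance, and let process_request pass on-site requests through (returning None) while rejecting offsite ones with IgnoreRequestError. It reuses only the API surface shown in the test file above, with unittest.mock.Mock standing in for the crawler, spider, and request objects exactly as the tests do; the URLs are illustrative.

from unittest.mock import Mock

from crawlo.middleware.offsite import OffsiteMiddleware
from crawlo.settings.setting_manager import SettingManager
from crawlo.exceptions import IgnoreRequestError

# Crawler double, assembled the same way as in TestOffsiteMiddleware.setUp
settings = SettingManager()
settings.set('ALLOWED_DOMAINS', ['ee.ofweek.com'])
crawler = Mock()
crawler.settings = settings
crawler.stats = Mock()  # stands in for the stats collector (absorbs inc_value calls)

middleware = OffsiteMiddleware.create_instance(crawler)

# An on-site request passes through: process_request returns None
onsite = Mock()
onsite.url = 'https://ee.ofweek.com/news/article1.html'
assert middleware.process_request(onsite, Mock()) is None

# An offsite request is rejected with IgnoreRequestError
offsite = Mock()
offsite.url = 'https://www.google.com/search?q=test'
try:
    middleware.process_request(offsite, Mock())
except IgnoreRequestError as exc:
    print(f'filtered: {exc}')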