crawlo-1.4.1-py3-none-any.whl → crawlo-1.4.3-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
- crawlo/__init__.py +93 -93
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -341
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +438 -439
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +291 -257
- crawlo/crawler.py +650 -650
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +233 -228
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +63 -63
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +61 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +103 -103
- crawlo/factories/registry.py +84 -84
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -257
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +425 -425
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +193 -193
- crawlo/initialization/phases.py +148 -148
- crawlo/initialization/registry.py +145 -145
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +45 -37
- crawlo/logging/async_handler.py +181 -0
- crawlo/logging/config.py +196 -96
- crawlo/logging/factory.py +171 -128
- crawlo/logging/manager.py +111 -111
- crawlo/logging/monitor.py +153 -0
- crawlo/logging/sampler.py +167 -0
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +219 -219
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +325 -325
- crawlo/pipelines/pipeline_manager.py +100 -84
- crawlo/pipelines/redis_dedup_pipeline.py +156 -156
- crawlo/project.py +349 -338
- crawlo/queue/pqueue.py +42 -42
- crawlo/queue/queue_manager.py +526 -522
- crawlo/queue/redis_priority_queue.py +370 -367
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +73 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +170 -170
- crawlo/templates/project/settings_distributed.py.tmpl +169 -169
- crawlo/templates/project/settings_gentle.py.tmpl +166 -166
- crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
- crawlo/templates/project/settings_minimal.py.tmpl +65 -65
- crawlo/templates/project/settings_simple.py.tmpl +164 -164
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +34 -34
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +9 -9
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +364 -364
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +25 -25
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -165
- crawlo/utils/fingerprint.py +122 -122
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +79 -79
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.4.3.dist-info/METADATA +190 -0
- crawlo-1.4.3.dist-info/RECORD +326 -0
- examples/__init__.py +7 -7
- examples/test_project/__init__.py +7 -7
- examples/test_project/run.py +34 -34
- examples/test_project/test_project/__init__.py +3 -3
- examples/test_project/test_project/items.py +17 -17
- examples/test_project/test_project/middlewares.py +118 -118
- examples/test_project/test_project/pipelines.py +96 -96
- examples/test_project/test_project/settings.py +169 -169
- examples/test_project/test_project/spiders/__init__.py +9 -9
- examples/test_project/test_project/spiders/of_week_dis.py +143 -143
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +106 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +245 -245
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +127 -127
- tests/simple_log_test.py +57 -57
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_spider_test.py +49 -49
- tests/simple_test.py +47 -47
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +174 -174
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +125 -0
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +66 -66
- tests/test_framework_startup.py +64 -64
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +112 -112
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +375 -0
- tests/test_logging_final.py +185 -0
- tests/test_logging_integration.py +313 -0
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +142 -0
- tests/test_mode_change.py +72 -72
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +212 -0
- tests/test_priority_consistency.py +152 -0
- tests/test_priority_consistency_fixed.py +250 -0
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +131 -0
- tests/test_random_headers_default.py +323 -0
- tests/test_random_headers_necessity.py +309 -0
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +130 -0
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +334 -242
- tests/test_retry_middleware_realistic.py +274 -0
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +177 -0
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- crawlo-1.4.1.dist-info/METADATA +0 -1199
- crawlo-1.4.1.dist-info/RECORD +0 -309
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/WHEEL +0 -0
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/top_level.txt +0 -0
tests/test_offsite_middleware_simple.py
CHANGED

@@ -1,204 +1,204 @@

Every changed line (1-203) is removed and re-added with identical text, most likely a whitespace or line-ending normalization, so the file body is shown once below. Line 204, unittest.main(), is unchanged.

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
Simple test file for OffsiteMiddleware.
Exercises the offsite-filtering middleware, in particular with multiple allowed domains.
"""

import asyncio
import unittest
from unittest.mock import Mock, patch

from crawlo.middleware.offsite import OffsiteMiddleware
from crawlo.settings.setting_manager import SettingManager
from crawlo.exceptions import IgnoreRequestError


class MockStats:
    """Mock stats class for verifying counters."""
    def __init__(self):
        self.stats = {}

    def inc_value(self, key, value=1):
        if key in self.stats:
            self.stats[key] += value
        else:
            self.stats[key] = value


class MockLogger:
    """Mock logger class for capturing log output."""
    def __init__(self, name, level=None):
        self.name = name
        self.level = level
        self.logs = []

    def debug(self, msg):
        self.logs.append(('debug', msg))

    def info(self, msg):
        self.logs.append(('info', msg))

    def warning(self, msg):
        self.logs.append(('warning', msg))

    def error(self, msg):
        self.logs.append(('error', msg))


class TestOffsiteMiddleware(unittest.TestCase):
    """Test cases for OffsiteMiddleware."""

    def setUp(self):
        """Prepare test fixtures."""
        self.stats = MockStats()
        self.logger = MockLogger('OffsiteMiddleware')

    def test_multiple_domains_initialization(self):
        """Initialization with multiple domains."""
        with patch('crawlo.middleware.offsite.get_logger', return_value=self.logger):
            # Create the instance directly, passing several domains
            middleware = OffsiteMiddleware(
                stats=self.stats,
                log_level='DEBUG',
                allowed_domains=['ee.ofweek.com', 'www.baidu.com']
            )

            # Invoke the domain-compilation method manually
            middleware._compile_domains()

            # The domains should be registered correctly
            self.assertEqual(len(middleware.allowed_domains), 2)
            self.assertIn('ee.ofweek.com', middleware.allowed_domains)
            self.assertIn('www.baidu.com', middleware.allowed_domains)

            # One regex should have been compiled per domain
            self.assertEqual(len(middleware._domain_regexes), 2)

    def test_allowed_requests_with_multiple_domains(self):
        """Allowed requests with multiple domains."""
        with patch('crawlo.middleware.offsite.get_logger', return_value=self.logger):
            middleware = OffsiteMiddleware(
                stats=self.stats,
                log_level='DEBUG',
                allowed_domains=['ee.ofweek.com', 'www.baidu.com']
            )

            # Invoke the domain-compilation method manually
            middleware._compile_domains()

            # Build requests that should be allowed
            request1 = Mock()
            request1.url = 'https://ee.ofweek.com/news/article1.html'

            request2 = Mock()
            request2.url = 'https://www.baidu.com/s?wd=test'

            # Neither request should be treated as offsite
            self.assertFalse(middleware._is_offsite_request(request1))
            self.assertFalse(middleware._is_offsite_request(request2))

    def test_disallowed_requests_with_multiple_domains(self):
        """Disallowed requests with multiple domains."""
        with patch('crawlo.middleware.offsite.get_logger', return_value=self.logger):
            middleware = OffsiteMiddleware(
                stats=self.stats,
                log_level='DEBUG',
                allowed_domains=['ee.ofweek.com', 'www.baidu.com']
            )

            # Invoke the domain-compilation method manually
            middleware._compile_domains()

            # Build requests that should be rejected
            request1 = Mock()
            request1.url = 'https://www.google.com/search?q=test'

            request2 = Mock()
            request2.url = 'https://github.com/user/repo'

            # Both requests should be treated as offsite
            self.assertTrue(middleware._is_offsite_request(request1))
            self.assertTrue(middleware._is_offsite_request(request2))

    def test_subdomain_requests_with_multiple_domains(self):
        """Subdomain requests with multiple domains."""
        with patch('crawlo.middleware.offsite.get_logger', return_value=self.logger):
            # Using root domains should allow their subdomains
            middleware = OffsiteMiddleware(
                stats=self.stats,
                log_level='DEBUG',
                allowed_domains=['ofweek.com', 'baidu.com']
            )

            # Invoke the domain-compilation method manually
            middleware._compile_domains()

            # Build requests against subdomains
            request1 = Mock()
            request1.url = 'https://news.ofweek.com/article1.html'

            request2 = Mock()
            request2.url = 'https://map.baidu.com/location'

            # Neither request should be treated as offsite (the root domains are allowed)
            self.assertFalse(middleware._is_offsite_request(request1))
            self.assertFalse(middleware._is_offsite_request(request2))

    def test_process_allowed_request_with_multiple_domains(self):
        """Processing an allowed request with multiple domains."""
        with patch('crawlo.middleware.offsite.get_logger', return_value=self.logger):
            middleware = OffsiteMiddleware(
                stats=self.stats,
                log_level='DEBUG',
                allowed_domains=['ee.ofweek.com', 'www.baidu.com']
            )

            # Invoke the domain-compilation method manually
            middleware._compile_domains()

            # Build a request that should be allowed
            request = Mock()
            request.url = 'https://ee.ofweek.com/news/article1.html'
            spider = Mock()

            # Processing the request should not raise;
            # use asyncio.run to drive the async method
            result = asyncio.run(middleware.process_request(request, spider))
            self.assertIsNone(result)  # None means the request is allowed through

            # No offsite counter should have been incremented
            self.assertNotIn('offsite_request_count', self.stats.stats)

    def test_process_disallowed_request_with_multiple_domains(self):
        """Processing a disallowed request with multiple domains."""
        with patch('crawlo.middleware.offsite.get_logger', return_value=self.logger):
            middleware = OffsiteMiddleware(
                stats=self.stats,
                log_level='DEBUG',
                allowed_domains=['ee.ofweek.com', 'www.baidu.com']
            )

            # Invoke the domain-compilation method manually
            middleware._compile_domains()

            # Build a request that should be rejected
            request = Mock()
            request.url = 'https://www.google.com/search?q=test'
            spider = Mock()

            # Processing the request should raise IgnoreRequestError;
            # use asyncio.run to drive the async method
            with self.assertRaises(IgnoreRequestError) as context:
                asyncio.run(middleware.process_request(request, spider))

            self.assertIn("站外请求被过滤", str(context.exception))  # "offsite request filtered"

            # The offsite counters should have been incremented
            self.assertIn('offsite_request_count', self.stats.stats)
            self.assertEqual(self.stats.stats['offsite_request_count'], 1)
            self.assertIn('offsite_request_count/www.google.com', self.stats.stats)


if __name__ == '__main__':
    unittest.main()
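Since both halves of the hunk above are textually identical, the change can be confirmed as cosmetic by comparing the normalized contents of the member across the two wheels. A minimal sketch, assuming both wheels have been downloaded locally (the filenames below are assumptions; the member path comes from the file list above):

# Sketch: check whether a changed wheel member differs only in whitespace
# or line endings. The local wheel filenames are assumptions; the member
# path is taken from this diff's file list.
import zipfile

MEMBER = "tests/test_offsite_middleware_simple.py"

def normalized_lines(wheel_path, member):
    """Read one member from a wheel (a zip archive) and strip trailing
    whitespace per line so only substantive edits remain."""
    with zipfile.ZipFile(wheel_path) as wheel:
        text = wheel.read(member).decode("utf-8")
    # splitlines() unifies CRLF/CR/LF; rstrip() drops trailing whitespace
    return [line.rstrip() for line in text.splitlines()]

old = normalized_lines("crawlo-1.4.1-py3-none-any.whl", MEMBER)
new = normalized_lines("crawlo-1.4.3-py3-none-any.whl", MEMBER)
print("whitespace-only change" if old == new else "substantive change")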
tests/test_parsel.py
CHANGED

@@ -1,30 +1,30 @@

Every changed line (1-29) is likewise removed and re-added with identical text, so the file body is shown once below. Line 30, traceback.print_exc(), is unchanged.

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
parsel library test
"""
import sys
import os

# Add the project root to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

try:
    from parsel import Selector, SelectorList
    print("parsel 导入成功")  # "parsel imported successfully"

    # Exercise the basic functionality
    html = "<html><body><h1>测试标题</h1></body></html>"
    selector = Selector(html)
    print("Selector 创建成功")  # "Selector created successfully"

    elements = selector.css('h1')
    print("CSS 选择器执行成功")  # "CSS selector ran successfully"

    text = elements.get()
    print(f"获取文本: {text}")  # "extracted text: ..."

except Exception as e:
    print(f"错误: {e}")  # "error: ..."
    import traceback
    traceback.print_exc()
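One note on the extraction above: calling .get() on the h1 selector returns the serialized element markup, not just its text. A small follow-on sketch using parsel's ::text pseudo-element, which selects the text node alone:

from parsel import Selector

html = "<html><body><h1>测试标题</h1></body></html>"
selector = Selector(text=html)

print(selector.css("h1").get())        # '<h1>测试标题</h1>' (full element markup)
print(selector.css("h1::text").get())  # '测试标题' (text content only)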