crawlo 1.4.1__py3-none-any.whl → 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +93 -93
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -341
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +438 -439
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +291 -257
- crawlo/crawler.py +650 -650
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +233 -228
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +63 -63
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +61 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +103 -103
- crawlo/factories/registry.py +84 -84
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -257
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +425 -425
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +193 -193
- crawlo/initialization/phases.py +148 -148
- crawlo/initialization/registry.py +145 -145
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +45 -37
- crawlo/logging/async_handler.py +181 -0
- crawlo/logging/config.py +196 -96
- crawlo/logging/factory.py +171 -128
- crawlo/logging/manager.py +111 -111
- crawlo/logging/monitor.py +153 -0
- crawlo/logging/sampler.py +167 -0
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +219 -219
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +325 -325
- crawlo/pipelines/pipeline_manager.py +100 -84
- crawlo/pipelines/redis_dedup_pipeline.py +156 -156
- crawlo/project.py +349 -338
- crawlo/queue/pqueue.py +42 -42
- crawlo/queue/queue_manager.py +526 -522
- crawlo/queue/redis_priority_queue.py +370 -367
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +73 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +170 -170
- crawlo/templates/project/settings_distributed.py.tmpl +169 -169
- crawlo/templates/project/settings_gentle.py.tmpl +166 -166
- crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
- crawlo/templates/project/settings_minimal.py.tmpl +65 -65
- crawlo/templates/project/settings_simple.py.tmpl +164 -164
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +34 -34
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +9 -9
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +364 -364
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +25 -25
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -165
- crawlo/utils/fingerprint.py +122 -122
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +79 -79
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.4.3.dist-info/METADATA +190 -0
- crawlo-1.4.3.dist-info/RECORD +326 -0
- examples/__init__.py +7 -7
- examples/test_project/__init__.py +7 -7
- examples/test_project/run.py +34 -34
- examples/test_project/test_project/__init__.py +3 -3
- examples/test_project/test_project/items.py +17 -17
- examples/test_project/test_project/middlewares.py +118 -118
- examples/test_project/test_project/pipelines.py +96 -96
- examples/test_project/test_project/settings.py +169 -169
- examples/test_project/test_project/spiders/__init__.py +9 -9
- examples/test_project/test_project/spiders/of_week_dis.py +143 -143
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +106 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +245 -245
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +127 -127
- tests/simple_log_test.py +57 -57
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_spider_test.py +49 -49
- tests/simple_test.py +47 -47
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +174 -174
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +125 -0
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +66 -66
- tests/test_framework_startup.py +64 -64
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +112 -112
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +375 -0
- tests/test_logging_final.py +185 -0
- tests/test_logging_integration.py +313 -0
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +142 -0
- tests/test_mode_change.py +72 -72
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +212 -0
- tests/test_priority_consistency.py +152 -0
- tests/test_priority_consistency_fixed.py +250 -0
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +131 -0
- tests/test_random_headers_default.py +323 -0
- tests/test_random_headers_necessity.py +309 -0
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +130 -0
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +334 -242
- tests/test_retry_middleware_realistic.py +274 -0
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +177 -0
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- crawlo-1.4.1.dist-info/METADATA +0 -1199
- crawlo-1.4.1.dist-info/RECORD +0 -309
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/WHEEL +0 -0
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.1.dist-info → crawlo-1.4.3.dist-info}/top_level.txt +0 -0
tests/test_comprehensive.py
CHANGED
|
@@ -1,147 +1,147 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
综合测试
|
|
5
|
-
验证所有改进的集成效果
|
|
6
|
-
"""
|
|
7
|
-
import sys
|
|
8
|
-
import os
|
|
9
|
-
import asyncio
|
|
10
|
-
import unittest
|
|
11
|
-
from unittest.mock import patch, MagicMock
|
|
12
|
-
|
|
13
|
-
# 添加项目根目录到Python路径
|
|
14
|
-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
15
|
-
|
|
16
|
-
from crawlo.utils.env_config import get_env_var, get_redis_config, get_runtime_config
|
|
17
|
-
from crawlo.utils.error_handler import ErrorHandler, handle_exception
|
|
18
|
-
from crawlo.core.engine import Engine
|
|
19
|
-
from crawlo.settings.setting_manager import SettingManager
|
|
20
|
-
from crawlo.settings import default_settings
|
|
21
|
-
from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class TestComprehensiveIntegration(unittest.TestCase):
|
|
25
|
-
"""综合集成测试"""
|
|
26
|
-
|
|
27
|
-
def setUp(self):
|
|
28
|
-
"""测试前准备"""
|
|
29
|
-
# 设置测试环境变量
|
|
30
|
-
self.test_env = {
|
|
31
|
-
'PROJECT_NAME': 'test_project',
|
|
32
|
-
'CONCURRENCY': '4',
|
|
33
|
-
'REDIS_HOST': 'localhost',
|
|
34
|
-
'REDIS_PORT': '6379'
|
|
35
|
-
}
|
|
36
|
-
self.original_env = {}
|
|
37
|
-
for key, value in self.test_env.items():
|
|
38
|
-
self.original_env[key] = os.environ.get(key)
|
|
39
|
-
os.environ[key] = value
|
|
40
|
-
|
|
41
|
-
def tearDown(self):
|
|
42
|
-
"""测试后清理"""
|
|
43
|
-
# 恢复原始环境变量
|
|
44
|
-
for key, value in self.original_env.items():
|
|
45
|
-
if value is None:
|
|
46
|
-
os.environ.pop(key, None)
|
|
47
|
-
else:
|
|
48
|
-
os.environ[key] = value
|
|
49
|
-
|
|
50
|
-
def test_env_config_integration(self):
|
|
51
|
-
"""测试环境变量配置集成"""
|
|
52
|
-
# 验证环境变量工具正常工作
|
|
53
|
-
project_name = get_env_var('PROJECT_NAME', 'default', str)
|
|
54
|
-
self.assertEqual(project_name, 'test_project')
|
|
55
|
-
|
|
56
|
-
concurrency = get_env_var('CONCURRENCY', 1, int)
|
|
57
|
-
self.assertEqual(concurrency, 4)
|
|
58
|
-
|
|
59
|
-
# 验证Redis配置工具
|
|
60
|
-
redis_config = get_redis_config()
|
|
61
|
-
self.assertEqual(redis_config['REDIS_HOST'], 'localhost')
|
|
62
|
-
self.assertEqual(redis_config['REDIS_PORT'], 6379)
|
|
63
|
-
|
|
64
|
-
def test_error_handler_integration(self):
|
|
65
|
-
"""测试错误处理集成"""
|
|
66
|
-
# 验证错误处理模块正常工作
|
|
67
|
-
error_handler = ErrorHandler("test")
|
|
68
|
-
|
|
69
|
-
# 测试错误处理
|
|
70
|
-
try:
|
|
71
|
-
error_handler.handle_error(ValueError("Test error"), raise_error=False)
|
|
72
|
-
except Exception:
|
|
73
|
-
self.fail("Error handler should not raise exception when raise_error=False")
|
|
74
|
-
|
|
75
|
-
# 测试安全调用
|
|
76
|
-
result = error_handler.safe_call(lambda x: x*2, 5, default_return=0)
|
|
77
|
-
self.assertEqual(result, 10)
|
|
78
|
-
|
|
79
|
-
# 测试装饰器
|
|
80
|
-
@handle_exception(raise_error=False)
|
|
81
|
-
def failing_function():
|
|
82
|
-
raise RuntimeError("Test")
|
|
83
|
-
|
|
84
|
-
try:
|
|
85
|
-
failing_function()
|
|
86
|
-
except Exception:
|
|
87
|
-
self.fail("Decorated function should not raise exception")
|
|
88
|
-
|
|
89
|
-
def test_settings_integration(self):
|
|
90
|
-
"""测试设置管理器集成"""
|
|
91
|
-
# 重新加载默认设置以获取环境变量
|
|
92
|
-
import importlib
|
|
93
|
-
import crawlo.settings.default_settings
|
|
94
|
-
importlib.reload(crawlo.settings.default_settings)
|
|
95
|
-
|
|
96
|
-
# 创建设置管理器
|
|
97
|
-
settings = SettingManager()
|
|
98
|
-
settings.set_settings(crawlo.settings.default_settings)
|
|
99
|
-
|
|
100
|
-
# 验证设置正确加载
|
|
101
|
-
self.assertEqual(settings.get('PROJECT_NAME'), 'test_project')
|
|
102
|
-
self.assertEqual(settings.get_int('CONCURRENCY'), 4)
|
|
103
|
-
self.assertEqual(settings.get('REDIS_HOST'), 'localhost')
|
|
104
|
-
|
|
105
|
-
def test_queue_manager_config(self):
|
|
106
|
-
"""测试队列管理器配置"""
|
|
107
|
-
# 重新加载默认设置
|
|
108
|
-
import importlib
|
|
109
|
-
import crawlo.settings.default_settings
|
|
110
|
-
importlib.reload(crawlo.settings.default_settings)
|
|
111
|
-
|
|
112
|
-
# 创建设置管理器
|
|
113
|
-
settings = SettingManager()
|
|
114
|
-
settings.set_settings(crawlo.settings.default_settings)
|
|
115
|
-
|
|
116
|
-
# 从设置创建队列配置
|
|
117
|
-
queue_config = QueueConfig.from_settings(settings)
|
|
118
|
-
|
|
119
|
-
# 验证配置正确
|
|
120
|
-
self.assertEqual(queue_config.queue_type, QueueType.AUTO)
|
|
121
|
-
self.assertIn('test_project', queue_config.queue_name)
|
|
122
|
-
|
|
123
|
-
async def test_async_components(self):
|
|
124
|
-
"""测试异步组件"""
|
|
125
|
-
# 测试异步错误处理装饰器
|
|
126
|
-
@handle_exception(raise_error=False)
|
|
127
|
-
async def async_failing_function():
|
|
128
|
-
raise RuntimeError("Async test")
|
|
129
|
-
|
|
130
|
-
try:
|
|
131
|
-
await async_failing_function()
|
|
132
|
-
except Exception:
|
|
133
|
-
self.fail("Async decorated function should not raise exception")
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
if __name__ == '__main__':
|
|
137
|
-
# 运行同步测试
|
|
138
|
-
unittest.main(exit=False)
|
|
139
|
-
|
|
140
|
-
# 运行异步测试
|
|
141
|
-
async def run_async_tests():
|
|
142
|
-
test_instance = TestComprehensiveIntegration()
|
|
143
|
-
test_instance.setUp()
|
|
144
|
-
await test_instance.test_async_components()
|
|
145
|
-
test_instance.tearDown()
|
|
146
|
-
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
综合测试
|
|
5
|
+
验证所有改进的集成效果
|
|
6
|
+
"""
|
|
7
|
+
import sys
|
|
8
|
+
import os
|
|
9
|
+
import asyncio
|
|
10
|
+
import unittest
|
|
11
|
+
from unittest.mock import patch, MagicMock
|
|
12
|
+
|
|
13
|
+
# 添加项目根目录到Python路径
|
|
14
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
15
|
+
|
|
16
|
+
from crawlo.utils.env_config import get_env_var, get_redis_config, get_runtime_config
|
|
17
|
+
from crawlo.utils.error_handler import ErrorHandler, handle_exception
|
|
18
|
+
from crawlo.core.engine import Engine
|
|
19
|
+
from crawlo.settings.setting_manager import SettingManager
|
|
20
|
+
from crawlo.settings import default_settings
|
|
21
|
+
from crawlo.queue.queue_manager import QueueManager, QueueConfig, QueueType
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class TestComprehensiveIntegration(unittest.TestCase):
|
|
25
|
+
"""综合集成测试"""
|
|
26
|
+
|
|
27
|
+
def setUp(self):
|
|
28
|
+
"""测试前准备"""
|
|
29
|
+
# 设置测试环境变量
|
|
30
|
+
self.test_env = {
|
|
31
|
+
'PROJECT_NAME': 'test_project',
|
|
32
|
+
'CONCURRENCY': '4',
|
|
33
|
+
'REDIS_HOST': 'localhost',
|
|
34
|
+
'REDIS_PORT': '6379'
|
|
35
|
+
}
|
|
36
|
+
self.original_env = {}
|
|
37
|
+
for key, value in self.test_env.items():
|
|
38
|
+
self.original_env[key] = os.environ.get(key)
|
|
39
|
+
os.environ[key] = value
|
|
40
|
+
|
|
41
|
+
def tearDown(self):
|
|
42
|
+
"""测试后清理"""
|
|
43
|
+
# 恢复原始环境变量
|
|
44
|
+
for key, value in self.original_env.items():
|
|
45
|
+
if value is None:
|
|
46
|
+
os.environ.pop(key, None)
|
|
47
|
+
else:
|
|
48
|
+
os.environ[key] = value
|
|
49
|
+
|
|
50
|
+
def test_env_config_integration(self):
|
|
51
|
+
"""测试环境变量配置集成"""
|
|
52
|
+
# 验证环境变量工具正常工作
|
|
53
|
+
project_name = get_env_var('PROJECT_NAME', 'default', str)
|
|
54
|
+
self.assertEqual(project_name, 'test_project')
|
|
55
|
+
|
|
56
|
+
concurrency = get_env_var('CONCURRENCY', 1, int)
|
|
57
|
+
self.assertEqual(concurrency, 4)
|
|
58
|
+
|
|
59
|
+
# 验证Redis配置工具
|
|
60
|
+
redis_config = get_redis_config()
|
|
61
|
+
self.assertEqual(redis_config['REDIS_HOST'], 'localhost')
|
|
62
|
+
self.assertEqual(redis_config['REDIS_PORT'], 6379)
|
|
63
|
+
|
|
64
|
+
def test_error_handler_integration(self):
|
|
65
|
+
"""测试错误处理集成"""
|
|
66
|
+
# 验证错误处理模块正常工作
|
|
67
|
+
error_handler = ErrorHandler("test")
|
|
68
|
+
|
|
69
|
+
# 测试错误处理
|
|
70
|
+
try:
|
|
71
|
+
error_handler.handle_error(ValueError("Test error"), raise_error=False)
|
|
72
|
+
except Exception:
|
|
73
|
+
self.fail("Error handler should not raise exception when raise_error=False")
|
|
74
|
+
|
|
75
|
+
# 测试安全调用
|
|
76
|
+
result = error_handler.safe_call(lambda x: x*2, 5, default_return=0)
|
|
77
|
+
self.assertEqual(result, 10)
|
|
78
|
+
|
|
79
|
+
# 测试装饰器
|
|
80
|
+
@handle_exception(raise_error=False)
|
|
81
|
+
def failing_function():
|
|
82
|
+
raise RuntimeError("Test")
|
|
83
|
+
|
|
84
|
+
try:
|
|
85
|
+
failing_function()
|
|
86
|
+
except Exception:
|
|
87
|
+
self.fail("Decorated function should not raise exception")
|
|
88
|
+
|
|
89
|
+
def test_settings_integration(self):
|
|
90
|
+
"""测试设置管理器集成"""
|
|
91
|
+
# 重新加载默认设置以获取环境变量
|
|
92
|
+
import importlib
|
|
93
|
+
import crawlo.settings.default_settings
|
|
94
|
+
importlib.reload(crawlo.settings.default_settings)
|
|
95
|
+
|
|
96
|
+
# 创建设置管理器
|
|
97
|
+
settings = SettingManager()
|
|
98
|
+
settings.set_settings(crawlo.settings.default_settings)
|
|
99
|
+
|
|
100
|
+
# 验证设置正确加载
|
|
101
|
+
self.assertEqual(settings.get('PROJECT_NAME'), 'test_project')
|
|
102
|
+
self.assertEqual(settings.get_int('CONCURRENCY'), 4)
|
|
103
|
+
self.assertEqual(settings.get('REDIS_HOST'), 'localhost')
|
|
104
|
+
|
|
105
|
+
def test_queue_manager_config(self):
|
|
106
|
+
"""测试队列管理器配置"""
|
|
107
|
+
# 重新加载默认设置
|
|
108
|
+
import importlib
|
|
109
|
+
import crawlo.settings.default_settings
|
|
110
|
+
importlib.reload(crawlo.settings.default_settings)
|
|
111
|
+
|
|
112
|
+
# 创建设置管理器
|
|
113
|
+
settings = SettingManager()
|
|
114
|
+
settings.set_settings(crawlo.settings.default_settings)
|
|
115
|
+
|
|
116
|
+
# 从设置创建队列配置
|
|
117
|
+
queue_config = QueueConfig.from_settings(settings)
|
|
118
|
+
|
|
119
|
+
# 验证配置正确
|
|
120
|
+
self.assertEqual(queue_config.queue_type, QueueType.AUTO)
|
|
121
|
+
self.assertIn('test_project', queue_config.queue_name)
|
|
122
|
+
|
|
123
|
+
async def test_async_components(self):
|
|
124
|
+
"""测试异步组件"""
|
|
125
|
+
# 测试异步错误处理装饰器
|
|
126
|
+
@handle_exception(raise_error=False)
|
|
127
|
+
async def async_failing_function():
|
|
128
|
+
raise RuntimeError("Async test")
|
|
129
|
+
|
|
130
|
+
try:
|
|
131
|
+
await async_failing_function()
|
|
132
|
+
except Exception:
|
|
133
|
+
self.fail("Async decorated function should not raise exception")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
if __name__ == '__main__':
|
|
137
|
+
# 运行同步测试
|
|
138
|
+
unittest.main(exit=False)
|
|
139
|
+
|
|
140
|
+
# 运行异步测试
|
|
141
|
+
async def run_async_tests():
|
|
142
|
+
test_instance = TestComprehensiveIntegration()
|
|
143
|
+
test_instance.setUp()
|
|
144
|
+
await test_instance.test_async_components()
|
|
145
|
+
test_instance.tearDown()
|
|
146
|
+
|
|
147
147
|
asyncio.run(run_async_tests())
|
tests/test_config_consistency.py
CHANGED
|
@@ -1,81 +1,81 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
测试配置一致性优化
|
|
5
|
-
"""
|
|
6
|
-
import asyncio
|
|
7
|
-
import sys
|
|
8
|
-
import os
|
|
9
|
-
|
|
10
|
-
# 添加项目根目录到路径
|
|
11
|
-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
12
|
-
|
|
13
|
-
from crawlo.project import get_settings
|
|
14
|
-
from crawlo.crawler import Crawler
|
|
15
|
-
from crawlo.spider import Spider
|
|
16
|
-
from crawlo.utils.log import get_logger
|
|
17
|
-
from crawlo import Request
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class TestSpider(Spider):
|
|
21
|
-
name = "test_spider"
|
|
22
|
-
|
|
23
|
-
def start_requests(self):
|
|
24
|
-
yield Request("https://example.com")
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
async def test_config_consistency():
|
|
28
|
-
"""测试配置一致性优化"""
|
|
29
|
-
print("测试配置一致性优化...")
|
|
30
|
-
|
|
31
|
-
# 模拟单机模式配置但Redis可用的情况
|
|
32
|
-
custom_settings = {
|
|
33
|
-
'QUEUE_TYPE': 'auto', # 自动检测模式
|
|
34
|
-
'CONCURRENCY': 4,
|
|
35
|
-
'DOWNLOAD_DELAY': 1.0,
|
|
36
|
-
'LOG_LEVEL': 'INFO'
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
try:
|
|
40
|
-
# 获取配置
|
|
41
|
-
settings = get_settings(custom_settings)
|
|
42
|
-
|
|
43
|
-
# 创建爬虫实例
|
|
44
|
-
crawler = Crawler(TestSpider, settings)
|
|
45
|
-
|
|
46
|
-
# 启动爬虫(这会触发调度器初始化)
|
|
47
|
-
print("开始初始化爬虫...")
|
|
48
|
-
await crawler.crawl()
|
|
49
|
-
|
|
50
|
-
print("配置一致性测试完成")
|
|
51
|
-
|
|
52
|
-
except Exception as e:
|
|
53
|
-
print(f"测试失败: {e}")
|
|
54
|
-
import traceback
|
|
55
|
-
traceback.print_exc()
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
async def main():
|
|
59
|
-
"""主测试函数"""
|
|
60
|
-
print("开始测试配置一致性优化...")
|
|
61
|
-
print("=" * 50)
|
|
62
|
-
|
|
63
|
-
try:
|
|
64
|
-
await test_config_consistency()
|
|
65
|
-
|
|
66
|
-
print("=" * 50)
|
|
67
|
-
print("配置一致性优化测试完成!")
|
|
68
|
-
|
|
69
|
-
except Exception as e:
|
|
70
|
-
print("=" * 50)
|
|
71
|
-
print(f"测试失败: {e}")
|
|
72
|
-
import traceback
|
|
73
|
-
traceback.print_exc()
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
if __name__ == "__main__":
|
|
77
|
-
# 设置日志级别
|
|
78
|
-
import logging
|
|
79
|
-
logging.basicConfig(level=logging.INFO)
|
|
80
|
-
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
测试配置一致性优化
|
|
5
|
+
"""
|
|
6
|
+
import asyncio
|
|
7
|
+
import sys
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
# 添加项目根目录到路径
|
|
11
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
12
|
+
|
|
13
|
+
from crawlo.project import get_settings
|
|
14
|
+
from crawlo.crawler import Crawler
|
|
15
|
+
from crawlo.spider import Spider
|
|
16
|
+
from crawlo.utils.log import get_logger
|
|
17
|
+
from crawlo import Request
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class TestSpider(Spider):
|
|
21
|
+
name = "test_spider"
|
|
22
|
+
|
|
23
|
+
def start_requests(self):
|
|
24
|
+
yield Request("https://example.com")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
async def test_config_consistency():
|
|
28
|
+
"""测试配置一致性优化"""
|
|
29
|
+
print("测试配置一致性优化...")
|
|
30
|
+
|
|
31
|
+
# 模拟单机模式配置但Redis可用的情况
|
|
32
|
+
custom_settings = {
|
|
33
|
+
'QUEUE_TYPE': 'auto', # 自动检测模式
|
|
34
|
+
'CONCURRENCY': 4,
|
|
35
|
+
'DOWNLOAD_DELAY': 1.0,
|
|
36
|
+
'LOG_LEVEL': 'INFO'
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
try:
|
|
40
|
+
# 获取配置
|
|
41
|
+
settings = get_settings(custom_settings)
|
|
42
|
+
|
|
43
|
+
# 创建爬虫实例
|
|
44
|
+
crawler = Crawler(TestSpider, settings)
|
|
45
|
+
|
|
46
|
+
# 启动爬虫(这会触发调度器初始化)
|
|
47
|
+
print("开始初始化爬虫...")
|
|
48
|
+
await crawler.crawl()
|
|
49
|
+
|
|
50
|
+
print("配置一致性测试完成")
|
|
51
|
+
|
|
52
|
+
except Exception as e:
|
|
53
|
+
print(f"测试失败: {e}")
|
|
54
|
+
import traceback
|
|
55
|
+
traceback.print_exc()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
async def main():
|
|
59
|
+
"""主测试函数"""
|
|
60
|
+
print("开始测试配置一致性优化...")
|
|
61
|
+
print("=" * 50)
|
|
62
|
+
|
|
63
|
+
try:
|
|
64
|
+
await test_config_consistency()
|
|
65
|
+
|
|
66
|
+
print("=" * 50)
|
|
67
|
+
print("配置一致性优化测试完成!")
|
|
68
|
+
|
|
69
|
+
except Exception as e:
|
|
70
|
+
print("=" * 50)
|
|
71
|
+
print(f"测试失败: {e}")
|
|
72
|
+
import traceback
|
|
73
|
+
traceback.print_exc()
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
if __name__ == "__main__":
|
|
77
|
+
# 设置日志级别
|
|
78
|
+
import logging
|
|
79
|
+
logging.basicConfig(level=logging.INFO)
|
|
80
|
+
|
|
81
81
|
asyncio.run(main())
|