crawlo 1.3.6__py3-none-any.whl → 1.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +87 -87
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -341
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +45 -45
- crawlo/core/engine.py +439 -439
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +257 -257
- crawlo/crawler.py +638 -638
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +228 -228
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +39 -39
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +61 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +103 -103
- crawlo/factories/registry.py +84 -84
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -257
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -292
- crawlo/initialization/__init__.py +39 -39
- crawlo/initialization/built_in.py +425 -425
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +193 -193
- crawlo/initialization/phases.py +148 -148
- crawlo/initialization/registry.py +145 -145
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +37 -37
- crawlo/logging/config.py +96 -96
- crawlo/logging/factory.py +128 -128
- crawlo/logging/manager.py +111 -111
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +212 -212
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +325 -325
- crawlo/pipelines/pipeline_manager.py +76 -76
- crawlo/pipelines/redis_dedup_pipeline.py +166 -166
- crawlo/project.py +327 -327
- crawlo/queue/pqueue.py +42 -42
- crawlo/queue/queue_manager.py +522 -503
- crawlo/queue/redis_priority_queue.py +367 -326
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +321 -321
- crawlo/settings/setting_manager.py +214 -214
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +73 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +170 -167
- crawlo/templates/project/settings_distributed.py.tmpl +169 -166
- crawlo/templates/project/settings_gentle.py.tmpl +166 -166
- crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
- crawlo/templates/project/settings_minimal.py.tmpl +65 -65
- crawlo/templates/project/settings_simple.py.tmpl +164 -164
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +34 -34
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +9 -9
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +388 -388
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +364 -364
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +25 -25
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -165
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +79 -79
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.3.6.dist-info → crawlo-1.3.7.dist-info}/METADATA +1199 -1126
- crawlo-1.3.7.dist-info/RECORD +292 -0
- examples/__init__.py +7 -7
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +106 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +245 -245
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +127 -127
- tests/simple_log_test.py +57 -57
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_spider_test.py +49 -49
- tests/simple_test.py +47 -47
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +174 -174
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +66 -66
- tests/test_framework_startup.py +64 -64
- tests/test_get_component_logger.py +83 -83
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +112 -112
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_system.py +282 -282
- tests/test_mode_change.py +72 -72
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -176
- tests/test_queue_naming.py +155 -0
- tests/test_queue_type.py +106 -106
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +176 -0
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- crawlo-1.3.6.dist-info/RECORD +0 -290
- {crawlo-1.3.6.dist-info → crawlo-1.3.7.dist-info}/WHEEL +0 -0
- {crawlo-1.3.6.dist-info → crawlo-1.3.7.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.6.dist-info → crawlo-1.3.7.dist-info}/top_level.txt +0 -0
tests/test_config_consistency.py
CHANGED
|
@@ -1,81 +1,81 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
测试配置一致性优化
|
|
5
|
-
"""
|
|
6
|
-
import asyncio
|
|
7
|
-
import sys
|
|
8
|
-
import os
|
|
9
|
-
|
|
10
|
-
# 添加项目根目录到路径
|
|
11
|
-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
12
|
-
|
|
13
|
-
from crawlo.project import get_settings
|
|
14
|
-
from crawlo.crawler import Crawler
|
|
15
|
-
from crawlo.spider import Spider
|
|
16
|
-
from crawlo.utils.log import get_logger
|
|
17
|
-
from crawlo import Request
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class TestSpider(Spider):
|
|
21
|
-
name = "test_spider"
|
|
22
|
-
|
|
23
|
-
def start_requests(self):
|
|
24
|
-
yield Request("https://example.com")
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
async def test_config_consistency():
|
|
28
|
-
"""测试配置一致性优化"""
|
|
29
|
-
print("测试配置一致性优化...")
|
|
30
|
-
|
|
31
|
-
# 模拟单机模式配置但Redis可用的情况
|
|
32
|
-
custom_settings = {
|
|
33
|
-
'QUEUE_TYPE': 'auto', # 自动检测模式
|
|
34
|
-
'CONCURRENCY': 4,
|
|
35
|
-
'DOWNLOAD_DELAY': 1.0,
|
|
36
|
-
'LOG_LEVEL': 'INFO'
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
try:
|
|
40
|
-
# 获取配置
|
|
41
|
-
settings = get_settings(custom_settings)
|
|
42
|
-
|
|
43
|
-
# 创建爬虫实例
|
|
44
|
-
crawler = Crawler(TestSpider, settings)
|
|
45
|
-
|
|
46
|
-
# 启动爬虫(这会触发调度器初始化)
|
|
47
|
-
print("开始初始化爬虫...")
|
|
48
|
-
await crawler.crawl()
|
|
49
|
-
|
|
50
|
-
print("配置一致性测试完成")
|
|
51
|
-
|
|
52
|
-
except Exception as e:
|
|
53
|
-
print(f"测试失败: {e}")
|
|
54
|
-
import traceback
|
|
55
|
-
traceback.print_exc()
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
async def main():
|
|
59
|
-
"""主测试函数"""
|
|
60
|
-
print("开始测试配置一致性优化...")
|
|
61
|
-
print("=" * 50)
|
|
62
|
-
|
|
63
|
-
try:
|
|
64
|
-
await test_config_consistency()
|
|
65
|
-
|
|
66
|
-
print("=" * 50)
|
|
67
|
-
print("配置一致性优化测试完成!")
|
|
68
|
-
|
|
69
|
-
except Exception as e:
|
|
70
|
-
print("=" * 50)
|
|
71
|
-
print(f"测试失败: {e}")
|
|
72
|
-
import traceback
|
|
73
|
-
traceback.print_exc()
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
if __name__ == "__main__":
|
|
77
|
-
# 设置日志级别
|
|
78
|
-
import logging
|
|
79
|
-
logging.basicConfig(level=logging.INFO)
|
|
80
|
-
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
测试配置一致性优化
|
|
5
|
+
"""
|
|
6
|
+
import asyncio
|
|
7
|
+
import sys
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
# 添加项目根目录到路径
|
|
11
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
12
|
+
|
|
13
|
+
from crawlo.project import get_settings
|
|
14
|
+
from crawlo.crawler import Crawler
|
|
15
|
+
from crawlo.spider import Spider
|
|
16
|
+
from crawlo.utils.log import get_logger
|
|
17
|
+
from crawlo import Request
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class TestSpider(Spider):
|
|
21
|
+
name = "test_spider"
|
|
22
|
+
|
|
23
|
+
def start_requests(self):
|
|
24
|
+
yield Request("https://example.com")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
async def test_config_consistency():
|
|
28
|
+
"""测试配置一致性优化"""
|
|
29
|
+
print("测试配置一致性优化...")
|
|
30
|
+
|
|
31
|
+
# 模拟单机模式配置但Redis可用的情况
|
|
32
|
+
custom_settings = {
|
|
33
|
+
'QUEUE_TYPE': 'auto', # 自动检测模式
|
|
34
|
+
'CONCURRENCY': 4,
|
|
35
|
+
'DOWNLOAD_DELAY': 1.0,
|
|
36
|
+
'LOG_LEVEL': 'INFO'
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
try:
|
|
40
|
+
# 获取配置
|
|
41
|
+
settings = get_settings(custom_settings)
|
|
42
|
+
|
|
43
|
+
# 创建爬虫实例
|
|
44
|
+
crawler = Crawler(TestSpider, settings)
|
|
45
|
+
|
|
46
|
+
# 启动爬虫(这会触发调度器初始化)
|
|
47
|
+
print("开始初始化爬虫...")
|
|
48
|
+
await crawler.crawl()
|
|
49
|
+
|
|
50
|
+
print("配置一致性测试完成")
|
|
51
|
+
|
|
52
|
+
except Exception as e:
|
|
53
|
+
print(f"测试失败: {e}")
|
|
54
|
+
import traceback
|
|
55
|
+
traceback.print_exc()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
async def main():
|
|
59
|
+
"""主测试函数"""
|
|
60
|
+
print("开始测试配置一致性优化...")
|
|
61
|
+
print("=" * 50)
|
|
62
|
+
|
|
63
|
+
try:
|
|
64
|
+
await test_config_consistency()
|
|
65
|
+
|
|
66
|
+
print("=" * 50)
|
|
67
|
+
print("配置一致性优化测试完成!")
|
|
68
|
+
|
|
69
|
+
except Exception as e:
|
|
70
|
+
print("=" * 50)
|
|
71
|
+
print(f"测试失败: {e}")
|
|
72
|
+
import traceback
|
|
73
|
+
traceback.print_exc()
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
if __name__ == "__main__":
|
|
77
|
+
# 设置日志级别
|
|
78
|
+
import logging
|
|
79
|
+
logging.basicConfig(level=logging.INFO)
|
|
80
|
+
|
|
81
81
|
asyncio.run(main())
|
tests/test_config_merge.py
CHANGED
|
@@ -1,153 +1,153 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
配置合并测试
|
|
5
|
-
测试 Crawlo 框架的配置合并功能
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import sys
|
|
9
|
-
import os
|
|
10
|
-
import unittest
|
|
11
|
-
|
|
12
|
-
# 添加项目根目录到路径
|
|
13
|
-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
14
|
-
|
|
15
|
-
from crawlo.settings.setting_manager import SettingManager
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class TestConfigMerge(unittest.TestCase):
|
|
19
|
-
"""配置合并测试类"""
|
|
20
|
-
|
|
21
|
-
def test_middleware_merge(self):
|
|
22
|
-
"""测试中间件配置合并"""
|
|
23
|
-
# 用户自定义配置
|
|
24
|
-
user_config = {
|
|
25
|
-
'MIDDLEWARES': [
|
|
26
|
-
'myproject.middlewares.CustomMiddleware',
|
|
27
|
-
]
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
settings = SettingManager(user_config)
|
|
31
|
-
|
|
32
|
-
# 获取合并后的中间件列表
|
|
33
|
-
middlewares = settings.get('MIDDLEWARES')
|
|
34
|
-
|
|
35
|
-
# 检查默认中间件是否存在
|
|
36
|
-
self.assertIn('crawlo.middleware.request_ignore.RequestIgnoreMiddleware', middlewares)
|
|
37
|
-
self.assertIn('crawlo.middleware.download_delay.DownloadDelayMiddleware', middlewares)
|
|
38
|
-
|
|
39
|
-
# 检查自定义中间件是否存在
|
|
40
|
-
self.assertIn('myproject.middlewares.CustomMiddleware', middlewares)
|
|
41
|
-
|
|
42
|
-
# 检查合并后的顺序是否正确
|
|
43
|
-
default_index = middlewares.index('crawlo.middleware.request_ignore.RequestIgnoreMiddleware')
|
|
44
|
-
custom_index = middlewares.index('myproject.middlewares.CustomMiddleware')
|
|
45
|
-
self.assertLess(default_index, custom_index)
|
|
46
|
-
|
|
47
|
-
def test_pipeline_merge(self):
|
|
48
|
-
"""测试管道配置合并"""
|
|
49
|
-
# 用户自定义配置
|
|
50
|
-
user_config = {
|
|
51
|
-
'PIPELINES': [
|
|
52
|
-
'myproject.pipelines.CustomPipeline',
|
|
53
|
-
]
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
settings = SettingManager(user_config)
|
|
57
|
-
|
|
58
|
-
# 获取合并后的管道列表
|
|
59
|
-
pipelines = settings.get('PIPELINES')
|
|
60
|
-
|
|
61
|
-
# 检查默认管道是否存在
|
|
62
|
-
self.assertIn('crawlo.pipelines.console_pipeline.ConsolePipeline', pipelines)
|
|
63
|
-
|
|
64
|
-
# 检查自定义管道是否存在
|
|
65
|
-
self.assertIn('myproject.pipelines.CustomPipeline', pipelines)
|
|
66
|
-
|
|
67
|
-
# 检查去重管道是否在开头
|
|
68
|
-
dedup_pipeline = settings.get('DEFAULT_DEDUP_PIPELINE')
|
|
69
|
-
self.assertEqual(pipelines[0], dedup_pipeline)
|
|
70
|
-
self.assertEqual(dedup_pipeline, 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline')
|
|
71
|
-
|
|
72
|
-
def test_extension_merge(self):
|
|
73
|
-
"""测试扩展配置合并"""
|
|
74
|
-
# 用户自定义配置
|
|
75
|
-
user_config = {
|
|
76
|
-
'EXTENSIONS': [
|
|
77
|
-
'myproject.extensions.CustomExtension',
|
|
78
|
-
]
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
settings = SettingManager(user_config)
|
|
82
|
-
|
|
83
|
-
# 获取合并后的扩展列表
|
|
84
|
-
extensions = settings.get('EXTENSIONS')
|
|
85
|
-
|
|
86
|
-
# 检查默认扩展是否存在
|
|
87
|
-
self.assertIn('crawlo.extension.log_interval.LogIntervalExtension', extensions)
|
|
88
|
-
self.assertIn('crawlo.extension.log_stats.LogStats', extensions)
|
|
89
|
-
self.assertIn('crawlo.extension.logging_extension.CustomLoggerExtension', extensions)
|
|
90
|
-
|
|
91
|
-
# 检查自定义扩展是否存在
|
|
92
|
-
self.assertIn('myproject.extensions.CustomExtension', extensions)
|
|
93
|
-
|
|
94
|
-
# 检查合并后的顺序是否正确
|
|
95
|
-
default_index = extensions.index('crawlo.extension.log_interval.LogIntervalExtension')
|
|
96
|
-
custom_index = extensions.index('myproject.extensions.CustomExtension')
|
|
97
|
-
self.assertLess(default_index, custom_index)
|
|
98
|
-
|
|
99
|
-
def test_empty_custom_config(self):
|
|
100
|
-
"""测试空自定义配置"""
|
|
101
|
-
# 空用户配置
|
|
102
|
-
user_config = {}
|
|
103
|
-
|
|
104
|
-
settings = SettingManager(user_config)
|
|
105
|
-
|
|
106
|
-
# 获取合并后的中间件列表
|
|
107
|
-
middlewares = settings.get('MIDDLEWARES')
|
|
108
|
-
|
|
109
|
-
# 检查默认中间件是否存在
|
|
110
|
-
self.assertIn('crawlo.middleware.request_ignore.RequestIgnoreMiddleware', middlewares)
|
|
111
|
-
self.assertIn('crawlo.middleware.download_delay.DownloadDelayMiddleware', middlewares)
|
|
112
|
-
|
|
113
|
-
# 检查管道和扩展
|
|
114
|
-
pipelines = settings.get('PIPELINES')
|
|
115
|
-
self.assertIn('crawlo.pipelines.console_pipeline.ConsolePipeline', pipelines)
|
|
116
|
-
|
|
117
|
-
extensions = settings.get('EXTENSIONS')
|
|
118
|
-
self.assertIn('crawlo.extension.log_interval.LogIntervalExtension', extensions)
|
|
119
|
-
|
|
120
|
-
def test_no_custom_config(self):
|
|
121
|
-
"""测试无自定义配置"""
|
|
122
|
-
# 无用户配置
|
|
123
|
-
settings = SettingManager()
|
|
124
|
-
|
|
125
|
-
# 获取合并后的中间件列表
|
|
126
|
-
middlewares = settings.get('MIDDLEWARES')
|
|
127
|
-
|
|
128
|
-
# 检查默认中间件是否存在
|
|
129
|
-
self.assertIn('crawlo.middleware.request_ignore.RequestIgnoreMiddleware', middlewares)
|
|
130
|
-
self.assertIn('crawlo.middleware.download_delay.DownloadDelayMiddleware', middlewares)
|
|
131
|
-
|
|
132
|
-
# 检查管道和扩展
|
|
133
|
-
pipelines = settings.get('PIPELINES')
|
|
134
|
-
self.assertIn('crawlo.pipelines.console_pipeline.ConsolePipeline', pipelines)
|
|
135
|
-
|
|
136
|
-
extensions = settings.get('EXTENSIONS')
|
|
137
|
-
self.assertIn('crawlo.extension.log_interval.LogIntervalExtension', extensions)
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
def main():
|
|
141
|
-
"""主测试函数"""
|
|
142
|
-
print("开始配置合并测试...")
|
|
143
|
-
print("=" * 50)
|
|
144
|
-
|
|
145
|
-
# 运行测试
|
|
146
|
-
unittest.main(argv=['first-arg-is-ignored'], exit=False, verbosity=2)
|
|
147
|
-
|
|
148
|
-
print("=" * 50)
|
|
149
|
-
print("配置合并测试完成")
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
if __name__ == "__main__":
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
配置合并测试
|
|
5
|
+
测试 Crawlo 框架的配置合并功能
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
import os
|
|
10
|
+
import unittest
|
|
11
|
+
|
|
12
|
+
# 添加项目根目录到路径
|
|
13
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
14
|
+
|
|
15
|
+
from crawlo.settings.setting_manager import SettingManager
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TestConfigMerge(unittest.TestCase):
|
|
19
|
+
"""配置合并测试类"""
|
|
20
|
+
|
|
21
|
+
def test_middleware_merge(self):
|
|
22
|
+
"""测试中间件配置合并"""
|
|
23
|
+
# 用户自定义配置
|
|
24
|
+
user_config = {
|
|
25
|
+
'MIDDLEWARES': [
|
|
26
|
+
'myproject.middlewares.CustomMiddleware',
|
|
27
|
+
]
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
settings = SettingManager(user_config)
|
|
31
|
+
|
|
32
|
+
# 获取合并后的中间件列表
|
|
33
|
+
middlewares = settings.get('MIDDLEWARES')
|
|
34
|
+
|
|
35
|
+
# 检查默认中间件是否存在
|
|
36
|
+
self.assertIn('crawlo.middleware.request_ignore.RequestIgnoreMiddleware', middlewares)
|
|
37
|
+
self.assertIn('crawlo.middleware.download_delay.DownloadDelayMiddleware', middlewares)
|
|
38
|
+
|
|
39
|
+
# 检查自定义中间件是否存在
|
|
40
|
+
self.assertIn('myproject.middlewares.CustomMiddleware', middlewares)
|
|
41
|
+
|
|
42
|
+
# 检查合并后的顺序是否正确
|
|
43
|
+
default_index = middlewares.index('crawlo.middleware.request_ignore.RequestIgnoreMiddleware')
|
|
44
|
+
custom_index = middlewares.index('myproject.middlewares.CustomMiddleware')
|
|
45
|
+
self.assertLess(default_index, custom_index)
|
|
46
|
+
|
|
47
|
+
def test_pipeline_merge(self):
|
|
48
|
+
"""测试管道配置合并"""
|
|
49
|
+
# 用户自定义配置
|
|
50
|
+
user_config = {
|
|
51
|
+
'PIPELINES': [
|
|
52
|
+
'myproject.pipelines.CustomPipeline',
|
|
53
|
+
]
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
settings = SettingManager(user_config)
|
|
57
|
+
|
|
58
|
+
# 获取合并后的管道列表
|
|
59
|
+
pipelines = settings.get('PIPELINES')
|
|
60
|
+
|
|
61
|
+
# 检查默认管道是否存在
|
|
62
|
+
self.assertIn('crawlo.pipelines.console_pipeline.ConsolePipeline', pipelines)
|
|
63
|
+
|
|
64
|
+
# 检查自定义管道是否存在
|
|
65
|
+
self.assertIn('myproject.pipelines.CustomPipeline', pipelines)
|
|
66
|
+
|
|
67
|
+
# 检查去重管道是否在开头
|
|
68
|
+
dedup_pipeline = settings.get('DEFAULT_DEDUP_PIPELINE')
|
|
69
|
+
self.assertEqual(pipelines[0], dedup_pipeline)
|
|
70
|
+
self.assertEqual(dedup_pipeline, 'crawlo.pipelines.memory_dedup_pipeline.MemoryDedupPipeline')
|
|
71
|
+
|
|
72
|
+
def test_extension_merge(self):
|
|
73
|
+
"""测试扩展配置合并"""
|
|
74
|
+
# 用户自定义配置
|
|
75
|
+
user_config = {
|
|
76
|
+
'EXTENSIONS': [
|
|
77
|
+
'myproject.extensions.CustomExtension',
|
|
78
|
+
]
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
settings = SettingManager(user_config)
|
|
82
|
+
|
|
83
|
+
# 获取合并后的扩展列表
|
|
84
|
+
extensions = settings.get('EXTENSIONS')
|
|
85
|
+
|
|
86
|
+
# 检查默认扩展是否存在
|
|
87
|
+
self.assertIn('crawlo.extension.log_interval.LogIntervalExtension', extensions)
|
|
88
|
+
self.assertIn('crawlo.extension.log_stats.LogStats', extensions)
|
|
89
|
+
self.assertIn('crawlo.extension.logging_extension.CustomLoggerExtension', extensions)
|
|
90
|
+
|
|
91
|
+
# 检查自定义扩展是否存在
|
|
92
|
+
self.assertIn('myproject.extensions.CustomExtension', extensions)
|
|
93
|
+
|
|
94
|
+
# 检查合并后的顺序是否正确
|
|
95
|
+
default_index = extensions.index('crawlo.extension.log_interval.LogIntervalExtension')
|
|
96
|
+
custom_index = extensions.index('myproject.extensions.CustomExtension')
|
|
97
|
+
self.assertLess(default_index, custom_index)
|
|
98
|
+
|
|
99
|
+
def test_empty_custom_config(self):
|
|
100
|
+
"""测试空自定义配置"""
|
|
101
|
+
# 空用户配置
|
|
102
|
+
user_config = {}
|
|
103
|
+
|
|
104
|
+
settings = SettingManager(user_config)
|
|
105
|
+
|
|
106
|
+
# 获取合并后的中间件列表
|
|
107
|
+
middlewares = settings.get('MIDDLEWARES')
|
|
108
|
+
|
|
109
|
+
# 检查默认中间件是否存在
|
|
110
|
+
self.assertIn('crawlo.middleware.request_ignore.RequestIgnoreMiddleware', middlewares)
|
|
111
|
+
self.assertIn('crawlo.middleware.download_delay.DownloadDelayMiddleware', middlewares)
|
|
112
|
+
|
|
113
|
+
# 检查管道和扩展
|
|
114
|
+
pipelines = settings.get('PIPELINES')
|
|
115
|
+
self.assertIn('crawlo.pipelines.console_pipeline.ConsolePipeline', pipelines)
|
|
116
|
+
|
|
117
|
+
extensions = settings.get('EXTENSIONS')
|
|
118
|
+
self.assertIn('crawlo.extension.log_interval.LogIntervalExtension', extensions)
|
|
119
|
+
|
|
120
|
+
def test_no_custom_config(self):
|
|
121
|
+
"""测试无自定义配置"""
|
|
122
|
+
# 无用户配置
|
|
123
|
+
settings = SettingManager()
|
|
124
|
+
|
|
125
|
+
# 获取合并后的中间件列表
|
|
126
|
+
middlewares = settings.get('MIDDLEWARES')
|
|
127
|
+
|
|
128
|
+
# 检查默认中间件是否存在
|
|
129
|
+
self.assertIn('crawlo.middleware.request_ignore.RequestIgnoreMiddleware', middlewares)
|
|
130
|
+
self.assertIn('crawlo.middleware.download_delay.DownloadDelayMiddleware', middlewares)
|
|
131
|
+
|
|
132
|
+
# 检查管道和扩展
|
|
133
|
+
pipelines = settings.get('PIPELINES')
|
|
134
|
+
self.assertIn('crawlo.pipelines.console_pipeline.ConsolePipeline', pipelines)
|
|
135
|
+
|
|
136
|
+
extensions = settings.get('EXTENSIONS')
|
|
137
|
+
self.assertIn('crawlo.extension.log_interval.LogIntervalExtension', extensions)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def main():
|
|
141
|
+
"""主测试函数"""
|
|
142
|
+
print("开始配置合并测试...")
|
|
143
|
+
print("=" * 50)
|
|
144
|
+
|
|
145
|
+
# 运行测试
|
|
146
|
+
unittest.main(argv=['first-arg-is-ignored'], exit=False, verbosity=2)
|
|
147
|
+
|
|
148
|
+
print("=" * 50)
|
|
149
|
+
print("配置合并测试完成")
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
if __name__ == "__main__":
|
|
153
153
|
main()
|