crawlo 1.4.2__py3-none-any.whl → 1.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +93 -93
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -341
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +438 -439
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +291 -257
- crawlo/crawler.py +650 -650
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +63 -63
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +61 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +103 -103
- crawlo/factories/registry.py +84 -84
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -257
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +425 -425
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +193 -193
- crawlo/initialization/phases.py +148 -148
- crawlo/initialization/registry.py +145 -145
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +45 -37
- crawlo/logging/async_handler.py +181 -0
- crawlo/logging/config.py +196 -96
- crawlo/logging/factory.py +171 -128
- crawlo/logging/manager.py +111 -111
- crawlo/logging/monitor.py +153 -0
- crawlo/logging/sampler.py +167 -0
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +219 -219
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +325 -325
- crawlo/pipelines/pipeline_manager.py +100 -84
- crawlo/pipelines/redis_dedup_pipeline.py +156 -156
- crawlo/project.py +349 -338
- crawlo/queue/pqueue.py +42 -42
- crawlo/queue/queue_manager.py +526 -522
- crawlo/queue/redis_priority_queue.py +370 -367
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +73 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +170 -170
- crawlo/templates/project/settings_distributed.py.tmpl +169 -169
- crawlo/templates/project/settings_gentle.py.tmpl +166 -166
- crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
- crawlo/templates/project/settings_minimal.py.tmpl +65 -65
- crawlo/templates/project/settings_simple.py.tmpl +164 -164
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +34 -34
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +9 -9
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +364 -364
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +25 -25
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -165
- crawlo/utils/fingerprint.py +122 -122
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +79 -79
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.4.3.dist-info/METADATA +190 -0
- crawlo-1.4.3.dist-info/RECORD +326 -0
- examples/__init__.py +7 -7
- examples/test_project/__init__.py +7 -7
- examples/test_project/run.py +34 -34
- examples/test_project/test_project/__init__.py +3 -3
- examples/test_project/test_project/items.py +17 -17
- examples/test_project/test_project/middlewares.py +118 -118
- examples/test_project/test_project/pipelines.py +96 -96
- examples/test_project/test_project/settings.py +169 -169
- examples/test_project/test_project/spiders/__init__.py +9 -9
- examples/test_project/test_project/spiders/of_week_dis.py +143 -143
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +106 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +245 -245
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +127 -127
- tests/simple_log_test.py +57 -57
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_spider_test.py +49 -49
- tests/simple_test.py +47 -47
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +174 -174
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +125 -0
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +66 -66
- tests/test_framework_startup.py +64 -64
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +112 -112
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +375 -0
- tests/test_logging_final.py +185 -0
- tests/test_logging_integration.py +313 -0
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +142 -0
- tests/test_mode_change.py +72 -72
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +212 -0
- tests/test_priority_consistency.py +152 -0
- tests/test_priority_consistency_fixed.py +250 -0
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +131 -0
- tests/test_random_headers_default.py +323 -0
- tests/test_random_headers_necessity.py +309 -0
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +130 -0
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +334 -242
- tests/test_retry_middleware_realistic.py +274 -0
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +177 -0
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- crawlo-1.4.2.dist-info/METADATA +0 -1199
- crawlo-1.4.2.dist-info/RECORD +0 -309
- {crawlo-1.4.2.dist-info → crawlo-1.4.3.dist-info}/WHEEL +0 -0
- {crawlo-1.4.2.dist-info → crawlo-1.4.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.2.dist-info → crawlo-1.4.3.dist-info}/top_level.txt +0 -0
|
@@ -1,199 +1,199 @@
|
|
|
1
|
-
#!/usr/bin/python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
"""
|
|
4
|
-
Redis Key 验证工具
|
|
5
|
-
=================
|
|
6
|
-
提供 Redis Key 命名规范的验证功能
|
|
7
|
-
"""
|
|
8
|
-
from typing import List, Tuple
|
|
9
|
-
|
|
10
|
-
from crawlo.utils.log import get_logger
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class RedisKeyValidator:
|
|
14
|
-
"""Redis Key 验证器"""
|
|
15
|
-
|
|
16
|
-
def __init__(self):
|
|
17
|
-
self.logger = get_logger(self.__class__.__name__)
|
|
18
|
-
|
|
19
|
-
def validate_key_naming(self, key: str, project_name: str = None) -> bool:
|
|
20
|
-
"""
|
|
21
|
-
验证Redis Key是否符合命名规范
|
|
22
|
-
|
|
23
|
-
Args:
|
|
24
|
-
key: Redis Key
|
|
25
|
-
project_name: 项目名称(可选)
|
|
26
|
-
|
|
27
|
-
Returns:
|
|
28
|
-
bool: 是否符合命名规范
|
|
29
|
-
"""
|
|
30
|
-
if not isinstance(key, str) or not key:
|
|
31
|
-
return False
|
|
32
|
-
|
|
33
|
-
# 检查是否以 crawlo: 开头
|
|
34
|
-
if not key.startswith('crawlo:'):
|
|
35
|
-
return False
|
|
36
|
-
|
|
37
|
-
# 分割Key部分
|
|
38
|
-
parts = key.split(':')
|
|
39
|
-
if len(parts) < 3:
|
|
40
|
-
return False
|
|
41
|
-
|
|
42
|
-
# 检查基本结构
|
|
43
|
-
if parts[0] != 'crawlo':
|
|
44
|
-
return False
|
|
45
|
-
|
|
46
|
-
# 如果提供了项目名称,检查是否匹配
|
|
47
|
-
if project_name and parts[1] != project_name:
|
|
48
|
-
return False
|
|
49
|
-
|
|
50
|
-
# 检查组件类型
|
|
51
|
-
valid_components = ['filter', 'queue', 'item']
|
|
52
|
-
if parts[2] not in valid_components:
|
|
53
|
-
return False
|
|
54
|
-
|
|
55
|
-
# 检查子组件(根据组件类型)
|
|
56
|
-
if parts[2] == 'queue':
|
|
57
|
-
valid_subcomponents = ['requests', 'processing', 'failed']
|
|
58
|
-
if len(parts) < 4 or parts[3] not in valid_subcomponents:
|
|
59
|
-
return False
|
|
60
|
-
elif parts[2] == 'filter':
|
|
61
|
-
if len(parts) < 4 or parts[3] != 'fingerprint':
|
|
62
|
-
return False
|
|
63
|
-
elif parts[2] == 'item':
|
|
64
|
-
if len(parts) < 4 or parts[3] != 'fingerprint':
|
|
65
|
-
return False
|
|
66
|
-
|
|
67
|
-
return True
|
|
68
|
-
|
|
69
|
-
def validate_multiple_keys(self, keys: List[str], project_name: str = None) -> Tuple[bool, List[str]]:
|
|
70
|
-
"""
|
|
71
|
-
验证多个Redis Key
|
|
72
|
-
|
|
73
|
-
Args:
|
|
74
|
-
keys: Redis Key列表
|
|
75
|
-
project_name: 项目名称(可选)
|
|
76
|
-
|
|
77
|
-
Returns:
|
|
78
|
-
Tuple[bool, List[str]]: (是否全部有效, 无效的Key列表)
|
|
79
|
-
"""
|
|
80
|
-
invalid_keys = []
|
|
81
|
-
for key in keys:
|
|
82
|
-
if not self.validate_key_naming(key, project_name):
|
|
83
|
-
invalid_keys.append(key)
|
|
84
|
-
|
|
85
|
-
return len(invalid_keys) == 0, invalid_keys
|
|
86
|
-
|
|
87
|
-
def get_key_info(self, key: str) -> dict:
|
|
88
|
-
"""
|
|
89
|
-
获取Redis Key的信息
|
|
90
|
-
|
|
91
|
-
Args:
|
|
92
|
-
key: Redis Key
|
|
93
|
-
|
|
94
|
-
Returns:
|
|
95
|
-
dict: Key信息
|
|
96
|
-
"""
|
|
97
|
-
if not self.validate_key_naming(key):
|
|
98
|
-
return {
|
|
99
|
-
'valid': False,
|
|
100
|
-
'error': 'Key不符合命名规范'
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
parts = key.split(':')
|
|
104
|
-
info = {
|
|
105
|
-
'valid': True,
|
|
106
|
-
'framework': parts[0],
|
|
107
|
-
'project': parts[1],
|
|
108
|
-
'component': parts[2]
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
if parts[2] == 'queue' and len(parts) >= 4:
|
|
112
|
-
info['sub_component'] = parts[3]
|
|
113
|
-
elif len(parts) >= 4:
|
|
114
|
-
info['sub_component'] = parts[3]
|
|
115
|
-
|
|
116
|
-
return info
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
# 便利函数
|
|
120
|
-
def validate_redis_key_naming(key: str, project_name: str = None) -> bool:
|
|
121
|
-
"""
|
|
122
|
-
验证Redis Key是否符合命名规范(便利函数)
|
|
123
|
-
|
|
124
|
-
Args:
|
|
125
|
-
key: Redis Key
|
|
126
|
-
project_name: 项目名称(可选)
|
|
127
|
-
|
|
128
|
-
Returns:
|
|
129
|
-
bool: 是否符合命名规范
|
|
130
|
-
"""
|
|
131
|
-
validator = RedisKeyValidator()
|
|
132
|
-
return validator.validate_key_naming(key, project_name)
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
def validate_multiple_redis_keys(keys: List[str], project_name: str = None) -> Tuple[bool, List[str]]:
|
|
136
|
-
"""
|
|
137
|
-
验证多个Redis Key(便利函数)
|
|
138
|
-
|
|
139
|
-
Args:
|
|
140
|
-
keys: Redis Key列表
|
|
141
|
-
project_name: 项目名称(可选)
|
|
142
|
-
|
|
143
|
-
Returns:
|
|
144
|
-
Tuple[bool, List[str]]: (是否全部有效, 无效的Key列表)
|
|
145
|
-
"""
|
|
146
|
-
validator = RedisKeyValidator()
|
|
147
|
-
return validator.validate_multiple_keys(keys, project_name)
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
def get_redis_key_info(key: str) -> dict:
|
|
151
|
-
"""
|
|
152
|
-
获取Redis Key的信息(便利函数)
|
|
153
|
-
|
|
154
|
-
Args:
|
|
155
|
-
key: Redis Key
|
|
156
|
-
|
|
157
|
-
Returns:
|
|
158
|
-
dict: Key信息
|
|
159
|
-
"""
|
|
160
|
-
validator = RedisKeyValidator()
|
|
161
|
-
return validator.get_key_info(key)
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
def print_validation_report(keys: List[str], project_name: str = None):
|
|
165
|
-
"""
|
|
166
|
-
打印Redis Key验证报告
|
|
167
|
-
|
|
168
|
-
Args:
|
|
169
|
-
keys: Redis Key列表
|
|
170
|
-
project_name: 项目名称(可选)
|
|
171
|
-
"""
|
|
172
|
-
validator = RedisKeyValidator()
|
|
173
|
-
is_valid, invalid_keys = validator.validate_multiple_keys(keys, project_name)
|
|
174
|
-
|
|
175
|
-
print("=" * 50)
|
|
176
|
-
print("Redis Key 命名规范验证报告")
|
|
177
|
-
print("=" * 50)
|
|
178
|
-
|
|
179
|
-
if is_valid:
|
|
180
|
-
print("所有Redis Key命名规范验证通过")
|
|
181
|
-
else:
|
|
182
|
-
print("发现不符合命名规范的Redis Key:")
|
|
183
|
-
for key in invalid_keys:
|
|
184
|
-
print(f" - {key}")
|
|
185
|
-
|
|
186
|
-
print("\nKey 详细信息:")
|
|
187
|
-
for key in keys:
|
|
188
|
-
info = validator.get_key_info(key)
|
|
189
|
-
if info['valid']:
|
|
190
|
-
print(f" {key}")
|
|
191
|
-
print(f" 框架: {info['framework']}")
|
|
192
|
-
print(f" 项目: {info['project']}")
|
|
193
|
-
print(f" 组件: {info['component']}")
|
|
194
|
-
if 'sub_component' in info:
|
|
195
|
-
print(f" 子组件: {info['sub_component']}")
|
|
196
|
-
else:
|
|
197
|
-
print(f" {key} - {info.get('error', '无效')}")
|
|
198
|
-
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Redis Key 验证工具
|
|
5
|
+
=================
|
|
6
|
+
提供 Redis Key 命名规范的验证功能
|
|
7
|
+
"""
|
|
8
|
+
from typing import List, Tuple
|
|
9
|
+
|
|
10
|
+
from crawlo.utils.log import get_logger
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RedisKeyValidator:
    """Validator for the Redis key naming convention used by this module.

    A key is accepted when it has the colon-separated layout
    ``crawlo:<project>:<component>:<sub_component>[:...]`` where:

    * the first segment is exactly ``crawlo``;
    * ``<project>`` is non-empty (and equals ``project_name`` when one is given);
    * ``<component>`` is ``queue``, ``filter`` or ``item``;
    * ``<sub_component>`` is one of the values allowed for that component.

    Segments after the fourth one are not inspected.
    """

    # First segment every key must carry.
    _FRAMEWORK = 'crawlo'

    # Allowed fourth segment, keyed by the (allowed) third segment.
    _SUBCOMPONENTS = {
        'queue': ('requests', 'processing', 'failed'),
        'filter': ('fingerprint',),
        'item': ('fingerprint',),
    }

    def __init__(self):
        self.logger = get_logger(self.__class__.__name__)

    def validate_key_naming(self, key: str, project_name: str = None) -> bool:
        """Return whether ``key`` follows the naming convention.

        Args:
            key: Redis key to check. Non-string or empty values are rejected.
            project_name: Optional project name. When truthy, the second
                segment of the key must equal it.

        Returns:
            bool: ``True`` if the key matches the layout described on the
            class, ``False`` otherwise.
        """
        if not isinstance(key, str) or not key:
            return False

        parts = key.split(':')
        # Every accepted component requires a sub-component, so a valid key
        # always has at least four segments, the first being the framework.
        if len(parts) < 4 or parts[0] != self._FRAMEWORK:
            return False

        project, component, sub_component = parts[1], parts[2], parts[3]

        # An empty project segment (e.g. "crawlo::queue:requests") is not a
        # usable project namespace, so it is rejected.
        if not project:
            return False

        if project_name and project != project_name:
            return False

        allowed = self._SUBCOMPONENTS.get(component)
        return allowed is not None and sub_component in allowed

    def validate_multiple_keys(self, keys: List[str], project_name: str = None) -> Tuple[bool, List[str]]:
        """Validate several keys at once.

        Args:
            keys: Redis keys to check.
            project_name: Optional project name applied to every key.

        Returns:
            Tuple[bool, List[str]]: ``(all_valid, invalid_keys)`` where
            ``invalid_keys`` keeps the input order.
        """
        invalid_keys = [
            key for key in keys
            if not self.validate_key_naming(key, project_name)
        ]
        return not invalid_keys, invalid_keys

    def get_key_info(self, key: str) -> dict:
        """Split a key into its named segments.

        The key is validated without a project name.

        Args:
            key: Redis key to describe.

        Returns:
            dict: ``{'valid': False, 'error': ...}`` for a key that fails
            validation; otherwise a dict with ``valid``, ``framework``,
            ``project``, ``component`` and ``sub_component`` entries.
        """
        if not self.validate_key_naming(key):
            return {
                'valid': False,
                'error': 'Key不符合命名规范'
            }

        # A validated key always has at least four segments.
        framework, project, component, sub_component = key.split(':')[:4]
        return {
            'valid': True,
            'framework': framework,
            'project': project,
            'component': component,
            'sub_component': sub_component,
        }
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# 便利函数
|
|
120
|
+
def validate_redis_key_naming(key: str, project_name: str = None) -> bool:
    """Check one Redis key against the naming convention (convenience wrapper).

    Creates a fresh ``RedisKeyValidator`` and forwards both arguments to its
    ``validate_key_naming`` method.

    Args:
        key: Redis key to check.
        project_name: Optional project name forwarded to the validator.

    Returns:
        bool: The result of ``RedisKeyValidator.validate_key_naming``.
    """
    return RedisKeyValidator().validate_key_naming(key, project_name)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def validate_multiple_redis_keys(keys: List[str], project_name: str = None) -> Tuple[bool, List[str]]:
    """Check several Redis keys against the naming convention (convenience wrapper).

    Creates a fresh ``RedisKeyValidator`` and forwards both arguments to its
    ``validate_multiple_keys`` method.

    Args:
        keys: Redis keys to check.
        project_name: Optional project name forwarded to the validator.

    Returns:
        Tuple[bool, List[str]]: ``(all_valid, invalid_keys)`` as returned by
        ``RedisKeyValidator.validate_multiple_keys``.
    """
    return RedisKeyValidator().validate_multiple_keys(keys, project_name)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def get_redis_key_info(key: str) -> dict:
    """Describe the segments of a Redis key (convenience wrapper).

    Creates a fresh ``RedisKeyValidator`` and forwards ``key`` to its
    ``get_key_info`` method.

    Args:
        key: Redis key to describe.

    Returns:
        dict: The info dict returned by ``RedisKeyValidator.get_key_info``.
    """
    return RedisKeyValidator().get_key_info(key)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def print_validation_report(keys: List[str], project_name: str = None):
    """Print a naming-convention report for ``keys`` to standard output.

    The report has a banner, a summary (either an "all passed" line or the
    list of invalid keys), and a per-key detail section, followed by a
    closing separator line.

    Args:
        keys: Redis keys to report on.
        project_name: Optional project name every key is expected to carry.
    """
    validator = RedisKeyValidator()
    is_valid, invalid_keys = validator.validate_multiple_keys(keys, project_name)

    separator = "=" * 50
    print(separator)
    print("Redis Key 命名规范验证报告")
    print(separator)

    if is_valid:
        print("所有Redis Key命名规范验证通过")
    else:
        print("发现不符合命名规范的Redis Key:")
        for key in invalid_keys:
            print(f" - {key}")

    print("\nKey 详细信息:")
    for key in keys:
        info = validator.get_key_info(key)
        # get_key_info() validates without project_name, so a key that only
        # fails the project check would be listed as invalid in the summary
        # yet shown as valid here. Re-check with project_name to keep both
        # sections of the report consistent.
        if info['valid'] and validator.validate_key_naming(key, project_name):
            print(f" {key}")
            print(f" 框架: {info['framework']}")
            print(f" 项目: {info['project']}")
            print(f" 组件: {info['component']}")
            if 'sub_component' in info:
                print(f" 子组件: {info['sub_component']}")
        else:
            print(f" {key} - {info.get('error', '无效')}")

    print(separator)
|