crawlo 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (115) hide show
  1. crawlo/__init__.py +28 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +61 -0
  4. crawlo/cleaners/data_formatter.py +226 -0
  5. crawlo/cleaners/encoding_converter.py +126 -0
  6. crawlo/cleaners/text_cleaner.py +233 -0
  7. crawlo/commands/startproject.py +117 -13
  8. crawlo/config.py +30 -0
  9. crawlo/config_validator.py +253 -0
  10. crawlo/core/engine.py +185 -11
  11. crawlo/core/scheduler.py +49 -78
  12. crawlo/crawler.py +6 -6
  13. crawlo/downloader/__init__.py +24 -0
  14. crawlo/downloader/aiohttp_downloader.py +8 -0
  15. crawlo/downloader/cffi_downloader.py +5 -0
  16. crawlo/downloader/hybrid_downloader.py +214 -0
  17. crawlo/downloader/playwright_downloader.py +403 -0
  18. crawlo/downloader/selenium_downloader.py +473 -0
  19. crawlo/extension/__init__.py +17 -10
  20. crawlo/extension/health_check.py +142 -0
  21. crawlo/extension/log_interval.py +27 -18
  22. crawlo/extension/log_stats.py +62 -24
  23. crawlo/extension/logging_extension.py +18 -9
  24. crawlo/extension/memory_monitor.py +105 -0
  25. crawlo/extension/performance_profiler.py +134 -0
  26. crawlo/extension/request_recorder.py +108 -0
  27. crawlo/filters/aioredis_filter.py +50 -12
  28. crawlo/middleware/proxy.py +26 -2
  29. crawlo/mode_manager.py +24 -19
  30. crawlo/network/request.py +30 -3
  31. crawlo/network/response.py +114 -25
  32. crawlo/pipelines/mongo_pipeline.py +81 -66
  33. crawlo/pipelines/mysql_pipeline.py +165 -43
  34. crawlo/pipelines/redis_dedup_pipeline.py +7 -3
  35. crawlo/queue/queue_manager.py +15 -2
  36. crawlo/queue/redis_priority_queue.py +144 -76
  37. crawlo/settings/default_settings.py +93 -121
  38. crawlo/subscriber.py +62 -37
  39. crawlo/templates/project/items.py.tmpl +1 -1
  40. crawlo/templates/project/middlewares.py.tmpl +73 -49
  41. crawlo/templates/project/pipelines.py.tmpl +51 -295
  42. crawlo/templates/project/settings.py.tmpl +93 -17
  43. crawlo/templates/project/settings_distributed.py.tmpl +120 -0
  44. crawlo/templates/project/settings_gentle.py.tmpl +95 -0
  45. crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
  46. crawlo/templates/project/settings_simple.py.tmpl +69 -0
  47. crawlo/templates/spider/spider.py.tmpl +2 -38
  48. crawlo/tools/__init__.py +183 -0
  49. crawlo/tools/anti_crawler.py +269 -0
  50. crawlo/tools/authenticated_proxy.py +241 -0
  51. crawlo/tools/data_validator.py +181 -0
  52. crawlo/tools/date_tools.py +36 -0
  53. crawlo/tools/distributed_coordinator.py +387 -0
  54. crawlo/tools/retry_mechanism.py +221 -0
  55. crawlo/tools/scenario_adapter.py +263 -0
  56. crawlo/utils/__init__.py +29 -1
  57. crawlo/utils/batch_processor.py +261 -0
  58. crawlo/utils/date_tools.py +58 -1
  59. crawlo/utils/enhanced_error_handler.py +360 -0
  60. crawlo/utils/env_config.py +106 -0
  61. crawlo/utils/error_handler.py +126 -0
  62. crawlo/utils/performance_monitor.py +285 -0
  63. crawlo/utils/redis_connection_pool.py +335 -0
  64. crawlo/utils/redis_key_validator.py +200 -0
  65. crawlo-1.1.5.dist-info/METADATA +401 -0
  66. crawlo-1.1.5.dist-info/RECORD +185 -0
  67. tests/advanced_tools_example.py +276 -0
  68. tests/authenticated_proxy_example.py +237 -0
  69. tests/cleaners_example.py +161 -0
  70. tests/config_validation_demo.py +103 -0
  71. tests/date_tools_example.py +181 -0
  72. tests/dynamic_loading_example.py +524 -0
  73. tests/dynamic_loading_test.py +105 -0
  74. tests/env_config_example.py +134 -0
  75. tests/error_handling_example.py +172 -0
  76. tests/redis_key_validation_demo.py +131 -0
  77. tests/response_improvements_example.py +145 -0
  78. tests/test_advanced_tools.py +149 -0
  79. tests/test_all_redis_key_configs.py +146 -0
  80. tests/test_authenticated_proxy.py +142 -0
  81. tests/test_cleaners.py +55 -0
  82. tests/test_comprehensive.py +147 -0
  83. tests/test_config_validator.py +194 -0
  84. tests/test_date_tools.py +124 -0
  85. tests/test_dynamic_downloaders_proxy.py +125 -0
  86. tests/test_dynamic_proxy.py +93 -0
  87. tests/test_dynamic_proxy_config.py +147 -0
  88. tests/test_dynamic_proxy_real.py +110 -0
  89. tests/test_edge_cases.py +304 -0
  90. tests/test_enhanced_error_handler.py +271 -0
  91. tests/test_env_config.py +122 -0
  92. tests/test_error_handler_compatibility.py +113 -0
  93. tests/test_framework_env_usage.py +104 -0
  94. tests/test_integration.py +357 -0
  95. tests/test_item_dedup_redis_key.py +123 -0
  96. tests/test_parsel.py +30 -0
  97. tests/test_performance.py +328 -0
  98. tests/test_queue_manager_redis_key.py +177 -0
  99. tests/test_redis_connection_pool.py +295 -0
  100. tests/test_redis_key_naming.py +182 -0
  101. tests/test_redis_key_validator.py +124 -0
  102. tests/test_response_improvements.py +153 -0
  103. tests/test_simple_response.py +62 -0
  104. tests/test_telecom_spider_redis_key.py +206 -0
  105. tests/test_template_content.py +88 -0
  106. tests/test_template_redis_key.py +135 -0
  107. tests/test_tools.py +154 -0
  108. tests/tools_example.py +258 -0
  109. crawlo/core/enhanced_engine.py +0 -190
  110. crawlo-1.1.3.dist-info/METADATA +0 -635
  111. crawlo-1.1.3.dist-info/RECORD +0 -113
  112. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/WHEEL +0 -0
  113. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/entry_points.txt +0 -0
  114. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/top_level.txt +0 -0
  115. {examples → tests}/controlled_spider_example.py +0 -0
@@ -7,18 +7,73 @@
7
7
  """
8
8
  import shutil
9
9
  import re
10
+ import sys
11
+ import os
10
12
  from pathlib import Path
11
- from rich.console import Console
12
- from rich.panel import Panel
13
- from rich.text import Text
13
+ from typing import Optional
14
14
 
15
- from .utils import show_error_panel, show_success_panel
15
+ # 添加项目根目录到路径,以便能够导入utils模块
16
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
16
17
 
17
- # 初始化 rich 控制台
18
- console = Console()
18
+ try:
19
+ from rich.console import Console
20
+ from rich.panel import Panel
21
+ from rich.text import Text
22
+ from rich.table import Table
23
+ RICH_AVAILABLE = True
24
+ except ImportError:
25
+ RICH_AVAILABLE = False
26
+
27
+ try:
28
+ from .utils import show_error_panel, show_success_panel
29
+ UTILS_AVAILABLE = True
30
+ except ImportError:
31
+ # 如果相对导入失败,尝试绝对导入
32
+ try:
33
+ from crawlo.commands.utils import show_error_panel, show_success_panel
34
+ UTILS_AVAILABLE = True
35
+ except ImportError:
36
+ UTILS_AVAILABLE = False
37
+
38
+ # 初始化 rich 控制台(如果可用)
39
+ if RICH_AVAILABLE:
40
+ console = Console()
41
+ else:
42
+ # 简单的控制台输出替代
43
+ class Console:
44
+ def print(self, text):
45
+ print(text)
46
+ console = Console()
19
47
 
20
48
  TEMPLATES_DIR = Path(__file__).parent.parent / 'templates'
21
49
 
50
+ # 可用的模板类型
51
+ TEMPLATE_TYPES = {
52
+ 'default': '默认模板 - 通用配置,适合大多数项目',
53
+ 'simple': '简化模板 - 最小配置,适合快速开始',
54
+ 'distributed': '分布式模板 - 针对分布式爬取优化',
55
+ 'high-performance': '高性能模板 - 针对大规模高并发优化',
56
+ 'gentle': '温和模板 - 低负载配置,对目标网站友好'
57
+ }
58
+
59
+
60
def show_error_panel(title, content):
    """Display an error message.

    Uses a red-bordered rich ``Panel`` printed through the module-level
    ``console`` when ``RICH_AVAILABLE`` is true; otherwise falls back to two
    plain ``print`` calls (title line prefixed with a cross mark, then the
    content).

    Args:
        title: Heading shown on the panel border / first plain-text line.
        content: Body of the message.
    """
    if not RICH_AVAILABLE:
        # Plain-text fallback when rich could not be imported.
        print(f"❌ {title}")
        print(content)
        return

    from rich.panel import Panel
    error_panel = Panel(content, title=title, border_style="red")
    console.print(error_panel)
68
+
69
def show_success_panel(title, content):
    """Display a success message.

    Uses a green-bordered rich ``Panel`` printed through the module-level
    ``console`` when ``RICH_AVAILABLE`` is true; otherwise falls back to two
    plain ``print`` calls (title line prefixed with a check mark, then the
    content).

    Args:
        title: Heading shown on the panel border / first plain-text line.
        content: Body of the message.
    """
    if not RICH_AVAILABLE:
        # Plain-text fallback when rich could not be imported.
        print(f"✅ {title}")
        print(content)
        return

    from rich.panel import Panel
    success_panel = Panel(content, title=title, border_style="green")
    console.print(success_panel)
22
77
 
23
78
  def _render_template(tmpl_path, context):
24
79
  """读取模板文件,替换 {{key}} 为 context 中的值"""
@@ -29,7 +84,7 @@ def _render_template(tmpl_path, context):
29
84
  return content
30
85
 
31
86
 
32
- def _copytree_with_templates(src, dst, context):
87
+ def _copytree_with_templates(src, dst, context, template_type='default'):
33
88
  """
34
89
  递归复制目录,将 .tmpl 文件渲染后复制(去除 .tmpl 后缀),其他文件直接复制。
35
90
  """
@@ -45,7 +100,19 @@ def _copytree_with_templates(src, dst, context):
45
100
  dst_item.mkdir(parents=True, exist_ok=True)
46
101
  else:
47
102
  if item.suffix == '.tmpl':
48
- rendered_content = _render_template(item, context)
103
+ # 处理特定模板类型的设置文件
104
+ if item.name == 'settings.py.tmpl' and template_type != 'default':
105
+ # 使用特定模板类型的设置文件
106
+ template_file_name = f'settings_{template_type}.py.tmpl'
107
+ template_file_path = src_path / template_file_name
108
+ if template_file_path.exists():
109
+ rendered_content = _render_template(template_file_path, context)
110
+ else:
111
+ # 如果特定模板不存在,使用默认模板
112
+ rendered_content = _render_template(item, context)
113
+ else:
114
+ rendered_content = _render_template(item, context)
115
+
49
116
  final_dst = dst_item.with_suffix('')
50
117
  final_dst.parent.mkdir(parents=True, exist_ok=True)
51
118
  with open(final_dst, 'w', encoding='utf-8') as f:
@@ -100,16 +167,44 @@ def validate_project_name(project_name: str) -> tuple[bool, str]:
100
167
  return True, ""
101
168
 
102
169
 
170
def show_template_options():
    """List every entry of ``TEMPLATE_TYPES`` with its description.

    Renders a two-column rich ``Table`` when ``RICH_AVAILABLE`` is true, and
    a plain ``name: description`` listing otherwise.
    """
    if not RICH_AVAILABLE:
        # Plain-text fallback when rich could not be imported.
        print("可用模板类型:")
        for name, summary in TEMPLATE_TYPES.items():
            print(f" {name}: {summary}")
        return

    options = Table(title="可用模板类型", show_header=True, header_style="bold magenta")
    options.add_column("模板类型", style="cyan", no_wrap=True)
    options.add_column("描述", style="green")
    for name, summary in TEMPLATE_TYPES.items():
        options.add_row(name, summary)
    console.print(options)
185
+
186
+
103
187
  def main(args):
104
- if len(args) != 1:
105
- console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo startproject[/blue] <project_name>")
188
+ if len(args) < 1 or len(args) > 2:
189
+ console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo startproject[/blue] <project_name> [template_type]")
106
190
  console.print("💡 Examples:")
107
191
  console.print(" [blue]crawlo startproject[/blue] my_spider_project")
108
- console.print(" [blue]crawlo startproject[/blue] news_crawler")
109
- console.print(" [blue]crawlo startproject[/blue] ecommerce_spider")
192
+ console.print(" [blue]crawlo startproject[/blue] news_crawler simple")
193
+ console.print(" [blue]crawlo startproject[/blue] ecommerce_spider distributed")
194
+ show_template_options()
110
195
  return 1
111
196
 
112
197
  project_name = args[0]
198
+ template_type = args[1] if len(args) > 1 else 'default'
199
+
200
+ # 验证模板类型
201
+ if template_type not in TEMPLATE_TYPES:
202
+ show_error_panel(
203
+ "Invalid Template Type",
204
+ f"Template type '[cyan]{template_type}[/cyan]' is not supported.\n"
205
+ )
206
+ show_template_options()
207
+ return 1
113
208
 
114
209
  # 验证项目名称
115
210
  is_valid, error_msg = validate_project_name(project_name)
@@ -154,7 +249,7 @@ def main(args):
154
249
 
155
250
  # 3. 复制并渲染项目包内容
156
251
  package_dir = project_dir / project_name
157
- _copytree_with_templates(template_dir, package_dir, context)
252
+ _copytree_with_templates(template_dir, package_dir, context, template_type)
158
253
  console.print(f":white_check_mark: Created project package: [green]{package_dir}[/green]")
159
254
 
160
255
  # 4. 创建 logs 目录
@@ -168,6 +263,10 @@ def main(args):
168
263
  # 成功面板
169
264
  success_text = Text.from_markup(f"Project '[bold cyan]{project_name}[/bold cyan]' created successfully!")
170
265
  console.print(Panel(success_text, title=":rocket: Success", border_style="green", padding=(1, 2)))
266
+
267
+ # 显示使用的模板类型
268
+ if template_type != 'default':
269
+ console.print(f":information: 使用模板类型: [bold blue]{template_type}[/bold blue] - {TEMPLATE_TYPES[template_type]}")
171
270
 
172
271
  # 下一步操作提示(对齐美观 + 语法高亮)
173
272
  next_steps = f"""
@@ -194,3 +293,8 @@ def main(args):
194
293
  shutil.rmtree(project_dir, ignore_errors=True)
195
294
  console.print("[red]:cross_mark: Cleaned up partially created project.[/red]")
196
295
  return 1
296
+
297
+ if __name__ == "__main__":
298
+ import sys
299
+ exit_code = main(sys.argv[1:])
300
+ sys.exit(exit_code)
crawlo/config.py CHANGED
@@ -23,6 +23,7 @@ from typing import Dict, Any, Optional, Union
23
23
  import os
24
24
  from crawlo.mode_manager import ModeManager, standalone_mode, distributed_mode, auto_mode, from_env
25
25
  from crawlo.utils.log import get_logger
26
+ from crawlo.config_validator import validate_config
26
27
 
27
28
 
28
29
  class CrawloConfig:
@@ -31,6 +32,19 @@ class CrawloConfig:
31
32
  def __init__(self, settings: Dict[str, Any]):
32
33
  self.settings = settings
33
34
  self.logger = get_logger(self.__class__.__name__)
35
+ # 验证配置
36
+ self._validate_settings()
37
+
38
def _validate_settings(self):
    """Validate ``self.settings`` via ``validate_config``.

    Raises:
        ValueError: If validation reports errors; the message lists each
            error on its own line.

    Validation warnings do not raise; they are emitted as a single
    multi-line message through ``self.logger.warning``.
    """
    is_valid, errors, warnings = validate_config(self.settings)

    if not is_valid:
        error_lines = "\n".join(f" - {error}" for error in errors)
        raise ValueError("配置验证失败:\n" + error_lines)

    if not warnings:
        return

    warning_lines = "\n".join(f" - {warning}" for warning in warnings)
    self.logger.warning("配置警告:\n" + warning_lines)
34
48
 
35
49
  def get(self, key: str, default: Any = None) -> Any:
36
50
  """获取配置项"""
@@ -100,6 +114,22 @@ class CrawloConfig:
100
114
  print("=" * 50)
101
115
  return self
102
116
 
117
def validate(self) -> bool:
    """Validate the current settings and print the outcome.

    Unlike ``_validate_settings`` this never raises: errors and warnings
    reported by ``validate_config`` are printed to stdout.

    Returns:
        ``True`` when there are no validation errors (warnings may still
        have been printed), ``False`` otherwise.
    """
    is_valid, errors, warnings = validate_config(self.settings)

    if is_valid:
        if warnings:
            print("配置警告:")
            for warning in warnings:
                print(f" - {warning}")
        return True

    # Warnings are not printed on failure, only the errors.
    print("配置验证失败:")
    for error in errors:
        print(f" - {error}")
    return False
132
+
103
133
  # ==================== 静态工厂方法 ====================
104
134
 
105
135
  @staticmethod
@@ -0,0 +1,253 @@
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 配置验证器
5
+ ==========
6
+ 提供配置项的验证和默认值设置功能,确保配置的合理性和一致性。
7
+ """
8
+
9
+ from typing import Dict, Any, List, Tuple, Optional
10
+ import re
11
+ from crawlo.utils.log import get_logger
12
+
13
+
14
class ConfigValidator:
    """Validate a crawlo settings dictionary.

    Each ``_validate_*`` method inspects one group of settings and appends
    human-readable messages to ``self.errors`` (fatal) or ``self.warnings``
    (advisory). ``validate`` runs every group and returns the combined
    result.
    """

    def __init__(self):
        self.logger = get_logger(self.__class__.__name__)
        self.errors = []
        self.warnings = []

    def validate(self, config: Dict[str, Any]) -> Tuple[bool, List[str], List[str]]:
        """Run every validation group against ``config``.

        Args:
            config: Settings dictionary to check.

        Returns:
            ``(is_valid, errors, warnings)`` where ``is_valid`` is ``True``
            exactly when ``errors`` is empty.
        """
        # Start from a clean slate so the validator instance can be reused.
        self.errors = []
        self.warnings = []

        self._validate_basic_settings(config)
        self._validate_network_settings(config)
        self._validate_concurrency_settings(config)
        self._validate_queue_settings(config)
        self._validate_storage_settings(config)
        self._validate_redis_settings(config)
        self._validate_middleware_settings(config)
        self._validate_pipeline_settings(config)
        self._validate_logging_settings(config)

        is_valid = not self.errors
        return is_valid, self.errors, self.warnings

    def _validate_basic_settings(self, config: Dict[str, Any]):
        """Check ``PROJECT_NAME`` (non-empty string) and ``VERSION`` (string)."""
        project_name = config.get('PROJECT_NAME', 'crawlo')
        if not isinstance(project_name, str) or not project_name.strip():
            self.errors.append("PROJECT_NAME 必须是非空字符串")

        version = config.get('VERSION', '1.0')
        if not isinstance(version, str):
            self.errors.append("VERSION 必须是字符串")

    def _validate_network_settings(self, config: Dict[str, Any]):
        """Check downloader path, timeout, delay, retry count and pool limit."""
        downloader = config.get('DOWNLOADER', 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader')
        if not isinstance(downloader, str):
            self.errors.append("DOWNLOADER 必须是字符串")

        # Timeout must be strictly positive.
        timeout = config.get('DOWNLOAD_TIMEOUT', 30)
        if not isinstance(timeout, (int, float)) or timeout <= 0:
            self.errors.append("DOWNLOAD_TIMEOUT 必须是正数")

        # Delay may be zero but not negative.
        delay = config.get('DOWNLOAD_DELAY', 1.0)
        if not isinstance(delay, (int, float)) or delay < 0:
            self.errors.append("DOWNLOAD_DELAY 必须是非负数")

        max_retries = config.get('MAX_RETRY_TIMES', 3)
        if not isinstance(max_retries, int) or max_retries < 0:
            self.errors.append("MAX_RETRY_TIMES 必须是非负整数")

        pool_limit = config.get('CONNECTION_POOL_LIMIT', 50)
        if not isinstance(pool_limit, int) or pool_limit <= 0:
            self.errors.append("CONNECTION_POOL_LIMIT 必须是正整数")

    def _validate_concurrency_settings(self, config: Dict[str, Any]):
        """Check that ``CONCURRENCY`` and ``MAX_RUNNING_SPIDERS`` are positive ints."""
        concurrency = config.get('CONCURRENCY', 8)
        if not isinstance(concurrency, int) or concurrency <= 0:
            self.errors.append("CONCURRENCY 必须是正整数")

        max_running_spiders = config.get('MAX_RUNNING_SPIDERS', 3)
        if not isinstance(max_running_spiders, int) or max_running_spiders <= 0:
            self.errors.append("MAX_RUNNING_SPIDERS 必须是正整数")

    def _validate_queue_settings(self, config: Dict[str, Any]):
        """Check queue type, queue size and (for Redis) the queue name."""
        queue_type = config.get('QUEUE_TYPE', 'memory')
        valid_queue_types = ['memory', 'redis', 'auto']
        if queue_type not in valid_queue_types:
            self.errors.append(f"QUEUE_TYPE 必须是以下值之一: {valid_queue_types}")

        max_queue_size = config.get('SCHEDULER_MAX_QUEUE_SIZE', 2000)
        if not isinstance(max_queue_size, int) or max_queue_size <= 0:
            self.errors.append("SCHEDULER_MAX_QUEUE_SIZE 必须是正整数")

        # The queue name only matters when the Redis queue is selected.
        if queue_type == 'redis':
            queue_name = config.get('SCHEDULER_QUEUE_NAME', '')
            if not queue_name:
                self.errors.append("使用Redis队列时,SCHEDULER_QUEUE_NAME 不能为空")
            elif not isinstance(queue_name, str):
                # A truthy non-string (e.g. 123) used to crash with
                # AttributeError in _is_valid_redis_key; report it instead.
                self.errors.append("SCHEDULER_QUEUE_NAME 必须是字符串")
            elif not self._is_valid_redis_key(queue_name):
                project_name = config.get('PROJECT_NAME', 'project')
                self.warnings.append(
                    f"Redis队列名称 '{queue_name}' 不符合命名规范,"
                    f"建议使用 'crawlo:{project_name}:queue:requests' 格式"
                )

    def _validate_storage_settings(self, config: Dict[str, Any]):
        """Check optional MySQL and MongoDB settings when they are present."""
        mysql_host = config.get('MYSQL_HOST')
        if mysql_host is not None and not isinstance(mysql_host, str):
            self.errors.append("MYSQL_HOST 必须是字符串")

        mysql_port = config.get('MYSQL_PORT')
        if mysql_port is not None and (
            not isinstance(mysql_port, int) or mysql_port <= 0 or mysql_port > 65535
        ):
            self.errors.append("MYSQL_PORT 必须是1-65535之间的整数")

        mongo_uri = config.get('MONGO_URI')
        if mongo_uri is not None and not isinstance(mongo_uri, str):
            self.errors.append("MONGO_URI 必须是字符串")

    def _validate_redis_settings(self, config: Dict[str, Any]):
        """Check Redis host, port and URL; only applied when ``QUEUE_TYPE`` is ``'redis'``."""
        if config.get('QUEUE_TYPE', 'memory') != 'redis':
            return

        redis_host = config.get('REDIS_HOST', '127.0.0.1')
        if not isinstance(redis_host, str) or not redis_host.strip():
            self.errors.append("REDIS_HOST 必须是非空字符串")

        redis_port = config.get('REDIS_PORT', 6379)
        if not isinstance(redis_port, int) or redis_port <= 0 or redis_port > 65535:
            self.errors.append("REDIS_PORT 必须是1-65535之间的整数")

        redis_url = config.get('REDIS_URL')
        if redis_url is not None and not isinstance(redis_url, str):
            self.errors.append("REDIS_URL 必须是字符串")

    def _validate_middleware_settings(self, config: Dict[str, Any]):
        """Check that ``MIDDLEWARES`` is a list whose entries are all strings."""
        middlewares = config.get('MIDDLEWARES', [])
        if not isinstance(middlewares, list):
            self.errors.append("MIDDLEWARES 必须是列表")
            return

        for i, middleware in enumerate(middlewares):
            if not isinstance(middleware, str):
                self.errors.append(f"MIDDLEWARES[{i}] 必须是字符串")

    def _validate_pipeline_settings(self, config: Dict[str, Any]):
        """Check that ``PIPELINES`` is a list whose entries are all strings."""
        pipelines = config.get('PIPELINES', [])
        if not isinstance(pipelines, list):
            self.errors.append("PIPELINES 必须是列表")
            return

        for i, pipeline in enumerate(pipelines):
            if not isinstance(pipeline, str):
                self.errors.append(f"PIPELINES[{i}] 必须是字符串")

    def _validate_logging_settings(self, config: Dict[str, Any]):
        """Check ``LOG_LEVEL`` against the allowed names and ``LOG_FILE`` type."""
        log_level = config.get('LOG_LEVEL', 'INFO')
        valid_log_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
        if log_level not in valid_log_levels:
            self.errors.append(f"LOG_LEVEL 必须是以下值之一: {valid_log_levels}")

        log_file = config.get('LOG_FILE')
        if log_file is not None and not isinstance(log_file, str):
            self.errors.append("LOG_FILE 必须是字符串")

    def _is_valid_redis_key(self, key: str) -> bool:
        """Return whether ``key`` follows the ``crawlo:<...>:queue...`` convention.

        A key qualifies when it is a string starting with ``crawlo:``, has
        at least three colon-separated parts, and one of those parts is
        exactly ``queue``. Non-string input yields ``False`` rather than
        raising.
        """
        if not isinstance(key, str) or not key.startswith('crawlo:'):
            return False

        parts = key.split(':')
        if len(parts) < 3:
            return False

        return 'queue' in parts

    def get_validation_report(self, config: Dict[str, Any]) -> str:
        """Validate ``config`` and format the outcome as a text report.

        Args:
            config: Settings dictionary to check.

        Returns:
            A multi-line string framed by separator rules, containing the
            pass/fail verdict, the errors (on failure) and any warnings.
        """
        is_valid, errors, warnings = self.validate(config)
        separator = "=" * 50

        report = [separator, "配置验证报告", separator]

        if is_valid:
            report.append("✅ 配置验证通过")
        else:
            report.append("❌ 配置验证失败")
            report.append("错误:")
            report.extend(f" - {error}" for error in errors)

        # Warnings are listed regardless of the verdict.
        if warnings:
            report.append("警告:")
            report.extend(f" - {warning}" for warning in warnings)

        report.append(separator)
        return "\n".join(report)
228
+
229
+
230
+ # 便利函数
231
def validate_config(config: Dict[str, Any]) -> Tuple[bool, List[str], List[str]]:
    """Validate ``config`` with a fresh ``ConfigValidator``.

    Args:
        config: Settings dictionary to check.

    Returns:
        ``(is_valid, errors, warnings)`` exactly as produced by
        ``ConfigValidator.validate``.
    """
    return ConfigValidator().validate(config)
243
+
244
+
245
def print_validation_report(config: Dict[str, Any]):
    """Print the validation report for ``config`` to stdout.

    Args:
        config: Settings dictionary to check.
    """
    report = ConfigValidator().get_validation_report(config)
    print(report)