crawlo 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +28 -1
- crawlo/__version__.py +1 -1
- crawlo/cleaners/__init__.py +61 -0
- crawlo/cleaners/data_formatter.py +226 -0
- crawlo/cleaners/encoding_converter.py +126 -0
- crawlo/cleaners/text_cleaner.py +233 -0
- crawlo/commands/startproject.py +117 -13
- crawlo/config.py +30 -0
- crawlo/config_validator.py +253 -0
- crawlo/core/engine.py +185 -11
- crawlo/core/scheduler.py +49 -78
- crawlo/crawler.py +6 -6
- crawlo/downloader/__init__.py +24 -0
- crawlo/downloader/aiohttp_downloader.py +8 -0
- crawlo/downloader/cffi_downloader.py +5 -0
- crawlo/downloader/hybrid_downloader.py +214 -0
- crawlo/downloader/playwright_downloader.py +403 -0
- crawlo/downloader/selenium_downloader.py +473 -0
- crawlo/extension/__init__.py +17 -10
- crawlo/extension/health_check.py +142 -0
- crawlo/extension/log_interval.py +27 -18
- crawlo/extension/log_stats.py +62 -24
- crawlo/extension/logging_extension.py +18 -9
- crawlo/extension/memory_monitor.py +105 -0
- crawlo/extension/performance_profiler.py +134 -0
- crawlo/extension/request_recorder.py +108 -0
- crawlo/filters/aioredis_filter.py +50 -12
- crawlo/middleware/proxy.py +26 -2
- crawlo/mode_manager.py +24 -19
- crawlo/network/request.py +30 -3
- crawlo/network/response.py +114 -25
- crawlo/pipelines/mongo_pipeline.py +81 -66
- crawlo/pipelines/mysql_pipeline.py +165 -43
- crawlo/pipelines/redis_dedup_pipeline.py +7 -3
- crawlo/queue/queue_manager.py +15 -2
- crawlo/queue/redis_priority_queue.py +144 -76
- crawlo/settings/default_settings.py +93 -121
- crawlo/subscriber.py +62 -37
- crawlo/templates/project/items.py.tmpl +1 -1
- crawlo/templates/project/middlewares.py.tmpl +73 -49
- crawlo/templates/project/pipelines.py.tmpl +51 -295
- crawlo/templates/project/settings.py.tmpl +93 -17
- crawlo/templates/project/settings_distributed.py.tmpl +120 -0
- crawlo/templates/project/settings_gentle.py.tmpl +95 -0
- crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
- crawlo/templates/project/settings_simple.py.tmpl +69 -0
- crawlo/templates/spider/spider.py.tmpl +2 -38
- crawlo/tools/__init__.py +183 -0
- crawlo/tools/anti_crawler.py +269 -0
- crawlo/tools/authenticated_proxy.py +241 -0
- crawlo/tools/data_validator.py +181 -0
- crawlo/tools/date_tools.py +36 -0
- crawlo/tools/distributed_coordinator.py +387 -0
- crawlo/tools/retry_mechanism.py +221 -0
- crawlo/tools/scenario_adapter.py +263 -0
- crawlo/utils/__init__.py +29 -1
- crawlo/utils/batch_processor.py +261 -0
- crawlo/utils/date_tools.py +58 -1
- crawlo/utils/enhanced_error_handler.py +360 -0
- crawlo/utils/env_config.py +106 -0
- crawlo/utils/error_handler.py +126 -0
- crawlo/utils/performance_monitor.py +285 -0
- crawlo/utils/redis_connection_pool.py +335 -0
- crawlo/utils/redis_key_validator.py +200 -0
- crawlo-1.1.5.dist-info/METADATA +401 -0
- crawlo-1.1.5.dist-info/RECORD +185 -0
- tests/advanced_tools_example.py +276 -0
- tests/authenticated_proxy_example.py +237 -0
- tests/cleaners_example.py +161 -0
- tests/config_validation_demo.py +103 -0
- tests/date_tools_example.py +181 -0
- tests/dynamic_loading_example.py +524 -0
- tests/dynamic_loading_test.py +105 -0
- tests/env_config_example.py +134 -0
- tests/error_handling_example.py +172 -0
- tests/redis_key_validation_demo.py +131 -0
- tests/response_improvements_example.py +145 -0
- tests/test_advanced_tools.py +149 -0
- tests/test_all_redis_key_configs.py +146 -0
- tests/test_authenticated_proxy.py +142 -0
- tests/test_cleaners.py +55 -0
- tests/test_comprehensive.py +147 -0
- tests/test_config_validator.py +194 -0
- tests/test_date_tools.py +124 -0
- tests/test_dynamic_downloaders_proxy.py +125 -0
- tests/test_dynamic_proxy.py +93 -0
- tests/test_dynamic_proxy_config.py +147 -0
- tests/test_dynamic_proxy_real.py +110 -0
- tests/test_edge_cases.py +304 -0
- tests/test_enhanced_error_handler.py +271 -0
- tests/test_env_config.py +122 -0
- tests/test_error_handler_compatibility.py +113 -0
- tests/test_framework_env_usage.py +104 -0
- tests/test_integration.py +357 -0
- tests/test_item_dedup_redis_key.py +123 -0
- tests/test_parsel.py +30 -0
- tests/test_performance.py +328 -0
- tests/test_queue_manager_redis_key.py +177 -0
- tests/test_redis_connection_pool.py +295 -0
- tests/test_redis_key_naming.py +182 -0
- tests/test_redis_key_validator.py +124 -0
- tests/test_response_improvements.py +153 -0
- tests/test_simple_response.py +62 -0
- tests/test_telecom_spider_redis_key.py +206 -0
- tests/test_template_content.py +88 -0
- tests/test_template_redis_key.py +135 -0
- tests/test_tools.py +154 -0
- tests/tools_example.py +258 -0
- crawlo/core/enhanced_engine.py +0 -190
- crawlo-1.1.3.dist-info/METADATA +0 -635
- crawlo-1.1.3.dist-info/RECORD +0 -113
- {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/WHEEL +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/top_level.txt +0 -0
- {examples → tests}/controlled_spider_example.py +0 -0
crawlo/commands/startproject.py
CHANGED
|
@@ -7,18 +7,73 @@
|
|
|
7
7
|
"""
|
|
8
8
|
import shutil
|
|
9
9
|
import re
|
|
10
|
+
import sys
|
|
11
|
+
import os
|
|
10
12
|
from pathlib import Path
|
|
11
|
-
from rich.console import Console
|
|
12
|
-
from rich.panel import Panel
|
|
13
|
-
from rich.text import Text
|
|
13
|
+
from typing import Optional
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
# 添加项目根目录到路径,以便能够导入utils模块
|
|
16
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
|
|
16
17
|
|
|
17
|
-
|
|
18
|
-
console = Console()
|
|
18
|
+
try:
|
|
19
|
+
from rich.console import Console
|
|
20
|
+
from rich.panel import Panel
|
|
21
|
+
from rich.text import Text
|
|
22
|
+
from rich.table import Table
|
|
23
|
+
RICH_AVAILABLE = True
|
|
24
|
+
except ImportError:
|
|
25
|
+
RICH_AVAILABLE = False
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
from .utils import show_error_panel, show_success_panel
|
|
29
|
+
UTILS_AVAILABLE = True
|
|
30
|
+
except ImportError:
|
|
31
|
+
# 如果相对导入失败,尝试绝对导入
|
|
32
|
+
try:
|
|
33
|
+
from crawlo.commands.utils import show_error_panel, show_success_panel
|
|
34
|
+
UTILS_AVAILABLE = True
|
|
35
|
+
except ImportError:
|
|
36
|
+
UTILS_AVAILABLE = False
|
|
37
|
+
|
|
38
|
+
# 初始化 rich 控制台(如果可用)
|
|
39
|
+
if RICH_AVAILABLE:
|
|
40
|
+
console = Console()
|
|
41
|
+
else:
|
|
42
|
+
# 简单的控制台输出替代
|
|
43
|
+
class Console:
|
|
44
|
+
def print(self, text):
|
|
45
|
+
print(text)
|
|
46
|
+
console = Console()
|
|
19
47
|
|
|
20
48
|
TEMPLATES_DIR = Path(__file__).parent.parent / 'templates'
|
|
21
49
|
|
|
50
|
+
# 可用的模板类型
|
|
51
|
+
TEMPLATE_TYPES = {
|
|
52
|
+
'default': '默认模板 - 通用配置,适合大多数项目',
|
|
53
|
+
'simple': '简化模板 - 最小配置,适合快速开始',
|
|
54
|
+
'distributed': '分布式模板 - 针对分布式爬取优化',
|
|
55
|
+
'high-performance': '高性能模板 - 针对大规模高并发优化',
|
|
56
|
+
'gentle': '温和模板 - 低负载配置,对目标网站友好'
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def show_error_panel(title, content):
|
|
61
|
+
"""显示错误面板的简单实现"""
|
|
62
|
+
if RICH_AVAILABLE:
|
|
63
|
+
from rich.panel import Panel
|
|
64
|
+
console.print(Panel(content, title=title, border_style="red"))
|
|
65
|
+
else:
|
|
66
|
+
print(f"❌ {title}")
|
|
67
|
+
print(content)
|
|
68
|
+
|
|
69
|
+
def show_success_panel(title, content):
|
|
70
|
+
"""显示成功面板的简单实现"""
|
|
71
|
+
if RICH_AVAILABLE:
|
|
72
|
+
from rich.panel import Panel
|
|
73
|
+
console.print(Panel(content, title=title, border_style="green"))
|
|
74
|
+
else:
|
|
75
|
+
print(f"✅ {title}")
|
|
76
|
+
print(content)
|
|
22
77
|
|
|
23
78
|
def _render_template(tmpl_path, context):
|
|
24
79
|
"""读取模板文件,替换 {{key}} 为 context 中的值"""
|
|
@@ -29,7 +84,7 @@ def _render_template(tmpl_path, context):
|
|
|
29
84
|
return content
|
|
30
85
|
|
|
31
86
|
|
|
32
|
-
def _copytree_with_templates(src, dst, context):
|
|
87
|
+
def _copytree_with_templates(src, dst, context, template_type='default'):
|
|
33
88
|
"""
|
|
34
89
|
递归复制目录,将 .tmpl 文件渲染后复制(去除 .tmpl 后缀),其他文件直接复制。
|
|
35
90
|
"""
|
|
@@ -45,7 +100,19 @@ def _copytree_with_templates(src, dst, context):
|
|
|
45
100
|
dst_item.mkdir(parents=True, exist_ok=True)
|
|
46
101
|
else:
|
|
47
102
|
if item.suffix == '.tmpl':
|
|
48
|
-
|
|
103
|
+
# 处理特定模板类型的设置文件
|
|
104
|
+
if item.name == 'settings.py.tmpl' and template_type != 'default':
|
|
105
|
+
# 使用特定模板类型的设置文件
|
|
106
|
+
template_file_name = f'settings_{template_type}.py.tmpl'
|
|
107
|
+
template_file_path = src_path / template_file_name
|
|
108
|
+
if template_file_path.exists():
|
|
109
|
+
rendered_content = _render_template(template_file_path, context)
|
|
110
|
+
else:
|
|
111
|
+
# 如果特定模板不存在,使用默认模板
|
|
112
|
+
rendered_content = _render_template(item, context)
|
|
113
|
+
else:
|
|
114
|
+
rendered_content = _render_template(item, context)
|
|
115
|
+
|
|
49
116
|
final_dst = dst_item.with_suffix('')
|
|
50
117
|
final_dst.parent.mkdir(parents=True, exist_ok=True)
|
|
51
118
|
with open(final_dst, 'w', encoding='utf-8') as f:
|
|
@@ -100,16 +167,44 @@ def validate_project_name(project_name: str) -> tuple[bool, str]:
|
|
|
100
167
|
return True, ""
|
|
101
168
|
|
|
102
169
|
|
|
170
|
+
def show_template_options():
|
|
171
|
+
"""显示可用的模板选项"""
|
|
172
|
+
if RICH_AVAILABLE:
|
|
173
|
+
table = Table(title="可用模板类型", show_header=True, header_style="bold magenta")
|
|
174
|
+
table.add_column("模板类型", style="cyan", no_wrap=True)
|
|
175
|
+
table.add_column("描述", style="green")
|
|
176
|
+
|
|
177
|
+
for template_type, description in TEMPLATE_TYPES.items():
|
|
178
|
+
table.add_row(template_type, description)
|
|
179
|
+
|
|
180
|
+
console.print(table)
|
|
181
|
+
else:
|
|
182
|
+
print("可用模板类型:")
|
|
183
|
+
for template_type, description in TEMPLATE_TYPES.items():
|
|
184
|
+
print(f" {template_type}: {description}")
|
|
185
|
+
|
|
186
|
+
|
|
103
187
|
def main(args):
|
|
104
|
-
if len(args) != 1:
|
|
105
|
-
console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo startproject[/blue] <project_name>")
|
|
188
|
+
if len(args) < 1 or len(args) > 2:
|
|
189
|
+
console.print("[bold red]Error:[/bold red] Usage: [blue]crawlo startproject[/blue] <project_name> [template_type]")
|
|
106
190
|
console.print("💡 Examples:")
|
|
107
191
|
console.print(" [blue]crawlo startproject[/blue] my_spider_project")
|
|
108
|
-
console.print(" [blue]crawlo startproject[/blue] news_crawler")
|
|
109
|
-
console.print(" [blue]crawlo startproject[/blue] ecommerce_spider")
|
|
192
|
+
console.print(" [blue]crawlo startproject[/blue] news_crawler simple")
|
|
193
|
+
console.print(" [blue]crawlo startproject[/blue] ecommerce_spider distributed")
|
|
194
|
+
show_template_options()
|
|
110
195
|
return 1
|
|
111
196
|
|
|
112
197
|
project_name = args[0]
|
|
198
|
+
template_type = args[1] if len(args) > 1 else 'default'
|
|
199
|
+
|
|
200
|
+
# 验证模板类型
|
|
201
|
+
if template_type not in TEMPLATE_TYPES:
|
|
202
|
+
show_error_panel(
|
|
203
|
+
"Invalid Template Type",
|
|
204
|
+
f"Template type '[cyan]{template_type}[/cyan]' is not supported.\n"
|
|
205
|
+
)
|
|
206
|
+
show_template_options()
|
|
207
|
+
return 1
|
|
113
208
|
|
|
114
209
|
# 验证项目名称
|
|
115
210
|
is_valid, error_msg = validate_project_name(project_name)
|
|
@@ -154,7 +249,7 @@ def main(args):
|
|
|
154
249
|
|
|
155
250
|
# 3. 复制并渲染项目包内容
|
|
156
251
|
package_dir = project_dir / project_name
|
|
157
|
-
_copytree_with_templates(template_dir, package_dir, context)
|
|
252
|
+
_copytree_with_templates(template_dir, package_dir, context, template_type)
|
|
158
253
|
console.print(f":white_check_mark: Created project package: [green]{package_dir}[/green]")
|
|
159
254
|
|
|
160
255
|
# 4. 创建 logs 目录
|
|
@@ -168,6 +263,10 @@ def main(args):
|
|
|
168
263
|
# 成功面板
|
|
169
264
|
success_text = Text.from_markup(f"Project '[bold cyan]{project_name}[/bold cyan]' created successfully!")
|
|
170
265
|
console.print(Panel(success_text, title=":rocket: Success", border_style="green", padding=(1, 2)))
|
|
266
|
+
|
|
267
|
+
# 显示使用的模板类型
|
|
268
|
+
if template_type != 'default':
|
|
269
|
+
console.print(f":information: 使用模板类型: [bold blue]{template_type}[/bold blue] - {TEMPLATE_TYPES[template_type]}")
|
|
171
270
|
|
|
172
271
|
# 下一步操作提示(对齐美观 + 语法高亮)
|
|
173
272
|
next_steps = f"""
|
|
@@ -194,3 +293,8 @@ def main(args):
|
|
|
194
293
|
shutil.rmtree(project_dir, ignore_errors=True)
|
|
195
294
|
console.print("[red]:cross_mark: Cleaned up partially created project.[/red]")
|
|
196
295
|
return 1
|
|
296
|
+
|
|
297
|
+
if __name__ == "__main__":
|
|
298
|
+
import sys
|
|
299
|
+
exit_code = main(sys.argv[1:])
|
|
300
|
+
sys.exit(exit_code)
|
crawlo/config.py
CHANGED
|
@@ -23,6 +23,7 @@ from typing import Dict, Any, Optional, Union
|
|
|
23
23
|
import os
|
|
24
24
|
from crawlo.mode_manager import ModeManager, standalone_mode, distributed_mode, auto_mode, from_env
|
|
25
25
|
from crawlo.utils.log import get_logger
|
|
26
|
+
from crawlo.config_validator import validate_config
|
|
26
27
|
|
|
27
28
|
|
|
28
29
|
class CrawloConfig:
|
|
@@ -31,6 +32,19 @@ class CrawloConfig:
|
|
|
31
32
|
def __init__(self, settings: Dict[str, Any]):
|
|
32
33
|
self.settings = settings
|
|
33
34
|
self.logger = get_logger(self.__class__.__name__)
|
|
35
|
+
# 验证配置
|
|
36
|
+
self._validate_settings()
|
|
37
|
+
|
|
38
|
+
def _validate_settings(self):
|
|
39
|
+
"""验证配置"""
|
|
40
|
+
is_valid, errors, warnings = validate_config(self.settings)
|
|
41
|
+
if not is_valid:
|
|
42
|
+
error_msg = "配置验证失败:\n" + "\n".join([f" - {error}" for error in errors])
|
|
43
|
+
raise ValueError(error_msg)
|
|
44
|
+
|
|
45
|
+
if warnings:
|
|
46
|
+
warning_msg = "配置警告:\n" + "\n".join([f" - {warning}" for warning in warnings])
|
|
47
|
+
self.logger.warning(warning_msg)
|
|
34
48
|
|
|
35
49
|
def get(self, key: str, default: Any = None) -> Any:
|
|
36
50
|
"""获取配置项"""
|
|
@@ -100,6 +114,22 @@ class CrawloConfig:
|
|
|
100
114
|
print("=" * 50)
|
|
101
115
|
return self
|
|
102
116
|
|
|
117
|
+
def validate(self) -> bool:
|
|
118
|
+
"""验证当前配置"""
|
|
119
|
+
is_valid, errors, warnings = validate_config(self.settings)
|
|
120
|
+
if not is_valid:
|
|
121
|
+
print("配置验证失败:")
|
|
122
|
+
for error in errors:
|
|
123
|
+
print(f" - {error}")
|
|
124
|
+
return False
|
|
125
|
+
|
|
126
|
+
if warnings:
|
|
127
|
+
print("配置警告:")
|
|
128
|
+
for warning in warnings:
|
|
129
|
+
print(f" - {warning}")
|
|
130
|
+
|
|
131
|
+
return True
|
|
132
|
+
|
|
103
133
|
# ==================== 静态工厂方法 ====================
|
|
104
134
|
|
|
105
135
|
@staticmethod
|
crawlo/config_validator.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
配置验证器
|
|
5
|
+
==========
|
|
6
|
+
提供配置项的验证和默认值设置功能,确保配置的合理性和一致性。
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Dict, Any, List, Tuple, Optional
|
|
10
|
+
import re
|
|
11
|
+
from crawlo.utils.log import get_logger
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ConfigValidator:
|
|
15
|
+
"""配置验证器"""
|
|
16
|
+
|
|
17
|
+
def __init__(self):
|
|
18
|
+
self.logger = get_logger(self.__class__.__name__)
|
|
19
|
+
self.errors = []
|
|
20
|
+
self.warnings = []
|
|
21
|
+
|
|
22
|
+
def validate(self, config: Dict[str, Any]) -> Tuple[bool, List[str], List[str]]:
|
|
23
|
+
"""
|
|
24
|
+
验证配置
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
config: 配置字典
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
Tuple[bool, List[str], List[str]]: (是否有效, 错误列表, 警告列表)
|
|
31
|
+
"""
|
|
32
|
+
self.errors = []
|
|
33
|
+
self.warnings = []
|
|
34
|
+
|
|
35
|
+
# 验证各个配置项
|
|
36
|
+
self._validate_basic_settings(config)
|
|
37
|
+
self._validate_network_settings(config)
|
|
38
|
+
self._validate_concurrency_settings(config)
|
|
39
|
+
self._validate_queue_settings(config)
|
|
40
|
+
self._validate_storage_settings(config)
|
|
41
|
+
self._validate_redis_settings(config)
|
|
42
|
+
self._validate_middleware_settings(config)
|
|
43
|
+
self._validate_pipeline_settings(config)
|
|
44
|
+
self._validate_logging_settings(config)
|
|
45
|
+
|
|
46
|
+
is_valid = len(self.errors) == 0
|
|
47
|
+
return is_valid, self.errors, self.warnings
|
|
48
|
+
|
|
49
|
+
def _validate_basic_settings(self, config: Dict[str, Any]):
|
|
50
|
+
"""验证基本设置"""
|
|
51
|
+
project_name = config.get('PROJECT_NAME', 'crawlo')
|
|
52
|
+
if not isinstance(project_name, str) or not project_name.strip():
|
|
53
|
+
self.errors.append("PROJECT_NAME 必须是非空字符串")
|
|
54
|
+
|
|
55
|
+
version = config.get('VERSION', '1.0')
|
|
56
|
+
if not isinstance(version, str):
|
|
57
|
+
self.errors.append("VERSION 必须是字符串")
|
|
58
|
+
|
|
59
|
+
def _validate_network_settings(self, config: Dict[str, Any]):
|
|
60
|
+
"""验证网络设置"""
|
|
61
|
+
# 下载器验证
|
|
62
|
+
downloader = config.get('DOWNLOADER', 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader')
|
|
63
|
+
if not isinstance(downloader, str):
|
|
64
|
+
self.errors.append("DOWNLOADER 必须是字符串")
|
|
65
|
+
|
|
66
|
+
# 超时设置验证
|
|
67
|
+
timeout = config.get('DOWNLOAD_TIMEOUT', 30)
|
|
68
|
+
if not isinstance(timeout, (int, float)) or timeout <= 0:
|
|
69
|
+
self.errors.append("DOWNLOAD_TIMEOUT 必须是正数")
|
|
70
|
+
|
|
71
|
+
# 延迟设置验证
|
|
72
|
+
delay = config.get('DOWNLOAD_DELAY', 1.0)
|
|
73
|
+
if not isinstance(delay, (int, float)) or delay < 0:
|
|
74
|
+
self.errors.append("DOWNLOAD_DELAY 必须是非负数")
|
|
75
|
+
|
|
76
|
+
# 重试次数验证
|
|
77
|
+
max_retries = config.get('MAX_RETRY_TIMES', 3)
|
|
78
|
+
if not isinstance(max_retries, int) or max_retries < 0:
|
|
79
|
+
self.errors.append("MAX_RETRY_TIMES 必须是非负整数")
|
|
80
|
+
|
|
81
|
+
# 连接池限制验证
|
|
82
|
+
pool_limit = config.get('CONNECTION_POOL_LIMIT', 50)
|
|
83
|
+
if not isinstance(pool_limit, int) or pool_limit <= 0:
|
|
84
|
+
self.errors.append("CONNECTION_POOL_LIMIT 必须是正整数")
|
|
85
|
+
|
|
86
|
+
def _validate_concurrency_settings(self, config: Dict[str, Any]):
|
|
87
|
+
"""验证并发设置"""
|
|
88
|
+
concurrency = config.get('CONCURRENCY', 8)
|
|
89
|
+
if not isinstance(concurrency, int) or concurrency <= 0:
|
|
90
|
+
self.errors.append("CONCURRENCY 必须是正整数")
|
|
91
|
+
|
|
92
|
+
max_running_spiders = config.get('MAX_RUNNING_SPIDERS', 3)
|
|
93
|
+
if not isinstance(max_running_spiders, int) or max_running_spiders <= 0:
|
|
94
|
+
self.errors.append("MAX_RUNNING_SPIDERS 必须是正整数")
|
|
95
|
+
|
|
96
|
+
def _validate_queue_settings(self, config: Dict[str, Any]):
|
|
97
|
+
"""验证队列设置"""
|
|
98
|
+
queue_type = config.get('QUEUE_TYPE', 'memory')
|
|
99
|
+
valid_queue_types = ['memory', 'redis', 'auto']
|
|
100
|
+
if queue_type not in valid_queue_types:
|
|
101
|
+
self.errors.append(f"QUEUE_TYPE 必须是以下值之一: {valid_queue_types}")
|
|
102
|
+
|
|
103
|
+
# 队列大小验证
|
|
104
|
+
max_queue_size = config.get('SCHEDULER_MAX_QUEUE_SIZE', 2000)
|
|
105
|
+
if not isinstance(max_queue_size, int) or max_queue_size <= 0:
|
|
106
|
+
self.errors.append("SCHEDULER_MAX_QUEUE_SIZE 必须是正整数")
|
|
107
|
+
|
|
108
|
+
# 队列名称验证(如果是Redis队列)
|
|
109
|
+
if queue_type == 'redis':
|
|
110
|
+
queue_name = config.get('SCHEDULER_QUEUE_NAME', '')
|
|
111
|
+
if not queue_name:
|
|
112
|
+
self.errors.append("使用Redis队列时,SCHEDULER_QUEUE_NAME 不能为空")
|
|
113
|
+
elif not self._is_valid_redis_key(queue_name):
|
|
114
|
+
self.warnings.append(f"Redis队列名称 '{queue_name}' 不符合命名规范,建议使用 'crawlo:{config.get('PROJECT_NAME', 'project')}:queue:requests' 格式")
|
|
115
|
+
|
|
116
|
+
def _validate_storage_settings(self, config: Dict[str, Any]):
|
|
117
|
+
"""验证存储设置"""
|
|
118
|
+
# MySQL设置验证
|
|
119
|
+
mysql_host = config.get('MYSQL_HOST')
|
|
120
|
+
if mysql_host is not None and not isinstance(mysql_host, str):
|
|
121
|
+
self.errors.append("MYSQL_HOST 必须是字符串")
|
|
122
|
+
|
|
123
|
+
mysql_port = config.get('MYSQL_PORT')
|
|
124
|
+
if mysql_port is not None and (not isinstance(mysql_port, int) or mysql_port <= 0 or mysql_port > 65535):
|
|
125
|
+
self.errors.append("MYSQL_PORT 必须是1-65535之间的整数")
|
|
126
|
+
|
|
127
|
+
# MongoDB设置验证
|
|
128
|
+
mongo_uri = config.get('MONGO_URI')
|
|
129
|
+
if mongo_uri is not None and not isinstance(mongo_uri, str):
|
|
130
|
+
self.errors.append("MONGO_URI 必须是字符串")
|
|
131
|
+
|
|
132
|
+
def _validate_redis_settings(self, config: Dict[str, Any]):
|
|
133
|
+
"""验证Redis设置"""
|
|
134
|
+
queue_type = config.get('QUEUE_TYPE', 'memory')
|
|
135
|
+
if queue_type == 'redis':
|
|
136
|
+
# Redis主机验证
|
|
137
|
+
redis_host = config.get('REDIS_HOST', '127.0.0.1')
|
|
138
|
+
if not isinstance(redis_host, str) or not redis_host.strip():
|
|
139
|
+
self.errors.append("REDIS_HOST 必须是非空字符串")
|
|
140
|
+
|
|
141
|
+
# Redis端口验证
|
|
142
|
+
redis_port = config.get('REDIS_PORT', 6379)
|
|
143
|
+
if not isinstance(redis_port, int) or redis_port <= 0 or redis_port > 65535:
|
|
144
|
+
self.errors.append("REDIS_PORT 必须是1-65535之间的整数")
|
|
145
|
+
|
|
146
|
+
# Redis URL验证
|
|
147
|
+
redis_url = config.get('REDIS_URL')
|
|
148
|
+
if redis_url is not None and not isinstance(redis_url, str):
|
|
149
|
+
self.errors.append("REDIS_URL 必须是字符串")
|
|
150
|
+
|
|
151
|
+
def _validate_middleware_settings(self, config: Dict[str, Any]):
|
|
152
|
+
"""验证中间件设置"""
|
|
153
|
+
middlewares = config.get('MIDDLEWARES', [])
|
|
154
|
+
if not isinstance(middlewares, list):
|
|
155
|
+
self.errors.append("MIDDLEWARES 必须是列表")
|
|
156
|
+
else:
|
|
157
|
+
for i, middleware in enumerate(middlewares):
|
|
158
|
+
if not isinstance(middleware, str):
|
|
159
|
+
self.errors.append(f"MIDDLEWARES[{i}] 必须是字符串")
|
|
160
|
+
|
|
161
|
+
def _validate_pipeline_settings(self, config: Dict[str, Any]):
|
|
162
|
+
"""验证管道设置"""
|
|
163
|
+
pipelines = config.get('PIPELINES', [])
|
|
164
|
+
if not isinstance(pipelines, list):
|
|
165
|
+
self.errors.append("PIPELINES 必须是列表")
|
|
166
|
+
else:
|
|
167
|
+
for i, pipeline in enumerate(pipelines):
|
|
168
|
+
if not isinstance(pipeline, str):
|
|
169
|
+
self.errors.append(f"PIPELINES[{i}] 必须是字符串")
|
|
170
|
+
|
|
171
|
+
def _validate_logging_settings(self, config: Dict[str, Any]):
|
|
172
|
+
"""验证日志设置"""
|
|
173
|
+
log_level = config.get('LOG_LEVEL', 'INFO')
|
|
174
|
+
valid_log_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
|
|
175
|
+
if log_level not in valid_log_levels:
|
|
176
|
+
self.errors.append(f"LOG_LEVEL 必须是以下值之一: {valid_log_levels}")
|
|
177
|
+
|
|
178
|
+
log_file = config.get('LOG_FILE')
|
|
179
|
+
if log_file is not None and not isinstance(log_file, str):
|
|
180
|
+
self.errors.append("LOG_FILE 必须是字符串")
|
|
181
|
+
|
|
182
|
+
def _is_valid_redis_key(self, key: str) -> bool:
|
|
183
|
+
"""检查Redis key是否符合命名规范"""
|
|
184
|
+
# 检查是否以 crawlo: 开头
|
|
185
|
+
if not key.startswith('crawlo:'):
|
|
186
|
+
return False
|
|
187
|
+
|
|
188
|
+
# 检查是否包含必要的部分
|
|
189
|
+
parts = key.split(':')
|
|
190
|
+
if len(parts) < 3:
|
|
191
|
+
return False
|
|
192
|
+
|
|
193
|
+
# 检查是否包含 queue 部分
|
|
194
|
+
return 'queue' in parts
|
|
195
|
+
|
|
196
|
+
def get_validation_report(self, config: Dict[str, Any]) -> str:
|
|
197
|
+
"""
|
|
198
|
+
获取验证报告
|
|
199
|
+
|
|
200
|
+
Args:
|
|
201
|
+
config: 配置字典
|
|
202
|
+
|
|
203
|
+
Returns:
|
|
204
|
+
str: 验证报告
|
|
205
|
+
"""
|
|
206
|
+
is_valid, errors, warnings = self.validate(config)
|
|
207
|
+
|
|
208
|
+
report = []
|
|
209
|
+
report.append("=" * 50)
|
|
210
|
+
report.append("配置验证报告")
|
|
211
|
+
report.append("=" * 50)
|
|
212
|
+
|
|
213
|
+
if is_valid:
|
|
214
|
+
report.append("✅ 配置验证通过")
|
|
215
|
+
else:
|
|
216
|
+
report.append("❌ 配置验证失败")
|
|
217
|
+
report.append("错误:")
|
|
218
|
+
for error in errors:
|
|
219
|
+
report.append(f" - {error}")
|
|
220
|
+
|
|
221
|
+
if warnings:
|
|
222
|
+
report.append("警告:")
|
|
223
|
+
for warning in warnings:
|
|
224
|
+
report.append(f" - {warning}")
|
|
225
|
+
|
|
226
|
+
report.append("=" * 50)
|
|
227
|
+
return "\n".join(report)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# 便利函数
|
|
231
|
+
def validate_config(config: Dict[str, Any]) -> Tuple[bool, List[str], List[str]]:
|
|
232
|
+
"""
|
|
233
|
+
验证配置
|
|
234
|
+
|
|
235
|
+
Args:
|
|
236
|
+
config: 配置字典
|
|
237
|
+
|
|
238
|
+
Returns:
|
|
239
|
+
Tuple[bool, List[str], List[str]]: (是否有效, 错误列表, 警告列表)
|
|
240
|
+
"""
|
|
241
|
+
validator = ConfigValidator()
|
|
242
|
+
return validator.validate(config)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def print_validation_report(config: Dict[str, Any]):
|
|
246
|
+
"""
|
|
247
|
+
打印验证报告
|
|
248
|
+
|
|
249
|
+
Args:
|
|
250
|
+
config: 配置字典
|
|
251
|
+
"""
|
|
252
|
+
validator = ConfigValidator()
|
|
253
|
+
print(validator.get_validation_report(config))
|