crawlo-1.4.2-py3-none-any.whl → crawlo-1.4.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +93 -93
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +341 -341
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +52 -52
- crawlo/core/engine.py +438 -439
- crawlo/core/processor.py +47 -47
- crawlo/core/scheduler.py +291 -257
- crawlo/crawler.py +650 -650
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +233 -233
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +63 -63
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +94 -94
- crawlo/extension/log_stats.py +70 -70
- crawlo/extension/logging_extension.py +61 -61
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/factories/__init__.py +27 -27
- crawlo/factories/base.py +68 -68
- crawlo/factories/crawler.py +103 -103
- crawlo/factories/registry.py +84 -84
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +257 -257
- crawlo/filters/memory_filter.py +269 -269
- crawlo/framework.py +292 -292
- crawlo/initialization/__init__.py +44 -44
- crawlo/initialization/built_in.py +425 -425
- crawlo/initialization/context.py +141 -141
- crawlo/initialization/core.py +193 -193
- crawlo/initialization/phases.py +148 -148
- crawlo/initialization/registry.py +145 -145
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +23 -23
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/logging/__init__.py +45 -37
- crawlo/logging/async_handler.py +181 -0
- crawlo/logging/config.py +196 -96
- crawlo/logging/factory.py +171 -128
- crawlo/logging/manager.py +111 -111
- crawlo/logging/monitor.py +153 -0
- crawlo/logging/sampler.py +167 -0
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -123
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +150 -150
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +219 -219
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +197 -197
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +105 -105
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +325 -325
- crawlo/pipelines/pipeline_manager.py +100 -84
- crawlo/pipelines/redis_dedup_pipeline.py +156 -156
- crawlo/project.py +349 -338
- crawlo/queue/pqueue.py +42 -42
- crawlo/queue/queue_manager.py +526 -522
- crawlo/queue/redis_priority_queue.py +370 -367
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +284 -284
- crawlo/settings/setting_manager.py +219 -219
- crawlo/spider/__init__.py +657 -657
- crawlo/stats_collector.py +73 -73
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +138 -138
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +170 -170
- crawlo/templates/project/settings_distributed.py.tmpl +169 -169
- crawlo/templates/project/settings_gentle.py.tmpl +166 -166
- crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
- crawlo/templates/project/settings_minimal.py.tmpl +65 -65
- crawlo/templates/project/settings_simple.py.tmpl +164 -164
- crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
- crawlo/templates/run.py.tmpl +34 -34
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/templates/spiders_init.py.tmpl +9 -9
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +384 -384
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/network_diagnostic.py +364 -364
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/class_loader.py +25 -25
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +165 -165
- crawlo/utils/fingerprint.py +122 -122
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +79 -79
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +388 -388
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +225 -225
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- crawlo-1.4.3.dist-info/METADATA +190 -0
- crawlo-1.4.3.dist-info/RECORD +326 -0
- examples/__init__.py +7 -7
- examples/test_project/__init__.py +7 -7
- examples/test_project/run.py +34 -34
- examples/test_project/test_project/__init__.py +3 -3
- examples/test_project/test_project/items.py +17 -17
- examples/test_project/test_project/middlewares.py +118 -118
- examples/test_project/test_project/pipelines.py +96 -96
- examples/test_project/test_project/settings.py +169 -169
- examples/test_project/test_project/spiders/__init__.py +9 -9
- examples/test_project/test_project/spiders/of_week_dis.py +143 -143
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +106 -106
- tests/baidu_performance_test.py +108 -108
- tests/baidu_test.py +59 -59
- tests/cleaners_example.py +160 -160
- tests/comprehensive_framework_test.py +212 -212
- tests/comprehensive_test.py +81 -81
- tests/comprehensive_testing_summary.md +186 -186
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_configure.py +69 -69
- tests/debug_framework_logger.py +84 -84
- tests/debug_log_config.py +126 -126
- tests/debug_log_levels.py +63 -63
- tests/debug_pipelines.py +66 -66
- tests/detailed_log_test.py +233 -233
- tests/distributed_test.py +66 -66
- tests/distributed_test_debug.py +76 -76
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/final_comprehensive_test.py +151 -151
- tests/final_log_test.py +260 -260
- tests/final_validation_test.py +182 -182
- tests/fix_log_test.py +142 -142
- tests/framework_performance_test.py +202 -202
- tests/log_buffering_test.py +111 -111
- tests/log_generation_timing_test.py +153 -153
- tests/optimized_performance_test.py +211 -211
- tests/performance_comparison.py +245 -245
- tests/queue_blocking_test.py +113 -113
- tests/queue_test.py +89 -89
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/scrapy_comparison/ofweek_scrapy.py +138 -138
- tests/scrapy_comparison/scrapy_test.py +133 -133
- tests/simple_command_test.py +119 -119
- tests/simple_crawlo_test.py +127 -127
- tests/simple_log_test.py +57 -57
- tests/simple_log_test2.py +137 -137
- tests/simple_optimization_test.py +128 -128
- tests/simple_queue_type_test.py +41 -41
- tests/simple_spider_test.py +49 -49
- tests/simple_test.py +47 -47
- tests/spider_log_timing_test.py +177 -177
- tests/test_advanced_tools.py +148 -148
- tests/test_all_commands.py +230 -230
- tests/test_all_pipeline_fingerprints.py +133 -133
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_batch_processor.py +178 -178
- tests/test_cleaners.py +54 -54
- tests/test_component_factory.py +174 -174
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_controlled_spider_mixin.py +79 -79
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_dedup_fix.py +220 -220
- tests/test_dedup_pipeline_consistency.py +125 -0
- tests/test_default_header_middleware.py +313 -313
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +204 -204
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_enhanced_error_handler_comprehensive.py +245 -245
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_factories.py +252 -252
- tests/test_final_validation.py +153 -153
- tests/test_fingerprint_consistency.py +135 -135
- tests/test_fingerprint_simple.py +51 -51
- tests/test_framework_env_usage.py +103 -103
- tests/test_framework_logger.py +66 -66
- tests/test_framework_startup.py +64 -64
- tests/test_get_component_logger.py +83 -83
- tests/test_hash_performance.py +99 -99
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_large_scale_config.py +112 -112
- tests/test_large_scale_helper.py +235 -235
- tests/test_logging_enhancements.py +375 -0
- tests/test_logging_final.py +185 -0
- tests/test_logging_integration.py +313 -0
- tests/test_logging_system.py +282 -282
- tests/test_middleware_debug.py +142 -0
- tests/test_mode_change.py +72 -72
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +244 -244
- tests/test_offsite_middleware_simple.py +203 -203
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_performance_monitor.py +115 -115
- tests/test_pipeline_fingerprint_consistency.py +86 -86
- tests/test_priority_behavior.py +212 -0
- tests/test_priority_consistency.py +152 -0
- tests/test_priority_consistency_fixed.py +250 -0
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_empty_check.py +41 -41
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +179 -179
- tests/test_queue_naming.py +154 -154
- tests/test_queue_type.py +106 -106
- tests/test_queue_type_redis_config_consistency.py +131 -0
- tests/test_random_headers_default.py +323 -0
- tests/test_random_headers_necessity.py +309 -0
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_redis_queue_name_fix.py +175 -175
- tests/test_redis_queue_type_fallback.py +130 -0
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +334 -242
- tests/test_retry_middleware_realistic.py +274 -0
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agent_randomness.py +177 -0
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/untested_features_report.md +138 -138
- tests/verify_debug.py +51 -51
- tests/verify_distributed.py +117 -117
- tests/verify_log_fix.py +111 -111
- crawlo-1.4.2.dist-info/METADATA +0 -1199
- crawlo-1.4.2.dist-info/RECORD +0 -309
- {crawlo-1.4.2.dist-info → crawlo-1.4.3.dist-info}/WHEEL +0 -0
- {crawlo-1.4.2.dist-info → crawlo-1.4.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.2.dist-info → crawlo-1.4.3.dist-info}/top_level.txt +0 -0
crawlo/commands/utils.py
CHANGED
@@ -1,197 +1,197 @@

The hunk marks all of lines 1-196 as removed and re-added, but the removed and added text are identical in this extract, so the change is most likely limited to whitespace or line endings (consistent with the matching +N/-N counts across most files in this release). Line 197 (import unicodedata) is the only line shown as unchanged. The resulting file:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Shared module for the command-line tools.
Provides common functions and utilities for the CLI commands.
"""
import sys
import configparser
from pathlib import Path
from importlib import import_module
from typing import Optional, Tuple

from rich.console import Console
from rich.panel import Panel
from rich.text import Text

console = Console()


def get_project_root() -> Optional[Path]:
    """
    Auto-detect the project root: search upward from the current directory for crawlo.cfg.

    Returns:
        Path: path of the project root, or None if not found
    """
    current = Path.cwd()
    for _ in range(10):  # search at most 10 levels up
        cfg_file = current / "crawlo.cfg"
        if cfg_file.exists():
            return current
        if current == current.parent:
            break
        current = current.parent
    return None


def validate_project_environment() -> Tuple[bool, Optional[str], Optional[str]]:
    """
    Validate the project environment, ensuring the command runs inside a proper Crawlo project.

    Returns:
        Tuple[bool, Optional[str], Optional[str]]:
            (valid, project package name, error message)
    """
    # 1. Locate the project root
    project_root = get_project_root()
    if not project_root:
        return False, None, "Could not find 'crawlo.cfg'. Please run this command inside a project directory."

    # 2. Add the project root to the Python path
    project_root_str = str(project_root)
    if project_root_str not in sys.path:
        sys.path.insert(0, project_root_str)

    # 3. Read the config file
    cfg_file = project_root / "crawlo.cfg"
    config = configparser.ConfigParser()

    try:
        config.read(cfg_file, encoding="utf-8")
    except Exception as e:
        return False, None, f"Failed to read crawlo.cfg: {e}"

    if not config.has_section("settings") or not config.has_option("settings", "default"):
        return False, None, "Invalid crawlo.cfg: missing [settings] section or 'default' option"

    # 4. Get the project package name
    settings_module = config.get("settings", "default")
    project_package = settings_module.split(".")[0]

    # 5. Verify the project package is importable
    try:
        import_module(project_package)
    except ImportError as e:
        return False, None, f"Failed to import project package '{project_package}': {e}"

    return True, project_package, None


def show_error_panel(title: str, message: str, show_json: bool = False) -> None:
    """
    Display an error panel, or the error in JSON form.

    Args:
        title: error title
        message: error message
        show_json: whether to emit JSON output
    """
    if show_json:
        console.print_json(data={"success": False, "error": message})
    else:
        console.print(Panel(
            Text.from_markup(f"[bold red]{message}[/bold red]"),
            title=f"{title}",
            border_style="red",
            padding=(1, 2)
        ))


def show_success_panel(title: str, message: str, show_json: bool = False, data: dict = None) -> None:
    """
    Display a success panel, or the result in JSON form.

    Args:
        title: success title
        message: success message
        show_json: whether to emit JSON output
        data: JSON payload (used when show_json=True)
    """
    if show_json:
        result = {"success": True, "message": message}
        if data:
            result.update(data)
        console.print_json(data=result)
    else:
        console.print(Panel(
            Text.from_markup(f"[bold green]{message}[/bold green]"),
            title=f"{title}",
            border_style="green",
            padding=(1, 2)
        ))


def validate_spider_name(spider_name: str) -> bool:
    """
    Validate that a spider name is well-formed.

    Args:
        spider_name: spider name

    Returns:
        bool: whether it is valid
    """
    import re
    # Strip invisible (control/format) characters from the spider name
    cleaned_name = ''.join(c for c in spider_name if not unicodedata.category(c).startswith('C'))

    # A spider name should be a valid Python identifier
    return cleaned_name.isidentifier() and bool(re.match(r'^[a-z][a-z0-9_]*$', cleaned_name))


def format_file_size(size_bytes: int) -> str:
    """
    Format a file size for display.

    Args:
        size_bytes: number of bytes

    Returns:
        str: the formatted size string
    """
    for unit in ['B', 'KB', 'MB', 'GB']:
        if size_bytes < 1024.0:
            return f"{size_bytes:.1f} {unit}"
        size_bytes /= 1024.0
    return f"{size_bytes:.1f} TB"


def truncate_text(text: str, max_length: int = 80) -> str:
    """
    Truncate text that exceeds a maximum length.

    Args:
        text: original text
        max_length: maximum length

    Returns:
        str: the truncated text
    """
    if len(text) <= max_length:
        return text
    return text[:max_length-3] + "..."


def is_valid_domain(domain: str) -> bool:
    """
    Validate that a domain name is well-formed.

    Args:
        domain: domain name

    Returns:
        bool: whether it is valid
    """
    import re
    # Strip invisible (control/format) characters from the domain
    cleaned_domain = ''.join(c for c in domain if not unicodedata.category(c).startswith('C'))

    pattern = re.compile(
        r'^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)*[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$'
    )
    return bool(pattern.match(cleaned_domain))


# Import added at module bottom; it is resolved at import time, before any of
# the functions above can be called
import unicodedata
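For context, here is a minimal sketch of how these helpers compose inside a CLI command. The run_check entry point, its argument handling, and its messages are hypothetical illustrations, not crawlo's actual CLI code:

# Hypothetical wiring of the helpers above; not crawlo's actual CLI code.
import sys

from crawlo.commands.utils import (
    show_error_panel,
    show_success_panel,
    validate_project_environment,
    validate_spider_name,
)


def run_check(spider_name: str, as_json: bool = False) -> int:
    """Illustrative command: validate the project environment, then a spider name."""
    ok, package, error = validate_project_environment()
    if not ok:
        show_error_panel("Project Error", error, show_json=as_json)
        return 1

    if not validate_spider_name(spider_name):
        show_error_panel(
            "Invalid Spider Name",
            f"'{spider_name}' must be a lowercase Python identifier",
            show_json=as_json,
        )
        return 1

    show_success_panel(
        "Check Passed",
        f"Spider '{spider_name}' is valid in project '{package}'",
        show_json=as_json,
        data={"project": package, "spider": spider_name},
    )
    return 0


if __name__ == "__main__":
    sys.exit(run_check(sys.argv[1] if len(sys.argv) > 1 else "example_spider"))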