mlenvdoctor 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff shows the content of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two released versions.
- mlenvdoctor/__init__.py +16 -1
- mlenvdoctor/cli.py +56 -20
- mlenvdoctor/config.py +169 -0
- mlenvdoctor/constants.py +63 -0
- mlenvdoctor/diagnose.py +116 -25
- mlenvdoctor/exceptions.py +51 -0
- mlenvdoctor/export.py +290 -0
- mlenvdoctor/fix.py +2 -1
- mlenvdoctor/gpu.py +7 -1
- mlenvdoctor/icons.py +100 -0
- mlenvdoctor/logger.py +81 -0
- mlenvdoctor/parallel.py +115 -0
- mlenvdoctor/retry.py +92 -0
- mlenvdoctor/utils.py +79 -19
- mlenvdoctor/validators.py +217 -0
- {mlenvdoctor-0.1.1.dist-info → mlenvdoctor-0.1.2.dist-info}/METADATA +3 -2
- mlenvdoctor-0.1.2.dist-info/RECORD +21 -0
- mlenvdoctor-0.1.1.dist-info/RECORD +0 -12
- {mlenvdoctor-0.1.1.dist-info → mlenvdoctor-0.1.2.dist-info}/WHEEL +0 -0
- {mlenvdoctor-0.1.1.dist-info → mlenvdoctor-0.1.2.dist-info}/entry_points.txt +0 -0
- {mlenvdoctor-0.1.1.dist-info → mlenvdoctor-0.1.2.dist-info}/licenses/LICENSE +0 -0
mlenvdoctor/__init__.py
CHANGED
@@ -1,3 +1,18 @@
 """ML Environment Doctor - Diagnose and fix ML environments for LLM fine-tuning."""
 
-__version__ = "0.1.1"
+__version__ = "0.1.2"
+
+__all__ = [
+    "__version__",
+    "diagnose",
+    "fix",
+    "dockerize",
+    "export",
+    "exceptions",
+    "logger",
+    "config",
+    "validators",
+    "retry",
+    "parallel",
+    "constants",
+]
mlenvdoctor/cli.py
CHANGED
@@ -1,5 +1,6 @@
 """CLI entrypoint for ML Environment Doctor."""
 
+from pathlib import Path
 from typing import Optional
 
 import typer
@@ -7,13 +8,16 @@ import typer
 from . import __version__
 from .diagnose import diagnose_env, print_diagnostic_table
 from .dockerize import generate_dockerfile, generate_service_template
+from .export import export_csv, export_html, export_json
 from .fix import auto_fix
 from .gpu import benchmark_gpu_ops, smoke_test_lora, test_model as gpu_test_model
+from .icons import icon_check, icon_cross, icon_search, icon_test, icon_whale, icon_wrench
+from .logger import get_default_log_file, setup_logger
 from .utils import console
 
 app = typer.Typer(
     name="mlenvdoctor",
-    help="
+    help=f"{icon_search()} ML Environment Doctor - Diagnose & fix ML environments for LLM fine-tuning",
     add_completion=False,
 )
 
@@ -37,9 +41,21 @@ def main(
         is_eager=True,
         help="Show version and exit",
     ),
+    log_file: Optional[Path] = typer.Option(
+        None,
+        "--log-file",
+        help="Path to log file (default: ~/.mlenvdoctor/logs/mlenvdoctor.log)",
+    ),
+    log_level: str = typer.Option(
+        "INFO",
+        "--log-level",
+        help="Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL",
+    ),
 ):
     """ML Environment Doctor - Diagnose & fix ML environments for LLM fine-tuning."""
-
+    # Set up logging
+    log_path = log_file or get_default_log_file()
+    setup_logger(log_file=log_path, level=log_level)
 
 
 @app.command()
@@ -47,9 +63,18 @@ def diagnose(
     full: bool = typer.Option(
         False, "--full", "-f", help="Run full diagnostics including GPU benchmarks"
     ),
+    json_output: Optional[Path] = typer.Option(
+        None, "--json", help="Export results to JSON file"
+    ),
+    csv_output: Optional[Path] = typer.Option(
+        None, "--csv", help="Export results to CSV file"
+    ),
+    html_output: Optional[Path] = typer.Option(
+        None, "--html", help="Export results to HTML file"
+    ),
 ):
-    """
-
+    f"""
+    {icon_search()} Diagnose your ML environment.
 
     Quick scan: Checks CUDA, PyTorch, and required ML libraries.
     Full scan (--full): Also checks GPU memory, disk space, Docker GPU support, and connectivity.
@@ -57,6 +82,17 @@ def diagnose(
     issues = diagnose_env(full=full)
     print_diagnostic_table(issues)
 
+    # Export to formats if requested
+    if json_output:
+        export_json(issues, json_output)
+        console.print(f"[green]{icon_check()} Exported to {json_output}[/green]")
+    if csv_output:
+        export_csv(issues, csv_output)
+        console.print(f"[green]{icon_check()} Exported to {csv_output}[/green]")
+    if html_output:
+        export_html(issues, html_output)
+        console.print(f"[green]{icon_check()} Exported to {html_output}[/green]")
+
     if full:
         console.print()
         console.print("[bold blue]Running GPU benchmark...[/bold blue]")
@@ -78,8 +114,8 @@ def fix(
     venv: bool = typer.Option(False, "--venv", "-v", help="Create virtual environment"),
     stack: str = typer.Option("trl-peft", "--stack", "-s", help="ML stack: trl-peft or minimal"),
 ):
-    """
-
+    f"""
+    {icon_wrench()} Auto-fix environment issues and generate requirements.
 
     Generates requirements.txt or conda environment file based on detected issues.
     Optionally creates a virtual environment and installs dependencies.
@@ -87,7 +123,7 @@ def fix(
     success = auto_fix(use_conda=conda, create_venv=venv, stack=stack)
     if success:
         console.print()
-        console.print("[bold green]
+        console.print(f"[bold green]{icon_check()} Auto-fix completed![/bold green]")
         console.print("[yellow]💡 Run 'mlenvdoctor diagnose' to verify fixes[/yellow]")
 
 
@@ -101,8 +137,8 @@ def dockerize(
         "Dockerfile.mlenvdoctor", "--output", "-o", help="Output Dockerfile name"
     ),
 ):
-    """
-
+    f"""
+    {icon_whale()} Generate Dockerfile for ML fine-tuning.
 
     Creates a production-ready Dockerfile with CUDA support.
     Optionally generates a FastAPI service template.
@@ -115,45 +151,45 @@ def dockerize(
     generate_dockerfile(model_name=model, service=service, output_file=output)
 
     console.print()
-    console.print("[bold green]
+    console.print(f"[bold green]{icon_check()} Dockerfile generated![/bold green]")
 
 
 @app.command(name="test-model")
 def test_model_cmd(
     model: str = typer.Argument("tinyllama", help="Model to test (tinyllama, gpt2, mistral-7b)"),
 ):
-    """
-
+    f"""
+    {icon_test()} Run smoke test with a real LLM model.
 
     Tests model loading and forward pass to verify fine-tuning readiness.
     """
-    console.print(f"[bold blue]
+    console.print(f"[bold blue]{icon_test()} Testing model: {model}[/bold blue]\n")
     success = gpu_test_model(model_name=model)
     if success:
         console.print()
-        console.print("[bold green]
+        console.print(f"[bold green]{icon_check()} Model test passed! Ready for fine-tuning.[/bold green]")
     else:
         console.print()
-        console.print("[bold red]
+        console.print(f"[bold red]{icon_cross()} Model test failed. Check diagnostics.[/bold red]")
         raise typer.Exit(1)
 
 
 @app.command()
 def smoke_test():
-    """
-
+    f"""
+    {icon_test()} Run LoRA fine-tuning smoke test.
 
     Performs a minimal LoRA fine-tuning test to verify environment setup.
     """
-    console.print("[bold blue]
+    console.print(f"[bold blue]{icon_test()} Running LoRA smoke test...[/bold blue]\n")
     success = smoke_test_lora()
     if success:
         console.print()
-        console.print("[bold green]
+        console.print(f"[bold green]{icon_check()} Smoke test passed! Environment is ready.[/bold green]")
     else:
         console.print()
         console.print(
-            "[bold red]
+            f"[bold red]{icon_cross()} Smoke test failed. Run 'mlenvdoctor diagnose' for details.[/bold red]"
        )
         raise typer.Exit(1)
 
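For reference, a minimal sketch of what the new logging options boil down to when called directly in Python, based only on the call sites visible in this diff (the DEBUG level is an arbitrary example, not a package default):

from mlenvdoctor.logger import get_default_log_file, setup_logger

# Mirror what the new --log-file / --log-level options do in the main callback:
# fall back to the default log location and configure the package logger once.
log_path = get_default_log_file()
setup_logger(log_file=log_path, level="DEBUG")
print(f"Logging to {log_path}")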
mlenvdoctor/config.py
ADDED
@@ -0,0 +1,169 @@
+"""Configuration management for ML Environment Doctor."""
+
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+# Try tomllib (Python 3.11+)
+try:
+    import tomllib
+except ImportError:
+    tomllib = None  # type: ignore
+
+# Fallback to tomli for older Python versions
+try:
+    import tomli
+except ImportError:
+    tomli = None
+
+from .exceptions import ConfigurationError
+from .utils import get_home_config_dir
+
+
+def load_config(config_path: Optional[Path] = None) -> Dict[str, Any]:
+    """
+    Load configuration from TOML file.
+
+    Args:
+        config_path: Path to config file. If None, searches for:
+            1. mlenvdoctor.toml in current directory
+            2. .mlenvdoctorrc in current directory
+            3. ~/.mlenvdoctor/config.toml
+
+    Returns:
+        Configuration dictionary
+
+    Raises:
+        ConfigurationError: If config file is invalid
+    """
+    default_config: Dict[str, Any] = {
+        "diagnostics": {
+            "full_scan": False,
+            "skip_checks": [],
+        },
+        "fix": {
+            "default_stack": "trl-peft",
+            "auto_install": False,
+        },
+        "docker": {
+            "default_base_image": "nvidia/cuda:12.4.0-devel-ubuntu22.04",
+        },
+        "logging": {
+            "level": "INFO",
+            "file": None,
+        },
+    }
+
+    if config_path is None:
+        # Search for config files
+        search_paths = [
+            Path("mlenvdoctor.toml"),
+            Path(".mlenvdoctorrc"),
+            get_home_config_dir() / "config.toml",
+        ]
+
+        for path in search_paths:
+            if path.exists():
+                config_path = path
+                break
+    else:
+        if not config_path.exists():
+            raise ConfigurationError(
+                f"Config file not found: {config_path}",
+                "Create the file or use default configuration",
+            )
+
+    if config_path is None or not config_path.exists():
+        return default_config
+
+    try:
+        # Try tomllib (Python 3.11+)
+        if tomllib is not None:
+            with config_path.open("rb") as f:
+                user_config = tomllib.load(f)
+        elif tomli is not None:
+            # Fallback to tomli for older Python
+            with config_path.open("rb") as f:
+                user_config = tomli.load(f)
+        else:
+            raise ConfigurationError(
+                "TOML parsing not available. Install tomli: pip install tomli",
+                "Or upgrade to Python 3.11+",
+            )
+
+        # Merge with defaults
+        merged_config = default_config.copy()
+        for section, values in user_config.items():
+            if section in merged_config and isinstance(merged_config[section], dict):
+                merged_config[section].update(values)
+            else:
+                merged_config[section] = values
+
+        return merged_config
+
+    except Exception as e:
+        raise ConfigurationError(
+            f"Error parsing config file {config_path}: {e}",
+            "Check TOML syntax and file permissions",
+        ) from e
+
+
+def get_config_value(config: Dict[str, Any], *keys: str, default: Any = None) -> Any:
+    """
+    Get nested config value safely.
+
+    Args:
+        config: Configuration dictionary
+        *keys: Nested keys to traverse
+        default: Default value if key not found
+
+    Returns:
+        Config value or default
+    """
+    value = config
+    for key in keys:
+        if isinstance(value, dict):
+            value = value.get(key)
+            if value is None:
+                return default
+        else:
+            return default
+    return value if value is not None else default
+
+
+def create_default_config(output_path: Path) -> Path:
+    """
+    Create a default configuration file.
+
+    Args:
+        output_path: Path where to create config file
+
+    Returns:
+        Path to created config file
+    """
+    default_content = """# ML Environment Doctor Configuration
+
+[diagnostics]
+# Run full scan by default
+full_scan = false
+# Skip specific checks (e.g., ["docker_gpu", "internet"])
+skip_checks = []
+
+[fix]
+# Default ML stack: "trl-peft" or "minimal"
+default_stack = "trl-peft"
+# Automatically install dependencies without prompting
+auto_install = false
+
+[docker]
+# Default base image for Dockerfiles
+default_base_image = "nvidia/cuda:12.4.0-devel-ubuntu22.04"
+
+[logging]
+# Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL
+level = "INFO"
+# Log file path (None for default: ~/.mlenvdoctor/logs/mlenvdoctor.log)
+file = null
+"""
+
+    output_path.write_text(default_content, encoding="utf-8")
+    return output_path
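For orientation, a minimal sketch of how the new config module might be consumed, using only the functions and default keys shown above (the surrounding script is illustrative, not part of the package):

from mlenvdoctor.config import get_config_value, load_config

# Search the usual locations (mlenvdoctor.toml, .mlenvdoctorrc, ~/.mlenvdoctor/config.toml);
# if none exist, the built-in defaults shown in the diff are returned.
config = load_config()

# Nested lookups fall back to `default` when a key is missing.
stack = get_config_value(config, "fix", "default_stack", default="trl-peft")
base_image = get_config_value(config, "docker", "default_base_image")
print(f"stack={stack}, base image={base_image}")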
mlenvdoctor/constants.py
ADDED
@@ -0,0 +1,63 @@
+"""Constants used throughout ML Environment Doctor."""
+
+from typing import Final
+
+# Version compatibility
+MIN_PYTHON_VERSION: Final[tuple[int, int]] = (3, 8)
+MIN_PYTORCH_VERSION: Final[str] = "2.4.0"
+
+# CUDA versions
+SUPPORTED_CUDA_VERSIONS: Final[list[str]] = ["12.1", "12.4"]
+DEFAULT_CUDA_VERSION: Final[str] = "12.4"
+
+# ML Library versions
+MIN_TRANSFORMERS_VERSION: Final[str] = "4.44.0"
+MIN_PEFT_VERSION: Final[str] = "0.12.0"
+MIN_TRL_VERSION: Final[str] = "0.9.0"
+MIN_DATASETS_VERSION: Final[str] = "2.20.0"
+MIN_ACCELERATE_VERSION: Final[str] = "1.0.0"
+
+# Memory requirements (GB)
+MIN_GPU_MEMORY_GB: Final[int] = 8
+RECOMMENDED_GPU_MEMORY_GB: Final[int] = 16
+MIN_DISK_SPACE_GB: Final[int] = 50
+
+# Timeouts (seconds)
+DEFAULT_COMMAND_TIMEOUT: Final[int] = 30
+DEFAULT_NETWORK_TIMEOUT: Final[int] = 10
+DEFAULT_INSTALL_TIMEOUT: Final[int] = 600
+
+# File paths
+DEFAULT_CONFIG_FILE: Final[str] = "mlenvdoctor.toml"
+DEFAULT_REQUIREMENTS_FILE: Final[str] = "requirements-mlenvdoctor.txt"
+DEFAULT_DOCKERFILE: Final[str] = "Dockerfile.mlenvdoctor"
+
+# Model names
+SUPPORTED_MODELS: Final[dict[str, str]] = {
+    "tinyllama": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "gpt2": "gpt2",
+    "mistral-7b": "mistralai/Mistral-7B-v0.1",
+}
+
+# ML Stacks
+ML_STACKS: Final[list[str]] = ["trl-peft", "minimal"]
+
+# Diagnostic check names
+CHECK_CUDA_DRIVER: Final[str] = "cuda_driver"
+CHECK_PYTORCH_CUDA: Final[str] = "pytorch_cuda"
+CHECK_ML_LIBRARIES: Final[str] = "ml_libraries"
+CHECK_GPU_MEMORY: Final[str] = "gpu_memory"
+CHECK_DISK_SPACE: Final[str] = "disk_space"
+CHECK_DOCKER_GPU: Final[str] = "docker_gpu"
+CHECK_INTERNET: Final[str] = "internet"
+
+# Severity levels
+SEVERITY_CRITICAL: Final[str] = "critical"
+SEVERITY_WARNING: Final[str] = "warning"
+SEVERITY_INFO: Final[str] = "info"
+
+# Status values
+STATUS_PASS: Final[str] = "PASS"
+STATUS_FAIL: Final[str] = "FAIL"
+STATUS_WARN: Final[str] = "WARN"
+STATUS_INFO: Final[str] = "INFO"
mlenvdoctor/diagnose.py
CHANGED
@@ -11,6 +11,7 @@ except ImportError:
 
 from rich.table import Table
 
+from .icons import icon_check, icon_cross, icon_info, icon_search, icon_warning
 from .utils import check_command_exists, console, format_size, get_home_config_dir, run_command
 
 
@@ -33,12 +34,13 @@ class DiagnosticIssue:
 
     def to_row(self) -> Tuple[str, str, str, str]:
         """Convert to table row."""
-
-            "PASS":
-            "FAIL":
-            "WARN":
-            "INFO":
-        }
+        status_icon_map = {
+            "PASS": icon_check(),
+            "FAIL": icon_cross(),
+            "WARN": icon_warning(),
+            "INFO": icon_info(),
+        }
+        status_icon = status_icon_map.get(self.status.split()[0], "?")
         return (
             self.name,
             f"{status_icon} {self.status}",
@@ -411,11 +413,19 @@ def check_docker_gpu() -> List[DiagnosticIssue]:
 
 def check_internet_connectivity() -> List[DiagnosticIssue]:
     """Check internet connectivity for HF Hub."""
+    from .retry import retry_network
+
     issues = []
-
+
+    @retry_network
+    def _check_connectivity() -> bool:
         import urllib.request
 
         urllib.request.urlopen("https://huggingface.co", timeout=5)
+        return True
+
+    try:
+        _check_connectivity()
         issues.append(
             DiagnosticIssue(
                 name="Internet Connectivity",
@@ -424,36 +434,117 @@ def check_internet_connectivity() -> List[DiagnosticIssue]:
                 fix="",
             )
         )
-    except Exception:
+    except Exception as e:
         issues.append(
             DiagnosticIssue(
                 name="Internet Connectivity",
                 status="WARN - Cannot reach HF Hub",
                 severity="warning",
                 fix="Check internet connection and firewall settings",
+                details=str(e),
             )
         )
 
     return issues
 
 
-def diagnose_env(full: bool = False) -> List[DiagnosticIssue]:
-    """
-
+def diagnose_env(full: bool = False, parallel: bool = True) -> List[DiagnosticIssue]:
+    """
+    Run all diagnostic checks.
+
+    Args:
+        full: Whether to run full diagnostics including extended checks
+        parallel: Whether to run independent checks in parallel
 
-
+    Returns:
+        List of diagnostic issues found
+    """
+    from .parallel import run_parallel_with_results
 
-
-    all_issues.extend(check_cuda_driver())
-    all_issues.extend(check_pytorch_cuda())
-    all_issues.extend(check_ml_libraries())
+    all_issues: List[DiagnosticIssue] = []
 
-
+    console.print(f"[bold blue]{icon_search()} Running ML Environment Diagnostics...[/bold blue]\n")
+
+    # Core checks (always run) - these can run in parallel
+    core_checks = [
+        check_cuda_driver,
+        check_pytorch_cuda,
+        check_ml_libraries,
+    ]
+
+    if parallel:
+        # Run core checks in parallel
+        results = run_parallel_with_results(
+            lambda check_func: check_func(),
+            core_checks,
+            max_workers=3,
+            timeout=60.0,
+        )
+        for check_func, result in results:
+            if isinstance(result, Exception):
+                # Log error but continue with other checks
+                from .logger import logger
+                logger.error(f"Check {check_func.__name__} failed: {result}")
+                # Add a diagnostic issue for the failure
+                all_issues.append(
+                    DiagnosticIssue(
+                        name=check_func.__name__.replace("check_", "").replace("_", " ").title(),
+                        status="FAIL - Check error",
+                        severity="critical",
+                        fix="Run diagnostics again or check logs",
+                        details=str(result),
+                    )
+                )
+            else:
+                all_issues.extend(result)
+    else:
+        # Sequential execution (fallback)
+        for check_func in core_checks:
+            try:
+                all_issues.extend(check_func())
+            except Exception as e:
+                from .logger import logger
+                logger.error(f"Check {check_func.__name__} failed: {e}")
+                all_issues.append(
+                    DiagnosticIssue(
+                        name=check_func.__name__.replace("check_", "").replace("_", " ").title(),
+                        status="FAIL - Check error",
+                        severity="critical",
+                        fix="Run diagnostics again or check logs",
+                        details=str(e),
+                    )
+                )
+
+    # Extended checks (if --full) - can also run in parallel
     if full:
-
-
-
-
+        extended_checks = [
+            check_gpu_memory,
+            check_disk_space,
+            check_docker_gpu,
+            check_internet_connectivity,
+        ]
+
+        if parallel:
+            results = run_parallel_with_results(
+                lambda check_func: check_func(),
+                extended_checks,
+                max_workers=4,
+                timeout=120.0,
+            )
+            for check_func, result in results:
+                if isinstance(result, Exception):
+                    from .logger import logger
+                    logger.warning(f"Extended check {check_func.__name__} failed: {result}")
+                    # Extended checks are less critical, so we log but don't fail
+                else:
+                    all_issues.extend(result)
+        else:
+            for check_func in extended_checks:
+                try:
+                    all_issues.extend(check_func())
+                except Exception as e:
+                    from .logger import logger
+                    logger.warning(f"Extended check {check_func.__name__} failed: {e}")
 
     return all_issues
 
@@ -484,18 +575,18 @@ def print_diagnostic_table(issues: List[DiagnosticIssue]) -> None:
     pass_count = sum(1 for i in issues if "PASS" in i.status)
 
     console.print()
-    console.print(f"[green]
+    console.print(f"[green]{icon_check()} Passed: {pass_count}[/green]")
     if warning_count > 0:
-        console.print(f"[yellow]
+        console.print(f"[yellow]{icon_warning()} Warnings: {warning_count}[/yellow]")
     if critical_count > 0:
-        console.print(f"[red]
+        console.print(f"[red]{icon_cross()} Critical Issues: {critical_count}[/red]")
 
     if critical_count == 0 and warning_count == 0:
         console.print(
             "\n[bold green]🎉 Your ML environment looks ready for fine-tuning![/bold green]"
         )
     elif critical_count > 0:
-        console.print("\n[bold red]
+        console.print(f"\n[bold red]{icon_warning()} Please fix critical issues before proceeding.[/bold red]")
     else:
         console.print(
             "\n[bold yellow]💡 Consider addressing warnings for optimal performance.[/bold yellow]"
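A minimal sketch of driving the updated diagnostics programmatically, assuming only the signatures visible in this diff (the report.json file name mirrors the new --json option in cli.py and is an arbitrary example):

from pathlib import Path

from mlenvdoctor.diagnose import diagnose_env, print_diagnostic_table
from mlenvdoctor.export import export_json

# Run the core checks in parallel (the default); pass parallel=False to fall
# back to sequential execution, and full=True to include the extended checks.
issues = diagnose_env(full=True, parallel=True)
print_diagnostic_table(issues)

# Persist the results the same way `mlenvdoctor diagnose --json report.json` does.
export_json(issues, Path("report.json"))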
mlenvdoctor/exceptions.py
ADDED
@@ -0,0 +1,51 @@
+"""Custom exceptions for ML Environment Doctor."""
+
+
+class MLEnvDoctorError(Exception):
+    """Base exception for ML Environment Doctor."""
+
+    def __init__(self, message: str, suggestion: str = ""):
+        super().__init__(message)
+        self.message = message
+        self.suggestion = suggestion
+
+    def __str__(self) -> str:
+        if self.suggestion:
+            return f"{self.message}\n💡 Suggestion: {self.suggestion}"
+        return self.message
+
+
+class DiagnosticError(MLEnvDoctorError):
+    """Error during diagnostic checks."""
+
+    pass
+
+
+class FixError(MLEnvDoctorError):
+    """Error during auto-fix operations."""
+
+    pass
+
+
+class DockerError(MLEnvDoctorError):
+    """Error during Docker operations."""
+
+    pass
+
+
+class GPUError(MLEnvDoctorError):
+    """Error related to GPU operations."""
+
+    pass
+
+
+class ConfigurationError(MLEnvDoctorError):
+    """Error in configuration."""
+
+    pass
+
+
+class InstallationError(MLEnvDoctorError):
+    """Error during package installation."""
+
+    pass