mlenvdoctor 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- mlenvdoctor/__init__.py +15 -1
- mlenvdoctor/cli.py +80 -30
- mlenvdoctor/config.py +169 -0
- mlenvdoctor/constants.py +63 -0
- mlenvdoctor/diagnose.py +146 -46
- mlenvdoctor/dockerize.py +3 -6
- mlenvdoctor/exceptions.py +51 -0
- mlenvdoctor/export.py +290 -0
- mlenvdoctor/fix.py +19 -13
- mlenvdoctor/gpu.py +15 -9
- mlenvdoctor/icons.py +100 -0
- mlenvdoctor/logger.py +81 -0
- mlenvdoctor/parallel.py +115 -0
- mlenvdoctor/retry.py +92 -0
- mlenvdoctor/utils.py +79 -22
- mlenvdoctor/validators.py +217 -0
- {mlenvdoctor-0.1.0.dist-info → mlenvdoctor-0.1.2.dist-info}/METADATA +3 -2
- mlenvdoctor-0.1.2.dist-info/RECORD +21 -0
- mlenvdoctor-0.1.0.dist-info/RECORD +0 -12
- {mlenvdoctor-0.1.0.dist-info → mlenvdoctor-0.1.2.dist-info}/WHEEL +0 -0
- {mlenvdoctor-0.1.0.dist-info → mlenvdoctor-0.1.2.dist-info}/entry_points.txt +0 -0
- {mlenvdoctor-0.1.0.dist-info → mlenvdoctor-0.1.2.dist-info}/licenses/LICENSE +0 -0
mlenvdoctor/__init__.py
CHANGED
@@ -1,4 +1,18 @@
 """ML Environment Doctor - Diagnose and fix ML environments for LLM fine-tuning."""
 
-__version__ = "0.1.0"
+__version__ = "0.1.2"
 
+__all__ = [
+    "__version__",
+    "diagnose",
+    "fix",
+    "dockerize",
+    "export",
+    "exceptions",
+    "logger",
+    "config",
+    "validators",
+    "retry",
+    "parallel",
+    "constants",
+]
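A quick sanity check of the version bump and the new export surface (illustrative snippet, not part of the diff; assumes the 0.1.2 wheel is installed):

    import mlenvdoctor

    # Version string bumped in this release.
    assert mlenvdoctor.__version__ == "0.1.2"

    # __all__ now names the package's submodules, so they can be imported
    # explicitly (or pulled in via a star-import).
    from mlenvdoctor import config, constants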
mlenvdoctor/cli.py
CHANGED
@@ -1,5 +1,6 @@
 """CLI entrypoint for ML Environment Doctor."""
 
+from pathlib import Path
 from typing import Optional
 
 import typer
@@ -7,13 +8,16 @@ import typer
 from . import __version__
 from .diagnose import diagnose_env, print_diagnostic_table
 from .dockerize import generate_dockerfile, generate_service_template
+from .export import export_csv, export_html, export_json
 from .fix import auto_fix
-from .gpu import benchmark_gpu_ops, smoke_test_lora, test_model
+from .gpu import benchmark_gpu_ops, smoke_test_lora, test_model as gpu_test_model
+from .icons import icon_check, icon_cross, icon_search, icon_test, icon_whale, icon_wrench
+from .logger import get_default_log_file, setup_logger
 from .utils import console
 
 app = typer.Typer(
     name="mlenvdoctor",
-    help="
+    help=f"{icon_search()} ML Environment Doctor - Diagnose & fix ML environments for LLM fine-tuning",
     add_completion=False,
 )
 
@@ -21,26 +25,56 @@ app = typer.Typer(
 def version_callback(value: bool):
     """Print version and exit."""
     if value:
-        console.print(
+        console.print(
+            f"[bold blue]ML Environment Doctor[/bold blue] version [cyan]{__version__}[/cyan]"
+        )
         raise typer.Exit()
 
 
 @app.callback()
 def main(
     version: Optional[bool] = typer.Option(
-        None,
+        None,
+        "--version",
+        "-v",
+        callback=version_callback,
+        is_eager=True,
+        help="Show version and exit",
+    ),
+    log_file: Optional[Path] = typer.Option(
+        None,
+        "--log-file",
+        help="Path to log file (default: ~/.mlenvdoctor/logs/mlenvdoctor.log)",
+    ),
+    log_level: str = typer.Option(
+        "INFO",
+        "--log-level",
+        help="Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL",
     ),
 ):
     """ML Environment Doctor - Diagnose & fix ML environments for LLM fine-tuning."""
-
+    # Set up logging
+    log_path = log_file or get_default_log_file()
+    setup_logger(log_file=log_path, level=log_level)
 
 
 @app.command()
 def diagnose(
-    full: bool = typer.Option(
+    full: bool = typer.Option(
+        False, "--full", "-f", help="Run full diagnostics including GPU benchmarks"
+    ),
+    json_output: Optional[Path] = typer.Option(
+        None, "--json", help="Export results to JSON file"
+    ),
+    csv_output: Optional[Path] = typer.Option(
+        None, "--csv", help="Export results to CSV file"
+    ),
+    html_output: Optional[Path] = typer.Option(
+        None, "--html", help="Export results to HTML file"
+    ),
 ):
-    """
-
+    f"""
+    {icon_search()} Diagnose your ML environment.
 
     Quick scan: Checks CUDA, PyTorch, and required ML libraries.
     Full scan (--full): Also checks GPU memory, disk space, Docker GPU support, and connectivity.
@@ -48,6 +82,17 @@ def diagnose(
     issues = diagnose_env(full=full)
     print_diagnostic_table(issues)
 
+    # Export to formats if requested
+    if json_output:
+        export_json(issues, json_output)
+        console.print(f"[green]{icon_check()} Exported to {json_output}[/green]")
+    if csv_output:
+        export_csv(issues, csv_output)
+        console.print(f"[green]{icon_check()} Exported to {csv_output}[/green]")
+    if html_output:
+        export_html(issues, html_output)
+        console.print(f"[green]{icon_check()} Exported to {html_output}[/green]")
+
     if full:
         console.print()
         console.print("[bold blue]Running GPU benchmark...[/bold blue]")
@@ -69,8 +114,8 @@ def fix(
     venv: bool = typer.Option(False, "--venv", "-v", help="Create virtual environment"),
     stack: str = typer.Option("trl-peft", "--stack", "-s", help="ML stack: trl-peft or minimal"),
 ):
-    """
-
+    f"""
+    {icon_wrench()} Auto-fix environment issues and generate requirements.
 
     Generates requirements.txt or conda environment file based on detected issues.
     Optionally creates a virtual environment and installs dependencies.
@@ -78,18 +123,22 @@ def fix(
     success = auto_fix(use_conda=conda, create_venv=venv, stack=stack)
     if success:
         console.print()
-        console.print("[bold green]
+        console.print(f"[bold green]{icon_check()} Auto-fix completed![/bold green]")
         console.print("[yellow]💡 Run 'mlenvdoctor diagnose' to verify fixes[/yellow]")
 
 
 @app.command()
 def dockerize(
     model: Optional[str] = typer.Argument(None, help="Model name (mistral-7b, tinyllama, gpt2)"),
-    service: bool = typer.Option(
-
+    service: bool = typer.Option(
+        False, "--service", "-s", help="Generate FastAPI service template"
+    ),
+    output: str = typer.Option(
+        "Dockerfile.mlenvdoctor", "--output", "-o", help="Output Dockerfile name"
+    ),
 ):
-    """
-
+    f"""
+    {icon_whale()} Generate Dockerfile for ML fine-tuning.
 
     Creates a production-ready Dockerfile with CUDA support.
     Optionally generates a FastAPI service template.
@@ -102,44 +151,46 @@ def dockerize(
     generate_dockerfile(model_name=model, service=service, output_file=output)
 
     console.print()
-    console.print("[bold green]
+    console.print(f"[bold green]{icon_check()} Dockerfile generated![/bold green]")
 
 
-@app.command()
-def
+@app.command(name="test-model")
+def test_model_cmd(
     model: str = typer.Argument("tinyllama", help="Model to test (tinyllama, gpt2, mistral-7b)"),
 ):
-    """
-
+    f"""
+    {icon_test()} Run smoke test with a real LLM model.
 
     Tests model loading and forward pass to verify fine-tuning readiness.
     """
-    console.print(f"[bold blue]
-    success =
+    console.print(f"[bold blue]{icon_test()} Testing model: {model}[/bold blue]\n")
+    success = gpu_test_model(model_name=model)
     if success:
         console.print()
-        console.print("[bold green]
+        console.print(f"[bold green]{icon_check()} Model test passed! Ready for fine-tuning.[/bold green]")
     else:
         console.print()
-        console.print("[bold red]
+        console.print(f"[bold red]{icon_cross()} Model test failed. Check diagnostics.[/bold red]")
         raise typer.Exit(1)
 
 
 @app.command()
 def smoke_test():
-    """
-
+    f"""
+    {icon_test()} Run LoRA fine-tuning smoke test.
 
     Performs a minimal LoRA fine-tuning test to verify environment setup.
     """
-    console.print("[bold blue]
+    console.print(f"[bold blue]{icon_test()} Running LoRA smoke test...[/bold blue]\n")
     success = smoke_test_lora()
     if success:
         console.print()
-        console.print("[bold green]
+        console.print(f"[bold green]{icon_check()} Smoke test passed! Environment is ready.[/bold green]")
    else:
         console.print()
-        console.print(
+        console.print(
+            f"[bold red]{icon_cross()} Smoke test failed. Run 'mlenvdoctor diagnose' for details.[/bold red]"
+        )
         raise typer.Exit(1)
 
 
@@ -150,4 +201,3 @@ def main_cli():
 
 if __name__ == "__main__":
     main_cli()
-
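To exercise the new diagnose flags end to end, Typer's test runner is one option (an illustrative sketch, not part of the diff; assumes the 0.1.2 wheel is installed and diagnostics can run on the host):

    from typer.testing import CliRunner

    from mlenvdoctor.cli import app

    runner = CliRunner()

    # Equivalent to the shell invocation:
    #   mlenvdoctor diagnose --full --json report.json
    result = runner.invoke(app, ["diagnose", "--full", "--json", "report.json"])
    print(result.exit_code)
    print(result.output)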
mlenvdoctor/config.py
ADDED
@@ -0,0 +1,169 @@
+"""Configuration management for ML Environment Doctor."""
+
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+# Try tomllib (Python 3.11+)
+try:
+    import tomllib
+except ImportError:
+    tomllib = None  # type: ignore
+
+# Fallback to tomli for older Python versions
+try:
+    import tomli
+except ImportError:
+    tomli = None
+
+from .exceptions import ConfigurationError
+from .utils import get_home_config_dir
+
+
+def load_config(config_path: Optional[Path] = None) -> Dict[str, Any]:
+    """
+    Load configuration from TOML file.
+
+    Args:
+        config_path: Path to config file. If None, searches for:
+            1. mlenvdoctor.toml in current directory
+            2. .mlenvdoctorrc in current directory
+            3. ~/.mlenvdoctor/config.toml
+
+    Returns:
+        Configuration dictionary
+
+    Raises:
+        ConfigurationError: If config file is invalid
+    """
+    default_config: Dict[str, Any] = {
+        "diagnostics": {
+            "full_scan": False,
+            "skip_checks": [],
+        },
+        "fix": {
+            "default_stack": "trl-peft",
+            "auto_install": False,
+        },
+        "docker": {
+            "default_base_image": "nvidia/cuda:12.4.0-devel-ubuntu22.04",
+        },
+        "logging": {
+            "level": "INFO",
+            "file": None,
+        },
+    }
+
+    if config_path is None:
+        # Search for config files
+        search_paths = [
+            Path("mlenvdoctor.toml"),
+            Path(".mlenvdoctorrc"),
+            get_home_config_dir() / "config.toml",
+        ]
+
+        for path in search_paths:
+            if path.exists():
+                config_path = path
+                break
+    else:
+        if not config_path.exists():
+            raise ConfigurationError(
+                f"Config file not found: {config_path}",
+                "Create the file or use default configuration",
+            )
+
+    if config_path is None or not config_path.exists():
+        return default_config
+
+    try:
+        # Try tomllib (Python 3.11+)
+        if tomllib is not None:
+            with config_path.open("rb") as f:
+                user_config = tomllib.load(f)
+        elif tomli is not None:
+            # Fallback to tomli for older Python
+            with config_path.open("rb") as f:
+                user_config = tomli.load(f)
+        else:
+            raise ConfigurationError(
+                "TOML parsing not available. Install tomli: pip install tomli",
+                "Or upgrade to Python 3.11+",
+            )
+
+        # Merge with defaults
+        merged_config = default_config.copy()
+        for section, values in user_config.items():
+            if section in merged_config and isinstance(merged_config[section], dict):
+                merged_config[section].update(values)
+            else:
+                merged_config[section] = values
+
+        return merged_config
+
+    except Exception as e:
+        raise ConfigurationError(
+            f"Error parsing config file {config_path}: {e}",
+            "Check TOML syntax and file permissions",
+        ) from e
+
+
+def get_config_value(config: Dict[str, Any], *keys: str, default: Any = None) -> Any:
+    """
+    Get nested config value safely.
+
+    Args:
+        config: Configuration dictionary
+        *keys: Nested keys to traverse
+        default: Default value if key not found
+
+    Returns:
+        Config value or default
+    """
+    value = config
+    for key in keys:
+        if isinstance(value, dict):
+            value = value.get(key)
+            if value is None:
+                return default
+        else:
+            return default
+    return value if value is not None else default
+
+
+def create_default_config(output_path: Path) -> Path:
+    """
+    Create a default configuration file.
+
+    Args:
+        output_path: Path where to create config file
+
+    Returns:
+        Path to created config file
+    """
+    default_content = """# ML Environment Doctor Configuration
+
+[diagnostics]
+# Run full scan by default
+full_scan = false
+# Skip specific checks (e.g., ["docker_gpu", "internet"])
+skip_checks = []
+
+[fix]
+# Default ML stack: "trl-peft" or "minimal"
+default_stack = "trl-peft"
+# Automatically install dependencies without prompting
+auto_install = false
+
+[docker]
+# Default base image for Dockerfiles
+default_base_image = "nvidia/cuda:12.4.0-devel-ubuntu22.04"
+
+[logging]
+# Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL
+level = "INFO"
+# Log file path (None for default: ~/.mlenvdoctor/logs/mlenvdoctor.log)
+file = null
+"""
+
+    output_path.write_text(default_content, encoding="utf-8")
+    return output_path
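For context, a minimal sketch of how these helpers compose (illustrative, not part of the diff; assumes the package is installed, and that no config file is on disk, so load_config() returns the built-in defaults):

    from mlenvdoctor.config import get_config_value, load_config

    # Loads mlenvdoctor.toml, .mlenvdoctorrc, or ~/.mlenvdoctor/config.toml if
    # one exists; otherwise falls back to the defaults defined above.
    config = load_config()

    # Safe nested lookup; keys mirror the TOML sections.
    stack = get_config_value(config, "fix", "default_stack", default="minimal")
    level = get_config_value(config, "logging", "level", default="INFO")
    print(stack, level)  # e.g. "trl-peft INFO"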
mlenvdoctor/constants.py
ADDED
@@ -0,0 +1,63 @@
+"""Constants used throughout ML Environment Doctor."""
+
+from typing import Final
+
+# Version compatibility
+MIN_PYTHON_VERSION: Final[tuple[int, int]] = (3, 8)
+MIN_PYTORCH_VERSION: Final[str] = "2.4.0"
+
+# CUDA versions
+SUPPORTED_CUDA_VERSIONS: Final[list[str]] = ["12.1", "12.4"]
+DEFAULT_CUDA_VERSION: Final[str] = "12.4"
+
+# ML Library versions
+MIN_TRANSFORMERS_VERSION: Final[str] = "4.44.0"
+MIN_PEFT_VERSION: Final[str] = "0.12.0"
+MIN_TRL_VERSION: Final[str] = "0.9.0"
+MIN_DATASETS_VERSION: Final[str] = "2.20.0"
+MIN_ACCELERATE_VERSION: Final[str] = "1.0.0"
+
+# Memory requirements (GB)
+MIN_GPU_MEMORY_GB: Final[int] = 8
+RECOMMENDED_GPU_MEMORY_GB: Final[int] = 16
+MIN_DISK_SPACE_GB: Final[int] = 50
+
+# Timeouts (seconds)
+DEFAULT_COMMAND_TIMEOUT: Final[int] = 30
+DEFAULT_NETWORK_TIMEOUT: Final[int] = 10
+DEFAULT_INSTALL_TIMEOUT: Final[int] = 600
+
+# File paths
+DEFAULT_CONFIG_FILE: Final[str] = "mlenvdoctor.toml"
+DEFAULT_REQUIREMENTS_FILE: Final[str] = "requirements-mlenvdoctor.txt"
+DEFAULT_DOCKERFILE: Final[str] = "Dockerfile.mlenvdoctor"
+
+# Model names
+SUPPORTED_MODELS: Final[dict[str, str]] = {
+    "tinyllama": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "gpt2": "gpt2",
+    "mistral-7b": "mistralai/Mistral-7B-v0.1",
+}
+
+# ML Stacks
+ML_STACKS: Final[list[str]] = ["trl-peft", "minimal"]
+
+# Diagnostic check names
+CHECK_CUDA_DRIVER: Final[str] = "cuda_driver"
+CHECK_PYTORCH_CUDA: Final[str] = "pytorch_cuda"
+CHECK_ML_LIBRARIES: Final[str] = "ml_libraries"
+CHECK_GPU_MEMORY: Final[str] = "gpu_memory"
+CHECK_DISK_SPACE: Final[str] = "disk_space"
+CHECK_DOCKER_GPU: Final[str] = "docker_gpu"
+CHECK_INTERNET: Final[str] = "internet"
+
+# Severity levels
+SEVERITY_CRITICAL: Final[str] = "critical"
+SEVERITY_WARNING: Final[str] = "warning"
+SEVERITY_INFO: Final[str] = "info"
+
+# Status values
+STATUS_PASS: Final[str] = "PASS"
+STATUS_FAIL: Final[str] = "FAIL"
+STATUS_WARN: Final[str] = "WARN"
+STATUS_INFO: Final[str] = "INFO"