claude-turing 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +34 -0
- package/LICENSE +21 -0
- package/README.md +457 -0
- package/agents/ml-evaluator.md +43 -0
- package/agents/ml-researcher.md +74 -0
- package/bin/cli.js +46 -0
- package/bin/turing-init.sh +57 -0
- package/commands/brief.md +83 -0
- package/commands/compare.md +24 -0
- package/commands/design.md +97 -0
- package/commands/init.md +123 -0
- package/commands/logbook.md +51 -0
- package/commands/mode.md +43 -0
- package/commands/poster.md +89 -0
- package/commands/preflight.md +75 -0
- package/commands/report.md +97 -0
- package/commands/rules/loop-protocol.md +91 -0
- package/commands/status.md +24 -0
- package/commands/suggest.md +95 -0
- package/commands/sweep.md +45 -0
- package/commands/train.md +66 -0
- package/commands/try.md +63 -0
- package/commands/turing.md +54 -0
- package/commands/validate.md +34 -0
- package/config/defaults.yaml +45 -0
- package/config/experiment_archetypes.yaml +127 -0
- package/config/lifecycle.toml +31 -0
- package/config/novelty_aliases.yaml +107 -0
- package/config/relationships.toml +125 -0
- package/config/state.toml +24 -0
- package/config/task_taxonomy.yaml +110 -0
- package/config/taxonomy.toml +37 -0
- package/package.json +54 -0
- package/src/claude-md.js +55 -0
- package/src/install.js +107 -0
- package/src/paths.js +20 -0
- package/src/postinstall.js +22 -0
- package/src/verify.js +109 -0
- package/templates/MEMORY.md +36 -0
- package/templates/README.md +93 -0
- package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
- package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
- package/templates/config.yaml +48 -0
- package/templates/evaluate.py +237 -0
- package/templates/features/__init__.py +0 -0
- package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
- package/templates/features/featurizers.py +138 -0
- package/templates/prepare.py +171 -0
- package/templates/program.md +216 -0
- package/templates/pyproject.toml +8 -0
- package/templates/requirements.txt +8 -0
- package/templates/scripts/__init__.py +0 -0
- package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
- package/templates/scripts/check_convergence.py +230 -0
- package/templates/scripts/compare_runs.py +124 -0
- package/templates/scripts/critique_hypothesis.py +350 -0
- package/templates/scripts/experiment_index.py +288 -0
- package/templates/scripts/generate_brief.py +389 -0
- package/templates/scripts/generate_logbook.py +423 -0
- package/templates/scripts/log_experiment.py +243 -0
- package/templates/scripts/manage_hypotheses.py +543 -0
- package/templates/scripts/novelty_guard.py +343 -0
- package/templates/scripts/parse_metrics.py +139 -0
- package/templates/scripts/post-train-hook.sh +74 -0
- package/templates/scripts/preflight.py +549 -0
- package/templates/scripts/scaffold.py +409 -0
- package/templates/scripts/show_environment.py +92 -0
- package/templates/scripts/show_experiment_tree.py +144 -0
- package/templates/scripts/show_families.py +133 -0
- package/templates/scripts/show_metrics.py +157 -0
- package/templates/scripts/statistical_compare.py +259 -0
- package/templates/scripts/stop-hook.sh +34 -0
- package/templates/scripts/suggest_next.py +301 -0
- package/templates/scripts/sweep.py +276 -0
- package/templates/scripts/synthesize_decision.py +300 -0
- package/templates/scripts/turing_io.py +76 -0
- package/templates/scripts/update_state.py +296 -0
- package/templates/scripts/validate_stability.py +167 -0
- package/templates/scripts/verify_placeholders.py +119 -0
- package/templates/sweep_config.yaml +14 -0
- package/templates/tests/__init__.py +0 -0
- package/templates/tests/conftest.py +91 -0
- package/templates/train.py +240 -0
|
@@ -0,0 +1,549 @@
|
|
|
1
|
+
"""Pre-flight resource estimator for ML experiments.
|
|
2
|
+
|
|
3
|
+
Estimates VRAM, RAM, and disk requirements before running a training script.
|
|
4
|
+
Compares against available system resources and issues warnings or blocks
|
|
5
|
+
if the experiment is likely to fail due to resource constraints.
|
|
6
|
+
|
|
7
|
+
Works with any ML project — not Turing-specific. Analyzes:
|
|
8
|
+
- Dataset size and shape (from CSV/parquet/splits)
|
|
9
|
+
- Model type and architecture (from config or CLI)
|
|
10
|
+
- Batch size, precision, and expected memory multipliers
|
|
11
|
+
- Available system resources (RAM, VRAM, disk)
|
|
12
|
+
|
|
13
|
+
Usage:
|
|
14
|
+
python scripts/preflight.py # Auto-detect from config.yaml
|
|
15
|
+
python scripts/preflight.py --config config.yaml # Explicit config
|
|
16
|
+
python scripts/preflight.py --model-type xgboost --dataset data.csv
|
|
17
|
+
python scripts/preflight.py --model-type torch --params 10M --batch-size 32
|
|
18
|
+
python scripts/preflight.py --json # Machine-readable output
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
import json
|
|
25
|
+
import os
|
|
26
|
+
import shutil
|
|
27
|
+
import sys
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
|
|
30
|
+
try:
|
|
31
|
+
import yaml
|
|
32
|
+
HAS_YAML = True
|
|
33
|
+
except ImportError:
|
|
34
|
+
HAS_YAML = False
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
# System resource detection
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
|
|
41
|
+
def get_system_ram_gb() -> float:
    """Return total physical RAM in GB, or 0.0 if it cannot be determined."""
    # Preferred path: psutil works on every platform.
    try:
        import psutil
        return round(psutil.virtual_memory().total / (1024 ** 3), 1)
    except ImportError:
        pass

    # Linux fallback: the MemTotal line of /proc/meminfo (value is in kB).
    proc_meminfo = Path("/proc/meminfo")
    if proc_meminfo.exists():
        for entry in proc_meminfo.read_text().splitlines():
            if entry.startswith("MemTotal:"):
                total_kb = int(entry.split()[1])
                return round(total_kb / (1024 ** 2), 1)

    # macOS fallback: sysctl reports the byte count directly.
    try:
        import subprocess
        probe = subprocess.run(
            ["sysctl", "-n", "hw.memsize"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        if probe.returncode == 0:
            return round(int(probe.stdout.strip()) / (1024 ** 3), 1)
    except (FileNotFoundError, subprocess.TimeoutExpired):
        pass

    return 0.0
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_available_ram_gb() -> float:
    """Return currently available (free + cached) RAM in GB, or 0.0 if unknown."""
    # Preferred path: psutil's cross-platform availability figure.
    try:
        import psutil
        return round(psutil.virtual_memory().available / (1024 ** 3), 1)
    except ImportError:
        pass

    # Linux fallback: MemAvailable from /proc/meminfo (value is in kB).
    proc_meminfo = Path("/proc/meminfo")
    if proc_meminfo.exists():
        avail_kb = 0
        for entry in proc_meminfo.read_text().splitlines():
            if entry.startswith("MemAvailable:"):
                avail_kb = int(entry.split()[1])
                break
        if avail_kb:
            return round(avail_kb / (1024 ** 2), 1)

    return 0.0
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_gpu_info() -> list[dict]:
    """Detect GPUs and their VRAM.

    Returns a list of dicts with keys: index, name, vram_gb, vram_free_gb,
    source. Returns an empty list when no GPU (or no detection tool) is
    available.
    """
    gpus: list[dict] = []

    # Preferred: torch reports per-device properties directly.
    try:
        import torch
        if torch.cuda.is_available():
            for i in range(torch.cuda.device_count()):
                props = torch.cuda.get_device_properties(i)
                # Bug fix: the attribute is `total_memory`, not `total_mem` —
                # the original raised AttributeError on any CUDA machine.
                total_bytes = props.total_memory
                gpus.append({
                    "index": i,
                    "name": props.name,
                    "vram_gb": round(total_bytes / (1024 ** 3), 1),
                    "vram_free_gb": round(
                        (total_bytes - torch.cuda.memory_reserved(i)) / (1024 ** 3), 1,
                    ),
                    "source": "torch",
                })
            return gpus
    except ImportError:
        pass

    # Fallback: parse nvidia-smi CSV output (memory values are in MiB).
    try:
        import subprocess
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,name,memory.total,memory.free",
             "--format=csv,nounits,noheader"],
            capture_output=True, text=True, timeout=10,
        )
        if result.returncode == 0:
            for line in result.stdout.strip().splitlines():
                parts = [p.strip() for p in line.split(",")]
                if len(parts) >= 4:
                    gpus.append({
                        "index": int(parts[0]),
                        "name": parts[1],
                        "vram_gb": round(int(parts[2]) / 1024, 1),
                        "vram_free_gb": round(int(parts[3]) / 1024, 1),
                        "source": "nvidia-smi",
                    })
    except (FileNotFoundError, subprocess.TimeoutExpired):
        pass

    return gpus
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def get_disk_free_gb(path: str = ".") -> float:
    """Return free disk space (GB) on the filesystem containing *path*."""
    free_bytes = shutil.disk_usage(path).free
    return round(free_bytes / (1024 ** 3), 1)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# ---------------------------------------------------------------------------
|
|
142
|
+
# Dataset analysis
|
|
143
|
+
# ---------------------------------------------------------------------------
|
|
144
|
+
|
|
145
|
+
def estimate_dataset_memory(path: str | None = None, splits_dir: str | None = None) -> dict:
    """Estimate memory needed to load a dataset.

    Prefers *splits_dir* (Turing convention: a directory of split files);
    falls back to the single file at *path*. Robustness fix: a *splits_dir*
    that does not exist no longer short-circuits — the single-file path is
    still consulted.

    Returns dict with: file_size_mb, estimated_ram_gb, rows, columns, source.
    """
    result = {
        "file_size_mb": 0.0,
        "estimated_ram_gb": 0.0,
        "rows": 0,
        "columns": 0,
        "source": None,
    }

    # Try splits directory first (Turing convention).
    if splits_dir:
        splits_path = Path(splits_dir)
        if splits_path.exists():
            total_size = sum(f.stat().st_size for f in splits_path.glob("*") if f.is_file())
            result["file_size_mb"] = round(total_size / (1024 ** 2), 1)
            # CSV loaded into memory is typically 3-5x the on-disk size; use 4x.
            result["estimated_ram_gb"] = round(total_size * 4 / (1024 ** 3), 2)
            result["source"] = str(splits_path)

            # Accumulate rows across all CSV splits; columns come from the
            # header (splits are assumed to share one schema).
            for f in sorted(splits_path.glob("*.csv")):
                try:
                    with open(f) as fh:
                        header = fh.readline()
                        result["columns"] = len(header.split(","))
                        result["rows"] += sum(1 for _ in fh)
                except OSError:
                    pass  # Unreadable split: keep the size-based estimate.
            return result

    # Single-file dataset.
    if path:
        p = Path(path)
        if p.exists():
            size_bytes = p.stat().st_size  # stat once instead of twice
            result["file_size_mb"] = round(size_bytes / (1024 ** 2), 1)
            result["estimated_ram_gb"] = round(size_bytes * 4 / (1024 ** 3), 2)
            result["source"] = str(p)

            if p.suffix == ".csv":
                try:
                    with open(p) as fh:
                        header = fh.readline()
                        result["columns"] = len(header.split(","))
                        result["rows"] = sum(1 for _ in fh)
                except OSError:
                    pass  # Unreadable file: keep the size-based estimate.

    return result
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# ---------------------------------------------------------------------------
|
|
201
|
+
# Model resource estimation
|
|
202
|
+
# ---------------------------------------------------------------------------
|
|
203
|
+
|
|
204
|
+
def parse_param_count(s: str) -> int:
    """Parse a human-readable parameter count ('10M', '1.5B', '350K') to an int.

    Suffixes are case-insensitive (K/M/B/T); plain numeric strings are
    accepted as-is.
    """
    text = s.strip().upper()
    for suffix, factor in (
        ("K", 1_000),
        ("M", 1_000_000),
        ("B", 1_000_000_000),
        ("T", 1_000_000_000_000),
    ):
        if text.endswith(suffix):
            return int(float(text[:-1]) * factor)
    return int(float(text))
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def estimate_model_resources(
    model_type: str,
    n_estimators: int = 100,
    max_depth: int = 6,
    n_features: int = 50,
    n_samples: int = 10000,
    param_count: int | None = None,
    batch_size: int = 32,
    precision: str = "fp32",
    sequence_length: int = 512,
) -> dict:
    """Estimate training resource requirements for a model.

    ``param_count`` (total trainable parameters) overrides the per-family
    heuristics for neural models; the other knobs feed rule-of-thumb formulas.

    Returns dict with: model_type, ram_gb, vram_gb, disk_gb, requires_gpu, notes.
    """
    # Robustness fix: hyperparameters read from YAML configs may be None
    # (e.g. ``max_depth: null`` for unlimited-depth forests), and ``2 ** None``
    # raises TypeError. Fall back to the documented defaults.
    if n_estimators is None:
        n_estimators = 100
    if max_depth is None:
        max_depth = 6

    bytes_per_param = {"fp32": 4, "fp16": 2, "bf16": 2, "int8": 1, "int4": 0.5}
    bpp = bytes_per_param.get(precision, 4)  # Unknown precision: assume fp32.

    result = {
        "model_type": model_type,
        "ram_gb": 0.0,
        "vram_gb": 0.0,
        "disk_gb": 0.0,
        "requires_gpu": False,
        "notes": [],
    }

    mt = model_type.lower()

    if mt in ("xgboost", "lightgbm", "catboost"):
        # Tree-based: RAM-bound, no GPU required (unless GPU training explicitly used).
        # Rule of thumb: depth d → 2^d nodes per tree, ~8 bytes per node per feature.
        nodes_per_tree = min(2 ** max_depth, 2 ** 10)  # Cap at 1024
        tree_memory_gb = (n_estimators * nodes_per_tree * 8 * n_features) / (1024 ** 3)
        data_memory_gb = (n_samples * n_features * 8) / (1024 ** 3)  # float64
        result["ram_gb"] = round(tree_memory_gb + data_memory_gb * 2, 2)  # 2x for train + working
        result["disk_gb"] = round(tree_memory_gb * 2, 2)  # Model artifact
        result["notes"].append(f"{n_estimators} trees, depth {max_depth}")
        if n_samples > 1_000_000:
            result["notes"].append("large dataset — consider subsample parameter")

    elif mt in ("randomforest", "random_forest", "sklearn_rf", "extra_trees"):
        nodes_per_tree = min(2 ** max_depth, 2 ** 16)  # Forests go much deeper
        tree_memory_gb = (n_estimators * nodes_per_tree * 8 * n_features) / (1024 ** 3)
        data_memory_gb = (n_samples * n_features * 8) / (1024 ** 3)
        result["ram_gb"] = round(tree_memory_gb + data_memory_gb * 2, 2)
        result["disk_gb"] = round(tree_memory_gb * 3, 2)  # sklearn RF models are large
        result["notes"].append(f"{n_estimators} trees, unlimited depth" if max_depth > 20 else f"{n_estimators} trees, depth {max_depth}")

    elif mt in ("mlp", "neural_network", "nn"):
        if param_count is None:
            # Estimate from features: simple 2-layer MLP
            param_count = n_features * 256 + 256 * 128 + 128 * 1  # ~50K for typical tabular
        model_gb = (param_count * bpp) / (1024 ** 3)
        # Training: model + gradients + optimizer state (Adam = 2x) + activations
        train_multiplier = 4  # model + grad + 2x optimizer state
        activation_gb = (batch_size * param_count * bpp * 0.5) / (1024 ** 3)
        result["vram_gb"] = round(model_gb * train_multiplier + activation_gb, 2)
        result["ram_gb"] = round(model_gb * 2 + (n_samples * n_features * 8) / (1024 ** 3), 2)
        result["disk_gb"] = round(model_gb * 2, 2)
        result["requires_gpu"] = result["vram_gb"] > 1.0
        result["notes"].append(f"{param_count:,} parameters ({precision})")
        if result["vram_gb"] > 0.5:
            result["notes"].append("GPU recommended for reasonable training speed")

    elif mt in ("transformer", "llm", "bert", "gpt"):
        if param_count is None:
            param_count = 100_000_000  # Default 100M
        model_gb = (param_count * bpp) / (1024 ** 3)
        # Transformers: activations scale with batch_size * seq_len * hidden_dim
        hidden_dim = int((param_count / 12) ** 0.5)  # Rough estimate
        activation_gb = (batch_size * sequence_length * hidden_dim * bpp * 12) / (1024 ** 3)
        optimizer_gb = model_gb * 2  # Adam
        result["vram_gb"] = round(model_gb + optimizer_gb + activation_gb, 2)
        result["ram_gb"] = round(model_gb + 2, 2)  # Model + data loading overhead
        result["disk_gb"] = round(model_gb * 3, 2)  # Checkpoints
        result["requires_gpu"] = True
        result["notes"].append(f"{param_count:,} parameters ({precision})")
        result["notes"].append(f"batch_size={batch_size}, seq_len={sequence_length}")
        if result["vram_gb"] > 24:
            result["notes"].append("likely needs multi-GPU or gradient checkpointing")
        elif result["vram_gb"] > 12:
            result["notes"].append("needs >=16GB VRAM GPU (A100/A6000/RTX 4090)")
        elif result["vram_gb"] > 6:
            result["notes"].append("needs >=8GB VRAM GPU")

    elif mt in ("linear", "logistic", "ridge", "lasso", "elastic_net"):
        # Linear models: very lightweight
        result["ram_gb"] = round((n_samples * n_features * 8 * 2) / (1024 ** 3), 2)
        result["disk_gb"] = 0.01
        result["notes"].append("lightweight — no resource concerns")

    else:
        result["notes"].append(f"unknown model type '{model_type}' — using conservative estimates")
        result["ram_gb"] = round((n_samples * n_features * 8 * 3) / (1024 ** 3), 2)
        result["vram_gb"] = 0.0
        result["disk_gb"] = 0.5

    return result
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
# ---------------------------------------------------------------------------
|
|
316
|
+
# Preflight check
|
|
317
|
+
# ---------------------------------------------------------------------------
|
|
318
|
+
|
|
319
|
+
def run_preflight(
    model_type: str | None = None,
    config_path: str = "config.yaml",
    dataset_path: str | None = None,
    param_count: str | None = None,
    batch_size: int = 32,
    precision: str = "fp32",
    sequence_length: int = 512,
) -> dict:
    """Run a complete preflight check.

    Combines system detection, dataset sizing, and model heuristics into a
    single verdict (PASS / WARN / FAIL) with human-readable warnings.

    Returns dict with: verdict, warnings, requirements, system, dataset, model.
    """
    # Load config if available (missing yaml or missing file → empty config).
    config = {}
    if HAS_YAML and Path(config_path).exists():
        with open(config_path) as f:
            config = yaml.safe_load(f) or {}

    # Auto-detect model settings from config when not given on the CLI.
    if not model_type:
        model_type = config.get("model", {}).get("type", "xgboost")
    hyperparams = config.get("model", {}).get("hyperparams", {})
    n_estimators = hyperparams.get("n_estimators", 100)
    max_depth = hyperparams.get("max_depth", 6)

    # System resources.
    system = {
        "ram_total_gb": get_system_ram_gb(),
        "ram_available_gb": get_available_ram_gb(),
        "disk_free_gb": get_disk_free_gb(),
        "gpus": get_gpu_info(),
    }

    # Dataset analysis.
    splits_dir = config.get("data", {}).get("splits_dir")
    data_source = dataset_path or config.get("data", {}).get("source")
    dataset = estimate_dataset_memory(path=data_source, splits_dir=splits_dir)

    # Model estimation. NOTE(review): when no dataset is found, columns is 0,
    # so n_features falls to 1 rather than the 50 default — presumably
    # intentional (minimal estimate); confirm against callers.
    params = parse_param_count(param_count) if param_count else None
    model = estimate_model_resources(
        model_type=model_type,
        n_estimators=n_estimators,
        max_depth=max_depth,
        n_features=max(dataset.get("columns", 50), 1),
        n_samples=max(dataset.get("rows", 10000), 1),
        param_count=params,
        batch_size=batch_size,
        precision=precision,
        sequence_length=sequence_length,
    )

    # Total requirements.
    total_ram = model["ram_gb"] + dataset["estimated_ram_gb"]
    total_vram = model["vram_gb"]
    total_disk = model["disk_gb"] + dataset["file_size_mb"] / 1024

    # Verdict: start at PASS and escalate; FAIL is sticky.
    warnings = []
    verdict = "PASS"

    if system["ram_available_gb"] > 0 and total_ram > system["ram_available_gb"] * 0.9:
        warnings.append(
            f"RAM: need ~{total_ram:.1f}GB but only {system['ram_available_gb']:.1f}GB available"
        )
        verdict = "WARN"

    # Bug fix: guard ram_total_gb > 0 — when RAM detection fails it reports
    # 0.0, and without the guard any nonzero requirement forced a spurious FAIL.
    if system["ram_total_gb"] > 0 and total_ram > system["ram_total_gb"] * 0.8:
        warnings.append(
            f"RAM: need ~{total_ram:.1f}GB, system has {system['ram_total_gb']:.1f}GB total — may cause swapping"
        )
        verdict = "FAIL"

    if model["requires_gpu"]:
        if not system["gpus"]:
            warnings.append(f"VRAM: model needs ~{total_vram:.1f}GB VRAM but no GPU detected")
            verdict = "FAIL"
        else:
            max_vram = max(g["vram_gb"] for g in system["gpus"])
            if total_vram > max_vram * 0.95:
                warnings.append(
                    f"VRAM: need ~{total_vram:.1f}GB but largest GPU has {max_vram:.1f}GB"
                )
                verdict = "FAIL"
            elif total_vram > max_vram * 0.8:
                warnings.append(
                    f"VRAM: need ~{total_vram:.1f}GB, GPU has {max_vram:.1f}GB — tight fit"
                )
                if verdict != "FAIL":
                    verdict = "WARN"

    if total_disk > system["disk_free_gb"] * 0.5:
        warnings.append(
            f"Disk: model + data need ~{total_disk:.1f}GB, only {system['disk_free_gb']:.1f}GB free"
        )
        if verdict != "FAIL":
            verdict = "WARN"

    return {
        "verdict": verdict,
        "warnings": warnings,
        "requirements": {
            "ram_gb": round(total_ram, 2),
            "vram_gb": round(total_vram, 2),
            "disk_gb": round(total_disk, 2),
            "requires_gpu": model["requires_gpu"],
        },
        "system": system,
        "dataset": dataset,
        "model": model,
    }
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def format_preflight(result: dict) -> str:
    """Format preflight results for display.

    Renders the dict produced by ``run_preflight`` as a sectioned, plain-text
    report (Requirements / System / Dataset / Model / Warnings) ending with a
    verdict summary line.
    """
    v = result["verdict"]
    # One-character status icon; "?" guards against unexpected verdict values.
    icon = {"PASS": "✓", "WARN": "!", "FAIL": "✗"}.get(v, "?")

    lines = [
        f"Preflight Check: {icon} {v}",
        "=" * 50,
    ]

    # Requirements
    req = result["requirements"]
    lines.extend([
        "",
        "Requirements",
        "-" * 50,
        f" RAM: ~{req['ram_gb']:.1f} GB",
        f" VRAM: ~{req['vram_gb']:.1f} GB" + (" (GPU required)" if req["requires_gpu"] else " (no GPU needed)"),
        f" Disk: ~{req['disk_gb']:.1f} GB",
    ])

    # System
    sys_info = result["system"]
    lines.extend([
        "",
        "System",
        "-" * 50,
        f" RAM: {sys_info['ram_total_gb']:.1f} GB total, {sys_info['ram_available_gb']:.1f} GB available",
        f" Disk: {sys_info['disk_free_gb']:.1f} GB free",
    ])
    if sys_info["gpus"]:
        for gpu in sys_info["gpus"]:
            lines.append(f" GPU {gpu['index']}: {gpu['name']} ({gpu['vram_gb']:.1f} GB VRAM)")
    else:
        lines.append(" GPU: none detected")

    # Dataset — section is omitted entirely when no dataset source was found.
    ds = result["dataset"]
    if ds["source"]:
        lines.extend([
            "",
            "Dataset",
            "-" * 50,
            f" Source: {ds['source']}",
            f" Size: {ds['file_size_mb']:.1f} MB on disk",
            f" In RAM: ~{ds['estimated_ram_gb']:.2f} GB estimated",
        ])
        if ds["rows"]:
            lines.append(f" Shape: {ds['rows']:,} rows x {ds['columns']} columns")

    # Model
    model = result["model"]
    lines.extend([
        "",
        "Model",
        "-" * 50,
        f" Type: {model['model_type']}",
        f" RAM: ~{model['ram_gb']:.2f} GB",
        f" VRAM: ~{model['vram_gb']:.2f} GB",
    ])
    for note in model["notes"]:
        lines.append(f" Note: {note}")

    # Warnings — section only appears when the check produced any.
    if result["warnings"]:
        lines.extend(["", "Warnings", "-" * 50])
        for w in result["warnings"]:
            lines.append(f" {w}")

    # Verdict summary line keyed off the same verdict string as the header.
    lines.extend(["", "=" * 50])
    if v == "PASS":
        lines.append(" System has sufficient resources. Proceed with training.")
    elif v == "WARN":
        lines.append(" Training may succeed but resources are tight. Monitor memory usage.")
    else:
        lines.append(" Training will likely fail. Address warnings before proceeding.")

    return "\n".join(lines)
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def main() -> None:
    """CLI entry point: parse arguments, run the check, print the result.

    Exits with status 1 when the preflight verdict is FAIL so callers and CI
    pipelines can gate on it.
    """
    arg_parser = argparse.ArgumentParser(
        description="Pre-flight resource check for ML training",
    )
    arg_parser.add_argument("--config", default="config.yaml", help="Config file path")
    arg_parser.add_argument("--model-type", default=None, help="Model type (xgboost, lightgbm, torch, transformer, ...)")
    arg_parser.add_argument("--dataset", default=None, help="Path to dataset file")
    arg_parser.add_argument("--params", default=None, help="Parameter count (e.g., 10M, 1.5B)")
    arg_parser.add_argument("--batch-size", type=int, default=32, help="Training batch size")
    arg_parser.add_argument("--precision", default="fp32", choices=["fp32", "fp16", "bf16", "int8", "int4"])
    arg_parser.add_argument("--seq-len", type=int, default=512, help="Sequence length (transformers)")
    arg_parser.add_argument("--json", action="store_true", help="Machine-readable JSON output")

    opts = arg_parser.parse_args()

    report = run_preflight(
        model_type=opts.model_type,
        config_path=opts.config,
        dataset_path=opts.dataset,
        param_count=opts.params,
        batch_size=opts.batch_size,
        precision=opts.precision,
        sequence_length=opts.seq_len,
    )

    # Machine-readable JSON when requested; otherwise the formatted report.
    if opts.json:
        print(json.dumps(report, indent=2))
    else:
        print(format_preflight(report))

    if report["verdict"] == "FAIL":
        sys.exit(1)
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
# Script entry point guard: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
|