loopkit 0.0.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loopkit/__init__.py +85 -0
- loopkit/cli/__init__.py +3 -0
- loopkit/cli/compare.py +274 -0
- loopkit/cli/main.py +62 -0
- loopkit/cli/sweep.py +313 -0
- loopkit/cli/visualize.py +285 -0
- loopkit/config.py +1236 -0
- loopkit/git.py +154 -0
- loopkit/logger.py +729 -0
- loopkit/monitor.py +259 -0
- loopkit/torch/__init__.py +43 -0
- loopkit/torch/checkpoint.py +339 -0
- loopkit/torch/mp.py +346 -0
- loopkit/tracking.py +203 -0
- loopkit/utils.py +75 -0
- loopkit-0.0.1a1.dist-info/METADATA +44 -0
- loopkit-0.0.1a1.dist-info/RECORD +21 -0
- loopkit-0.0.1a1.dist-info/WHEEL +5 -0
- loopkit-0.0.1a1.dist-info/entry_points.txt +2 -0
- loopkit-0.0.1a1.dist-info/licenses/LICENSE +7 -0
- loopkit-0.0.1a1.dist-info/top_level.txt +1 -0
loopkit/cli/sweep.py
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import copy
|
|
3
|
+
import itertools
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, Dict, List, Optional
|
|
11
|
+
|
|
12
|
+
import yaml
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def generate_sweep_configs(
    base_config: Dict[str, Any],
    sweep_params: Dict[str, List[Any]],
) -> List[Dict[str, Any]]:
    """Build one configuration per point of the cartesian sweep grid.

    Args:
        base_config: Base configuration dictionary.
        sweep_params: Maps dotted parameter paths to candidate values.
            Example: {"training.lr": [0.001, 0.01],
                      "model.hidden_dim": [128, 256]}

    Returns:
        A list with one deep-copied config dict per value combination.
    """
    names = list(sweep_params)
    grid = itertools.product(*(sweep_params[n] for n in names))

    configs: List[Dict[str, Any]] = []
    for combo in grid:
        # Each grid point gets its own deep copy so writes never leak
        # into base_config or into a sibling configuration.
        cfg = copy.deepcopy(base_config)

        for dotted, value in zip(names, combo):
            # Walk/create the nested dicts named by the dotted path,
            # then assign the leaf value.
            *parents, leaf = dotted.split(".")
            node = cfg
            for part in parents:
                node = node.setdefault(part, {})
            node[leaf] = value

        configs.append(cfg)

    return configs
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def run_sweep(
    command_template: str,
    sweep_params: Dict[str, List[Any]],
    base_config_file: Optional[str] = None,
    runs_dir: str = "runs",
    sweep_name: Optional[str] = None,
    dry_run: bool = False,
    parallel: bool = False,
    max_parallel: int = 4,
):
    """Run a hyperparameter sweep.

    Args:
        command_template: Command to run, e.g.:
            "python train.py {config}"
            "torchrun --nproc_per_node=2 train.py {config}"
            "bash train.sh 2 {config}"
        sweep_params: Parameters to sweep over
        base_config_file: Base config file (optional)
        runs_dir: Directory to save runs
        sweep_name: Name for this sweep
        dry_run: If True, only print commands without running
        parallel: Run experiments in parallel. NOT yet implemented; a
            warning is printed and runs execute sequentially.
        max_parallel: Maximum number of parallel runs (unused until
            parallel execution is implemented)

    Returns:
        List of per-experiment result dictionaries.

    Example:
        >>> run_sweep(
        ...     command_template="python train.py {config}",
        ...     sweep_params={
        ...         "training.lr": [0.001, 0.01, 0.1],
        ...         "model.hidden_dim": [128, 256],
        ...     },
        ...     base_config_file="conf/default.yaml",
        ...     sweep_name="lr_hidden_sweep"
        ... )
    """
    # `parallel`/`max_parallel` were previously accepted but silently
    # ignored; warn so callers are not surprised by sequential execution.
    if parallel:
        print(
            f"⚠️  parallel={parallel} (max_parallel={max_parallel}) is not "
            "implemented yet; running experiments sequentially."
        )

    # Create sweep directory
    if sweep_name is None:
        sweep_name = datetime.now().strftime("sweep_%Y%m%d_%H%M%S")

    sweep_dir = Path(runs_dir) / sweep_name
    sweep_dir.mkdir(parents=True, exist_ok=True)

    # Load base config if provided
    base_config = {}
    if base_config_file:
        with open(base_config_file) as f:
            base_config = yaml.safe_load(f)

    # Generate all configurations
    configs = generate_sweep_configs(base_config, sweep_params)

    print(f"\n{'=' * 80}")
    print(f"Hyperparameter Sweep: {sweep_name}")
    print(f"{'=' * 80}")
    print(f"Total experiments: {len(configs)}")
    print(f"Sweep directory: {sweep_dir}")
    print("\nSweep parameters:")
    for param, values in sweep_params.items():
        print(f"  {param}: {values}")
    print(f"{'=' * 80}\n")

    # Save sweep configuration so the sweep is reproducible later.
    sweep_info = {
        "sweep_name": sweep_name,
        "command_template": command_template,
        "sweep_params": sweep_params,
        "base_config_file": base_config_file,
        "num_experiments": len(configs),
        "timestamp": datetime.now().isoformat(),
    }

    with open(sweep_dir / "sweep_info.json", "w") as f:
        json.dump(sweep_info, f, indent=2)

    # Run each configuration sequentially.
    results = []

    for idx, config in enumerate(configs):
        exp_name = f"exp_{idx:03d}"
        exp_dir = sweep_dir / exp_name
        exp_dir.mkdir(exist_ok=True)

        # Save config
        config_file = exp_dir / "config.yaml"
        with open(config_file, "w") as f:
            yaml.dump(config, f)

        # Build command
        command = command_template.format(config=str(config_file))

        # Add run_dir override if not in command
        if "run_dir" not in command:
            command += f" run_dir={exp_dir}"

        print(f"\n[{idx + 1}/{len(configs)}] Running: {exp_name}")
        print(f"Command: {command}")

        # Print sweep parameters for this run
        print("Parameters:")
        for param in sweep_params:
            keys = param.split(".")
            current = config
            for key in keys:
                current = current.get(key, {})
            print(f"  {param}: {current}")

        if dry_run:
            results.append({"exp_name": exp_name, "status": "dry_run"})
            continue

        # Run command
        try:
            result = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                cwd=Path.cwd(),
            )

            # Persist captured output so runs can be diagnosed later;
            # previously stdout was captured and then discarded.
            (exp_dir / "stdout.log").write_text(result.stdout or "")
            (exp_dir / "stderr.log").write_text(result.stderr or "")

            success = result.returncode == 0
            results.append(
                {
                    "exp_name": exp_name,
                    "config": config,
                    "command": command,
                    "returncode": result.returncode,
                    "status": "success" if success else "failed",
                }
            )

            if success:
                print("✅ Completed successfully")
            else:
                print(f"❌ Failed with return code {result.returncode}")
                if result.stderr:
                    print(f"Error: {result.stderr[:500]}")

        except Exception as e:
            print(f"❌ Exception: {e}")
            results.append(
                {
                    "exp_name": exp_name,
                    "config": config,
                    "command": command,
                    "status": "exception",
                    "error": str(e),
                }
            )

    # Save results
    results_file = sweep_dir / "sweep_results.json"
    with open(results_file, "w") as f:
        json.dump(results, f, indent=2)

    # Print summary
    print(f"\n{'=' * 80}")
    print("Sweep Summary")
    print(f"{'=' * 80}")

    successful = sum(1 for r in results if r["status"] == "success")
    failed = sum(1 for r in results if r["status"] in ["failed", "exception"])

    print(f"Total experiments: {len(results)}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    print(f"\nResults saved to: {sweep_dir}")
    print(f"{'=' * 80}\n")

    return results
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def setup_sweep_parser(subparsers):
    """Register the ``sweep`` subcommand on *subparsers*."""
    parser = subparsers.add_parser(
        "sweep",
        help="Run hyperparameter sweeps",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic sweep
  loopkit sweep "python train.py {config}" \\
    --config config.yaml \\
    --sweep "training.lr=[0.001,0.01]" "model.hidden_dim=[128,256]"

  # With torchrun
  loopkit sweep "torchrun --nproc_per_node=2 train.py {config}" \\
    --config config.yaml \\
    --sweep "training.lr=[0.001,0.01]"

  # Dry run to see commands
  loopkit sweep "python train.py {config}" \\
    --config config.yaml \\
    --sweep "training.lr=[0.001,0.01]" \\
    --dry-run
""",
    )
    # Positional command template; {config} is substituted per experiment.
    parser.add_argument("command", help="Command template with {config} placeholder")
    parser.add_argument("--config", "-c", required=True, help="Base configuration file")
    # One or more "param=[v1,v2]" specs, parsed by the sweep command handler.
    parser.add_argument(
        "--sweep",
        "-s",
        nargs="+",
        required=True,
        help="Sweep parameters in format: param=[val1,val2,...]",
    )
    parser.add_argument("--name", "-n", help="Sweep name")
    parser.add_argument("--runs-dir", "-d", default="runs", help="Directory for runs")
    parser.add_argument(
        "--dry-run", action="store_true", help="Print commands without running"
    )
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def run_sweep_command(args):
|
|
277
|
+
"""Run the sweep command."""
|
|
278
|
+
# Parse sweep parameters
|
|
279
|
+
sweep_params = {}
|
|
280
|
+
|
|
281
|
+
for spec in args.sweep:
|
|
282
|
+
match = re.match(r"(\S+)=\[([^\]]+)\]", spec)
|
|
283
|
+
if not match:
|
|
284
|
+
print(f"❌ Invalid sweep spec: {spec}")
|
|
285
|
+
print(" Expected format: param=[val1,val2,...]")
|
|
286
|
+
continue
|
|
287
|
+
|
|
288
|
+
key, values_str = match.groups()
|
|
289
|
+
values = []
|
|
290
|
+
for v in values_str.split(","):
|
|
291
|
+
v = v.strip()
|
|
292
|
+
try:
|
|
293
|
+
values.append(int(v))
|
|
294
|
+
except ValueError:
|
|
295
|
+
try:
|
|
296
|
+
values.append(float(v))
|
|
297
|
+
except ValueError:
|
|
298
|
+
values.append(v)
|
|
299
|
+
|
|
300
|
+
sweep_params[key] = values
|
|
301
|
+
|
|
302
|
+
if not sweep_params:
|
|
303
|
+
print("❌ No valid sweep parameters provided")
|
|
304
|
+
sys.exit(1)
|
|
305
|
+
|
|
306
|
+
run_sweep(
|
|
307
|
+
command_template=args.command,
|
|
308
|
+
sweep_params=sweep_params,
|
|
309
|
+
base_config_file=args.config,
|
|
310
|
+
sweep_name=args.name,
|
|
311
|
+
runs_dir=args.runs_dir,
|
|
312
|
+
dry_run=args.dry_run,
|
|
313
|
+
)
|
loopkit/cli/visualize.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import List, Optional, Union
|
|
5
|
+
|
|
6
|
+
import matplotlib.pyplot as plt
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def plot_metrics(
    run_dirs: Union[str, Path, List[Union[str, Path]]],
    metrics: Optional[List[str]] = None,
    output_file: Optional[str] = None,
    show: bool = True,
    style: str = "default",
):
    """Plot metrics from one or more runs.

    Args:
        run_dirs: Single run directory or list of run directories
        metrics: List of metrics to plot (None = all)
        output_file: Save plot to file (e.g., 'plot.png')
        show: Whether to display the plot
        style: Plot style ('default', 'seaborn', 'ggplot')
    """
    # Accept a single directory as a convenience.
    if isinstance(run_dirs, (str, Path)):
        run_dirs = [run_dirs]

    if style != "default":
        plt.style.use(style)

    # Load each run's metrics.csv, tagging rows with the run name.
    frames = []
    for raw_dir in run_dirs:
        path = Path(raw_dir)
        metrics_file = path / "metrics.csv"

        if not metrics_file.exists():
            print(f"⚠️ No metrics found in {path}")
            continue

        frames.append(
            pl.read_csv(metrics_file).with_columns(run_id=pl.lit(path.name))
        )

    if not frames:
        print("❌ No metrics data found")
        return

    df = pl.concat(frames)

    # Restrict to the requested metric names, if any.
    if metrics:
        df = df.filter(pl.col("name").is_in(metrics))

    unique_metrics = df["name"].unique()
    unique_splits = df["split"].unique()

    # Lay out up to two columns of subplots.
    n_metrics = len(unique_metrics)
    n_cols = min(2, n_metrics)
    n_rows = (n_metrics + n_cols - 1) // n_cols

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 4 * n_rows))
    # Normalize `axes` to a flat list regardless of grid shape. When
    # n_metrics >= 2 there are always two columns, so the single-row
    # case yields a 1-D array that list() flattens directly.
    if n_metrics == 1:
        panels = [axes]
    elif n_rows > 1 and n_cols > 1:
        panels = axes.flatten()
    else:
        panels = list(axes)

    # One subplot per metric; one line per (run, split) pair.
    for idx, metric_name in enumerate(sorted(unique_metrics)):
        ax = panels[idx]

        for run_id in df["run_id"].unique():
            for split in unique_splits:
                series = df.filter(
                    (pl.col("name") == metric_name)
                    & (pl.col("run_id") == run_id)
                    & (pl.col("split") == split)
                )
                if series.height == 0:
                    continue

                # Only prefix the run name when comparing several runs.
                label = f"{run_id}_{split}" if len(run_dirs) > 1 else split
                ax.plot(
                    series["step"],
                    series["value"],
                    label=label,
                    marker="o",
                    markersize=3,
                )

        ax.set_xlabel("Step")
        ax.set_ylabel("Value")
        ax.set_title(metric_name)
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Blank out any leftover grid cells.
    for idx in range(n_metrics, len(panels)):
        panels[idx].axis("off")

    plt.tight_layout()

    if output_file:
        plt.savefig(output_file, dpi=150, bbox_inches="tight")
        print(f"📊 Plot saved to {output_file}")

    if show:
        plt.show()
    else:
        plt.close()
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def plot_resource_usage(
    run_dir: Union[str, Path],
    output_file: Optional[str] = None,
    show: bool = True,
):
    """Plot resource usage over time.

    Reads ``resource_usage.jsonl`` from *run_dir*; each line is a JSON
    record with a numeric ``timestamp`` and optional ``gpu`` / ``cpu`` /
    ``memory`` sections. Which sections exist is inferred from the
    first record.

    Args:
        run_dir: Run directory
        output_file: Save plot to file
        show: Whether to display the plot
    """
    run_dir = Path(run_dir)
    resource_file = run_dir / "resource_usage.jsonl"

    if not resource_file.exists():
        print(f"❌ No resource monitoring data found in {run_dir}")
        return

    # Load data (one JSON record per line)
    data = []
    with open(resource_file) as f:
        for line in f:
            data.append(json.loads(line))

    if not data:
        print("❌ No resource data found")
        return

    # Extract time series; timestamps are assumed to be epoch seconds —
    # TODO confirm against the monitor that writes this file.
    timestamps = [d["timestamp"] for d in data]
    start_time = timestamps[0]
    elapsed_times = [(t - start_time) / 60 for t in timestamps]  # Convert to minutes

    # Detect available sections from the first record.
    has_gpu = "gpu" in data[0]
    has_cpu = "cpu" in data[0]
    has_memory = "memory" in data[0]

    n_plots = sum([has_gpu, has_cpu, has_memory])

    if n_plots == 0:
        print("❌ No resource data to plot")
        return

    fig, axes = plt.subplots(n_plots, 1, figsize=(10, 4 * n_plots))
    if n_plots == 1:
        axes = [axes]

    plot_idx = 0

    # Plot GPU memory
    if has_gpu:
        ax = axes[plot_idx]
        plot_idx += 1

        for i in range(len(data[0]["gpu"])):
            gpu_mem = [d["gpu"][i]["allocated_mb"] for d in data if "gpu" in d]
            # Truncate the x axis like the CPU/memory plots below do:
            # records missing the "gpu" section would otherwise cause a
            # length mismatch between x and y and crash ax.plot.
            ax.plot(
                elapsed_times[: len(gpu_mem)],
                gpu_mem,
                label=f"GPU {i}",
                marker="o",
                markersize=2,
            )

        ax.set_xlabel("Time (minutes)")
        ax.set_ylabel("Memory (MB)")
        ax.set_title("GPU Memory Usage")
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Plot CPU usage
    if has_cpu:
        ax = axes[plot_idx]
        plot_idx += 1

        cpu_percent = [d["cpu"]["percent"] for d in data if "cpu" in d]
        ax.plot(
            elapsed_times[: len(cpu_percent)], cpu_percent, marker="o", markersize=2
        )

        ax.set_xlabel("Time (minutes)")
        ax.set_ylabel("CPU %")
        ax.set_title("CPU Usage")
        ax.grid(True, alpha=0.3)

    # Plot memory usage
    if has_memory:
        ax = axes[plot_idx]
        plot_idx += 1

        mem_percent = [d["memory"]["percent"] for d in data if "memory" in d]
        ax.plot(
            elapsed_times[: len(mem_percent)], mem_percent, marker="o", markersize=2
        )

        ax.set_xlabel("Time (minutes)")
        ax.set_ylabel("Memory %")
        ax.set_title("System Memory Usage")
        ax.grid(True, alpha=0.3)

    plt.tight_layout()

    if output_file:
        plt.savefig(output_file, dpi=150, bbox_inches="tight")
        print(f"📊 Resource plot saved to {output_file}")

    if show:
        plt.show()
    else:
        plt.close()
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def setup_visualize_parser(subparsers):
    """Register the ``viz`` subcommand on *subparsers*."""
    parser = subparsers.add_parser(
        "viz",
        help="Visualize experiment metrics",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Plot all metrics from multiple runs
  loopkit viz runs/exp1 runs/exp2 runs/exp3

  # Plot specific metrics
  loopkit viz runs/exp_* --metrics loss accuracy

  # Save plot to file
  loopkit viz runs/exp1 --output plot.png

  # Plot resource usage
  loopkit viz runs/exp1 --resources
""",
    )
    # One or more run directories to load metrics from.
    parser.add_argument("run_dirs", nargs="+", help="Run directories to plot")
    parser.add_argument("--metrics", "-m", nargs="+", help="Specific metrics to plot")
    parser.add_argument("--output", "-o", help="Save plot to file (e.g., plot.png)")
    parser.add_argument("--no-show", action="store_true", help="Don't display plot")
    parser.add_argument(
        "--style",
        default="default",
        choices=["default", "seaborn", "ggplot"],
        help="Plot style",
    )
    parser.add_argument(
        "--resources",
        "-r",
        action="store_true",
        help="Plot resource usage instead of metrics",
    )
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def run_visualize_command(args):
    """Dispatch the ``viz`` subcommand to the appropriate plotting helper."""
    show = not args.no_show

    if not args.resources:
        # Default path: plot training metrics across all given runs.
        plot_metrics(
            args.run_dirs,
            metrics=args.metrics,
            output_file=args.output,
            show=show,
            style=args.style,
        )
        return

    # Resource plotting handles a single run; extras are ignored.
    if len(args.run_dirs) > 1:
        print("⚠️ Resource plotting only supports single run directory")
    print(f"Plotting resources for: {args.run_dirs[0]}")
    plot_resource_usage(args.run_dirs[0], output_file=args.output, show=show)