ml-dash 0.5.8__py3-none-any.whl → 0.6.0__py3-none-any.whl
- ml_dash/__init__.py +35 -9
- ml_dash/auth/__init__.py +51 -0
- ml_dash/auth/constants.py +10 -0
- ml_dash/auth/device_flow.py +237 -0
- ml_dash/auth/device_secret.py +49 -0
- ml_dash/auth/exceptions.py +31 -0
- ml_dash/auth/token_storage.py +262 -0
- ml_dash/auto_start.py +37 -14
- ml_dash/cli.py +14 -2
- ml_dash/cli_commands/download.py +10 -38
- ml_dash/cli_commands/list.py +10 -34
- ml_dash/cli_commands/login.py +225 -0
- ml_dash/cli_commands/logout.py +54 -0
- ml_dash/cli_commands/upload.py +3 -53
- ml_dash/client.py +67 -34
- ml_dash/config.py +15 -1
- ml_dash/experiment.py +151 -55
- ml_dash/files.py +97 -0
- ml_dash/metric.py +192 -3
- ml_dash/params.py +92 -3
- ml_dash/remote_auto_start.py +55 -0
- ml_dash/storage.py +366 -235
- {ml_dash-0.5.8.dist-info → ml_dash-0.6.0.dist-info}/METADATA +5 -1
- ml_dash-0.6.0.dist-info/RECORD +29 -0
- ml_dash-0.5.8.dist-info/RECORD +0 -20
- {ml_dash-0.5.8.dist-info → ml_dash-0.6.0.dist-info}/WHEEL +0 -0
- {ml_dash-0.5.8.dist-info → ml_dash-0.6.0.dist-info}/entry_points.txt +0 -0
ml_dash/experiment.py
CHANGED
@@ -67,6 +67,83 @@ class RunManager:
         """Mark experiment as cancelled (status: CANCELLED)."""
         self._experiment._close(status="CANCELLED")

+    @property
+    def folder(self) -> Optional[str]:
+        """
+        Get the current folder for this experiment.
+
+        Returns:
+            Current folder path or None
+
+        Example:
+            current_folder = exp.run.folder
+        """
+        return self._experiment.folder
+
+    @folder.setter
+    def folder(self, value: Optional[str]) -> None:
+        """
+        Set the folder for this experiment before initialization.
+
+        This can ONLY be set before the experiment is started (initialized).
+        Once the experiment is opened, the folder cannot be changed.
+
+        Supports template variables:
+        - {RUN.name} - Experiment name
+        - {RUN.project} - Project name
+
+        Args:
+            value: Folder path with optional template variables
+                (e.g., "experiments/{RUN.name}" or None)
+
+        Raises:
+            RuntimeError: If experiment is already initialized/open
+
+        Examples:
+            from ml_dash import dxp
+
+            # Static folder
+            dxp.run.folder = "experiments/vision/resnet"
+
+            # Template with experiment name
+            dxp.run.folder = "/iclr_2024/{RUN.name}"
+
+            # Template with multiple variables
+            dxp.run.folder = "{RUN.project}/experiments/{RUN.name}"
+
+            # Now start the experiment
+            with dxp.run:
+                dxp.params.set(lr=0.001)
+        """
+        if self._experiment._is_open:
+            raise RuntimeError(
+                "Cannot change folder after experiment is initialized. "
+                "Set folder before calling start() or entering 'with' block."
+            )
+
+        # Process template variables if present
+        if value and '{RUN.' in value:
+            # Generate unique run ID (timestamp-based)
+            from datetime import datetime
+            run_timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
+
+            # Simple string replacement for template variables
+            # Supports: {RUN.name}, {RUN.project}, {RUN.id}, {RUN.timestamp}
+            replacements = {
+                '{RUN.name}': f"{self._experiment.name}_{run_timestamp}",  # Unique name with timestamp
+                '{RUN.project}': self._experiment.project,
+                '{RUN.id}': run_timestamp,  # Just the timestamp
+                '{RUN.timestamp}': run_timestamp,  # Alias for id
+            }
+
+            # Replace all template variables
+            for template, replacement in replacements.items():
+                if template in value:
+                    value = value.replace(template, replacement)
+
+        # Update the folder on the experiment
+        self._experiment.folder = value
+
     def __enter__(self) -> "Experiment":
         """Context manager entry - starts the experiment."""
         return self.start()
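Taken together, the new property/setter pair makes run organization a pre-start concern. A minimal usage sketch assembled from the docstring above (the dxp singleton, template names, and timestamp format appear in the diff; the expanded path in the final comment is illustrative):

    from ml_dash import dxp

    # Must happen before the run starts; the setter raises RuntimeError afterwards
    dxp.run.folder = "{RUN.project}/experiments/{RUN.name}"

    with dxp.run:
        dxp.params.set(lr=0.001)

    # {RUN.name} expands to "<experiment name>_<UTC timestamp>", so the folder
    # resolves to something like "my-project/experiments/my-experiment_20240101_120000"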
@@ -105,7 +182,7 @@ class Experiment:
         experiment = Experiment(
             name="my-experiment",
             project="my-project",
-            remote="
+            remote="https://api.dash.ml",
             api_key="your-jwt-token"
         )

@@ -139,7 +216,6 @@ class Experiment:
         # Mode configuration
         remote: Optional[str] = None,
         api_key: Optional[str] = None,
-        user_name: Optional[str] = None,
         local_path: Optional[str] = None,
         # Internal parameters
         _write_protected: bool = False,

@@ -155,9 +231,8 @@ class Experiment:
             bindrs: Optional list of bindrs
             folder: Optional folder path (e.g., "/experiments/baseline")
             metadata: Optional metadata dict
-            remote: Remote API URL (e.g., "
-            api_key: JWT token for authentication (if not provided
-            user_name: Username for authentication (generates API key if api_key not provided)
+            remote: Remote API URL (e.g., "https://api.dash.ml")
+            api_key: JWT token for authentication (auto-loaded from storage if not provided)
             local_path: Local storage root path (for local mode)
             _write_protected: Internal parameter - if True, experiment becomes immutable after creation
         """

@@ -170,10 +245,6 @@ class Experiment:
         self._write_protected = _write_protected
         self.metadata = metadata

-        # Generate API key from username if not provided
-        if remote and not api_key and user_name:
-            api_key = self._generate_api_key_from_username(user_name)
-
         # Determine operation mode
         if remote and local_path:
             self.mode = OperationMode.HYBRID

@@ -183,7 +254,7 @@ class Experiment:
             self.mode = OperationMode.LOCAL
         else:
             raise ValueError(
-                "Must specify either 'remote' (with api_key
+                "Must specify either 'remote' (with api_key) or 'local_path'"
             )

         # Initialize backend

@@ -192,10 +263,10 @@ class Experiment:
         self._experiment_id: Optional[str] = None
         self._experiment_data: Optional[Dict[str, Any]] = None
         self._is_open = False
+        self._metrics_manager: Optional['MetricsManager'] = None  # Cached metrics manager

         if self.mode in (OperationMode.REMOTE, OperationMode.HYBRID):
-
-            raise ValueError("Either api_key or user_name is required for remote mode")
+            # api_key can be None - RemoteClient will auto-load from storage
             self._client = RemoteClient(base_url=remote, api_key=api_key)

         if self.mode in (OperationMode.LOCAL, OperationMode.HYBRID):

@@ -203,43 +274,6 @@ class Experiment:
             raise ValueError("local_path is required for local mode")
         self._storage = LocalStorage(root_path=Path(local_path))

-    @staticmethod
-    def _generate_api_key_from_username(user_name: str) -> str:
-        """
-        Generate a deterministic API key (JWT) from username.
-
-        This is a temporary solution until proper user authentication is implemented.
-        Generates a unique user ID from the username and creates a JWT token.
-
-        Args:
-            user_name: Username to generate API key from
-
-        Returns:
-            JWT token string
-        """
-        import hashlib
-        import time
-        import jwt
-
-        # Generate deterministic user ID from username (first 10 digits of SHA256 hash)
-        user_id = str(int(hashlib.sha256(user_name.encode()).hexdigest()[:16], 16))[:10]
-
-        # JWT payload
-        payload = {
-            "userId": user_id,
-            "userName": user_name,
-            "iat": int(time.time()),
-            "exp": int(time.time()) + (30 * 24 * 60 * 60)  # 30 days expiration
-        }
-
-        # Secret key for signing (should match server's JWT_SECRET)
-        secret = "your-secret-key-change-this-in-production"
-
-        # Generate JWT
-        token = jwt.encode(payload, secret, algorithm="HS256")
-
-        return token
-
     def _open(self) -> "Experiment":
         """
         Internal method to open the experiment (create or update on server/filesystem).

@@ -350,7 +384,12 @@ class Experiment:
             RuntimeError: If experiment is not open
         """
         if not self._is_open:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Experiment not started. Use 'with experiment.run:' or call experiment.run.start() first.\n"
+                "Example:\n"
+                " with dxp.run:\n"
+                " dxp.params.set(lr=0.001)"
+            )

         return ParametersBuilder(self)

@@ -395,7 +434,12 @@ class Experiment:
             ValueError: If log level is invalid
         """
         if not self._is_open:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Experiment not started. Use 'with experiment.run:' or call experiment.run.start() first.\n"
+                "Example:\n"
+                " with dxp.run:\n"
+                " dxp.log().info('Training started')"
+            )

         # Fluent mode: return LogBuilder
         if message is None:

@@ -424,7 +468,7 @@ class Experiment:
     ) -> None:
         """
         Internal method to write a log entry immediately.
-        No buffering - writes directly to storage/remote.
+        No buffering - writes directly to storage/remote AND stdout/stderr.

         Args:
             message: Log message

@@ -441,6 +485,9 @@ class Experiment:
         if metadata:
             log_entry["metadata"] = metadata

+        # Mirror to stdout/stderr before writing to storage
+        self._print_log(message, level, metadata)
+
         # Write immediately (no buffering)
         if self._client:
             # Remote mode: send to API (wrapped in array for batch API)

@@ -454,12 +501,50 @@ class Experiment:
             self._storage.write_log(
                 project=self.project,
                 experiment=self.name,
+                folder=self.folder,
                 message=log_entry["message"],
                 level=log_entry["level"],
                 metadata=log_entry.get("metadata"),
                 timestamp=log_entry["timestamp"]
             )

+    def _print_log(
+        self,
+        message: str,
+        level: str,
+        metadata: Optional[Dict[str, Any]]
+    ) -> None:
+        """
+        Print log to stdout or stderr based on level.
+
+        ERROR and FATAL go to stderr, all others go to stdout.
+
+        Args:
+            message: Log message
+            level: Log level
+            metadata: Optional metadata dict
+        """
+        import sys
+
+        # Format the log message
+        level_upper = level.upper()
+
+        # Build metadata string if present
+        metadata_str = ""
+        if metadata:
+            # Format metadata as key=value pairs
+            pairs = [f"{k}={v}" for k, v in metadata.items()]
+            metadata_str = f" [{', '.join(pairs)}]"
+
+        # Format: [LEVEL] message [key=value, ...]
+        formatted_message = f"[{level_upper}] {message}{metadata_str}"
+
+        # Route to stdout or stderr based on level
+        if level in ("error", "fatal"):
+            print(formatted_message, file=sys.stderr)
+        else:
+            print(formatted_message, file=sys.stdout)
+
     def files(self, **kwargs) -> FileBuilder:
         """
         Get a FileBuilder for fluent file operations.
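The upshot of the logging changes: every write now reaches the console as well as storage, with error/fatal routed to stderr. A sketch of the observable behavior, assuming an open run (the [LEVEL] message [key=value, ...] format follows _print_log above; .info() appears in the diff, .error() is an assumed sibling method on the fluent LogBuilder):

    with dxp.run:
        dxp.log().info("Training started")
        # stdout: [INFO] Training started

        dxp.log().error("loss diverged")   # .error() assumed
        # stderr: [ERROR] loss diverged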
@@ -485,7 +570,12 @@ class Experiment:
             experiment.files(file_id="123").delete()
         """
         if not self._is_open:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Experiment not started. Use 'with experiment.run:' or call experiment.run.start() first.\n"
+                "Example:\n"
+                " with dxp.run:\n"
+                " dxp.files().save()"
+            )

         return FileBuilder(self, **kwargs)

@@ -540,6 +630,7 @@ class Experiment:
         result = self._storage.write_file(
             project=self.project,
             experiment=self.name,
+            folder=self.folder,
             file_path=file_path,
             prefix=prefix,
             filename=filename,

@@ -716,6 +807,7 @@ class Experiment:
         self._storage.write_parameters(
             project=self.project,
             experiment=self.name,
+            folder=self.folder,
             data=flattened_params
         )

@@ -787,7 +879,10 @@ class Experiment:
                 "Use 'with Experiment(...).run as experiment:' or call experiment.run.start() first."
             )

-
+        # Cache the MetricsManager instance to preserve MetricBuilder cache across calls
+        if self._metrics_manager is None:
+            self._metrics_manager = MetricsManager(self)
+        return self._metrics_manager

     def _append_to_metric(
         self,

@@ -828,6 +923,7 @@ class Experiment:
         result = self._storage.append_to_metric(
             project=self.project,
             experiment=self.name,
+            folder=self.folder,
             metric_name=name,
             data=data,
             description=description,

@@ -999,7 +1095,7 @@ def ml_dash_experiment(
         @ml_dash_experiment(
             name="my-experiment",
             project="my-project",
-            remote="
+            remote="https://api.dash.ml",
             api_key="your-token"
         )
         def train_model():
ml_dash/files.py
CHANGED
@@ -607,6 +607,103 @@ class FileBuilder:
         except Exception:
             pass

+    def duplicate(self, source: Union[str, Dict[str, Any]], to: str) -> Dict[str, Any]:
+        """
+        Duplicate an existing file to a new path within the same experiment.
+
+        Useful for checkpoint rotation patterns where you save versioned checkpoints
+        and maintain a "latest" or "best" pointer.
+
+        Args:
+            source: Source file - either file ID (str) or metadata dict with 'id' key
+            to: Target path like "models/latest.pt" or "/checkpoints/best.pt"
+
+        Returns:
+            File metadata dict for the duplicated file with id, path, filename, checksum, etc.
+
+        Raises:
+            RuntimeError: If experiment is not open or write-protected
+            ValueError: If source file not found or target path invalid
+
+        Examples:
+            # Using file ID
+            dxp.files().duplicate("file-id-123", to="models/latest.pt")
+
+            # Using metadata dict from save_torch
+            snapshot = dxp.files(prefix="/models").save_torch(model, f"model_{epoch:05d}.pt")
+            dxp.files().duplicate(snapshot, to="models/latest.pt")
+
+            # Checkpoint rotation pattern
+            snap = dxp.files(prefix="/checkpoints").save_torch(model, f"model_{epoch:05d}.pt")
+            dxp.files().duplicate(snap, to="checkpoints/best.pt")
+        """
+        import tempfile
+        import os
+
+        if not self._experiment._is_open:
+            raise RuntimeError("Experiment not open. Use experiment.run.start() or context manager.")
+
+        if self._experiment._write_protected:
+            raise RuntimeError("Experiment is write-protected and cannot be modified.")
+
+        # Extract source file ID
+        if isinstance(source, str):
+            source_id = source
+        elif isinstance(source, dict) and 'id' in source:
+            source_id = source['id']
+        else:
+            raise ValueError("source must be a file ID (str) or metadata dict with 'id' key")
+
+        if not source_id:
+            raise ValueError("Invalid source: file ID is empty")
+
+        # Parse target path into prefix and filename
+        to = to.lstrip('/')
+        if '/' in to:
+            target_prefix, target_filename = to.rsplit('/', 1)
+            target_prefix = '/' + target_prefix
+        else:
+            target_prefix = '/'
+            target_filename = to
+
+        if not target_filename:
+            raise ValueError(f"Invalid target path '{to}': must include filename")
+
+        # Download source file to temp location
+        temp_dir = tempfile.mkdtemp()
+        temp_path = os.path.join(temp_dir, target_filename)
+
+        try:
+            # Download the source file
+            downloaded_path = self._experiment._download_file(
+                file_id=source_id,
+                dest_path=temp_path
+            )
+
+            # Save to new location using existing save() method
+            original_file_path = self._file_path
+            original_prefix = self._prefix
+
+            self._file_path = downloaded_path
+            self._prefix = target_prefix
+
+            # Upload and get result
+            result = self.save()
+
+            # Restore original values
+            self._file_path = original_file_path
+            self._prefix = original_prefix
+
+            return result
+        finally:
+            # Clean up temp file and directory
+            try:
+                if os.path.exists(temp_path):
+                    os.unlink(temp_path)
+                os.rmdir(temp_dir)
+            except Exception:
+                pass
+

 def compute_sha256(file_path: str) -> str:
     """
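A note on the design: duplicate() is a client-side copy - download to a temp directory, then re-upload through save() - rather than a server-side copy endpoint, which lets it behave identically in local, remote, and hybrid modes at the cost of an extra round trip. Putting the docstring examples into a loop (save_torch and the dxp singleton appear in the diff; the loop, model, and train_one_epoch are illustrative):

    with dxp.run:
        for epoch in range(num_epochs):
            train_one_epoch(model)
            # Versioned checkpoint plus a stable "latest" pointer
            snap = dxp.files(prefix="/checkpoints").save_torch(model, f"model_{epoch:05d}.pt")
            dxp.files().duplicate(snap, to="checkpoints/latest.pt")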
ml_dash/metric.py
CHANGED
@@ -6,11 +6,165 @@ validation losses, system measurements, etc.
 """

 from typing import Dict, Any, List, Optional, TYPE_CHECKING
+from collections import defaultdict
+import statistics

 if TYPE_CHECKING:
     from .experiment import Experiment


+class SummaryCache:
+    """
+    Buffer for collecting metric values and computing statistics periodically.
+
+    Inspired by ml-logger's SummaryCache design:
+    - Lazy computation: Store raw values, compute stats on demand
+    - Hierarchical naming: Stats get suffixes (loss.mean, loss.std)
+    - Robust handling: Converts None → NaN, filters before stats
+    """
+
+    def __init__(self, metric_builder: 'MetricBuilder'):
+        """
+        Initialize SummaryCache.
+
+        Args:
+            metric_builder: Parent MetricBuilder instance
+        """
+        self._metric_builder = metric_builder
+        self._buffer: Dict[str, List[float]] = defaultdict(list)
+        self._metadata: Dict[str, Any] = {}  # For set() metadata
+
+    def store(self, **kwargs) -> None:
+        """
+        Store values in buffer without immediate logging (deferred computation).
+
+        Args:
+            **kwargs: Metric values to buffer (e.g., loss=0.5, accuracy=0.9)
+
+        Example:
+            cache.store(loss=0.5, accuracy=0.9)
+            cache.store(loss=0.48)  # Accumulates
+        """
+        for key, value in kwargs.items():
+            # Handle None values gracefully
+            if value is None:
+                value = float('nan')
+            try:
+                self._buffer[key].append(float(value))
+            except (TypeError, ValueError):
+                # Skip non-numeric values silently
+                continue
+
+    def set(self, **kwargs) -> None:
+        """
+        Set metadata values without aggregation (replaces previous values).
+
+        Used for contextual metadata like learning rate, epoch number, etc.
+        These values are included in the final data point when summarize() is called.
+
+        Args:
+            **kwargs: Metadata to set (e.g., lr=0.001, epoch=5)
+
+        Example:
+            cache.set(lr=0.001, epoch=5)
+            cache.set(lr=0.0005)  # Replaces lr, keeps epoch
+        """
+        self._metadata.update(kwargs)
+
+    def _compute_stats(self) -> Dict[str, float]:
+        """
+        Compute statistics from buffered values (idempotent, read-only).
+
+        Returns:
+            Dict with hierarchical metric names (key.mean, key.std, etc.)
+
+        Note: This is idempotent - can be called multiple times without side effects.
+        """
+        stats_data = {}
+
+        for key, values in self._buffer.items():
+            if not values:
+                continue
+
+            # Filter out NaN values (ml-logger pattern)
+            clean_values = [v for v in values if not (isinstance(v, float) and v != v)]
+
+            if not clean_values:
+                continue
+
+            # Compute statistics with hierarchical naming
+            stats_data[f"{key}.mean"] = statistics.mean(clean_values)
+            stats_data[f"{key}.min"] = min(clean_values)
+            stats_data[f"{key}.max"] = max(clean_values)
+            stats_data[f"{key}.count"] = len(clean_values)
+
+            # Std dev requires at least 2 values
+            if len(clean_values) >= 2:
+                stats_data[f"{key}.std"] = statistics.stdev(clean_values)
+            else:
+                stats_data[f"{key}.std"] = 0.0
+
+        return stats_data
+
+    def summarize(self, clear: bool = True) -> None:
+        """
+        Compute statistics from buffered values and log them (non-idempotent).
+
+        Args:
+            clear: If True (default), clear buffer after computing statistics.
+                This creates a "rolling window" behavior matching ml-logger's "tiled" mode.
+
+        Example:
+            # After storing 10 loss values and setting lr=0.001:
+            cache.store(loss=0.5)
+            cache.set(lr=0.001, epoch=5)
+            cache.summarize()
+            # Logs: {lr: 0.001, epoch: 5, loss.mean: 0.5, loss.std: 0.0, ...}
+
+        Note: This is non-idempotent - calling it multiple times has side effects.
+        """
+        if not self._buffer and not self._metadata:
+            return

+        # Compute statistics (delegated to idempotent method)
+        stats_data = self._compute_stats()
+
+        # Merge metadata with statistics
+        output_data = {**self._metadata, **stats_data}
+
+        if not output_data:
+            return
+
+        # Append combined data as a single metric data point
+        self._metric_builder.append(**output_data)
+
+        # Clear buffer if requested (default behavior for "tiled" mode)
+        if clear:
+            self._buffer.clear()
+            self._metadata.clear()  # Also clear metadata
+
+    def peek(self, *keys: str, limit: int = 5) -> Dict[str, List[float]]:
+        """
+        Non-destructive inspection of buffered values (idempotent, read-only).
+
+        Args:
+            *keys: Optional specific keys to peek at. If empty, shows all.
+            limit: Number of most recent values to show (default 5)
+
+        Returns:
+            Dict of buffered values (truncated to last `limit` items)
+
+        Example:
+            cache.peek('loss', limit=3)  # {'loss': [0.5, 0.48, 0.52]}
+        """
+        keys_to_show = keys if keys else self._buffer.keys()
+        return {
+            k: self._buffer[k][-limit:] if limit else self._buffer[k]
+            for k in keys_to_show
+            if k in self._buffer and self._buffer[k]
+        }
+
+
 class MetricsManager:
     """
     Manager for metric operations that supports both named and unnamed usage.
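A worked example of the cache semantics (values illustrative, results rounded; the stat suffixes and NaN filtering follow _compute_stats above, and the summary_cache accessor is the property added further down in this file):

    cache = experiment.metrics("train").summary_cache

    cache.store(loss=0.50)
    cache.store(loss=0.48)
    cache.store(loss=0.52)
    cache.store(loss=None)     # kept as NaN, filtered out before stats
    cache.set(lr=0.001)

    cache._compute_stats()
    # {'loss.mean': 0.5, 'loss.min': 0.48, 'loss.max': 0.52,
    #  'loss.count': 3, 'loss.std': 0.02}

    cache.summarize()          # appends {'lr': 0.001, 'loss.mean': 0.5, ...}
                               # as one data point, then clears the buffer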
@@ -39,11 +193,12 @@ class MetricsManager:
             experiment: Parent Experiment instance
         """
         self._experiment = experiment
+        self._metric_builders: Dict[str, 'MetricBuilder'] = {}  # Cache for MetricBuilder instances

     def __call__(self, name: str, description: Optional[str] = None,
                  tags: Optional[List[str]] = None, metadata: Optional[Dict[str, Any]] = None) -> 'MetricBuilder':
         """
-        Get a MetricBuilder for a specific metric name.
+        Get a MetricBuilder for a specific metric name (cached for reuse).

         Args:
             name: Metric name (unique within experiment)

@@ -52,12 +207,20 @@ class MetricsManager:
             metadata: Optional structured metadata

         Returns:
-            MetricBuilder instance for the named metric
+            MetricBuilder instance for the named metric (same instance on repeated calls)

         Examples:
             experiment.metrics("loss").append(value=0.5, step=1)
+
+        Note:
+            MetricBuilder instances are cached by name, so repeated calls with the
+            same name return the same instance. This ensures summary_cache works
+            correctly when called multiple times within a loop.
         """
-
+        # Cache key includes name only (description/tags/metadata are set once on first call)
+        if name not in self._metric_builders:
+            self._metric_builders[name] = MetricBuilder(self._experiment, name, description, tags, metadata)
+        return self._metric_builders[name]

     def append(self, name: Optional[str] = None, data: Optional[Dict[str, Any]] = None, **kwargs) -> Dict[str, Any]:
         """

@@ -157,6 +320,7 @@ class MetricBuilder:
         self._description = description
         self._tags = tags
         self._metadata = metadata
+        self._summary_cache = None  # Lazy initialization

     def append(self, **kwargs) -> 'MetricBuilder':
         """

@@ -290,3 +454,28 @@ class MetricBuilder:
             print(f"{metric['name']}: {metric['totalDataPoints']} points")
         """
         return self._experiment._list_metrics()
+
+    @property
+    def summary_cache(self) -> SummaryCache:
+        """
+        Get summary cache for this metric (lazy initialization).
+
+        The summary cache allows buffering values and computing statistics
+        periodically, which is much more efficient than logging every value.
+
+        Returns:
+            SummaryCache instance for this metric
+
+        Example:
+            metric = experiment.metrics("train")
+            # Store values every batch
+            metric.summary_cache.store(loss=0.5)
+            metric.summary_cache.store(loss=0.48)
+            # Set metadata
+            metric.summary_cache.set(lr=0.001, epoch=1)
+            # Compute stats and log periodically
+            metric.summary_cache.summarize()
+        """
+        if self._summary_cache is None:
+            self._summary_cache = SummaryCache(self)
+        return self._summary_cache
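End to end, the pattern these metric.py changes enable, combined with the MetricsManager caching in experiment.py (accessor names come from the docstrings above; the training loop, loader, and train_step are illustrative):

    with dxp.run as experiment:
        for epoch in range(num_epochs):
            for batch in loader:
                loss = train_step(batch)
                # metrics("train") returns the cached MetricBuilder, so the
                # summary cache keeps accumulating across iterations
                experiment.metrics("train").summary_cache.store(loss=loss)
            experiment.metrics("train").summary_cache.set(epoch=epoch)
            # One logged data point per epoch instead of one per batch
            experiment.metrics("train").summary_cache.summarize()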