ml-dash 0.6.5__py3-none-any.whl → 0.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml_dash/__init__.py +1 -2
- ml_dash/auto_start.py +1 -4
- ml_dash/buffer.py +735 -0
- ml_dash/cli.py +7 -1
- ml_dash/cli_commands/create.py +145 -0
- ml_dash/cli_commands/download.py +177 -0
- ml_dash/cli_commands/list.py +146 -0
- ml_dash/cli_commands/upload.py +148 -4
- ml_dash/client.py +328 -25
- ml_dash/experiment.py +491 -457
- ml_dash/files.py +228 -70
- ml_dash/run.py +92 -3
- ml_dash/storage.py +403 -2
- ml_dash/track.py +263 -0
- {ml_dash-0.6.5.dist-info → ml_dash-0.6.7.dist-info}/METADATA +1 -1
- {ml_dash-0.6.5.dist-info → ml_dash-0.6.7.dist-info}/RECORD +18 -15
- {ml_dash-0.6.5.dist-info → ml_dash-0.6.7.dist-info}/WHEEL +0 -0
- {ml_dash-0.6.5.dist-info → ml_dash-0.6.7.dist-info}/entry_points.txt +0 -0
ml_dash/experiment.py
CHANGED
|
@@ -11,13 +11,14 @@ import functools
|
|
|
11
11
|
from datetime import datetime
|
|
12
12
|
from enum import Enum
|
|
13
13
|
from pathlib import Path
|
|
14
|
-
from typing import Any, Callable, Dict, List, Optional, Union
|
|
14
|
+
from typing import Any, Callable, Dict, List, Optional, Union
|
|
15
15
|
|
|
16
|
+
from .buffer import BackgroundBufferManager, BufferConfig
|
|
16
17
|
from .client import RemoteClient
|
|
17
18
|
from .files import BindrsBuilder, FilesAccessor
|
|
18
19
|
from .log import LogBuilder, LogLevel
|
|
19
20
|
from .params import ParametersBuilder
|
|
20
|
-
from .run import RUN
|
|
21
|
+
from .run import RUN, requires_open
|
|
21
22
|
from .storage import LocalStorage
|
|
22
23
|
|
|
23
24
|
|
|
@@ -60,140 +61,6 @@ class OperationMode(Enum):
|
|
|
60
61
|
HYBRID = "hybrid" # Future: sync local to remote
|
|
61
62
|
|
|
62
63
|
|
|
63
|
-
class RunManager:
|
|
64
|
-
"""
|
|
65
|
-
Lifecycle manager for experiments.
|
|
66
|
-
|
|
67
|
-
Supports three usage patterns:
|
|
68
|
-
1. Method calls: experiment.run.start(), experiment.run.complete()
|
|
69
|
-
2. Context manager: with Experiment(...).run as exp:
|
|
70
|
-
3. Decorator: @exp.run or @Experiment(...).run
|
|
71
|
-
"""
|
|
72
|
-
|
|
73
|
-
def __init__(self, experiment: "Experiment"):
|
|
74
|
-
"""
|
|
75
|
-
Initialize RunManager.
|
|
76
|
-
|
|
77
|
-
Args:
|
|
78
|
-
experiment: Parent Experiment instance
|
|
79
|
-
"""
|
|
80
|
-
self._experiment = experiment
|
|
81
|
-
|
|
82
|
-
def start(self) -> "Experiment":
|
|
83
|
-
"""
|
|
84
|
-
Start the experiment (sets status to RUNNING).
|
|
85
|
-
|
|
86
|
-
Returns:
|
|
87
|
-
The experiment instance for chaining
|
|
88
|
-
"""
|
|
89
|
-
return self._experiment._open()
|
|
90
|
-
|
|
91
|
-
def complete(self) -> None:
|
|
92
|
-
"""Mark experiment as completed (status: COMPLETED)."""
|
|
93
|
-
self._experiment._close(status="COMPLETED")
|
|
94
|
-
|
|
95
|
-
def fail(self) -> None:
|
|
96
|
-
"""Mark experiment as failed (status: FAILED)."""
|
|
97
|
-
self._experiment._close(status="FAILED")
|
|
98
|
-
|
|
99
|
-
def cancel(self) -> None:
|
|
100
|
-
"""Mark experiment as cancelled (status: CANCELLED)."""
|
|
101
|
-
self._experiment._close(status="CANCELLED")
|
|
102
|
-
|
|
103
|
-
@property
|
|
104
|
-
def prefix(self) -> Optional[str]:
|
|
105
|
-
"""
|
|
106
|
-
Get the current folder prefix for this experiment.
|
|
107
|
-
|
|
108
|
-
Returns:
|
|
109
|
-
Current folder prefix path or None
|
|
110
|
-
|
|
111
|
-
Example:
|
|
112
|
-
current_prefix = exp.run.prefix
|
|
113
|
-
"""
|
|
114
|
-
return self._experiment._folder_path
|
|
115
|
-
|
|
116
|
-
@prefix.setter
|
|
117
|
-
def prefix(self, value: Optional[str]) -> None:
|
|
118
|
-
"""
|
|
119
|
-
Set the folder prefix for this experiment before initialization.
|
|
120
|
-
|
|
121
|
-
This can ONLY be set before the experiment is started (initialized).
|
|
122
|
-
Once the experiment is opened, the prefix cannot be changed.
|
|
123
|
-
|
|
124
|
-
Supports template variables:
|
|
125
|
-
- {EXP.name} - Experiment name
|
|
126
|
-
- {EXP.id} - Experiment ID
|
|
127
|
-
|
|
128
|
-
Args:
|
|
129
|
-
value: Folder prefix path with optional template variables
|
|
130
|
-
(e.g., "ge/myproject/{EXP.name}" or None)
|
|
131
|
-
|
|
132
|
-
Raises:
|
|
133
|
-
RuntimeError: If experiment is already initialized/open
|
|
134
|
-
|
|
135
|
-
Examples:
|
|
136
|
-
from ml_dash import dxp
|
|
137
|
-
|
|
138
|
-
# Static folder
|
|
139
|
-
dxp.run.prefix = "ge/myproject/experiments/resnet"
|
|
140
|
-
|
|
141
|
-
# Template with experiment name
|
|
142
|
-
dxp.run.prefix = "ge/iclr_2024/{EXP.name}"
|
|
143
|
-
|
|
144
|
-
# Now start the experiment
|
|
145
|
-
with dxp.run:
|
|
146
|
-
dxp.params.set(lr=0.001)
|
|
147
|
-
"""
|
|
148
|
-
if self._experiment._is_open:
|
|
149
|
-
raise RuntimeError(
|
|
150
|
-
"Cannot change prefix after experiment is initialized. "
|
|
151
|
-
"Set prefix before calling start() or entering 'with' block."
|
|
152
|
-
)
|
|
153
|
-
|
|
154
|
-
if value:
|
|
155
|
-
# Sync EXP with this experiment's values
|
|
156
|
-
RUN.name = self._experiment.name
|
|
157
|
-
RUN.description = self._experiment.description
|
|
158
|
-
# Generate id/timestamp if not already set
|
|
159
|
-
if RUN.id is None:
|
|
160
|
-
RUN._init_run()
|
|
161
|
-
# Format with EXP - use helper to expand properties correctly
|
|
162
|
-
value = _expand_exp_template(value)
|
|
163
|
-
|
|
164
|
-
# Update the folder on the experiment
|
|
165
|
-
self._experiment._folder_path = value
|
|
166
|
-
|
|
167
|
-
def __enter__(self) -> "Experiment":
|
|
168
|
-
"""Context manager entry - starts the experiment."""
|
|
169
|
-
return self.start()
|
|
170
|
-
|
|
171
|
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
172
|
-
"""Context manager exit - completes or fails the experiment."""
|
|
173
|
-
if exc_type is not None:
|
|
174
|
-
self.fail()
|
|
175
|
-
else:
|
|
176
|
-
self.complete()
|
|
177
|
-
return False
|
|
178
|
-
|
|
179
|
-
def __call__(self, func: Callable) -> Callable:
|
|
180
|
-
"""
|
|
181
|
-
Decorator support for wrapping functions with experiment lifecycle.
|
|
182
|
-
|
|
183
|
-
Usage:
|
|
184
|
-
@exp.run
|
|
185
|
-
def train(exp):
|
|
186
|
-
exp.log("Training...")
|
|
187
|
-
"""
|
|
188
|
-
|
|
189
|
-
@functools.wraps(func)
|
|
190
|
-
def wrapper(*args, **kwargs):
|
|
191
|
-
with self as exp:
|
|
192
|
-
return func(exp, *args, **kwargs)
|
|
193
|
-
|
|
194
|
-
return wrapper
|
|
195
|
-
|
|
196
|
-
|
|
197
64
|
class Experiment:
|
|
198
65
|
"""
|
|
199
66
|
ML-Dash experiment for metricing experiments.
|
|
@@ -231,6 +98,28 @@ class Experiment:
|
|
|
231
98
|
...
|
|
232
99
|
"""
|
|
233
100
|
|
|
101
|
+
run: RUN
|
|
102
|
+
"""
|
|
103
|
+
Get the RunManager for lifecycle operations.
|
|
104
|
+
|
|
105
|
+
Usage:
|
|
106
|
+
# Method calls
|
|
107
|
+
experiment.run.start()
|
|
108
|
+
experiment.run.complete()
|
|
109
|
+
|
|
110
|
+
# Context manager
|
|
111
|
+
with Experiment(...).run as exp:
|
|
112
|
+
exp.log("Training...")
|
|
113
|
+
|
|
114
|
+
# Decorator
|
|
115
|
+
@experiment.run
|
|
116
|
+
def train(exp):
|
|
117
|
+
exp.log("Training...")
|
|
118
|
+
|
|
119
|
+
Returns:
|
|
120
|
+
RunManager instance
|
|
121
|
+
"""
|
|
122
|
+
|
|
234
123
|
def __init__(
|
|
235
124
|
self,
|
|
236
125
|
prefix: Optional[str] = None,
|
|
@@ -251,7 +140,7 @@ class Experiment:
|
|
|
251
140
|
# Internal parameters
|
|
252
141
|
_write_protected: bool = False,
|
|
253
142
|
# The rest of the params go directly to populate the RUN object.
|
|
254
|
-
**run_params
|
|
143
|
+
**run_params,
|
|
255
144
|
):
|
|
256
145
|
"""
|
|
257
146
|
Initialize an ML-Dash experiment.
|
|
@@ -278,7 +167,6 @@ class Experiment:
|
|
|
278
167
|
- dash_url + dash_root: Hybrid mode (local + remote)
|
|
279
168
|
- dash_url + dash_root=None: Remote-only mode
|
|
280
169
|
"""
|
|
281
|
-
import os
|
|
282
170
|
import warnings
|
|
283
171
|
|
|
284
172
|
# Handle backward compatibility
|
|
@@ -286,7 +174,7 @@ class Experiment:
|
|
|
286
174
|
warnings.warn(
|
|
287
175
|
"Parameter 'remote' is deprecated. Use 'dash_url' instead.",
|
|
288
176
|
DeprecationWarning,
|
|
289
|
-
stacklevel=2
|
|
177
|
+
stacklevel=2,
|
|
290
178
|
)
|
|
291
179
|
if dash_url is None:
|
|
292
180
|
dash_url = remote
|
|
@@ -295,28 +183,15 @@ class Experiment:
|
|
|
295
183
|
warnings.warn(
|
|
296
184
|
"Parameter 'local_path' is deprecated. Use 'dash_root' instead.",
|
|
297
185
|
DeprecationWarning,
|
|
298
|
-
stacklevel=2
|
|
186
|
+
stacklevel=2,
|
|
299
187
|
)
|
|
300
188
|
if dash_root == ".dash": # Only override if dash_root is default
|
|
301
189
|
dash_root = local_path
|
|
302
190
|
|
|
303
|
-
|
|
304
|
-
|
|
191
|
+
if prefix:
|
|
192
|
+
run_params["prefix"] = prefix
|
|
305
193
|
|
|
306
|
-
|
|
307
|
-
raise ValueError("prefix (or DASH_PREFIX env var) must be provided")
|
|
308
|
-
|
|
309
|
-
# Parse prefix: {owner}/{project}/path.../[name]
|
|
310
|
-
parts = self._folder_path.strip("/").split("/")
|
|
311
|
-
if len(parts) < 2:
|
|
312
|
-
raise ValueError(
|
|
313
|
-
f"prefix must have at least owner/project: got '{self._folder_path}'"
|
|
314
|
-
)
|
|
315
|
-
|
|
316
|
-
self.owner = parts[0]
|
|
317
|
-
self.project = parts[1]
|
|
318
|
-
# Name is the last segment (may be a seed/id, not always a meaningful name)
|
|
319
|
-
self.name = parts[-1] if len(parts) > 2 else parts[1]
|
|
194
|
+
self.run = RUN(_experiment=self, **run_params)
|
|
320
195
|
|
|
321
196
|
self.readme = readme
|
|
322
197
|
self.tags = tags
|
|
@@ -324,11 +199,6 @@ class Experiment:
|
|
|
324
199
|
self._write_protected = _write_protected
|
|
325
200
|
self.metadata = metadata
|
|
326
201
|
|
|
327
|
-
# Initialize RUN with experiment values
|
|
328
|
-
RUN.name = self.name
|
|
329
|
-
if readme:
|
|
330
|
-
RUN.readme = readme
|
|
331
|
-
|
|
332
202
|
# Determine operation mode
|
|
333
203
|
# dash_root defaults to ".dash", dash_url defaults to None
|
|
334
204
|
if dash_url and dash_root:
|
|
@@ -339,21 +209,24 @@ class Experiment:
|
|
|
339
209
|
self.mode = OperationMode.LOCAL
|
|
340
210
|
|
|
341
211
|
# Initialize backend
|
|
342
|
-
self._client: Optional[RemoteClient] = None
|
|
343
|
-
self._storage: Optional[LocalStorage] = None
|
|
344
212
|
self._experiment_id: Optional[str] = None
|
|
345
213
|
self._experiment_data: Optional[Dict[str, Any]] = None
|
|
346
214
|
self._is_open = False
|
|
347
215
|
self._metrics_manager: Optional["MetricsManager"] = None # Cached metrics manager
|
|
216
|
+
self._tracks_manager: Optional["TracksManager"] = None # Cached tracks manager
|
|
217
|
+
|
|
218
|
+
# Initialize buffer manager
|
|
219
|
+
self._buffer_config = BufferConfig.from_env()
|
|
220
|
+
self._buffer_manager: Optional[BackgroundBufferManager] = None
|
|
348
221
|
|
|
349
222
|
if self.mode in (OperationMode.REMOTE, OperationMode.HYBRID):
|
|
350
|
-
# RemoteClient will
|
|
223
|
+
# RemoteClient will autoload token from ~/.dash/token.enc
|
|
351
224
|
# Use RUN.api_url if dash_url=True (boolean), otherwise use the provided URL
|
|
352
225
|
api_url = RUN.api_url if dash_url is True else dash_url
|
|
353
|
-
self._client = RemoteClient(base_url=api_url, namespace=self.owner)
|
|
226
|
+
self.run._client = RemoteClient(base_url=api_url, namespace=self.run.owner)
|
|
354
227
|
|
|
355
228
|
if self.mode in (OperationMode.LOCAL, OperationMode.HYBRID):
|
|
356
|
-
self._storage = LocalStorage(root_path=Path(dash_root))
|
|
229
|
+
self.run._storage = LocalStorage(root_path=Path(dash_root))
|
|
357
230
|
|
|
358
231
|
def _open(self) -> "Experiment":
|
|
359
232
|
"""
|
|
@@ -365,16 +238,16 @@ class Experiment:
|
|
|
365
238
|
if self._is_open:
|
|
366
239
|
return self
|
|
367
240
|
|
|
368
|
-
if self._client:
|
|
241
|
+
if self.run._client:
|
|
369
242
|
# Remote mode: create/update experiment via API
|
|
370
243
|
try:
|
|
371
|
-
response = self._client.create_or_update_experiment(
|
|
372
|
-
project=self.project,
|
|
373
|
-
name=self.name,
|
|
244
|
+
response = self.run._client.create_or_update_experiment(
|
|
245
|
+
project=self.run.project,
|
|
246
|
+
name=self.run.name,
|
|
374
247
|
description=self.readme,
|
|
375
248
|
tags=self.tags,
|
|
376
249
|
bindrs=self._bindrs_list,
|
|
377
|
-
prefix=self._folder_path,
|
|
250
|
+
prefix=self.run._folder_path,
|
|
378
251
|
write_protected=self._write_protected,
|
|
379
252
|
metadata=self.metadata,
|
|
380
253
|
)
|
|
@@ -387,13 +260,13 @@ class Experiment:
|
|
|
387
260
|
|
|
388
261
|
console = Console()
|
|
389
262
|
console.print(
|
|
390
|
-
f"[dim]✓ Experiment started: [bold]{self.name}[/bold] (project: {self.project})[/dim]\n"
|
|
263
|
+
f"[dim]✓ Experiment started: [bold]{self.run.name}[/bold] (project: {self.run.project})[/dim]\n"
|
|
391
264
|
f"[dim]View your data, statistics, and plots online at:[/dim] "
|
|
392
265
|
f"[link=https://dash.ml]https://dash.ml[/link]"
|
|
393
266
|
)
|
|
394
267
|
except ImportError:
|
|
395
268
|
# Fallback if rich is not available
|
|
396
|
-
print(f"✓ Experiment started: {self.name} (project: {self.project})")
|
|
269
|
+
print(f"✓ Experiment started: {self.run.name} (project: {self.run.project})")
|
|
397
270
|
print("View your data at: https://dash.ml")
|
|
398
271
|
|
|
399
272
|
except Exception as e:
|
|
@@ -446,18 +319,22 @@ class Experiment:
|
|
|
446
319
|
# Re-raise other exceptions
|
|
447
320
|
raise
|
|
448
321
|
|
|
449
|
-
if self._storage:
|
|
322
|
+
if self.run._storage:
|
|
450
323
|
# Local mode: create experiment directory structure
|
|
451
|
-
self._storage.create_experiment(
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
prefix=self._folder_path,
|
|
324
|
+
self.run._storage.create_experiment(
|
|
325
|
+
project=self.run.project,
|
|
326
|
+
prefix=self.run._folder_path,
|
|
455
327
|
description=self.readme,
|
|
456
328
|
tags=self.tags,
|
|
457
329
|
bindrs=self._bindrs_list,
|
|
458
330
|
metadata=self.metadata,
|
|
459
331
|
)
|
|
460
332
|
|
|
333
|
+
# Start background buffer
|
|
334
|
+
if self._buffer_config.buffer_enabled:
|
|
335
|
+
self._buffer_manager = BackgroundBufferManager(self, self._buffer_config)
|
|
336
|
+
self._buffer_manager.start()
|
|
337
|
+
|
|
461
338
|
self._is_open = True
|
|
462
339
|
return self
|
|
463
340
|
|
|
@@ -468,17 +345,24 @@ class Experiment:
|
|
|
468
345
|
Args:
|
|
469
346
|
status: Status to set - "COMPLETED" (default), "FAILED", or "CANCELLED"
|
|
470
347
|
"""
|
|
471
|
-
if not self._is_open:
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
#
|
|
475
|
-
|
|
476
|
-
|
|
348
|
+
# if not self._is_open:
|
|
349
|
+
# return
|
|
350
|
+
#
|
|
351
|
+
# note-ge: do NOT flush because the upload will be async. we will NEVER reuse
|
|
352
|
+
# experiment objects.
|
|
353
|
+
# # Flush any pending writes
|
|
354
|
+
# if self.run._storage:
|
|
355
|
+
# self.run._storage.flush()
|
|
356
|
+
|
|
357
|
+
# Flush and stop buffer BEFORE status update
|
|
358
|
+
# Waits indefinitely for all data to be flushed (important for large files)
|
|
359
|
+
if self._buffer_manager:
|
|
360
|
+
self._buffer_manager.stop()
|
|
477
361
|
|
|
478
362
|
# Update experiment status in remote mode
|
|
479
|
-
if self._client and self._experiment_id:
|
|
363
|
+
if self.run._client and self._experiment_id:
|
|
480
364
|
try:
|
|
481
|
-
self._client.update_experiment_status(
|
|
365
|
+
self.run._client.update_experiment_status(
|
|
482
366
|
experiment_id=self._experiment_id, status=status
|
|
483
367
|
)
|
|
484
368
|
|
|
@@ -499,14 +383,14 @@ class Experiment:
|
|
|
499
383
|
console = Console()
|
|
500
384
|
console.print(
|
|
501
385
|
f"[{status_color}]{status_emoji} Experiment {status.lower()}: "
|
|
502
|
-
f"[bold]{self.name}[/bold] (project: {self.project})[/{status_color}]\n"
|
|
386
|
+
f"[bold]{self.run.name}[/bold] (project: {self.run.project})[/{status_color}]\n"
|
|
503
387
|
f"[dim]View results, statistics, and plots online at:[/dim] "
|
|
504
388
|
f"[link=https://dash.ml]https://dash.ml[/link]"
|
|
505
389
|
)
|
|
506
390
|
except ImportError:
|
|
507
391
|
# Fallback if rich is not available
|
|
508
392
|
print(
|
|
509
|
-
f"{status_emoji} Experiment {status.lower()}: {self.name} (project: {self.project})"
|
|
393
|
+
f"{status_emoji} Experiment {status.lower()}: {self.run.name} (project: {self.run.project})"
|
|
510
394
|
)
|
|
511
395
|
print("View results at: https://dash.ml")
|
|
512
396
|
|
|
@@ -516,35 +400,8 @@ class Experiment:
|
|
|
516
400
|
|
|
517
401
|
self._is_open = False
|
|
518
402
|
|
|
519
|
-
# Reset RUN for next experiment
|
|
520
|
-
# TODO: RUN._reset() - method doesn't exist
|
|
521
|
-
# RUN._reset()
|
|
522
|
-
|
|
523
|
-
@property
|
|
524
|
-
def run(self) -> RunManager:
|
|
525
|
-
"""
|
|
526
|
-
Get the RunManager for lifecycle operations.
|
|
527
|
-
|
|
528
|
-
Usage:
|
|
529
|
-
# Method calls
|
|
530
|
-
experiment.run.start()
|
|
531
|
-
experiment.run.complete()
|
|
532
|
-
|
|
533
|
-
# Context manager
|
|
534
|
-
with Experiment(...).run as exp:
|
|
535
|
-
exp.log("Training...")
|
|
536
|
-
|
|
537
|
-
# Decorator
|
|
538
|
-
@experiment.run
|
|
539
|
-
def train(exp):
|
|
540
|
-
exp.log("Training...")
|
|
541
|
-
|
|
542
|
-
Returns:
|
|
543
|
-
RunManager instance
|
|
544
|
-
"""
|
|
545
|
-
return RunManager(self)
|
|
546
|
-
|
|
547
403
|
@property
|
|
404
|
+
@requires_open
|
|
548
405
|
def params(self) -> ParametersBuilder:
|
|
549
406
|
"""
|
|
550
407
|
Get a ParametersBuilder for parameter operations.
|
|
@@ -562,17 +419,10 @@ class Experiment:
|
|
|
562
419
|
Raises:
|
|
563
420
|
RuntimeError: If experiment is not open
|
|
564
421
|
"""
|
|
565
|
-
if not self._is_open:
|
|
566
|
-
raise RuntimeError(
|
|
567
|
-
"Experiment not started. Use 'with experiment.run:' or call experiment.run.start() first.\n"
|
|
568
|
-
"Example:\n"
|
|
569
|
-
" with dxp.run:\n"
|
|
570
|
-
" dxp.params.set(lr=0.001)"
|
|
571
|
-
)
|
|
572
|
-
|
|
573
422
|
return ParametersBuilder(self)
|
|
574
423
|
|
|
575
424
|
@property
|
|
425
|
+
@requires_open
|
|
576
426
|
def logs(self) -> LogBuilder:
|
|
577
427
|
"""
|
|
578
428
|
Get a LogBuilder for fluent-style logging.
|
|
@@ -592,16 +442,9 @@ class Experiment:
|
|
|
592
442
|
exp.logs.warn("GPU memory low", memory_available="1GB")
|
|
593
443
|
exp.logs.debug("Debug info", step=100)
|
|
594
444
|
"""
|
|
595
|
-
if not self._is_open:
|
|
596
|
-
raise RuntimeError(
|
|
597
|
-
"Experiment not started. Use 'with experiment.run:' or call experiment.run.start() first.\n"
|
|
598
|
-
"Example:\n"
|
|
599
|
-
" with dxp.run:\n"
|
|
600
|
-
" dxp.logs.info('Training started')"
|
|
601
|
-
)
|
|
602
|
-
|
|
603
445
|
return LogBuilder(self, metadata=None)
|
|
604
446
|
|
|
447
|
+
@requires_open
|
|
605
448
|
def log(
|
|
606
449
|
self,
|
|
607
450
|
message: Optional[str] = None,
|
|
@@ -638,22 +481,16 @@ class Experiment:
|
|
|
638
481
|
RuntimeError: If experiment is not open
|
|
639
482
|
ValueError: If log level is invalid
|
|
640
483
|
"""
|
|
641
|
-
if not self._is_open:
|
|
642
|
-
raise RuntimeError(
|
|
643
|
-
"Experiment not started. Use 'with experiment.run:' or call experiment.run.start() first.\n"
|
|
644
|
-
"Example:\n"
|
|
645
|
-
" with dxp.run:\n"
|
|
646
|
-
" dxp.logs.info('Training started')"
|
|
647
|
-
)
|
|
648
484
|
|
|
649
485
|
# Fluent mode: return LogBuilder (deprecated)
|
|
650
486
|
if message is None:
|
|
651
487
|
import warnings
|
|
488
|
+
|
|
652
489
|
warnings.warn(
|
|
653
490
|
"Using exp.log() without a message is deprecated. "
|
|
654
491
|
"Use exp.logs.info('message') instead.",
|
|
655
492
|
DeprecationWarning,
|
|
656
|
-
stacklevel=2
|
|
493
|
+
stacklevel=2,
|
|
657
494
|
)
|
|
658
495
|
combined_metadata = {**(metadata or {}), **extra_metadata}
|
|
659
496
|
return LogBuilder(self, combined_metadata if combined_metadata else None)
|
|
@@ -679,8 +516,8 @@ class Experiment:
|
|
|
679
516
|
timestamp: Optional[datetime],
|
|
680
517
|
) -> None:
|
|
681
518
|
"""
|
|
682
|
-
Internal method to write a log entry
|
|
683
|
-
|
|
519
|
+
Internal method to write a log entry.
|
|
520
|
+
Uses buffering if enabled, otherwise writes directly.
|
|
684
521
|
|
|
685
522
|
Args:
|
|
686
523
|
message: Log message
|
|
@@ -688,55 +525,59 @@ class Experiment:
|
|
|
688
525
|
metadata: Optional metadata dict
|
|
689
526
|
timestamp: Optional custom timestamp (defaults to now)
|
|
690
527
|
"""
|
|
691
|
-
|
|
692
|
-
"timestamp": (timestamp or datetime.utcnow()).isoformat() + "Z",
|
|
693
|
-
"level": level,
|
|
694
|
-
"message": message,
|
|
695
|
-
}
|
|
696
|
-
|
|
697
|
-
if metadata:
|
|
698
|
-
log_entry["metadata"] = metadata
|
|
699
|
-
|
|
700
|
-
# Mirror to stdout/stderr before writing to storage
|
|
528
|
+
# Print to console immediately (user visibility)
|
|
701
529
|
self._print_log(message, level, metadata)
|
|
702
530
|
|
|
703
|
-
#
|
|
704
|
-
if self.
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
)
|
|
719
|
-
|
|
531
|
+
# Buffer or write immediately
|
|
532
|
+
if self._buffer_manager and self._buffer_config.buffer_enabled:
|
|
533
|
+
self._buffer_manager.buffer_log(message, level, metadata, timestamp)
|
|
534
|
+
else:
|
|
535
|
+
# Immediate write (backward compatibility)
|
|
536
|
+
log_entry = {
|
|
537
|
+
"timestamp": (timestamp or datetime.utcnow()).isoformat() + "Z",
|
|
538
|
+
"level": level,
|
|
539
|
+
"message": message,
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
if metadata:
|
|
543
|
+
log_entry["metadata"] = metadata
|
|
544
|
+
|
|
545
|
+
if self.run._client:
|
|
546
|
+
# Remote mode: send to API (wrapped in array for batch API)
|
|
547
|
+
try:
|
|
548
|
+
self.run._client.create_log_entries(
|
|
549
|
+
experiment_id=self._experiment_id,
|
|
550
|
+
logs=[log_entry], # Single log in array
|
|
551
|
+
)
|
|
552
|
+
except Exception as e:
|
|
553
|
+
# Log warning but don't crash training
|
|
554
|
+
import warnings
|
|
555
|
+
|
|
556
|
+
warnings.warn(
|
|
557
|
+
f"Failed to write log to remote server: {e}. Training will continue.",
|
|
558
|
+
RuntimeWarning,
|
|
559
|
+
stacklevel=4,
|
|
560
|
+
)
|
|
561
|
+
# Fall through to local storage if available
|
|
720
562
|
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
)
|
|
563
|
+
if self.run._storage:
|
|
564
|
+
# Local mode: write to file immediately
|
|
565
|
+
try:
|
|
566
|
+
self.run._storage.write_log(
|
|
567
|
+
owner=self.run.owner,
|
|
568
|
+
project=self.run.project,
|
|
569
|
+
prefix=self.run._folder_path,
|
|
570
|
+
message=log_entry["message"],
|
|
571
|
+
level=log_entry["level"],
|
|
572
|
+
metadata=log_entry.get("metadata"),
|
|
573
|
+
timestamp=log_entry["timestamp"],
|
|
574
|
+
)
|
|
575
|
+
except Exception as e:
|
|
576
|
+
import warnings
|
|
577
|
+
|
|
578
|
+
warnings.warn(
|
|
579
|
+
f"Failed to write log to local storage: {e}", RuntimeWarning, stacklevel=4
|
|
580
|
+
)
|
|
740
581
|
|
|
741
582
|
def _print_log(
|
|
742
583
|
self, message: str, level: str, metadata: Optional[Dict[str, Any]]
|
|
@@ -773,6 +614,7 @@ class Experiment:
|
|
|
773
614
|
print(formatted_message, file=sys.stdout)
|
|
774
615
|
|
|
775
616
|
@property
|
|
617
|
+
@requires_open
|
|
776
618
|
def files(self) -> FilesAccessor:
|
|
777
619
|
"""
|
|
778
620
|
Get a FilesAccessor for fluent file operations.
|
|
@@ -813,16 +655,9 @@ class Experiment:
|
|
|
813
655
|
dxp.files.save_json(dict(hey="yo"), to="config.json")
|
|
814
656
|
dxp.files.save_blob(b"xxx", to="data.bin")
|
|
815
657
|
"""
|
|
816
|
-
if not self._is_open:
|
|
817
|
-
raise RuntimeError(
|
|
818
|
-
"Experiment not started. Use 'with experiment.run:' or call experiment.run.start() first.\n"
|
|
819
|
-
"Example:\n"
|
|
820
|
-
" with dxp.run:\n"
|
|
821
|
-
" dxp.files('path').upload()"
|
|
822
|
-
)
|
|
823
|
-
|
|
824
658
|
return FilesAccessor(self)
|
|
825
659
|
|
|
660
|
+
@requires_open
|
|
826
661
|
def bindrs(self, bindr_name: str) -> BindrsBuilder:
|
|
827
662
|
"""
|
|
828
663
|
Get a BindrsBuilder for working with file collections (bindrs).
|
|
@@ -845,14 +680,6 @@ class Experiment:
|
|
|
845
680
|
Note:
|
|
846
681
|
This is a placeholder for future bindr functionality.
|
|
847
682
|
"""
|
|
848
|
-
if not self._is_open:
|
|
849
|
-
raise RuntimeError(
|
|
850
|
-
"Experiment not started. Use 'with experiment.run:' or call experiment.run.start() first.\n"
|
|
851
|
-
"Example:\n"
|
|
852
|
-
" with dxp.run:\n"
|
|
853
|
-
" files = dxp.bindrs('my-bindr').list()"
|
|
854
|
-
)
|
|
855
|
-
|
|
856
683
|
return BindrsBuilder(self, bindr_name)
|
|
857
684
|
|
|
858
685
|
def _upload_file(
|
|
@@ -869,6 +696,7 @@ class Experiment:
|
|
|
869
696
|
) -> Dict[str, Any]:
|
|
870
697
|
"""
|
|
871
698
|
Internal method to upload a file.
|
|
699
|
+
Uses buffering if enabled, otherwise uploads directly.
|
|
872
700
|
|
|
873
701
|
Args:
|
|
874
702
|
file_path: Local file path
|
|
@@ -882,43 +710,52 @@ class Experiment:
|
|
|
882
710
|
size_bytes: File size in bytes
|
|
883
711
|
|
|
884
712
|
Returns:
|
|
885
|
-
File metadata dict
|
|
713
|
+
File metadata dict (or pending status if buffering)
|
|
886
714
|
"""
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
experiment_id=self._experiment_id,
|
|
893
|
-
file_path=file_path,
|
|
894
|
-
prefix=prefix,
|
|
895
|
-
filename=filename,
|
|
896
|
-
description=description,
|
|
897
|
-
tags=tags,
|
|
898
|
-
metadata=metadata,
|
|
899
|
-
checksum=checksum,
|
|
900
|
-
content_type=content_type,
|
|
901
|
-
size_bytes=size_bytes,
|
|
715
|
+
# Buffer or upload immediately
|
|
716
|
+
if self._buffer_manager and self._buffer_config.buffer_enabled:
|
|
717
|
+
self._buffer_manager.buffer_file(
|
|
718
|
+
file_path, prefix, filename, description, tags, metadata,
|
|
719
|
+
checksum, content_type, size_bytes
|
|
902
720
|
)
|
|
721
|
+
return {"id": "pending", "status": "queued"}
|
|
722
|
+
else:
|
|
723
|
+
# Immediate upload (backward compatibility)
|
|
724
|
+
result = None
|
|
903
725
|
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
size_bytes=size_bytes,
|
|
919
|
-
)
|
|
726
|
+
if self.run._client:
|
|
727
|
+
# Remote mode: upload to API
|
|
728
|
+
result = self.run._client.upload_file(
|
|
729
|
+
experiment_id=self._experiment_id,
|
|
730
|
+
file_path=file_path,
|
|
731
|
+
prefix=prefix,
|
|
732
|
+
filename=filename,
|
|
733
|
+
description=description,
|
|
734
|
+
tags=tags,
|
|
735
|
+
metadata=metadata,
|
|
736
|
+
checksum=checksum,
|
|
737
|
+
content_type=content_type,
|
|
738
|
+
size_bytes=size_bytes,
|
|
739
|
+
)
|
|
920
740
|
|
|
921
|
-
|
|
741
|
+
if self.run._storage:
|
|
742
|
+
# Local mode: copy to local storage
|
|
743
|
+
result = self.run._storage.write_file(
|
|
744
|
+
owner=self.run.owner,
|
|
745
|
+
project=self.run.project,
|
|
746
|
+
prefix=self.run._folder_path,
|
|
747
|
+
file_path=file_path,
|
|
748
|
+
path=prefix,
|
|
749
|
+
filename=filename,
|
|
750
|
+
description=description,
|
|
751
|
+
tags=tags,
|
|
752
|
+
metadata=metadata,
|
|
753
|
+
checksum=checksum,
|
|
754
|
+
content_type=content_type,
|
|
755
|
+
size_bytes=size_bytes,
|
|
756
|
+
)
|
|
757
|
+
|
|
758
|
+
return result
|
|
922
759
|
|
|
923
760
|
def _list_files(
|
|
924
761
|
self, prefix: Optional[str] = None, tags: Optional[List[str]] = None
|
|
@@ -935,18 +772,18 @@ class Experiment:
|
|
|
935
772
|
"""
|
|
936
773
|
files = []
|
|
937
774
|
|
|
938
|
-
if self._client:
|
|
775
|
+
if self.run._client:
|
|
939
776
|
# Remote mode: fetch from API
|
|
940
|
-
files = self._client.list_files(
|
|
777
|
+
files = self.run._client.list_files(
|
|
941
778
|
experiment_id=self._experiment_id, prefix=prefix, tags=tags
|
|
942
779
|
)
|
|
943
780
|
|
|
944
|
-
if self._storage:
|
|
781
|
+
if self.run._storage:
|
|
945
782
|
# Local mode: read from metadata file
|
|
946
|
-
files = self._storage.list_files(
|
|
947
|
-
owner=self.owner,
|
|
948
|
-
project=self.project,
|
|
949
|
-
prefix=self._folder_path,
|
|
783
|
+
files = self.run._storage.list_files(
|
|
784
|
+
owner=self.run.owner,
|
|
785
|
+
project=self.run.project,
|
|
786
|
+
prefix=self.run._folder_path,
|
|
950
787
|
path_prefix=prefix,
|
|
951
788
|
tags=tags,
|
|
952
789
|
)
|
|
@@ -964,18 +801,18 @@ class Experiment:
|
|
|
964
801
|
Returns:
|
|
965
802
|
Path to downloaded file
|
|
966
803
|
"""
|
|
967
|
-
if self._client:
|
|
804
|
+
if self.run._client:
|
|
968
805
|
# Remote mode: download from API
|
|
969
|
-
return self._client.download_file(
|
|
806
|
+
return self.run._client.download_file(
|
|
970
807
|
experiment_id=self._experiment_id, file_id=file_id, dest_path=dest_path
|
|
971
808
|
)
|
|
972
809
|
|
|
973
|
-
if self._storage:
|
|
810
|
+
if self.run._storage:
|
|
974
811
|
# Local mode: copy from local storage
|
|
975
|
-
return self._storage.read_file(
|
|
976
|
-
owner=self.owner,
|
|
977
|
-
project=self.project,
|
|
978
|
-
prefix=self._folder_path,
|
|
812
|
+
return self.run._storage.read_file(
|
|
813
|
+
owner=self.run.owner,
|
|
814
|
+
project=self.run.project,
|
|
815
|
+
prefix=self.run._folder_path,
|
|
979
816
|
file_id=file_id,
|
|
980
817
|
dest_path=dest_path,
|
|
981
818
|
)
|
|
@@ -994,18 +831,18 @@ class Experiment:
|
|
|
994
831
|
"""
|
|
995
832
|
result = None
|
|
996
833
|
|
|
997
|
-
if self._client:
|
|
834
|
+
if self.run._client:
|
|
998
835
|
# Remote mode: delete via API
|
|
999
|
-
result = self._client.delete_file(
|
|
836
|
+
result = self.run._client.delete_file(
|
|
1000
837
|
experiment_id=self._experiment_id, file_id=file_id
|
|
1001
838
|
)
|
|
1002
839
|
|
|
1003
|
-
if self._storage:
|
|
840
|
+
if self.run._storage:
|
|
1004
841
|
# Local mode: soft delete in metadata
|
|
1005
|
-
result = self._storage.delete_file(
|
|
1006
|
-
owner=self.owner,
|
|
1007
|
-
project=self.project,
|
|
1008
|
-
prefix=self._folder_path,
|
|
842
|
+
result = self.run._storage.delete_file(
|
|
843
|
+
owner=self.run.owner,
|
|
844
|
+
project=self.run.project,
|
|
845
|
+
prefix=self.run._folder_path,
|
|
1009
846
|
file_id=file_id,
|
|
1010
847
|
)
|
|
1011
848
|
|
|
@@ -1032,9 +869,9 @@ class Experiment:
|
|
|
1032
869
|
"""
|
|
1033
870
|
result = None
|
|
1034
871
|
|
|
1035
|
-
if self._client:
|
|
872
|
+
if self.run._client:
|
|
1036
873
|
# Remote mode: update via API
|
|
1037
|
-
result = self._client.update_file(
|
|
874
|
+
result = self.run._client.update_file(
|
|
1038
875
|
experiment_id=self._experiment_id,
|
|
1039
876
|
file_id=file_id,
|
|
1040
877
|
description=description,
|
|
@@ -1042,12 +879,12 @@ class Experiment:
|
|
|
1042
879
|
metadata=metadata,
|
|
1043
880
|
)
|
|
1044
881
|
|
|
1045
|
-
if self._storage:
|
|
882
|
+
if self.run._storage:
|
|
1046
883
|
# Local mode: update in metadata file
|
|
1047
|
-
result = self._storage.update_file_metadata(
|
|
1048
|
-
owner=self.owner,
|
|
1049
|
-
project=self.project,
|
|
1050
|
-
prefix=self._folder_path,
|
|
884
|
+
result = self.run._storage.update_file_metadata(
|
|
885
|
+
owner=self.run.owner,
|
|
886
|
+
project=self.run.project,
|
|
887
|
+
prefix=self.run._folder_path,
|
|
1051
888
|
file_id=file_id,
|
|
1052
889
|
description=description,
|
|
1053
890
|
tags=tags,
|
|
@@ -1063,18 +900,18 @@ class Experiment:
|
|
|
1063
900
|
Args:
|
|
1064
901
|
flattened_params: Already-flattened parameter dict with dot notation
|
|
1065
902
|
"""
|
|
1066
|
-
if self._client:
|
|
903
|
+
if self.run._client:
|
|
1067
904
|
# Remote mode: send to API
|
|
1068
|
-
self._client.set_parameters(
|
|
905
|
+
self.run._client.set_parameters(
|
|
1069
906
|
experiment_id=self._experiment_id, data=flattened_params
|
|
1070
907
|
)
|
|
1071
908
|
|
|
1072
|
-
if self._storage:
|
|
909
|
+
if self.run._storage:
|
|
1073
910
|
# Local mode: write to file
|
|
1074
|
-
self._storage.write_parameters(
|
|
1075
|
-
owner=self.owner,
|
|
1076
|
-
project=self.project,
|
|
1077
|
-
prefix=self._folder_path,
|
|
911
|
+
self.run._storage.write_parameters(
|
|
912
|
+
owner=self.run.owner,
|
|
913
|
+
project=self.run.project,
|
|
914
|
+
prefix=self.run._folder_path,
|
|
1078
915
|
data=flattened_params,
|
|
1079
916
|
)
|
|
1080
917
|
|
|
@@ -1087,23 +924,24 @@ class Experiment:
|
|
|
1087
924
|
"""
|
|
1088
925
|
params = None
|
|
1089
926
|
|
|
1090
|
-
if self._client:
|
|
927
|
+
if self.run._client:
|
|
1091
928
|
# Remote mode: fetch from API
|
|
1092
929
|
try:
|
|
1093
|
-
params = self._client.get_parameters(experiment_id=self._experiment_id)
|
|
930
|
+
params = self.run._client.get_parameters(experiment_id=self._experiment_id)
|
|
1094
931
|
except Exception:
|
|
1095
932
|
# Parameters don't exist yet
|
|
1096
933
|
params = None
|
|
1097
934
|
|
|
1098
|
-
if self._storage:
|
|
935
|
+
if self.run._storage:
|
|
1099
936
|
# Local mode: read from file
|
|
1100
|
-
params = self._storage.read_parameters(
|
|
1101
|
-
owner=self.owner, project=self.project, prefix=self._folder_path
|
|
937
|
+
params = self.run._storage.read_parameters(
|
|
938
|
+
owner=self.run.owner, project=self.run.project, prefix=self.run._folder_path
|
|
1102
939
|
)
|
|
1103
940
|
|
|
1104
941
|
return params
|
|
1105
942
|
|
|
1106
943
|
@property
|
|
944
|
+
@requires_open
|
|
1107
945
|
def metrics(self) -> "MetricsManager":
|
|
1108
946
|
"""
|
|
1109
947
|
Get a MetricsManager for metric operations.
|
|
@@ -1139,17 +977,55 @@ class Experiment:
|
|
|
1139
977
|
"""
|
|
1140
978
|
from .metric import MetricsManager
|
|
1141
979
|
|
|
1142
|
-
if not self._is_open:
|
|
1143
|
-
raise RuntimeError(
|
|
1144
|
-
"Cannot use metrics on closed experiment. "
|
|
1145
|
-
"Use 'with Experiment(...).run as experiment:' or call experiment.run.start() first."
|
|
1146
|
-
)
|
|
1147
|
-
|
|
1148
980
|
# Cache the MetricsManager instance to preserve MetricBuilder cache across calls
|
|
1149
981
|
if self._metrics_manager is None:
|
|
1150
982
|
self._metrics_manager = MetricsManager(self)
|
|
1151
983
|
return self._metrics_manager
|
|
1152
984
|
|
|
985
|
+
@property
|
|
986
|
+
@requires_open
|
|
987
|
+
def tracks(self) -> "TracksManager":
|
|
988
|
+
"""
|
|
989
|
+
Get a TracksManager for timestamped track operations.
|
|
990
|
+
|
|
991
|
+
Supports topic-based logging with automatic timestamp merging:
|
|
992
|
+
- experiment.tracks("robot/position").append(q=[0.1, 0.2], _ts=0.0)
|
|
993
|
+
- experiment.tracks.flush() # Flush all topics
|
|
994
|
+
- experiment.tracks("robot/position").flush() # Flush specific topic
|
|
995
|
+
|
|
996
|
+
Returns:
|
|
997
|
+
TracksManager instance
|
|
998
|
+
|
|
999
|
+
Raises:
|
|
1000
|
+
RuntimeError: If experiment is not open
|
|
1001
|
+
|
|
1002
|
+
Examples:
|
|
1003
|
+
# Log track data with timestamp
|
|
1004
|
+
experiment.tracks("robot/position").append(
|
|
1005
|
+
q=[0.1, -0.22, 0.45],
|
|
1006
|
+
e=[0.5, 0.0, 0.6],
|
|
1007
|
+
_ts=2.0
|
|
1008
|
+
)
|
|
1009
|
+
|
|
1010
|
+
# Entries with same timestamp are automatically merged
|
|
1011
|
+
experiment.tracks("camera/rgb").append(frame_id=0, _ts=0.0)
|
|
1012
|
+
experiment.tracks("camera/rgb").append(path="frame_0.png", _ts=0.0)
|
|
1013
|
+
|
|
1014
|
+
# Read track data
|
|
1015
|
+
data = experiment.tracks("robot/position").read(format="json")
|
|
1016
|
+
|
|
1017
|
+
# Download in different formats
|
|
1018
|
+
jsonl = experiment.tracks("robot/position").read(format="jsonl")
|
|
1019
|
+
parquet = experiment.tracks("robot/position").read(format="parquet")
|
|
1020
|
+
mocap = experiment.tracks("robot/position").read(format="mocap")
|
|
1021
|
+
"""
|
|
1022
|
+
from .track import TracksManager
|
|
1023
|
+
|
|
1024
|
+
# Cache the TracksManager instance to preserve TrackBuilder cache across calls
|
|
1025
|
+
if self._tracks_manager is None:
|
|
1026
|
+
self._tracks_manager = TracksManager(self)
|
|
1027
|
+
return self._tracks_manager
|
|
1028
|
+
|
|
1153
1029
|
def _append_to_metric(
|
|
1154
1030
|
self,
|
|
1155
1031
|
name: Optional[str],
|
|
@@ -1160,6 +1036,7 @@ class Experiment:
|
|
|
1160
1036
|
) -> Optional[Dict[str, Any]]:
|
|
1161
1037
|
"""
|
|
1162
1038
|
Internal method to append a single data point to a metric.
|
|
1039
|
+
Uses buffering if enabled, otherwise writes directly.
|
|
1163
1040
|
|
|
1164
1041
|
Args:
|
|
1165
1042
|
name: Metric name (can be None for unnamed metrics)
|
|
@@ -1169,56 +1046,125 @@ class Experiment:
|
|
|
1169
1046
|
metadata: Optional metadata
|
|
1170
1047
|
|
|
1171
1048
|
Returns:
|
|
1172
|
-
Dict with metricId, index, bufferedDataPoints, chunkSize or None if all backends fail
|
|
1049
|
+
Dict with metricId, index, bufferedDataPoints, chunkSize or None if buffering enabled/all backends fail
|
|
1173
1050
|
"""
|
|
1174
|
-
|
|
1051
|
+
# Buffer or write immediately
|
|
1052
|
+
if self._buffer_manager and self._buffer_config.buffer_enabled:
|
|
1053
|
+
self._buffer_manager.buffer_metric(name, data, description, tags, metadata)
|
|
1054
|
+
return None # No immediate response when buffering
|
|
1055
|
+
else:
|
|
1056
|
+
# Immediate write (backward compatibility)
|
|
1057
|
+
result = None
|
|
1175
1058
|
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1059
|
+
if self.run._client:
|
|
1060
|
+
# Remote mode: append via API
|
|
1061
|
+
try:
|
|
1062
|
+
result = self.run._client.append_to_metric(
|
|
1063
|
+
experiment_id=self._experiment_id,
|
|
1064
|
+
metric_name=name,
|
|
1065
|
+
data=data,
|
|
1066
|
+
description=description,
|
|
1067
|
+
tags=tags,
|
|
1068
|
+
metadata=metadata,
|
|
1069
|
+
)
|
|
1070
|
+
except Exception as e:
|
|
1071
|
+
# Log warning but don't crash training
|
|
1072
|
+
import warnings
|
|
1073
|
+
|
|
1074
|
+
metric_display = f"'{name}'" if name else "unnamed metric"
|
|
1075
|
+
warnings.warn(
|
|
1076
|
+
f"Failed to log {metric_display} to remote server: {e}. "
|
|
1077
|
+
f"Training will continue.",
|
|
1078
|
+
RuntimeWarning,
|
|
1079
|
+
stacklevel=3,
|
|
1080
|
+
)
|
|
1081
|
+
# Fall through to local storage if available
|
|
1198
1082
|
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1083
|
+
if self.run._storage:
|
|
1084
|
+
# Local mode: append to local storage
|
|
1085
|
+
try:
|
|
1086
|
+
result = self.run._storage.append_to_metric(
|
|
1087
|
+
owner=self.run.owner,
|
|
1088
|
+
project=self.run.project,
|
|
1089
|
+
prefix=self.run._folder_path,
|
|
1090
|
+
metric_name=name,
|
|
1091
|
+
data=data,
|
|
1092
|
+
description=description,
|
|
1093
|
+
tags=tags,
|
|
1094
|
+
metadata=metadata,
|
|
1095
|
+
)
|
|
1096
|
+
except Exception as e:
|
|
1097
|
+
import warnings
|
|
1098
|
+
|
|
1099
|
+
metric_display = f"'{name}'" if name else "unnamed metric"
|
|
1100
|
+
warnings.warn(
|
|
1101
|
+
f"Failed to log {metric_display} to local storage: {e}",
|
|
1102
|
+
RuntimeWarning,
|
|
1103
|
+
stacklevel=3,
|
|
1104
|
+
)
|
|
1220
1105
|
|
|
1221
|
-
|
|
1106
|
+
return result
|
|
1107
|
+
|
|
1108
|
+
def _write_track(
|
|
1109
|
+
self,
|
|
1110
|
+
topic: str,
|
|
1111
|
+
timestamp: float,
|
|
1112
|
+
data: Dict[str, Any],
|
|
1113
|
+
) -> None:
|
|
1114
|
+
"""
|
|
1115
|
+
Internal method to write a track entry with timestamp.
|
|
1116
|
+
Uses buffering with timestamp-based merging if enabled.
|
|
1117
|
+
|
|
1118
|
+
Args:
|
|
1119
|
+
topic: Track topic (e.g., "robot/position")
|
|
1120
|
+
timestamp: Entry timestamp
|
|
1121
|
+
data: Data fields
|
|
1122
|
+
|
|
1123
|
+
Note:
|
|
1124
|
+
Entries with the same timestamp are automatically merged in the buffer.
|
|
1125
|
+
"""
|
|
1126
|
+
# Buffer or write immediately
|
|
1127
|
+
if self._buffer_manager and self._buffer_config.buffer_enabled:
|
|
1128
|
+
self._buffer_manager.buffer_track(topic, timestamp, data)
|
|
1129
|
+
else:
|
|
1130
|
+
# Immediate write (no buffering)
|
|
1131
|
+
if self.run._client:
|
|
1132
|
+
# Remote mode: append via API
|
|
1133
|
+
try:
|
|
1134
|
+
self.run._client.append_batch_to_track(
|
|
1135
|
+
experiment_id=self._experiment_id,
|
|
1136
|
+
topic=topic,
|
|
1137
|
+
entries=[{"timestamp": timestamp, **data}],
|
|
1138
|
+
)
|
|
1139
|
+
except Exception as e:
|
|
1140
|
+
# Log warning but don't crash training
|
|
1141
|
+
import warnings
|
|
1142
|
+
|
|
1143
|
+
warnings.warn(
|
|
1144
|
+
f"Failed to log track '{topic}' to remote server: {e}. "
|
|
1145
|
+
f"Training will continue.",
|
|
1146
|
+
RuntimeWarning,
|
|
1147
|
+
stacklevel=3,
|
|
1148
|
+
)
|
|
1149
|
+
|
|
1150
|
+
if self.run._storage:
|
|
1151
|
+
# Local mode: append to local storage
|
|
1152
|
+
try:
|
|
1153
|
+
self.run._storage.append_batch_to_track(
|
|
1154
|
+
owner=self.run.owner,
|
|
1155
|
+
project=self.run.project,
|
|
1156
|
+
prefix=self.run._folder_path,
|
|
1157
|
+
topic=topic,
|
|
1158
|
+
entries=[{"timestamp": timestamp, **data}],
|
|
1159
|
+
)
|
|
1160
|
+
except Exception as e:
|
|
1161
|
+
import warnings
|
|
1162
|
+
|
|
1163
|
+
warnings.warn(
|
|
1164
|
+
f"Failed to log track '{topic}' to local storage: {e}",
|
|
1165
|
+
RuntimeWarning,
|
|
1166
|
+
stacklevel=3,
|
|
1167
|
+
)
|
|
1222
1168
|
|
|
1223
1169
|
def _append_batch_to_metric(
|
|
1224
1170
|
self,
|
|
@@ -1243,10 +1189,10 @@ class Experiment:
|
|
|
1243
1189
|
"""
|
|
1244
1190
|
result = None
|
|
1245
1191
|
|
|
1246
|
-
if self._client:
|
|
1192
|
+
if self.run._client:
|
|
1247
1193
|
# Remote mode: append batch via API
|
|
1248
1194
|
try:
|
|
1249
|
-
result = self._client.append_batch_to_metric(
|
|
1195
|
+
result = self.run._client.append_batch_to_metric(
|
|
1250
1196
|
experiment_id=self._experiment_id,
|
|
1251
1197
|
metric_name=name,
|
|
1252
1198
|
data_points=data_points,
|
|
@@ -1257,22 +1203,23 @@ class Experiment:
|
|
|
1257
1203
|
except Exception as e:
|
|
1258
1204
|
# Log warning but don't crash training
|
|
1259
1205
|
import warnings
|
|
1206
|
+
|
|
1260
1207
|
metric_display = f"'{name}'" if name else "unnamed metric"
|
|
1261
1208
|
warnings.warn(
|
|
1262
1209
|
f"Failed to log batch to {metric_display} on remote server: {e}. "
|
|
1263
1210
|
f"Training will continue.",
|
|
1264
1211
|
RuntimeWarning,
|
|
1265
|
-
stacklevel=3
|
|
1212
|
+
stacklevel=3,
|
|
1266
1213
|
)
|
|
1267
1214
|
# Fall through to local storage if available
|
|
1268
1215
|
|
|
1269
|
-
if self._storage:
|
|
1216
|
+
if self.run._storage:
|
|
1270
1217
|
# Local mode: append batch to local storage
|
|
1271
1218
|
try:
|
|
1272
|
-
result = self._storage.append_batch_to_metric(
|
|
1273
|
-
owner=self.owner,
|
|
1274
|
-
project=self.project,
|
|
1275
|
-
prefix=self._folder_path,
|
|
1219
|
+
result = self.run._storage.append_batch_to_metric(
|
|
1220
|
+
owner=self.run.owner,
|
|
1221
|
+
project=self.run.project,
|
|
1222
|
+
prefix=self.run._folder_path,
|
|
1276
1223
|
metric_name=name,
|
|
1277
1224
|
data_points=data_points,
|
|
1278
1225
|
description=description,
|
|
@@ -1281,11 +1228,12 @@ class Experiment:
|
|
|
1281
1228
|
)
|
|
1282
1229
|
except Exception as e:
|
|
1283
1230
|
import warnings
|
|
1231
|
+
|
|
1284
1232
|
metric_display = f"'{name}'" if name else "unnamed metric"
|
|
1285
1233
|
warnings.warn(
|
|
1286
1234
|
f"Failed to log batch to {metric_display} in local storage: {e}",
|
|
1287
1235
|
RuntimeWarning,
|
|
1288
|
-
stacklevel=3
|
|
1236
|
+
stacklevel=3,
|
|
1289
1237
|
)
|
|
1290
1238
|
|
|
1291
1239
|
return result
|
|
@@ -1306,21 +1254,21 @@ class Experiment:
|
|
|
1306
1254
|
"""
|
|
1307
1255
|
result = None
|
|
1308
1256
|
|
|
1309
|
-
if self._client:
|
|
1257
|
+
if self.run._client:
|
|
1310
1258
|
# Remote mode: read via API
|
|
1311
|
-
result = self._client.read_metric_data(
|
|
1259
|
+
result = self.run._client.read_metric_data(
|
|
1312
1260
|
experiment_id=self._experiment_id,
|
|
1313
1261
|
metric_name=name,
|
|
1314
1262
|
start_index=start_index,
|
|
1315
1263
|
limit=limit,
|
|
1316
1264
|
)
|
|
1317
1265
|
|
|
1318
|
-
if self._storage:
|
|
1266
|
+
if self.run._storage:
|
|
1319
1267
|
# Local mode: read from local storage
|
|
1320
|
-
result = self._storage.read_metric_data(
|
|
1321
|
-
owner=self.owner,
|
|
1322
|
-
project=self.project,
|
|
1323
|
-
prefix=self._folder_path,
|
|
1268
|
+
result = self.run._storage.read_metric_data(
|
|
1269
|
+
owner=self.run.owner,
|
|
1270
|
+
project=self.run.project,
|
|
1271
|
+
prefix=self.run._folder_path,
|
|
1324
1272
|
metric_name=name,
|
|
1325
1273
|
start_index=start_index,
|
|
1326
1274
|
limit=limit,
|
|
@@ -1340,18 +1288,18 @@ class Experiment:
|
|
|
1340
1288
|
"""
|
|
1341
1289
|
result = None
|
|
1342
1290
|
|
|
1343
|
-
if self._client:
|
|
1291
|
+
if self.run._client:
|
|
1344
1292
|
# Remote mode: get stats via API
|
|
1345
|
-
result = self._client.get_metric_stats(
|
|
1293
|
+
result = self.run._client.get_metric_stats(
|
|
1346
1294
|
experiment_id=self._experiment_id, metric_name=name
|
|
1347
1295
|
)
|
|
1348
1296
|
|
|
1349
|
-
if self._storage:
|
|
1297
|
+
if self.run._storage:
|
|
1350
1298
|
# Local mode: get stats from local storage
|
|
1351
|
-
result = self._storage.get_metric_stats(
|
|
1352
|
-
owner=self.owner,
|
|
1353
|
-
project=self.project,
|
|
1354
|
-
prefix=self._folder_path,
|
|
1299
|
+
result = self.run._storage.get_metric_stats(
|
|
1300
|
+
owner=self.run.owner,
|
|
1301
|
+
project=self.run.project,
|
|
1302
|
+
prefix=self.run._folder_path,
|
|
1355
1303
|
metric_name=name,
|
|
1356
1304
|
)
|
|
1357
1305
|
|
|
@@ -1366,18 +1314,104 @@ class Experiment:
|
|
|
1366
1314
|
"""
|
|
1367
1315
|
result = None
|
|
1368
1316
|
|
|
1369
|
-
if self._client:
|
|
1317
|
+
if self.run._client:
|
|
1370
1318
|
# Remote mode: list via API
|
|
1371
|
-
result = self._client.list_metrics(experiment_id=self._experiment_id)
|
|
1319
|
+
result = self.run._client.list_metrics(experiment_id=self._experiment_id)
|
|
1372
1320
|
|
|
1373
|
-
if self._storage:
|
|
1321
|
+
if self.run._storage:
|
|
1374
1322
|
# Local mode: list from local storage
|
|
1375
|
-
result = self._storage.list_metrics(
|
|
1376
|
-
owner=self.owner, project=self.project, prefix=self._folder_path
|
|
1323
|
+
result = self.run._storage.list_metrics(
|
|
1324
|
+
owner=self.run.owner, project=self.run.project, prefix=self.run._folder_path
|
|
1377
1325
|
)
|
|
1378
1326
|
|
|
1379
1327
|
return result or []
|
|
1380
1328
|
|
|
1329
|
+
@property
|
|
1330
|
+
def owner(self) -> Optional[str]:
|
|
1331
|
+
"""Get the owner (first segment of prefix)."""
|
|
1332
|
+
return self.run.owner
|
|
1333
|
+
|
|
1334
|
+
@owner.setter
|
|
1335
|
+
def owner(self, value: str) -> None:
|
|
1336
|
+
"""Set the owner."""
|
|
1337
|
+
self.run.owner = value
|
|
1338
|
+
|
|
1339
|
+
@property
|
|
1340
|
+
def project(self) -> Optional[str]:
|
|
1341
|
+
"""Get the project (second segment of prefix or RUN.project)."""
|
|
1342
|
+
return self.run.project
|
|
1343
|
+
|
|
1344
|
+
@project.setter
|
|
1345
|
+
def project(self, value: str) -> None:
|
|
1346
|
+
"""Set the project."""
|
|
1347
|
+
self.run.project = value
|
|
1348
|
+
|
|
1349
|
+
@property
|
|
1350
|
+
def name(self) -> Optional[str]:
|
|
1351
|
+
"""Get the experiment name (last segment of prefix)."""
|
|
1352
|
+
return self.run.name
|
|
1353
|
+
|
|
1354
|
+
@name.setter
|
|
1355
|
+
def name(self, value: str) -> None:
|
|
1356
|
+
"""Set the name."""
|
|
1357
|
+
self.run.name = value
|
|
1358
|
+
|
|
1359
|
+
@property
|
|
1360
|
+
def _folder_path(self) -> Optional[str]:
|
|
1361
|
+
"""Get the full folder path (same as prefix)."""
|
|
1362
|
+
return self.run._folder_path
|
|
1363
|
+
|
|
1364
|
+
@_folder_path.setter
|
|
1365
|
+
def _folder_path(self, value: str) -> None:
|
|
1366
|
+
"""Set the full folder path and re-parse into components."""
|
|
1367
|
+
self.run._folder_path = value
|
|
1368
|
+
self.run.prefix = value
|
|
1369
|
+
# Re-parse prefix into components
|
|
1370
|
+
if value:
|
|
1371
|
+
parts = value.strip("/").split("/")
|
|
1372
|
+
if len(parts) >= 2:
|
|
1373
|
+
self.run.owner = parts[0]
|
|
1374
|
+
self.run.project = parts[1]
|
|
1375
|
+
self.run.name = parts[-1] if len(parts) > 2 else parts[1]
|
|
1376
|
+
|
|
1377
|
+
@property
|
|
1378
|
+
def _client(self):
|
|
1379
|
+
"""Get the remote client."""
|
|
1380
|
+
return self.run._client
|
|
1381
|
+
|
|
1382
|
+
@_client.setter
|
|
1383
|
+
def _client(self, value) -> None:
|
|
1384
|
+
"""Set the remote client."""
|
|
1385
|
+
self.run._client = value
|
|
1386
|
+
|
|
1387
|
+
@property
|
|
1388
|
+
def _storage(self):
|
|
1389
|
+
"""Get the local storage."""
|
|
1390
|
+
return self.run._storage
|
|
1391
|
+
|
|
1392
|
+
@_storage.setter
|
|
1393
|
+
def _storage(self, value) -> None:
|
|
1394
|
+
"""Set the local storage."""
|
|
1395
|
+
self.run._storage = value
|
|
1396
|
+
|
|
1397
|
+
def flush(self) -> None:
|
|
1398
|
+
"""
|
|
1399
|
+
Manually flush all buffered data.
|
|
1400
|
+
|
|
1401
|
+
Forces immediate flush of all queued logs, metrics, and files.
|
|
1402
|
+
Waits for all file uploads to complete.
|
|
1403
|
+
|
|
1404
|
+
Examples:
|
|
1405
|
+
with Experiment("my-project/exp").run as exp:
|
|
1406
|
+
for epoch in range(100):
|
|
1407
|
+
exp.metrics("train").log(loss=loss)
|
|
1408
|
+
|
|
1409
|
+
exp.flush() # Ensure metrics written before checkpoint
|
|
1410
|
+
torch.save(model, "model.pt")
|
|
1411
|
+
"""
|
|
1412
|
+
if self._buffer_manager:
|
|
1413
|
+
self._buffer_manager.flush_all()
|
|
1414
|
+
|
|
1381
1415
|
@property
|
|
1382
1416
|
def id(self) -> Optional[str]:
|
|
1383
1417
|
"""Get the experiment ID (only available after open in remote mode)."""
|