ml-dash 0.6.6__py3-none-any.whl → 0.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ml_dash/experiment.py CHANGED
@@ -13,6 +13,7 @@ from enum import Enum
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union
 
+from .buffer import BackgroundBufferManager, BufferConfig
 from .client import RemoteClient
 from .files import BindrsBuilder, FilesAccessor
 from .log import LogBuilder, LogLevel
@@ -212,6 +213,11 @@ class Experiment:
         self._experiment_data: Optional[Dict[str, Any]] = None
         self._is_open = False
         self._metrics_manager: Optional["MetricsManager"] = None  # Cached metrics manager
+        self._tracks_manager: Optional["TracksManager"] = None  # Cached tracks manager
+
+        # Initialize buffer manager
+        self._buffer_config = BufferConfig.from_env()
+        self._buffer_manager: Optional[BackgroundBufferManager] = None
 
         if self.mode in (OperationMode.REMOTE, OperationMode.HYBRID):
             # RemoteClient will autoload token from ~/.dash/token.enc
@@ -324,6 +330,11 @@ class Experiment:
             metadata=self.metadata,
         )
 
+        # Start background buffer
+        if self._buffer_config.buffer_enabled:
+            self._buffer_manager = BackgroundBufferManager(self, self._buffer_config)
+            self._buffer_manager.start()
+
         self._is_open = True
         return self
 
@@ -343,6 +354,11 @@ class Experiment:
         # if self.run._storage:
         # self.run._storage.flush()
 
+        # Flush and stop buffer BEFORE status update
+        # Waits indefinitely for all data to be flushed (important for large files)
+        if self._buffer_manager:
+            self._buffer_manager.stop()
+
         # Update experiment status in remote mode
         if self.run._client and self._experiment_id:
             try:
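The three hunks above wire the buffer into the experiment lifecycle: __init__ reads the configuration, open() starts a BackgroundBufferManager when buffering is enabled, and close() stops it (blocking until everything is flushed) before the status update. The sketch below condenses that flow in one place; only the names that appear in this diff (BufferConfig.from_env, buffer_enabled, BackgroundBufferManager, start, stop) come from the library, the surrounding class is illustrative, not library code.

# Lifecycle sketch (assumes ml-dash is installed; mirrors the wiring added in this diff).
from ml_dash.buffer import BackgroundBufferManager, BufferConfig

class BufferedLifecycleSketch:
    def __init__(self):
        self._buffer_config = BufferConfig.from_env()   # settings read from the environment
        self._buffer_manager = None

    def open(self):
        if self._buffer_config.buffer_enabled:          # buffering is opt-in
            self._buffer_manager = BackgroundBufferManager(self, self._buffer_config)
            self._buffer_manager.start()                # background flushing begins here

    def close(self):
        if self._buffer_manager:
            self._buffer_manager.stop()                 # waits for all queued data, then stops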
@@ -500,8 +516,8 @@ class Experiment:
         timestamp: Optional[datetime],
     ) -> None:
         """
-        Internal method to write a log entry immediately.
-        No buffering - writes directly to storage/remote AND stdout/stderr.
+        Internal method to write a log entry.
+        Uses buffering if enabled, otherwise writes directly.
 
         Args:
             message: Log message
@@ -509,55 +525,59 @@ class Experiment:
             metadata: Optional metadata dict
             timestamp: Optional custom timestamp (defaults to now)
         """
-        log_entry = {
-            "timestamp": (timestamp or datetime.utcnow()).isoformat() + "Z",
-            "level": level,
-            "message": message,
-        }
-
-        if metadata:
-            log_entry["metadata"] = metadata
-
-        # Mirror to stdout/stderr before writing to storage
+        # Print to console immediately (user visibility)
         self._print_log(message, level, metadata)
 
-        # Write immediately (no buffering)
-        if self.run._client:
-            # Remote mode: send to API (wrapped in array for batch API)
-            try:
-                self.run._client.create_log_entries(
-                    experiment_id=self._experiment_id,
-                    logs=[log_entry],  # Single log in array
-                )
-            except Exception as e:
-                # Log warning but don't crash training
-                import warnings
-
-                warnings.warn(
-                    f"Failed to write log to remote server: {e}. Training will continue.",
-                    RuntimeWarning,
-                    stacklevel=4,
-                )
-                # Fall through to local storage if available
+        # Buffer or write immediately
+        if self._buffer_manager and self._buffer_config.buffer_enabled:
+            self._buffer_manager.buffer_log(message, level, metadata, timestamp)
+        else:
+            # Immediate write (backward compatibility)
+            log_entry = {
+                "timestamp": (timestamp or datetime.utcnow()).isoformat() + "Z",
+                "level": level,
+                "message": message,
+            }
+
+            if metadata:
+                log_entry["metadata"] = metadata
+
+            if self.run._client:
+                # Remote mode: send to API (wrapped in array for batch API)
+                try:
+                    self.run._client.create_log_entries(
+                        experiment_id=self._experiment_id,
+                        logs=[log_entry],  # Single log in array
+                    )
+                except Exception as e:
+                    # Log warning but don't crash training
+                    import warnings
+
+                    warnings.warn(
+                        f"Failed to write log to remote server: {e}. Training will continue.",
+                        RuntimeWarning,
+                        stacklevel=4,
+                    )
+                    # Fall through to local storage if available
 
-        if self.run._storage:
-            # Local mode: write to file immediately
-            try:
-                self.run._storage.write_log(
-                    owner=self.run.owner,
-                    project=self.run.project,
-                    prefix=self.run._folder_path,
-                    message=log_entry["message"],
-                    level=log_entry["level"],
-                    metadata=log_entry.get("metadata"),
-                    timestamp=log_entry["timestamp"],
-                )
-            except Exception as e:
-                import warnings
+            if self.run._storage:
+                # Local mode: write to file immediately
+                try:
+                    self.run._storage.write_log(
+                        owner=self.run.owner,
+                        project=self.run.project,
+                        prefix=self.run._folder_path,
+                        message=log_entry["message"],
+                        level=log_entry["level"],
+                        metadata=log_entry.get("metadata"),
+                        timestamp=log_entry["timestamp"],
+                    )
+                except Exception as e:
+                    import warnings
 
-                warnings.warn(
-                    f"Failed to write log to local storage: {e}", RuntimeWarning, stacklevel=4
-                )
+                    warnings.warn(
+                        f"Failed to write log to local storage: {e}", RuntimeWarning, stacklevel=4
+                    )
 
     def _print_log(
         self, message: str, level: str, metadata: Optional[Dict[str, Any]]
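The net effect of the _write_log rewrite: console output stays synchronous via _print_log, while persistence is handed to the background buffer whenever it is enabled. Below is a condensed, illustrative restatement of the new control flow; the names are taken from the diff and the ellipsis stands for the unchanged immediate-write branch.

# Control-flow sketch of the new _write_log, not the full method.
def _write_log_sketch(exp, message, level, metadata=None, timestamp=None):
    exp._print_log(message, level, metadata)  # console mirror is never deferred
    if exp._buffer_manager and exp._buffer_config.buffer_enabled:
        # queued; persisted later by the background manager or on flush()/close()
        exp._buffer_manager.buffer_log(message, level, metadata, timestamp)
    else:
        ...  # previous behaviour: create_log_entries() / write_log() run immediately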
@@ -676,6 +696,7 @@ class Experiment:
     ) -> Dict[str, Any]:
         """
         Internal method to upload a file.
+        Uses buffering if enabled, otherwise uploads directly.
 
         Args:
             file_path: Local file path
@@ -689,43 +710,52 @@ class Experiment:
             size_bytes: File size in bytes
 
         Returns:
-            File metadata dict
+            File metadata dict (or pending status if buffering)
         """
-        result = None
-
-        if self.run._client:
-            # Remote mode: upload to API
-            result = self.run._client.upload_file(
-                experiment_id=self._experiment_id,
-                file_path=file_path,
-                prefix=prefix,
-                filename=filename,
-                description=description,
-                tags=tags,
-                metadata=metadata,
-                checksum=checksum,
-                content_type=content_type,
-                size_bytes=size_bytes,
+        # Buffer or upload immediately
+        if self._buffer_manager and self._buffer_config.buffer_enabled:
+            self._buffer_manager.buffer_file(
+                file_path, prefix, filename, description, tags, metadata,
+                checksum, content_type, size_bytes
             )
+            return {"id": "pending", "status": "queued"}
+        else:
+            # Immediate upload (backward compatibility)
+            result = None
 
-        if self.run._storage:
-            # Local mode: copy to local storage
-            result = self.run._storage.write_file(
-                owner=self.run.owner,
-                project=self.run.project,
-                prefix=self.run._folder_path,
-                file_path=file_path,
-                path=prefix,
-                filename=filename,
-                description=description,
-                tags=tags,
-                metadata=metadata,
-                checksum=checksum,
-                content_type=content_type,
-                size_bytes=size_bytes,
-            )
+            if self.run._client:
+                # Remote mode: upload to API
+                result = self.run._client.upload_file(
+                    experiment_id=self._experiment_id,
+                    file_path=file_path,
+                    prefix=prefix,
+                    filename=filename,
+                    description=description,
+                    tags=tags,
+                    metadata=metadata,
+                    checksum=checksum,
+                    content_type=content_type,
+                    size_bytes=size_bytes,
+                )
 
-        return result
+            if self.run._storage:
+                # Local mode: copy to local storage
+                result = self.run._storage.write_file(
+                    owner=self.run.owner,
+                    project=self.run.project,
+                    prefix=self.run._folder_path,
+                    file_path=file_path,
+                    path=prefix,
+                    filename=filename,
+                    description=description,
+                    tags=tags,
+                    metadata=metadata,
+                    checksum=checksum,
+                    content_type=content_type,
+                    size_bytes=size_bytes,
+                )
+
+            return result
 
     def _list_files(
         self, prefix: Optional[str] = None, tags: Optional[List[str]] = None
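One caller-visible consequence of buffering file uploads: _upload_file now returns a {"id": "pending", "status": "queued"} placeholder instead of the real file metadata. Code that inspects the result should handle that case; flush(), added near the end of this diff, forces queued uploads to complete. The helper below is hypothetical and only illustrates the check.

# Sketch: wait for a buffered upload before relying on its metadata.
def ensure_uploaded(exp, result):
    # result comes from a file save; with buffering on it is {"id": "pending", "status": "queued"}
    if result.get("status") == "queued":
        exp.flush()  # flush() blocks until queued files are actually uploaded
    return result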
@@ -952,6 +982,50 @@ class Experiment:
         self._metrics_manager = MetricsManager(self)
         return self._metrics_manager
 
+    @property
+    @requires_open
+    def tracks(self) -> "TracksManager":
+        """
+        Get a TracksManager for timestamped track operations.
+
+        Supports topic-based logging with automatic timestamp merging:
+        - experiment.tracks("robot/position").append(q=[0.1, 0.2], _ts=0.0)
+        - experiment.tracks.flush()  # Flush all topics
+        - experiment.tracks("robot/position").flush()  # Flush specific topic
+
+        Returns:
+            TracksManager instance
+
+        Raises:
+            RuntimeError: If experiment is not open
+
+        Examples:
+            # Log track data with timestamp
+            experiment.tracks("robot/position").append(
+                q=[0.1, -0.22, 0.45],
+                e=[0.5, 0.0, 0.6],
+                _ts=2.0
+            )
+
+            # Entries with same timestamp are automatically merged
+            experiment.tracks("camera/rgb").append(frame_id=0, _ts=0.0)
+            experiment.tracks("camera/rgb").append(path="frame_0.png", _ts=0.0)
+
+            # Read track data
+            data = experiment.tracks("robot/position").read(format="json")
+
+            # Download in different formats
+            jsonl = experiment.tracks("robot/position").read(format="jsonl")
+            parquet = experiment.tracks("robot/position").read(format="parquet")
+            mocap = experiment.tracks("robot/position").read(format="mocap")
+        """
+        from .track import TracksManager
+
+        # Cache the TracksManager instance to preserve TrackBuilder cache across calls
+        if self._tracks_manager is None:
+            self._tracks_manager = TracksManager(self)
+        return self._tracks_manager
+
 
     def _append_to_metric(
         self,
@@ -962,6 +1036,7 @@ class Experiment:
     ) -> Optional[Dict[str, Any]]:
         """
         Internal method to append a single data point to a metric.
+        Uses buffering if enabled, otherwise writes directly.
 
         Args:
             name: Metric name (can be None for unnamed metrics)
@@ -971,58 +1046,125 @@ class Experiment:
             metadata: Optional metadata
 
         Returns:
-            Dict with metricId, index, bufferedDataPoints, chunkSize or None if all backends fail
+            Dict with metricId, index, bufferedDataPoints, chunkSize or None if buffering enabled/all backends fail
         """
-        result = None
+        # Buffer or write immediately
+        if self._buffer_manager and self._buffer_config.buffer_enabled:
+            self._buffer_manager.buffer_metric(name, data, description, tags, metadata)
+            return None  # No immediate response when buffering
+        else:
+            # Immediate write (backward compatibility)
+            result = None
 
-        if self.run._client:
-            # Remote mode: append via API
-            try:
-                result = self.run._client.append_to_metric(
-                    experiment_id=self._experiment_id,
-                    metric_name=name,
-                    data=data,
-                    description=description,
-                    tags=tags,
-                    metadata=metadata,
-                )
-            except Exception as e:
-                # Log warning but don't crash training
-                import warnings
+            if self.run._client:
+                # Remote mode: append via API
+                try:
+                    result = self.run._client.append_to_metric(
+                        experiment_id=self._experiment_id,
+                        metric_name=name,
+                        data=data,
+                        description=description,
+                        tags=tags,
+                        metadata=metadata,
+                    )
+                except Exception as e:
+                    # Log warning but don't crash training
+                    import warnings
+
+                    metric_display = f"'{name}'" if name else "unnamed metric"
+                    warnings.warn(
+                        f"Failed to log {metric_display} to remote server: {e}. "
+                        f"Training will continue.",
+                        RuntimeWarning,
+                        stacklevel=3,
+                    )
+                    # Fall through to local storage if available
 
-                metric_display = f"'{name}'" if name else "unnamed metric"
-                warnings.warn(
-                    f"Failed to log {metric_display} to remote server: {e}. "
-                    f"Training will continue.",
-                    RuntimeWarning,
-                    stacklevel=3,
-                )
-                # Fall through to local storage if available
+            if self.run._storage:
+                # Local mode: append to local storage
+                try:
+                    result = self.run._storage.append_to_metric(
+                        owner=self.run.owner,
+                        project=self.run.project,
+                        prefix=self.run._folder_path,
+                        metric_name=name,
+                        data=data,
+                        description=description,
+                        tags=tags,
+                        metadata=metadata,
+                    )
+                except Exception as e:
+                    import warnings
+
+                    metric_display = f"'{name}'" if name else "unnamed metric"
+                    warnings.warn(
+                        f"Failed to log {metric_display} to local storage: {e}",
+                        RuntimeWarning,
+                        stacklevel=3,
+                    )
 
-        if self.run._storage:
-            # Local mode: append to local storage
-            try:
-                result = self.run._storage.append_to_metric(
-                    owner=self.run.owner,
-                    project=self.run.project,
-                    prefix=self.run._folder_path,
-                    metric_name=name,
-                    data=data,
-                    description=description,
-                    tags=tags,
-                    metadata=metadata,
-                )
-            except Exception as e:
-                import warnings
+            return result
 
-                metric_display = f"'{name}'" if name else "unnamed metric"
-                warnings.warn(
-                    f"Failed to log {metric_display} to local storage: {e}",
-                    RuntimeWarning,
-                    stacklevel=3,
-                )
+    def _write_track(
+        self,
+        topic: str,
+        timestamp: float,
+        data: Dict[str, Any],
+    ) -> None:
+        """
+        Internal method to write a track entry with timestamp.
+        Uses buffering with timestamp-based merging if enabled.
 
-        return result
+        Args:
+            topic: Track topic (e.g., "robot/position")
+            timestamp: Entry timestamp
+            data: Data fields
+
+        Note:
+            Entries with the same timestamp are automatically merged in the buffer.
+        """
+        # Buffer or write immediately
+        if self._buffer_manager and self._buffer_config.buffer_enabled:
+            self._buffer_manager.buffer_track(topic, timestamp, data)
+        else:
+            # Immediate write (no buffering)
+            if self.run._client:
+                # Remote mode: append via API
+                try:
+                    self.run._client.append_batch_to_track(
+                        experiment_id=self._experiment_id,
+                        topic=topic,
+                        entries=[{"timestamp": timestamp, **data}],
+                    )
+                except Exception as e:
+                    # Log warning but don't crash training
+                    import warnings
+
+                    warnings.warn(
+                        f"Failed to log track '{topic}' to remote server: {e}. "
+                        f"Training will continue.",
+                        RuntimeWarning,
+                        stacklevel=3,
+                    )
+
+            if self.run._storage:
+                # Local mode: append to local storage
+                try:
+                    self.run._storage.append_batch_to_track(
+                        owner=self.run.owner,
+                        project=self.run.project,
+                        prefix=self.run._folder_path,
+                        topic=topic,
+                        entries=[{"timestamp": timestamp, **data}],
+                    )
+                except Exception as e:
+                    import warnings
+
+                    warnings.warn(
+                        f"Failed to log track '{topic}' to local storage: {e}",
+                        RuntimeWarning,
+                        stacklevel=3,
+                    )
 
     def _append_batch_to_metric(
         self,
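A second return-value change worth noting: with buffering active, _append_to_metric queues the point via buffer_metric and returns None rather than the usual response dict, so callers that read metricId, index, or bufferedDataPoints need a guard. Minimal sketch; the positional argument order mirrors buffer_metric(name, data, description, tags, metadata) from the diff and is illustrative only.

# Guarding against the buffered path's None return (internal API, sketch only).
def log_metric_point(exp, name, data):
    resp = exp._append_to_metric(name, data, None, None, None)
    if resp is None:
        return None  # buffered (or all backends failed): no server-assigned index yet
    return resp["metricId"], resp["index"]  # immediate-write path, same keys as before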
@@ -1252,6 +1394,24 @@ class Experiment:
         """Set the local storage."""
         self.run._storage = value
 
+    def flush(self) -> None:
+        """
+        Manually flush all buffered data.
+
+        Forces immediate flush of all queued logs, metrics, and files.
+        Waits for all file uploads to complete.
+
+        Examples:
+            with Experiment("my-project/exp").run as exp:
+                for epoch in range(100):
+                    exp.metrics("train").log(loss=loss)
+
+                exp.flush()  # Ensure metrics written before checkpoint
+                torch.save(model, "model.pt")
+        """
+        if self._buffer_manager:
+            self._buffer_manager.flush_all()
+
     @property
     def id(self) -> Optional[str]:
         """Get the experiment ID (only available after open in remote mode)."""