ml-dash 0.6.13__py3-none-any.whl → 0.6.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml_dash/__init__.py +2 -2
- ml_dash/buffer.py +160 -51
- ml_dash/cli_commands/upload.py +2 -1
- ml_dash/client.py +14 -4
- ml_dash/experiment.py +37 -66
- ml_dash/metric.py +10 -6
- {ml_dash-0.6.13.dist-info → ml_dash-0.6.15.dist-info}/METADATA +1 -1
- {ml_dash-0.6.13.dist-info → ml_dash-0.6.15.dist-info}/RECORD +10 -10
- {ml_dash-0.6.13.dist-info → ml_dash-0.6.15.dist-info}/WHEEL +2 -2
- {ml_dash-0.6.13.dist-info → ml_dash-0.6.15.dist-info}/entry_points.txt +0 -0
ml_dash/__init__.py
CHANGED
|
@@ -43,11 +43,11 @@ from .params import ParametersBuilder
|
|
|
43
43
|
from .run import RUN
|
|
44
44
|
from .storage import LocalStorage
|
|
45
45
|
|
|
46
|
-
__version__ = "0.6.
|
|
46
|
+
__version__ = "0.6.14"
|
|
47
47
|
|
|
48
48
|
# Required version - MUST match exactly (blocks all older versions)
|
|
49
49
|
# Update this with EVERY release to force users to upgrade
|
|
50
|
-
REQUIRED_VERSION = "0.6.
|
|
50
|
+
REQUIRED_VERSION = "0.6.14"
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
def _check_version_compatibility():
|
ml_dash/buffer.py
CHANGED
|
@@ -57,6 +57,11 @@ def _serialize_value(value: Any) -> Any:
|
|
|
57
57
|
class BufferConfig:
|
|
58
58
|
"""Configuration for buffering behavior."""
|
|
59
59
|
|
|
60
|
+
# Internal constants for queue management (not exposed to users)
|
|
61
|
+
_MAX_QUEUE_SIZE = 100000 # Maximum items before blocking
|
|
62
|
+
_WARNING_THRESHOLD = 80000 # Warn at 80% capacity
|
|
63
|
+
_AGGRESSIVE_FLUSH_THRESHOLD = 50000 # Trigger immediate flush at 50% capacity
|
|
64
|
+
|
|
60
65
|
def __init__(
|
|
61
66
|
self,
|
|
62
67
|
flush_interval: float = 5.0,
|
|
@@ -114,17 +119,20 @@ class BackgroundBufferManager:
|
|
|
114
119
|
self._experiment = experiment
|
|
115
120
|
self._config = config
|
|
116
121
|
|
|
117
|
-
# Resource-specific queues
|
|
118
|
-
self._log_queue: Queue = Queue()
|
|
122
|
+
# Resource-specific queues with bounded size to prevent OOM
|
|
123
|
+
self._log_queue: Queue = Queue(maxsize=config._MAX_QUEUE_SIZE)
|
|
119
124
|
self._metric_queues: Dict[Optional[str], Queue] = {} # Per-metric queues
|
|
120
125
|
self._track_buffers: Dict[str, Dict[float, Dict[str, Any]]] = {} # Per-topic: {timestamp: merged_data}
|
|
121
|
-
self._file_queue: Queue = Queue()
|
|
126
|
+
self._file_queue: Queue = Queue(maxsize=config._MAX_QUEUE_SIZE)
|
|
122
127
|
|
|
123
128
|
# Track last flush times per resource type
|
|
124
129
|
self._last_log_flush = time.time()
|
|
125
130
|
self._last_metric_flush: Dict[Optional[str], float] = {}
|
|
126
131
|
self._last_track_flush: Dict[str, float] = {} # Per-topic flush times
|
|
127
132
|
|
|
133
|
+
# Track warnings to avoid spamming
|
|
134
|
+
self._warned_queues: set = set()
|
|
135
|
+
|
|
128
136
|
# Background thread control
|
|
129
137
|
self._thread: Optional[threading.Thread] = None
|
|
130
138
|
self._stop_event = threading.Event()
|
|
@@ -184,6 +192,34 @@ class BackgroundBufferManager:
|
|
|
184
192
|
|
|
185
193
|
self._thread = None
|
|
186
194
|
|
|
195
|
+
def _check_queue_pressure(self, queue: Queue, queue_name: str) -> None:
|
|
196
|
+
"""
|
|
197
|
+
Check queue size and trigger aggressive flushing if needed.
|
|
198
|
+
|
|
199
|
+
This prevents OOM by flushing immediately when queue fills up.
|
|
200
|
+
|
|
201
|
+
Args:
|
|
202
|
+
queue: The queue to check
|
|
203
|
+
queue_name: Name for warning messages
|
|
204
|
+
"""
|
|
205
|
+
qsize = queue.qsize()
|
|
206
|
+
|
|
207
|
+
# Trigger immediate flush if queue is getting full
|
|
208
|
+
if qsize >= self._config._AGGRESSIVE_FLUSH_THRESHOLD:
|
|
209
|
+
self._flush_event.set()
|
|
210
|
+
|
|
211
|
+
# Warn once if queue is filling up (80% capacity)
|
|
212
|
+
if qsize >= self._config._WARNING_THRESHOLD:
|
|
213
|
+
if queue_name not in self._warned_queues:
|
|
214
|
+
warnings.warn(
|
|
215
|
+
f"[ML-Dash] {queue_name} queue is {qsize}/{self._config._MAX_QUEUE_SIZE} full. "
|
|
216
|
+
f"Data is being generated faster than it can be flushed. "
|
|
217
|
+
f"Consider reducing logging frequency or the background flush will block to prevent OOM.",
|
|
218
|
+
RuntimeWarning,
|
|
219
|
+
stacklevel=3
|
|
220
|
+
)
|
|
221
|
+
self._warned_queues.add(queue_name)
|
|
222
|
+
|
|
187
223
|
def buffer_log(
|
|
188
224
|
self,
|
|
189
225
|
message: str,
|
|
@@ -192,7 +228,10 @@ class BackgroundBufferManager:
|
|
|
192
228
|
timestamp: Optional[datetime],
|
|
193
229
|
) -> None:
|
|
194
230
|
"""
|
|
195
|
-
Add log to buffer
|
|
231
|
+
Add log to buffer with automatic backpressure.
|
|
232
|
+
|
|
233
|
+
If queue is full, this will block until space is available.
|
|
234
|
+
This prevents OOM when logs are generated faster than they can be flushed.
|
|
196
235
|
|
|
197
236
|
Args:
|
|
198
237
|
message: Log message
|
|
@@ -200,6 +239,9 @@ class BackgroundBufferManager:
|
|
|
200
239
|
metadata: Optional metadata
|
|
201
240
|
timestamp: Optional timestamp
|
|
202
241
|
"""
|
|
242
|
+
# Check queue pressure and trigger aggressive flushing if needed
|
|
243
|
+
self._check_queue_pressure(self._log_queue, "Log")
|
|
244
|
+
|
|
203
245
|
log_entry = {
|
|
204
246
|
"timestamp": (timestamp or datetime.utcnow()).isoformat() + "Z",
|
|
205
247
|
"level": level,
|
|
@@ -209,6 +251,7 @@ class BackgroundBufferManager:
|
|
|
209
251
|
if metadata:
|
|
210
252
|
log_entry["metadata"] = metadata
|
|
211
253
|
|
|
254
|
+
# Will block if queue is full (backpressure to prevent OOM)
|
|
212
255
|
self._log_queue.put(log_entry)
|
|
213
256
|
|
|
214
257
|
def buffer_metric(
|
|
@@ -220,7 +263,10 @@ class BackgroundBufferManager:
|
|
|
220
263
|
metadata: Optional[Dict[str, Any]],
|
|
221
264
|
) -> None:
|
|
222
265
|
"""
|
|
223
|
-
Add metric datapoint to buffer
|
|
266
|
+
Add metric datapoint to buffer with automatic backpressure.
|
|
267
|
+
|
|
268
|
+
If queue is full, this will block until space is available.
|
|
269
|
+
This prevents OOM when metrics are generated faster than they can be flushed.
|
|
224
270
|
|
|
225
271
|
Args:
|
|
226
272
|
metric_name: Metric name (can be None for unnamed metrics)
|
|
@@ -229,11 +275,18 @@ class BackgroundBufferManager:
|
|
|
229
275
|
tags: Optional tags
|
|
230
276
|
metadata: Optional metadata
|
|
231
277
|
"""
|
|
232
|
-
# Get or create queue for this metric
|
|
278
|
+
# Get or create queue for this metric (with bounded size)
|
|
233
279
|
if metric_name not in self._metric_queues:
|
|
234
|
-
self._metric_queues[metric_name] = Queue()
|
|
280
|
+
self._metric_queues[metric_name] = Queue(maxsize=self._config._MAX_QUEUE_SIZE)
|
|
235
281
|
self._last_metric_flush[metric_name] = time.time()
|
|
236
282
|
|
|
283
|
+
# Check queue pressure and trigger aggressive flushing if needed
|
|
284
|
+
metric_display = f"'{metric_name}'" if metric_name else "unnamed"
|
|
285
|
+
self._check_queue_pressure(
|
|
286
|
+
self._metric_queues[metric_name],
|
|
287
|
+
f"Metric {metric_display}"
|
|
288
|
+
)
|
|
289
|
+
|
|
237
290
|
metric_entry = {
|
|
238
291
|
"data": data,
|
|
239
292
|
"description": description,
|
|
@@ -241,6 +294,7 @@ class BackgroundBufferManager:
|
|
|
241
294
|
"metadata": metadata,
|
|
242
295
|
}
|
|
243
296
|
|
|
297
|
+
# Will block if queue is full (backpressure to prevent OOM)
|
|
244
298
|
self._metric_queues[metric_name].put(metric_entry)
|
|
245
299
|
|
|
246
300
|
def buffer_track(
|
|
@@ -286,7 +340,9 @@ class BackgroundBufferManager:
|
|
|
286
340
|
size_bytes: int,
|
|
287
341
|
) -> None:
|
|
288
342
|
"""
|
|
289
|
-
Add file upload to queue
|
|
343
|
+
Add file upload to queue with automatic backpressure.
|
|
344
|
+
|
|
345
|
+
If queue is full, this will block until space is available.
|
|
290
346
|
|
|
291
347
|
Args:
|
|
292
348
|
file_path: Local file path
|
|
@@ -299,6 +355,9 @@ class BackgroundBufferManager:
|
|
|
299
355
|
content_type: MIME type
|
|
300
356
|
size_bytes: File size in bytes
|
|
301
357
|
"""
|
|
358
|
+
# Check queue pressure and trigger aggressive flushing if needed
|
|
359
|
+
self._check_queue_pressure(self._file_queue, "File")
|
|
360
|
+
|
|
302
361
|
file_entry = {
|
|
303
362
|
"file_path": file_path,
|
|
304
363
|
"prefix": prefix,
|
|
@@ -311,6 +370,7 @@ class BackgroundBufferManager:
|
|
|
311
370
|
"size_bytes": size_bytes,
|
|
312
371
|
}
|
|
313
372
|
|
|
373
|
+
# Will block if queue is full (backpressure to prevent OOM)
|
|
314
374
|
self._file_queue.put(file_entry)
|
|
315
375
|
|
|
316
376
|
def flush_all(self) -> None:
|
|
@@ -425,13 +485,73 @@ class BackgroundBufferManager:
|
|
|
425
485
|
if triggered:
|
|
426
486
|
self._flush_event.clear()
|
|
427
487
|
|
|
428
|
-
# Final flush on shutdown
|
|
429
|
-
|
|
488
|
+
# Final flush on shutdown - loop until all queues are empty
|
|
489
|
+
# This ensures no data is lost when shutting down with large queues
|
|
490
|
+
# Show progress bar for large flushes
|
|
491
|
+
initial_counts = {
|
|
492
|
+
'logs': self._log_queue.qsize(),
|
|
493
|
+
'metrics': {name: q.qsize() for name, q in self._metric_queues.items()},
|
|
494
|
+
'tracks': {topic: len(entries) for topic, entries in self._track_buffers.items()},
|
|
495
|
+
'files': self._file_queue.qsize(),
|
|
496
|
+
}
|
|
497
|
+
|
|
498
|
+
total_items = (
|
|
499
|
+
initial_counts['logs'] +
|
|
500
|
+
sum(initial_counts['metrics'].values()) +
|
|
501
|
+
sum(initial_counts['tracks'].values()) +
|
|
502
|
+
initial_counts['files']
|
|
503
|
+
)
|
|
504
|
+
|
|
505
|
+
# Show progress bar if there are many items to flush
|
|
506
|
+
show_progress = total_items > 200
|
|
507
|
+
items_flushed = 0
|
|
508
|
+
|
|
509
|
+
def update_progress():
|
|
510
|
+
nonlocal items_flushed
|
|
511
|
+
if show_progress:
|
|
512
|
+
progress = items_flushed / total_items
|
|
513
|
+
bar_length = 40
|
|
514
|
+
filled = int(bar_length * progress)
|
|
515
|
+
bar = '█' * filled + '░' * (bar_length - filled)
|
|
516
|
+
percent = progress * 100
|
|
517
|
+
print(f'\r[ML-Dash] Flushing: |{bar}| {percent:.1f}% ({items_flushed}/{total_items})', end='', flush=True)
|
|
518
|
+
|
|
519
|
+
# Flush logs
|
|
520
|
+
log_batch_size = self._config.log_batch_size
|
|
521
|
+
while not self._log_queue.empty():
|
|
522
|
+
before = self._log_queue.qsize()
|
|
523
|
+
self._flush_logs()
|
|
524
|
+
after = self._log_queue.qsize()
|
|
525
|
+
items_flushed += before - after
|
|
526
|
+
update_progress()
|
|
527
|
+
|
|
528
|
+
# Flush metrics
|
|
529
|
+
metric_batch_size = self._config.metric_batch_size
|
|
430
530
|
for metric_name in list(self._metric_queues.keys()):
|
|
431
|
-
self.
|
|
531
|
+
while not self._metric_queues[metric_name].empty():
|
|
532
|
+
before = self._metric_queues[metric_name].qsize()
|
|
533
|
+
self._flush_metric(metric_name)
|
|
534
|
+
after = self._metric_queues[metric_name].qsize()
|
|
535
|
+
items_flushed += before - after
|
|
536
|
+
update_progress()
|
|
537
|
+
|
|
538
|
+
# Flush tracks
|
|
432
539
|
for topic in list(self._track_buffers.keys()):
|
|
540
|
+
track_count = len(self._track_buffers.get(topic, {}))
|
|
433
541
|
self._flush_track(topic)
|
|
434
|
-
|
|
542
|
+
items_flushed += track_count
|
|
543
|
+
update_progress()
|
|
544
|
+
|
|
545
|
+
# Flush files
|
|
546
|
+
while not self._file_queue.empty():
|
|
547
|
+
before = self._file_queue.qsize()
|
|
548
|
+
self._flush_files()
|
|
549
|
+
after = self._file_queue.qsize()
|
|
550
|
+
items_flushed += before - after
|
|
551
|
+
update_progress()
|
|
552
|
+
|
|
553
|
+
if show_progress:
|
|
554
|
+
print() # New line after progress bar
|
|
435
555
|
|
|
436
556
|
def _flush_logs(self) -> None:
|
|
437
557
|
"""Batch flush logs using client.create_log_entries()."""
|
|
@@ -458,12 +578,10 @@ class BackgroundBufferManager:
|
|
|
458
578
|
logs=batch,
|
|
459
579
|
)
|
|
460
580
|
except Exception as e:
|
|
461
|
-
|
|
462
|
-
f"Failed to flush {len(batch)} logs to remote server: {e}
|
|
463
|
-
f"
|
|
464
|
-
|
|
465
|
-
stacklevel=3,
|
|
466
|
-
)
|
|
581
|
+
raise RuntimeError(
|
|
582
|
+
f"Failed to flush {len(batch)} logs to remote server: {e}\n"
|
|
583
|
+
f"Data loss occurred. Check your network connection and server status."
|
|
584
|
+
) from e
|
|
467
585
|
|
|
468
586
|
if self._experiment.run._storage:
|
|
469
587
|
# Local storage writes one at a time (no batch API)
|
|
@@ -479,11 +597,10 @@ class BackgroundBufferManager:
|
|
|
479
597
|
timestamp=log_entry["timestamp"],
|
|
480
598
|
)
|
|
481
599
|
except Exception as e:
|
|
482
|
-
|
|
483
|
-
f"Failed to write log to local storage: {e}"
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
)
|
|
600
|
+
raise RuntimeError(
|
|
601
|
+
f"Failed to write log to local storage: {e}\n"
|
|
602
|
+
f"Check disk space and file permissions."
|
|
603
|
+
) from e
|
|
487
604
|
|
|
488
605
|
self._last_log_flush = time.time()
|
|
489
606
|
|
|
@@ -535,12 +652,10 @@ class BackgroundBufferManager:
|
|
|
535
652
|
)
|
|
536
653
|
except Exception as e:
|
|
537
654
|
metric_display = f"'{metric_name}'" if metric_name else "unnamed metric"
|
|
538
|
-
|
|
539
|
-
f"Failed to flush {len(batch)} points to {metric_display} on remote server: {e}
|
|
540
|
-
f"
|
|
541
|
-
|
|
542
|
-
stacklevel=3,
|
|
543
|
-
)
|
|
655
|
+
raise RuntimeError(
|
|
656
|
+
f"Failed to flush {len(batch)} points to {metric_display} on remote server: {e}\n"
|
|
657
|
+
f"Data loss occurred. Check your network connection and server status."
|
|
658
|
+
) from e
|
|
544
659
|
|
|
545
660
|
if self._experiment.run._storage:
|
|
546
661
|
try:
|
|
@@ -556,11 +671,10 @@ class BackgroundBufferManager:
|
|
|
556
671
|
)
|
|
557
672
|
except Exception as e:
|
|
558
673
|
metric_display = f"'{metric_name}'" if metric_name else "unnamed metric"
|
|
559
|
-
|
|
560
|
-
f"Failed to flush {len(batch)} points to {metric_display} in local storage: {e}"
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
)
|
|
674
|
+
raise RuntimeError(
|
|
675
|
+
f"Failed to flush {len(batch)} points to {metric_display} in local storage: {e}\n"
|
|
676
|
+
f"Check disk space and file permissions."
|
|
677
|
+
) from e
|
|
564
678
|
|
|
565
679
|
self._last_metric_flush[metric_name] = time.time()
|
|
566
680
|
|
|
@@ -597,12 +711,10 @@ class BackgroundBufferManager:
|
|
|
597
711
|
entries=batch,
|
|
598
712
|
)
|
|
599
713
|
except Exception as e:
|
|
600
|
-
|
|
601
|
-
f"Failed to flush {len(batch)} entries to track '{topic}' on remote server: {e}
|
|
602
|
-
f"
|
|
603
|
-
|
|
604
|
-
stacklevel=3,
|
|
605
|
-
)
|
|
714
|
+
raise RuntimeError(
|
|
715
|
+
f"Failed to flush {len(batch)} entries to track '{topic}' on remote server: {e}\n"
|
|
716
|
+
f"Data loss occurred. Check your network connection and server status."
|
|
717
|
+
) from e
|
|
606
718
|
|
|
607
719
|
# Write to local storage
|
|
608
720
|
if self._experiment.run._storage:
|
|
@@ -615,11 +727,10 @@ class BackgroundBufferManager:
|
|
|
615
727
|
entries=batch,
|
|
616
728
|
)
|
|
617
729
|
except Exception as e:
|
|
618
|
-
|
|
619
|
-
f"Failed to flush {len(batch)} entries to track '{topic}' in local storage: {e}"
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
)
|
|
730
|
+
raise RuntimeError(
|
|
731
|
+
f"Failed to flush {len(batch)} entries to track '{topic}' in local storage: {e}\n"
|
|
732
|
+
f"Check disk space and file permissions."
|
|
733
|
+
) from e
|
|
623
734
|
|
|
624
735
|
self._last_track_flush[topic] = time.time()
|
|
625
736
|
|
|
@@ -663,12 +774,10 @@ class BackgroundBufferManager:
|
|
|
663
774
|
if total_files > 1:
|
|
664
775
|
print(f"[ML-Dash] [{completed}/{total_files}] Uploaded {file_entry['filename']}", flush=True)
|
|
665
776
|
except Exception as e:
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
f"
|
|
669
|
-
|
|
670
|
-
stacklevel=3,
|
|
671
|
-
)
|
|
777
|
+
raise RuntimeError(
|
|
778
|
+
f"Failed to upload file {file_entry['filename']}: {e}\n"
|
|
779
|
+
f"File upload failed. Check network connection and file permissions."
|
|
780
|
+
) from e
|
|
672
781
|
|
|
673
782
|
def _upload_single_file(self, file_entry: Dict[str, Any]) -> None:
|
|
674
783
|
"""
|
ml_dash/cli_commands/upload.py
CHANGED
|
@@ -306,7 +306,8 @@ def discover_experiments(
|
|
|
306
306
|
with open(exp_json, "r") as f:
|
|
307
307
|
metadata = json.load(f)
|
|
308
308
|
prefix = metadata.get("prefix")
|
|
309
|
-
except:
|
|
309
|
+
except (FileNotFoundError, json.JSONDecodeError, KeyError) as e:
|
|
310
|
+
# Metadata file missing or invalid - will use path-based prefix
|
|
310
311
|
pass
|
|
311
312
|
|
|
312
313
|
# Extract project and experiment names from PREFIX (not path)
|
ml_dash/client.py
CHANGED
|
@@ -226,8 +226,13 @@ class RemoteClient:
|
|
|
226
226
|
result = self.graphql_query(query)
|
|
227
227
|
username = result.get("me", {}).get("username")
|
|
228
228
|
return username
|
|
229
|
-
except Exception:
|
|
230
|
-
|
|
229
|
+
except Exception as e:
|
|
230
|
+
# Re-raise authentication errors
|
|
231
|
+
from .auth.exceptions import AuthenticationError
|
|
232
|
+
if isinstance(e, AuthenticationError):
|
|
233
|
+
raise
|
|
234
|
+
# For other errors, raise a clear exception
|
|
235
|
+
raise RuntimeError(f"Failed to fetch namespace from server: {e}") from e
|
|
231
236
|
|
|
232
237
|
def get_current_user(self) -> Optional[Dict[str, Any]]:
|
|
233
238
|
"""
|
|
@@ -264,8 +269,13 @@ class RemoteClient:
|
|
|
264
269
|
"""
|
|
265
270
|
result = self.graphql_query(query)
|
|
266
271
|
return result.get("me")
|
|
267
|
-
except Exception:
|
|
268
|
-
|
|
272
|
+
except Exception as e:
|
|
273
|
+
# Re-raise authentication errors
|
|
274
|
+
from .auth.exceptions import AuthenticationError
|
|
275
|
+
if isinstance(e, AuthenticationError):
|
|
276
|
+
raise
|
|
277
|
+
# For other errors, raise a clear exception
|
|
278
|
+
raise RuntimeError(f"Failed to fetch current user from server: {e}") from e
|
|
269
279
|
|
|
270
280
|
def _ensure_authenticated(self):
|
|
271
281
|
"""Check if authenticated, raise error if not."""
|
ml_dash/experiment.py
CHANGED
|
@@ -399,8 +399,11 @@ class Experiment:
|
|
|
399
399
|
print(f"View results at: {experiment_url}")
|
|
400
400
|
|
|
401
401
|
except Exception as e:
|
|
402
|
-
#
|
|
403
|
-
|
|
402
|
+
# Raise on status update failure
|
|
403
|
+
raise RuntimeError(
|
|
404
|
+
f"Failed to update experiment status to COMPLETED: {e}\n"
|
|
405
|
+
f"Experiment may not be marked as completed on the server."
|
|
406
|
+
) from e
|
|
404
407
|
|
|
405
408
|
self._is_open = False
|
|
406
409
|
|
|
@@ -554,15 +557,10 @@ class Experiment:
|
|
|
554
557
|
logs=[log_entry], # Single log in array
|
|
555
558
|
)
|
|
556
559
|
except Exception as e:
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
f"Failed to write log to remote server: {e}. Training will continue.",
|
|
562
|
-
RuntimeWarning,
|
|
563
|
-
stacklevel=4,
|
|
564
|
-
)
|
|
565
|
-
# Fall through to local storage if available
|
|
560
|
+
raise RuntimeError(
|
|
561
|
+
f"Failed to write log to remote server: {e}\n"
|
|
562
|
+
f"Data loss occurred. Check your network connection and server status."
|
|
563
|
+
) from e
|
|
566
564
|
|
|
567
565
|
if self.run._storage:
|
|
568
566
|
# Local mode: write to file immediately
|
|
@@ -577,11 +575,10 @@ class Experiment:
|
|
|
577
575
|
timestamp=log_entry["timestamp"],
|
|
578
576
|
)
|
|
579
577
|
except Exception as e:
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
)
|
|
578
|
+
raise RuntimeError(
|
|
579
|
+
f"Failed to write log to local storage: {e}\n"
|
|
580
|
+
f"Check disk space and file permissions."
|
|
581
|
+
) from e
|
|
585
582
|
|
|
586
583
|
def _print_log(
|
|
587
584
|
self, message: str, level: str, metadata: Optional[Dict[str, Any]]
|
|
@@ -1072,17 +1069,11 @@ class Experiment:
|
|
|
1072
1069
|
metadata=metadata,
|
|
1073
1070
|
)
|
|
1074
1071
|
except Exception as e:
|
|
1075
|
-
# Log warning but don't crash training
|
|
1076
|
-
import warnings
|
|
1077
|
-
|
|
1078
1072
|
metric_display = f"'{name}'" if name else "unnamed metric"
|
|
1079
|
-
|
|
1080
|
-
f"Failed to log {metric_display} to remote server: {e}
|
|
1081
|
-
f"
|
|
1082
|
-
|
|
1083
|
-
stacklevel=3,
|
|
1084
|
-
)
|
|
1085
|
-
# Fall through to local storage if available
|
|
1073
|
+
raise RuntimeError(
|
|
1074
|
+
f"Failed to log {metric_display} to remote server: {e}\n"
|
|
1075
|
+
f"Data loss occurred. Check your network connection and server status."
|
|
1076
|
+
) from e
|
|
1086
1077
|
|
|
1087
1078
|
if self.run._storage:
|
|
1088
1079
|
# Local mode: append to local storage
|
|
@@ -1098,14 +1089,11 @@ class Experiment:
|
|
|
1098
1089
|
metadata=metadata,
|
|
1099
1090
|
)
|
|
1100
1091
|
except Exception as e:
|
|
1101
|
-
import warnings
|
|
1102
|
-
|
|
1103
1092
|
metric_display = f"'{name}'" if name else "unnamed metric"
|
|
1104
|
-
|
|
1105
|
-
f"Failed to log {metric_display} to local storage: {e}"
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
)
|
|
1093
|
+
raise RuntimeError(
|
|
1094
|
+
f"Failed to log {metric_display} to local storage: {e}\n"
|
|
1095
|
+
f"Check disk space and file permissions."
|
|
1096
|
+
) from e
|
|
1109
1097
|
|
|
1110
1098
|
return result
|
|
1111
1099
|
|
|
@@ -1141,15 +1129,10 @@ class Experiment:
|
|
|
1141
1129
|
entries=[{"timestamp": timestamp, **data}],
|
|
1142
1130
|
)
|
|
1143
1131
|
except Exception as e:
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
f"Failed to log track '{topic}' to remote server: {e}. "
|
|
1149
|
-
f"Training will continue.",
|
|
1150
|
-
RuntimeWarning,
|
|
1151
|
-
stacklevel=3,
|
|
1152
|
-
)
|
|
1132
|
+
raise RuntimeError(
|
|
1133
|
+
f"Failed to log track '{topic}' to remote server: {e}\n"
|
|
1134
|
+
f"Data loss occurred. Check your network connection and server status."
|
|
1135
|
+
) from e
|
|
1153
1136
|
|
|
1154
1137
|
if self.run._storage:
|
|
1155
1138
|
# Local mode: append to local storage
|
|
@@ -1162,13 +1145,10 @@ class Experiment:
|
|
|
1162
1145
|
entries=[{"timestamp": timestamp, **data}],
|
|
1163
1146
|
)
|
|
1164
1147
|
except Exception as e:
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
RuntimeWarning,
|
|
1170
|
-
stacklevel=3,
|
|
1171
|
-
)
|
|
1148
|
+
raise RuntimeError(
|
|
1149
|
+
f"Failed to log track '{topic}' to local storage: {e}\n"
|
|
1150
|
+
f"Check disk space and file permissions."
|
|
1151
|
+
) from e
|
|
1172
1152
|
|
|
1173
1153
|
def _append_batch_to_metric(
|
|
1174
1154
|
self,
|
|
@@ -1205,17 +1185,11 @@ class Experiment:
|
|
|
1205
1185
|
metadata=metadata,
|
|
1206
1186
|
)
|
|
1207
1187
|
except Exception as e:
|
|
1208
|
-
# Log warning but don't crash training
|
|
1209
|
-
import warnings
|
|
1210
|
-
|
|
1211
1188
|
metric_display = f"'{name}'" if name else "unnamed metric"
|
|
1212
|
-
|
|
1213
|
-
f"Failed to log batch to {metric_display} on remote server: {e}
|
|
1214
|
-
f"
|
|
1215
|
-
|
|
1216
|
-
stacklevel=3,
|
|
1217
|
-
)
|
|
1218
|
-
# Fall through to local storage if available
|
|
1189
|
+
raise RuntimeError(
|
|
1190
|
+
f"Failed to log batch to {metric_display} on remote server: {e}\n"
|
|
1191
|
+
f"Data loss occurred. Check your network connection and server status."
|
|
1192
|
+
) from e
|
|
1219
1193
|
|
|
1220
1194
|
if self.run._storage:
|
|
1221
1195
|
# Local mode: append batch to local storage
|
|
@@ -1231,14 +1205,11 @@ class Experiment:
|
|
|
1231
1205
|
metadata=metadata,
|
|
1232
1206
|
)
|
|
1233
1207
|
except Exception as e:
|
|
1234
|
-
import warnings
|
|
1235
|
-
|
|
1236
1208
|
metric_display = f"'{name}'" if name else "unnamed metric"
|
|
1237
|
-
|
|
1238
|
-
f"Failed to log batch to {metric_display} in local storage: {e}"
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
)
|
|
1209
|
+
raise RuntimeError(
|
|
1210
|
+
f"Failed to log batch to {metric_display} in local storage: {e}\n"
|
|
1211
|
+
f"Check disk space and file permissions."
|
|
1212
|
+
) from e
|
|
1242
1213
|
|
|
1243
1214
|
return result
|
|
1244
1215
|
|
ml_dash/metric.py
CHANGED
|
@@ -69,9 +69,11 @@ class BufferManager:
|
|
|
69
69
|
value = float('nan')
|
|
70
70
|
try:
|
|
71
71
|
self._buffers[prefix][key].append(float(value))
|
|
72
|
-
except (TypeError, ValueError):
|
|
73
|
-
|
|
74
|
-
|
|
72
|
+
except (TypeError, ValueError) as e:
|
|
73
|
+
raise ValueError(
|
|
74
|
+
f"Cannot buffer non-numeric value for '{key}': {value!r} (type: {type(value).__name__})\n"
|
|
75
|
+
f"Metrics must be numeric (int, float). Use exp.log() for non-numeric values."
|
|
76
|
+
) from e
|
|
75
77
|
|
|
76
78
|
def _compute_stats(self, values: List[float], aggs: tuple) -> Dict[str, float]:
|
|
77
79
|
"""
|
|
@@ -248,9 +250,11 @@ class SummaryCache:
|
|
|
248
250
|
value = float('nan')
|
|
249
251
|
try:
|
|
250
252
|
self._buffer[key].append(float(value))
|
|
251
|
-
except (TypeError, ValueError):
|
|
252
|
-
|
|
253
|
-
|
|
253
|
+
except (TypeError, ValueError) as e:
|
|
254
|
+
raise ValueError(
|
|
255
|
+
f"Cannot store non-numeric value for '{key}': {value!r} (type: {type(value).__name__})\n"
|
|
256
|
+
f"SummaryCache only accepts numeric values. Use exp.log() for non-numeric data."
|
|
257
|
+
) from e
|
|
254
258
|
|
|
255
259
|
def set(self, **kwargs) -> None:
|
|
256
260
|
"""
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
ml_dash/__init__.py,sha256=
|
|
1
|
+
ml_dash/__init__.py,sha256=0N78Rs6EH_pFB-OoRXx42ZXm1ghbvp2xd54it41CPXM,3060
|
|
2
2
|
ml_dash/auth/__init__.py,sha256=3lwM-Y8UBHPU1gFW2JNpmXlPVTnkGudWLKNFFKulQfo,1200
|
|
3
3
|
ml_dash/auth/constants.py,sha256=ku4QzQUMNjvyJwjy7AUdywMAZd59jXSxNHZxDiagUWU,280
|
|
4
4
|
ml_dash/auth/device_flow.py,sha256=DQOdPNlZCuU1umZOA_A6WXdRM3zWphnyo9IntToBl_A,7921
|
|
@@ -6,7 +6,7 @@ ml_dash/auth/device_secret.py,sha256=qUsz6M9S1GEIukvmz57eJEp57srSx74O4MU9mZEeDlE
|
|
|
6
6
|
ml_dash/auth/exceptions.py,sha256=IeBwUzoaTyFtPwd4quFOIel49inIzuabe_ChEeEXEWI,725
|
|
7
7
|
ml_dash/auth/token_storage.py,sha256=9YQXGrn41UVyc1wUvZYbTYLzxSt5NGOyNFNjeX28bjA,7976
|
|
8
8
|
ml_dash/auto_start.py,sha256=mYNjLGI2kyylIfOX5wGOR74gb9UlXg1n5OUQu7aw5SE,2412
|
|
9
|
-
ml_dash/buffer.py,sha256=
|
|
9
|
+
ml_dash/buffer.py,sha256=m2vTcS08DL8C797Mw_f7myk_tpivow4z4FOQgb1KR4c,31633
|
|
10
10
|
ml_dash/cli.py,sha256=Vd0taM5MQrhvxqL2KQhklZ00wxZLdWl6Qw1IPkjPNuw,2897
|
|
11
11
|
ml_dash/cli_commands/__init__.py,sha256=bjAmV7MsW-bhtW_4SnLJ0Cfkt9h82vMDC8ebW1Ke8KE,38
|
|
12
12
|
ml_dash/cli_commands/api.py,sha256=NekZEJGWNpIfB6YrsrOw7kw7rZKjVudwgJWPZIy6ANQ,4535
|
|
@@ -17,13 +17,13 @@ ml_dash/cli_commands/login.py,sha256=zX-urtUrfzg2qOGtKNYQgj6UloN9kzj4zEO6h_xwuNs
|
|
|
17
17
|
ml_dash/cli_commands/logout.py,sha256=lTUUNyRXqvo61qNkCd4KBrPUujDAHnNqsHkU6bHie0U,1332
|
|
18
18
|
ml_dash/cli_commands/profile.py,sha256=PoRO1XA4bnOINptj4AO0SyNDBADeryPJBfgC74327e4,5997
|
|
19
19
|
ml_dash/cli_commands/remove.py,sha256=AtDlUWkNeGcnZWN0Wbg6XoyYhFHkCFMPdxsGA33v38c,5325
|
|
20
|
-
ml_dash/cli_commands/upload.py,sha256=
|
|
21
|
-
ml_dash/client.py,sha256=
|
|
20
|
+
ml_dash/cli_commands/upload.py,sha256=oZVU8m9Ey8N151KmUiQD_qQLEBBN4oqu5NcqUNOUJWY,49786
|
|
21
|
+
ml_dash/client.py,sha256=ZWO1uzV-GLz-3R2ML90XPCV200e_ch7L-4iG07jkC84,67213
|
|
22
22
|
ml_dash/config.py,sha256=oz2xvoBh2X_xUXWr92cPD5nFxXMT5LxVNypv5B5O0fA,3116
|
|
23
|
-
ml_dash/experiment.py,sha256=
|
|
23
|
+
ml_dash/experiment.py,sha256=eSzrh49jP5dPx6rlEZZHLDMn2CxoA9uE_PSG3CpQQfA,43366
|
|
24
24
|
ml_dash/files.py,sha256=tGJCTxPfd9vmfvIEqstZjzLvqmHzMZffPXHz0jU9bYU,54441
|
|
25
25
|
ml_dash/log.py,sha256=E-DLg0vejVLLEyShJ_r0LneDMI0XU7XTH5iKWYJe9jI,5298
|
|
26
|
-
ml_dash/metric.py,sha256=
|
|
26
|
+
ml_dash/metric.py,sha256=hwdZIkHbt_wbJQPiO1LK3UJwusbAC7J-TR0gKeAnjHc,26206
|
|
27
27
|
ml_dash/params.py,sha256=pPFvknJAJX5uhckzjO1r-HNnKbQFFKDISFmOXNET5eY,9046
|
|
28
28
|
ml_dash/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
29
|
ml_dash/remote_auto_start.py,sha256=5fvQDHv1CWEKFb6WAa5_uyEInwV_SvotXjOO_6i6ZKE,1687
|
|
@@ -31,7 +31,7 @@ ml_dash/run.py,sha256=yAKZ9HtU4cidtbWMAY1IiDPVWwluVlicD5hsmVT89U0,11361
|
|
|
31
31
|
ml_dash/snowflake.py,sha256=14rEpRU5YltsmmmZW0EMUy_hdv5S5ME9gWVtmdmwfiU,4917
|
|
32
32
|
ml_dash/storage.py,sha256=x1W-dK6wQY36-LVOJ4kA8Dn07ObNQuIErQWJ3b0PoGY,44910
|
|
33
33
|
ml_dash/track.py,sha256=Dfg1ZnmKZ_FlE5ZfG8Qld_wN4RIMs3nrOxrxwf3thiY,8164
|
|
34
|
-
ml_dash-0.6.
|
|
35
|
-
ml_dash-0.6.
|
|
36
|
-
ml_dash-0.6.
|
|
37
|
-
ml_dash-0.6.
|
|
34
|
+
ml_dash-0.6.15.dist-info/WHEEL,sha256=fAguSjoiATBe7TNBkJwOjyL1Tt4wwiaQGtNtjRPNMQA,80
|
|
35
|
+
ml_dash-0.6.15.dist-info/entry_points.txt,sha256=dYs2EHX1uRNO7AQGNnVaJJpgiy0Z9q7tiy4fHSyaf3Q,46
|
|
36
|
+
ml_dash-0.6.15.dist-info/METADATA,sha256=h5lcu5MX1urb8ZSPaZDPprXM6CUdvemGfGgVhqJJ4hA,9536
|
|
37
|
+
ml_dash-0.6.15.dist-info/RECORD,,
|
|
File without changes
|