ml-dash 0.3.26__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. ml_dash-0.4.0/PKG-INFO +1424 -0
  2. ml_dash-0.4.0/README.md +1401 -0
  3. ml_dash-0.4.0/pyproject.toml +62 -0
  4. ml_dash-0.4.0/src/ml_dash/ARCHITECTURE.md +382 -0
  5. ml_dash-0.4.0/src/ml_dash/__init__.py +14 -0
  6. ml_dash-0.4.0/src/ml_dash/autolog.py +32 -0
  7. ml_dash-0.4.0/src/ml_dash/backends/__init__.py +11 -0
  8. ml_dash-0.4.0/src/ml_dash/backends/base.py +124 -0
  9. ml_dash-0.4.0/src/ml_dash/backends/dash_backend.py +571 -0
  10. ml_dash-0.4.0/src/ml_dash/backends/local_backend.py +90 -0
  11. ml_dash-0.4.0/src/ml_dash/components/__init__.py +13 -0
  12. ml_dash-0.4.0/src/ml_dash/components/files.py +246 -0
  13. ml_dash-0.4.0/src/ml_dash/components/logs.py +104 -0
  14. ml_dash-0.4.0/src/ml_dash/components/metrics.py +169 -0
  15. ml_dash-0.4.0/src/ml_dash/components/parameters.py +144 -0
  16. ml_dash-0.4.0/src/ml_dash/job_logger.py +42 -0
  17. ml_dash-0.4.0/src/ml_dash/ml_logger.py +234 -0
  18. ml_dash-0.4.0/src/ml_dash/run.py +331 -0
  19. ml-dash-0.3.26/.gitignore +0 -132
  20. ml-dash-0.3.26/.graphqlconfig +0 -13
  21. ml-dash-0.3.26/.run/pytest for test_pytorch_methods.test_torch_load.run.xml +0 -29
  22. ml-dash-0.3.26/.run/pytest for test_pytorch_methods.test_torch_load_sync.run.xml +0 -29
  23. ml-dash-0.3.26/.run/pytest for test_pytorch_methods.test_torch_save.run.xml +0 -29
  24. ml-dash-0.3.26/.run/pytest in test_pytorch_methods.py.run.xml +0 -29
  25. ml-dash-0.3.26/.run/train.run.xml +0 -34
  26. ml-dash-0.3.26/LICENSE.md +0 -30
  27. ml-dash-0.3.26/MANIFEST.in +0 -2
  28. ml-dash-0.3.26/Makefile +0 -43
  29. ml-dash-0.3.26/PKG-INFO +0 -104
  30. ml-dash-0.3.26/README +0 -73
  31. ml-dash-0.3.26/README.md +0 -67
  32. ml-dash-0.3.26/VERSION +0 -1
  33. ml-dash-0.3.26/dash_server_specs/.gitignore +0 -1
  34. ml-dash-0.3.26/dash_server_specs/__init__.py +0 -28
  35. ml-dash-0.3.26/dash_server_specs/conftest.py +0 -10
  36. ml-dash-0.3.26/dash_server_specs/create_experiments.py +0 -57
  37. ml-dash-0.3.26/dash_server_specs/test_ml_dash.py +0 -507
  38. ml-dash-0.3.26/docs/.gitignore +0 -1
  39. ml-dash-0.3.26/docs/Makefile +0 -177
  40. ml-dash-0.3.26/docs/authors.rst +0 -1
  41. ml-dash-0.3.26/docs/conf.py +0 -276
  42. ml-dash-0.3.26/docs/contributing.rst +0 -1
  43. ml-dash-0.3.26/docs/history.rst +0 -1
  44. ml-dash-0.3.26/docs/index.rst +0 -18
  45. ml-dash-0.3.26/docs/installation.rst +0 -7
  46. ml-dash-0.3.26/docs/make.bat +0 -242
  47. ml-dash-0.3.26/docs/readme.rst +0 -1
  48. ml-dash-0.3.26/docs/requirements.txt +0 -30
  49. ml-dash-0.3.26/docs/usage.rst +0 -7
  50. ml-dash-0.3.26/figures/example_log_output.png +0 -0
  51. ml-dash-0.3.26/figures/hyperparameter-column.gif +0 -0
  52. ml-dash-0.3.26/figures/logger_color_output.png +0 -0
  53. ml-dash-0.3.26/figures/logging_images.png +0 -0
  54. ml-dash-0.3.26/figures/ml-dash-v0.1.0.png +0 -0
  55. ml-dash-0.3.26/figures/ml-dash-v3.gif +0 -0
  56. ml-dash-0.3.26/figures/ml_visualization_dashboard_preview.png +0 -0
  57. ml-dash-0.3.26/figures/tensorboard_example.png +0 -0
  58. ml-dash-0.3.26/figures/visualization_column.png +0 -0
  59. ml-dash-0.3.26/ml_dash/.gitignore +0 -1
  60. ml-dash-0.3.26/ml_dash/__init__.py +0 -0
  61. ml-dash-0.3.26/ml_dash/app.py +0 -70
  62. ml-dash-0.3.26/ml_dash/client-dist/asset-manifest.json +0 -23
  63. ml-dash-0.3.26/ml_dash/client-dist/favicon.ico +0 -0
  64. ml-dash-0.3.26/ml_dash/client-dist/github-markdown.css +0 -957
  65. ml-dash-0.3.26/ml_dash/client-dist/index.html +0 -1
  66. ml-dash-0.3.26/ml_dash/client-dist/logo192.png +0 -0
  67. ml-dash-0.3.26/ml_dash/client-dist/logo512.png +0 -0
  68. ml-dash-0.3.26/ml_dash/client-dist/manifest.json +0 -15
  69. ml-dash-0.3.26/ml_dash/client-dist/ml-dash_logo.png +0 -0
  70. ml-dash-0.3.26/ml_dash/client-dist/monaco-editor-worker-loader-proxy.js +0 -6
  71. ml-dash-0.3.26/ml_dash/client-dist/robots.txt +0 -3
  72. ml-dash-0.3.26/ml_dash/client-dist/static/css/2.0e9a69fe.chunk.css +0 -2
  73. ml-dash-0.3.26/ml_dash/client-dist/static/css/2.0e9a69fe.chunk.css.map +0 -1
  74. ml-dash-0.3.26/ml_dash/client-dist/static/css/main.a2e245bd.chunk.css +0 -2
  75. ml-dash-0.3.26/ml_dash/client-dist/static/css/main.a2e245bd.chunk.css.map +0 -1
  76. ml-dash-0.3.26/ml_dash/client-dist/static/js/2.0c697312.chunk.js +0 -3
  77. ml-dash-0.3.26/ml_dash/client-dist/static/js/2.0c697312.chunk.js.LICENSE.txt +0 -113
  78. ml-dash-0.3.26/ml_dash/client-dist/static/js/2.0c697312.chunk.js.map +0 -1
  79. ml-dash-0.3.26/ml_dash/client-dist/static/js/main.a1584e64.chunk.js +0 -2
  80. ml-dash-0.3.26/ml_dash/client-dist/static/js/main.a1584e64.chunk.js.map +0 -1
  81. ml-dash-0.3.26/ml_dash/client-dist/static/js/runtime-main.34fb8e2d.js +0 -2
  82. ml-dash-0.3.26/ml_dash/client-dist/static/js/runtime-main.34fb8e2d.js.map +0 -1
  83. ml-dash-0.3.26/ml_dash/config.py +0 -32
  84. ml-dash-0.3.26/ml_dash/example.py +0 -0
  85. ml-dash-0.3.26/ml_dash/file_events.py +0 -71
  86. ml-dash-0.3.26/ml_dash/file_handlers.py +0 -141
  87. ml-dash-0.3.26/ml_dash/file_utils.py +0 -5
  88. ml-dash-0.3.26/ml_dash/file_watcher.py +0 -30
  89. ml-dash-0.3.26/ml_dash/mime_types.py +0 -20
  90. ml-dash-0.3.26/ml_dash/schema/__init__.py +0 -62
  91. ml-dash-0.3.26/ml_dash/schema/directories.py +0 -67
  92. ml-dash-0.3.26/ml_dash/schema/experiments.py +0 -92
  93. ml-dash-0.3.26/ml_dash/schema/files/__init__.py +0 -228
  94. ml-dash-0.3.26/ml_dash/schema/files/file_helpers.py +0 -122
  95. ml-dash-0.3.26/ml_dash/schema/files/images.py +0 -27
  96. ml-dash-0.3.26/ml_dash/schema/files/metrics.py +0 -67
  97. ml-dash-0.3.26/ml_dash/schema/files/parameters.py +0 -64
  98. ml-dash-0.3.26/ml_dash/schema/files/series.py +0 -263
  99. ml-dash-0.3.26/ml_dash/schema/files/videos.py +0 -27
  100. ml-dash-0.3.26/ml_dash/schema/helpers.py +0 -66
  101. ml-dash-0.3.26/ml_dash/schema/projects.py +0 -62
  102. ml-dash-0.3.26/ml_dash/schema/schema_helpers.py +0 -19
  103. ml-dash-0.3.26/ml_dash/schema/users.py +0 -35
  104. ml-dash-0.3.26/ml_dash/server.py +0 -66
  105. ml-dash-0.3.26/ml_dash/sse.py +0 -18
  106. ml-dash-0.3.26/ml_dash.egg-info/PKG-INFO +0 -104
  107. ml-dash-0.3.26/ml_dash.egg-info/SOURCES.txt +0 -98
  108. ml-dash-0.3.26/ml_dash.egg-info/dependency_links.txt +0 -1
  109. ml-dash-0.3.26/ml_dash.egg-info/requires.txt +0 -18
  110. ml-dash-0.3.26/ml_dash.egg-info/top_level.txt +0 -2
  111. ml-dash-0.3.26/notes/ML-Dash Enhancement Plans.md +0 -11
  112. ml-dash-0.3.26/notes/README.md +0 -380
  113. ml-dash-0.3.26/notes/client design doc.md +0 -100
  114. ml-dash-0.3.26/notes/dashboard design doc.md +0 -53
  115. ml-dash-0.3.26/notes/setting_up_dash_server.md +0 -22
  116. ml-dash-0.3.26/requirements-dev.txt +0 -1
  117. ml-dash-0.3.26/setup.cfg +0 -4
  118. ml-dash-0.3.26/setup.py +0 -56
ml_dash-0.4.0/PKG-INFO ADDED
@@ -0,0 +1,1424 @@
1
+ Metadata-Version: 2.3
2
+ Name: ml-dash
3
+ Version: 0.4.0
4
+ Summary: Add your description here
5
+ Author: Ge Yang
6
+ Author-email: Ge Yang <ge.ike.yang@gmail.com>
7
+ Requires-Dist: msgpack>=1.0.0
8
+ Requires-Dist: numpy>=1.24.0
9
+ Requires-Dist: requests>=2.31.0
10
+ Requires-Dist: ruff>=0.8.0 ; extra == 'dev'
11
+ Requires-Dist: pytest>=8.4.2 ; extra == 'dev'
12
+ Requires-Dist: pytest-cov>=4.1.0 ; extra == 'dev'
13
+ Requires-Dist: pytest-asyncio>=0.23.0 ; extra == 'dev'
14
+ Requires-Dist: sphinx==7.1.2 ; extra == 'dev'
15
+ Requires-Dist: furo ; extra == 'dev'
16
+ Requires-Dist: myst-parser ; extra == 'dev'
17
+ Requires-Dist: sphinx-copybutton ; extra == 'dev'
18
+ Requires-Dist: sphinx-autobuild ; extra == 'dev'
19
+ Requires-Dist: torch>=2.0.0 ; extra == 'dev'
20
+ Requires-Python: >=3.12
21
+ Provides-Extra: dev
22
+ Description-Content-Type: text/markdown
23
+
24
+ # ML-Logger API Documentation
25
+
26
+ **Version:** 0.4.0
27
+
28
+ ML-Logger is a minimal experiment tracking library for machine learning. It provides a simple API to log
29
+ parameters, metrics, files, and logs during your ML experiments.
30
+
31
+ ## Table of Contents
32
+
33
+ 1. [Installation](#installation)
34
+ 2. [Quick Start](#quick-start)
35
+ 3. [Core Concepts](#core-concepts)
36
+ 4. [API Reference](#api-reference)
37
+ - [Experiment](#experiment)
38
+ - [Parameters](#parameters)
39
+ - [Metrics](#metrics)
40
+ - [Logs](#logs)
41
+ - [Files](#files)
42
+ 5. [Usage Patterns](#usage-patterns)
43
+ 6. [Examples](#examples)
44
+ 7. [Remote Backend](#remote-backend)
45
+ 8. [Best Practices](#best-practices)
46
+
47
+ ---
48
+
49
+ ## Installation
50
+
51
+ ```bash
52
+ # Using pip
53
+ pip install -i https://test.pypi.org/simple/ ml-logger-beta
54
+ ```
55
+
56
+ ---
57
+
58
+ ## Quick Start
59
+
60
+ ### Basic Example
61
+
62
+ ```python
63
+ from ml_dash import Experiment
64
+
65
+ # Create an experiment
66
+ exp = Experiment(
67
+     namespace="alice",       # Your username or team
68
+     workspace="my-project",  # Project name
69
+     prefix="experiment-1"    # Experiment name
70
+ )
71
+
72
+ # Start tracking
73
+ with exp.run():
74
+     # Log parameters
75
+     exp.params.set(
76
+         learning_rate=0.001,
77
+         batch_size=32,
78
+         epochs=100
79
+     )
80
+
81
+     # Log metrics
82
+     for epoch in range(100):
83
+         exp.metrics.log(
84
+             step=epoch,
85
+             loss=0.5 - epoch * 0.01,
86
+             accuracy=0.5 + epoch * 0.005
87
+         )
88
+
89
+     # Log messages
90
+     exp.info("Training completed!")
91
+
92
+     # Save files
93
+     exp.files.save({"final": "results"}, "results.json")
94
+ ```
95
+
96
+ This creates a local directory structure:
97
+
98
+ ```
99
+ .ml-logger/
100
+ └── alice/
101
+ └── my-project/
102
+ └── experiment-1/
103
+ ├── .ml-logger.meta.json
104
+ ├── parameters.jsonl
105
+ ├── metrics.jsonl
106
+ ├── logs.jsonl
107
+ └── files/
108
+ └── results.json
109
+ ```
110
+
111
+ ---
112
+
113
+ ## Core Concepts
114
+
115
+ ### 1. **Namespace**
116
+
117
+ Your username or organization (e.g., `"alice"`, `"research-team"`)
118
+
119
+ ### 2. **Workspace**
120
+
121
+ Project or research area (e.g., `"image-classification"`, `"nlp-experiments"`)
122
+
123
+ ### 3. **Prefix**
124
+
125
+ Unique experiment name (e.g., `"resnet50-run-001"`)
126
+
127
+ ### 4. **Directory** (Optional)
128
+
129
+ Hierarchical organization within workspace (e.g., `"models/resnet/cifar10"`)
130
+
131
+ ### 5. **Local or Remote**
132
+
133
+ Everything is saved locally by default. Remote sync is optional.
134
+
135
+ ### 6. **Append-Only**
136
+
137
+ All data is written in append-only JSONL format for crash safety.
138
+
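+ In practice, append-only means every write is one serialized record appended to the end of a file, so a crash can at worst lose the line being written. A minimal sketch of the idea (illustrative only, not ml-dash's internal code; `append_jsonl` is a hypothetical helper):
+
+ ```python
+ # Illustrative append-only JSONL writer; matches the record shape
+ # shown under "File Format Details" below.
+ import json
+ import time
+
+ def append_jsonl(path: str, record: dict) -> None:
+     record = {"timestamp": time.time(), **record}
+     with open(path, "a") as f:  # append mode never rewrites earlier lines
+         f.write(json.dumps(record) + "\n")
+
+ append_jsonl("metrics.jsonl", {"step": 0, "metrics": {"loss": 0.5}})
+ ```
+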
139
+ ---
140
+
141
+ ## API Reference
142
+
143
+ ### Experiment
144
+
145
+ The main class for experiment tracking.
146
+
147
+ #### Constructor
148
+
149
+ ```python
150
+ Experiment(
151
+     namespace: str,                  # Required: User/team namespace
152
+     workspace: str,                  # Required: Project workspace
153
+     prefix: str,                     # Required: Experiment name
154
+     remote: str = None,              # Optional: Remote server URL
155
+     local_root: str = ".ml-logger",  # Local storage directory
156
+     directory: str = None,           # Optional: Subdirectory path
157
+     readme: str = None,              # Optional: Description
158
+     experiment_id: str = None,       # Optional: Server experiment ID
159
+ )
160
+ ```
161
+
162
+ **Example:**
163
+
164
+ ```python
165
+ exp = Experiment(
166
+     namespace="alice",
167
+     workspace="vision",
168
+     prefix="resnet-experiment",
169
+     directory="image-classification/cifar10",
170
+     readme="ResNet50 transfer learning on CIFAR-10",
171
+     remote="https://qwqdug4btp.us-east-1.awsapprunner.com"  # Optional remote server
172
+ )
173
+ ```
174
+
175
+ #### Methods
176
+
177
+ ##### `run(func=None)`
178
+
179
+ Mark experiment as running. Supports 3 patterns:
180
+
181
+ **Pattern 1: Direct Call**
182
+
183
+ ```python
184
+ exp.run()
185
+ # ... your training code ...
186
+ exp.complete()
187
+ ```
188
+
189
+ **Pattern 2: Context Manager (Recommended)**
190
+
191
+ ```python
192
+ with exp.run():
193
+     # ... your training code ...
194
+     # Automatically calls complete() on success
195
+     # Automatically calls fail() on exception
196
+ ```
197
+
198
+ **Pattern 3: Decorator**
199
+
200
+ ```python
201
+ @exp.run
202
+ def train():
203
+     # ... your training code ...
204
+     ...
205
+
206
+
207
+ train()
208
+ ```
209
+
210
+ ##### `complete()`
211
+
212
+ Mark experiment as completed.
213
+
214
+ ```python
215
+ exp.complete()
216
+ ```
217
+
218
+ ##### `fail(error: str)`
219
+
220
+ Mark experiment as failed with error message.
221
+
222
+ ```python
223
+ try:
224
+     ...  # training code
225
+ except Exception as e:
226
+     exp.fail(str(e))
227
+     raise
228
+ ```
229
+
230
+ ##### Logging Convenience Methods
231
+
232
+ ```python
233
+ exp.info(message: str, **context)     # Log info message
234
+ exp.warning(message: str, **context)  # Log warning message
235
+ exp.error(message: str, **context)    # Log error message
236
+ exp.debug(message: str, **context)    # Log debug message
237
+ ```
238
+
239
+ **Example:**
240
+
241
+ ```python
242
+ exp.info("Epoch completed", epoch=5, loss=0.3, accuracy=0.85)
243
+ exp.warning("Memory usage high", usage_gb=15.2)
244
+ exp.error("Training failed", error="CUDA out of memory")
245
+ ```
246
+
247
+ #### Properties
248
+
249
+ ```python
250
+ exp.namespace # str: Namespace
251
+ exp.workspace # str: Workspace
252
+ exp.prefix # str: Experiment prefix
253
+ exp.directory # str | None: Directory path
254
+ exp.remote # str | None: Remote server URL
255
+ exp.experiment_id # str | None: Server experiment ID
256
+ exp.run_id # str | None: Server run ID
257
+ ```
258
+
259
+ #### Components
260
+
261
+ ```python
262
+ exp.params # ParameterManager
263
+ exp.metrics # MetricsLogger
264
+ exp.files # FileManager
265
+ exp.logs # LogManager
266
+ ```
267
+
268
+ ---
269
+
270
+ ### Parameters
271
+
272
+ Manages experiment parameters (hyperparameters, config, etc.)
273
+
274
+ #### Methods
275
+
276
+ ##### `set(**kwargs)`
277
+
278
+ Set parameters (replaces existing).
279
+
280
+ ```python
281
+ exp.params.set(
282
+     learning_rate=0.001,
283
+     batch_size=32,
284
+     optimizer="adam",
285
+     model={
286
+         "layers": 50,
287
+         "dropout": 0.2
288
+     }
289
+ )
290
+ ```
291
+
292
+ ##### `extend(**kwargs)`
293
+
294
+ Extend parameters (deep merge with existing).
295
+
296
+ ```python
297
+ # First call
298
+ exp.params.set(model={"layers": 50})
299
+
300
+ # Extend (merges with existing)
301
+ exp.params.extend(model={"dropout": 0.2})
302
+
303
+ # Result: {"model": {"layers": 50, "dropout": 0.2}}
304
+ ```
305
+
306
+ ##### `update(key: str, value: Any)`
307
+
308
+ Update a single parameter (supports dot notation).
309
+
310
+ ```python
311
+ exp.params.update("model.layers", 100)
312
+ exp.params.update("learning_rate", 0.0001)
313
+ ```
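+ Dot notation addresses nested keys: `update("model.layers", 100)` changes only `params["model"]["layers"]`. Conceptually the dotted path is split and walked into the nested dict (an illustrative sketch, not the library's implementation; `deep_set` is a hypothetical name):
+
+ ```python
+ # Illustrative: how a dotted key path maps onto a nested dict.
+ def deep_set(params: dict, key: str, value) -> None:
+     *path, last = key.split(".")
+     node = params
+     for part in path:
+         node = node.setdefault(part, {})
+     node[last] = value
+
+ params = {"model": {"layers": 50, "dropout": 0.2}}
+ deep_set(params, "model.layers", 100)
+ print(params)  # {'model': {'layers': 100, 'dropout': 0.2}}
+ ```
+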
314
+
315
+ ##### `read() -> dict`
316
+
317
+ Read current parameters.
318
+
319
+ ```python
320
+ params = exp.params.read()
321
+ print(params["learning_rate"]) # 0.001
322
+ ```
323
+
324
+ ##### `log(**kwargs)`
325
+
326
+ Alias for `set()` (for API consistency).
327
+
328
+ ```python
329
+ exp.params.log(batch_size=64)
330
+ ```
331
+
332
+ ---
333
+
334
+ ### Metrics
335
+
336
+ Logs time-series metrics with optional namespacing.
337
+
338
+ #### Methods
339
+
340
+ ##### `log(step=None, **metrics)`
341
+
342
+ Log metrics immediately.
343
+
344
+ ```python
345
+ # Simple logging
346
+ exp.metrics.log(step=1, loss=0.5, accuracy=0.8)
347
+
348
+ # Multiple metrics at once
349
+ exp.metrics.log(
350
+     step=10,
351
+     train_loss=0.3,
352
+     val_loss=0.4,
353
+     train_acc=0.85,
354
+     val_acc=0.82
355
+ )
356
+
357
+ # Without step (uses timestamp only)
358
+ exp.metrics.log(gpu_memory=8.5, cpu_usage=45.2)
359
+ ```
360
+
361
+ ##### `collect(step=None, **metrics)`
362
+
363
+ Collect metrics for later aggregation (useful for batch-level logging).
364
+
365
+ ```python
366
+ for batch in train_loader:
367
+     loss, acc = train_batch(batch)
368
+
369
+     # Collect batch metrics (not logged yet)
370
+     exp.metrics.collect(loss=loss.item(), accuracy=acc.item())
371
+
372
+ # Aggregate and log after epoch
373
+ exp.metrics.flush(_aggregation="mean", step=epoch)
374
+ ```
375
+
376
+ ##### `flush(_aggregation="mean", step=None, **additional_metrics)`
377
+
378
+ Flush collected metrics with aggregation.
379
+
380
+ **Aggregation methods:**
381
+
382
+ - `"mean"` - Average of collected values (default)
383
+ - `"sum"` - Sum of collected values
384
+ - `"min"` - Minimum value
385
+ - `"max"` - Maximum value
386
+ - `"last"` - Last value
387
+
388
+ ```python
389
+ # Collect during training
390
+ for batch in batches:
391
+     metrics.collect(loss=loss, accuracy=acc)
392
+
393
+ # Flush with mean aggregation
394
+ metrics.flush(_aggregation="mean", step=epoch, learning_rate=lr)
395
+
396
+ # Flush with max aggregation
397
+ metrics.flush(_aggregation="max", step=epoch)
398
+ ```
399
+
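+ Conceptually, `flush()` reduces each metric's list of collected values with the chosen reducer and emits a single record (an illustrative sketch of the arithmetic, not ml-dash's code):
+
+ ```python
+ # Illustrative: what "mean" aggregation computes over collected values.
+ from statistics import mean
+
+ collected = {"loss": [0.52, 0.48, 0.50], "accuracy": [0.81, 0.83, 0.82]}
+ aggregated = {k: mean(v) for k, v in collected.items()}
+ print(aggregated)  # {'loss': 0.5, 'accuracy': 0.82} (up to float rounding)
+ ```
+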
400
+ ##### Namespacing: `__call__(namespace: str)`
401
+
402
+ Create a namespaced metrics logger.
403
+
404
+ ```python
405
+ # Create namespaced loggers
406
+ train_metrics = exp.metrics("train")
407
+ val_metrics = exp.metrics("val")
408
+
409
+ # Log to different namespaces
410
+ train_metrics.log(step=1, loss=0.5, accuracy=0.8)
411
+ val_metrics.log(step=1, loss=0.6, accuracy=0.75)
412
+
413
+ # Results in metrics named: "train.loss", "train.accuracy", "val.loss", "val.accuracy"
414
+ ```
415
+
416
+ ##### `read() -> list`
417
+
418
+ Read all logged metrics.
419
+
420
+ ```python
421
+ metrics_data = exp.metrics.read()
422
+ for entry in metrics_data:
423
+     print(entry["step"], entry["metrics"])
424
+ ```
425
+
426
+ ---
427
+
428
+ ### Logs
429
+
430
+ Structured text logging with levels and context.
431
+
432
+ #### Methods
433
+
434
+ ##### `log(message: str, level: str = "INFO", **context)`
435
+
436
+ Log a message with level and context.
437
+
438
+ ```python
439
+ exp.logs.log("Training started", level="INFO", epoch=0, lr=0.001)
440
+ ```
441
+
442
+ ##### Level-Specific Methods
443
+
444
+ ```python
445
+ exp.logs.info(message: str, **context)     # INFO level
446
+ exp.logs.warning(message: str, **context)  # WARNING level
447
+ exp.logs.error(message: str, **context)    # ERROR level
448
+ exp.logs.debug(message: str, **context)    # DEBUG level
449
+ ```
450
+
451
+ **Examples:**
452
+
453
+ ```python
454
+ # Info log
455
+ exp.logs.info("Epoch started", epoch=5, batches=100)
456
+
457
+ # Warning log
458
+ exp.logs.warning("High memory usage", memory_gb=14.5, threshold_gb=16.0)
459
+
460
+ # Error log
461
+ exp.logs.error("Training failed", error="CUDA OOM", batch_size=128)
462
+
463
+ # Debug log
464
+ exp.logs.debug("Gradient norm", grad_norm=2.3, step=1000)
465
+ ```
466
+
467
+ ##### `read() -> list`
468
+
469
+ Read all logs.
470
+
471
+ ```python
472
+ logs = exp.logs.read()
473
+ for log_entry in logs:
474
+     print(f"[{log_entry['level']}] {log_entry['message']}")
475
+     if 'context' in log_entry:
476
+         print(f"  Context: {log_entry['context']}")
477
+ ```
478
+
479
+ ---
480
+
481
+ ### Files
482
+
483
+ Manages file storage with auto-format detection.
484
+
485
+ #### Methods
486
+
487
+ ##### `save(data: Any, filename: str)`
488
+
489
+ Save data with automatic format detection.
490
+
491
+ **Supported formats:**
492
+
493
+ - `.json` - JSON files
494
+ - `.pkl`, `.pickle` - Pickle files
495
+ - `.pt`, `.pth` - PyTorch tensors/models
496
+ - `.npy`, `.npz` - NumPy arrays
497
+ - Other extensions - Raw bytes or fallback to pickle
498
+
499
+ ```python
500
+ # JSON
501
+ exp.files.save({"results": [1, 2, 3]}, "results.json")
502
+
503
+ # PyTorch model
504
+ exp.files.save(model.state_dict(), "model.pt")
505
+
506
+ # NumPy array
507
+ exp.files.save(numpy_array, "embeddings.npy")
508
+
509
+ # Pickle
510
+ exp.files.save(custom_object, "object.pkl")
511
+
512
+ # Raw bytes
513
+ exp.files.save(b"binary data", "data.bin")
514
+
515
+ # Text
516
+ exp.files.save("text content", "notes.txt")
517
+ ```
518
+
519
+ ##### `save_pkl(data: Any, filename: str)`
520
+
521
+ Save as pickle (automatically adds .pkl extension).
522
+
523
+ ```python
524
+ exp.files.save_pkl(complex_object, "checkpoint")
525
+ # Saves as "checkpoint.pkl"
526
+ ```
527
+
528
+ ##### `load(filename: str) -> Any`
529
+
530
+ Load data with automatic format detection.
531
+
532
+ ```python
533
+ # JSON
534
+ results = exp.files.load("results.json")
535
+
536
+ # PyTorch
537
+ state_dict = exp.files.load("model.pt")
538
+
539
+ # NumPy
540
+ array = exp.files.load("embeddings.npy")
541
+
542
+ # Pickle
543
+ obj = exp.files.load("object.pkl")
544
+ ```
545
+
546
+ ##### `load_torch(filename: str) -> Any`
547
+
548
+ Load PyTorch checkpoint (adds .pt extension if missing).
549
+
550
+ ```python
551
+ checkpoint = exp.files.load_torch("best_model")
552
+ # Loads "best_model.pt"
553
+ ```
554
+
555
+ ##### Namespacing: `__call__(namespace: str)`
556
+
557
+ Create a namespaced file manager.
558
+
559
+ ```python
560
+ # Create namespaced file managers
561
+ checkpoints = exp.files("checkpoints")
562
+ configs = exp.files("configs")
563
+
564
+ # Save to different directories
565
+ checkpoints.save(model.state_dict(), "epoch_10.pt")
566
+ # Saves to: files/checkpoints/epoch_10.pt
567
+
568
+ configs.save(config, "training.json")
569
+ # Saves to: files/configs/training.json
570
+ ```
571
+
572
+ ##### `exists(filename: str) -> bool`
573
+
574
+ Check if file exists.
575
+
576
+ ```python
577
+ if exp.files.exists("checkpoint.pt"):
578
+     model.load_state_dict(exp.files.load("checkpoint.pt"))
579
+ ```
580
+
581
+ ##### `list() -> list`
582
+
583
+ List files in current namespace.
584
+
585
+ ```python
586
+ files = exp.files.list()
587
+ print(f"Files: {files}")
588
+
589
+ # With namespace
590
+ checkpoint_files = exp.files("checkpoints").list()
591
+ ```
592
+
593
+ ---
594
+
595
+ ## Usage Patterns
596
+
597
+ ### Pattern 1: Simple Training Loop
598
+
599
+ ```python
600
+ from ml_dash import Experiment
601
+
602
+ exp = Experiment(
603
+     namespace="alice",
604
+     workspace="mnist",
605
+     prefix="simple-cnn"
606
+ )
607
+
608
+ with exp.run():
609
+     # Log hyperparameters
610
+     exp.params.set(lr=0.001, epochs=10, batch_size=32)
611
+
612
+     # Training loop
613
+     for epoch in range(10):
614
+         train_loss = train_one_epoch()
615
+         val_loss = validate()
616
+
617
+         exp.metrics.log(
618
+             step=epoch,
619
+             train_loss=train_loss,
620
+             val_loss=val_loss
621
+         )
622
+
623
+     # Save model
624
+     exp.files.save(model.state_dict(), "final_model.pt")
625
+ ```
626
+
627
+ ### Pattern 2: Batch-Level Metrics with Aggregation
628
+
629
+ ```python
630
+ with exp.run():
631
+     exp.params.set(lr=0.001, batch_size=128)
632
+
633
+     for epoch in range(100):
634
+         # Collect batch-level metrics
635
+         for batch in train_loader:
636
+             loss, acc = train_step(batch)
637
+             exp.metrics.collect(loss=loss, accuracy=acc)
638
+
639
+         # Aggregate and log epoch metrics
640
+         exp.metrics.flush(_aggregation="mean", step=epoch)
641
+ ```
642
+
643
+ ### Pattern 3: Separate Train/Val Metrics
644
+
645
+ ```python
646
+ with exp.run():
647
+     # Create namespaced loggers
648
+     train_metrics = exp.metrics("train")
649
+     val_metrics = exp.metrics("val")
650
+
651
+     for epoch in range(100):
652
+         # Training phase
653
+         for batch in train_loader:
654
+             loss, acc = train_step(batch)
655
+             train_metrics.collect(loss=loss, accuracy=acc)
656
+         train_metrics.flush(_aggregation="mean", step=epoch)
657
+
658
+         # Validation phase
659
+         val_loss, val_acc = validate()
660
+         val_metrics.log(step=epoch, loss=val_loss, accuracy=val_acc)
661
+ ```
662
+
663
+ ### Pattern 4: Checkpoint Management
664
+
665
+ ```python
666
+ with exp.run():
667
+     checkpoints = exp.files("checkpoints")
668
+
669
+     best_val_loss = float('inf')
670
+
671
+     for epoch in range(100):
672
+         train_loss = train()
673
+         val_loss = validate()
674
+
675
+         # Save regular checkpoint
676
+         if epoch % 10 == 0:
677
+             checkpoints.save(model.state_dict(), f"epoch_{epoch}.pt")
678
+
679
+         # Save best model
680
+         if val_loss < best_val_loss:
681
+             best_val_loss = val_loss
682
+             checkpoints.save({
683
+                 "epoch": epoch,
684
+                 "model": model.state_dict(),
685
+                 "optimizer": optimizer.state_dict(),
686
+                 "val_loss": val_loss
687
+             }, "best_model.pt")
688
+
689
+     # Save final model
690
+     checkpoints.save(model.state_dict(), "final_model.pt")
691
+ ```
692
+
693
+ ### Pattern 5: Hierarchical Organization
694
+
695
+ ```python
696
+ # Organize experiments in a directory hierarchy
697
+ exp = Experiment(
698
+     namespace="alice",
699
+     workspace="vision",
700
+     prefix="run-001",
701
+     directory="image-classification/resnet50/cifar10"
702
+ )
703
+ # Creates: .ml-logger/alice/vision/image-classification/resnet50/cifar10/run-001/
704
+ ```
705
+
706
+ ---
707
+
708
+ ## Examples
709
+
710
+ ### Example 1: Basic MNIST Training
711
+
712
+ ```python
713
+ from ml_dash import Experiment
714
+ import torch
715
+ import torch.nn as nn
716
+ from torch.utils.data import DataLoader
717
+ from torchvision import datasets, transforms
718
+
719
+ # Create experiment
720
+ exp = Experiment(
721
+     namespace="alice",
722
+     workspace="mnist",
723
+     prefix="basic-cnn-001"
724
+ )
725
+
726
+ # Define model
727
+ model = nn.Sequential(
728
+     nn.Conv2d(1, 32, 3),
729
+     nn.ReLU(),
730
+     nn.MaxPool2d(2),
731
+     nn.Conv2d(32, 64, 3),
732
+     nn.ReLU(),
733
+     nn.MaxPool2d(2),
734
+     nn.Flatten(),
735
+     nn.Linear(1600, 128),
736
+     nn.ReLU(),
737
+     nn.Linear(128, 10)
738
+ )
739
+
740
+ # Training
741
+ with exp.run():
742
+     # Log configuration
743
+     exp.params.set(
744
+         learning_rate=0.001,
745
+         batch_size=64,
746
+         epochs=10,
747
+         optimizer="adam"
748
+     )
749
+
750
+     # Setup
751
+     train_loader = DataLoader(
752
+         datasets.MNIST('../data', train=True, download=True,
753
+                        transform=transforms.ToTensor()),
754
+         batch_size=64, shuffle=True
755
+     )
756
+
757
+     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
758
+     criterion = nn.CrossEntropyLoss()
759
+
760
+     # Training loop
761
+     for epoch in range(10):
762
+         total_loss = 0
763
+         correct = 0
764
+         total = 0
765
+
766
+         for data, target in train_loader:
767
+             optimizer.zero_grad()
768
+             output = model(data)
769
+             loss = criterion(output, target)
770
+             loss.backward()
771
+             optimizer.step()
772
+
773
+             total_loss += loss.item()
774
+             pred = output.argmax(dim=1)
775
+             correct += (pred == target).sum().item()
776
+             total += target.size(0)
777
+
778
+         # Log epoch metrics
779
+         avg_loss = total_loss / len(train_loader)
780
+         accuracy = correct / total
781
+
782
+         exp.metrics.log(
783
+             step=epoch,
784
+             loss=avg_loss,
785
+             accuracy=accuracy
786
+         )
787
+
788
+         exp.info(f"Epoch {epoch}", loss=avg_loss, accuracy=accuracy)
789
+
790
+     # Save final model
791
+     exp.files.save(model.state_dict(), "model.pt")
792
+     exp.info("Training completed!")
793
+ ```
794
+
795
+ ### Example 2: Transfer Learning with Checkpointing
796
+
797
+ ```python
798
+ from ml_dash import Experiment
799
+ from torchvision import models
800
+ import torch.nn as nn
801
+
802
+ exp = Experiment(
803
+     namespace="alice",
804
+     workspace="vision",
805
+     prefix="resnet-transfer-001",
806
+     directory="transfer-learning/cifar10",
807
+     readme="ResNet50 transfer learning on CIFAR-10"
808
+ )
809
+
810
+ with exp.run():
811
+     # Configuration
812
+     config = {
813
+         "model": "resnet50",
814
+         "pretrained": True,
815
+         "num_classes": 10,
816
+         "learning_rate": 0.001,
817
+         "epochs": 50,
818
+         "batch_size": 128,
819
+         "early_stopping_patience": 10
820
+     }
821
+
822
+     exp.params.set(**config)
823
+
824
+     # Load pretrained model
825
+     model = models.resnet50(pretrained=True)
826
+     model.fc = nn.Linear(model.fc.in_features, 10)
827
+
828
+     # Create namespaced loggers
829
+     train_metrics = exp.metrics("train")
830
+     val_metrics = exp.metrics("val")
831
+     checkpoints = exp.files("checkpoints")
832
+
833
+     best_val_acc = 0.0
834
+     patience_counter = 0
835
+
836
+     for epoch in range(config["epochs"]):
837
+         # Training phase
838
+         model.train()
839
+         for batch in train_loader:
840
+             loss, acc = train_step(model, batch)
841
+             train_metrics.collect(loss=loss, accuracy=acc)
842
+
843
+         train_metrics.flush(_aggregation="mean", step=epoch)
844
+
845
+         # Validation phase
846
+         model.eval()
847
+         val_loss, val_acc = validate(model, val_loader)
848
+         val_metrics.log(step=epoch, loss=val_loss, accuracy=val_acc)
849
+
850
+         # Checkpoint best model
851
+         if val_acc > best_val_acc:
852
+             best_val_acc = val_acc
853
+             patience_counter = 0
854
+
855
+             checkpoints.save({
856
+                 "epoch": epoch,
857
+                 "model_state_dict": model.state_dict(),
858
+                 "val_accuracy": val_acc,
859
+                 "config": config
860
+             }, "best_model.pt")
861
+
862
+             exp.info("New best model!", epoch=epoch, val_acc=val_acc)
863
+         else:
864
+             patience_counter += 1
865
+
866
+         # Early stopping
867
+         if patience_counter >= config["early_stopping_patience"]:
868
+             exp.info("Early stopping", epoch=epoch)
869
+             break
870
+
871
+         # Regular checkpoint
872
+         if epoch % 10 == 0:
873
+             checkpoints.save(model.state_dict(), f"checkpoint_epoch_{epoch}.pt")
874
+
875
+     # Save final summary
876
+     exp.files.save({
877
+         "best_val_accuracy": best_val_acc,
878
+         "total_epochs": epoch + 1,
879
+         "config": config
880
+     }, "summary.json")
881
+
882
+     exp.info("Training completed!", best_val_acc=best_val_acc)
883
+ ```
884
+
885
+ ### Example 3: Hyperparameter Sweep
886
+
887
+ ```python
888
+ from ml_dash import Experiment
889
+
890
+ # Define hyperparameter grid
891
+ learning_rates = [0.001, 0.0001, 0.00001]
892
+ batch_sizes = [32, 64, 128]
893
+
894
+ for lr in learning_rates:
895
+     for bs in batch_sizes:
896
+         # Create unique experiment for each combination
897
+         exp = Experiment(
898
+             namespace="alice",
899
+             workspace="hp-sweep",
900
+             prefix=f"lr{lr}_bs{bs}",
901
+             directory="mnist/grid-search"
902
+         )
903
+
904
+         with exp.run():
905
+             # Log this combination
906
+             exp.params.set(
907
+                 learning_rate=lr,
908
+                 batch_size=bs,
909
+                 model="simple-cnn"
910
+             )
911
+
912
+             # Train with these hyperparameters
913
+             final_acc = train_model(lr, bs)
914
+
915
+             # Log final result
916
+             exp.metrics.log(step=0, final_accuracy=final_acc)
917
+             exp.info("Sweep run completed", lr=lr, bs=bs, acc=final_acc)
918
+
919
+ print("Hyperparameter sweep completed!")
920
+ ```
921
+
922
+ ### Example 4: Multi-Stage Training
923
+
924
+ ```python
925
+ from ml_dash import Experiment
926
+
927
+ exp = Experiment(
928
+     namespace="alice",
929
+     workspace="nlp",
930
+     prefix="bert-finetuning-001",
931
+     directory="transformers/bert/squad"
932
+ )
933
+
934
+ with exp.run():
935
+     # Stage 1: Warmup
936
+     exp.params.set(stage="warmup", lr=0.00001, epochs=5)
937
+     exp.info("Starting warmup phase")
938
+
939
+     warmup_metrics = exp.metrics("warmup")
940
+     for epoch in range(5):
941
+         loss = train_epoch(lr=0.00001)
942
+         warmup_metrics.log(step=epoch, loss=loss)
943
+
944
+     # Stage 2: Main training
945
+     exp.params.extend(stage="main", lr=0.0001, epochs=20)
946
+     exp.info("Starting main training phase")
947
+
948
+     train_metrics = exp.metrics("train")
949
+     val_metrics = exp.metrics("val")
950
+
951
+     for epoch in range(20):
952
+         train_loss = train_epoch(lr=0.0001)
953
+         val_loss = validate()
954
+
955
+         train_metrics.log(step=epoch, loss=train_loss)
956
+         val_metrics.log(step=epoch, loss=val_loss)
957
+
958
+     # Stage 3: Fine-tuning
959
+     exp.params.extend(stage="finetune", lr=0.00001, epochs=10)
960
+     exp.info("Starting fine-tuning phase")
961
+
962
+     finetune_metrics = exp.metrics("finetune")
963
+     for epoch in range(10):
964
+         loss = train_epoch(lr=0.00001)
965
+         finetune_metrics.log(step=epoch, loss=loss)
966
+
967
+     exp.info("Multi-stage training completed!")
968
+ ```
969
+
970
+ ---
971
+
972
+ ## Remote Backend
973
+
974
+ ML-Logger supports syncing to a remote server for team collaboration.
975
+
976
+ ### Setup Remote Backend
977
+
978
+ ```python
979
+ exp = Experiment(
980
+     namespace="alice",
981
+     workspace="shared-project",
982
+     prefix="experiment-001",
983
+     remote="http://qwqdug4btp.us-east-1.awsapprunner.com",  # Remote server URL
984
+     readme="Shared experiment for team"
985
+ )
986
+ ```
987
+
988
+ ### How It Works
989
+
990
+ 1. **Local or Remote**: Data can be saved locally or remotely
991
+ 2. **Automatic Sync**: When `remote` is specified, data syncs to server
992
+ 3. **Experiment Creation**: Server creates an experiment record
993
+ 4. **Run Tracking**: Server tracks run status (RUNNING, COMPLETED, FAILED)
994
+ 5. **GraphQL API**: Query experiments via GraphQL at `http://qwqdug4btp.us-east-1.awsapprunner.com/graphql` (see the sketch below)
995
+
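+ For example, a status query can be posted with `requests` (the query below is hypothetical — the field names depend on the server's actual GraphQL schema):
+
+ ```python
+ # Hypothetical GraphQL query; adjust the fields to the real schema.
+ import requests
+
+ resp = requests.post(
+     "http://qwqdug4btp.us-east-1.awsapprunner.com/graphql",
+     json={"query": "{ experiments { id prefix status } }"},
+ )
+ print(resp.json())
+ ```
+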
996
+ ### Environment Variables
997
+
998
+ Configure remote backend via environment:
999
+
1000
+ ```bash
1001
+ export ML_LOGGER_REMOTE="http://qwqdug4btp.us-east-1.awsapprunner.com"
1002
+ export ML_LOGGER_NAMESPACE="alice"
1003
+ export ML_LOGGER_WORKSPACE="production"
1004
+ ```
1005
+
1006
+ ```python
1007
+ # Uses environment variables if not specified
1008
+ exp = Experiment(prefix="my-experiment")
1009
+ ```
1010
+
1011
+ ### Server Requirements
1012
+
1013
+ To use the remote backend, you need the dash-server running:
1014
+
1015
+ ```bash
1016
+ cd ml-dash/ml-dash-server
1017
+ pnpm install
1018
+ pnpm dev
1019
+ ```
1020
+
1021
+ The server will then be available at `http://qwqdug4btp.us-east-1.awsapprunner.com`
1022
+
1023
+ ---
1024
+
1025
+ ## Best Practices
1026
+
1027
+ ### 1. **Use Context Manager**
1028
+
1029
+ Always use `with exp.run():` for automatic cleanup:
1030
+
1031
+ ```python
1032
+ # Good
1033
+ with exp.run():
1034
+     train()
1035
+
1036
+ # Avoid
1037
+ exp.run()
1038
+ train()
1039
+ exp.complete() # Easy to forget!
1040
+ ```
1041
+
1042
+ ### 2. **Namespace Metrics and Files**
1043
+
1044
+ Organize metrics and files with namespaces:
1045
+
1046
+ ```python
1047
+ train_metrics = exp.metrics("train")
1048
+ val_metrics = exp.metrics("val")
1049
+ test_metrics = exp.metrics("test")
1050
+
1051
+ checkpoints = exp.files("checkpoints")
1052
+ configs = exp.files("configs")
1053
+ visualizations = exp.files("plots")
1054
+ ```
1055
+
1056
+ ### 3. **Use collect() + flush() for Batch Metrics**
1057
+
1058
+ For fine-grained batch logging with epoch aggregation:
1059
+
1060
+ ```python
1061
+ for epoch in range(epochs):
1062
+     for batch in batches:
1063
+         loss = train_batch(batch)
1064
+         exp.metrics.collect(loss=loss)
1065
+
1066
+     # Log aggregated metrics once per epoch
1067
+     exp.metrics.flush(_aggregation="mean", step=epoch)
1068
+ ```
1069
+
1070
+ ### 4. **Log Configuration Early**
1071
+
1072
+ Log all hyperparameters at the start:
1073
+
1074
+ ```python
1075
+ with exp.run():
1076
+     exp.params.set(
1077
+         model="resnet50",
1078
+         learning_rate=0.001,
1079
+         batch_size=128,
1080
+         epochs=100,
1081
+         optimizer="adam",
1082
+         dataset="cifar10"
1083
+     )
1084
+     # ... training ...
1085
+ ```
1086
+
1087
+ ### 5. **Use Hierarchical Organization**
1088
+
1089
+ Organize experiments with directories:
1090
+
1091
+ ```python
1092
+ exp = Experiment(
1093
+ namespace="alice",
1094
+ workspace="vision",
1095
+ prefix="run-001",
1096
+ directory="models/resnet/cifar10" # Hierarchical organization
1097
+ )
1098
+ ```
1099
+
1100
+ ### 6. **Add Context to Logs**
1101
+
1102
+ Make logs searchable with context:
1103
+
1104
+ ```python
1105
+ exp.info("Epoch completed",
1106
+          epoch=5,
1107
+          train_loss=0.3,
1108
+          val_loss=0.35,
1109
+          learning_rate=0.001)
1110
+
1111
+ exp.warning("High memory usage",
1112
+             memory_gb=14.5,
1113
+             available_gb=16.0,
1114
+             batch_size=128)
1115
+ ```
1116
+
1117
+ ### 7. **Save Comprehensive Checkpoints**
1118
+
1119
+ Include all state needed for resumption:
1120
+
1121
+ ```python
1122
+ checkpoints.save({
1123
+     "epoch": epoch,
1124
+     "model_state_dict": model.state_dict(),
1125
+     "optimizer_state_dict": optimizer.state_dict(),
1126
+     "scheduler_state_dict": scheduler.state_dict(),
1127
+     "best_val_loss": best_val_loss,
1128
+     "config": config
1129
+ }, "checkpoint.pt")
1130
+ ```
1131
+
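+ The mirror image is resuming from such a checkpoint with the documented `exists()`/`load()` API (a sketch; `model`, `optimizer`, and `scheduler` are assumed to already exist):
+
+ ```python
+ # Resume sketch built on the documented files API.
+ start_epoch = 0
+ if checkpoints.exists("checkpoint.pt"):
+     ckpt = checkpoints.load("checkpoint.pt")
+     model.load_state_dict(ckpt["model_state_dict"])
+     optimizer.load_state_dict(ckpt["optimizer_state_dict"])
+     scheduler.load_state_dict(ckpt["scheduler_state_dict"])
+     start_epoch = ckpt["epoch"] + 1
+ ```
+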
1132
+ ### 8. **Version Control Integration**
1133
+
1134
+ Log git information:
1135
+
1136
+ ```python
1137
+ import subprocess
1138
+
1139
+ git_hash = subprocess.check_output(
1140
+     ["git", "rev-parse", "HEAD"]
1141
+ ).decode().strip()
1142
+
1143
+ exp.params.set(
1144
+     git_commit=git_hash,
1145
+     git_branch="main"
1146
+ )
1147
+ ```
1148
+
1149
+ ### 9. **Error Handling**
1150
+
1151
+ The context manager handles errors automatically, but you can add custom handling:
1152
+
1153
+ ```python
1154
+ with exp.run():
1155
+     try:
1156
+         train()
1157
+     except RuntimeError as e:
1158
+         exp.error("Training error", error=str(e), device="cuda:0")
1159
+         raise
1160
+ ```
1161
+
1162
+ ---
1163
+
1164
+ ## File Format Details
1165
+
1166
+ ### metrics.jsonl
1167
+
1168
+ ```json
1169
+ {
1170
+   "timestamp": 1234567890.123,
1171
+   "step": 0,
1172
+   "metrics": {
1173
+     "loss": 0.5,
1174
+     "accuracy": 0.8
1175
+   }
1176
+ }
1177
+ {
1178
+   "timestamp": 1234567891.456,
1179
+   "step": 1,
1180
+   "metrics": {
1181
+     "loss": 0.4,
1182
+     "accuracy": 0.85
1183
+   }
1184
+ }
1185
+ ```
1186
+
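+ Because each line is a standalone JSON object, the file can be read back with a few lines of Python (a minimal sketch assuming the Quick Start layout; `exp.metrics.read()` does this for you):
+
+ ```python
+ # Minimal JSONL reader sketch for the layout shown in Quick Start.
+ import json
+
+ path = ".ml-logger/alice/my-project/experiment-1/metrics.jsonl"
+ with open(path) as f:
+     entries = [json.loads(line) for line in f if line.strip()]
+
+ for entry in entries:
+     print(entry["step"], entry["metrics"])
+ ```
+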
1187
+ ### parameters.jsonl
1188
+
1189
+ ```json
1190
+ {
1191
+ "timestamp": 1234567890.123,
1192
+ "operation": "set",
1193
+ "data": {
1194
+ "lr": 0.001,
1195
+ "batch_size": 32
1196
+ }
1197
+ }
1198
+ {
1199
+ "timestamp": 1234567892.456,
1200
+ "operation": "update",
1201
+ "key": "lr",
1202
+ "value": 0.0001
1203
+ }
1204
+ ```
1205
+
1206
+ ### logs.jsonl
1207
+
1208
+ ```json
1209
+ {
1210
+   "timestamp": 1234567890.123,
1211
+   "level": "INFO",
1212
+   "message": "Training started",
1213
+   "context": {
1214
+     "epoch": 0
1215
+   }
1216
+ }
1217
+ {
1218
+   "timestamp": 1234567891.456,
1219
+   "level": "WARNING",
1220
+   "message": "High memory",
1221
+   "context": {
1222
+     "memory_gb": 14.5
1223
+   }
1224
+ }
1225
+ ```
1226
+
1227
+ ### .ml-logger.meta.json
1228
+
1229
+ ```json
1230
+ {
1231
+ "namespace": "alice",
1232
+ "workspace": "vision",
1233
+ "prefix": "experiment-1",
1234
+ "status": "completed",
1235
+ "started_at": 1234567890.123,
1236
+ "completed_at": 1234567900.456,
1237
+ "readme": "ResNet experiment",
1238
+ "experiment_id": "exp_123",
1239
+ "hostname": "gpu-server-01"
1240
+ }
1241
+ ```
1242
+
1243
+ ---
1244
+
1245
+ ## Troubleshooting
1246
+
1247
+ ### Issue: Remote server connection failed
1248
+
1249
+ ```bash
1250
+ # Warning: Failed to initialize experiment on remote server
1251
+ # Solution: Check if ml-dash-server is running
1252
+ cd ml-dash/dash-server && pnpm dev
1255
+ ```
1256
+
1257
+ ### Issue: File not found when loading
1258
+
1259
+ ```python
1260
+ # Check if file exists first
1261
+ if exp.files.exists("model.pt"):
1262
+     model.load_state_dict(exp.files.load("model.pt"))
1263
+ else:
1264
+     print("Checkpoint not found")
1265
+ ```
1266
+
1267
+ ### Issue: Metrics not aggregating correctly
1268
+
1269
+ ```python
1270
+ # Make sure to call flush() after collect()
1271
+ for batch in batches:
1272
+     metrics.collect(loss=loss)
1273
+
1274
+ metrics.flush(_aggregation="mean", step=epoch) # Don't forget this!
1275
+ ```
1276
+
1277
+ ---
1278
+
1279
+ ## API Summary
1280
+
1281
+ | Component | Key Methods | Purpose |
1282
+ |----------------|---------------------------------------------|-----------------------------|
1283
+ | **Experiment** | `run()`, `complete()`, `fail()`, `info()` | Manage experiment lifecycle |
1284
+ | **Parameters** | `set()`, `extend()`, `update()`, `read()` | Store configuration |
1285
+ | **Metrics** | `log()`, `collect()`, `flush()`, `read()` | Track time-series metrics |
1286
+ | **Logs** | `info()`, `warning()`, `error()`, `debug()` | Structured logging |
1287
+ | **Files** | `save()`, `load()`, `exists()`, `list()` | File management |
1288
+
1289
+ ---
1290
+
1291
+ ## Additional Resources
1292
+
1293
+ - **GitHub**: https://github.com/vuer-ai/vuer-dashboard
1294
+ - **Examples**: See `ml-logger/examples/` directory
1295
+ - **Tests**: See `ml-logger/tests/` for usage examples
1296
+ - **Dashboard**: http://qwqdug4btp.us-east-1.awsapprunner.com (when dash-server is running)
1297
+
1298
+ ---
1299
+
1300
+ ## Contributing & Development
1301
+
1302
+ ### Development Setup
1303
+
1304
+ 1. **Clone the repository**:
1305
+ ```bash
1306
+ git clone https://github.com/vuer-ai/vuer-dashboard.git
1307
+ cd vuer-dashboard/ml-logger
1308
+ ```
1309
+
1310
+ 2. **Install with development dependencies**:
1311
+ ```bash
1312
+ # Install all dev dependencies (includes testing, docs, and torch)
1313
+ uv sync --extra dev
1314
+ ```
1315
+
1316
+ ### Running Tests
1317
+
1318
+ ```bash
1319
+ # Run all tests
1320
+ uv run pytest
1321
+
1322
+ # Run with coverage
1323
+ uv run pytest --cov=ml_dash --cov-report=html
1324
+
1325
+ # Run specific test file
1326
+ uv run pytest tests/test_backends.py
1327
+ ```
1328
+
1329
+ ### Building Documentation
1330
+
1331
+ ```bash
1332
+ # Build HTML documentation
1333
+ cd docs && make html
1334
+
1335
+ # Serve docs with live reload (auto-refreshes on file changes)
1336
+ cd docs && make serve
1337
+
1338
+ # Clean build artifacts
1339
+ cd docs && make clean
1340
+ ```
1341
+
1342
+ The built documentation will be in `docs/_build/html/`. The `make serve` command starts a local server at `http://localhost:8000` with automatic rebuilding on file changes.
1343
+
1344
+ ### Linting and Code Checks
1345
+
1346
+ ```bash
1347
+ # Format code
1348
+ uv run ruff format .
1349
+
1350
+ # Lint code
1351
+ uv run ruff check .
1352
+
1353
+ # Fix auto-fixable issues
1354
+ uv run ruff check --fix .
1355
+ ```
1356
+
1357
+ ### Project Structure
1358
+
1359
+ ```
1360
+ ml-logger/
1361
+ ├── src/ml_logger_beta/        # Main package source
1362
+ │   ├── __init__.py            # Package exports
1363
+ │   ├── run.py                 # Experiment class
1364
+ │   ├── ml_logger.py           # ML_Logger class
1365
+ │   ├── job_logger.py          # JobLogger class
1366
+ │   ├── backends/              # Storage backends
1367
+ │   │   ├── base.py            # Base backend interface
1368
+ │   │   ├── local_backend.py   # Local filesystem backend
1369
+ │   │   └── dash_backend.py    # Remote server backend
1370
+ │   └── components/            # Component managers
1371
+ │       ├── parameters.py      # Parameter management
1372
+ │       ├── metrics.py         # Metrics logging
1373
+ │       ├── logs.py            # Structured logging
1374
+ │       └── files.py           # File management
1375
+ ├── tests/                     # Test suite
1376
+ ├── docs/                      # Sphinx documentation
1377
+ ├── pyproject.toml             # Package configuration
1378
+ └── README.md                  # This file
1379
+ ```
1380
+
1381
+ ### Dependency Structure
1382
+
1383
+ The project uses a simplified dependency structure:
1384
+
1385
+ - **`dependencies`**: Core runtime dependencies (always installed)
1386
+ - `msgpack`, `numpy`, `requests`
1387
+ - **`dev`**: All development dependencies
1388
+ - Linting and formatting: `ruff`
1389
+ - Testing: `pytest`, `pytest-cov`, `pytest-asyncio`
1390
+ - Documentation: `sphinx`, `furo`, `myst-parser`, `sphinx-copybutton`, `sphinx-autobuild`
1391
+ - Optional features: `torch` (for saving/loading .pt/.pth files)
1392
+
1393
+ ### Making Changes
1394
+
1395
+ 1. Create a new branch for your changes
1396
+ 2. Make your modifications
1397
+ 3. Run tests to ensure everything works: `uv run pytest`
1398
+ 4. Run linting: `uv run ruff check .`
1399
+ 5. Format code: `uv run ruff format .`
1400
+ 6. Update documentation if needed
1401
+ 7. Submit a pull request
1402
+
1403
+ ### Building and Publishing
1404
+
1405
+ ```bash
1406
+ # Build the package
1407
+ uv build
1408
+
1409
+ # Publish to PyPI (requires credentials)
1410
+ uv publish
1411
+ ```
1412
+
1413
+ ### Tips for Contributors
1414
+
1415
+ - Follow the existing code style (enforced by ruff)
1416
+ - Add tests for new features
1417
+ - Update documentation for API changes
1418
+ - Use type hints where appropriate
1419
+ - Keep functions focused and modular
1420
+ - Write descriptive commit messages
1421
+
1422
+ ---
1423
+
1424
+ **Happy Experimenting!** 🚀