ml-dash 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml_dash/__init__.py +51 -7
- ml_dash/client.py +595 -0
- ml_dash/experiment.py +939 -0
- ml_dash/files.py +313 -0
- ml_dash/log.py +181 -0
- ml_dash/metric.py +186 -0
- ml_dash/params.py +188 -0
- ml_dash/py.typed +0 -0
- ml_dash/storage.py +941 -0
- ml_dash-0.5.1.dist-info/METADATA +240 -0
- ml_dash-0.5.1.dist-info/RECORD +12 -0
- {ml_dash-0.4.0.dist-info → ml_dash-0.5.1.dist-info}/WHEEL +1 -1
- ml_dash/ARCHITECTURE.md +0 -382
- ml_dash/autolog.py +0 -32
- ml_dash/backends/__init__.py +0 -11
- ml_dash/backends/base.py +0 -124
- ml_dash/backends/dash_backend.py +0 -571
- ml_dash/backends/local_backend.py +0 -90
- ml_dash/components/__init__.py +0 -13
- ml_dash/components/files.py +0 -246
- ml_dash/components/logs.py +0 -104
- ml_dash/components/metrics.py +0 -169
- ml_dash/components/parameters.py +0 -144
- ml_dash/job_logger.py +0 -42
- ml_dash/ml_logger.py +0 -234
- ml_dash/run.py +0 -331
- ml_dash-0.4.0.dist-info/METADATA +0 -1424
- ml_dash-0.4.0.dist-info/RECORD +0 -19
- ml_dash-0.4.0.dist-info/entry_points.txt +0 -3
ml_dash-0.5.1.dist-info/METADATA
ADDED

@@ -0,0 +1,240 @@

Metadata-Version: 2.3
Name: ml-dash
Version: 0.5.1
Summary: ML experiment tracking and data storage
Keywords: machine-learning,experiment-tracking,mlops,data-storage
Author: Ge Yang, Tom Tao
License: MIT License

Copyright (c) 2025 Ge Yang, Tom Tao

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Dist: httpx>=0.27.0
Requires-Dist: pyjwt>=2.8.0
Requires-Dist: pytest>=8.0.0 ; extra == 'dev'
Requires-Dist: pytest-asyncio>=0.23.0 ; extra == 'dev'
Requires-Dist: sphinx>=7.2.0 ; extra == 'dev'
Requires-Dist: furo>=2024.0.0 ; extra == 'dev'
Requires-Dist: sphinx-autodoc-typehints>=2.0.0 ; extra == 'dev'
Requires-Dist: sphinx-autobuild>=2024.0.0 ; extra == 'dev'
Requires-Dist: sphinx-copybutton>=0.5.0 ; extra == 'dev'
Requires-Dist: sphinx-design>=0.5.0 ; extra == 'dev'
Requires-Dist: sphinx-tabs>=3.4.0 ; extra == 'dev'
Requires-Dist: sphinxcontrib-mermaid>=0.9.0 ; extra == 'dev'
Requires-Dist: sphinxext-opengraph>=0.9.0 ; extra == 'dev'
Requires-Dist: myst-parser>=2.0.0 ; extra == 'dev'
Requires-Dist: linkify-it-py>=2.0.0 ; extra == 'dev'
Requires-Dist: ruff>=0.3.0 ; extra == 'dev'
Requires-Dist: mypy>=1.9.0 ; extra == 'dev'
Requires-Python: >=3.9
Provides-Extra: dev
Description-Content-Type: text/markdown

# ML-Dash

A simple and flexible SDK for ML experiment tracking and data storage.

## Features

- **Three Usage Styles**: Decorator, context manager, or direct instantiation (sketched below)
- **Dual Operation Modes**: Remote (API server) or local (filesystem)
- **Auto-creation**: Automatically creates the namespace, project, and folder hierarchy
- **Upsert Behavior**: Updates existing experiments or creates new ones
- **Experiment Lifecycle**: Automatic status tracking (RUNNING, COMPLETED, FAILED, CANCELLED)
- **Organized File Storage**: Prefix-based file organization with unique snowflake IDs
- **Rich Metadata**: Tags, binders, descriptions, and custom metadata support
- **Simple API**: Minimal configuration, maximum flexibility
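The three usage styles named above are sketched here for concreteness. Only the context-manager form appears later in this README, so treat this as a rough illustration: the direct-instantiation lifecycle calls and the decorator shape are assumptions, not documented API.

```python
from ml_dash import Experiment

# 1. Context manager (the documented form; see Quick Start below)
with Experiment(name="exp-a", project="demo", local_path=".ml-dash") as experiment:
    ...

# 2. Direct instantiation (lifecycle method names are assumed)
experiment = Experiment(name="exp-b", project="demo", local_path=".ml-dash")
experiment.start()     # hypothetical explicit start
experiment.complete()  # hypothetical explicit completion

# 3. Decorator (hypothetical shape; only the style is named in Features)
@Experiment(name="exp-c", project="demo", local_path=".ml-dash")
def train():
    ...
```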
## Installation

<table>
<tr>
<td>Using uv (recommended)</td>
<td>Using pip</td>
</tr>
<tr>
<td>

```bash
uv add ml-dash
```

</td>
<td>

```bash
pip install ml-dash
```

</td>
</tr>
</table>

## Quick Start

### Remote Mode (with API Server)

```python
from ml_dash import Experiment

with Experiment(
    name="my-experiment",
    project="my-project",
    remote="https://cu3thurmv3.us-east-1.awsapprunner.com",
    api_key="your-jwt-token"
) as experiment:
    print(f"Experiment ID: {experiment.id}")
```

### Local Mode (Filesystem)

```python
from ml_dash import Experiment

with Experiment(
    name="my-experiment",
    project="my-project",
    local_path=".ml-dash"
) as experiment:
    pass  # Your code here
```

See [examples/](examples/) for more complete examples.
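The wheel also ships `params.py`, `metric.py`, `log.py`, and `files.py` (see the file list above), which suggests per-experiment logging components. A minimal sketch of how they might be used; the attribute and method names below follow the component API documented in the removed ARCHITECTURE.md and are assumptions for 0.5.1, not confirmed by this README:

```python
from ml_dash import Experiment

with Experiment(name="my-experiment", project="my-project", local_path=".ml-dash") as experiment:
    # Record hyperparameters once (params component; method name assumed)
    experiment.params.set(learning_rate=1e-3, batch_size=32)

    # Log scalar metrics per step (metric component; signature assumed)
    for step in range(3):
        experiment.metrics.log(step=step, loss=1.0 / (step + 1))

    # Store an artifact under the experiment prefix (files component; assumed)
    experiment.files.save(b"model-bytes", "checkpoint.bin")
```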
## Development Setup

### Installing Dev Dependencies

To contribute to ML-Dash or run tests, install the development dependencies:

<table>
<tr>
<td>Using uv (recommended)</td>
<td>Using pip</td>
</tr>
<tr>
<td>

```bash
uv sync --extra dev
```

</td>
<td>

```bash
pip install -e ".[dev]"
```

</td>
</tr>
</table>

This installs:
- `pytest>=8.0.0` - Testing framework
- `pytest-asyncio>=0.23.0` - Async test support
- `sphinx>=7.2.0` - Documentation builder
- `furo>=2024.0.0` - Furo documentation theme
- `sphinx-autobuild>=2024.0.0` - Live preview for documentation
- `myst-parser>=2.0.0` - Markdown support for Sphinx
- `ruff>=0.3.0` - Linter and formatter
- `mypy>=1.9.0` - Type checker

along with the remaining Sphinx extensions declared under the `dev` extra.

### Running Tests

<table>
<tr>
<td>Using uv</td>
<td>Using pytest directly</td>
</tr>
<tr>
<td>

```bash
uv run pytest
```

</td>
<td>

```bash
pytest
```

</td>
</tr>
</table>

### Building Documentation

Documentation is built using Sphinx with the Furo theme.

<table>
<tr>
<td>Build docs</td>
<td>Live preview</td>
<td>Clean build</td>
</tr>
<tr>
<td>

```bash
uv run python -m sphinx -b html docs docs/_build/html
```

</td>
<td>

```bash
uv run sphinx-autobuild docs docs/_build/html
```

</td>
<td>

```bash
rm -rf docs/_build
```

</td>
</tr>
</table>

The live preview command starts a local server and automatically rebuilds when files change.

Alternatively, you can use the Makefile from within the docs directory:

```bash
cd docs
make html   # Build HTML documentation
make clean  # Clean build files
```

For maintainers, to build and publish a new release: `uv build && uv publish`
ml_dash-0.5.1.dist-info/RECORD
ADDED

@@ -0,0 +1,12 @@

ml_dash/__init__.py,sha256=5tT0Lmf0SS3J7BOwJGVai8FOjdpjKGBJCEYL5nXnkLA,1384
ml_dash/client.py,sha256=vhWcS5o2n3o4apEjVeLmu7flCEzxBbBOoLSQNcAx_ew,17267
ml_dash/experiment.py,sha256=x1jtQD1QroNjNULOxZiGtX5oFLi3ZXDaFbGHWt0yMJU,28652
ml_dash/files.py,sha256=WKWbcug6XADwZruYQio1EdSstmfTsty9-2-t2KPWz38,9719
ml_dash/log.py,sha256=0yXaNnFwYeBI3tRLHX3kkqWRpg0MbSGwmgjnOfsElCk,5350
ml_dash/metric.py,sha256=PcEd6_HTLDpf-kBIDeQq2LlTRAS7xDx6EvSBpin5iuY,6456
ml_dash/params.py,sha256=W-JkY1Mz7KdmvDjQ0HFV2QnpBov7Gf4dl70fuBnXTdo,5974
ml_dash/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
ml_dash/storage.py,sha256=UTuux2nfclLrrtlkC6TsOvDB_wIbSDvYGg8Gtbvk6mc,30471
ml_dash-0.5.1.dist-info/WHEEL,sha256=X16MKk8bp2DRsAuyteHJ-9qOjzmnY0x1aj0P1ftqqWA,78
ml_dash-0.5.1.dist-info/METADATA,sha256=TBTqi4lJNoFO2eyztYJZptQypUSVTDVUdbGS0EnQZ2k,6067
ml_dash-0.5.1.dist-info/RECORD,,
ml_dash/ARCHITECTURE.md
DELETED

@@ -1,382 +0,0 @@

# ML-Logger Architecture

## Class Hierarchy and Composition

<details open>
<summary><strong>🏗️ System Overview</strong></summary>

```
ML-Logger System
│
├── Storage Backends (remove existing implementations awaiting design)
│   add local logger, s3, gcp, ml_dash, as empty files. Also include an empty base class.
├── Logger Components (file and data types)
├── ML_Logger (Main Interface)
└── Supporting and Utility Classes
```

</details>

<details>
<summary><strong>💾 Storage Backends</strong> (Where to store)</summary>

```
Storage Backends
│
├── StorageBackend (Abstract Base)
│   ├── exists()
│   ├── write_bytes()
│   ├── read_bytes()
│   ├── write_text()
│   ├── read_text()
│   ├── append_text()
│   ├── list_dir()
│   └── get_url()
│
├── LocalBackend(StorageBackend)
│   └── Implements file system operations
│
├── S3Backend(StorageBackend)
│   └── Implements AWS S3 operations
│
└── GCPBackend(StorageBackend)
    └── Implements Google Cloud Storage operations
```

</details>
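For concreteness, a minimal sketch of the abstract base described in the tree above; the method names come from the tree, while the signatures, parameters, and type hints are assumptions:

```python
from abc import ABC, abstractmethod
from typing import List


class StorageBackend(ABC):
    """Abstract storage interface; method list per the tree above, signatures assumed."""

    @abstractmethod
    def exists(self, path: str) -> bool: ...

    @abstractmethod
    def write_bytes(self, path: str, data: bytes) -> None: ...

    @abstractmethod
    def read_bytes(self, path: str) -> bytes: ...

    @abstractmethod
    def write_text(self, path: str, text: str) -> None: ...

    @abstractmethod
    def read_text(self, path: str) -> str: ...

    @abstractmethod
    def append_text(self, path: str, text: str) -> None: ...

    @abstractmethod
    def list_dir(self, path: str) -> List[str]: ...

    @abstractmethod
    def get_url(self, path: str) -> str: ...
```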
<details>
<summary><strong>📝 Logger Components</strong> (What to log)</summary>

```
Experiment
│
├── logs: TextLogger
│   ├── log(level, message)
│   ├── error(message)
│   ├── warning(message)
│   ├── info(message)
│   └── debug(message)
│
├── metrics: ScalarLogger (accessed via experiment.metrics)
│   ├── log(step, **metrics) - Log metrics immediately
│   ├── collect(step, **metrics) - Collect for later aggregation
│   ├── flush(_aggregation, step) - Aggregate and log collected metrics
│   ├── get_summary(name, frequency)
│   ├── __call__(namespace) - Return namespaced logger
│   └── Uses: ScalarCache, Series
│
├── files: ArtifactLogger (accessed via experiment.files)
│   ├── save(data, filename) - Save generic data
│   ├── save_pkl(data, filename) - Save pickle data
│   ├── save_image(name, image) - Save image
│   ├── save_video(name, video, fps) - Save video
│   ├── save_audio(name, audio) - Save audio
│   ├── savefig(fig, filename) - Save matplotlib figure
│   ├── load_torch(filename) - Load PyTorch data
│   ├── make_video(pattern, output, fps, codec, quality, sort) - Create video from frames
│   ├── __call__(namespace) - Return namespaced logger
│   └── File management and artifact storage
│
├── params: ParameterIndex
│   ├── set(params) - Set/overwrite parameters
│   ├── extend(params) - Merge with existing parameters
│   ├── update(key, value) - Update single parameter
│   ├── read() - Read all parameters
│   └── Manages experiment configuration
│
└── charts: ChartBuilder # PLANNING PHASE, subject to changes.
    ├── line_chart(query)
    ├── scatter_plot(query)
    ├── bar_chart(query)
    └── video/images(query)
```

</details>
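The `collect`/`flush` pair in the `ScalarLogger` entry is the least obvious part of the tree, so here is a short sketch of the intended flow, using the call shapes listed above; the surrounding `Experiment` setup and the exact keyword behavior are illustrative:

```python
from ml_dash import Experiment  # illustrative setup; sketch only

with Experiment(name="demo", project="demo", local_path=".ml-dash") as experiment:
    # Buffer several readings without writing a row for each one.
    for step, loss in enumerate([0.9, 0.7, 0.6]):
        experiment.metrics.collect(step=step, loss=loss)

    # Aggregate everything collected since the last flush into one logged entry.
    experiment.metrics.flush(_aggregation="mean", step=2)

    # __call__(namespace) keeps train/ and val/ series separate.
    experiment.metrics("train").log(step=2, loss=0.6)
    experiment.metrics("val").log(step=2, accuracy=0.81)
```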
<details>
<summary><strong>🎯 Composite Logger</strong> (Main Interface)</summary>

```
MLLogger
├── __init__(backend: StorageBackend)
├── params: ParameterIndex - Parameter management
├── metrics: ScalarLogger - Metrics logging
├── readme: MarkdownLogger - Rich Text logging (PLANNING PHASE)
├── files: ArtifactLogger - File and artifact management
├── logs: TextLogger - Text logging
│
├── Convenience Methods: (can just hide under logs)
│   ├── error() -> text.error()
│   ├── warning() -> text.warning()
│   ├── info() -> text.info()
│   └── debug() -> text.debug()
│
└── Context Managers:
    ├── experiment(name)
    └── run(id)
```

</details>

<details>
<summary><strong>⚙️ Supporting Classes</strong></summary>

```
Supporting Classes
│
└── Serialization (serdes/) (NOT USED)
    ├── serialize()
    ├── deserialize()
    └── Type registry with $t, $s keys
```

</details>

## Usage Examples
<details>
<summary><strong>📊 Logging Different Data Types</strong></summary>

```python
# Text logging (errors, warnings, info)
experiment.logs.error("Training failed")
experiment.logs.warning("Low GPU memory")
experiment.logs.info("Starting epoch 1")

# Parameter logging
experiment.params.set(learning_rate=0.001, batch_size=32)

# Metrics logging
experiment.metrics.log(step=100, loss=0.523, accuracy=0.95)

# Collect metrics for aggregation
experiment.metrics.collect(step=101, loss=0.521)
experiment.metrics.flush(_aggregation="mean", step=100)

# Namespaced metrics
experiment.metrics("train").log(step=100, loss=0.5)
experiment.metrics("val").log(step=100, accuracy=0.95)

# File operations
experiment.files.save_image("confusion_matrix", image_array)
experiment.files.save(model_state, "checkpoint.pt")
experiment.files("checkpoints").save(model_state, "model_epoch_10.pt")
```

</details>

<details>
<summary><strong>🎛️ Direct Component Access</strong></summary>

```python
# Access components directly for advanced usage
experiment.logs.error("Direct text logging")
experiment.metrics.log(step=50, lr=0.001)
experiment.files.save_video("training_progress", video_array, fps=30)

# Namespaced file operations
experiment.files("videos").save_video("training_progress", video_array, fps=30)
experiment.files("checkpoints").save(model_state, "model.pt")

# Get statistics
stats = experiment.metrics.get_stats("loss")
percentile_95 = experiment.metrics.get_percentile("loss", 95)
```

</details>
## File Organization

<details>
<summary><strong>📁 Project Structure</strong></summary>

```
ml-logger/
├── src/ml_logger/
│   ├── __init__.py
│   ├── experiment.py        # Main MLLogger class
│   │
│   ├── backends/
│   │   ├── __init__.py
│   │   ├── base.py          # StorageBackend ABC
│   │   ├── local.py         # LocalBackend
│   │   ├── s3.py            # S3Backend
│   │   └── gcp.py           # GCPBackend
│   │
│   ├── loggers/
│   │   ├── __init__.py
│   │   ├── text.py          # TextLogger
│   │   ├── scalar.py        # ScalarLogger
│   │   └── artifact.py      # ArtifactLogger
│   │
│   ├── scalar_cache.py      # ScalarCache, Series, RollingStats
│   │
│   └── serdes/
│       ├── __init__.py
│       └── ndjson.py        # Serialization with $t, $s
│
└── tests/
    ├── test_backends.py
    ├── test_loggers.py
    ├── test_scalar_cache.py
    └── test_integration.py
```

</details>

## Advanced Features

<details>
<summary><strong>📈 Statistical Features</strong></summary>

### Rolling Statistics
- **Window-based metrics**: Configurable window size for recent data
- **Automatic calculation**: Mean, variance, std, min, max
- **Percentiles**: p0, p1, p5, p10, p20, p25, p40, p50, p60, p75, p80, p90, p95, p99, p100

### Summary Frequencies
Automatic summaries at: 1, 5, 10, 15, 20, 25, 30, 40, 50, 75, 80, 100, 120, 150, 200, 250, 300, 400, 500, 600, 1000, 1200, 1500, 2000, 2500, ...

```python
# Access statistics
stats = experiment.scalars.get_stats("loss")
print(f"Mean: {stats.mean}, Std: {stats.std}")

# Get percentiles
p95 = experiment.scalars.get_percentile("accuracy", 95)

# Get summaries at specific frequencies
summaries = experiment.scalars.get_summary("loss", frequency=100)
```

</details>

<details>
<summary><strong>🔄 Serialization System</strong></summary>

### Type-Annotated Serialization
- Uses `$t` for type keys
- Uses `$s` for shape keys (arrays)
- Recursive serialization for nested structures
- Supports: primitives, datetime, numpy, Path, bytes, collections

```python
from datetime import datetime
from pathlib import Path

import numpy as np

from ml_dash.serdes import serialize, deserialize

# Serialize complex objects
data = {
    "array": np.array([[1, 2], [3, 4]]),
    "date": datetime.now(),
    "path": Path("/tmp/file.txt")
}
serialized = serialize(data)

# Deserialize back
original = deserialize(serialized)
```

</details>
## Examples

<details>
<summary><strong>🤖 ML Training Example</strong></summary>

```python
# train.py - Define your training function
from ml_dash import get_logger

# Initialize logger (same URI as the launcher below)
experiment = get_logger("s3://experiments/mnist")


@experiment.run
def train(config):
    """Training function that will be wrapped by the experiment."""
    model = create_model(config.model_type)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

    best_acc = 0
    for epoch in range(config.epochs):
        # Training loop
        for batch_idx, (data, target) in enumerate(train_loader):
            loss = train_step(model, data, target, optimizer)

            step = epoch * len(train_loader) + batch_idx
            with experiment.step(step):
                # Log metrics
                experiment.log_metric("train/loss", loss.item())

                # Log histograms periodically
                if step % 100 == 0:
                    experiment.log_histogram("gradients", get_gradients(model))

                # Save visualizations
                if step % 500 == 0:
                    fig = plot_predictions(model, data)
                    experiment.log_image("predictions", fig)

        # Validation
        val_loss, val_acc = validate(model, val_loader)
        experiment.log_metrics({
            "val/loss": val_loss,
            "val/accuracy": val_acc
        }, step=epoch)

        # Save checkpoint
        if val_acc > best_acc:
            experiment.log_model("best_model", model.state_dict())
            best_acc = val_acc

    # Final summary
    experiment.info(f"Training completed. Best accuracy: {best_acc}")
    return {"best_accuracy": best_acc}
```

**experiment.py** - Launch experiments with different configs:

```python
from ml_dash import get_logger
from train import train

# Initialize logger
experiment = get_logger("s3://experiments/mnist")

# Define experiment configurations
configs = [
    {"model_type": "CNN", "lr": 0.001, "batch_size": 32, "epochs": 100},
    {"model_type": "CNN", "lr": 0.01, "batch_size": 64, "epochs": 100},
    {"model_type": "ResNet", "lr": 0.001, "batch_size": 32, "epochs": 150},
]

# Run experiment with multiple configurations
with experiment.experiment("model_comparison"):
    for i, config in enumerate(configs):
        # Each config gets its own run
        run_name = f"{config['model_type']}_lr{config['lr']}"

        # The decorator handles run creation and lifecycle
        result = train(
            config=config,
            _run_name=run_name,
            _hyperparams=config,
            _tags=["baseline", config["model_type"].lower()]
        )

        print(f"Run {run_name} completed with accuracy: {result['best_accuracy']}")
```

</details>

<details>
<summary><strong>🔍 Debugging Example</strong></summary>

```python
# Setup logger with debug level
experiment = get_logger("./debug_logs")
experiment.logs.set_level(LogLevel.DEBUG)

try:
    # Your code here
    result = risky_operation()
    experiment.debug(f"Operation result: {result}")

except Exception as e:
    # Log exception with full traceback
    experiment.exception("Operation failed", exc_info=True)

    # Log additional context
    experiment.error("Failed at step", step=current_step,
                     input_shape=data.shape)

    # Save problematic data for debugging
    experiment.log_file("failed_input", "debug_data.pkl")

finally:
    # Get recent logs
    errors = experiment.get_logs(level="ERROR", limit=50)
    print(f"Found {len(errors)} errors")
```

</details>
ml_dash/autolog.py
DELETED

@@ -1,32 +0,0 @@

"""Auto-configured experiment for ML-Logger.

This module provides a pre-configured global `experiment` instance that can be
imported and used immediately without manual setup.

Example:
    from ml_dash.autolog import experiment

    # No setup needed!
    experiment.params.set(learning_rate=0.001)
    experiment.metrics.log(step=0, loss=0.5)
    experiment.files.save(model.state_dict(), "checkpoint.pt")

Configuration:
    The auto-experiment is configured from environment variables:
    - ML_LOGGER_NAMESPACE: User/team namespace (default: "default")
    - ML_LOGGER_WORKSPACE: Project workspace (default: "experiments")
    - ML_LOGGER_PREFIX: Experiment prefix (default: auto-generated timestamp+uuid)
    - ML_LOGGER_REMOTE: Remote server URL (optional)

    Or from ~/.ml-logger/config.yaml:
        namespace: alice
        workspace: my-project
        remote: http://localhost:3001
"""

from .run import Experiment

# Auto-configured global experiment instance
experiment = Experiment._auto_configure()

__all__ = ["experiment"]
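The docstring spells out the whole configuration surface, so the resolution order is easy to picture. A sketch of what env-driven auto-configuration like this typically does; `_auto_configure_sketch` is a hypothetical stand-in, not the package's actual `_auto_configure`, and the `~/.ml-logger/config.yaml` fallback is omitted:

```python
import os
import uuid
from datetime import datetime, timezone


def _auto_configure_sketch():
    """Hypothetical stand-in for Experiment._auto_configure; env vars per the docstring."""
    namespace = os.environ.get("ML_LOGGER_NAMESPACE", "default")
    workspace = os.environ.get("ML_LOGGER_WORKSPACE", "experiments")
    # Default prefix: auto-generated timestamp + uuid, as the docstring describes.
    prefix = os.environ.get(
        "ML_LOGGER_PREFIX",
        f"{datetime.now(timezone.utc):%Y%m%d-%H%M%S}-{uuid.uuid4().hex[:8]}",
    )
    remote = os.environ.get("ML_LOGGER_REMOTE")  # optional; local mode when unset
    return namespace, workspace, prefix, remote
```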
ml_dash/backends/__init__.py
DELETED