podstack 1.3.13__tar.gz → 1.3.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {podstack-1.3.13 → podstack-1.3.15}/PKG-INFO +161 -8
- {podstack-1.3.13 → podstack-1.3.15}/README.md +160 -7
- {podstack-1.3.13 → podstack-1.3.15}/podstack/registry/__init__.py +255 -1
- {podstack-1.3.13 → podstack-1.3.15}/podstack/registry/client.py +482 -4
- {podstack-1.3.13 → podstack-1.3.15}/podstack/registry/experiment.py +90 -1
- {podstack-1.3.13 → podstack-1.3.15}/podstack.egg-info/PKG-INFO +161 -8
- {podstack-1.3.13 → podstack-1.3.15}/pyproject.toml +1 -1
- {podstack-1.3.13 → podstack-1.3.15}/LICENSE +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack/__init__.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack/annotations.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack/client.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack/exceptions.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack/execution.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack/gpu_runner.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack/models.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack/notebook.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack/registry/autolog.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack/registry/exceptions.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack/registry/model.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack/registry/model_utils.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack.egg-info/SOURCES.txt +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack.egg-info/dependency_links.txt +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack.egg-info/requires.txt +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack.egg-info/top_level.txt +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack_gpu/__init__.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack_gpu/app.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack_gpu/exceptions.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack_gpu/image.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack_gpu/runner.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack_gpu/secret.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack_gpu/utils.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/podstack_gpu/volume.py +0 -0
- {podstack-1.3.13 → podstack-1.3.15}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: podstack
|
|
3
|
-
Version: 1.3.13
|
|
3
|
+
Version: 1.3.15
|
|
4
4
|
Summary: Official Python SDK for Podstack GPU Notebook Platform
|
|
5
5
|
Author-email: Podstack <support@podstack.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -298,9 +298,9 @@ with registry.start_run(name="training-v1") as run:
|
|
|
298
298
|
# Set tags
|
|
299
299
|
registry.set_tag("framework", "pytorch")
|
|
300
300
|
|
|
301
|
-
#
|
|
302
|
-
registry.log_artifact("model.pt"
|
|
303
|
-
registry.log_artifact("training_curves.png", "plots")
|
|
301
|
+
# Upload artifacts to cloud artifact store
|
|
302
|
+
registry.log_artifact("model.pt")
|
|
303
|
+
registry.log_artifact("training_curves.png", artifact_path="plots/curves.png")
|
|
304
304
|
|
|
305
305
|
# Log dataset provenance (first-class resource, deduped by content hash)
|
|
306
306
|
registry.log_dataset("imdb-reviews", path="data/imdb.csv", context="training")
|
|
@@ -316,7 +316,7 @@ with registry.start_run(name="training-v1") as run:
|
|
|
316
316
|
```python
|
|
317
317
|
from podstack import registry
|
|
318
318
|
|
|
319
|
-
#
|
|
319
|
+
# Serialize and upload the model to the artifact store (auto-detects framework)
|
|
320
320
|
registry.log_model(model, artifact_path="model", framework="pytorch")
|
|
321
321
|
|
|
322
322
|
# Register in model registry
|
|
@@ -332,7 +332,7 @@ registry.set_model_stage("my-classifier", version=1, stage="production")
|
|
|
332
332
|
# Set aliases
|
|
333
333
|
registry.set_model_alias("my-classifier", alias="champion", version=1)
|
|
334
334
|
|
|
335
|
-
# Load model from
|
|
335
|
+
# Load model from any machine — files are downloaded automatically if missing locally
|
|
336
336
|
model = registry.load_model("my-classifier", stage="production")
|
|
337
337
|
```
|
|
338
338
|
|
|
@@ -659,6 +659,155 @@ model = registry.get_model("sentiment-bert")
|
|
|
659
659
|
lineage = registry.get_model_lineage(model.id)
|
|
660
660
|
```
|
|
661
661
|
|
|
662
|
+
### Artifact Storage
|
|
663
|
+
|
|
664
|
+
Podstack stores every artifact you log — model files, plots, CSV exports, anything — in the project's cloud artifact store. Artifacts are keyed by run ID, so the same file can be retrieved from any machine, by any project member, at any time.
|
|
665
|
+
|
|
666
|
+
#### `log_artifact()` — upload a file for the active run
|
|
667
|
+
|
|
668
|
+
```python
|
|
669
|
+
# Upload a single file (uses the filename as the artifact path)
|
|
670
|
+
registry.log_artifact("model.pt")
|
|
671
|
+
|
|
672
|
+
# Upload with an explicit path inside the artifact store
|
|
673
|
+
registry.log_artifact("training_curves.png", artifact_path="plots/curves.png")
|
|
674
|
+
registry.log_artifact("feature_importance.csv", artifact_path="analysis/features.csv")
|
|
675
|
+
```
|
|
676
|
+
|
|
677
|
+
**Parameters:**
|
|
678
|
+
|
|
679
|
+
| Parameter | Type | Default | Description |
|
|
680
|
+
|-----------|------|---------|-------------|
|
|
681
|
+
| `local_path` | `str` | required | Path to the local file to upload |
|
|
682
|
+
| `artifact_path` | `str` | filename | Relative path inside the artifact store. Defaults to `os.path.basename(local_path)` |
|
|
683
|
+
|
|
684
|
+
If the artifact store is temporarily unreachable, the SDK saves the file to a local fallback cache (`~/.podstack/artifacts/<run_id>/`) so your run is never interrupted.
|
|
685
|
+
|
|
686
|
+
**Via the `Run` object** — equivalent to calling `registry.log_artifact()`:
|
|
687
|
+
|
|
688
|
+
```python
|
|
689
|
+
with registry.start_run("training-v1") as run:
|
|
690
|
+
run.log_artifact("confusion_matrix.png", artifact_path="plots/confusion_matrix.png")
|
|
691
|
+
run.log_artifact("model.pkl")
|
|
692
|
+
```
|
|
693
|
+
|
|
694
|
+
#### `list_artifacts()` — list all artifacts for a run
|
|
695
|
+
|
|
696
|
+
```python
|
|
697
|
+
artifacts = registry.list_artifacts(run_id)
|
|
698
|
+
for a in artifacts:
|
|
699
|
+
print(f"{a['path']:40s} {a['size'] / 1e6:.1f} MB {a['last_modified']}")
|
|
700
|
+
```
|
|
701
|
+
|
|
702
|
+
**Parameters:**
|
|
703
|
+
|
|
704
|
+
| Parameter | Type | Description |
|
|
705
|
+
|-----------|------|-------------|
|
|
706
|
+
| `run_id` | `str` | ID of the run to query |
|
|
707
|
+
|
|
708
|
+
**Returns:** `list[dict]` — one entry per artifact:
|
|
709
|
+
|
|
710
|
+
| Key | Type | Description |
|
|
711
|
+
|-----|------|-------------|
|
|
712
|
+
| `path` | `str` | Relative artifact path (e.g. `"plots/curves.png"`) |
|
|
713
|
+
| `size` | `int` | File size in bytes |
|
|
714
|
+
| `etag` | `str` | Content hash for integrity verification |
|
|
715
|
+
| `last_modified` | `str` | ISO 8601 upload timestamp |
|
|
716
|
+
|
|
717
|
+
#### `download_artifact()` — retrieve an artifact
|
|
718
|
+
|
|
719
|
+
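Downloads a specific artifact from the cloud store into a local directory. `artifact_path` may name a single file (e.g. `"model/model.pkl"`) or a logged directory (e.g. `"model"`), in which case the whole directory is downloaded. Falls back to the local cache when the store is unreachable.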
|
|
720
|
+
|
|
721
|
+
```python
|
|
722
|
+
# Download a single file
|
|
723
|
+
dest = registry.download_artifact("run-id", "model/model.pkl", "./downloads/")
|
|
724
|
+
print(f"Saved to: {dest}")
|
|
725
|
+
|
|
726
|
+
# Download a whole model directory
|
|
727
|
+
dest = registry.download_artifact("run-id", "model", "./local_models/")
|
|
728
|
+
```
|
|
729
|
+
|
|
730
|
+
**Parameters:**
|
|
731
|
+
|
|
732
|
+
| Parameter | Type | Description |
|
|
733
|
+
|-----------|------|-------------|
|
|
734
|
+
| `run_id` | `str` | ID of the run that logged the artifact |
|
|
735
|
+
| `artifact_path` | `str` | Relative artifact path as logged — a single file (e.g. `"model/model.pkl"`) or a directory (e.g. `"model"`) |
|
|
736
|
+
| `local_path` | `str` | Destination directory |
|
|
737
|
+
|
|
738
|
+
**Returns:** `str` — absolute path to the downloaded file or directory.
|
|
739
|
+
|
|
740
|
+
**Raises:** `ArtifactNotFoundError` if the artifact cannot be found in the store or the local cache.
|
|
741
|
+
|
|
742
|
+
#### Models as artifacts: `log_model()` and `load_model()`
|
|
743
|
+
|
|
744
|
+
`log_model()` serializes your model to disk and uploads every resulting file to the artifact store in one call. `load_model()` resolves the registered model version, downloads any missing files from the store, then deserializes the model — so it works correctly from any machine regardless of where training happened.
|
|
745
|
+
|
|
746
|
+
```python
|
|
747
|
+
# ── Training machine ──────────────────────────────────────────────────────────
|
|
748
|
+
with registry.start_run("bert-finetune-v3") as run:
|
|
749
|
+
# train...
|
|
750
|
+
registry.log_model(model, artifact_path="model", framework="pytorch")
|
|
751
|
+
|
|
752
|
+
registry.register_model("sentiment-bert", run_id=run.id)
|
|
753
|
+
registry.set_model_stage("sentiment-bert", version=3, stage="production")
|
|
754
|
+
|
|
755
|
+
# ── Any machine (CI, inference server, colleague's laptop) ───────────────────
|
|
756
|
+
# Model files are downloaded automatically from the artifact store if not cached
|
|
757
|
+
model = registry.load_model("sentiment-bert", stage="production")
|
|
758
|
+
```
|
|
759
|
+
|
|
760
|
+
**`log_model()` parameters:**
|
|
761
|
+
|
|
762
|
+
| Parameter | Type | Default | Description |
|
|
763
|
+
|-----------|------|---------|-------------|
|
|
764
|
+
| `model` | any | required | Model object (PyTorch, TensorFlow, sklearn, HuggingFace, or any picklable object) |
|
|
765
|
+
| `artifact_path` | `str` | `"model"` | Sub-path inside the artifact store |
|
|
766
|
+
| `framework` | `str` | auto-detected | `"pytorch"`, `"tensorflow"`, `"sklearn"`, `"huggingface"`, or `"pickle"` |
|
|
767
|
+
| `metadata` | `dict` | `None` | Arbitrary key-value metadata stored as run params |
|
|
768
|
+
|
|
769
|
+
**`load_model()` parameters:**
|
|
770
|
+
|
|
771
|
+
| Parameter | Type | Default | Description |
|
|
772
|
+
|-----------|------|---------|-------------|
|
|
773
|
+
| `model_name` | `str` | required | Registered model name |
|
|
774
|
+
| `version` | `int` | `None` | Specific version to load. Mutually exclusive with `stage` |
|
|
775
|
+
| `stage` | `str` | `None` | Stage to load from: `"development"`, `"staging"`, `"production"`, `"archived"`. Mutually exclusive with `version` |
|
|
776
|
+
| `framework` | `str` | from run params | Override framework for deserialization |
|
|
777
|
+
|
|
778
|
+
#### Viewing artifacts in the dashboard
|
|
779
|
+
|
|
780
|
+
Every artifact logged with `log_artifact()` or `log_model()` appears automatically in the **Artifacts tab** of the run's detail page in the Podstack dashboard. No extra steps are needed — the tab populates from the same store the SDK writes to.
|
|
781
|
+
|
|
782
|
+
The Artifacts tab shows:
|
|
783
|
+
|
|
784
|
+
| Column | Description |
|
|
785
|
+
|--------|-------------|
|
|
786
|
+
| **Path** | The relative artifact path as logged (e.g. `model/model.pkl`, `plots/curves.png`) |
|
|
787
|
+
| **Type badge** | File extension, color-coded by category — model weights, data files, images, configs, etc. |
|
|
788
|
+
| **Size** | Formatted file size (B / KB / MB) |
|
|
789
|
+
| **Uploaded** | Timestamp of when the file was stored |
|
|
790
|
+
| **Download** | One-click download button — opens a short-lived direct download link in the browser |
|
|
791
|
+
|
|
792
|
+
A footer below the list shows the combined size of all artifacts for the run.
|
|
793
|
+
|
|
794
|
+
```python
|
|
795
|
+
# Everything logged here shows up in the dashboard Artifacts tab
|
|
796
|
+
with registry.start_run("bert-finetune-v3") as run:
|
|
797
|
+
registry.log_params({"lr": 2e-5, "epochs": 3})
|
|
798
|
+
registry.log_metrics({"accuracy": 0.93})
|
|
799
|
+
|
|
800
|
+
# These all appear as separate rows in the Artifacts tab
|
|
801
|
+
registry.log_artifact("confusion_matrix.png", artifact_path="plots/confusion_matrix.png")
|
|
802
|
+
registry.log_artifact("feature_importance.csv", artifact_path="analysis/features.csv")
|
|
803
|
+
registry.log_model(model, artifact_path="model", framework="pytorch")
|
|
804
|
+
# ↳ each model file (model.pkl, config.json, etc.) appears as its own row
|
|
805
|
+
```
|
|
806
|
+
|
|
807
|
+
#### Access control
|
|
808
|
+
|
|
809
|
+
Artifact upload and download URLs are issued by the registry API, which requires a valid API key and project membership. The URLs are short-lived, so access tracks the current state of your project: a revoked key can no longer generate new URLs, and any URL issued earlier stops working once it expires. Any member of a project can upload and download artifacts for runs within that project.
|
|
810
|
+
|
|
662
811
|
### List and Browse
|
|
663
812
|
|
|
664
813
|
```python
|
|
@@ -670,8 +819,12 @@ experiments = registry.list_experiments()
|
|
|
670
819
|
# List models
|
|
671
820
|
models = registry.list_models()
|
|
672
821
|
|
|
673
|
-
#
|
|
674
|
-
registry.
|
|
822
|
+
# List artifacts for a specific run
|
|
823
|
+
artifacts = registry.list_artifacts(run_id)
|
|
824
|
+
|
|
825
|
+
# Download a specific artifact to a local directory
|
|
826
|
+
dest = registry.download_artifact("run-id", "model/model.pt", "./downloads/")
|
|
827
|
+
print(f"Saved to: {dest}")
|
|
675
828
|
```
|
|
676
829
|
|
|
677
830
|
## GPU Runner - Direct Code Execution
|
|
@@ -246,9 +246,9 @@ with registry.start_run(name="training-v1") as run:
|
|
|
246
246
|
# Set tags
|
|
247
247
|
registry.set_tag("framework", "pytorch")
|
|
248
248
|
|
|
249
|
-
#
|
|
250
|
-
registry.log_artifact("model.pt"
|
|
251
|
-
registry.log_artifact("training_curves.png", "plots")
|
|
249
|
+
# Upload artifacts to cloud artifact store
|
|
250
|
+
registry.log_artifact("model.pt")
|
|
251
|
+
registry.log_artifact("training_curves.png", artifact_path="plots/curves.png")
|
|
252
252
|
|
|
253
253
|
# Log dataset provenance (first-class resource, deduped by content hash)
|
|
254
254
|
registry.log_dataset("imdb-reviews", path="data/imdb.csv", context="training")
|
|
@@ -264,7 +264,7 @@ with registry.start_run(name="training-v1") as run:
|
|
|
264
264
|
```python
|
|
265
265
|
from podstack import registry
|
|
266
266
|
|
|
267
|
-
#
|
|
267
|
+
# Serialize and upload the model to the artifact store (auto-detects framework)
|
|
268
268
|
registry.log_model(model, artifact_path="model", framework="pytorch")
|
|
269
269
|
|
|
270
270
|
# Register in model registry
|
|
@@ -280,7 +280,7 @@ registry.set_model_stage("my-classifier", version=1, stage="production")
|
|
|
280
280
|
# Set aliases
|
|
281
281
|
registry.set_model_alias("my-classifier", alias="champion", version=1)
|
|
282
282
|
|
|
283
|
-
# Load model from
|
|
283
|
+
# Load model from any machine — files are downloaded automatically if missing locally
|
|
284
284
|
model = registry.load_model("my-classifier", stage="production")
|
|
285
285
|
```
|
|
286
286
|
|
|
@@ -607,6 +607,155 @@ model = registry.get_model("sentiment-bert")
|
|
|
607
607
|
lineage = registry.get_model_lineage(model.id)
|
|
608
608
|
```
|
|
609
609
|
|
|
610
|
+
### Artifact Storage
|
|
611
|
+
|
|
612
|
+
Podstack stores every artifact you log — model files, plots, CSV exports, anything — in the project's cloud artifact store. Artifacts are keyed by run ID, so the same file can be retrieved from any machine, by any project member, at any time.
|
|
613
|
+
|
|
614
|
+
#### `log_artifact()` — upload a file for the active run
|
|
615
|
+
|
|
616
|
+
```python
|
|
617
|
+
# Upload a single file (uses the filename as the artifact path)
|
|
618
|
+
registry.log_artifact("model.pt")
|
|
619
|
+
|
|
620
|
+
# Upload with an explicit path inside the artifact store
|
|
621
|
+
registry.log_artifact("training_curves.png", artifact_path="plots/curves.png")
|
|
622
|
+
registry.log_artifact("feature_importance.csv", artifact_path="analysis/features.csv")
|
|
623
|
+
```
|
|
624
|
+
|
|
625
|
+
**Parameters:**
|
|
626
|
+
|
|
627
|
+
| Parameter | Type | Default | Description |
|
|
628
|
+
|-----------|------|---------|-------------|
|
|
629
|
+
| `local_path` | `str` | required | Path to the local file to upload |
|
|
630
|
+
| `artifact_path` | `str` | filename | Relative path inside the artifact store. Defaults to `os.path.basename(local_path)` |
|
|
631
|
+
|
|
632
|
+
If the artifact store is temporarily unreachable, the SDK saves the file to a local fallback cache (`~/.podstack/artifacts/<run_id>/`) so your run is never interrupted.
|
|
633
|
+
|
|
634
|
+
**Via the `Run` object** — equivalent to calling `registry.log_artifact()`:
|
|
635
|
+
|
|
636
|
+
```python
|
|
637
|
+
with registry.start_run("training-v1") as run:
|
|
638
|
+
run.log_artifact("confusion_matrix.png", artifact_path="plots/confusion_matrix.png")
|
|
639
|
+
run.log_artifact("model.pkl")
|
|
640
|
+
```
|
|
641
|
+
|
|
642
|
+
#### `list_artifacts()` — list all artifacts for a run
|
|
643
|
+
|
|
644
|
+
```python
|
|
645
|
+
artifacts = registry.list_artifacts(run_id)
|
|
646
|
+
for a in artifacts:
|
|
647
|
+
print(f"{a['path']:40s} {a['size'] / 1e6:.1f} MB {a['last_modified']}")
|
|
648
|
+
```
|
|
649
|
+
|
|
650
|
+
**Parameters:**
|
|
651
|
+
|
|
652
|
+
| Parameter | Type | Description |
|
|
653
|
+
|-----------|------|-------------|
|
|
654
|
+
| `run_id` | `str` | ID of the run to query |
|
|
655
|
+
|
|
656
|
+
**Returns:** `list[dict]` — one entry per artifact:
|
|
657
|
+
|
|
658
|
+
| Key | Type | Description |
|
|
659
|
+
|-----|------|-------------|
|
|
660
|
+
| `path` | `str` | Relative artifact path (e.g. `"plots/curves.png"`) |
|
|
661
|
+
| `size` | `int` | File size in bytes |
|
|
662
|
+
| `etag` | `str` | Content hash for integrity verification |
|
|
663
|
+
| `last_modified` | `str` | ISO 8601 upload timestamp |
|
|
664
|
+
|
|
665
|
+
#### `download_artifact()` — retrieve an artifact
|
|
666
|
+
|
|
667
|
+
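Downloads a specific artifact from the cloud store into a local directory. `artifact_path` may name a single file (e.g. `"model/model.pkl"`) or a logged directory (e.g. `"model"`), in which case the whole directory is downloaded. Falls back to the local cache when the store is unreachable.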
|
|
668
|
+
|
|
669
|
+
```python
|
|
670
|
+
# Download a single file
|
|
671
|
+
dest = registry.download_artifact("run-id", "model/model.pkl", "./downloads/")
|
|
672
|
+
print(f"Saved to: {dest}")
|
|
673
|
+
|
|
674
|
+
# Download a whole model directory
|
|
675
|
+
dest = registry.download_artifact("run-id", "model", "./local_models/")
|
|
676
|
+
```
|
|
677
|
+
|
|
678
|
+
**Parameters:**
|
|
679
|
+
|
|
680
|
+
| Parameter | Type | Description |
|
|
681
|
+
|-----------|------|-------------|
|
|
682
|
+
| `run_id` | `str` | ID of the run that logged the artifact |
|
|
683
|
+
| `artifact_path` | `str` | Relative artifact path as logged — a single file (e.g. `"model/model.pkl"`) or a directory (e.g. `"model"`) |
|
|
684
|
+
| `local_path` | `str` | Destination directory |
|
|
685
|
+
|
|
686
|
+
**Returns:** `str` — absolute path to the downloaded file or directory.
|
|
687
|
+
|
|
688
|
+
**Raises:** `ArtifactNotFoundError` if the artifact cannot be found in the store or the local cache.
|
|
689
|
+
|
|
690
|
+
#### Models as artifacts: `log_model()` and `load_model()`
|
|
691
|
+
|
|
692
|
+
`log_model()` serializes your model to disk and uploads every resulting file to the artifact store in one call. `load_model()` resolves the registered model version, downloads any missing files from the store, then deserializes the model — so it works correctly from any machine regardless of where training happened.
|
|
693
|
+
|
|
694
|
+
```python
|
|
695
|
+
# ── Training machine ──────────────────────────────────────────────────────────
|
|
696
|
+
with registry.start_run("bert-finetune-v3") as run:
|
|
697
|
+
# train...
|
|
698
|
+
registry.log_model(model, artifact_path="model", framework="pytorch")
|
|
699
|
+
|
|
700
|
+
registry.register_model("sentiment-bert", run_id=run.id)
|
|
701
|
+
registry.set_model_stage("sentiment-bert", version=3, stage="production")
|
|
702
|
+
|
|
703
|
+
# ── Any machine (CI, inference server, colleague's laptop) ───────────────────
|
|
704
|
+
# Model files are downloaded automatically from the artifact store if not cached
|
|
705
|
+
model = registry.load_model("sentiment-bert", stage="production")
|
|
706
|
+
```
|
|
707
|
+
|
|
708
|
+
**`log_model()` parameters:**
|
|
709
|
+
|
|
710
|
+
| Parameter | Type | Default | Description |
|
|
711
|
+
|-----------|------|---------|-------------|
|
|
712
|
+
| `model` | any | required | Model object (PyTorch, TensorFlow, sklearn, HuggingFace, or any picklable object) |
|
|
713
|
+
| `artifact_path` | `str` | `"model"` | Sub-path inside the artifact store |
|
|
714
|
+
| `framework` | `str` | auto-detected | `"pytorch"`, `"tensorflow"`, `"sklearn"`, `"huggingface"`, or `"pickle"` |
|
|
715
|
+
| `metadata` | `dict` | `None` | Arbitrary key-value metadata stored as run params |
|
|
716
|
+
|
|
717
|
+
**`load_model()` parameters:**
|
|
718
|
+
|
|
719
|
+
| Parameter | Type | Default | Description |
|
|
720
|
+
|-----------|------|---------|-------------|
|
|
721
|
+
| `model_name` | `str` | required | Registered model name |
|
|
722
|
+
| `version` | `int` | `None` | Specific version to load. Mutually exclusive with `stage` |
|
|
723
|
+
| `stage` | `str` | `None` | Stage to load from: `"development"`, `"staging"`, `"production"`, `"archived"`. Mutually exclusive with `version` |
|
|
724
|
+
| `framework` | `str` | from run params | Override framework for deserialization |
|
|
725
|
+
|
|
726
|
+
#### Viewing artifacts in the dashboard
|
|
727
|
+
|
|
728
|
+
Every artifact logged with `log_artifact()` or `log_model()` appears automatically in the **Artifacts tab** of the run's detail page in the Podstack dashboard. No extra steps are needed — the tab populates from the same store the SDK writes to.
|
|
729
|
+
|
|
730
|
+
The Artifacts tab shows:
|
|
731
|
+
|
|
732
|
+
| Column | Description |
|
|
733
|
+
|--------|-------------|
|
|
734
|
+
| **Path** | The relative artifact path as logged (e.g. `model/model.pkl`, `plots/curves.png`) |
|
|
735
|
+
| **Type badge** | File extension, color-coded by category — model weights, data files, images, configs, etc. |
|
|
736
|
+
| **Size** | Formatted file size (B / KB / MB) |
|
|
737
|
+
| **Uploaded** | Timestamp of when the file was stored |
|
|
738
|
+
| **Download** | One-click download button — opens a short-lived direct download link in the browser |
|
|
739
|
+
|
|
740
|
+
A footer below the list shows the combined size of all artifacts for the run.
|
|
741
|
+
|
|
742
|
+
```python
|
|
743
|
+
# Everything logged here shows up in the dashboard Artifacts tab
|
|
744
|
+
with registry.start_run("bert-finetune-v3") as run:
|
|
745
|
+
registry.log_params({"lr": 2e-5, "epochs": 3})
|
|
746
|
+
registry.log_metrics({"accuracy": 0.93})
|
|
747
|
+
|
|
748
|
+
# These all appear as separate rows in the Artifacts tab
|
|
749
|
+
registry.log_artifact("confusion_matrix.png", artifact_path="plots/confusion_matrix.png")
|
|
750
|
+
registry.log_artifact("feature_importance.csv", artifact_path="analysis/features.csv")
|
|
751
|
+
registry.log_model(model, artifact_path="model", framework="pytorch")
|
|
752
|
+
# ↳ each model file (model.pkl, config.json, etc.) appears as its own row
|
|
753
|
+
```
|
|
754
|
+
|
|
755
|
+
#### Access control
|
|
756
|
+
|
|
757
|
+
Artifact upload and download URLs are issued by the registry API, which requires a valid API key and project membership. The URLs are short-lived, so access tracks the current state of your project: a revoked key can no longer generate new URLs, and any URL issued earlier stops working once it expires. Any member of a project can upload and download artifacts for runs within that project.
|
|
758
|
+
|
|
610
759
|
### List and Browse
|
|
611
760
|
|
|
612
761
|
```python
|
|
@@ -618,8 +767,12 @@ experiments = registry.list_experiments()
|
|
|
618
767
|
# List models
|
|
619
768
|
models = registry.list_models()
|
|
620
769
|
|
|
621
|
-
#
|
|
622
|
-
registry.
|
|
770
|
+
# List artifacts for a specific run
|
|
771
|
+
artifacts = registry.list_artifacts(run_id)
|
|
772
|
+
|
|
773
|
+
# Download a specific artifact to a local directory
|
|
774
|
+
dest = registry.download_artifact("run-id", "model/model.pt", "./downloads/")
|
|
775
|
+
print(f"Saved to: {dest}")
|
|
623
776
|
```
|
|
624
777
|
|
|
625
778
|
## GPU Runner - Direct Code Execution
|
|
@@ -47,18 +47,22 @@ __all__ = [
|
|
|
47
47
|
"set_experiment",
|
|
48
48
|
"get_experiment",
|
|
49
49
|
"list_experiments",
|
|
50
|
+
"archive_experiment",
|
|
50
51
|
"start_run",
|
|
51
52
|
"end_run",
|
|
53
|
+
"get_run",
|
|
54
|
+
"list_runs",
|
|
52
55
|
"log_params",
|
|
53
56
|
"log_metrics",
|
|
54
57
|
"log_artifact",
|
|
55
58
|
"set_tag",
|
|
59
|
+
"update_run_notes",
|
|
56
60
|
"register_model",
|
|
57
61
|
"get_model",
|
|
58
62
|
"list_models",
|
|
59
63
|
"set_model_stage",
|
|
60
64
|
"set_model_alias",
|
|
61
|
-
#
|
|
65
|
+
# MLOps helpers
|
|
62
66
|
"log_model",
|
|
63
67
|
"load_model",
|
|
64
68
|
"log_dataset",
|
|
@@ -66,6 +70,32 @@ __all__ = [
|
|
|
66
70
|
"get_metric_history",
|
|
67
71
|
"download_artifact",
|
|
68
72
|
"search_runs",
|
|
73
|
+
"get_run_datasets",
|
|
74
|
+
"get_model_lineage",
|
|
75
|
+
"autolog",
|
|
76
|
+
# HPO Sweeps
|
|
77
|
+
"create_sweep",
|
|
78
|
+
"get_sweep",
|
|
79
|
+
"list_sweeps",
|
|
80
|
+
"suggest_trial_params",
|
|
81
|
+
"create_trial",
|
|
82
|
+
"complete_trial",
|
|
83
|
+
"list_trials",
|
|
84
|
+
"stop_sweep",
|
|
85
|
+
# Alerts
|
|
86
|
+
"create_alert",
|
|
87
|
+
"list_alerts",
|
|
88
|
+
"delete_alert",
|
|
89
|
+
# Approvals
|
|
90
|
+
"list_pending_approvals",
|
|
91
|
+
"approve_promotion",
|
|
92
|
+
"reject_promotion",
|
|
93
|
+
# Schedules
|
|
94
|
+
"create_schedule",
|
|
95
|
+
"get_schedule",
|
|
96
|
+
"update_schedule",
|
|
97
|
+
"delete_schedule",
|
|
98
|
+
"list_schedules",
|
|
69
99
|
# Classes
|
|
70
100
|
"Experiment",
|
|
71
101
|
"Run",
|
|
@@ -400,3 +430,227 @@ def search_runs(
|
|
|
400
430
|
List of matching Run objects.
|
|
401
431
|
"""
|
|
402
432
|
return _get_client().search_runs(experiment_id, status, max_results, offset)
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def get_run(run_id: str):
|
|
436
|
+
"""Get a run by ID."""
|
|
437
|
+
return _get_client().get_run(run_id)
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def list_runs(
|
|
441
|
+
experiment_id: str = None,
|
|
442
|
+
status: str = None,
|
|
443
|
+
limit: int = 20,
|
|
444
|
+
offset: int = 0
|
|
445
|
+
) -> list:
|
|
446
|
+
"""List runs, optionally filtered by experiment or status."""
|
|
447
|
+
return _get_client().list_runs(experiment_id, status, limit, offset)
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
def archive_experiment(experiment_id: str):
|
|
451
|
+
"""Archive an experiment."""
|
|
452
|
+
return _get_client().archive_experiment(experiment_id)
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
def update_run_notes(run_id: str, notes: str):
|
|
456
|
+
"""Update the free-form notes for a run."""
|
|
457
|
+
_get_client().update_run_notes(run_id, notes)
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def get_run_datasets(run_id: str) -> list:
|
|
461
|
+
"""List datasets logged for a run."""
|
|
462
|
+
return _get_client().get_run_datasets(run_id)
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def get_model_lineage(model_id: str) -> dict:
|
|
466
|
+
"""Get full lineage for a model (versions → runs → datasets)."""
|
|
467
|
+
return _get_client().get_model_lineage(model_id)
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def autolog(
|
|
471
|
+
framework: str = None,
|
|
472
|
+
log_every_n_steps: int = 1,
|
|
473
|
+
log_system_metrics: bool = True,
|
|
474
|
+
system_metrics_interval: float = 10.0,
|
|
475
|
+
):
|
|
476
|
+
"""
|
|
477
|
+
Enable automatic logging for ML training frameworks.
|
|
478
|
+
|
|
479
|
+
Supports pytorch_lightning, huggingface, and sklearn.
|
|
480
|
+
Auto-detects available frameworks when framework is None.
|
|
481
|
+
"""
|
|
482
|
+
_get_client().autolog(framework, log_every_n_steps, log_system_metrics, system_metrics_interval)
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
# ==================== HPO Sweeps ====================
|
|
486
|
+
|
|
487
|
+
def create_sweep(
|
|
488
|
+
experiment_id: str,
|
|
489
|
+
name: str,
|
|
490
|
+
search_space: dict,
|
|
491
|
+
strategy: str = "random",
|
|
492
|
+
max_trials: int = 20,
|
|
493
|
+
metric=None,
|
|
494
|
+
direction: str = "minimize",
|
|
495
|
+
) -> dict:
|
|
496
|
+
"""
|
|
497
|
+
Create a hyperparameter optimization sweep.
|
|
498
|
+
|
|
499
|
+
Args:
|
|
500
|
+
experiment_id: Experiment to run trials in.
|
|
501
|
+
name: Sweep name.
|
|
502
|
+
search_space: Dict mapping param names to spec dicts.
|
|
503
|
+
strategy: "random" (default) or "grid".
|
|
504
|
+
max_trials: Maximum number of trials.
|
|
505
|
+
metric: Metric key to optimize (str), or dict with "name" and "direction" keys.
|
|
506
|
+
direction: "minimize" (default) or "maximize". Ignored if metric is a dict.
|
|
507
|
+
|
|
508
|
+
Returns:
|
|
509
|
+
Sweep dict with id, status, etc.
|
|
510
|
+
"""
|
|
511
|
+
if isinstance(metric, dict):
|
|
512
|
+
direction = metric.get("direction", direction)
|
|
513
|
+
metric = metric.get("name", None)
|
|
514
|
+
return _get_client().create_sweep(
|
|
515
|
+
experiment_id, name, search_space, strategy, max_trials, metric, direction
|
|
516
|
+
)
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def get_sweep(sweep_id: str) -> dict:
|
|
520
|
+
"""Get a sweep by ID."""
|
|
521
|
+
return _get_client().get_sweep(sweep_id)
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
def list_sweeps(experiment_id: str) -> list:
|
|
525
|
+
"""List all sweeps for an experiment."""
|
|
526
|
+
return _get_client().list_sweeps(experiment_id)
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def suggest_trial_params(sweep_id: str) -> dict:
|
|
530
|
+
"""Get suggested hyperparameter values for the next trial."""
|
|
531
|
+
return _get_client().suggest_trial_params(sweep_id)
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
def create_trial(sweep_id: str, run_id: str, params: dict) -> dict:
|
|
535
|
+
"""Record a new trial linked to a sweep and run."""
|
|
536
|
+
return _get_client().create_trial(sweep_id, run_id, params)
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
def complete_trial(sweep_id: str, trial_id: str, value: float) -> None:
|
|
540
|
+
"""Mark a trial as completed with its objective metric value."""
|
|
541
|
+
_get_client().complete_trial(sweep_id, trial_id, value)
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
def list_trials(sweep_id: str) -> list:
|
|
545
|
+
"""List all trials for a sweep."""
|
|
546
|
+
return _get_client().list_trials(sweep_id)
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def stop_sweep(sweep_id: str) -> None:
|
|
550
|
+
"""Stop a running sweep."""
|
|
551
|
+
_get_client().stop_sweep(sweep_id)
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
# ==================== Alerts ====================
|
|
555
|
+
|
|
556
|
+
def create_alert(
|
|
557
|
+
run_id: str,
|
|
558
|
+
metric_key: str,
|
|
559
|
+
condition: str,
|
|
560
|
+
threshold: float,
|
|
561
|
+
notify_email: str = None,
|
|
562
|
+
notify_slack: str = None,
|
|
563
|
+
) -> dict:
|
|
564
|
+
"""
|
|
565
|
+
Create a metric threshold alert for a run.
|
|
566
|
+
|
|
567
|
+
Args:
|
|
568
|
+
run_id: Run to monitor.
|
|
569
|
+
metric_key: Metric name to watch.
|
|
570
|
+
condition: One of gt, lt, gte, lte, eq.
|
|
571
|
+
threshold: Trigger threshold value.
|
|
572
|
+
notify_email: Email address to notify.
|
|
573
|
+
notify_slack: Slack webhook URL to notify.
|
|
574
|
+
|
|
575
|
+
Returns:
|
|
576
|
+
Alert dict with id.
|
|
577
|
+
"""
|
|
578
|
+
return _get_client().create_alert(run_id, metric_key, condition, threshold, notify_email, notify_slack)
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
def list_alerts(run_id: str) -> list:
|
|
582
|
+
"""List all alerts for a run."""
|
|
583
|
+
return _get_client().list_alerts(run_id)
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
def delete_alert(alert_id: str) -> None:
|
|
587
|
+
"""Delete an alert by ID."""
|
|
588
|
+
_get_client().delete_alert(alert_id)
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
# ==================== Approvals ====================
|
|
592
|
+
|
|
593
|
+
def list_pending_approvals() -> list:
|
|
594
|
+
"""List all pending model promotion approval requests in the project."""
|
|
595
|
+
return _get_client().list_pending_approvals()
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
def approve_promotion(request_id: str, comment: str = None) -> dict:
|
|
599
|
+
"""Approve a pending model stage promotion request."""
|
|
600
|
+
return _get_client().approve_promotion(request_id, comment)
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
def reject_promotion(request_id: str, comment: str = None) -> dict:
|
|
604
|
+
"""Reject a pending model stage promotion request."""
|
|
605
|
+
return _get_client().reject_promotion(request_id, comment)
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
# ==================== Schedules ====================
|
|
609
|
+
|
|
610
|
+
def create_schedule(
|
|
611
|
+
name: str,
|
|
612
|
+
experiment_id: str,
|
|
613
|
+
cron_expr: str,
|
|
614
|
+
run_name: str = None,
|
|
615
|
+
run_config: dict = None,
|
|
616
|
+
webhook_url: str = None,
|
|
617
|
+
) -> dict:
|
|
618
|
+
"""
|
|
619
|
+
Create a recurring training schedule using a cron expression.
|
|
620
|
+
|
|
621
|
+
Args:
|
|
622
|
+
name: Schedule name.
|
|
623
|
+
experiment_id: Experiment to create runs in.
|
|
624
|
+
cron_expr: 5-field cron expression (e.g. "0 2 * * 1").
|
|
625
|
+
run_name: Base name for created runs.
|
|
626
|
+
run_config: Optional params to log on each scheduled run.
|
|
627
|
+
webhook_url: Optional URL to POST after each run fires.
|
|
628
|
+
|
|
629
|
+
Returns:
|
|
630
|
+
Schedule dict with id, next_fire_at, etc.
|
|
631
|
+
"""
|
|
632
|
+
return _get_client().create_schedule(name, experiment_id, cron_expr, run_name, run_config, webhook_url)
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
def list_schedules() -> list:
|
|
636
|
+
"""List all training schedules in the project."""
|
|
637
|
+
return _get_client().list_schedules()
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
def get_schedule(schedule_id: str) -> dict:
|
|
641
|
+
"""Get a schedule by ID."""
|
|
642
|
+
return _get_client().get_schedule(schedule_id)
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
def update_schedule(
|
|
646
|
+
schedule_id: str,
|
|
647
|
+
enabled: bool = None,
|
|
648
|
+
cron_expr: str = None,
|
|
649
|
+
) -> dict:
|
|
650
|
+
"""Update a schedule's enabled state or cron expression."""
|
|
651
|
+
return _get_client().update_schedule(schedule_id, enabled, cron_expr)
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def delete_schedule(schedule_id: str) -> None:
|
|
655
|
+
"""Delete a schedule."""
|
|
656
|
+
_get_client().delete_schedule(schedule_id)
|