podstack 1.3.10__tar.gz → 1.3.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {podstack-1.3.10 → podstack-1.3.12}/PKG-INFO +1 -1
  2. podstack-1.3.12/podstack/registry/autolog.py +196 -0
  3. {podstack-1.3.10 → podstack-1.3.12}/podstack/registry/client.py +108 -34
  4. {podstack-1.3.10 → podstack-1.3.12}/podstack/registry/experiment.py +60 -0
  5. {podstack-1.3.10 → podstack-1.3.12}/podstack.egg-info/PKG-INFO +1 -1
  6. {podstack-1.3.10 → podstack-1.3.12}/podstack.egg-info/SOURCES.txt +1 -0
  7. {podstack-1.3.10 → podstack-1.3.12}/pyproject.toml +1 -1
  8. {podstack-1.3.10 → podstack-1.3.12}/LICENSE +0 -0
  9. {podstack-1.3.10 → podstack-1.3.12}/README.md +0 -0
  10. {podstack-1.3.10 → podstack-1.3.12}/podstack/__init__.py +0 -0
  11. {podstack-1.3.10 → podstack-1.3.12}/podstack/annotations.py +0 -0
  12. {podstack-1.3.10 → podstack-1.3.12}/podstack/client.py +0 -0
  13. {podstack-1.3.10 → podstack-1.3.12}/podstack/exceptions.py +0 -0
  14. {podstack-1.3.10 → podstack-1.3.12}/podstack/execution.py +0 -0
  15. {podstack-1.3.10 → podstack-1.3.12}/podstack/gpu_runner.py +0 -0
  16. {podstack-1.3.10 → podstack-1.3.12}/podstack/models.py +0 -0
  17. {podstack-1.3.10 → podstack-1.3.12}/podstack/notebook.py +0 -0
  18. {podstack-1.3.10 → podstack-1.3.12}/podstack/registry/__init__.py +0 -0
  19. {podstack-1.3.10 → podstack-1.3.12}/podstack/registry/exceptions.py +0 -0
  20. {podstack-1.3.10 → podstack-1.3.12}/podstack/registry/model.py +0 -0
  21. {podstack-1.3.10 → podstack-1.3.12}/podstack/registry/model_utils.py +0 -0
  22. {podstack-1.3.10 → podstack-1.3.12}/podstack.egg-info/dependency_links.txt +0 -0
  23. {podstack-1.3.10 → podstack-1.3.12}/podstack.egg-info/requires.txt +0 -0
  24. {podstack-1.3.10 → podstack-1.3.12}/podstack.egg-info/top_level.txt +0 -0
  25. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/__init__.py +0 -0
  26. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/app.py +0 -0
  27. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/exceptions.py +0 -0
  28. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/image.py +0 -0
  29. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/runner.py +0 -0
  30. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/secret.py +0 -0
  31. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/utils.py +0 -0
  32. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/volume.py +0 -0
  33. {podstack-1.3.10 → podstack-1.3.12}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: podstack
3
- Version: 1.3.10
3
+ Version: 1.3.12
4
4
  Summary: Official Python SDK for Podstack GPU Notebook Platform
5
5
  Author-email: Podstack <support@podstack.ai>
6
6
  License-Expression: MIT
@@ -0,0 +1,196 @@
1
+ """
2
+ Autolog — automatic metric and parameter logging hooks for popular ML frameworks.
3
+
4
+ Supports:
5
+ - PyTorch Lightning (via LightningCallback)
6
+ - HuggingFace Transformers Trainer (via TrainerCallback)
7
+ - scikit-learn (via fit() monkey-patch)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import functools
13
+ from typing import TYPE_CHECKING, Any, Dict
14
+
15
+ if TYPE_CHECKING:
16
+ from .client import RegistryClient
17
+
18
+
19
+ # ─────────────────────────── PyTorch Lightning ───────────────────────────────
20
+
21
+
22
+ def _install_pytorch_lightning_autolog(client: "RegistryClient", log_every_n_steps: int = 1) -> bool:
23
+ """
24
+ Install a Podstack callback into PyTorch Lightning's global callback list.
25
+
26
+ Returns True if pytorch_lightning is importable, False otherwise.
27
+ """
28
+ try:
29
+ import pytorch_lightning as pl # type: ignore
30
+ except ImportError:
31
+ return False
32
+
33
+ class PodstackCallback(pl.Callback):
34
+ def __init__(self):
35
+ self._step = 0
36
+
37
+ def on_train_epoch_end(self, trainer, pl_module):
38
+ metrics = {k: float(v) for k, v in trainer.callback_metrics.items()
39
+ if not k.startswith("_")}
40
+ if metrics and client._active_run:
41
+ try:
42
+ client.log_metrics(metrics, step=trainer.current_epoch)
43
+ except Exception:
44
+ pass
45
+
46
+ def on_validation_epoch_end(self, trainer, pl_module):
47
+ val_metrics = {k: float(v) for k, v in trainer.callback_metrics.items()
48
+ if k.startswith("val_")}
49
+ if val_metrics and client._active_run:
50
+ try:
51
+ client.log_metrics(val_metrics, step=trainer.current_epoch)
52
+ except Exception:
53
+ pass
54
+
55
+ def on_fit_start(self, trainer, pl_module):
56
+ # Log hyperparams from hparams
57
+ try:
58
+ hparams = dict(pl_module.hparams)
59
+ if hparams and client._active_run:
60
+ client.log_params({k: str(v) for k, v in hparams.items()})
61
+ except Exception:
62
+ pass
63
+
64
+ def on_fit_end(self, trainer, pl_module):
65
+ # Log final metrics
66
+ final = {k: float(v) for k, v in trainer.callback_metrics.items()
67
+ if not k.startswith("_")}
68
+ if final and client._active_run:
69
+ try:
70
+ client.log_metrics(final)
71
+ except Exception:
72
+ pass
73
+
74
+ _callback = PodstackCallback()
75
+
76
+ # Monkey-patch Trainer.__init__ to inject the callback
77
+ _orig_init = pl.Trainer.__init__
78
+
79
+ @functools.wraps(_orig_init)
80
+ def _patched_init(self_trainer, *args, callbacks=None, **kwargs):
81
+ callbacks = list(callbacks or [])
82
+ if not any(isinstance(c, PodstackCallback) for c in callbacks):
83
+ callbacks.append(_callback)
84
+ _orig_init(self_trainer, *args, callbacks=callbacks, **kwargs)
85
+
86
+ pl.Trainer.__init__ = _patched_init
87
+ return True
88
+
89
+
90
+ # ─────────────────────────── HuggingFace Transformers ────────────────────────
91
+
92
+
93
+ def _install_huggingface_autolog(client: "RegistryClient") -> bool:
94
+ """
95
+ Install a Podstack TrainerCallback into HuggingFace Trainer.
96
+
97
+ Returns True if transformers is importable, False otherwise.
98
+ """
99
+ try:
100
+ from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl # type: ignore
101
+ import transformers # type: ignore
102
+ except ImportError:
103
+ return False
104
+
105
+ class PodstackTrainerCallback(TrainerCallback):
106
+ def on_log(self, args: TrainingArguments, state: TrainerState,
107
+ control: TrainerControl, logs: Dict[str, Any] = None, **kwargs):
108
+ if logs and client._active_run:
109
+ metrics = {k: float(v) for k, v in logs.items()
110
+ if isinstance(v, (int, float)) and not k.startswith("_")}
111
+ if metrics:
112
+ try:
113
+ client.log_metrics(metrics, step=state.global_step)
114
+ except Exception:
115
+ pass
116
+
117
+ def on_train_begin(self, args: TrainingArguments, state: TrainerState,
118
+ control: TrainerControl, **kwargs):
119
+ if client._active_run:
120
+ try:
121
+ params = {
122
+ "learning_rate": str(args.learning_rate),
123
+ "per_device_train_batch_size": str(args.per_device_train_batch_size),
124
+ "num_train_epochs": str(args.num_train_epochs),
125
+ "warmup_steps": str(args.warmup_steps),
126
+ "weight_decay": str(args.weight_decay),
127
+ "adam_epsilon": str(args.adam_epsilon),
128
+ "max_grad_norm": str(args.max_grad_norm),
129
+ }
130
+ client.log_params(params)
131
+ except Exception:
132
+ pass
133
+
134
+ _cb = PodstackTrainerCallback()
135
+
136
+ # Monkey-patch Trainer.__init__ to add our callback
137
+ _orig_trainer_init = transformers.Trainer.__init__
138
+
139
+ @functools.wraps(_orig_trainer_init)
140
+ def _patched_trainer_init(self_trainer, *args, callbacks=None, **kwargs):
141
+ callbacks = list(callbacks or [])
142
+ if not any(isinstance(c, PodstackTrainerCallback) for c in callbacks):
143
+ callbacks.append(_cb)
144
+ _orig_trainer_init(self_trainer, *args, callbacks=callbacks, **kwargs)
145
+
146
+ transformers.Trainer.__init__ = _patched_trainer_init
147
+ return True
148
+
149
+
150
+ # ─────────────────────────── scikit-learn ────────────────────────────────────
151
+
152
+
153
+ def _install_sklearn_autolog(client: "RegistryClient") -> bool:
154
+ """
155
+ Wrap sklearn estimator ``fit()`` methods to auto-log params and scores.
156
+
157
+ Returns True if scikit-learn is importable, False otherwise.
158
+ """
159
+ try:
160
+ from sklearn.base import BaseEstimator # type: ignore
161
+ except ImportError:
162
+ return False
163
+
164
+ _orig_fit = BaseEstimator.fit
165
+
166
+ @functools.wraps(_orig_fit)
167
+ def _autolog_fit(self_est, X, y=None, **fit_params):
168
+ # Log estimator params before fitting
169
+ if client._active_run:
170
+ try:
171
+ params = self_est.get_params(deep=True)
172
+ client.log_params({
173
+ f"{type(self_est).__name__}.{k}": str(v)
174
+ for k, v in params.items()
175
+ })
176
+ except Exception:
177
+ pass
178
+
179
+ result = _orig_fit(self_est, X, y, **fit_params)
180
+
181
+ # Log score on training data if possible
182
+ if client._active_run:
183
+ try:
184
+ score = self_est.score(X, y)
185
+ client.log_metrics({f"{type(self_est).__name__}.train_score": float(score)})
186
+ except Exception:
187
+ pass
188
+
189
+ return result
190
+
191
+ # Only patch once to avoid infinite recursion
192
+ if not getattr(BaseEstimator.fit, "_podstack_patched", False):
193
+ BaseEstimator.fit = _autolog_fit
194
+ BaseEstimator.fit._podstack_patched = True # type: ignore
195
+
196
+ return True
@@ -11,7 +11,7 @@ import shutil
11
11
  from typing import Optional, Dict, Any, List
12
12
  import requests
13
13
 
14
- from .experiment import Experiment, Run, Metric, Param
14
+ from .experiment import Experiment, Run, Metric, Param, Dataset
15
15
  from .model import RegisteredModel, ModelVersion, ModelAlias, StageTransition
16
16
  from .exceptions import (
17
17
  RegistryError,
@@ -413,15 +413,16 @@ class RegistryClient:
413
413
 
414
414
  try:
415
415
  data = self._request("POST", "/models", json=body)
416
+ model_data = data.get("model", data)
417
+ model = RegisteredModel.from_dict(model_data, client=self)
416
418
  except RegistryError as e:
417
419
  if "already exists" in str(e).lower():
418
- data = self._request("GET", f"/models/{name}")
420
+ # get_model() handles both UUID and name lookup correctly
421
+ model = self.get_model(name)
419
422
  else:
420
423
  raise
421
- model_data = data.get("model", data)
422
- model = RegisteredModel.from_dict(model_data, client=self)
423
424
 
424
- # Auto-create version 1 when run_id is provided.
425
+ # Auto-create a version when run_id is provided.
425
426
  # Only pass source when the artifact dir actually exists locally;
426
427
  # otherwise let the backend derive it from the run's artifact URI.
427
428
  if run_id:
@@ -431,10 +432,9 @@ class RegistryClient:
431
432
  self.create_model_version(
432
433
  model.id, run_id=run_id, source=source
433
434
  )
434
- except RegistryError as e:
435
- # If a version already exists for this model, that's fine
436
- # the caller can query list_model_versions() to see what exists.
437
- if "already exists" not in str(e).lower():
435
+ except RegistryError as ve:
436
+ # Version may already exist — that is fine.
437
+ if "already exists" not in str(ve).lower():
438
438
  raise
439
439
 
440
440
  return model
@@ -812,26 +812,43 @@ class RegistryClient:
812
812
  self,
813
813
  name: str,
814
814
  path: str = None,
815
+ df=None,
816
+ context: str = "training",
817
+ digest: str = None,
818
+ source_type: str = "local",
819
+ tags: dict = None,
820
+ # Legacy params kept for backward compat:
815
821
  version: str = None,
816
822
  description: str = None,
817
- digest: str = None,
818
823
  num_rows: int = None,
819
- num_features: int = None
820
- ):
824
+ num_features: int = None,
825
+ ) -> Dataset:
821
826
  """
822
- Log dataset metadata for the active run.
827
+ Log a dataset to the active run as a first-class dataset resource.
823
828
 
824
- All metadata is stored as run params via ``POST /runs/:id/params``
825
- using a ``dataset.`` prefix for easy retrieval.
829
+ Auto-enrichment:
830
+ - If ``df`` (pandas DataFrame) is provided, schema and profile are computed automatically.
831
+ - If ``path`` is provided and ``digest`` is not set, SHA-256 is computed for files
832
+ under 500 MB to enable cross-run deduplication.
833
+
834
+ Requires an active run: raises ``NoActiveRunError`` when no run is
835
+ active (there is no legacy param-based fallback in this path).
826
836
 
827
837
  Args:
828
838
  name: Dataset name.
829
- path: Dataset path or URI (e.g., "s3://bucket/data").
830
- version: Dataset version string.
831
- description: Dataset description.
832
- digest: Hash/digest of the dataset for reproducibility.
833
- num_rows: Number of rows/samples in the dataset.
834
- num_features: Number of features/columns.
839
+ path: Local file path or URI (e.g., ``s3://bucket/data.csv``).
840
+ df: Optional pandas DataFrame.
841
+ context: One of "training", "validation", "test" (default: "training").
842
+ digest: SHA-256 hex digest. Computed from ``path`` if not provided.
843
+ source_type: One of "local", "s3", "gcs", "url" (default: "local").
844
+ tags: Optional dict of string tags.
845
+ version: Ignored (legacy compat).
846
+ description: Ignored (legacy compat).
847
+ num_rows: Ignored (legacy compat; auto-computed from ``df``).
848
+ num_features: Ignored (legacy compat; auto-computed from ``df``).
849
+
850
+ Returns:
851
+ Dataset object.
835
852
 
836
853
  Raises:
837
854
  NoActiveRunError: If no run is active.
@@ -839,21 +856,78 @@ class RegistryClient:
839
856
  if not self._active_run:
840
857
  raise NoActiveRunError()
841
858
 
842
- params = {"dataset.name": name}
843
- if path:
844
- params["dataset.path"] = path
845
- if version:
846
- params["dataset.version"] = version
847
- if description:
848
- params["dataset.description"] = description
859
+ schema: Dict[str, str] = {}
860
+ profile: Dict[str, Any] = {}
861
+
862
+ # Auto-compute schema + profile from DataFrame
863
+ if df is not None:
864
+ try:
865
+ schema = {col: str(dtype) for col, dtype in df.dtypes.items()}
866
+ profile = {
867
+ "num_rows": len(df),
868
+ "num_features": len(df.columns),
869
+ }
870
+ except Exception:
871
+ pass
872
+
873
+ # Auto-compute digest from local file
874
+ if path and not digest and source_type == "local":
875
+ import os
876
+ try:
877
+ file_size = os.path.getsize(path)
878
+ if file_size <= 500 * 1024 * 1024: # Skip files > 500 MB
879
+ import hashlib
880
+ sha256 = hashlib.sha256()
881
+ with open(path, "rb") as f:
882
+ for chunk in iter(lambda: f.read(65536), b""):
883
+ sha256.update(chunk)
884
+ digest = sha256.hexdigest()
885
+ except (OSError, IOError):
886
+ pass
887
+
888
+ body: Dict[str, Any] = {
889
+ "name": name,
890
+ "source_type": source_type,
891
+ "context": context,
892
+ }
849
893
  if digest:
850
- params["dataset.digest"] = digest
851
- if num_rows is not None:
852
- params["dataset.num_rows"] = str(num_rows)
853
- if num_features is not None:
854
- params["dataset.num_features"] = str(num_features)
894
+ body["digest"] = digest
895
+ if path:
896
+ body["source"] = path
897
+ if schema:
898
+ body["schema"] = schema
899
+ if profile:
900
+ body["profile"] = profile
901
+ if tags:
902
+ body["tags"] = tags
903
+
904
+ data = self._request("POST", f"/runs/{self._active_run.id}/datasets", json=body)
905
+ dataset_data = data.get("dataset", data)
906
+ return Dataset.from_dict(dataset_data)
907
+
908
+ def get_run_datasets(self, run_id: str) -> List[Dataset]:
909
+ """Return all datasets linked to a run.
855
910
 
856
- self.log_params(params)
911
+ Args:
912
+ run_id: Run ID.
913
+
914
+ Returns:
915
+ List of Dataset objects.
916
+ """
917
+ data = self._request("GET", f"/runs/{run_id}/datasets")
918
+ return [Dataset.from_dict(d) for d in data.get("datasets", [])]
919
+
920
+ def get_model_lineage(self, model_id: str) -> Dict[str, Any]:
921
+ """Return the full dataset lineage for all versions of a model.
922
+
923
+ Args:
924
+ model_id: Registered model ID.
925
+
926
+ Returns:
927
+ Dict with ``model_id`` and ``versions`` list, each containing
928
+ ``version``, ``stage``, ``run_id``, ``run_name``, and ``datasets``.
929
+ """
930
+ return self._request("GET", f"/models/{model_id}/lineage")
857
931
 
858
932
  def compare_runs(
859
933
  self,
@@ -9,6 +9,50 @@ from typing import Optional, Dict, Any, List
9
9
  from datetime import datetime
10
10
 
11
11
 
12
+ @dataclass
13
+ class Dataset:
14
+ """Represents a tracked dataset."""
15
+
16
+ id: str
17
+ project_id: str
18
+ name: str
19
+ digest: str = ""
20
+ source_type: str = ""
21
+ source: str = ""
22
+ schema: Dict[str, str] = field(default_factory=dict)
23
+ profile: Dict[str, Any] = field(default_factory=dict)
24
+ tags: Dict[str, str] = field(default_factory=dict)
25
+ created_at: Optional[str] = None
26
+
27
+ @classmethod
28
+ def from_dict(cls, data: Dict[str, Any]) -> "Dataset":
29
+ """Create a Dataset from a dict."""
30
+ import json
31
+
32
+ def _parse_json_field(value, default):
33
+ if value is None:
34
+ return default
35
+ if isinstance(value, (dict, list)):
36
+ return value
37
+ try:
38
+ return json.loads(value)
39
+ except (ValueError, TypeError):
40
+ return default
41
+
42
+ return cls(
43
+ id=data.get("id", ""),
44
+ project_id=data.get("project_id", ""),
45
+ name=data.get("name", ""),
46
+ digest=data.get("digest", ""),
47
+ source_type=data.get("source_type", ""),
48
+ source=data.get("source", ""),
49
+ schema=_parse_json_field(data.get("schema"), {}),
50
+ profile=_parse_json_field(data.get("profile"), {}),
51
+ tags=_parse_json_field(data.get("tags"), {}),
52
+ created_at=data.get("created_at"),
53
+ )
54
+
55
+
12
56
  @dataclass
13
57
  class Experiment:
14
58
  """Represents an experiment in the registry."""
@@ -202,6 +246,22 @@ class Run:
202
246
  if self._client:
203
247
  self._client.log_model(model, artifact_path=artifact_path, framework=framework, metadata=metadata)
204
248
 
249
+ def log_dataset(self, name: str, path: str = None, df=None, context: str = "training", **kwargs) -> "Dataset":
250
+ """Log a dataset to this run.
251
+
252
+ Args:
253
+ name: Dataset name.
254
+ path: Local file path or URI.
255
+ df: Optional pandas DataFrame — schema and profile are auto-computed.
256
+ context: "training", "validation", or "test".
257
+ **kwargs: Extra keyword args forwarded to RegistryClient.log_dataset().
258
+
259
+ Returns:
260
+ Dataset object.
261
+ """
262
+ if self._client:
263
+ return self._client.log_dataset(name=name, path=path, df=df, context=context, **kwargs)
264
+
205
265
  def set_tag(self, key: str, value: str):
206
266
  """Set a tag on this run."""
207
267
  if self._client:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: podstack
3
- Version: 1.3.10
3
+ Version: 1.3.12
4
4
  Summary: Official Python SDK for Podstack GPU Notebook Platform
5
5
  Author-email: Podstack <support@podstack.ai>
6
6
  License-Expression: MIT
@@ -15,6 +15,7 @@ podstack.egg-info/dependency_links.txt
15
15
  podstack.egg-info/requires.txt
16
16
  podstack.egg-info/top_level.txt
17
17
  podstack/registry/__init__.py
18
+ podstack/registry/autolog.py
18
19
  podstack/registry/client.py
19
20
  podstack/registry/exceptions.py
20
21
  podstack/registry/experiment.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "podstack"
7
- version = "1.3.10"
7
+ version = "1.3.12"
8
8
  description = "Official Python SDK for Podstack GPU Notebook Platform"
9
9
  readme = "README.md"
10
10
  license = "MIT"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes