podstack 1.3.10__tar.gz → 1.3.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {podstack-1.3.10 → podstack-1.3.12}/PKG-INFO +1 -1
  2. podstack-1.3.12/podstack/registry/autolog.py +196 -0
  3. {podstack-1.3.10 → podstack-1.3.12}/podstack/registry/client.py +108 -34
  4. {podstack-1.3.10 → podstack-1.3.12}/podstack/registry/experiment.py +60 -0
  5. {podstack-1.3.10 → podstack-1.3.12}/podstack.egg-info/PKG-INFO +1 -1
  6. {podstack-1.3.10 → podstack-1.3.12}/podstack.egg-info/SOURCES.txt +1 -0
  7. {podstack-1.3.10 → podstack-1.3.12}/pyproject.toml +1 -1
  8. {podstack-1.3.10 → podstack-1.3.12}/LICENSE +0 -0
  9. {podstack-1.3.10 → podstack-1.3.12}/README.md +0 -0
  10. {podstack-1.3.10 → podstack-1.3.12}/podstack/__init__.py +0 -0
  11. {podstack-1.3.10 → podstack-1.3.12}/podstack/annotations.py +0 -0
  12. {podstack-1.3.10 → podstack-1.3.12}/podstack/client.py +0 -0
  13. {podstack-1.3.10 → podstack-1.3.12}/podstack/exceptions.py +0 -0
  14. {podstack-1.3.10 → podstack-1.3.12}/podstack/execution.py +0 -0
  15. {podstack-1.3.10 → podstack-1.3.12}/podstack/gpu_runner.py +0 -0
  16. {podstack-1.3.10 → podstack-1.3.12}/podstack/models.py +0 -0
  17. {podstack-1.3.10 → podstack-1.3.12}/podstack/notebook.py +0 -0
  18. {podstack-1.3.10 → podstack-1.3.12}/podstack/registry/__init__.py +0 -0
  19. {podstack-1.3.10 → podstack-1.3.12}/podstack/registry/exceptions.py +0 -0
  20. {podstack-1.3.10 → podstack-1.3.12}/podstack/registry/model.py +0 -0
  21. {podstack-1.3.10 → podstack-1.3.12}/podstack/registry/model_utils.py +0 -0
  22. {podstack-1.3.10 → podstack-1.3.12}/podstack.egg-info/dependency_links.txt +0 -0
  23. {podstack-1.3.10 → podstack-1.3.12}/podstack.egg-info/requires.txt +0 -0
  24. {podstack-1.3.10 → podstack-1.3.12}/podstack.egg-info/top_level.txt +0 -0
  25. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/__init__.py +0 -0
  26. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/app.py +0 -0
  27. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/exceptions.py +0 -0
  28. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/image.py +0 -0
  29. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/runner.py +0 -0
  30. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/secret.py +0 -0
  31. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/utils.py +0 -0
  32. {podstack-1.3.10 → podstack-1.3.12}/podstack_gpu/volume.py +0 -0
  33. {podstack-1.3.10 → podstack-1.3.12}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: podstack
3
- Version: 1.3.10
3
+ Version: 1.3.12
4
4
  Summary: Official Python SDK for Podstack GPU Notebook Platform
5
5
  Author-email: Podstack <support@podstack.ai>
6
6
  License-Expression: MIT
@@ -0,0 +1,196 @@
1
+ """
2
+ Autolog — automatic metric and parameter logging hooks for popular ML frameworks.
3
+
4
+ Supports:
5
+ - PyTorch Lightning (via LightningCallback)
6
+ - HuggingFace Transformers Trainer (via TrainerCallback)
7
+ - scikit-learn (via fit() monkey-patch)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import functools
13
+ from typing import TYPE_CHECKING, Any, Dict
14
+
15
+ if TYPE_CHECKING:
16
+ from .client import RegistryClient
17
+
18
+
19
+ # ─────────────────────────── PyTorch Lightning ───────────────────────────────
20
+
21
+
22
+ def _install_pytorch_lightning_autolog(client: "RegistryClient", log_every_n_steps: int = 1) -> bool:
23
+ """
24
+ Install a Podstack callback into PyTorch Lightning's global callback list.
25
+
26
+ Returns True if pytorch_lightning is importable, False otherwise.
27
+ """
28
+ try:
29
+ import pytorch_lightning as pl # type: ignore
30
+ except ImportError:
31
+ return False
32
+
33
+ class PodstackCallback(pl.Callback):
34
+ def __init__(self):
35
+ self._step = 0
36
+
37
+ def on_train_epoch_end(self, trainer, pl_module):
38
+ metrics = {k: float(v) for k, v in trainer.callback_metrics.items()
39
+ if not k.startswith("_")}
40
+ if metrics and client._active_run:
41
+ try:
42
+ client.log_metrics(metrics, step=trainer.current_epoch)
43
+ except Exception:
44
+ pass
45
+
46
+ def on_validation_epoch_end(self, trainer, pl_module):
47
+ val_metrics = {k: float(v) for k, v in trainer.callback_metrics.items()
48
+ if k.startswith("val_")}
49
+ if val_metrics and client._active_run:
50
+ try:
51
+ client.log_metrics(val_metrics, step=trainer.current_epoch)
52
+ except Exception:
53
+ pass
54
+
55
+ def on_fit_start(self, trainer, pl_module):
56
+ # Log hyperparams from hparams
57
+ try:
58
+ hparams = dict(pl_module.hparams)
59
+ if hparams and client._active_run:
60
+ client.log_params({k: str(v) for k, v in hparams.items()})
61
+ except Exception:
62
+ pass
63
+
64
+ def on_fit_end(self, trainer, pl_module):
65
+ # Log final metrics
66
+ final = {k: float(v) for k, v in trainer.callback_metrics.items()
67
+ if not k.startswith("_")}
68
+ if final and client._active_run:
69
+ try:
70
+ client.log_metrics(final)
71
+ except Exception:
72
+ pass
73
+
74
+ _callback = PodstackCallback()
75
+
76
+ # Monkey-patch Trainer.__init__ to inject the callback
77
+ _orig_init = pl.Trainer.__init__
78
+
79
+ @functools.wraps(_orig_init)
80
+ def _patched_init(self_trainer, *args, callbacks=None, **kwargs):
81
+ callbacks = list(callbacks or [])
82
+ if not any(isinstance(c, PodstackCallback) for c in callbacks):
83
+ callbacks.append(_callback)
84
+ _orig_init(self_trainer, *args, callbacks=callbacks, **kwargs)
85
+
86
+ pl.Trainer.__init__ = _patched_init
87
+ return True
88
+
89
+
90
+ # ─────────────────────────── HuggingFace Transformers ────────────────────────
91
+
92
+
93
+ def _install_huggingface_autolog(client: "RegistryClient") -> bool:
94
+ """
95
+ Install a Podstack TrainerCallback into HuggingFace Trainer.
96
+
97
+ Returns True if transformers is importable, False otherwise.
98
+ """
99
+ try:
100
+ from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl # type: ignore
101
+ import transformers # type: ignore
102
+ except ImportError:
103
+ return False
104
+
105
+ class PodstackTrainerCallback(TrainerCallback):
106
+ def on_log(self, args: TrainingArguments, state: TrainerState,
107
+ control: TrainerControl, logs: Dict[str, Any] = None, **kwargs):
108
+ if logs and client._active_run:
109
+ metrics = {k: float(v) for k, v in logs.items()
110
+ if isinstance(v, (int, float)) and not k.startswith("_")}
111
+ if metrics:
112
+ try:
113
+ client.log_metrics(metrics, step=state.global_step)
114
+ except Exception:
115
+ pass
116
+
117
+ def on_train_begin(self, args: TrainingArguments, state: TrainerState,
118
+ control: TrainerControl, **kwargs):
119
+ if client._active_run:
120
+ try:
121
+ params = {
122
+ "learning_rate": str(args.learning_rate),
123
+ "per_device_train_batch_size": str(args.per_device_train_batch_size),
124
+ "num_train_epochs": str(args.num_train_epochs),
125
+ "warmup_steps": str(args.warmup_steps),
126
+ "weight_decay": str(args.weight_decay),
127
+ "adam_epsilon": str(args.adam_epsilon),
128
+ "max_grad_norm": str(args.max_grad_norm),
129
+ }
130
+ client.log_params(params)
131
+ except Exception:
132
+ pass
133
+
134
+ _cb = PodstackTrainerCallback()
135
+
136
+ # Monkey-patch Trainer.__init__ to add our callback
137
+ _orig_trainer_init = transformers.Trainer.__init__
138
+
139
+ @functools.wraps(_orig_trainer_init)
140
+ def _patched_trainer_init(self_trainer, *args, callbacks=None, **kwargs):
141
+ callbacks = list(callbacks or [])
142
+ if not any(isinstance(c, PodstackTrainerCallback) for c in callbacks):
143
+ callbacks.append(_cb)
144
+ _orig_trainer_init(self_trainer, *args, callbacks=callbacks, **kwargs)
145
+
146
+ transformers.Trainer.__init__ = _patched_trainer_init
147
+ return True
148
+
149
+
150
+ # ─────────────────────────── scikit-learn ────────────────────────────────────
151
+
152
+
153
+ def _install_sklearn_autolog(client: "RegistryClient") -> bool:
154
+ """
155
+ Wrap sklearn estimator ``fit()`` methods to auto-log params and scores.
156
+
157
+ Returns True if scikit-learn is importable, False otherwise.
158
+ """
159
+ try:
160
+ from sklearn.base import BaseEstimator # type: ignore
161
+ except ImportError:
162
+ return False
163
+
164
+ _orig_fit = BaseEstimator.fit
165
+
166
+ @functools.wraps(_orig_fit)
167
+ def _autolog_fit(self_est, X, y=None, **fit_params):
168
+ # Log estimator params before fitting
169
+ if client._active_run:
170
+ try:
171
+ params = self_est.get_params(deep=True)
172
+ client.log_params({
173
+ f"{type(self_est).__name__}.{k}": str(v)
174
+ for k, v in params.items()
175
+ })
176
+ except Exception:
177
+ pass
178
+
179
+ result = _orig_fit(self_est, X, y, **fit_params)
180
+
181
+ # Log score on training data if possible
182
+ if client._active_run:
183
+ try:
184
+ score = self_est.score(X, y)
185
+ client.log_metrics({f"{type(self_est).__name__}.train_score": float(score)})
186
+ except Exception:
187
+ pass
188
+
189
+ return result
190
+
191
+ # Only patch once to avoid infinite recursion
192
+ if not getattr(BaseEstimator.fit, "_podstack_patched", False):
193
+ BaseEstimator.fit = _autolog_fit
194
+ BaseEstimator.fit._podstack_patched = True # type: ignore
195
+
196
+ return True
@@ -11,7 +11,7 @@ import shutil
11
11
  from typing import Optional, Dict, Any, List
12
12
  import requests
13
13
 
14
- from .experiment import Experiment, Run, Metric, Param
14
+ from .experiment import Experiment, Run, Metric, Param, Dataset
15
15
  from .model import RegisteredModel, ModelVersion, ModelAlias, StageTransition
16
16
  from .exceptions import (
17
17
  RegistryError,
@@ -413,15 +413,16 @@ class RegistryClient:
413
413
 
414
414
  try:
415
415
  data = self._request("POST", "/models", json=body)
416
+ model_data = data.get("model", data)
417
+ model = RegisteredModel.from_dict(model_data, client=self)
416
418
  except RegistryError as e:
417
419
  if "already exists" in str(e).lower():
418
- data = self._request("GET", f"/models/{name}")
420
+ # get_model() handles both UUID and name lookup correctly
421
+ model = self.get_model(name)
419
422
  else:
420
423
  raise
421
- model_data = data.get("model", data)
422
- model = RegisteredModel.from_dict(model_data, client=self)
423
424
 
424
- # Auto-create version 1 when run_id is provided.
425
+ # Auto-create a version when run_id is provided.
425
426
  # Only pass source when the artifact dir actually exists locally;
426
427
  # otherwise let the backend derive it from the run's artifact URI.
427
428
  if run_id:
@@ -431,10 +432,9 @@ class RegistryClient:
431
432
  self.create_model_version(
432
433
  model.id, run_id=run_id, source=source
433
434
  )
434
- except RegistryError as e:
435
- # If a version already exists for this model, that's fine
436
- # the caller can query list_model_versions() to see what exists.
437
- if "already exists" not in str(e).lower():
435
+ except RegistryError as ve:
436
+ # Version may already exist — that is fine.
437
+ if "already exists" not in str(ve).lower():
438
438
  raise
439
439
 
440
440
  return model
@@ -812,26 +812,43 @@ class RegistryClient:
812
812
  self,
813
813
  name: str,
814
814
  path: str = None,
815
+ df=None,
816
+ context: str = "training",
817
+ digest: str = None,
818
+ source_type: str = "local",
819
+ tags: dict = None,
820
+ # Legacy params kept for backward compat:
815
821
  version: str = None,
816
822
  description: str = None,
817
- digest: str = None,
818
823
  num_rows: int = None,
819
- num_features: int = None
820
- ):
824
+ num_features: int = None,
825
+ ) -> Dataset:
821
826
  """
822
- Log dataset metadata for the active run.
827
+ Log a dataset to the active run as a first-class dataset resource.
823
828
 
824
- All metadata is stored as run params via ``POST /runs/:id/params``
825
- using a ``dataset.`` prefix for easy retrieval.
829
+ Auto-enrichment:
830
+ - If ``df`` (pandas DataFrame) is provided, schema and profile are computed automatically.
831
+ - If ``path`` is provided and ``digest`` is not set, SHA-256 is computed for files
832
+ under 500 MB to enable cross-run deduplication.
833
+
834
+ Requires an active run: raises ``NoActiveRunError`` when no run is
835
+ active (there is no legacy param-based fallback in this path).
826
836
 
827
837
  Args:
828
838
  name: Dataset name.
829
- path: Dataset path or URI (e.g., "s3://bucket/data").
830
- version: Dataset version string.
831
- description: Dataset description.
832
- digest: Hash/digest of the dataset for reproducibility.
833
- num_rows: Number of rows/samples in the dataset.
834
- num_features: Number of features/columns.
839
+ path: Local file path or URI (e.g., ``s3://bucket/data.csv``).
840
+ df: Optional pandas DataFrame.
841
+ context: One of "training", "validation", "test" (default: "training").
842
+ digest: SHA-256 hex digest. Computed from ``path`` if not provided.
843
+ source_type: One of "local", "s3", "gcs", "url" (default: "local").
844
+ tags: Optional dict of string tags.
845
+ version: Ignored (legacy compat).
846
+ description: Ignored (legacy compat).
847
+ num_rows: Ignored (legacy compat; auto-computed from ``df``).
848
+ num_features: Ignored (legacy compat; auto-computed from ``df``).
849
+
850
+ Returns:
851
+ Dataset object.
835
852
 
836
853
  Raises:
837
854
  NoActiveRunError: If no run is active.
@@ -839,21 +856,78 @@ class RegistryClient:
839
856
  if not self._active_run:
840
857
  raise NoActiveRunError()
841
858
 
842
- params = {"dataset.name": name}
843
- if path:
844
- params["dataset.path"] = path
845
- if version:
846
- params["dataset.version"] = version
847
- if description:
848
- params["dataset.description"] = description
859
+ schema: Dict[str, str] = {}
860
+ profile: Dict[str, Any] = {}
861
+
862
+ # Auto-compute schema + profile from DataFrame
863
+ if df is not None:
864
+ try:
865
+ schema = {col: str(dtype) for col, dtype in df.dtypes.items()}
866
+ profile = {
867
+ "num_rows": len(df),
868
+ "num_features": len(df.columns),
869
+ }
870
+ except Exception:
871
+ pass
872
+
873
+ # Auto-compute digest from local file
874
+ if path and not digest and source_type == "local":
875
+ import os
876
+ try:
877
+ file_size = os.path.getsize(path)
878
+ if file_size <= 500 * 1024 * 1024: # Skip files > 500 MB
879
+ import hashlib
880
+ sha256 = hashlib.sha256()
881
+ with open(path, "rb") as f:
882
+ for chunk in iter(lambda: f.read(65536), b""):
883
+ sha256.update(chunk)
884
+ digest = sha256.hexdigest()
885
+ except (OSError, IOError):
886
+ pass
887
+
888
+ body: Dict[str, Any] = {
889
+ "name": name,
890
+ "source_type": source_type,
891
+ "context": context,
892
+ }
849
893
  if digest:
850
- params["dataset.digest"] = digest
851
- if num_rows is not None:
852
- params["dataset.num_rows"] = str(num_rows)
853
- if num_features is not None:
854
- params["dataset.num_features"] = str(num_features)
894
+ body["digest"] = digest
895
+ if path:
896
+ body["source"] = path
897
+ if schema:
898
+ body["schema"] = schema
899
+ if profile:
900
+ body["profile"] = profile
901
+ if tags:
902
+ body["tags"] = tags
903
+
904
+ data = self._request("POST", f"/runs/{self._active_run.id}/datasets", json=body)
905
+ dataset_data = data.get("dataset", data)
906
+ return Dataset.from_dict(dataset_data)
907
+
908
+ def get_run_datasets(self, run_id: str) -> List[Dataset]:
909
+ """Return all datasets linked to a run.
855
910
 
856
- self.log_params(params)
911
+ Args:
912
+ run_id: Run ID.
913
+
914
+ Returns:
915
+ List of Dataset objects.
916
+ """
917
+ data = self._request("GET", f"/runs/{run_id}/datasets")
918
+ return [Dataset.from_dict(d) for d in data.get("datasets", [])]
919
+
920
+ def get_model_lineage(self, model_id: str) -> Dict[str, Any]:
921
+ """Return the full dataset lineage for all versions of a model.
922
+
923
+ Args:
924
+ model_id: Registered model ID.
925
+
926
+ Returns:
927
+ Dict with ``model_id`` and ``versions`` list, each containing
928
+ ``version``, ``stage``, ``run_id``, ``run_name``, and ``datasets``.
929
+ """
930
+ return self._request("GET", f"/models/{model_id}/lineage")
857
931
 
858
932
  def compare_runs(
859
933
  self,
@@ -9,6 +9,50 @@ from typing import Optional, Dict, Any, List
9
9
  from datetime import datetime
10
10
 
11
11
 
12
+ @dataclass
13
+ class Dataset:
14
+ """Represents a tracked dataset."""
15
+
16
+ id: str
17
+ project_id: str
18
+ name: str
19
+ digest: str = ""
20
+ source_type: str = ""
21
+ source: str = ""
22
+ schema: Dict[str, str] = field(default_factory=dict)
23
+ profile: Dict[str, Any] = field(default_factory=dict)
24
+ tags: Dict[str, str] = field(default_factory=dict)
25
+ created_at: Optional[str] = None
26
+
27
+ @classmethod
28
+ def from_dict(cls, data: Dict[str, Any]) -> "Dataset":
29
+ """Create a Dataset from a dict."""
30
+ import json
31
+
32
+ def _parse_json_field(value, default):
33
+ if value is None:
34
+ return default
35
+ if isinstance(value, (dict, list)):
36
+ return value
37
+ try:
38
+ return json.loads(value)
39
+ except (ValueError, TypeError):
40
+ return default
41
+
42
+ return cls(
43
+ id=data.get("id", ""),
44
+ project_id=data.get("project_id", ""),
45
+ name=data.get("name", ""),
46
+ digest=data.get("digest", ""),
47
+ source_type=data.get("source_type", ""),
48
+ source=data.get("source", ""),
49
+ schema=_parse_json_field(data.get("schema"), {}),
50
+ profile=_parse_json_field(data.get("profile"), {}),
51
+ tags=_parse_json_field(data.get("tags"), {}),
52
+ created_at=data.get("created_at"),
53
+ )
54
+
55
+
12
56
  @dataclass
13
57
  class Experiment:
14
58
  """Represents an experiment in the registry."""
@@ -202,6 +246,22 @@ class Run:
202
246
  if self._client:
203
247
  self._client.log_model(model, artifact_path=artifact_path, framework=framework, metadata=metadata)
204
248
 
249
+ def log_dataset(self, name: str, path: str = None, df=None, context: str = "training", **kwargs) -> "Dataset":
250
+ """Log a dataset to this run.
251
+
252
+ Args:
253
+ name: Dataset name.
254
+ path: Local file path or URI.
255
+ df: Optional pandas DataFrame — schema and profile are auto-computed.
256
+ context: "training", "validation", or "test".
257
+ **kwargs: Extra keyword args forwarded to RegistryClient.log_dataset().
258
+
259
+ Returns:
260
+ Dataset object.
261
+ """
262
+ if self._client:
263
+ return self._client.log_dataset(name=name, path=path, df=df, context=context, **kwargs)
264
+
205
265
  def set_tag(self, key: str, value: str):
206
266
  """Set a tag on this run."""
207
267
  if self._client:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: podstack
3
- Version: 1.3.10
3
+ Version: 1.3.12
4
4
  Summary: Official Python SDK for Podstack GPU Notebook Platform
5
5
  Author-email: Podstack <support@podstack.ai>
6
6
  License-Expression: MIT
@@ -15,6 +15,7 @@ podstack.egg-info/dependency_links.txt
15
15
  podstack.egg-info/requires.txt
16
16
  podstack.egg-info/top_level.txt
17
17
  podstack/registry/__init__.py
18
+ podstack/registry/autolog.py
18
19
  podstack/registry/client.py
19
20
  podstack/registry/exceptions.py
20
21
  podstack/registry/experiment.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "podstack"
7
- version = "1.3.10"
7
+ version = "1.3.12"
8
8
  description = "Official Python SDK for Podstack GPU Notebook Platform"
9
9
  readme = "README.md"
10
10
  license = "MIT"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes