PyPI - sunstone-py - Versions diffs - 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl - Mend

sunstone-py 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

sunstone/_release.py +23 -12
sunstone/dataframe.py +16 -89
sunstone/datasets.py +28 -6
sunstone/lineage.py +37 -27
{sunstone_py-0.5.1.dist-info → sunstone_py-0.5.3.dist-info}/METADATA +3 -5
sunstone_py-0.5.3.dist-info/RECORD +15 -0
sunstone_py-0.5.1.dist-info/RECORD +0 -15
{sunstone_py-0.5.1.dist-info → sunstone_py-0.5.3.dist-info}/WHEEL +0 -0
{sunstone_py-0.5.1.dist-info → sunstone_py-0.5.3.dist-info}/entry_points.txt +0 -0
{sunstone_py-0.5.1.dist-info → sunstone_py-0.5.3.dist-info}/licenses/LICENSE +0 -0
{sunstone_py-0.5.1.dist-info → sunstone_py-0.5.3.dist-info}/top_level.txt +0 -0

sunstone/_release.py CHANGED Viewed

@@ -13,6 +13,17 @@ import sys
 from datetime import date
 from pathlib import Path
+try:
+    import tomllib
+except ModuleNotFoundError:
+    import tomli as tomllib  # type: ignore[import-not-found,no-redef]
+try:
+    import tomli_w
+except ModuleNotFoundError:
+    print("Error: tomli_w not found. Install with: uv add --dev tomli-w", file=sys.stderr)
+    sys.exit(1)
 def get_root_dir() -> Path:
     """Get the root directory (where pyproject.toml lives)."""
@@ -216,12 +227,13 @@ def confirm_release(new_version: str) -> bool:
 def get_current_version() -> str:
     """Get the current version from pyproject.toml."""
     pyproject_path = get_root_dir() / "pyproject.toml"
-    content = pyproject_path.read_text()
-    match = re.search(r'^version\s*=\s*"([^"]+)"', content, re.MULTILINE)
-    if not match:
+    with open(pyproject_path, "rb") as f:
+        data = tomllib.load(f)
+    version = data.get("project", {}).get("version")
+    if not version:
         print("Error: Could not find version in pyproject.toml", file=sys.stderr)
         sys.exit(1)
-    return match.group(1)
+    return str(version)
 def bump_version(version: str, bump: str) -> str:
@@ -244,14 +256,13 @@ def bump_version(version: str, bump: str) -> str:
 def update_pyproject_version(new_version: str) -> None:
     """Update the version in pyproject.toml."""
     pyproject_path = get_root_dir() / "pyproject.toml"
-    content = pyproject_path.read_text()
-    new_content = re.sub(
-        r'^(version\s*=\s*)"[^"]+"',
-        f'\\1"{new_version}"',
-        content,
-        flags=re.MULTILINE,
-    )
-    pyproject_path.write_text(new_content)
+    with open(pyproject_path, "rb") as f:
+        data = tomllib.load(f)
+    data["project"]["version"] = new_version
+    with open(pyproject_path, "wb") as f:
+        tomli_w.dump(data, f)
 def update_changelog(new_version: str) -> None:

sunstone/dataframe.py CHANGED Viewed

@@ -10,7 +10,7 @@ import pandas as pd
 from .datasets import DatasetsManager
 from .exceptions import DatasetNotFoundError, StrictModeError
-from .lineage import FieldSchema, LineageMetadata
+from .lineage import FieldSchema, LineageMetadata, compute_dataframe_hash
 pd.options.mode.copy_on_write = True
@@ -196,7 +196,6 @@ class DataFrame:
         # Create lineage metadata
         lineage = LineageMetadata(project_path=str(manager.project_path))
         lineage.add_source(dataset)
-        lineage.add_operation(f"read_dataset({dataset.slug}, format={format})")
         # Return wrapped DataFrame
         return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
@@ -294,7 +293,6 @@ class DataFrame:
         # Create lineage metadata
         lineage = LineageMetadata(project_path=str(manager.project_path))
         lineage.add_source(dataset)
-        lineage.add_operation(f"read_csv({dataset.slug})")
         # Return wrapped DataFrame
         return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
@@ -363,11 +361,13 @@ class DataFrame:
         absolute_path.parent.mkdir(parents=True, exist_ok=True)
         self.data.to_csv(absolute_path, **kwargs)
-        # Record the operation
-        self.lineage.add_operation(f"to_csv({dataset.slug})")
+        # Compute content hash for change detection
+        content_hash = compute_dataframe_hash(self.data)
         # Persist lineage metadata to datasets.yaml
-        manager.update_output_lineage(slug=dataset.slug, lineage=self.lineage, strict=self.strict_mode)
+        manager.update_output_lineage(
+            slug=dataset.slug, lineage=self.lineage, content_hash=content_hash, strict=self.strict_mode
+        )
     def _infer_field_schema(self) -> List[FieldSchema]:
         """
@@ -410,11 +410,8 @@ class DataFrame:
         # Perform the merge
         merged_data = pd.merge(self.data, right.data, **kwargs)
-        # Combine lineage
+        # Combine lineage (sources from both DataFrames)
         merged_lineage = self.lineage.merge(right.lineage)
-        merged_lineage.add_operation(
-            f"merge(left={len(self.lineage.sources)} sources, right={len(right.lineage.sources)} sources)"
-        )
         return DataFrame(
             data=merged_data,
@@ -437,11 +434,8 @@ class DataFrame:
         # Perform the join
         joined_data = self.data.join(other.data, **kwargs)
-        # Combine lineage
+        # Combine lineage (sources from both DataFrames)
         joined_lineage = self.lineage.merge(other.lineage)
-        joined_lineage.add_operation(
-            f"join(left={len(self.lineage.sources)} sources, right={len(other.lineage.sources)} sources)"
-        )
         return DataFrame(
             data=joined_data,
@@ -467,16 +461,11 @@ class DataFrame:
         # Concatenate
         concatenated_data = pd.concat(all_dfs, **kwargs)
-        # Combine lineage from all DataFrames
+        # Combine lineage (sources from all DataFrames)
         combined_lineage = self.lineage
         for other in others:
             combined_lineage = combined_lineage.merge(other.lineage)
-        combined_lineage.add_operation(
-            f"concat({len(others) + 1} dataframes, "
-            f"{sum(len(df.lineage.sources) for df in [self] + others)} total sources)"
-        )
         return DataFrame(
             data=concatenated_data,
             lineage=combined_lineage,
@@ -484,42 +473,12 @@ class DataFrame:
             project_path=self.lineage.project_path,
         )
-    def apply_operation(self, operation: Callable[[pd.DataFrame], pd.DataFrame], description: str) -> "DataFrame":
-        """
-        Apply a transformation operation to the DataFrame.
-        Args:
-            operation: Function that takes a pandas DataFrame and returns a DataFrame.
-            description: Human-readable description of the operation.
-        Returns:
-            A new DataFrame with the operation applied and recorded in lineage.
-        """
-        # Apply the operation
-        new_data = operation(self.data)
-        # Copy lineage and add operation
-        new_lineage = LineageMetadata(
-            sources=self.lineage.sources.copy(),
-            operations=self.lineage.operations.copy(),
-            project_path=self.lineage.project_path,
-        )
-        new_lineage.add_operation(description)
-        return DataFrame(
-            data=new_data,
-            lineage=new_lineage,
-            strict=self.strict_mode,
-            project_path=self.lineage.project_path,
-        )
-    def _wrap_result(self, result: Any, operation: Optional[str] = None) -> Any:
+    def _wrap_result(self, result: Any) -> Any:
         """
         Wrap a pandas result in a Sunstone DataFrame if applicable.
         Args:
             result: The result from a pandas operation.
-            operation: Name of the operation performed. If None, no operation is recorded.
         Returns:
             Wrapped DataFrame if result is a DataFrame, otherwise the result.
@@ -527,11 +486,8 @@ class DataFrame:
         if isinstance(result, pd.DataFrame):
             new_lineage = LineageMetadata(
                 sources=self.lineage.sources.copy(),
-                operations=self.lineage.operations.copy(),
                 project_path=self.lineage.project_path,
             )
-            if operation is not None:
-                new_lineage.add_operation(operation)
             return DataFrame(
                 data=result,
@@ -541,28 +497,6 @@ class DataFrame:
             )
         return result
-    # Methods that don't represent meaningful data transformations
-    # These return DataFrames but shouldn't be tracked in lineage
-    _NON_TRACKING_METHODS = frozenset(
-        {
-            # Copy operations - same data, no transformation
-            "copy",
-            # Index operations - same data, different index
-            "reset_index",
-            "set_index",
-            "reindex",
-            # Type conversions without data change
-            "astype",
-            "infer_objects",
-            # Column/index renaming - same data, different labels
-            "rename",
-            "rename_axis",
-            # Reshaping without data loss
-            "T",
-            "transpose",
-        }
-    )
     def __getattr__(self, name: str) -> Any:
         """
         Delegate attribute access to the underlying pandas DataFrame.
@@ -583,14 +517,11 @@ class DataFrame:
             def wrapper(*args: Any, **kwargs: Any) -> Any:
                 result = attr(*args, **kwargs)
-                # Don't track non-transforming methods
-                if name in DataFrame._NON_TRACKING_METHODS:
-                    return self._wrap_result(result, operation=None)
-                return self._wrap_result(result, operation=f"{name}")
+                return self._wrap_result(result)
             return wrapper
-        return self._wrap_result(attr, operation=None)  # Don't track attribute access
+        return self._wrap_result(attr)
     def __getitem__(self, key: Any) -> Any:
         """
@@ -603,9 +534,7 @@ class DataFrame:
             The item from the underlying DataFrame, wrapped if it's a DataFrame.
         """
         result = self.data[key]
-        # Don't track __getitem__ as an operation - it's just column/row access
-        # not a meaningful transformation
-        return self._wrap_result(result, operation=None)
+        return self._wrap_result(result)
     def __setitem__(self, key: Any, value: Any) -> None:
         """
@@ -616,14 +545,12 @@ class DataFrame:
             value: Value to assign.
         """
         self.data[key] = value
-        # Track column assignment in lineage
-        self.lineage.add_operation(f"__setitem__({key!r})")
+        # Don't track column assignments automatically
+        # Users should use add_operation() for meaningful transformations
     def __repr__(self) -> str:
         """String representation of the DataFrame."""
-        lineage_info = (
-            f"\n\nLineage: {len(self.lineage.sources)} source(s), {len(self.lineage.operations)} operation(s)"
-        )
+        lineage_info = f"\n\nLineage: {len(self.lineage.sources)} source(s)"
         return repr(self.data) + lineage_info
     def __str__(self) -> str:

sunstone/datasets.py CHANGED Viewed

@@ -380,22 +380,30 @@ class DatasetsManager:
         raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
-    def update_output_lineage(self, slug: str, lineage: LineageMetadata, strict: bool = False) -> None:
+    def update_output_lineage(
+        self, slug: str, lineage: LineageMetadata, content_hash: str, strict: bool = False
+    ) -> None:
         """
         Update lineage metadata for an output dataset.
+        The timestamp is only updated when the content hash changes, preventing
+        unnecessary updates when the data hasn't changed.
         In strict mode, validates that the lineage matches what would be written
         without modifying the file. In relaxed mode, updates the file with lineage.
         Args:
             slug: The slug of the output dataset to update.
             lineage: The lineage metadata to persist.
+            content_hash: SHA256 hash of the DataFrame content.
             strict: If True, validate without modifying. If False, update the file.
         Raises:
             DatasetNotFoundError: If the dataset doesn't exist.
             DatasetValidationError: In strict mode, if lineage differs from what's in the file.
         """
+        from datetime import datetime
         # Find the output dataset
         dataset_idx = None
         for i, dataset_data in enumerate(self._data["outputs"]):
@@ -406,6 +414,21 @@ class DatasetsManager:
         if dataset_idx is None:
             raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
+        # Get existing lineage data if present
+        existing_lineage = self._data["outputs"][dataset_idx].get("lineage", {})
+        existing_hash = existing_lineage.get("content_hash")
+        existing_timestamp = existing_lineage.get("created_at")
+        # Determine if content has changed
+        content_changed = existing_hash != content_hash
+        # Only update timestamp if content changed
+        if content_changed:
+            timestamp = datetime.now().isoformat()
+        else:
+            # Preserve existing timestamp
+            timestamp = existing_timestamp
         # Build lineage metadata to add
         lineage_data: dict[str, Any] = {}
@@ -414,15 +437,14 @@ class DatasetsManager:
                 {
                     "slug": src.slug,
                     "name": src.name,
+                    "location": src.location,
                 }
                 for src in lineage.sources
             ]
-        if lineage.operations:
-            lineage_data["operations"] = lineage.operations.copy()
-        if lineage.created_at:
-            lineage_data["created_at"] = lineage.created_at.isoformat()
+        lineage_data["content_hash"] = content_hash
+        if timestamp:
+            lineage_data["created_at"] = timestamp
         # Create a copy of the data with updated lineage
         updated_data = self._data.copy()

sunstone/lineage.py CHANGED Viewed

@@ -2,9 +2,13 @@
 Lineage metadata structures for tracking data provenance.
 """
+import hashlib
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+if TYPE_CHECKING:
+    import pandas as pd
 @dataclass
@@ -88,23 +92,41 @@ class DatasetMetadata:
     """Type of dataset: 'input' or 'output'."""
+def compute_dataframe_hash(df: "pd.DataFrame") -> str:
+    """
+    Compute a fast SHA256 hash of a pandas DataFrame's content.
+    Uses pickle serialization for a consistent, fast representation of the data.
+    Args:
+        df: The pandas DataFrame to hash.
+    Returns:
+        A SHA256 hex digest string representing the DataFrame content.
+    """
+    import pickle
+    # Use pickle protocol 5 for efficiency; hash the bytes directly
+    data_bytes = pickle.dumps(df, protocol=5)
+    return hashlib.sha256(data_bytes).hexdigest()
 @dataclass
 class LineageMetadata:
     """
     Lineage metadata tracking the provenance of data in a DataFrame.
-    This tracks all source datasets that contributed to the current DataFrame,
-    including information about transformations and operations performed.
+    This tracks all source datasets that contributed to the current DataFrame.
     """
     sources: List[DatasetMetadata] = field(default_factory=list)
     """List of source datasets that contributed to this data."""
-    operations: List[str] = field(default_factory=list)
-    """List of operations performed on the data."""
+    created_at: Optional[datetime] = None
+    """Timestamp when this lineage was last updated (content changed)."""
-    created_at: datetime = field(default_factory=datetime.now)
-    """Timestamp when this lineage was created."""
+    content_hash: Optional[str] = None
+    """SHA256 hash of the DataFrame content, used to detect changes."""
     project_path: Optional[str] = None
     """Path to the project directory containing datasets.yaml."""
@@ -119,15 +141,6 @@ class LineageMetadata:
         if dataset not in self.sources:
             self.sources.append(dataset)
-    def add_operation(self, operation: str) -> None:
-        """
-        Record an operation performed on the data.
-        Args:
-            operation: Description of the operation.
-        """
-        self.operations.append(operation)
     def merge(self, other: "LineageMetadata") -> "LineageMetadata":
         """
         Merge lineage from another DataFrame.
@@ -136,12 +149,10 @@ class LineageMetadata:
             other: The other lineage metadata to merge.
         Returns:
-            A new LineageMetadata with combined sources and operations.
+            A new LineageMetadata with combined sources.
         """
         merged = LineageMetadata(
             sources=self.sources.copy(),
-            operations=self.operations.copy(),
-            created_at=datetime.now(),
             project_path=self.project_path or other.project_path,
         )
@@ -150,9 +161,6 @@ class LineageMetadata:
             if source not in merged.sources:
                 merged.sources.append(source)
-        # Combine operations
-        merged.operations.extend(other.operations)
         return merged
     def get_licenses(self) -> List[str]:
@@ -175,16 +183,18 @@ class LineageMetadata:
         Returns:
             Dictionary containing lineage information.
         """
-        return {
+        result: Dict[str, Any] = {
             "sources": [
                 {
-                    "name": src.name,
                     "slug": src.slug,
+                    "name": src.name,
                     "location": src.location,
                 }
                 for src in self.sources
             ],
-            "operations": self.operations,
-            "created_at": self.created_at.isoformat(),
-            "licenses": self.get_licenses(),
         }
+        if self.created_at is not None:
+            result["created_at"] = self.created_at.isoformat()
+        if self.content_hash is not None:
+            result["content_hash"] = self.content_hash
+        return result

{sunstone_py-0.5.1.dist-info → sunstone_py-0.5.3.dist-info}/METADATA RENAMED Viewed

@@ -1,22 +1,20 @@
 Metadata-Version: 2.4
 Name: sunstone-py
-Version: 0.5.1
+Version: 0.5.3
 Summary: Python library for managing datasets with lineage tracking in Sunstone projects
 Author-email: Sunstone Institute <stig@sunstone.institute>
 License: MIT
 Project-URL: Homepage, https://github.com/sunstoneinstitute/sunstone-py
 Project-URL: Documentation, https://sunstoneinstitute.github.io/sunstone-py/
 Project-URL: Repository, https://github.com/sunstoneinstitute/sunstone-py
-Classifier: Development Status :: 3 - Alpha
+Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: 3.14
-Requires-Python: >=3.10
+Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: frictionless>=5.18.1

sunstone_py-0.5.3.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+sunstone/__init__.py,sha256=LC0ZtmxP26eXPLKejbg7UStcHOnE_lwttNTL4m3F4yM,2032
+sunstone/_release.py,sha256=MQNaUD7mSK6h8vu6EIgJuaMlAxuFxv82NQwHgBpLZm4,14907
+sunstone/dataframe.py,sha256=rFGuMq-63Haua_QQfR3E708KYc1g43yEyCej11_Gl3A,20679
+sunstone/datasets.py,sha256=V2psK5G2IwpxNFL_DdoVistIT8O53ASbJ0Y3nPDtEx4,21970
+sunstone/exceptions.py,sha256=fiixXazur3LtQGy21bGEaSr356DObFcYxQJ3FvOxNec,623
+sunstone/lineage.py,sha256=SRCpdsYDeAPTO2H-3ul8BP8AUihmhezcV8Ggwa0eTfs,5460
+sunstone/pandas.py,sha256=CLEqIIgTbMmpH73TPy_vDUPxQa37Hpmqn4r6No8PJwo,8188
+sunstone/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sunstone/validation.py,sha256=1356vcUc72a1zGBUe9Xjrcb5h41Xo53PaK2nnQ_FuSM,8286
+sunstone_py-0.5.3.dist-info/licenses/LICENSE,sha256=pB6VuR4QRjwjMjy8RSNGho-N1SUdu07ntIhT5lrhkzU,1078
+sunstone_py-0.5.3.dist-info/METADATA,sha256=qwq_KyzHzGljeHFUUJwVEGJL1l-JrAxiB8RVS-8bqt4,9460
+sunstone_py-0.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sunstone_py-0.5.3.dist-info/entry_points.txt,sha256=0h6E88rH9a_503BAzXvFPR-UfmkrRFjcOf29DXgJNjk,51
+sunstone_py-0.5.3.dist-info/top_level.txt,sha256=A2fW-7JO10rlx_L28Bc4FVvWt2R8kgvS8_TGPBhQp3c,9
+sunstone_py-0.5.3.dist-info/RECORD,,

sunstone_py-0.5.1.dist-info/RECORD DELETED Viewed

@@ -1,15 +0,0 @@
-sunstone/__init__.py,sha256=LC0ZtmxP26eXPLKejbg7UStcHOnE_lwttNTL4m3F4yM,2032
-sunstone/_release.py,sha256=_yjAl_vZQ_5IYr0ugPlqtmUvsGnyGDx7LyiI_2HToVM,14649
-sunstone/dataframe.py,sha256=UJgQx7auiNb6hSIvhB8EQs2afu-7S22xdWL5DZUr29g,23602
-sunstone/datasets.py,sha256=LdHk3Vkfc7QH2VxhSskRCm9wUFSkldCmgS_1c2KDAPA,21142
-sunstone/exceptions.py,sha256=fiixXazur3LtQGy21bGEaSr356DObFcYxQJ3FvOxNec,623
-sunstone/lineage.py,sha256=B9GKMu5-v8Izos5G40K_EvsCPJL3Z2Tg1T_Fc7ezSMI,5240
-sunstone/pandas.py,sha256=CLEqIIgTbMmpH73TPy_vDUPxQa37Hpmqn4r6No8PJwo,8188
-sunstone/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sunstone/validation.py,sha256=1356vcUc72a1zGBUe9Xjrcb5h41Xo53PaK2nnQ_FuSM,8286
-sunstone_py-0.5.1.dist-info/licenses/LICENSE,sha256=pB6VuR4QRjwjMjy8RSNGho-N1SUdu07ntIhT5lrhkzU,1078
-sunstone_py-0.5.1.dist-info/METADATA,sha256=DMLR03NMB5_t14rsBo4GtqY0oQFHnKQtbdUGEfxFcq8,9563
-sunstone_py-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sunstone_py-0.5.1.dist-info/entry_points.txt,sha256=0h6E88rH9a_503BAzXvFPR-UfmkrRFjcOf29DXgJNjk,51
-sunstone_py-0.5.1.dist-info/top_level.txt,sha256=A2fW-7JO10rlx_L28Bc4FVvWt2R8kgvS8_TGPBhQp3c,9
-sunstone_py-0.5.1.dist-info/RECORD,,

{sunstone_py-0.5.1.dist-info → sunstone_py-0.5.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{sunstone_py-0.5.1.dist-info → sunstone_py-0.5.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{sunstone_py-0.5.1.dist-info → sunstone_py-0.5.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{sunstone_py-0.5.1.dist-info → sunstone_py-0.5.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

sunstone-py 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

sunstone-py 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl