PyPI - sunstone-py - Versions diffs - 0.5.1__tar.gz → 0.5.3__tar.gz - Mend

sunstone-py 0.5.1.tar.gz → 0.5.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

{sunstone_py-0.5.1 → sunstone_py-0.5.3}/PKG-INFO RENAMED Viewed

@@ -1,22 +1,20 @@
 Metadata-Version: 2.4
 Name: sunstone-py
-Version: 0.5.1
+Version: 0.5.3
 Summary: Python library for managing datasets with lineage tracking in Sunstone projects
 Author-email: Sunstone Institute <stig@sunstone.institute>
 License: MIT
 Project-URL: Homepage, https://github.com/sunstoneinstitute/sunstone-py
 Project-URL: Documentation, https://sunstoneinstitute.github.io/sunstone-py/
 Project-URL: Repository, https://github.com/sunstoneinstitute/sunstone-py
-Classifier: Development Status :: 3 - Alpha
+Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: 3.14
-Requires-Python: >=3.10
+Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: frictionless>=5.18.1

{sunstone_py-0.5.1 → sunstone_py-0.5.3}/pyproject.toml RENAMED Viewed

@@ -1,29 +1,28 @@
 [build-system]
-requires = ["setuptools>=61.0", "wheel"]
+requires = [
+    "setuptools>=61.0",
+    "wheel",
+]
 build-backend = "setuptools.build_meta"
 [project]
 name = "sunstone-py"
-version = "0.5.1"
+version = "0.5.3"
 description = "Python library for managing datasets with lineage tracking in Sunstone projects"
 readme = "README.md"
-requires-python = ">=3.10"
-license = {text = "MIT"}
+requires-python = ">=3.12"
 authors = [
-    {name = "Sunstone Institute", email = "stig@sunstone.institute"}
+    { name = "Sunstone Institute", email = "stig@sunstone.institute" },
 ]
 classifiers = [
-    "Development Status :: 3 - Alpha",
+    "Development Status :: 4 - Beta",
     "Intended Audience :: Science/Research",
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.10",
-    "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
     "Programming Language :: Python :: 3.14",
 ]
 dependencies = [
     "frictionless>=5.18.1",
     "google-auth>=2.43.0",
@@ -33,6 +32,9 @@ dependencies = [
     "ruamel-yaml>=0.18",
 ]
+[project.license]
+text = "MIT"
 [project.urls]
 Homepage = "https://github.com/sunstoneinstitute/sunstone-py"
 Documentation = "https://sunstoneinstitute.github.io/sunstone-py/"
@@ -42,21 +44,25 @@ Repository = "https://github.com/sunstoneinstitute/sunstone-py"
 release = "sunstone._release:main"
 [tool.setuptools.packages.find]
-where = ["src"]
+where = [
+    "src",
+]
 [tool.setuptools.package-data]
-sunstone = ["py.typed"]
+sunstone = [
+    "py.typed",
+]
 [tool.ruff]
 line-length = 120
-target-version = "py310"
+target-version = "py312"
 [tool.ruff.format]
 quote-style = "double"
 indent-style = "space"
 [tool.mypy]
-python_version = "3.10"
+python_version = "3.12"
 warn_return_any = true
 warn_unused_configs = true
 disallow_untyped_defs = true
@@ -75,6 +81,7 @@ dev = [
     "pandas-stubs>=2.3.2.250926",
     "types-pyyaml>=6.0.12.20250915",
     "markdown>=3.10",
+    "tomli-w>=1.2.0",
 ]
 docs = [
     "mkdocs-material>=9.5.0",

{sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/_release.py RENAMED Viewed

@@ -13,6 +13,17 @@ import sys
 from datetime import date
 from pathlib import Path
+try:
+    import tomllib
+except ModuleNotFoundError:
+    import tomli as tomllib  # type: ignore[import-not-found,no-redef]
+try:
+    import tomli_w
+except ModuleNotFoundError:
+    print("Error: tomli_w not found. Install with: uv add --dev tomli-w", file=sys.stderr)
+    sys.exit(1)
 def get_root_dir() -> Path:
     """Get the root directory (where pyproject.toml lives)."""
@@ -216,12 +227,13 @@ def confirm_release(new_version: str) -> bool:
 def get_current_version() -> str:
     """Get the current version from pyproject.toml."""
     pyproject_path = get_root_dir() / "pyproject.toml"
-    content = pyproject_path.read_text()
-    match = re.search(r'^version\s*=\s*"([^"]+)"', content, re.MULTILINE)
-    if not match:
+    with open(pyproject_path, "rb") as f:
+        data = tomllib.load(f)
+    version = data.get("project", {}).get("version")
+    if not version:
         print("Error: Could not find version in pyproject.toml", file=sys.stderr)
         sys.exit(1)
-    return match.group(1)
+    return str(version)
 def bump_version(version: str, bump: str) -> str:
@@ -244,14 +256,13 @@ def bump_version(version: str, bump: str) -> str:
 def update_pyproject_version(new_version: str) -> None:
     """Update the version in pyproject.toml."""
     pyproject_path = get_root_dir() / "pyproject.toml"
-    content = pyproject_path.read_text()
-    new_content = re.sub(
-        r'^(version\s*=\s*)"[^"]+"',
-        f'\\1"{new_version}"',
-        content,
-        flags=re.MULTILINE,
-    )
-    pyproject_path.write_text(new_content)
+    with open(pyproject_path, "rb") as f:
+        data = tomllib.load(f)
+    data["project"]["version"] = new_version
+    with open(pyproject_path, "wb") as f:
+        tomli_w.dump(data, f)
 def update_changelog(new_version: str) -> None:

{sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/dataframe.py RENAMED Viewed

@@ -10,7 +10,7 @@ import pandas as pd
 from .datasets import DatasetsManager
 from .exceptions import DatasetNotFoundError, StrictModeError
-from .lineage import FieldSchema, LineageMetadata
+from .lineage import FieldSchema, LineageMetadata, compute_dataframe_hash
 pd.options.mode.copy_on_write = True
@@ -196,7 +196,6 @@ class DataFrame:
         # Create lineage metadata
         lineage = LineageMetadata(project_path=str(manager.project_path))
         lineage.add_source(dataset)
-        lineage.add_operation(f"read_dataset({dataset.slug}, format={format})")
         # Return wrapped DataFrame
         return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
@@ -294,7 +293,6 @@ class DataFrame:
         # Create lineage metadata
         lineage = LineageMetadata(project_path=str(manager.project_path))
         lineage.add_source(dataset)
-        lineage.add_operation(f"read_csv({dataset.slug})")
         # Return wrapped DataFrame
         return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
@@ -363,11 +361,13 @@ class DataFrame:
         absolute_path.parent.mkdir(parents=True, exist_ok=True)
         self.data.to_csv(absolute_path, **kwargs)
-        # Record the operation
-        self.lineage.add_operation(f"to_csv({dataset.slug})")
+        # Compute content hash for change detection
+        content_hash = compute_dataframe_hash(self.data)
         # Persist lineage metadata to datasets.yaml
-        manager.update_output_lineage(slug=dataset.slug, lineage=self.lineage, strict=self.strict_mode)
+        manager.update_output_lineage(
+            slug=dataset.slug, lineage=self.lineage, content_hash=content_hash, strict=self.strict_mode
+        )
     def _infer_field_schema(self) -> List[FieldSchema]:
         """
@@ -410,11 +410,8 @@ class DataFrame:
         # Perform the merge
         merged_data = pd.merge(self.data, right.data, **kwargs)
-        # Combine lineage
+        # Combine lineage (sources from both DataFrames)
         merged_lineage = self.lineage.merge(right.lineage)
-        merged_lineage.add_operation(
-            f"merge(left={len(self.lineage.sources)} sources, right={len(right.lineage.sources)} sources)"
-        )
         return DataFrame(
             data=merged_data,
@@ -437,11 +434,8 @@ class DataFrame:
         # Perform the join
         joined_data = self.data.join(other.data, **kwargs)
-        # Combine lineage
+        # Combine lineage (sources from both DataFrames)
         joined_lineage = self.lineage.merge(other.lineage)
-        joined_lineage.add_operation(
-            f"join(left={len(self.lineage.sources)} sources, right={len(other.lineage.sources)} sources)"
-        )
         return DataFrame(
             data=joined_data,
@@ -467,16 +461,11 @@ class DataFrame:
         # Concatenate
         concatenated_data = pd.concat(all_dfs, **kwargs)
-        # Combine lineage from all DataFrames
+        # Combine lineage (sources from all DataFrames)
         combined_lineage = self.lineage
         for other in others:
             combined_lineage = combined_lineage.merge(other.lineage)
-        combined_lineage.add_operation(
-            f"concat({len(others) + 1} dataframes, "
-            f"{sum(len(df.lineage.sources) for df in [self] + others)} total sources)"
-        )
         return DataFrame(
             data=concatenated_data,
             lineage=combined_lineage,
@@ -484,42 +473,12 @@ class DataFrame:
             project_path=self.lineage.project_path,
         )
-    def apply_operation(self, operation: Callable[[pd.DataFrame], pd.DataFrame], description: str) -> "DataFrame":
-        """
-        Apply a transformation operation to the DataFrame.
-        Args:
-            operation: Function that takes a pandas DataFrame and returns a DataFrame.
-            description: Human-readable description of the operation.
-        Returns:
-            A new DataFrame with the operation applied and recorded in lineage.
-        """
-        # Apply the operation
-        new_data = operation(self.data)
-        # Copy lineage and add operation
-        new_lineage = LineageMetadata(
-            sources=self.lineage.sources.copy(),
-            operations=self.lineage.operations.copy(),
-            project_path=self.lineage.project_path,
-        )
-        new_lineage.add_operation(description)
-        return DataFrame(
-            data=new_data,
-            lineage=new_lineage,
-            strict=self.strict_mode,
-            project_path=self.lineage.project_path,
-        )
-    def _wrap_result(self, result: Any, operation: Optional[str] = None) -> Any:
+    def _wrap_result(self, result: Any) -> Any:
         """
         Wrap a pandas result in a Sunstone DataFrame if applicable.
         Args:
             result: The result from a pandas operation.
-            operation: Name of the operation performed. If None, no operation is recorded.
         Returns:
             Wrapped DataFrame if result is a DataFrame, otherwise the result.
@@ -527,11 +486,8 @@ class DataFrame:
         if isinstance(result, pd.DataFrame):
             new_lineage = LineageMetadata(
                 sources=self.lineage.sources.copy(),
-                operations=self.lineage.operations.copy(),
                 project_path=self.lineage.project_path,
             )
-            if operation is not None:
-                new_lineage.add_operation(operation)
             return DataFrame(
                 data=result,
@@ -541,28 +497,6 @@ class DataFrame:
             )
         return result
-    # Methods that don't represent meaningful data transformations
-    # These return DataFrames but shouldn't be tracked in lineage
-    _NON_TRACKING_METHODS = frozenset(
-        {
-            # Copy operations - same data, no transformation
-            "copy",
-            # Index operations - same data, different index
-            "reset_index",
-            "set_index",
-            "reindex",
-            # Type conversions without data change
-            "astype",
-            "infer_objects",
-            # Column/index renaming - same data, different labels
-            "rename",
-            "rename_axis",
-            # Reshaping without data loss
-            "T",
-            "transpose",
-        }
-    )
     def __getattr__(self, name: str) -> Any:
         """
         Delegate attribute access to the underlying pandas DataFrame.
@@ -583,14 +517,11 @@ class DataFrame:
             def wrapper(*args: Any, **kwargs: Any) -> Any:
                 result = attr(*args, **kwargs)
-                # Don't track non-transforming methods
-                if name in DataFrame._NON_TRACKING_METHODS:
-                    return self._wrap_result(result, operation=None)
-                return self._wrap_result(result, operation=f"{name}")
+                return self._wrap_result(result)
             return wrapper
-        return self._wrap_result(attr, operation=None)  # Don't track attribute access
+        return self._wrap_result(attr)
     def __getitem__(self, key: Any) -> Any:
         """
@@ -603,9 +534,7 @@ class DataFrame:
             The item from the underlying DataFrame, wrapped if it's a DataFrame.
         """
         result = self.data[key]
-        # Don't track __getitem__ as an operation - it's just column/row access
-        # not a meaningful transformation
-        return self._wrap_result(result, operation=None)
+        return self._wrap_result(result)
     def __setitem__(self, key: Any, value: Any) -> None:
         """
@@ -616,14 +545,12 @@ class DataFrame:
             value: Value to assign.
         """
         self.data[key] = value
-        # Track column assignment in lineage
-        self.lineage.add_operation(f"__setitem__({key!r})")
+        # Don't track column assignments automatically
+        # Users should use add_operation() for meaningful transformations
     def __repr__(self) -> str:
         """String representation of the DataFrame."""
-        lineage_info = (
-            f"\n\nLineage: {len(self.lineage.sources)} source(s), {len(self.lineage.operations)} operation(s)"
-        )
+        lineage_info = f"\n\nLineage: {len(self.lineage.sources)} source(s)"
         return repr(self.data) + lineage_info
     def __str__(self) -> str:

{sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/datasets.py RENAMED Viewed

@@ -380,22 +380,30 @@ class DatasetsManager:
         raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
-    def update_output_lineage(self, slug: str, lineage: LineageMetadata, strict: bool = False) -> None:
+    def update_output_lineage(
+        self, slug: str, lineage: LineageMetadata, content_hash: str, strict: bool = False
+    ) -> None:
         """
         Update lineage metadata for an output dataset.
+        The timestamp is only updated when the content hash changes, preventing
+        unnecessary updates when the data hasn't changed.
         In strict mode, validates that the lineage matches what would be written
         without modifying the file. In relaxed mode, updates the file with lineage.
         Args:
             slug: The slug of the output dataset to update.
             lineage: The lineage metadata to persist.
+            content_hash: SHA256 hash of the DataFrame content.
             strict: If True, validate without modifying. If False, update the file.
         Raises:
             DatasetNotFoundError: If the dataset doesn't exist.
             DatasetValidationError: In strict mode, if lineage differs from what's in the file.
         """
+        from datetime import datetime
         # Find the output dataset
         dataset_idx = None
         for i, dataset_data in enumerate(self._data["outputs"]):
@@ -406,6 +414,21 @@ class DatasetsManager:
         if dataset_idx is None:
             raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
+        # Get existing lineage data if present
+        existing_lineage = self._data["outputs"][dataset_idx].get("lineage", {})
+        existing_hash = existing_lineage.get("content_hash")
+        existing_timestamp = existing_lineage.get("created_at")
+        # Determine if content has changed
+        content_changed = existing_hash != content_hash
+        # Only update timestamp if content changed
+        if content_changed:
+            timestamp = datetime.now().isoformat()
+        else:
+            # Preserve existing timestamp
+            timestamp = existing_timestamp
         # Build lineage metadata to add
         lineage_data: dict[str, Any] = {}
@@ -414,15 +437,14 @@ class DatasetsManager:
                 {
                     "slug": src.slug,
                     "name": src.name,
+                    "location": src.location,
                 }
                 for src in lineage.sources
             ]
-        if lineage.operations:
-            lineage_data["operations"] = lineage.operations.copy()
-        if lineage.created_at:
-            lineage_data["created_at"] = lineage.created_at.isoformat()
+        lineage_data["content_hash"] = content_hash
+        if timestamp:
+            lineage_data["created_at"] = timestamp
         # Create a copy of the data with updated lineage
         updated_data = self._data.copy()

{sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/lineage.py RENAMED Viewed

@@ -2,9 +2,13 @@
 Lineage metadata structures for tracking data provenance.
 """
+import hashlib
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+if TYPE_CHECKING:
+    import pandas as pd
 @dataclass
@@ -88,23 +92,41 @@ class DatasetMetadata:
     """Type of dataset: 'input' or 'output'."""
+def compute_dataframe_hash(df: "pd.DataFrame") -> str:
+    """
+    Compute a fast SHA256 hash of a pandas DataFrame's content.
+    Uses pickle serialization for a consistent, fast representation of the data.
+    Args:
+        df: The pandas DataFrame to hash.
+    Returns:
+        A SHA256 hex digest string representing the DataFrame content.
+    """
+    import pickle
+    # Use pickle protocol 5 for efficiency; hash the bytes directly
+    data_bytes = pickle.dumps(df, protocol=5)
+    return hashlib.sha256(data_bytes).hexdigest()
 @dataclass
 class LineageMetadata:
     """
     Lineage metadata tracking the provenance of data in a DataFrame.
-    This tracks all source datasets that contributed to the current DataFrame,
-    including information about transformations and operations performed.
+    This tracks all source datasets that contributed to the current DataFrame.
     """
     sources: List[DatasetMetadata] = field(default_factory=list)
     """List of source datasets that contributed to this data."""
-    operations: List[str] = field(default_factory=list)
-    """List of operations performed on the data."""
+    created_at: Optional[datetime] = None
+    """Timestamp when this lineage was last updated (content changed)."""
-    created_at: datetime = field(default_factory=datetime.now)
-    """Timestamp when this lineage was created."""
+    content_hash: Optional[str] = None
+    """SHA256 hash of the DataFrame content, used to detect changes."""
     project_path: Optional[str] = None
     """Path to the project directory containing datasets.yaml."""
@@ -119,15 +141,6 @@ class LineageMetadata:
         if dataset not in self.sources:
             self.sources.append(dataset)
-    def add_operation(self, operation: str) -> None:
-        """
-        Record an operation performed on the data.
-        Args:
-            operation: Description of the operation.
-        """
-        self.operations.append(operation)
     def merge(self, other: "LineageMetadata") -> "LineageMetadata":
         """
         Merge lineage from another DataFrame.
@@ -136,12 +149,10 @@ class LineageMetadata:
             other: The other lineage metadata to merge.
         Returns:
-            A new LineageMetadata with combined sources and operations.
+            A new LineageMetadata with combined sources.
         """
         merged = LineageMetadata(
             sources=self.sources.copy(),
-            operations=self.operations.copy(),
-            created_at=datetime.now(),
             project_path=self.project_path or other.project_path,
         )
@@ -150,9 +161,6 @@ class LineageMetadata:
             if source not in merged.sources:
                 merged.sources.append(source)
-        # Combine operations
-        merged.operations.extend(other.operations)
         return merged
     def get_licenses(self) -> List[str]:
@@ -175,16 +183,18 @@ class LineageMetadata:
         Returns:
             Dictionary containing lineage information.
         """
-        return {
+        result: Dict[str, Any] = {
             "sources": [
                 {
-                    "name": src.name,
                     "slug": src.slug,
+                    "name": src.name,
                     "location": src.location,
                 }
                 for src in self.sources
             ],
-            "operations": self.operations,
-            "created_at": self.created_at.isoformat(),
-            "licenses": self.get_licenses(),
         }
+        if self.created_at is not None:
+            result["created_at"] = self.created_at.isoformat()
+        if self.content_hash is not None:
+            result["content_hash"] = self.content_hash
+        return result

{sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/PKG-INFO RENAMED Viewed

@@ -1,22 +1,20 @@
 Metadata-Version: 2.4
 Name: sunstone-py
-Version: 0.5.1
+Version: 0.5.3
 Summary: Python library for managing datasets with lineage tracking in Sunstone projects
 Author-email: Sunstone Institute <stig@sunstone.institute>
 License: MIT
 Project-URL: Homepage, https://github.com/sunstoneinstitute/sunstone-py
 Project-URL: Documentation, https://sunstoneinstitute.github.io/sunstone-py/
 Project-URL: Repository, https://github.com/sunstoneinstitute/sunstone-py
-Classifier: Development Status :: 3 - Alpha
+Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: 3.14
-Requires-Python: >=3.10
+Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: frictionless>=5.18.1

{sunstone_py-0.5.1 → sunstone_py-0.5.3}/tests/test_dataframe.py RENAMED Viewed

@@ -25,20 +25,19 @@ class TestDataFrameBasics:
         assert len(df.data) > 0
         assert len(df.data.columns) > 0
         assert len(df.lineage.sources) > 0
-        assert df.lineage.operations is not None
-    def test_apply_operation(self, project_path: Path) -> None:
-        """Test applying an operation to a DataFrame."""
+    def test_head_preserves_lineage(self, project_path: Path) -> None:
+        """Test that head() preserves lineage."""
         df = sunstone.DataFrame.read_csv(
             "inputs/official_un_member_states_raw.csv",
             project_path=project_path,
             strict=False,
         )
-        filtered = df.apply_operation(lambda d: d.head(10), description="Select first 10 rows")
+        filtered = df.head(10)
         assert len(filtered.data) == 10
-        assert len(filtered.lineage.operations) > len(df.lineage.operations)
+        assert len(filtered.lineage.sources) == len(df.lineage.sources)
     def test_read_second_dataset(self, project_path: Path) -> None:
         """Test reading the same dataset twice creates separate lineage."""
@@ -68,10 +67,7 @@ class TestDataFrameMerge:
             strict=False,
         )
         # Filter to create a subset
-        return df.apply_operation(
-            lambda d: d[d["ISO Code"].notna()].head(50),
-            description="Select first 50 countries with ISO codes",
-        )
+        return df[df.data["ISO Code"].notna()].head(50)
     @pytest.fixture
     def un_members_df2(self, project_path: Path) -> Any:
@@ -82,10 +78,7 @@ class TestDataFrameMerge:
             strict=False,
         )
         # Select different columns as a second dataset
-        return df.apply_operation(
-            lambda d: d[["Member State", "ISO Code", "Start date"]].dropna(),
-            description="Select subset of columns",
-        )
+        return df[["Member State", "ISO Code", "Start date"]].dropna()
     def test_merge_dataframes(self, un_members_df1: Any, un_members_df2: Any) -> None:
         """Test merging two DataFrames."""
@@ -95,7 +88,6 @@ class TestDataFrameMerge:
         assert len(merged.data) > 0
         # Both sources come from the same file, but lineage should track them separately
         assert len(merged.lineage.sources) >= 1
-        assert len(merged.lineage.operations) > 0
     def test_merge_lineage_tracking(self, un_members_df1: Any, un_members_df2: Any) -> None:
         """Test that merge properly tracks lineage."""
@@ -117,11 +109,9 @@ class TestLineageMetadata:
             project_path=project_path,
             strict=False,
         )
-        # Apply some operations to build lineage
-        filtered = un_members.apply_operation(
-            lambda d: d[d["ISO Code"].notna()], description="Filter countries with ISO codes"
-        )
-        return filtered.apply_operation(lambda d: d.head(100), description="Select first 100 countries")
+        # Apply some operations
+        filtered = un_members[un_members.data["ISO Code"].notna()]
+        return filtered.head(100)
     def test_lineage_to_dict(self, processed_df: Any) -> None:
         """Test converting lineage to dictionary."""
@@ -129,11 +119,8 @@ class TestLineageMetadata:
         assert lineage_dict is not None
         assert "sources" in lineage_dict
-        assert "operations" in lineage_dict
-        assert "created_at" in lineage_dict
-        assert "licenses" in lineage_dict
+        # created_at is only set when writing output (not when reading)
         assert len(lineage_dict["sources"]) > 0
-        assert len(lineage_dict["operations"]) > 0
 class TestStrictMode:
@@ -172,8 +159,8 @@ class TestReadDataset:
         assert len(df.data) > 0
         assert len(df.data.columns) > 0
         assert len(df.lineage.sources) > 0
-        # Check that the lineage operation mentions the format
-        assert any("format=csv" in op for op in df.lineage.operations)
+        # Check that the source is tracked
+        assert df.lineage.sources[0].slug == "official-un-member-states"
     def test_read_dataset_with_explicit_format(self, project_path: Path) -> None:
         """Test reading a dataset with explicit format override."""
@@ -186,7 +173,7 @@ class TestReadDataset:
         assert df is not None
         assert len(df.data) > 0
-        assert any("format=csv" in op for op in df.lineage.operations)
+        assert len(df.lineage.sources) > 0
     def test_read_dataset_slug_not_found(self, project_path: Path) -> None:
         """Test that reading non-existent slug raises error."""
@@ -221,5 +208,159 @@ class TestReadDataset:
         assert df is not None
         assert len(df.data) > 0
-        # Should have the read_dataset operation in lineage
-        assert any("read_dataset" in op for op in df.lineage.operations)
+        # Check that the source is tracked
+        assert len(df.lineage.sources) > 0
+class TestContentHashLineage:
+    """Tests for content-hash based lineage tracking."""
+    def test_content_hash_computed_on_save(self, project_path: Path, tmp_path: Path) -> None:
+        """Test that content hash is computed and saved when writing output."""
+        import shutil
+        from ruamel.yaml import YAML
+        # Create a copy of the project in tmp_path to avoid modifying original
+        test_project = tmp_path / "test_project"
+        shutil.copytree(project_path, test_project)
+        df = sunstone.DataFrame.read_csv(
+            "inputs/official_un_member_states_raw.csv",
+            project_path=test_project,
+            strict=False,
+        )
+        # Write the output
+        output_path = "outputs/test_output.csv"
+        df.to_csv(output_path, slug="test-output", name="Test Output", index=False)
+        # Read the datasets.yaml and check for content_hash
+        yaml = YAML()
+        with open(test_project / "datasets.yaml") as f:
+            data = yaml.load(f)
+        # Find the output dataset
+        output = next((d for d in data.get("outputs", []) if d["slug"] == "test-output"), None)
+        assert output is not None
+        assert "lineage" in output
+        assert "content_hash" in output["lineage"]
+        assert "created_at" in output["lineage"]
+        # Hash should be a 64-character hex string (SHA256)
+        assert len(output["lineage"]["content_hash"]) == 64
+    def test_timestamp_not_updated_when_content_unchanged(self, project_path: Path, tmp_path: Path) -> None:
+        """Test that timestamp stays the same when saving identical content."""
+        import shutil
+        import time
+        from ruamel.yaml import YAML
+        # Create a copy of the project in tmp_path
+        test_project = tmp_path / "test_project"
+        shutil.copytree(project_path, test_project)
+        df = sunstone.DataFrame.read_csv(
+            "inputs/official_un_member_states_raw.csv",
+            project_path=test_project,
+            strict=False,
+        )
+        output_path = "outputs/stable_output.csv"
+        # First write
+        df.to_csv(output_path, slug="stable-output", name="Stable Output", index=False)
+        # Read the first timestamp and hash
+        yaml = YAML()
+        with open(test_project / "datasets.yaml") as f:
+            data1 = yaml.load(f)
+        output1 = next((d for d in data1.get("outputs", []) if d["slug"] == "stable-output"), None)
+        assert output1 is not None
+        first_timestamp = output1["lineage"]["created_at"]
+        first_hash = output1["lineage"]["content_hash"]
+        # Wait a bit to ensure different timestamp would be generated
+        time.sleep(0.1)
+        # Reload the manager and write again with the same data
+        df2 = sunstone.DataFrame.read_csv(
+            "inputs/official_un_member_states_raw.csv",
+            project_path=test_project,
+            strict=False,
+        )
+        df2.to_csv(output_path, slug="stable-output", name="Stable Output", index=False)
+        # Read the second timestamp and hash
+        with open(test_project / "datasets.yaml") as f:
+            data2 = yaml.load(f)
+        output2 = next((d for d in data2.get("outputs", []) if d["slug"] == "stable-output"), None)
+        assert output2 is not None
+        second_timestamp = output2["lineage"]["created_at"]
+        second_hash = output2["lineage"]["content_hash"]
+        # Hash should be the same
+        assert first_hash == second_hash
+        # Timestamp should NOT have changed since content is identical
+        assert first_timestamp == second_timestamp
+    def test_timestamp_updated_when_content_changes(self, project_path: Path, tmp_path: Path) -> None:
+        """Test that timestamp is updated when content actually changes."""
+        import shutil
+        import time
+        from ruamel.yaml import YAML
+        # Create a copy of the project in tmp_path
+        test_project = tmp_path / "test_project"
+        shutil.copytree(project_path, test_project)
+        df = sunstone.DataFrame.read_csv(
+            "inputs/official_un_member_states_raw.csv",
+            project_path=test_project,
+            strict=False,
+        )
+        output_path = "outputs/changing_output.csv"
+        # First write
+        df.to_csv(output_path, slug="changing-output", name="Changing Output", index=False)
+        # Read the first timestamp and hash
+        yaml = YAML()
+        with open(test_project / "datasets.yaml") as f:
+            data1 = yaml.load(f)
+        output1 = next((d for d in data1.get("outputs", []) if d["slug"] == "changing-output"), None)
+        assert output1 is not None
+        first_timestamp = output1["lineage"]["created_at"]
+        first_hash = output1["lineage"]["content_hash"]
+        # Wait a bit to ensure different timestamp
+        time.sleep(0.1)
+        # Modify the data and write again
+        df2 = sunstone.DataFrame.read_csv(
+            "inputs/official_un_member_states_raw.csv",
+            project_path=test_project,
+            strict=False,
+        )
+        # Actually modify the content - take only first 10 rows
+        df2_modified = df2.head(10)
+        df2_modified.to_csv(output_path, slug="changing-output", name="Changing Output", index=False)
+        # Read the second timestamp and hash
+        with open(test_project / "datasets.yaml") as f:
+            data2 = yaml.load(f)
+        output2 = next((d for d in data2.get("outputs", []) if d["slug"] == "changing-output"), None)
+        assert output2 is not None
+        second_timestamp = output2["lineage"]["created_at"]
+        second_hash = output2["lineage"]["content_hash"]
+        # Hash should be different since content changed
+        assert first_hash != second_hash
+        # Timestamp SHOULD have changed since content is different
+        assert first_timestamp != second_timestamp

{sunstone_py-0.5.1 → sunstone_py-0.5.3}/tests/test_lineage_persistence.py RENAMED Viewed

@@ -22,26 +22,17 @@ class TestLineagePersistence:
         assert hasattr(result, "lineage")
         assert len(result.lineage.sources) == len(df.lineage.sources)
-        # Check operation tracking
-        # We expect the operation to be recorded, ideally
-        assert any("head" in op for op in result.lineage.operations)
     def test_getitem_preserves_lineage(self, project_path: Path) -> None:
         """Test that boolean indexing/getitem returns sunstone DataFrame."""
         df = sunstone.DataFrame.read_csv(
             "inputs/official_un_member_states_raw.csv", project_path=project_path, strict=False
         )
-        # Boolean masking (returns DataFrame)
-        # Assuming 'Year' or some column exists, checking columns first
-        # Using the columns we saw in previous turns or just slicing
         # Let's just slice columns, which returns a DataFrame
         result = df[["Member State", "ISO Code"]]
         assert isinstance(result, sunstone.DataFrame)
         assert len(result.lineage.sources) == len(df.lineage.sources)
-        # Operation tracking for getitem might be tricky to name perfectly, but should exist
     def test_sort_values_preserves_lineage(self, project_path: Path) -> None:
         """Test that sort_values returns sunstone DataFrame."""
@@ -53,17 +44,16 @@ class TestLineagePersistence:
         assert isinstance(result, sunstone.DataFrame)
         assert len(result.lineage.sources) == len(df.lineage.sources)
-        assert any("sort_values" in op for op in result.lineage.operations)
     def test_setitem_preserves_lineage(self, project_path: Path) -> None:
-        """Test that in-place modification tracks lineage."""
+        """Test that in-place modification preserves lineage."""
         df = sunstone.DataFrame.read_csv(
             "inputs/official_un_member_states_raw.csv", project_path=project_path, strict=False
         )
-        initial_ops = len(df.lineage.operations)
+        initial_sources = len(df.lineage.sources)
         df["NewCol"] = 1
         assert "NewCol" in df.data.columns
-        assert len(df.lineage.operations) > initial_ops
-        assert any("__setitem__" in op for op in df.lineage.operations)
+        # Lineage sources should be preserved after setitem
+        assert len(df.lineage.sources) == initial_sources

{sunstone_py-0.5.1 → sunstone_py-0.5.3}/tests/test_pandas_compatibility.py RENAMED Viewed

@@ -131,10 +131,11 @@ class TestSelectionAndIndexing:
         """Test setting column values like pandas."""
         # Create a copy to avoid modifying fixture
         df = sample_df.head()
+        initial_sources = len(df.lineage.sources)
         df["test_column"] = "test_value"
         assert "test_column" in df.columns
-        # Lineage should track this operation
-        assert any("setitem" in op.lower() for op in df.lineage.operations)
+        # Lineage sources should be preserved
+        assert len(df.lineage.sources) == initial_sources
 class TestDataManipulation: