sunstone-py 0.4.2__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/PKG-INFO +5 -4
  2. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/README.md +3 -3
  3. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/pyproject.toml +7 -1
  4. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone/_release.py +3 -20
  5. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone/dataframe.py +39 -7
  6. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone/datasets.py +98 -4
  7. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone_py.egg-info/PKG-INFO +5 -4
  8. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone_py.egg-info/requires.txt +1 -0
  9. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/tests/test_dataframe.py +17 -16
  10. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/tests/test_datasets.py +42 -41
  11. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/tests/test_lineage_persistence.py +6 -4
  12. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/tests/test_pandas_compatibility.py +55 -54
  13. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/LICENSE +0 -0
  14. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/setup.cfg +0 -0
  15. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone/__init__.py +0 -0
  16. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone/exceptions.py +0 -0
  17. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone/lineage.py +0 -0
  18. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone/pandas.py +0 -0
  19. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone/py.typed +0 -0
  20. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone/validation.py +0 -0
  21. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone_py.egg-info/SOURCES.txt +0 -0
  22. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone_py.egg-info/dependency_links.txt +0 -0
  23. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone_py.egg-info/entry_points.txt +0 -0
  24. {sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone_py.egg-info/top_level.txt +0 -0
{sunstone_py-0.4.2 → sunstone_py-0.5.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sunstone-py
-Version: 0.4.2
+Version: 0.5.1
 Summary: Python library for managing datasets with lineage tracking in Sunstone projects
 Author-email: Sunstone Institute <stig@sunstone.institute>
 License: MIT
@@ -24,6 +24,7 @@ Requires-Dist: google-auth>=2.43.0
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: requests>=2.31.0
+Requires-Dist: ruamel-yaml>=0.18
 Dynamic: license-file
 
 # sunstone-py
@@ -324,14 +325,14 @@ uv run pytest
 ### Type Checking
 
 ```bash
-uv run mypy src/sunstone
+uv run mypy
 ```
 
 ### Linting and Formatting
 
 ```bash
-uv run ruff check src/sunstone
-uv run ruff format src/sunstone
+uv run ruff check
+uv run ruff format
 ```
 
 ## About Sunstone Institute
{sunstone_py-0.4.2 → sunstone_py-0.5.1}/README.md
@@ -296,14 +296,14 @@ uv run pytest
 ### Type Checking
 
 ```bash
-uv run mypy src/sunstone
+uv run mypy
 ```
 
 ### Linting and Formatting
 
 ```bash
-uv run ruff check src/sunstone
-uv run ruff format src/sunstone
+uv run ruff check
+uv run ruff format
 ```
 
 ## About Sunstone Institute
{sunstone_py-0.4.2 → sunstone_py-0.5.1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sunstone-py"
-version = "0.4.2"
+version = "0.5.1"
 description = "Python library for managing datasets with lineage tracking in Sunstone projects"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -30,6 +30,7 @@ dependencies = [
     "pandas>=2.0.0",
     "pyyaml>=6.0",
     "requests>=2.31.0",
+    "ruamel-yaml>=0.18",
 ]
 
 [project.urls]
@@ -59,6 +60,10 @@ python_version = "3.10"
 warn_return_any = true
 warn_unused_configs = true
 disallow_untyped_defs = true
+check_untyped_defs = false
+exclude = [
+    "^tests/testdata/",
+]
 
 [dependency-groups]
 dev = [
@@ -69,6 +74,7 @@ dev = [
     "types-requests>=2.32.4.20250913",
     "pandas-stubs>=2.3.2.250926",
     "types-pyyaml>=6.0.12.20250915",
+    "markdown>=3.10",
 ]
 docs = [
     "mkdocs-material>=9.5.0",
{sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone/_release.py
@@ -164,29 +164,12 @@ def get_last_tag() -> str | None:
 def generate_changelog_from_git() -> str:
     """Generate changelog entries from git commits since last tag using Claude."""
     last_tag = get_last_tag()
-    if last_tag:
-        commit_range = f"{last_tag}..HEAD"
-    else:
-        commit_range = "HEAD"
-
-    # Get commits since last tag
-    result = run_git("log", commit_range, "--pretty=format:%s")
-    if result.returncode != 0 or not result.stdout.strip():
-        return ""
-
-    commits = result.stdout.strip()
-
-    prompt = f"""Convert these git commit messages into Keep a Changelog format entries.
-Categorize under: Added, Changed, Fixed, Removed, Security (only include categories that apply).
-Be concise. Skip merge commits, version bump commits, and release commits.
-Output ONLY the markdown entries with ### headers for categories, nothing else.
-
-Commits:
-{commits}"""
+    if last_tag is None:
+        last_tag = "HEAD~1"
 
     print("Generating changelog entries with Claude...")
     claude_result = subprocess.run(
-        ["claude", "-p", "--model=haiku", prompt],
+        ["claude", "-p", f"/generate-changelog {last_tag}"],
         capture_output=True,
         text=True,
         cwd=get_root_dir(),
{sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone/dataframe.py
@@ -323,7 +323,7 @@ class DataFrame:
             path_or_buf: File path for the output CSV.
             slug: Dataset slug (required in relaxed mode if not registered).
             name: Dataset name (required in relaxed mode if not registered).
-            publish: Whether to publish the dataset.
+            publish: bool = False,
             **kwargs: Additional arguments passed to pandas.to_csv.
 
         Raises:
@@ -366,6 +366,9 @@ class DataFrame:
         # Record the operation
         self.lineage.add_operation(f"to_csv({dataset.slug})")
 
+        # Persist lineage metadata to datasets.yaml
+        manager.update_output_lineage(slug=dataset.slug, lineage=self.lineage, strict=self.strict_mode)
+
     def _infer_field_schema(self) -> List[FieldSchema]:
         """
         Infer field schema from the DataFrame.
@@ -510,13 +513,13 @@ class DataFrame:
             project_path=self.lineage.project_path,
         )
 
-    def _wrap_result(self, result: Any, operation: str = "pandas_operation") -> Any:
+    def _wrap_result(self, result: Any, operation: Optional[str] = None) -> Any:
        """
         Wrap a pandas result in a Sunstone DataFrame if applicable.
 
         Args:
             result: The result from a pandas operation.
-            operation: Name of the operation performed.
+            operation: Name of the operation performed. If None, no operation is recorded.
 
         Returns:
             Wrapped DataFrame if result is a DataFrame, otherwise the result.
@@ -527,7 +530,8 @@ class DataFrame:
                 operations=self.lineage.operations.copy(),
                 project_path=self.lineage.project_path,
             )
-            new_lineage.add_operation(operation)
+            if operation is not None:
+                new_lineage.add_operation(operation)
 
             return DataFrame(
                 data=result,
@@ -537,6 +541,28 @@ class DataFrame:
             )
         return result
 
+    # Methods that don't represent meaningful data transformations
+    # These return DataFrames but shouldn't be tracked in lineage
+    _NON_TRACKING_METHODS = frozenset(
+        {
+            # Copy operations - same data, no transformation
+            "copy",
+            # Index operations - same data, different index
+            "reset_index",
+            "set_index",
+            "reindex",
+            # Type conversions without data change
+            "astype",
+            "infer_objects",
+            # Column/index renaming - same data, different labels
+            "rename",
+            "rename_axis",
+            # Reshaping without data loss
+            "T",
+            "transpose",
+        }
+    )
+
     def __getattr__(self, name: str) -> Any:
         """
         Delegate attribute access to the underlying pandas DataFrame.
@@ -557,11 +583,14 @@ class DataFrame:
 
             def wrapper(*args: Any, **kwargs: Any) -> Any:
                 result = attr(*args, **kwargs)
+                # Don't track non-transforming methods
+                if name in DataFrame._NON_TRACKING_METHODS:
+                    return self._wrap_result(result, operation=None)
                 return self._wrap_result(result, operation=f"{name}")
 
             return wrapper
 
-        return self._wrap_result(attr, operation=f"access_attribute_{name}")
+        return self._wrap_result(attr, operation=None)  # Don't track attribute access
 
     def __getitem__(self, key: Any) -> Any:
         """
@@ -574,7 +603,9 @@ class DataFrame:
             The item from the underlying DataFrame, wrapped if it's a DataFrame.
         """
         result = self.data[key]
-        return self._wrap_result(result, operation="__getitem__")
+        # Don't track __getitem__ as an operation - it's just column/row access
+        # not a meaningful transformation
+        return self._wrap_result(result, operation=None)
 
     def __setitem__(self, key: Any, value: Any) -> None:
         """
@@ -585,7 +616,8 @@ class DataFrame:
             value: Value to assign.
         """
         self.data[key] = value
-        self.lineage.add_operation("__setitem__")
+        # Track column assignment in lineage
+        self.lineage.add_operation(f"__setitem__({key!r})")
 
     def __repr__(self) -> str:
         """String representation of the DataFrame."""
{sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone/datasets.py
@@ -4,19 +4,27 @@ Parser and manager for datasets.yaml files.
 
 import ipaddress
 import logging
+import os
 import socket
+import tempfile
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import urljoin, urlparse
 
 import requests
-import yaml
+from ruamel.yaml import YAML
 
 from .exceptions import DatasetNotFoundError, DatasetValidationError
-from .lineage import DatasetMetadata, FieldSchema, Source, SourceLocation
+from .lineage import DatasetMetadata, FieldSchema, LineageMetadata, Source, SourceLocation
 
 logger = logging.getLogger(__name__)
 
+# Configure ruamel.yaml for round-trip parsing (preserves comments) with proper indentation
+_yaml = YAML()
+_yaml.preserve_quotes = True
+_yaml.default_flow_style = False
+_yaml.indent(mapping=2, sequence=4, offset=2)
+
 
 def _is_public_url(url: str) -> bool:
     """
@@ -109,7 +117,7 @@ class DatasetsManager:
     def _load(self) -> None:
         """Load and parse the datasets.yaml file."""
         with open(self.datasets_file, "r") as f:
-            self._data = yaml.safe_load(f) or {}
+            self._data = _yaml.load(f) or {}
 
         if "inputs" not in self._data:
             self._data["inputs"] = []
@@ -119,7 +127,7 @@ class DatasetsManager:
     def _save(self) -> None:
         """Save the current data back to datasets.yaml."""
         with open(self.datasets_file, "w") as f:
-            yaml.dump(self._data, f, default_flow_style=False, sort_keys=False)
+            _yaml.dump(self._data, f)
 
     def _parse_source_location(self, loc_data: Dict[str, Any]) -> SourceLocation:
         """Parse source location data from YAML."""
@@ -372,6 +380,92 @@ class DatasetsManager:
 
         raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
 
+    def update_output_lineage(self, slug: str, lineage: LineageMetadata, strict: bool = False) -> None:
+        """
+        Update lineage metadata for an output dataset.
+
+        In strict mode, validates that the lineage matches what would be written
+        without modifying the file. In relaxed mode, updates the file with lineage.
+
+        Args:
+            slug: The slug of the output dataset to update.
+            lineage: The lineage metadata to persist.
+            strict: If True, validate without modifying. If False, update the file.
+
+        Raises:
+            DatasetNotFoundError: If the dataset doesn't exist.
+            DatasetValidationError: In strict mode, if lineage differs from what's in the file.
+        """
+        # Find the output dataset
+        dataset_idx = None
+        for i, dataset_data in enumerate(self._data["outputs"]):
+            if dataset_data["slug"] == slug:
+                dataset_idx = i
+                break
+
+        if dataset_idx is None:
+            raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
+
+        # Build lineage metadata to add
+        lineage_data: dict[str, Any] = {}
+
+        if lineage.sources:
+            lineage_data["sources"] = [
+                {
+                    "slug": src.slug,
+                    "name": src.name,
+                }
+                for src in lineage.sources
+            ]
+
+        if lineage.operations:
+            lineage_data["operations"] = lineage.operations.copy()
+
+        if lineage.created_at:
+            lineage_data["created_at"] = lineage.created_at.isoformat()
+
+        # Create a copy of the data with updated lineage
+        updated_data = self._data.copy()
+        updated_data["outputs"] = [dict(d) for d in self._data["outputs"]]
+        updated_data["outputs"][dataset_idx] = dict(self._data["outputs"][dataset_idx])
+
+        # Add or update lineage in the copy
+        if lineage_data:
+            updated_data["outputs"][dataset_idx]["lineage"] = lineage_data
+
+        # Write to temp file
+        temp_fd, temp_path = tempfile.mkstemp(suffix=".yaml", prefix="datasets_", dir=self.project_path)
+
+        try:
+            with os.fdopen(temp_fd, "w") as f:
+                _yaml.dump(updated_data, f)
+
+            if strict:
+                # In strict mode, check if files differ
+                import filecmp
+
+                if not filecmp.cmp(self.datasets_file, temp_path, shallow=False):
+                    # Files differ - this is an error in strict mode
+                    os.unlink(temp_path)
+                    raise DatasetValidationError(
+                        f"In strict mode, lineage metadata for '{slug}' would be updated in datasets.yaml. "
+                        f"Expected lineage is already present in the file, but found differences."
+                    )
+                else:
+                    # Files are the same - clean up temp file
+                    os.unlink(temp_path)
+            else:
+                # In relaxed mode, replace the file
+                os.replace(temp_path, self.datasets_file)
+                # Reload the data
+                self._load()
+
+        except Exception:
+            # Clean up temp file on error
+            if os.path.exists(temp_path):
+                os.unlink(temp_path)
+            raise
+
     def get_absolute_path(self, location: str) -> Path:
         """
         Get the absolute path for a dataset location.
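
End to end, the new `update_output_lineage` hook means a `to_csv` call in relaxed mode now persists the frame's lineage under the output's `lineage` key in datasets.yaml, while strict mode only verifies it. A hedged sketch of that flow, reusing illustrative names from above (the `to_csv` keywords shown are the ones documented in this diff: `path_or_buf`, `slug`, `name`, `publish`):

```python
import sunstone

df = sunstone.DataFrame.read_csv(
    "inputs/official_un_member_states_raw.csv",
    project_path="my_project",  # illustrative
    strict=False,
)
out = df.apply_operation(lambda d: d.head(100), description="Select first 100 countries")

# Relaxed mode: writes the CSV, records to_csv(<slug>) in lineage, then
# persists sources/operations/created_at under the output's `lineage`
# key in datasets.yaml via DatasetsManager.update_output_lineage.
out.to_csv(
    "outputs/first_100.csv",
    slug="first-100-countries",  # illustrative slug
    name="First 100 countries",
)

# Strict mode (e.g. SUNSTONE_DATAFRAME_STRICT=1, as in the tests): the
# same call instead compares the would-be datasets.yaml against the file
# on disk and raises DatasetValidationError on any difference, leaving
# the file untouched.
```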
{sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone_py.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sunstone-py
-Version: 0.4.2
+Version: 0.5.1
 Summary: Python library for managing datasets with lineage tracking in Sunstone projects
 Author-email: Sunstone Institute <stig@sunstone.institute>
 License: MIT
@@ -24,6 +24,7 @@ Requires-Dist: google-auth>=2.43.0
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: requests>=2.31.0
+Requires-Dist: ruamel-yaml>=0.18
 Dynamic: license-file
 
 # sunstone-py
@@ -324,14 +325,14 @@ uv run pytest
 ### Type Checking
 
 ```bash
-uv run mypy src/sunstone
+uv run mypy
 ```
 
 ### Linting and Formatting
 
 ```bash
-uv run ruff check src/sunstone
-uv run ruff format src/sunstone
+uv run ruff check
+uv run ruff format
 ```
 
 ## About Sunstone Institute
{sunstone_py-0.4.2 → sunstone_py-0.5.1}/src/sunstone_py.egg-info/requires.txt
@@ -3,3 +3,4 @@ google-auth>=2.43.0
 pandas>=2.0.0
 pyyaml>=6.0
 requests>=2.31.0
+ruamel-yaml>=0.18
{sunstone_py-0.4.2 → sunstone_py-0.5.1}/tests/test_dataframe.py
@@ -3,6 +3,7 @@ Tests for Sunstone DataFrame functionality.
 """
 
 from pathlib import Path
+from typing import Any
 
 import pytest
 
@@ -12,7 +13,7 @@ import sunstone
 class TestDataFrameBasics:
     """Tests for basic DataFrame operations."""
 
-    def test_read_csv(self, project_path: Path):
+    def test_read_csv(self, project_path: Path) -> None:
         """Test reading a CSV file into a DataFrame."""
         df = sunstone.DataFrame.read_csv(
             "inputs/official_un_member_states_raw.csv",
@@ -26,7 +27,7 @@ class TestDataFrameBasics:
         assert len(df.lineage.sources) > 0
         assert df.lineage.operations is not None
 
-    def test_apply_operation(self, project_path: Path):
+    def test_apply_operation(self, project_path: Path) -> None:
         """Test applying an operation to a DataFrame."""
         df = sunstone.DataFrame.read_csv(
             "inputs/official_un_member_states_raw.csv",
@@ -39,7 +40,7 @@ class TestDataFrameBasics:
         assert len(filtered.data) == 10
         assert len(filtered.lineage.operations) > len(df.lineage.operations)
 
-    def test_read_second_dataset(self, project_path: Path):
+    def test_read_second_dataset(self, project_path: Path) -> None:
         """Test reading the same dataset twice creates separate lineage."""
         members1 = sunstone.DataFrame.read_csv(
             "inputs/official_un_member_states_raw.csv", project_path=project_path, strict=False
@@ -59,7 +60,7 @@ class TestDataFrameMerge:
     """Tests for DataFrame merge operations."""
 
     @pytest.fixture
-    def un_members_df1(self, project_path: Path):
+    def un_members_df1(self, project_path: Path) -> Any:
         """Load UN members DataFrame (first instance)."""
         df = sunstone.DataFrame.read_csv(
             "inputs/official_un_member_states_raw.csv",
@@ -73,7 +74,7 @@ class TestDataFrameMerge:
         )
 
     @pytest.fixture
-    def un_members_df2(self, project_path: Path):
+    def un_members_df2(self, project_path: Path) -> Any:
         """Load UN members DataFrame (second instance)."""
         df = sunstone.DataFrame.read_csv(
             "inputs/official_un_member_states_raw.csv",
@@ -86,7 +87,7 @@ class TestDataFrameMerge:
             description="Select subset of columns",
         )
 
-    def test_merge_dataframes(self, un_members_df1, un_members_df2):
+    def test_merge_dataframes(self, un_members_df1: Any, un_members_df2: Any) -> None:
         """Test merging two DataFrames."""
         merged = un_members_df1.merge(un_members_df2, left_on="ISO Code", right_on="ISO Code", how="inner")
 
@@ -96,7 +97,7 @@ class TestDataFrameMerge:
         assert len(merged.lineage.sources) >= 1
         assert len(merged.lineage.operations) > 0
 
-    def test_merge_lineage_tracking(self, un_members_df1, un_members_df2):
+    def test_merge_lineage_tracking(self, un_members_df1: Any, un_members_df2: Any) -> None:
         """Test that merge properly tracks lineage."""
         merged = un_members_df1.merge(un_members_df2, left_on="ISO Code", right_on="ISO Code", how="inner")
 
@@ -109,7 +110,7 @@ class TestLineageMetadata:
     """Tests for lineage metadata functionality."""
 
     @pytest.fixture
-    def processed_df(self, project_path: Path):
+    def processed_df(self, project_path: Path) -> Any:
         """Create a processed DataFrame for testing."""
         un_members = sunstone.DataFrame.read_csv(
             "inputs/official_un_member_states_raw.csv",
@@ -122,7 +123,7 @@ class TestLineageMetadata:
         )
         return filtered.apply_operation(lambda d: d.head(100), description="Select first 100 countries")
 
-    def test_lineage_to_dict(self, processed_df):
+    def test_lineage_to_dict(self, processed_df: Any) -> None:
         """Test converting lineage to dictionary."""
         lineage_dict = processed_df.lineage.to_dict()
 
@@ -138,7 +139,7 @@ class TestLineageMetadata:
 class TestStrictMode:
     """Tests for strict mode functionality."""
 
-    def test_strict_mode_load(self, project_path: Path, monkeypatch):
+    def test_strict_mode_load(self, project_path: Path, monkeypatch: Any) -> None:
         """Test loading DataFrame in strict mode."""
         monkeypatch.setenv("SUNSTONE_DATAFRAME_STRICT", "1")
 
@@ -146,7 +147,7 @@ class TestStrictMode:
 
         assert strict_df.strict_mode is True
 
-    def test_strict_mode_prevents_unregistered_write(self, project_path: Path, monkeypatch):
+    def test_strict_mode_prevents_unregistered_write(self, project_path: Path, monkeypatch: Any) -> None:
         """Test that strict mode prevents writing to unregistered locations."""
         monkeypatch.setenv("SUNSTONE_DATAFRAME_STRICT", "1")
 
@@ -159,7 +160,7 @@ class TestReadDataset:
 class TestReadDataset:
     """Tests for read_dataset() functionality with format auto-detection."""
 
-    def test_read_dataset_by_slug(self, project_path: Path):
+    def test_read_dataset_by_slug(self, project_path: Path) -> None:
         """Test reading a dataset by slug with auto-detection."""
         df = sunstone.DataFrame.read_dataset(
             "official-un-member-states",
@@ -174,7 +175,7 @@ class TestReadDataset:
         # Check that the lineage operation mentions the format
         assert any("format=csv" in op for op in df.lineage.operations)
 
-    def test_read_dataset_with_explicit_format(self, project_path: Path):
+    def test_read_dataset_with_explicit_format(self, project_path: Path) -> None:
         """Test reading a dataset with explicit format override."""
         df = sunstone.DataFrame.read_dataset(
             "official-un-member-states",
@@ -187,7 +188,7 @@ class TestReadDataset:
         assert len(df.data) > 0
         assert any("format=csv" in op for op in df.lineage.operations)
 
-    def test_read_dataset_slug_not_found(self, project_path: Path):
+    def test_read_dataset_slug_not_found(self, project_path: Path) -> None:
         """Test that reading non-existent slug raises error."""
         with pytest.raises(sunstone.DatasetNotFoundError) as exc_info:
             sunstone.DataFrame.read_dataset(
@@ -197,7 +198,7 @@ class TestReadDataset:
 
         assert "not found in datasets.yaml" in str(exc_info.value)
 
-    def test_read_dataset_via_pandas_api(self, project_path: Path):
+    def test_read_dataset_via_pandas_api(self, project_path: Path) -> None:
         """Test reading dataset via pandas-like API."""
         from sunstone import pandas as pd
 
@@ -210,7 +211,7 @@ class TestReadDataset:
         assert len(df.data) > 0
         assert isinstance(df, sunstone.DataFrame)
 
-    def test_read_csv_with_slug_delegates_to_read_dataset(self, project_path: Path):
+    def test_read_csv_with_slug_delegates_to_read_dataset(self, project_path: Path) -> None:
         """Test that read_csv with slug delegates to read_dataset."""
         df = sunstone.DataFrame.read_csv(
             "official-un-member-states",