sunstone-py 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl

sunstone/cli.py ADDED
@@ -0,0 +1,542 @@
+"""
+Sunstone command-line interface.
+"""
+
+import json
+import os
+import re
+import sys
+import tomllib
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+
+import click
+from click.shell_completion import CompletionItem
+from ruamel.yaml import YAML
+
+from .datasets import DatasetsManager
+from .exceptions import DatasetNotFoundError
+
+# Configure ruamel.yaml for round-trip parsing
+_yaml = YAML()
+_yaml.preserve_quotes = True
+_yaml.default_flow_style = False
+_yaml.indent(mapping=2, sequence=4, offset=2)
+
+# Valid field types
+VALID_FIELD_TYPES = {"string", "number", "integer", "boolean", "date", "datetime"}
+
+# Pattern for ${VAR} or ${VAR:-default} substitution
+ENV_VAR_PATTERN = re.compile(r"\$\{([^}:]+)(?::-([^}]*))?\}")
+
+
+def get_project_slug(project_path: Path) -> str:
+    """
+    Get the project slug from pyproject.toml or directory name.
+
+    Args:
+        project_path: Path to the project directory.
+
+    Returns:
+        The project slug (kebab-case identifier).
+    """
+    pyproject_path = project_path / "pyproject.toml"
+    if pyproject_path.exists():
+        try:
+            with open(pyproject_path, "rb") as f:
+                pyproject = tomllib.load(f)
+            name = pyproject.get("project", {}).get("name")
+            if isinstance(name, str):
+                return name
+        except Exception:
+            pass
+    return project_path.name
+
+
+def expand_env_vars(text: str) -> str:
+    """
+    Expand environment variables in text using ${VAR} or ${VAR:-default} syntax.
+
+    Args:
+        text: The text containing environment variable references.
+
+    Returns:
+        The text with environment variables expanded.
+    """
+
+    def replace_var(match: re.Match[str]) -> str:
+        var_name = match.group(1)
+        default_value = match.group(2)
+        value = os.environ.get(var_name)
+        if value is not None:
+            return value
+        if default_value is not None:
+            return default_value
+        return match.group(0)  # Return original if no value and no default
+
+    return ENV_VAR_PATTERN.sub(replace_var, text)
+
+
+def get_manager(datasets_file: str) -> tuple[DatasetsManager, Path]:
+    """Get DatasetsManager and project path from datasets file."""
+    datasets_path = Path(datasets_file).resolve()
+    project_path = datasets_path.parent
+    manager = DatasetsManager(project_path)
+    return manager, project_path
+
+
+def complete_dataset_slugs(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]:
+    """Shell completion for dataset slugs."""
+    # Get the datasets file from context or use default
+    datasets_file = ctx.params.get("datasets_file", "datasets.yaml")
+
+    try:
+        manager, _ = get_manager(datasets_file)
+        all_datasets = manager.get_all_inputs() + manager.get_all_outputs()
+        slugs = [ds.slug for ds in all_datasets]
+
+        return [CompletionItem(slug) for slug in slugs if slug.startswith(incomplete)]
+    except Exception:
+        return []
+
+
+# =============================================================================
+# Main CLI group
+# =============================================================================
+
+
+@click.group()
+@click.version_option()
+def main() -> None:
+    """Sunstone dataset and package management CLI."""
+    pass
+
+
+# =============================================================================
+# Dataset commands
+# =============================================================================
+
+
+@main.group()
+def dataset() -> None:
+    """Manage datasets in datasets.yaml."""
+    pass
+
+
+@dataset.command("list")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+def dataset_list(datasets_file: str) -> None:
+    """List all datasets."""
+    try:
+        manager, _ = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    inputs = manager.get_all_inputs()
+    outputs = manager.get_all_outputs()
+
+    if inputs:
+        click.echo("Inputs:")
+        for ds in inputs:
+            flags = []
+            if ds.strict:
+                flags.append("strict")
+            flag_str = f" [{', '.join(flags)}]" if flags else ""
+            click.echo(f" - {ds.slug} ({ds.name}){flag_str}")
+
+    if outputs:
+        if inputs:
+            click.echo()
+        click.echo("Outputs:")
+        for ds in outputs:
+            flags = []
+            if ds.is_publishable:
+                flags.append("publish")
+            if ds.strict:
+                flags.append("strict")
+            flag_str = f" [{', '.join(flags)}]" if flags else ""
+            click.echo(f" - {ds.slug} ({ds.name}){flag_str}")
+
+    if not inputs and not outputs:
+        click.echo("No datasets found.")
+
+
+@dataset.command("validate")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.argument("datasets", nargs=-1, shell_complete=complete_dataset_slugs)
+def dataset_validate(datasets_file: str, datasets: tuple[str, ...]) -> None:
+    """Validate datasets.
+
+    If no datasets are specified, validates all datasets.
+    """
+    datasets_path = Path(datasets_file).resolve()
+
+    errors: list[str] = []
+
+    # Load and parse YAML
+    try:
+        with open(datasets_path, "r") as f:
+            data = _yaml.load(f)
+    except Exception as e:
+        click.echo(f"Error: Failed to parse YAML: {e}", err=True)
+        sys.exit(1)
+
+    if data is None:
+        data = {}
+
+    # Check structure
+    if "inputs" not in data and "outputs" not in data:
+        errors.append("datasets.yaml must contain 'inputs' and/or 'outputs' lists")
+
+    # Track slugs for duplicate detection
+    all_slugs: dict[str, str] = {}  # slug -> type
+    datasets_to_validate = set(datasets) if datasets else None
+
+    def validate_dataset_entry(ds: dict, ds_type: str, index: int) -> None:
+        prefix = f"{ds_type}[{index}]"
+        slug = ds.get("slug")
+
+        # Skip if specific datasets requested and this isn't one of them
+        if datasets_to_validate and slug not in datasets_to_validate:
+            # Still track slug for duplicate detection
+            if slug:
+                all_slugs[slug] = ds_type
+            return
+
+        # Required fields
+        for field in ["name", "slug", "location", "fields"]:
+            if field not in ds:
+                errors.append(f"{prefix}: missing required field '{field}'")
+
+        # Check slug
+        if slug:
+            if slug in all_slugs:
+                errors.append(f"{prefix}: duplicate slug '{slug}' (also in {all_slugs[slug]})")
+            else:
+                all_slugs[slug] = ds_type
+
+        # Check fields
+        fields = ds.get("fields", [])
+        if not isinstance(fields, list):
+            errors.append(f"{prefix}: 'fields' must be a list")
+        else:
+            for i, field in enumerate(fields):
+                if not isinstance(field, dict):
+                    errors.append(f"{prefix}.fields[{i}]: must be an object")
+                    continue
+                if "name" not in field:
+                    errors.append(f"{prefix}.fields[{i}]: missing 'name'")
+                if "type" not in field:
+                    errors.append(f"{prefix}.fields[{i}]: missing 'type'")
+                elif field["type"] not in VALID_FIELD_TYPES:
+                    errors.append(
+                        f"{prefix}.fields[{i}]: invalid type '{field['type']}' "
+                        f"(must be one of: {', '.join(sorted(VALID_FIELD_TYPES))})"
+                    )
+
+    # Validate inputs
+    inputs = data.get("inputs", [])
+    if not isinstance(inputs, list):
+        errors.append("'inputs' must be a list")
+    else:
+        for i, ds in enumerate(inputs):
+            if not isinstance(ds, dict):
+                errors.append(f"inputs[{i}]: must be an object")
+            else:
+                validate_dataset_entry(ds, "inputs", i)
+
+    # Validate outputs
+    outputs = data.get("outputs", [])
+    if not isinstance(outputs, list):
+        errors.append("'outputs' must be a list")
+    else:
+        for i, ds in enumerate(outputs):
+            if not isinstance(ds, dict):
+                errors.append(f"outputs[{i}]: must be an object")
+            else:
+                validate_dataset_entry(ds, "outputs", i)
+
+    # Check if requested datasets were found
+    if datasets_to_validate:
+        found_slugs = set(all_slugs.keys())
+        missing = datasets_to_validate - found_slugs
+        for slug in missing:
+            errors.append(f"Dataset '{slug}' not found")
+
+    if errors:
+        click.echo("Validation errors:", err=True)
+        for error in errors:
+            click.echo(f" - {error}", err=True)
+        sys.exit(1)
+    else:
+        if datasets:
+            click.echo(f"✓ {len(datasets)} dataset(s) valid")
+        else:
+            click.echo(f"✓ {datasets_file} is valid")
+
+
+@dataset.command("lock")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.argument("datasets", nargs=-1, shell_complete=complete_dataset_slugs)
+def dataset_lock(datasets_file: str, datasets: tuple[str, ...]) -> None:
+    """Enable strict mode for datasets.
+
+    If no datasets are specified, locks all datasets.
+    """
+    try:
+        manager, _ = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    # Get all datasets if none specified
+    if not datasets:
+        all_datasets = manager.get_all_inputs() + manager.get_all_outputs()
+        datasets = tuple(ds.slug for ds in all_datasets)
+
+    if not datasets:
+        click.echo("No datasets found.")
+        return
+
+    locked = []
+    for slug in datasets:
+        try:
+            manager.set_dataset_strict(slug, strict=True)
+            locked.append(slug)
+        except DatasetNotFoundError:
+            click.echo(f"Warning: Dataset '{slug}' not found", err=True)
+
+    if locked:
+        click.echo(f"✓ Locked {len(locked)} dataset(s): {', '.join(locked)}")
+
+
+@dataset.command("unlock")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.argument("datasets", nargs=-1, shell_complete=complete_dataset_slugs)
+def dataset_unlock(datasets_file: str, datasets: tuple[str, ...]) -> None:
+    """Disable strict mode for datasets.
+
+    If no datasets are specified, unlocks all datasets.
+    """
+    try:
+        manager, _ = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    # Get all datasets if none specified
+    if not datasets:
+        all_datasets = manager.get_all_inputs() + manager.get_all_outputs()
+        datasets = tuple(ds.slug for ds in all_datasets)
+
+    if not datasets:
+        click.echo("No datasets found.")
+        return
+
+    unlocked = []
+    for slug in datasets:
+        try:
+            manager.set_dataset_strict(slug, strict=False)
+            unlocked.append(slug)
+        except DatasetNotFoundError:
+            click.echo(f"Warning: Dataset '{slug}' not found", err=True)
+
+    if unlocked:
+        click.echo(f"✓ Unlocked {len(unlocked)} dataset(s): {', '.join(unlocked)}")
+
+
+# =============================================================================
+# Package commands
+# =============================================================================
+
+
+@main.group()
+def package() -> None:
+    """Manage data packages."""
+    pass
+
+
+@package.command("build")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.option("-o", "--output", "output_file", type=click.Path(), default="datapackage.json", help="Output file path")
+def package_build(datasets_file: str, output_file: str) -> None:
+    """Build a datapackage.json from datasets.yaml.
+
+    Creates a Data Package (https://datapackage.org/) with all output datasets as resources.
+    """
+    try:
+        manager, project_path = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    outputs = manager.get_all_outputs()
+    if not outputs:
+        click.echo("No output datasets found.", err=True)
+        sys.exit(1)
+
+    project_slug = get_project_slug(project_path)
+
+    try:
+        from frictionless import describe
+    except ImportError:
+        click.echo("Error: frictionless is required for package build", err=True)
+        sys.exit(1)
+
+    resources = []
+    for ds in outputs:
+        data_path = manager.get_absolute_path(ds.location)
+        if not data_path.exists():
+            click.echo(f"Warning: Data file not found for '{ds.slug}': {data_path}", err=True)
+            continue
+
+        try:
+            resource = describe(str(data_path))
+            resource.name = ds.slug
+            resource.title = ds.name
+            # Use relative path in the package
+            resource.path = ds.location
+            resources.append(resource.to_dict())
+            click.echo(f" + {ds.slug}")
+        except Exception as e:
+            click.echo(f"Warning: Failed to describe '{ds.slug}': {e}", err=True)
+
+    if not resources:
+        click.echo("Error: No resources could be added to the package", err=True)
+        sys.exit(1)
+
+    datapackage = {
+        "name": project_slug,
+        "resources": resources,
+    }
+
+    output_path = Path(output_file)
+    with open(output_path, "w") as f:
+        json.dump(datapackage, f, indent=2)
+
+    click.echo(f"\n✓ Created {output_file} with {len(resources)} resource(s)")
+
+
+@package.command("push")
+@click.option("--env", type=click.Choice(["dev", "prod"]), default="dev", help="Target environment")
+@click.option(
+    "-f", "--file", "datasets_file", type=click.Path(exists=True), default="datasets.yaml", help="Path to datasets.yaml"
+)
+@click.option("--destination", "-d", "destination", type=str, default=None, help="Override destination gs:// URL")
+def package_push(env: str, datasets_file: str, destination: Optional[str]) -> None:
+    """Push the data package to Google Cloud Storage.
+
+    Uploads datapackage.json and all publishable output datasets.
+    """
+    try:
+        manager, project_path = get_manager(datasets_file)
+    except FileNotFoundError as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+    outputs = manager.get_all_outputs()
+    publishable = [ds for ds in outputs if ds.is_publishable]
+
+    if not publishable:
+        click.echo("Error: No publishable datasets found (need publish.enabled: true)", err=True)
+        sys.exit(1)
+
+    project_slug = get_project_slug(project_path)
+
+    # Determine destination
+    if destination:
+        dest_url = expand_env_vars(destination)
+    elif publishable[0].publish and publishable[0].publish.to:
+        # Use first dataset's publish.to as package destination
+        dest_url = expand_env_vars(publishable[0].publish.to)
+    else:
+        dest_url = f"gs://payloadcms-{env}/datasets/projects/{project_slug}/"
+
+    parsed = urlparse(dest_url)
+    if parsed.scheme != "gs":
+        click.echo(f"Error: Destination must be a gs:// URL, got: {dest_url}", err=True)
+        sys.exit(1)
+
+    bucket_name = parsed.netloc
+    gcs_prefix = parsed.path.lstrip("/")
+    if gcs_prefix and not gcs_prefix.endswith("/"):
+        gcs_prefix += "/"
+
+    # Build the datapackage
+    try:
+        from frictionless import describe
+    except ImportError:
+        click.echo("Error: frictionless is required for package push", err=True)
+        sys.exit(1)
+
+    resources = []
+    data_files: list[tuple[Path, str]] = []  # (local_path, remote_name)
+
+    for ds in publishable:
+        data_path = manager.get_absolute_path(ds.location)
+        if not data_path.exists():
+            click.echo(f"Warning: Data file not found for '{ds.slug}': {data_path}", err=True)
+            continue
+
+        try:
+            resource = describe(str(data_path))
+            resource.name = ds.slug
+            resource.title = ds.name
+            resource.path = data_path.name  # Just the filename in the package
+            resources.append(resource.to_dict())
+            data_files.append((data_path, data_path.name))
+        except Exception as e:
+            click.echo(f"Warning: Failed to describe '{ds.slug}': {e}", err=True)
+
+    if not resources:
+        click.echo("Error: No resources could be added to the package", err=True)
+        sys.exit(1)
+
+    datapackage = {
+        "name": project_slug,
+        "resources": resources,
+    }
+
+    # Upload to GCS
+    try:
+        from google.cloud import storage  # type: ignore[import-untyped]
+
+        client = storage.Client()
+        bucket = client.bucket(bucket_name)
+
+        # Upload datapackage.json
+        datapackage_blob = bucket.blob(f"{gcs_prefix}datapackage.json")
+        datapackage_blob.upload_from_string(json.dumps(datapackage, indent=2), content_type="application/json")
+        click.echo("✓ Uploaded datapackage.json")
+
+        # Upload data files
+        for local_path, remote_name in data_files:
+            data_blob = bucket.blob(f"{gcs_prefix}{remote_name}")
+            data_blob.upload_from_filename(str(local_path))
+            click.echo(f"✓ Uploaded {remote_name}")
+
+        click.echo(f"\nPackage pushed to: gs://{bucket_name}/{gcs_prefix}")
+
+    except ImportError:
+        click.echo("Error: google-cloud-storage is required for push", err=True)
+        click.echo("Install with: pip install google-cloud-storage", err=True)
+        sys.exit(1)
+    except Exception as e:
+        click.echo(f"Error uploading to GCS: {e}", err=True)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
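
For orientation: the `package push` command above resolves its destination through `expand_env_vars`. The standalone sketch below reproduces that `${VAR}` / `${VAR:-default}` behavior with the same `ENV_VAR_PATTERN` regex from the new cli.py; the environment values and URLs are illustrative only.

```python
import os
import re

# Same pattern as sunstone/cli.py: group 1 is the variable name,
# optional group 2 is the :-default fallback.
ENV_VAR_PATTERN = re.compile(r"\$\{([^}:]+)(?::-([^}]*))?\}")

def expand_env_vars(text: str) -> str:
    def replace_var(match: re.Match[str]) -> str:
        value = os.environ.get(match.group(1))
        if value is not None:
            return value
        if match.group(2) is not None:
            return match.group(2)  # fall back to the :-default
        return match.group(0)      # leave unresolved references untouched
    return ENV_VAR_PATTERN.sub(replace_var, text)

os.environ["BUCKET"] = "my-bucket"                         # illustrative value
print(expand_env_vars("gs://${BUCKET}/datasets/"))         # gs://my-bucket/datasets/
print(expand_env_vars("gs://${MISSING:-fallback}/data/"))  # gs://fallback/data/
print(expand_env_vars("gs://${UNSET}/data/"))              # gs://${UNSET}/data/
```
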
sunstone/dataframe.py CHANGED
@@ -10,7 +10,7 @@ import pandas as pd
 
 from .datasets import DatasetsManager
 from .exceptions import DatasetNotFoundError, StrictModeError
-from .lineage import FieldSchema, LineageMetadata
+from .lineage import FieldSchema, LineageMetadata, compute_dataframe_hash
 
 pd.options.mode.copy_on_write = True
 
@@ -196,7 +196,6 @@ class DataFrame:
         # Create lineage metadata
         lineage = LineageMetadata(project_path=str(manager.project_path))
         lineage.add_source(dataset)
-        lineage.add_operation(f"read_dataset({dataset.slug}, format={format})")
 
         # Return wrapped DataFrame
         return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
@@ -294,7 +293,6 @@ class DataFrame:
         # Create lineage metadata
         lineage = LineageMetadata(project_path=str(manager.project_path))
         lineage.add_source(dataset)
-        lineage.add_operation(f"read_csv({dataset.slug})")
 
         # Return wrapped DataFrame
         return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
@@ -363,11 +361,13 @@ class DataFrame:
         absolute_path.parent.mkdir(parents=True, exist_ok=True)
         self.data.to_csv(absolute_path, **kwargs)
 
-        # Record the operation
-        self.lineage.add_operation(f"to_csv({dataset.slug})")
+        # Compute content hash for change detection
+        content_hash = compute_dataframe_hash(self.data)
 
         # Persist lineage metadata to datasets.yaml
-        manager.update_output_lineage(slug=dataset.slug, lineage=self.lineage, strict=self.strict_mode)
+        manager.update_output_lineage(
+            slug=dataset.slug, lineage=self.lineage, content_hash=content_hash, strict=self.strict_mode
+        )
 
     def _infer_field_schema(self) -> List[FieldSchema]:
         """
@@ -410,11 +410,8 @@ class DataFrame:
         # Perform the merge
        merged_data = pd.merge(self.data, right.data, **kwargs)
 
-        # Combine lineage
+        # Combine lineage (sources from both DataFrames)
         merged_lineage = self.lineage.merge(right.lineage)
-        merged_lineage.add_operation(
-            f"merge(left={len(self.lineage.sources)} sources, right={len(right.lineage.sources)} sources)"
-        )
 
         return DataFrame(
             data=merged_data,
@@ -437,11 +434,8 @@ class DataFrame:
         # Perform the join
         joined_data = self.data.join(other.data, **kwargs)
 
-        # Combine lineage
+        # Combine lineage (sources from both DataFrames)
         joined_lineage = self.lineage.merge(other.lineage)
-        joined_lineage.add_operation(
-            f"join(left={len(self.lineage.sources)} sources, right={len(other.lineage.sources)} sources)"
-        )
 
         return DataFrame(
             data=joined_data,
@@ -467,16 +461,11 @@ class DataFrame:
         # Concatenate
         concatenated_data = pd.concat(all_dfs, **kwargs)
 
-        # Combine lineage from all DataFrames
+        # Combine lineage (sources from all DataFrames)
         combined_lineage = self.lineage
         for other in others:
            combined_lineage = combined_lineage.merge(other.lineage)
 
-        combined_lineage.add_operation(
-            f"concat({len(others) + 1} dataframes, "
-            f"{sum(len(df.lineage.sources) for df in [self] + others)} total sources)"
-        )
-
         return DataFrame(
             data=concatenated_data,
             lineage=combined_lineage,
@@ -484,42 +473,12 @@ class DataFrame:
             project_path=self.lineage.project_path,
         )
 
-    def apply_operation(self, operation: Callable[[pd.DataFrame], pd.DataFrame], description: str) -> "DataFrame":
-        """
-        Apply a transformation operation to the DataFrame.
-
-        Args:
-            operation: Function that takes a pandas DataFrame and returns a DataFrame.
-            description: Human-readable description of the operation.
-
-        Returns:
-            A new DataFrame with the operation applied and recorded in lineage.
-        """
-        # Apply the operation
-        new_data = operation(self.data)
-
-        # Copy lineage and add operation
-        new_lineage = LineageMetadata(
-            sources=self.lineage.sources.copy(),
-            operations=self.lineage.operations.copy(),
-            project_path=self.lineage.project_path,
-        )
-        new_lineage.add_operation(description)
-
-        return DataFrame(
-            data=new_data,
-            lineage=new_lineage,
-            strict=self.strict_mode,
-            project_path=self.lineage.project_path,
-        )
-
-    def _wrap_result(self, result: Any, operation: Optional[str] = None) -> Any:
+    def _wrap_result(self, result: Any) -> Any:
         """
         Wrap a pandas result in a Sunstone DataFrame if applicable.
 
         Args:
             result: The result from a pandas operation.
-            operation: Name of the operation performed. If None, no operation is recorded.
 
         Returns:
             Wrapped DataFrame if result is a DataFrame, otherwise the result.
@@ -527,11 +486,8 @@ class DataFrame:
         if isinstance(result, pd.DataFrame):
             new_lineage = LineageMetadata(
                 sources=self.lineage.sources.copy(),
-                operations=self.lineage.operations.copy(),
                 project_path=self.lineage.project_path,
             )
-            if operation is not None:
-                new_lineage.add_operation(operation)
 
             return DataFrame(
                 data=result,
@@ -541,28 +497,6 @@ class DataFrame:
             )
         return result
 
-    # Methods that don't represent meaningful data transformations
-    # These return DataFrames but shouldn't be tracked in lineage
-    _NON_TRACKING_METHODS = frozenset(
-        {
-            # Copy operations - same data, no transformation
-            "copy",
-            # Index operations - same data, different index
-            "reset_index",
-            "set_index",
-            "reindex",
-            # Type conversions without data change
-            "astype",
-            "infer_objects",
-            # Column/index renaming - same data, different labels
-            "rename",
-            "rename_axis",
-            # Reshaping without data loss
-            "T",
-            "transpose",
-        }
-    )
-
     def __getattr__(self, name: str) -> Any:
         """
         Delegate attribute access to the underlying pandas DataFrame.
@@ -583,14 +517,11 @@ class DataFrame:
 
             def wrapper(*args: Any, **kwargs: Any) -> Any:
                 result = attr(*args, **kwargs)
-                # Don't track non-transforming methods
-                if name in DataFrame._NON_TRACKING_METHODS:
-                    return self._wrap_result(result, operation=None)
-                return self._wrap_result(result, operation=f"{name}")
+                return self._wrap_result(result)
 
             return wrapper
 
-        return self._wrap_result(attr, operation=None)  # Don't track attribute access
+        return self._wrap_result(attr)
 
     def __getitem__(self, key: Any) -> Any:
         """
@@ -603,9 +534,7 @@ class DataFrame:
             The item from the underlying DataFrame, wrapped if it's a DataFrame.
         """
         result = self.data[key]
-        # Don't track __getitem__ as an operation - it's just column/row access
-        # not a meaningful transformation
-        return self._wrap_result(result)
+        return self._wrap_result(result)
 
     def __setitem__(self, key: Any, value: Any) -> None:
         """
@@ -616,14 +545,12 @@ class DataFrame:
             value: Value to assign.
         """
         self.data[key] = value
-        # Track column assignment in lineage
-        self.lineage.add_operation(f"__setitem__({key!r})")
+        # Don't track column assignments automatically
+        # Users should use add_operation() for meaningful transformations
 
     def __repr__(self) -> str:
         """String representation of the DataFrame."""
-        lineage_info = (
-            f"\n\nLineage: {len(self.lineage.sources)} source(s), {len(self.lineage.operations)} operation(s)"
-        )
+        lineage_info = f"\n\nLineage: {len(self.lineage.sources)} source(s)"
        return repr(self.data) + lineage_info
 
     def __str__(self) -> str:
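
A hedged usage sketch of what this diff leaves in place: lineage now records sources only, and combining wrapped DataFrames unions those source lists. It assumes sunstone-py 0.6.0 is installed; the dataset slugs and the merge key are hypothetical, and the exact `read_dataset` signature is an assumption based on the code above.

```python
from sunstone.dataframe import DataFrame

# "orders" and "customers" are hypothetical input slugs from a project's
# datasets.yaml; read_dataset is the classmethod shown in this diff.
orders = DataFrame.read_dataset("orders")
customers = DataFrame.read_dataset("customers")

# merge() delegates to pd.merge and unions the two source lists;
# as of 0.6.0 no per-operation log is kept.
combined = orders.merge(customers, on="customer_id")
print(combined)  # repr now ends with "Lineage: 2 source(s)"
```
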
sunstone/datasets.py CHANGED
@@ -15,7 +15,7 @@ import requests
 from ruamel.yaml import YAML
 
 from .exceptions import DatasetNotFoundError, DatasetValidationError
-from .lineage import DatasetMetadata, FieldSchema, LineageMetadata, Source, SourceLocation
+from .lineage import DatasetMetadata, FieldSchema, LineageMetadata, PublishConfig, Source, SourceLocation
 
 logger = logging.getLogger(__name__)
 
@@ -156,6 +156,26 @@
             for field in fields_data
         ]
 
+    def _parse_publish(self, publish_data: Any) -> Optional[PublishConfig]:
+        """
+        Parse publish configuration from YAML.
+
+        Supports both legacy boolean format and new object format:
+        - publish: true -> PublishConfig(enabled=True)
+        - publish: false -> None
+        - publish: { enabled: true, to: "..." } -> PublishConfig(enabled=True, to="...")
+        """
+        if publish_data is None:
+            return None
+        if isinstance(publish_data, bool):
+            return PublishConfig(enabled=publish_data) if publish_data else None
+        if isinstance(publish_data, dict):
+            enabled = publish_data.get("enabled", False)
+            if not enabled:
+                return None
+            return PublishConfig(enabled=True, to=publish_data.get("to"))
+        return None
+
     def _parse_dataset(self, dataset_data: Dict[str, Any], dataset_type: str) -> DatasetMetadata:
         """
         Parse dataset metadata from YAML data.
@@ -177,7 +197,8 @@
             location=dataset_data["location"],
             fields=self._parse_fields(dataset_data["fields"]),
             source=source,
-            publish=dataset_data.get("publish", False),
+            publish=self._parse_publish(dataset_data.get("publish")),
+            strict=dataset_data.get("strict", False),
             dataset_type=dataset_type,
         )
 
@@ -380,22 +401,57 @@
 
         raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
 
-    def update_output_lineage(self, slug: str, lineage: LineageMetadata, strict: bool = False) -> None:
+    def set_dataset_strict(self, slug: str, strict: bool, dataset_type: Optional[str] = None) -> None:
+        """
+        Set or remove strict mode for a dataset.
+
+        Args:
+            slug: The slug of the dataset to update.
+            strict: If True, enable strict mode. If False, disable it.
+            dataset_type: Optional filter by 'input' or 'output'. If None, searches both.
+
+        Raises:
+            DatasetNotFoundError: If the dataset doesn't exist.
+        """
+        search_types = ["input", "output"] if dataset_type is None else [dataset_type]
+
+        for dtype in search_types:
+            key = "inputs" if dtype == "input" else "outputs"
+            for dataset_data in self._data.get(key, []):
+                if dataset_data["slug"] == slug:
+                    if strict:
+                        dataset_data["strict"] = True
+                    elif "strict" in dataset_data:
+                        del dataset_data["strict"]
+                    self._save()
+                    return
+
+        raise DatasetNotFoundError(f"Dataset with slug '{slug}' not found")
+
+    def update_output_lineage(
+        self, slug: str, lineage: LineageMetadata, content_hash: str, strict: bool = False
+    ) -> None:
         """
         Update lineage metadata for an output dataset.
 
+        The timestamp is only updated when the content hash changes, preventing
+        unnecessary updates when the data hasn't changed.
+
         In strict mode, validates that the lineage matches what would be written
         without modifying the file. In relaxed mode, updates the file with lineage.
 
         Args:
             slug: The slug of the output dataset to update.
             lineage: The lineage metadata to persist.
+            content_hash: SHA256 hash of the DataFrame content.
             strict: If True, validate without modifying. If False, update the file.
 
         Raises:
             DatasetNotFoundError: If the dataset doesn't exist.
             DatasetValidationError: In strict mode, if lineage differs from what's in the file.
         """
+        from datetime import datetime
+
         # Find the output dataset
         dataset_idx = None
         for i, dataset_data in enumerate(self._data["outputs"]):
@@ -406,23 +462,28 @@
         if dataset_idx is None:
             raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
 
-        # Build lineage metadata to add
-        lineage_data: dict[str, Any] = {}
+        # Get existing lineage data if present
+        existing_lineage = self._data["outputs"][dataset_idx].get("lineage", {})
+        existing_hash = existing_lineage.get("content_hash")
+        existing_timestamp = existing_lineage.get("created_at")
 
-        if lineage.sources:
-            lineage_data["sources"] = [
-                {
-                    "slug": src.slug,
-                    "name": src.name,
-                }
-                for src in lineage.sources
-            ]
+        # Determine if content has changed
+        content_changed = existing_hash != content_hash
 
-        if lineage.operations:
-            lineage_data["operations"] = lineage.operations.copy()
+        # Only update timestamp if content changed
+        if content_changed:
+            timestamp = datetime.now().isoformat()
+        else:
+            # Preserve existing timestamp
+            timestamp = existing_timestamp
 
-        if lineage.created_at:
-            lineage_data["created_at"] = lineage.created_at.isoformat()
+        # Build lineage metadata to add (order: content_hash, created_at, sources)
+        lineage_data: dict[str, Any] = {}
+        lineage_data["content_hash"] = content_hash
+        if timestamp:
+            lineage_data["created_at"] = timestamp
+        if lineage.sources:
+            lineage_data["sources"] = [{"slug": src.slug} for src in lineage.sources]
 
         # Create a copy of the data with updated lineage
         updated_data = self._data.copy()
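
To make the new publish semantics concrete, here is a self-contained sketch mirroring `_parse_publish` above (recast as a free function for the example): the legacy boolean form and the new object form both map onto `PublishConfig`, and anything not explicitly enabled parses to `None`.

```python
from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class PublishConfig:
    enabled: bool = False
    to: Optional[str] = None  # destination URL; supports ${VAR:-default}

def parse_publish(publish_data: Any) -> Optional[PublishConfig]:
    # Same branching as DatasetsManager._parse_publish in this diff.
    if publish_data is None:
        return None
    if isinstance(publish_data, bool):
        return PublishConfig(enabled=publish_data) if publish_data else None
    if isinstance(publish_data, dict):
        if not publish_data.get("enabled", False):
            return None
        return PublishConfig(enabled=True, to=publish_data.get("to"))
    return None

print(parse_publish(True))    # PublishConfig(enabled=True, to=None)  (legacy form)
print(parse_publish(False))   # None
print(parse_publish({"enabled": True, "to": "gs://${BUCKET}/x"}))
# PublishConfig(enabled=True, to='gs://${BUCKET}/x')
```
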
sunstone/lineage.py CHANGED
@@ -2,9 +2,13 @@
 Lineage metadata structures for tracking data provenance.
 """
 
+import hashlib
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+if TYPE_CHECKING:
+    import pandas as pd
 
 
 @dataclass
@@ -62,6 +66,17 @@
     """Optional constraints (e.g., enum values)."""
 
 
+@dataclass
+class PublishConfig:
+    """Configuration for publishing a dataset."""
+
+    enabled: bool = False
+    """Whether publishing is enabled."""
+
+    to: Optional[str] = None
+    """Optional destination URL (supports ${VAR:-default} substitution)."""
+
+
 @dataclass
 class DatasetMetadata:
     """Metadata for a dataset from datasets.yaml."""
@@ -81,30 +96,56 @@
     source: Optional[Source] = None
     """Source attribution (for input datasets)."""
 
-    publish: bool = False
-    """Whether this dataset should be published (for output datasets)."""
+    publish: Optional[PublishConfig] = None
+    """Publishing configuration (for output datasets)."""
+
+    strict: bool = False
+    """Whether strict mode is enabled (lineage cannot be modified)."""
 
     dataset_type: str = "input"
     """Type of dataset: 'input' or 'output'."""
 
+    @property
+    def is_publishable(self) -> bool:
+        """Check if this dataset is configured for publishing."""
+        return self.publish is not None and self.publish.enabled
+
+
+def compute_dataframe_hash(df: "pd.DataFrame") -> str:
+    """
+    Compute a fast SHA256 hash of a pandas DataFrame's content.
+
+    Uses pickle serialization for a consistent, fast representation of the data.
+
+    Args:
+        df: The pandas DataFrame to hash.
+
+    Returns:
+        A SHA256 hex digest string representing the DataFrame content.
+    """
+    import pickle
+
+    # Use pickle protocol 5 for efficiency; hash the bytes directly
+    data_bytes = pickle.dumps(df, protocol=5)
+    return hashlib.sha256(data_bytes).hexdigest()
+
 
 @dataclass
 class LineageMetadata:
     """
     Lineage metadata tracking the provenance of data in a DataFrame.
 
-    This tracks all source datasets that contributed to the current DataFrame,
-    including information about transformations and operations performed.
+    This tracks all source datasets that contributed to the current DataFrame.
     """
 
     sources: List[DatasetMetadata] = field(default_factory=list)
     """List of source datasets that contributed to this data."""
 
-    operations: List[str] = field(default_factory=list)
-    """List of operations performed on the data."""
+    created_at: Optional[datetime] = None
+    """Timestamp when this lineage was last updated (content changed)."""
 
-    created_at: datetime = field(default_factory=datetime.now)
-    """Timestamp when this lineage was created."""
+    content_hash: Optional[str] = None
+    """SHA256 hash of the DataFrame content, used to detect changes."""
 
     project_path: Optional[str] = None
     """Path to the project directory containing datasets.yaml."""
@@ -119,15 +160,6 @@ class LineageMetadata:
         if dataset not in self.sources:
             self.sources.append(dataset)
 
-    def add_operation(self, operation: str) -> None:
-        """
-        Record an operation performed on the data.
-
-        Args:
-            operation: Description of the operation.
-        """
-        self.operations.append(operation)
-
     def merge(self, other: "LineageMetadata") -> "LineageMetadata":
         """
         Merge lineage from another DataFrame.
@@ -136,12 +168,10 @@
             other: The other lineage metadata to merge.
 
         Returns:
-            A new LineageMetadata with combined sources and operations.
+            A new LineageMetadata with combined sources.
         """
         merged = LineageMetadata(
             sources=self.sources.copy(),
-            operations=self.operations.copy(),
-            created_at=datetime.now(),
             project_path=self.project_path or other.project_path,
         )
 
@@ -150,9 +180,6 @@
         if source not in merged.sources:
             merged.sources.append(source)
 
-        # Combine operations
-        merged.operations.extend(other.operations)
-
         return merged
 
     def get_licenses(self) -> List[str]:
@@ -175,16 +202,18 @@
         Returns:
             Dictionary containing lineage information.
         """
-        return {
+        result: Dict[str, Any] = {
             "sources": [
                 {
-                    "name": src.name,
                     "slug": src.slug,
+                    "name": src.name,
                     "location": src.location,
                 }
                 for src in self.sources
             ],
-            "operations": self.operations,
-            "created_at": self.created_at.isoformat(),
-            "licenses": self.get_licenses(),
         }
+        if self.created_at is not None:
+            result["created_at"] = self.created_at.isoformat()
+        if self.content_hash is not None:
+            result["content_hash"] = self.content_hash
+        return result
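
A short, runnable illustration of what `compute_dataframe_hash` is for: `update_output_lineage` refreshes `created_at` only when this hash changes. The function body is copied from the diff above; the sample frame is illustrative.

```python
import hashlib
import pickle

import pandas as pd

def compute_dataframe_hash(df: pd.DataFrame) -> str:
    # Same implementation as sunstone/lineage.py above.
    return hashlib.sha256(pickle.dumps(df, protocol=5)).hexdigest()

df = pd.DataFrame({"a": [1, 2, 3]})      # illustrative data
h1 = compute_dataframe_hash(df)
assert compute_dataframe_hash(df) == h1  # unchanged data -> unchanged hash, timestamp preserved
df.loc[0, "a"] = 99
assert compute_dataframe_hash(df) != h1  # content change -> new hash, new created_at
```
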
{sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sunstone-py
-Version: 0.5.2
+Version: 0.6.0
 Summary: Python library for managing datasets with lineage tracking in Sunstone projects
 Author-email: Sunstone Institute <stig@sunstone.institute>
 License: MIT
@@ -17,8 +17,10 @@ Classifier: Programming Language :: Python :: 3.14
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: click>=8.0
 Requires-Dist: frictionless>=5.18.1
 Requires-Dist: google-auth>=2.43.0
+Requires-Dist: google-cloud-storage>=2.0.0
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: requests>=2.31.0
@@ -29,7 +31,7 @@ Dynamic: license-file
 
 A Python library for managing datasets with lineage tracking in data science projects.
 
-[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
+[![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 
 ## Features
sunstone_py-0.6.0.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+sunstone/__init__.py,sha256=LC0ZtmxP26eXPLKejbg7UStcHOnE_lwttNTL4m3F4yM,2032
+sunstone/_release.py,sha256=MQNaUD7mSK6h8vu6EIgJuaMlAxuFxv82NQwHgBpLZm4,14907
+sunstone/cli.py,sha256=YNwMXWCezQCJikJEC1iprf4rl5hsTr0V8toETVoRVCk,17905
+sunstone/dataframe.py,sha256=rFGuMq-63Haua_QQfR3E708KYc1g43yEyCej11_Gl3A,20679
+sunstone/datasets.py,sha256=9mJJ02UFcjFtbbx01rFLUMAacUPaJdothfqnTsc66kw,23851
+sunstone/exceptions.py,sha256=fiixXazur3LtQGy21bGEaSr356DObFcYxQJ3FvOxNec,623
+sunstone/lineage.py,sha256=iZiVBY-l-iEeVVlEORkow29fMM5UGtah8FU5ZVLetAI,6001
+sunstone/pandas.py,sha256=CLEqIIgTbMmpH73TPy_vDUPxQa37Hpmqn4r6No8PJwo,8188
+sunstone/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sunstone/validation.py,sha256=1356vcUc72a1zGBUe9Xjrcb5h41Xo53PaK2nnQ_FuSM,8286
+sunstone_py-0.6.0.dist-info/licenses/LICENSE,sha256=pB6VuR4QRjwjMjy8RSNGho-N1SUdu07ntIhT5lrhkzU,1078
+sunstone_py-0.6.0.dist-info/METADATA,sha256=3eqIzvMuCIMbuzLaAMcVMV_KsUxcvJNlh5drnUfV7hk,9529
+sunstone_py-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+sunstone_py-0.6.0.dist-info/entry_points.txt,sha256=DT-mp-lPl6UEcHBNs2o3HJ8dLp4iqMnzvHJhiLfCd0g,80
+sunstone_py-0.6.0.dist-info/top_level.txt,sha256=A2fW-7JO10rlx_L28Bc4FVvWt2R8kgvS8_TGPBhQp3c,9
+sunstone_py-0.6.0.dist-info/RECORD,,
{sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
{sunstone_py-0.5.2.dist-info → sunstone_py-0.6.0.dist-info}/entry_points.txt CHANGED
@@ -1,2 +1,3 @@
 [console_scripts]
 release = sunstone._release:main
+sunstone = sunstone.cli:main
sunstone_py-0.5.2.dist-info/RECORD REMOVED
@@ -1,15 +0,0 @@
-sunstone/__init__.py,sha256=LC0ZtmxP26eXPLKejbg7UStcHOnE_lwttNTL4m3F4yM,2032
-sunstone/_release.py,sha256=MQNaUD7mSK6h8vu6EIgJuaMlAxuFxv82NQwHgBpLZm4,14907
-sunstone/dataframe.py,sha256=UJgQx7auiNb6hSIvhB8EQs2afu-7S22xdWL5DZUr29g,23602
-sunstone/datasets.py,sha256=LdHk3Vkfc7QH2VxhSskRCm9wUFSkldCmgS_1c2KDAPA,21142
-sunstone/exceptions.py,sha256=fiixXazur3LtQGy21bGEaSr356DObFcYxQJ3FvOxNec,623
-sunstone/lineage.py,sha256=B9GKMu5-v8Izos5G40K_EvsCPJL3Z2Tg1T_Fc7ezSMI,5240
-sunstone/pandas.py,sha256=CLEqIIgTbMmpH73TPy_vDUPxQa37Hpmqn4r6No8PJwo,8188
-sunstone/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sunstone/validation.py,sha256=1356vcUc72a1zGBUe9Xjrcb5h41Xo53PaK2nnQ_FuSM,8286
-sunstone_py-0.5.2.dist-info/licenses/LICENSE,sha256=pB6VuR4QRjwjMjy8RSNGho-N1SUdu07ntIhT5lrhkzU,1078
-sunstone_py-0.5.2.dist-info/METADATA,sha256=uR8iPIENJBiPVFhtr5EXT3V6VAmLiju0CfFjm6oQubI,9460
-sunstone_py-0.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-sunstone_py-0.5.2.dist-info/entry_points.txt,sha256=0h6E88rH9a_503BAzXvFPR-UfmkrRFjcOf29DXgJNjk,51
-sunstone_py-0.5.2.dist-info/top_level.txt,sha256=A2fW-7JO10rlx_L28Bc4FVvWt2R8kgvS8_TGPBhQp3c,9
-sunstone_py-0.5.2.dist-info/RECORD,,