sunstone-py 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sunstone/pandas.py ADDED
@@ -0,0 +1,246 @@
1
+ """
2
+ Pandas-compatible API for Sunstone DataFrames.
3
+
4
+ This module provides a pandas-like interface that data scientists can use
5
+ with minimal friction, while still maintaining full lineage tracking.
6
+
7
+ Example:
8
+ >>> from sunstone import pandas as pd
9
+ >>>
10
+ >>> # Read data (must be in datasets.yaml)
11
+ >>> df = pd.read_csv('input_data.csv', project_path='/path/to/project')
12
+ >>>
13
+ >>> # Use familiar pandas operations
14
+ >>> filtered = df[df['amount'] > 100]
15
+ >>> grouped = df.groupby('category').sum()
16
+ >>>
17
+ >>> # Merge datasets
18
+ >>> result = pd.merge(df1, df2, on='id')
19
+ >>>
20
+ >>> # Save with lineage
21
+ >>> result.to_csv('output.csv', slug='output-data', name='Output Data')
22
+ """
23
+
24
+ from pathlib import Path
25
+ from typing import Any, List, Optional, Union
26
+
27
+ import pandas as _pd
28
+
29
+ from .dataframe import DataFrame
30
+
31
+ # Re-export commonly used pandas types and functions
32
+ # This allows scripts to use `from sunstone import pandas as pd` and still
33
+ # access standard pandas utilities like pd.Timestamp, pd.NaT, etc.
34
+ #
35
+ # NOTE: DataFrame is our wrapped version from .dataframe
36
+ # For vanilla pandas DataFrame, use _pd.DataFrame directly if needed
37
+ Timestamp = _pd.Timestamp
38
+ NaT = _pd.NaT
39
+ isna = _pd.isna
40
+ isnull = _pd.isnull
41
+ notna = _pd.notna
42
+ notnull = _pd.notnull
43
+ to_datetime = _pd.to_datetime
44
+ to_numeric = _pd.to_numeric
45
+ to_timedelta = _pd.to_timedelta
46
+ Series = _pd.Series # Re-export pandas Series
47
+
48
+ __all__ = [
49
+ "read_csv",
50
+ "read_dataset",
51
+ "merge",
52
+ "concat",
53
+ # Pandas types and utilities
54
+ "DataFrame",
55
+ "Series",
56
+ "Timestamp",
57
+ "NaT",
58
+ "isna",
59
+ "isnull",
60
+ "notna",
61
+ "notnull",
62
+ "to_datetime",
63
+ "to_numeric",
64
+ "to_timedelta",
65
+ ]
66
+
67
+
68
def read_dataset(
    slug: str,
    project_path: Union[str, Path],
    strict: Optional[bool] = None,
    fetch_from_url: bool = True,
    format: Optional[str] = None,
    **kwargs: Any,
) -> DataFrame:
    """
    Load a registered dataset by slug, auto-detecting the file format.

    Thin wrapper around ``DataFrame.read_dataset`` that keeps a familiar
    pandas-style call site while enforcing registration in ``datasets.yaml``
    and full lineage tracking.  The format is inferred from the file
    extension unless ``format`` is given explicitly.

    Supported formats: CSV (.csv), JSON (.json), Excel (.xlsx, .xls),
    Parquet (.parquet), TSV (.tsv, .txt with tab delimiter).

    Args:
        slug: Dataset slug to look up in ``datasets.yaml``.
        project_path: Project directory containing ``datasets.yaml``.
            Must be passed explicitly; there is no auto-detection.
        strict: Strict-mode flag.  ``None`` defers to the
            ``SUNSTONE_DATAFRAME_STRICT`` environment variable.
        fetch_from_url: When True and the dataset has a source URL but no
            local file, fetch it from the URL automatically.
        format: Optional override ('csv', 'json', 'excel', 'parquet', 'tsv');
            auto-detected from the extension when omitted.
        **kwargs: Forwarded to the underlying pandas reader function.

    Returns:
        A Sunstone DataFrame carrying lineage metadata.

    Raises:
        DatasetNotFoundError: If the slug is not present in ``datasets.yaml``.
        FileNotFoundError: If ``datasets.yaml`` itself does not exist.
        ValueError: If the format cannot be detected or is unsupported.

    Examples:
        >>> from sunstone import pandas as pd
        >>> df = pd.read_dataset('official-un-member-states', project_path='/path/to/project')
        >>> df = pd.read_dataset('my-data', format='json', project_path='/path/to/project')
        >>> df = pd.read_dataset('data-file', project_path='/path/to/project',
        ...                      encoding='utf-8', skiprows=1)
    """
    # Collect the named options first, then splice in caller extras; any
    # duplicate keyword raises TypeError exactly as direct delegation would.
    reader_options = {
        "slug": slug,
        "project_path": project_path,
        "strict": strict,
        "fetch_from_url": fetch_from_url,
        "format": format,
    }
    return DataFrame.read_dataset(**reader_options, **kwargs)
131
+
132
+
133
def read_csv(
    filepath_or_buffer: Union[str, Path],
    project_path: Union[str, Path],
    strict: Optional[bool] = None,
    fetch_from_url: bool = True,
    **kwargs: Any,
) -> DataFrame:
    """
    Read a CSV file into a Sunstone DataFrame with lineage tracking.

    Provides the familiar ``pandas.read_csv`` call shape while requiring the
    dataset to be registered in ``datasets.yaml`` so lineage can be tracked.
    Delegates directly to ``DataFrame.read_csv``.

    Args:
        filepath_or_buffer: Path to a CSV file, a URL, or a dataset slug.
            A slug (e.g. 'official-un-member-states') is resolved via
            ``datasets.yaml``.
        project_path: Project directory containing ``datasets.yaml``.
            Must be passed explicitly; there is no auto-detection.
        strict: Strict-mode flag.  ``None`` defers to the
            ``SUNSTONE_DATAFRAME_STRICT`` environment variable.
        fetch_from_url: When True and the dataset has a source URL but no
            local file, fetch it from the URL automatically.
        **kwargs: Forwarded to ``pandas.read_csv``.

    Returns:
        A Sunstone DataFrame carrying lineage metadata.

    Raises:
        DatasetNotFoundError: If the dataset is not found in ``datasets.yaml``.
        FileNotFoundError: If ``datasets.yaml`` does not exist.

    Examples:
        >>> from sunstone import pandas as pd
        >>> df = pd.read_csv('official-un-member-states', project_path='/path/to/project')
        >>> df = pd.read_csv('schools.csv', project_path='/path/to/project')
        >>> df = pd.read_csv('schools.csv', project_path='/path/to/project',
        ...                  encoding='utf-8', skiprows=1)
    """
    csv_options = {
        "filepath_or_buffer": filepath_or_buffer,
        "project_path": project_path,
        "strict": strict,
        "fetch_from_url": fetch_from_url,
    }
    return DataFrame.read_csv(**csv_options, **kwargs)
185
+
186
+
187
def merge(
    left: DataFrame,
    right: DataFrame,
    **kwargs: Any,
) -> DataFrame:
    """
    Merge two Sunstone DataFrames, preserving and combining their lineage.

    Mirrors the ``pandas.merge`` interface but delegates to the left
    operand's ``merge`` method so that lineage from both inputs is carried
    into the result.

    Args:
        left: Left DataFrame of the merge.
        right: Right DataFrame of the merge.
        **kwargs: Standard pandas merge options (``on``, ``how``, ``left_on``,
            ``right_on``, ``left_index``, ``right_index``, ...).

    Returns:
        A new DataFrame holding the merged rows and combined lineage.

    Example:
        >>> from sunstone import pandas as pd
        >>> df1 = pd.read_csv('countries.csv', project_path='/path/to/project')
        >>> df2 = pd.read_csv('populations.csv', project_path='/path/to/project')
        >>> merged = pd.merge(df1, df2, on='country_code', how='inner')
    """
    combined = left.merge(right, **kwargs)
    return combined
214
+
215
+
216
def concat(
    objs: List[DataFrame],
    **kwargs: Any,
) -> DataFrame:
    """
    Concatenate Sunstone DataFrames along an axis, combining their lineage.

    Mirrors the ``pandas.concat`` interface but routes through the first
    DataFrame's ``concat`` method so lineage from every input is retained.

    Args:
        objs: DataFrames to concatenate; must be non-empty.
        **kwargs: Standard pandas concat options (``axis``, ``join``,
            ``ignore_index``, ``keys``, ...).

    Returns:
        A new DataFrame with the concatenated data and combined lineage.

    Raises:
        ValueError: If ``objs`` is empty.

    Example:
        >>> from sunstone import pandas as pd
        >>> df1 = pd.read_csv('data_2023.csv', project_path='/path/to/project')
        >>> df2 = pd.read_csv('data_2024.csv', project_path='/path/to/project')
        >>> combined = pd.concat([df1, df2], ignore_index=True)
    """
    if not objs:
        raise ValueError("No objects to concatenate")

    # The first DataFrame drives the concatenation; the remainder are
    # appended to it with lineage merged.
    head, *tail = objs
    return head.concat(tail, **kwargs)
sunstone/py.typed ADDED
File without changes
sunstone/validation.py ADDED
@@ -0,0 +1,253 @@
1
+ """
2
+ Validation utilities for Sunstone projects.
3
+
4
+ This module provides tools to validate that notebooks and scripts are
5
+ correctly using Sunstone's lineage tracking features.
6
+ """
7
+
8
+ import json
9
+ import re
10
+ from pathlib import Path
11
+ from typing import Dict, List, Union
12
+
13
+
14
class ImportCheckResult:
    """Outcome of scanning a notebook or script for import hygiene.

    Tracks which flavors of pandas/sunstone imports were seen, where plain
    pandas imports occurred, and any warnings or errors raised during the
    scan itself.
    """

    def __init__(self) -> None:
        # Flags set by the scanner as matching import lines are found.
        self.has_plain_pandas = False
        self.has_sunstone_pandas = False
        self.has_sunstone = False
        # "location:line" strings for every plain pandas import found.
        self.plain_pandas_locations: List[str] = []
        self.warnings: List[str] = []
        self.errors: List[str] = []

    @property
    def is_valid(self) -> bool:
        """True when no plain pandas import was seen and sunstone is used."""
        uses_sunstone = self.has_sunstone or self.has_sunstone_pandas
        return uses_sunstone and not self.has_plain_pandas

    def add_warning(self, message: str) -> None:
        """Record a warning message."""
        self.warnings.append(message)

    def add_error(self, message: str) -> None:
        """Record an error message."""
        self.errors.append(message)

    def summary(self) -> str:
        """Render a human-readable report of the check."""
        report: List[str] = []

        if self.is_valid:
            report.append("✓ Import check passed")
            if self.has_sunstone_pandas:
                report.append(" Using: from sunstone import pandas as pd")
            elif self.has_sunstone:
                report.append(" Using: import sunstone")
        else:
            report.append("✗ Import check failed")

            if self.has_plain_pandas:
                report.append("\n Problem: Found plain pandas imports")
                report.extend(f" - {loc}" for loc in self.plain_pandas_locations)
                report.extend(
                    [
                        "\n Solution: Use one of these instead:",
                        " from sunstone import pandas as pd",
                        " # or",
                        " import sunstone.pandas as pd",
                    ]
                )

            if not (self.has_sunstone or self.has_sunstone_pandas):
                report.extend(
                    [
                        "\n Problem: No sunstone imports found",
                        "\n Solution: Add sunstone import:",
                        " from sunstone import pandas as pd",
                    ]
                )

        if self.warnings:
            report.append("\nWarnings:")
            report.extend(f" - {w}" for w in self.warnings)

        if self.errors:
            report.append("\nErrors:")
            report.extend(f" - {e}" for e in self.errors)

        return "\n".join(report)
77
+
78
+
79
def check_notebook_imports(notebook_path: Union[str, Path]) -> ImportCheckResult:
    """
    Check a Jupyter notebook for correct Sunstone import usage.

    Scans every code cell and records whether the notebook:
    1. Imports plain pandas (``import pandas as pd``) — flagged as invalid.
    2. Imports Sunstone's pandas module (``from sunstone import pandas as pd``).
    3. Imports sunstone itself (``import sunstone``).

    Args:
        notebook_path: Path to the Jupyter notebook (.ipynb file).

    Returns:
        ImportCheckResult describing the imports found.

    Example:
        >>> from sunstone.validation import check_notebook_imports
        >>> result = check_notebook_imports('analysis.ipynb')
        >>> if not result.is_valid:
        ...     print(result.summary())
    """
    result = ImportCheckResult()
    nb_file = Path(notebook_path)

    if not nb_file.exists():
        result.add_error(f"Notebook not found: {nb_file}")
        return result

    try:
        notebook = json.loads(nb_file.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        result.add_error(f"Invalid JSON in notebook: {exc}")
        return result
    except Exception as exc:  # unreadable file, bad encoding, etc.
        result.add_error(f"Error reading notebook: {exc}")
        return result

    # Walk the code cells; markdown/raw cells carry no imports.
    for index, cell in enumerate(notebook.get("cells", [])):
        if cell.get("cell_type") != "code":
            continue

        # nbformat stores source as either a list of lines or one string.
        cell_source = cell.get("source", [])
        if isinstance(cell_source, list):
            cell_source = "".join(cell_source)

        _check_source_imports(cell_source, result, f"Cell {index + 1}")

    return result
132
+
133
+
134
def check_script_imports(script_path: Union[str, Path]) -> ImportCheckResult:
    """
    Check a Python script for correct Sunstone import usage.

    Args:
        script_path: Path to the Python script (.py file).

    Returns:
        ImportCheckResult describing the imports found.

    Example:
        >>> from sunstone.validation import check_script_imports
        >>> result = check_script_imports('analysis.py')
        >>> if not result.is_valid:
        ...     print(result.summary())
    """
    result = ImportCheckResult()
    py_file = Path(script_path)

    if not py_file.exists():
        result.add_error(f"Script not found: {py_file}")
        return result

    try:
        code = py_file.read_text(encoding="utf-8")
    except Exception as exc:  # unreadable file, bad encoding, etc.
        result.add_error(f"Error reading script: {exc}")
        return result

    _check_source_imports(code, result, py_file.name)
    return result
166
+
167
+
168
def _check_source_imports(source: str, result: ImportCheckResult, location: str) -> None:
    """
    Scan source code line-by-line for pandas/sunstone import statements.

    Mutates ``result`` in place: sets ``has_plain_pandas`` (and records
    "<location>:<lineno>") for plain pandas imports, ``has_sunstone_pandas``
    for sunstone's pandas wrapper, and ``has_sunstone`` for any sunstone
    import.

    Args:
        source: Source code to check.
        result: ImportCheckResult to update.
        location: Description of where this source came from.
    """
    # Fix: the previous patterns anchored with a bare "\s*$", so an import
    # followed by an inline comment (e.g. "import pandas as pd  # noqa")
    # was silently missed.  "(?:#.*)?" tolerates a trailing comment.
    # Patterns are compiled once, outside the per-line loop; re.MULTILINE
    # was dropped because each match target is a single line.
    plain_pandas_patterns = [
        re.compile(r"^\s*import\s+pandas\s+as\s+pd\s*(?:#.*)?$"),
        re.compile(r"^\s*import\s+pandas\s*(?:#.*)?$"),
        re.compile(r"^\s*from\s+pandas\s+import\s+"),
    ]

    sunstone_pandas_patterns = [
        re.compile(r"^\s*from\s+sunstone\s+import\s+pandas\s+as\s+pd\s*(?:#.*)?$"),
        re.compile(r"^\s*import\s+sunstone\.pandas\s+as\s+pd\s*(?:#.*)?$"),
        re.compile(r"^\s*from\s+sunstone\s+import\s+pandas\s*(?:#.*)?$"),
    ]

    sunstone_patterns = [
        re.compile(r"^\s*import\s+sunstone\s*(?:#.*)?$"),
        re.compile(r"^\s*import\s+sunstone\s+as\s+"),
        re.compile(r"^\s*from\s+sunstone\s+import\s+"),
    ]

    for line_num, line in enumerate(source.split("\n"), 1):
        # Skip comment-only lines.
        if line.strip().startswith("#"):
            continue

        # Plain pandas import (bad): flag it and remember where.
        if any(p.match(line) for p in plain_pandas_patterns):
            result.has_plain_pandas = True
            result.plain_pandas_locations.append(f"{location}:{line_num}")

        # Sunstone's pandas wrapper (good).
        if any(p.match(line) for p in sunstone_pandas_patterns):
            result.has_sunstone_pandas = True

        # Any sunstone import at all (good).
        if any(p.match(line) for p in sunstone_patterns):
            result.has_sunstone = True
219
+
220
+
221
def validate_project_notebooks(
    project_path: Union[str, Path], pattern: str = "**/*.ipynb"
) -> Dict[str, ImportCheckResult]:
    """
    Validate all notebooks under a project directory.

    Args:
        project_path: Path to the project directory.
        pattern: Glob pattern used to find notebooks (default: **/*.ipynb).

    Returns:
        Mapping from project-relative notebook path to its ImportCheckResult.

    Example:
        >>> from sunstone.validation import validate_project_notebooks
        >>> results = validate_project_notebooks('/path/to/project')
        >>> for path, result in results.items():
        ...     if not result.is_valid:
        ...         print(f"\\n{path}:")
        ...         print(result.summary())
    """
    root = Path(project_path)
    report: Dict[str, ImportCheckResult] = {}

    for nb_path in root.glob(pattern):
        # Jupyter autosave copies live in .ipynb_checkpoints — ignore them.
        if ".ipynb_checkpoints" in str(nb_path):
            continue
        report[str(nb_path.relative_to(root))] = check_notebook_imports(nb_path)

    return report