sunstone-py 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sunstone/pandas.py ADDED
@@ -0,0 +1,246 @@
1
+ """
2
+ Pandas-compatible API for Sunstone DataFrames.
3
+
4
+ This module provides a pandas-like interface that data scientists can use
5
+ with minimal friction, while still maintaining full lineage tracking.
6
+
7
+ Example:
8
+ >>> from sunstone import pandas as pd
9
+ >>>
10
+ >>> # Read data (must be in datasets.yaml)
11
+ >>> df = pd.read_csv('input_data.csv', project_path='/path/to/project')
12
+ >>>
13
+ >>> # Use familiar pandas operations
14
+ >>> filtered = df[df['amount'] > 100]
15
+ >>> grouped = df.groupby('category').sum()
16
+ >>>
17
+ >>> # Merge datasets
18
+ >>> result = pd.merge(df1, df2, on='id')
19
+ >>>
20
+ >>> # Save with lineage
21
+ >>> result.to_csv('output.csv', slug='output-data', name='Output Data')
22
+ """
23
+
24
+ from pathlib import Path
25
+ from typing import Any, List, Optional, Union
26
+
27
+ import pandas as _pd
28
+
29
+ from .dataframe import DataFrame
30
+
31
+ # Re-export commonly used pandas types and functions
32
+ # This allows scripts to use `from sunstone import pandas as pd` and still
33
+ # access standard pandas utilities like pd.Timestamp, pd.NaT, etc.
34
+ #
35
+ # NOTE: DataFrame is our wrapped version from .dataframe
36
+ # For vanilla pandas DataFrame, use _pd.DataFrame directly if needed
37
+ Timestamp = _pd.Timestamp
38
+ NaT = _pd.NaT
39
+ isna = _pd.isna
40
+ isnull = _pd.isnull
41
+ notna = _pd.notna
42
+ notnull = _pd.notnull
43
+ to_datetime = _pd.to_datetime
44
+ to_numeric = _pd.to_numeric
45
+ to_timedelta = _pd.to_timedelta
46
+ Series = _pd.Series # Re-export pandas Series
47
+
48
+ __all__ = [
49
+ "read_csv",
50
+ "read_dataset",
51
+ "merge",
52
+ "concat",
53
+ # Pandas types and utilities
54
+ "DataFrame",
55
+ "Series",
56
+ "Timestamp",
57
+ "NaT",
58
+ "isna",
59
+ "isnull",
60
+ "notna",
61
+ "notnull",
62
+ "to_datetime",
63
+ "to_numeric",
64
+ "to_timedelta",
65
+ ]
66
+
67
+
68
def read_dataset(
    slug: str,
    project_path: Union[str, Path],
    strict: Optional[bool] = None,
    fetch_from_url: bool = True,
    format: Optional[str] = None,
    **kwargs: Any,
) -> DataFrame:
    """
    Load a registered dataset by slug, auto-detecting the file format.

    Thin wrapper around ``DataFrame.read_dataset`` that keeps a familiar
    pandas-style call site while enforcing registration in ``datasets.yaml``
    and full lineage tracking.  The format is inferred from the file
    extension unless ``format`` is given explicitly.

    Supported formats: CSV (.csv), JSON (.json), Excel (.xlsx, .xls),
    Parquet (.parquet), TSV (.tsv, .txt with tab delimiter).

    Args:
        slug: Dataset slug to look up in ``datasets.yaml``.
        project_path: Project directory containing ``datasets.yaml``.
            Must be passed explicitly; there is no auto-detection.
        strict: Strict-mode flag.  ``None`` defers to the
            ``SUNSTONE_DATAFRAME_STRICT`` environment variable.
        fetch_from_url: When True and the dataset has a source URL but no
            local file, fetch it from the URL automatically.
        format: Optional override ('csv', 'json', 'excel', 'parquet', 'tsv');
            auto-detected from the extension when omitted.
        **kwargs: Forwarded to the underlying pandas reader function.

    Returns:
        A Sunstone DataFrame carrying lineage metadata.

    Raises:
        DatasetNotFoundError: If the slug is not present in ``datasets.yaml``.
        FileNotFoundError: If ``datasets.yaml`` itself does not exist.
        ValueError: If the format cannot be detected or is unsupported.

    Examples:
        >>> from sunstone import pandas as pd
        >>> df = pd.read_dataset('official-un-member-states', project_path='/path/to/project')
        >>> df = pd.read_dataset('my-data', format='json', project_path='/path/to/project')
        >>> df = pd.read_dataset('data-file', project_path='/path/to/project',
        ...                      encoding='utf-8', skiprows=1)
    """
    # Collect the named options first, then splice in caller extras; any
    # duplicate keyword raises TypeError exactly as direct delegation would.
    reader_options = {
        "slug": slug,
        "project_path": project_path,
        "strict": strict,
        "fetch_from_url": fetch_from_url,
        "format": format,
    }
    return DataFrame.read_dataset(**reader_options, **kwargs)
131
+
132
+
133
def read_csv(
    filepath_or_buffer: Union[str, Path],
    project_path: Union[str, Path],
    strict: Optional[bool] = None,
    fetch_from_url: bool = True,
    **kwargs: Any,
) -> DataFrame:
    """
    Read a CSV file into a Sunstone DataFrame with lineage tracking.

    Provides the familiar ``pandas.read_csv`` call shape while requiring the
    dataset to be registered in ``datasets.yaml`` so lineage can be tracked.
    Delegates directly to ``DataFrame.read_csv``.

    Args:
        filepath_or_buffer: Path to a CSV file, a URL, or a dataset slug.
            A slug (e.g. 'official-un-member-states') is resolved via
            ``datasets.yaml``.
        project_path: Project directory containing ``datasets.yaml``.
            Must be passed explicitly; there is no auto-detection.
        strict: Strict-mode flag.  ``None`` defers to the
            ``SUNSTONE_DATAFRAME_STRICT`` environment variable.
        fetch_from_url: When True and the dataset has a source URL but no
            local file, fetch it from the URL automatically.
        **kwargs: Forwarded to ``pandas.read_csv``.

    Returns:
        A Sunstone DataFrame carrying lineage metadata.

    Raises:
        DatasetNotFoundError: If the dataset is not found in ``datasets.yaml``.
        FileNotFoundError: If ``datasets.yaml`` does not exist.

    Examples:
        >>> from sunstone import pandas as pd
        >>> df = pd.read_csv('official-un-member-states', project_path='/path/to/project')
        >>> df = pd.read_csv('schools.csv', project_path='/path/to/project')
        >>> df = pd.read_csv('schools.csv', project_path='/path/to/project',
        ...                  encoding='utf-8', skiprows=1)
    """
    csv_options = {
        "filepath_or_buffer": filepath_or_buffer,
        "project_path": project_path,
        "strict": strict,
        "fetch_from_url": fetch_from_url,
    }
    return DataFrame.read_csv(**csv_options, **kwargs)
185
+
186
+
187
def merge(
    left: DataFrame,
    right: DataFrame,
    **kwargs: Any,
) -> DataFrame:
    """
    Merge two Sunstone DataFrames, preserving and combining their lineage.

    Mirrors the ``pandas.merge`` interface but delegates to the left
    operand's ``merge`` method so that lineage from both inputs is carried
    into the result.

    Args:
        left: Left DataFrame of the merge.
        right: Right DataFrame of the merge.
        **kwargs: Standard pandas merge options (``on``, ``how``, ``left_on``,
            ``right_on``, ``left_index``, ``right_index``, ...).

    Returns:
        A new DataFrame holding the merged rows and combined lineage.

    Example:
        >>> from sunstone import pandas as pd
        >>> df1 = pd.read_csv('countries.csv', project_path='/path/to/project')
        >>> df2 = pd.read_csv('populations.csv', project_path='/path/to/project')
        >>> merged = pd.merge(df1, df2, on='country_code', how='inner')
    """
    combined = left.merge(right, **kwargs)
    return combined
214
+
215
+
216
def concat(
    objs: List[DataFrame],
    **kwargs: Any,
) -> DataFrame:
    """
    Concatenate Sunstone DataFrames along an axis, combining their lineage.

    Mirrors the ``pandas.concat`` interface but routes through the first
    DataFrame's ``concat`` method so lineage from every input is retained.

    Args:
        objs: DataFrames to concatenate; must be non-empty.
        **kwargs: Standard pandas concat options (``axis``, ``join``,
            ``ignore_index``, ``keys``, ...).

    Returns:
        A new DataFrame with the concatenated data and combined lineage.

    Raises:
        ValueError: If ``objs`` is empty.

    Example:
        >>> from sunstone import pandas as pd
        >>> df1 = pd.read_csv('data_2023.csv', project_path='/path/to/project')
        >>> df2 = pd.read_csv('data_2024.csv', project_path='/path/to/project')
        >>> combined = pd.concat([df1, df2], ignore_index=True)
    """
    if not objs:
        raise ValueError("No objects to concatenate")

    # The first DataFrame drives the concatenation; the remainder are
    # appended to it with lineage merged.
    head, *tail = objs
    return head.concat(tail, **kwargs)
sunstone/py.typed ADDED
File without changes
sunstone/validation.py ADDED
@@ -0,0 +1,253 @@
1
+ """
2
+ Validation utilities for Sunstone projects.
3
+
4
+ This module provides tools to validate that notebooks and scripts are
5
+ correctly using Sunstone's lineage tracking features.
6
+ """
7
+
8
+ import json
9
+ import re
10
+ from pathlib import Path
11
+ from typing import Dict, List, Union
12
+
13
+
14
class ImportCheckResult:
    """Outcome of scanning a notebook or script for import hygiene.

    Tracks which flavors of pandas/sunstone imports were seen, where plain
    pandas imports occurred, and any warnings or errors raised during the
    scan itself.
    """

    def __init__(self) -> None:
        # Flags set by the scanner as matching import lines are found.
        self.has_plain_pandas = False
        self.has_sunstone_pandas = False
        self.has_sunstone = False
        # "location:line" strings for every plain pandas import found.
        self.plain_pandas_locations: List[str] = []
        self.warnings: List[str] = []
        self.errors: List[str] = []

    @property
    def is_valid(self) -> bool:
        """True when no plain pandas import was seen and sunstone is used."""
        uses_sunstone = self.has_sunstone or self.has_sunstone_pandas
        return uses_sunstone and not self.has_plain_pandas

    def add_warning(self, message: str) -> None:
        """Record a warning message."""
        self.warnings.append(message)

    def add_error(self, message: str) -> None:
        """Record an error message."""
        self.errors.append(message)

    def summary(self) -> str:
        """Render a human-readable report of the check."""
        report: List[str] = []

        if self.is_valid:
            report.append("✓ Import check passed")
            if self.has_sunstone_pandas:
                report.append(" Using: from sunstone import pandas as pd")
            elif self.has_sunstone:
                report.append(" Using: import sunstone")
        else:
            report.append("✗ Import check failed")

            if self.has_plain_pandas:
                report.append("\n Problem: Found plain pandas imports")
                report.extend(f" - {loc}" for loc in self.plain_pandas_locations)
                report.extend(
                    [
                        "\n Solution: Use one of these instead:",
                        " from sunstone import pandas as pd",
                        " # or",
                        " import sunstone.pandas as pd",
                    ]
                )

            if not (self.has_sunstone or self.has_sunstone_pandas):
                report.extend(
                    [
                        "\n Problem: No sunstone imports found",
                        "\n Solution: Add sunstone import:",
                        " from sunstone import pandas as pd",
                    ]
                )

        if self.warnings:
            report.append("\nWarnings:")
            report.extend(f" - {w}" for w in self.warnings)

        if self.errors:
            report.append("\nErrors:")
            report.extend(f" - {e}" for e in self.errors)

        return "\n".join(report)
77
+
78
+
79
def check_notebook_imports(notebook_path: Union[str, Path]) -> ImportCheckResult:
    """
    Check a Jupyter notebook for correct Sunstone import usage.

    Scans every code cell and records whether the notebook:
    1. Imports plain pandas (``import pandas as pd``) — flagged as invalid.
    2. Imports Sunstone's pandas module (``from sunstone import pandas as pd``).
    3. Imports sunstone itself (``import sunstone``).

    Args:
        notebook_path: Path to the Jupyter notebook (.ipynb file).

    Returns:
        ImportCheckResult describing the imports found.

    Example:
        >>> from sunstone.validation import check_notebook_imports
        >>> result = check_notebook_imports('analysis.ipynb')
        >>> if not result.is_valid:
        ...     print(result.summary())
    """
    result = ImportCheckResult()
    nb_file = Path(notebook_path)

    if not nb_file.exists():
        result.add_error(f"Notebook not found: {nb_file}")
        return result

    try:
        notebook = json.loads(nb_file.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        result.add_error(f"Invalid JSON in notebook: {exc}")
        return result
    except Exception as exc:  # unreadable file, bad encoding, etc.
        result.add_error(f"Error reading notebook: {exc}")
        return result

    # Walk the code cells; markdown/raw cells carry no imports.
    for index, cell in enumerate(notebook.get("cells", [])):
        if cell.get("cell_type") != "code":
            continue

        # nbformat stores source as either a list of lines or one string.
        cell_source = cell.get("source", [])
        if isinstance(cell_source, list):
            cell_source = "".join(cell_source)

        _check_source_imports(cell_source, result, f"Cell {index + 1}")

    return result
132
+
133
+
134
def check_script_imports(script_path: Union[str, Path]) -> ImportCheckResult:
    """
    Check a Python script for correct Sunstone import usage.

    Args:
        script_path: Path to the Python script (.py file).

    Returns:
        ImportCheckResult describing the imports found.

    Example:
        >>> from sunstone.validation import check_script_imports
        >>> result = check_script_imports('analysis.py')
        >>> if not result.is_valid:
        ...     print(result.summary())
    """
    result = ImportCheckResult()
    py_file = Path(script_path)

    if not py_file.exists():
        result.add_error(f"Script not found: {py_file}")
        return result

    try:
        code = py_file.read_text(encoding="utf-8")
    except Exception as exc:  # unreadable file, bad encoding, etc.
        result.add_error(f"Error reading script: {exc}")
        return result

    _check_source_imports(code, result, py_file.name)
    return result
166
+
167
+
168
def _check_source_imports(source: str, result: ImportCheckResult, location: str) -> None:
    """
    Scan source code line-by-line for pandas/sunstone import statements.

    Mutates ``result`` in place: sets ``has_plain_pandas`` (and records
    "<location>:<lineno>") for plain pandas imports, ``has_sunstone_pandas``
    for sunstone's pandas wrapper, and ``has_sunstone`` for any sunstone
    import.

    Args:
        source: Source code to check.
        result: ImportCheckResult to update.
        location: Description of where this source came from.
    """
    # Fix: the previous patterns anchored with a bare "\s*$", so an import
    # followed by an inline comment (e.g. "import pandas as pd  # noqa")
    # was silently missed.  "(?:#.*)?" tolerates a trailing comment.
    # Patterns are compiled once, outside the per-line loop; re.MULTILINE
    # was dropped because each match target is a single line.
    plain_pandas_patterns = [
        re.compile(r"^\s*import\s+pandas\s+as\s+pd\s*(?:#.*)?$"),
        re.compile(r"^\s*import\s+pandas\s*(?:#.*)?$"),
        re.compile(r"^\s*from\s+pandas\s+import\s+"),
    ]

    sunstone_pandas_patterns = [
        re.compile(r"^\s*from\s+sunstone\s+import\s+pandas\s+as\s+pd\s*(?:#.*)?$"),
        re.compile(r"^\s*import\s+sunstone\.pandas\s+as\s+pd\s*(?:#.*)?$"),
        re.compile(r"^\s*from\s+sunstone\s+import\s+pandas\s*(?:#.*)?$"),
    ]

    sunstone_patterns = [
        re.compile(r"^\s*import\s+sunstone\s*(?:#.*)?$"),
        re.compile(r"^\s*import\s+sunstone\s+as\s+"),
        re.compile(r"^\s*from\s+sunstone\s+import\s+"),
    ]

    for line_num, line in enumerate(source.split("\n"), 1):
        # Skip comment-only lines.
        if line.strip().startswith("#"):
            continue

        # Plain pandas import (bad): flag it and remember where.
        if any(p.match(line) for p in plain_pandas_patterns):
            result.has_plain_pandas = True
            result.plain_pandas_locations.append(f"{location}:{line_num}")

        # Sunstone's pandas wrapper (good).
        if any(p.match(line) for p in sunstone_pandas_patterns):
            result.has_sunstone_pandas = True

        # Any sunstone import at all (good).
        if any(p.match(line) for p in sunstone_patterns):
            result.has_sunstone = True
219
+
220
+
221
def validate_project_notebooks(
    project_path: Union[str, Path], pattern: str = "**/*.ipynb"
) -> Dict[str, ImportCheckResult]:
    """
    Validate all notebooks under a project directory.

    Args:
        project_path: Path to the project directory.
        pattern: Glob pattern used to find notebooks (default: **/*.ipynb).

    Returns:
        Mapping from project-relative notebook path to its ImportCheckResult.

    Example:
        >>> from sunstone.validation import validate_project_notebooks
        >>> results = validate_project_notebooks('/path/to/project')
        >>> for path, result in results.items():
        ...     if not result.is_valid:
        ...         print(f"\\n{path}:")
        ...         print(result.summary())
    """
    root = Path(project_path)
    report: Dict[str, ImportCheckResult] = {}

    for nb_path in root.glob(pattern):
        # Jupyter autosave copies live in .ipynb_checkpoints — ignore them.
        if ".ipynb_checkpoints" in str(nb_path):
            continue
        report[str(nb_path.relative_to(root))] = check_notebook_imports(nb_path)

    return report