sunstone-py 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sunstone_py-0.4.0/LICENSE +21 -0
- sunstone_py-0.4.0/PKG-INFO +348 -0
- sunstone_py-0.4.0/README.md +323 -0
- sunstone_py-0.4.0/pyproject.toml +66 -0
- sunstone_py-0.4.0/setup.cfg +4 -0
- sunstone_py-0.4.0/src/sunstone/__init__.py +84 -0
- sunstone_py-0.4.0/src/sunstone/_release.py +403 -0
- sunstone_py-0.4.0/src/sunstone/dataframe.py +607 -0
- sunstone_py-0.4.0/src/sunstone/datasets.py +480 -0
- sunstone_py-0.4.0/src/sunstone/exceptions.py +33 -0
- sunstone_py-0.4.0/src/sunstone/lineage.py +190 -0
- sunstone_py-0.4.0/src/sunstone/pandas.py +246 -0
- sunstone_py-0.4.0/src/sunstone/py.typed +0 -0
- sunstone_py-0.4.0/src/sunstone/validation.py +253 -0
- sunstone_py-0.4.0/src/sunstone_py.egg-info/PKG-INFO +348 -0
- sunstone_py-0.4.0/src/sunstone_py.egg-info/SOURCES.txt +22 -0
- sunstone_py-0.4.0/src/sunstone_py.egg-info/dependency_links.txt +1 -0
- sunstone_py-0.4.0/src/sunstone_py.egg-info/entry_points.txt +2 -0
- sunstone_py-0.4.0/src/sunstone_py.egg-info/requires.txt +5 -0
- sunstone_py-0.4.0/src/sunstone_py.egg-info/top_level.txt +1 -0
- sunstone_py-0.4.0/tests/test_dataframe.py +224 -0
- sunstone_py-0.4.0/tests/test_datasets.py +439 -0
- sunstone_py-0.4.0/tests/test_lineage_persistence.py +67 -0
- sunstone_py-0.4.0/tests/test_pandas_compatibility.py +436 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Sunstone Institute AS
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sunstone-py
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Python library for managing datasets with lineage tracking in Sunstone projects
|
|
5
|
+
Author-email: Sunstone Institute <stig@sunstone.institute>
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: frictionless>=5.18.1
|
|
20
|
+
Requires-Dist: google-auth>=2.43.0
|
|
21
|
+
Requires-Dist: pandas>=2.0.0
|
|
22
|
+
Requires-Dist: pyyaml>=6.0
|
|
23
|
+
Requires-Dist: requests>=2.31.0
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# sunstone-py
|
|
27
|
+
|
|
28
|
+
A Python library for managing datasets with lineage tracking in data science projects.
|
|
29
|
+
|
|
30
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
31
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
32
|
+
|
|
33
|
+
## Features
|
|
34
|
+
|
|
35
|
+
- **Automatic Lineage Tracking**: Track data provenance through all operations automatically
|
|
36
|
+
- **Dataset Management**: Integration with `datasets.yaml` for organized dataset registration
|
|
37
|
+
- **Pandas-Compatible API**: Familiar pandas-like interface via `from sunstone import pandas as pd`
|
|
38
|
+
- **Strict/Relaxed Modes**: Control whether operations can modify `datasets.yaml`
|
|
39
|
+
- **Validation Tools**: Check notebooks and scripts for correct import usage
|
|
40
|
+
- **Full Type Hints**: Complete type hint support for better IDE integration
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
# Using uv (recommended)
|
|
46
|
+
uv add sunstone-py
|
|
47
|
+
|
|
48
|
+
# Using pip
|
|
49
|
+
pip install sunstone-py
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
To use the latest commit from GitHub:
|
|
53
|
+
|
|
54
|
+
```toml
|
|
55
|
+
dependencies = [
|
|
56
|
+
"sunstone-py @ git+https://github.com/sunstoneinstitute/sunstone-py.git",
|
|
57
|
+
]
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
If you are making changes to sunstone-py checked out at `~/git/sunstone-py` and testing them
|
|
61
|
+
directly from your project:
|
|
62
|
+
|
|
63
|
+
```toml
|
|
64
|
+
dependencies = [
|
|
65
|
+
"sunstone-py @ file://${HOME}/git/sunstone-py"
|
|
66
|
+
]
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### For Development
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
git clone https://github.com/sunstoneinstitute/sunstone-py.git
|
|
73
|
+
cd sunstone-py
|
|
74
|
+
uv venv
|
|
75
|
+
uv sync
|
|
76
|
+
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
## Quick Start
|
|
81
|
+
|
|
82
|
+
### 1. Set Up Your Project with datasets.yaml
|
|
83
|
+
|
|
84
|
+
Create a `datasets.yaml` file in your project directory:
|
|
85
|
+
|
|
86
|
+
```yaml
|
|
87
|
+
inputs:
|
|
88
|
+
- name: School Data
|
|
89
|
+
slug: school-data
|
|
90
|
+
location: data/schools.csv
|
|
91
|
+
source:
|
|
92
|
+
name: Ministry of Education
|
|
93
|
+
location:
|
|
94
|
+
data: https://example.com/schools.csv
|
|
95
|
+
attributedTo: Ministry of Education
|
|
96
|
+
acquiredAt: 2025-01-15
|
|
97
|
+
acquisitionMethod: manual-download
|
|
98
|
+
license: CC-BY-4.0
|
|
99
|
+
fields:
|
|
100
|
+
- name: school_id
|
|
101
|
+
type: string
|
|
102
|
+
- name: enrollment
|
|
103
|
+
type: integer
|
|
104
|
+
|
|
105
|
+
outputs: []
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### 2. Use Pandas-Like API with Lineage Tracking
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
from sunstone import pandas as pd
|
|
112
|
+
from pathlib import Path
|
|
113
|
+
|
|
114
|
+
# Set project path (where datasets.yaml lives)
|
|
115
|
+
PROJECT_PATH = Path.cwd()
|
|
116
|
+
|
|
117
|
+
# Read data - lineage automatically tracked
|
|
118
|
+
df = pd.read_csv('data/schools.csv', project_path=PROJECT_PATH)
|
|
119
|
+
|
|
120
|
+
# Transform using familiar pandas operations
|
|
121
|
+
result = df[df['enrollment'] > 100].groupby('district').sum()
|
|
122
|
+
|
|
123
|
+
# Save with automatic lineage tracking and dataset registration
|
|
124
|
+
result.to_csv(
|
|
125
|
+
'outputs/summary.csv',
|
|
126
|
+
slug='school-summary',
|
|
127
|
+
name='School Enrollment Summary',
|
|
128
|
+
index=False
|
|
129
|
+
)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### 3. Check Lineage Metadata
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
# View lineage information
|
|
136
|
+
print(result.lineage.sources) # Source datasets
|
|
137
|
+
print(result.lineage.operations) # Operations performed
|
|
138
|
+
print(result.lineage.get_licenses()) # All source licenses
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Core Concepts
|
|
142
|
+
|
|
143
|
+
### Pandas-Like API
|
|
144
|
+
|
|
145
|
+
sunstone-py provides a drop-in replacement for pandas that adds lineage tracking:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from sunstone import pandas as pd
|
|
149
|
+
|
|
150
|
+
# Works like pandas, but tracks lineage
|
|
151
|
+
df = pd.read_csv('input.csv', project_path='/path/to/project')
|
|
152
|
+
df2 = pd.read_csv('input2.csv', project_path='/path/to/project')
|
|
153
|
+
|
|
154
|
+
# All pandas operations work
|
|
155
|
+
filtered = df[df['value'] > 100]
|
|
156
|
+
grouped = df.groupby('category').sum()
|
|
157
|
+
|
|
158
|
+
# Merge/join operations combine lineage from both sources
|
|
159
|
+
merged = pd.merge(df, df2, on='key')
|
|
160
|
+
concatenated = pd.concat([df, df2])
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### Strict vs Relaxed Mode
|
|
164
|
+
|
|
165
|
+
**Relaxed Mode** (default):
|
|
166
|
+
- Writing to new outputs auto-registers them in `datasets.yaml`
|
|
167
|
+
- More flexible for exploratory work
|
|
168
|
+
|
|
169
|
+
**Strict Mode**:
|
|
170
|
+
- All reads and writes must be pre-registered in `datasets.yaml`
|
|
171
|
+
- Ensures complete documentation of data operations
|
|
172
|
+
- Enable via `strict=True` parameter or `SUNSTONE_DATAFRAME_STRICT=1` environment variable
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
# Enable strict mode
|
|
176
|
+
df = pd.read_csv('data.csv', project_path=PROJECT_PATH, strict=True)
|
|
177
|
+
|
|
178
|
+
# Or globally
|
|
179
|
+
import os
|
|
180
|
+
os.environ['SUNSTONE_DATAFRAME_STRICT'] = '1'
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Validation Tools
|
|
184
|
+
|
|
185
|
+
Check notebooks for correct import usage:
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
import sunstone
|
|
189
|
+
|
|
190
|
+
# Check a single notebook
|
|
191
|
+
result = sunstone.check_notebook_imports('analysis.ipynb')
|
|
192
|
+
print(result.summary())
|
|
193
|
+
|
|
194
|
+
# Check all notebooks in project
|
|
195
|
+
results = sunstone.validate_project_notebooks('/path/to/project')
|
|
196
|
+
for path, result in results.items():
|
|
197
|
+
if not result.is_valid:
|
|
198
|
+
print(f"\n{path}:")
|
|
199
|
+
print(result.summary())
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## Advanced Usage
|
|
203
|
+
|
|
204
|
+
### Direct DataFrame API
|
|
205
|
+
|
|
206
|
+
For more control, use the DataFrame class directly:
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
from sunstone import DataFrame
|
|
210
|
+
|
|
211
|
+
# Read with explicit parameters
|
|
212
|
+
df = DataFrame.read_csv(
|
|
213
|
+
'data.csv',
|
|
214
|
+
project_path='/path/to/project',
|
|
215
|
+
strict=True
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
# Apply custom operations with lineage tracking
|
|
219
|
+
result = df.apply_operation(
|
|
220
|
+
lambda d: d[d['value'] > 100],
|
|
221
|
+
description="Filter high-value rows"
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
# Access underlying pandas DataFrame
|
|
225
|
+
pandas_df = result.data
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
### Managing datasets.yaml Programmatically
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
from sunstone import DatasetsManager, FieldSchema
|
|
232
|
+
|
|
233
|
+
manager = DatasetsManager('/path/to/project')
|
|
234
|
+
|
|
235
|
+
# Find datasets
|
|
236
|
+
dataset = manager.find_dataset_by_slug('school-data')
|
|
237
|
+
dataset = manager.find_dataset_by_location('data/schools.csv')
|
|
238
|
+
|
|
239
|
+
# Add new output dataset
|
|
240
|
+
manager.add_output_dataset(
|
|
241
|
+
name='Analysis Results',
|
|
242
|
+
slug='analysis-results',
|
|
243
|
+
location='outputs/results.csv',
|
|
244
|
+
fields=[
|
|
245
|
+
FieldSchema(name='category', type='string'),
|
|
246
|
+
FieldSchema(name='count', type='integer'),
|
|
247
|
+
FieldSchema(name='avg_value', type='number')
|
|
248
|
+
],
|
|
249
|
+
publish=True
|
|
250
|
+
)
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
## Documentation
|
|
254
|
+
|
|
255
|
+
- [Contributing Guide](CONTRIBUTING.md)
|
|
256
|
+
- [Changelog](CHANGELOG.md)
|
|
257
|
+
- [API Reference](#api-reference) (below)
|
|
258
|
+
|
|
259
|
+
## API Reference
|
|
260
|
+
|
|
261
|
+
### pandas Module
|
|
262
|
+
|
|
263
|
+
Drop-in replacement for pandas with lineage tracking:
|
|
264
|
+
|
|
265
|
+
- `read_csv(filepath, project_path, strict=False, **kwargs)`: Read CSV with lineage
|
|
266
|
+
- `read_json(filepath, project_path, strict=False, **kwargs)`: Read JSON with lineage
|
|
267
|
+
- `merge(left, right, **kwargs)`: Merge DataFrames with combined lineage
|
|
268
|
+
- `concat(dfs, **kwargs)`: Concatenate DataFrames with combined lineage
|
|
269
|
+
|
|
270
|
+
### DataFrame Class
|
|
271
|
+
|
|
272
|
+
Main class for working with data:
|
|
273
|
+
|
|
274
|
+
- `read_csv(filepath, project_path, strict=False, **kwargs)`: Read CSV with lineage tracking
|
|
275
|
+
- `to_csv(path, slug, name, publish=False, **kwargs)`: Write CSV and register
|
|
276
|
+
- `merge(right, **kwargs)`: Merge with another DataFrame
|
|
277
|
+
- `join(other, **kwargs)`: Join with another DataFrame
|
|
278
|
+
- `concat(others, **kwargs)`: Concatenate DataFrames
|
|
279
|
+
- `apply_operation(operation, description)`: Apply transformation with lineage
|
|
280
|
+
- `.data`: Access underlying pandas DataFrame
|
|
281
|
+
- `.lineage`: Access lineage metadata
|
|
282
|
+
|
|
283
|
+
### DatasetsManager Class
|
|
284
|
+
|
|
285
|
+
Manage `datasets.yaml` files:
|
|
286
|
+
|
|
287
|
+
- `find_dataset_by_location(location, dataset_type='input')`: Find by file path
|
|
288
|
+
- `find_dataset_by_slug(slug, dataset_type='input')`: Find by slug
|
|
289
|
+
- `get_all_inputs()`: Get all input datasets
|
|
290
|
+
- `get_all_outputs()`: Get all output datasets
|
|
291
|
+
- `add_output_dataset(...)`: Register new output
|
|
292
|
+
- `update_output_dataset(...)`: Update existing output
|
|
293
|
+
|
|
294
|
+
### Validation Functions
|
|
295
|
+
|
|
296
|
+
- `check_notebook_imports(notebook_path)`: Validate a single notebook
|
|
297
|
+
- `validate_project_notebooks(project_path)`: Validate all notebooks in project
|
|
298
|
+
|
|
299
|
+
### Exceptions
|
|
300
|
+
|
|
301
|
+
- `SunstoneError`: Base exception
|
|
302
|
+
- `DatasetNotFoundError`: Dataset not found in datasets.yaml
|
|
303
|
+
- `StrictModeError`: Operation blocked in strict mode
|
|
304
|
+
- `DatasetValidationError`: Validation failed
|
|
305
|
+
- `LineageError`: Lineage tracking error
|
|
306
|
+
|
|
307
|
+
## Environment Variables
|
|
308
|
+
|
|
309
|
+
- `SUNSTONE_DATAFRAME_STRICT`: Set to `"1"` or `"true"` to enable strict mode globally
|
|
310
|
+
|
|
311
|
+
## Development
|
|
312
|
+
|
|
313
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and guidelines.
|
|
314
|
+
|
|
315
|
+
### Running Tests
|
|
316
|
+
|
|
317
|
+
```bash
|
|
318
|
+
uv run pytest
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
### Type Checking
|
|
322
|
+
|
|
323
|
+
```bash
|
|
324
|
+
uv run mypy src/sunstone
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
### Linting and Formatting
|
|
328
|
+
|
|
329
|
+
```bash
|
|
330
|
+
uv run ruff check src/sunstone
|
|
331
|
+
uv run ruff format src/sunstone
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
## About Sunstone Institute
|
|
335
|
+
|
|
336
|
+
[Sunstone Institute](https://sunstone.institute) is a philanthropy-funded organization using data and AI to show the world as it really is, and inspire action everywhere.
|
|
337
|
+
|
|
338
|
+
## License
|
|
339
|
+
|
|
340
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
341
|
+
|
|
342
|
+
## Support
|
|
343
|
+
|
|
344
|
+
- **Issues**: [GitHub Issues](https://github.com/sunstoneinstitute/sunstone-py/issues)
|
|
345
|
+
|
|
346
|
+
---
|
|
347
|
+
|
|
348
|
+
Made with ❤️ by [Sunstone Institute](https://sunstone.institute)
|
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
# sunstone-py
|
|
2
|
+
|
|
3
|
+
A Python library for managing datasets with lineage tracking in data science projects.
|
|
4
|
+
|
|
5
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
7
|
+
|
|
8
|
+
## Features
|
|
9
|
+
|
|
10
|
+
- **Automatic Lineage Tracking**: Track data provenance through all operations automatically
|
|
11
|
+
- **Dataset Management**: Integration with `datasets.yaml` for organized dataset registration
|
|
12
|
+
- **Pandas-Compatible API**: Familiar pandas-like interface via `from sunstone import pandas as pd`
|
|
13
|
+
- **Strict/Relaxed Modes**: Control whether operations can modify `datasets.yaml`
|
|
14
|
+
- **Validation Tools**: Check notebooks and scripts for correct import usage
|
|
15
|
+
- **Full Type Hints**: Complete type hint support for better IDE integration
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
# Using uv (recommended)
|
|
21
|
+
uv add sunstone-py
|
|
22
|
+
|
|
23
|
+
# Using pip
|
|
24
|
+
pip install sunstone-py
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
To use the latest commit from GitHub:
|
|
28
|
+
|
|
29
|
+
```toml
|
|
30
|
+
dependencies = [
|
|
31
|
+
"sunstone-py @ git+https://github.com/sunstoneinstitute/sunstone-py.git",
|
|
32
|
+
]
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
If you are making changes to sunstone-py checked out at `~/git/sunstone-py` and testing them
|
|
36
|
+
directly from your project:
|
|
37
|
+
|
|
38
|
+
```toml
|
|
39
|
+
dependencies = [
|
|
40
|
+
"sunstone-py @ file://${HOME}/git/sunstone-py"
|
|
41
|
+
]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### For Development
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
git clone https://github.com/sunstoneinstitute/sunstone-py.git
|
|
48
|
+
cd sunstone-py
|
|
49
|
+
uv venv
|
|
50
|
+
uv sync
|
|
51
|
+
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
## Quick Start
|
|
56
|
+
|
|
57
|
+
### 1. Set Up Your Project with datasets.yaml
|
|
58
|
+
|
|
59
|
+
Create a `datasets.yaml` file in your project directory:
|
|
60
|
+
|
|
61
|
+
```yaml
|
|
62
|
+
inputs:
|
|
63
|
+
- name: School Data
|
|
64
|
+
slug: school-data
|
|
65
|
+
location: data/schools.csv
|
|
66
|
+
source:
|
|
67
|
+
name: Ministry of Education
|
|
68
|
+
location:
|
|
69
|
+
data: https://example.com/schools.csv
|
|
70
|
+
attributedTo: Ministry of Education
|
|
71
|
+
acquiredAt: 2025-01-15
|
|
72
|
+
acquisitionMethod: manual-download
|
|
73
|
+
license: CC-BY-4.0
|
|
74
|
+
fields:
|
|
75
|
+
- name: school_id
|
|
76
|
+
type: string
|
|
77
|
+
- name: enrollment
|
|
78
|
+
type: integer
|
|
79
|
+
|
|
80
|
+
outputs: []
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### 2. Use Pandas-Like API with Lineage Tracking
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from sunstone import pandas as pd
|
|
87
|
+
from pathlib import Path
|
|
88
|
+
|
|
89
|
+
# Set project path (where datasets.yaml lives)
|
|
90
|
+
PROJECT_PATH = Path.cwd()
|
|
91
|
+
|
|
92
|
+
# Read data - lineage automatically tracked
|
|
93
|
+
df = pd.read_csv('data/schools.csv', project_path=PROJECT_PATH)
|
|
94
|
+
|
|
95
|
+
# Transform using familiar pandas operations
|
|
96
|
+
result = df[df['enrollment'] > 100].groupby('district').sum()
|
|
97
|
+
|
|
98
|
+
# Save with automatic lineage tracking and dataset registration
|
|
99
|
+
result.to_csv(
|
|
100
|
+
'outputs/summary.csv',
|
|
101
|
+
slug='school-summary',
|
|
102
|
+
name='School Enrollment Summary',
|
|
103
|
+
index=False
|
|
104
|
+
)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### 3. Check Lineage Metadata
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
# View lineage information
|
|
111
|
+
print(result.lineage.sources) # Source datasets
|
|
112
|
+
print(result.lineage.operations) # Operations performed
|
|
113
|
+
print(result.lineage.get_licenses()) # All source licenses
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Core Concepts
|
|
117
|
+
|
|
118
|
+
### Pandas-Like API
|
|
119
|
+
|
|
120
|
+
sunstone-py provides a drop-in replacement for pandas that adds lineage tracking:
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from sunstone import pandas as pd
|
|
124
|
+
|
|
125
|
+
# Works like pandas, but tracks lineage
|
|
126
|
+
df = pd.read_csv('input.csv', project_path='/path/to/project')
|
|
127
|
+
df2 = pd.read_csv('input2.csv', project_path='/path/to/project')
|
|
128
|
+
|
|
129
|
+
# All pandas operations work
|
|
130
|
+
filtered = df[df['value'] > 100]
|
|
131
|
+
grouped = df.groupby('category').sum()
|
|
132
|
+
|
|
133
|
+
# Merge/join operations combine lineage from both sources
|
|
134
|
+
merged = pd.merge(df, df2, on='key')
|
|
135
|
+
concatenated = pd.concat([df, df2])
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Strict vs Relaxed Mode
|
|
139
|
+
|
|
140
|
+
**Relaxed Mode** (default):
|
|
141
|
+
- Writing to new outputs auto-registers them in `datasets.yaml`
|
|
142
|
+
- More flexible for exploratory work
|
|
143
|
+
|
|
144
|
+
**Strict Mode**:
|
|
145
|
+
- All reads and writes must be pre-registered in `datasets.yaml`
|
|
146
|
+
- Ensures complete documentation of data operations
|
|
147
|
+
- Enable via `strict=True` parameter or `SUNSTONE_DATAFRAME_STRICT=1` environment variable
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
# Enable strict mode
|
|
151
|
+
df = pd.read_csv('data.csv', project_path=PROJECT_PATH, strict=True)
|
|
152
|
+
|
|
153
|
+
# Or globally
|
|
154
|
+
import os
|
|
155
|
+
os.environ['SUNSTONE_DATAFRAME_STRICT'] = '1'
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Validation Tools
|
|
159
|
+
|
|
160
|
+
Check notebooks for correct import usage:
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
import sunstone
|
|
164
|
+
|
|
165
|
+
# Check a single notebook
|
|
166
|
+
result = sunstone.check_notebook_imports('analysis.ipynb')
|
|
167
|
+
print(result.summary())
|
|
168
|
+
|
|
169
|
+
# Check all notebooks in project
|
|
170
|
+
results = sunstone.validate_project_notebooks('/path/to/project')
|
|
171
|
+
for path, result in results.items():
|
|
172
|
+
if not result.is_valid:
|
|
173
|
+
print(f"\n{path}:")
|
|
174
|
+
print(result.summary())
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Advanced Usage
|
|
178
|
+
|
|
179
|
+
### Direct DataFrame API
|
|
180
|
+
|
|
181
|
+
For more control, use the DataFrame class directly:
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
from sunstone import DataFrame
|
|
185
|
+
|
|
186
|
+
# Read with explicit parameters
|
|
187
|
+
df = DataFrame.read_csv(
|
|
188
|
+
'data.csv',
|
|
189
|
+
project_path='/path/to/project',
|
|
190
|
+
strict=True
|
|
191
|
+
)
|
|
192
|
+
|
|
193
|
+
# Apply custom operations with lineage tracking
|
|
194
|
+
result = df.apply_operation(
|
|
195
|
+
lambda d: d[d['value'] > 100],
|
|
196
|
+
description="Filter high-value rows"
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
# Access underlying pandas DataFrame
|
|
200
|
+
pandas_df = result.data
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### Managing datasets.yaml Programmatically
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from sunstone import DatasetsManager, FieldSchema
|
|
207
|
+
|
|
208
|
+
manager = DatasetsManager('/path/to/project')
|
|
209
|
+
|
|
210
|
+
# Find datasets
|
|
211
|
+
dataset = manager.find_dataset_by_slug('school-data')
|
|
212
|
+
dataset = manager.find_dataset_by_location('data/schools.csv')
|
|
213
|
+
|
|
214
|
+
# Add new output dataset
|
|
215
|
+
manager.add_output_dataset(
|
|
216
|
+
name='Analysis Results',
|
|
217
|
+
slug='analysis-results',
|
|
218
|
+
location='outputs/results.csv',
|
|
219
|
+
fields=[
|
|
220
|
+
FieldSchema(name='category', type='string'),
|
|
221
|
+
FieldSchema(name='count', type='integer'),
|
|
222
|
+
FieldSchema(name='avg_value', type='number')
|
|
223
|
+
],
|
|
224
|
+
publish=True
|
|
225
|
+
)
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
## Documentation
|
|
229
|
+
|
|
230
|
+
- [Contributing Guide](CONTRIBUTING.md)
|
|
231
|
+
- [Changelog](CHANGELOG.md)
|
|
232
|
+
- [API Reference](#api-reference) (below)
|
|
233
|
+
|
|
234
|
+
## API Reference
|
|
235
|
+
|
|
236
|
+
### pandas Module
|
|
237
|
+
|
|
238
|
+
Drop-in replacement for pandas with lineage tracking:
|
|
239
|
+
|
|
240
|
+
- `read_csv(filepath, project_path, strict=False, **kwargs)`: Read CSV with lineage
|
|
241
|
+
- `read_json(filepath, project_path, strict=False, **kwargs)`: Read JSON with lineage
|
|
242
|
+
- `merge(left, right, **kwargs)`: Merge DataFrames with combined lineage
|
|
243
|
+
- `concat(dfs, **kwargs)`: Concatenate DataFrames with combined lineage
|
|
244
|
+
|
|
245
|
+
### DataFrame Class
|
|
246
|
+
|
|
247
|
+
Main class for working with data:
|
|
248
|
+
|
|
249
|
+
- `read_csv(filepath, project_path, strict=False, **kwargs)`: Read CSV with lineage tracking
|
|
250
|
+
- `to_csv(path, slug, name, publish=False, **kwargs)`: Write CSV and register
|
|
251
|
+
- `merge(right, **kwargs)`: Merge with another DataFrame
|
|
252
|
+
- `join(other, **kwargs)`: Join with another DataFrame
|
|
253
|
+
- `concat(others, **kwargs)`: Concatenate DataFrames
|
|
254
|
+
- `apply_operation(operation, description)`: Apply transformation with lineage
|
|
255
|
+
- `.data`: Access underlying pandas DataFrame
|
|
256
|
+
- `.lineage`: Access lineage metadata
|
|
257
|
+
|
|
258
|
+
### DatasetsManager Class
|
|
259
|
+
|
|
260
|
+
Manage `datasets.yaml` files:
|
|
261
|
+
|
|
262
|
+
- `find_dataset_by_location(location, dataset_type='input')`: Find by file path
|
|
263
|
+
- `find_dataset_by_slug(slug, dataset_type='input')`: Find by slug
|
|
264
|
+
- `get_all_inputs()`: Get all input datasets
|
|
265
|
+
- `get_all_outputs()`: Get all output datasets
|
|
266
|
+
- `add_output_dataset(...)`: Register new output
|
|
267
|
+
- `update_output_dataset(...)`: Update existing output
|
|
268
|
+
|
|
269
|
+
### Validation Functions
|
|
270
|
+
|
|
271
|
+
- `check_notebook_imports(notebook_path)`: Validate a single notebook
|
|
272
|
+
- `validate_project_notebooks(project_path)`: Validate all notebooks in project
|
|
273
|
+
|
|
274
|
+
### Exceptions
|
|
275
|
+
|
|
276
|
+
- `SunstoneError`: Base exception
|
|
277
|
+
- `DatasetNotFoundError`: Dataset not found in datasets.yaml
|
|
278
|
+
- `StrictModeError`: Operation blocked in strict mode
|
|
279
|
+
- `DatasetValidationError`: Validation failed
|
|
280
|
+
- `LineageError`: Lineage tracking error
|
|
281
|
+
|
|
282
|
+
## Environment Variables
|
|
283
|
+
|
|
284
|
+
- `SUNSTONE_DATAFRAME_STRICT`: Set to `"1"` or `"true"` to enable strict mode globally
|
|
285
|
+
|
|
286
|
+
## Development
|
|
287
|
+
|
|
288
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and guidelines.
|
|
289
|
+
|
|
290
|
+
### Running Tests
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
uv run pytest
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
### Type Checking
|
|
297
|
+
|
|
298
|
+
```bash
|
|
299
|
+
uv run mypy src/sunstone
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### Linting and Formatting
|
|
303
|
+
|
|
304
|
+
```bash
|
|
305
|
+
uv run ruff check src/sunstone
|
|
306
|
+
uv run ruff format src/sunstone
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
## About Sunstone Institute
|
|
310
|
+
|
|
311
|
+
[Sunstone Institute](https://sunstone.institute) is a philanthropy-funded organization using data and AI to show the world as it really is, and inspire action everywhere.
|
|
312
|
+
|
|
313
|
+
## License
|
|
314
|
+
|
|
315
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
316
|
+
|
|
317
|
+
## Support
|
|
318
|
+
|
|
319
|
+
- **Issues**: [GitHub Issues](https://github.com/sunstoneinstitute/sunstone-py/issues)
|
|
320
|
+
|
|
321
|
+
---
|
|
322
|
+
|
|
323
|
+
Made with ❤️ by [Sunstone Institute](https://sunstone.institute)
|