sunstone-py 0.4.2__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- sunstone/_release.py +3 -20
- sunstone/dataframe.py +39 -7
- sunstone/datasets.py +98 -4
- {sunstone_py-0.4.2.dist-info → sunstone_py-0.5.1.dist-info}/METADATA +5 -4
- sunstone_py-0.5.1.dist-info/RECORD +15 -0
- sunstone_py-0.4.2.dist-info/RECORD +0 -15
- {sunstone_py-0.4.2.dist-info → sunstone_py-0.5.1.dist-info}/WHEEL +0 -0
- {sunstone_py-0.4.2.dist-info → sunstone_py-0.5.1.dist-info}/entry_points.txt +0 -0
- {sunstone_py-0.4.2.dist-info → sunstone_py-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {sunstone_py-0.4.2.dist-info → sunstone_py-0.5.1.dist-info}/top_level.txt +0 -0
sunstone/_release.py
CHANGED
|
@@ -164,29 +164,12 @@ def get_last_tag() -> str | None:
|
|
|
164
164
|
def generate_changelog_from_git() -> str:
|
|
165
165
|
"""Generate changelog entries from git commits since last tag using Claude."""
|
|
166
166
|
last_tag = get_last_tag()
|
|
167
|
-
if last_tag:
|
|
168
|
-
|
|
169
|
-
else:
|
|
170
|
-
commit_range = "HEAD"
|
|
171
|
-
|
|
172
|
-
# Get commits since last tag
|
|
173
|
-
result = run_git("log", commit_range, "--pretty=format:%s")
|
|
174
|
-
if result.returncode != 0 or not result.stdout.strip():
|
|
175
|
-
return ""
|
|
176
|
-
|
|
177
|
-
commits = result.stdout.strip()
|
|
178
|
-
|
|
179
|
-
prompt = f"""Convert these git commit messages into Keep a Changelog format entries.
|
|
180
|
-
Categorize under: Added, Changed, Fixed, Removed, Security (only include categories that apply).
|
|
181
|
-
Be concise. Skip merge commits, version bump commits, and release commits.
|
|
182
|
-
Output ONLY the markdown entries with ### headers for categories, nothing else.
|
|
183
|
-
|
|
184
|
-
Commits:
|
|
185
|
-
{commits}"""
|
|
167
|
+
if last_tag is None:
|
|
168
|
+
last_tag = "HEAD~1"
|
|
186
169
|
|
|
187
170
|
print("Generating changelog entries with Claude...")
|
|
188
171
|
claude_result = subprocess.run(
|
|
189
|
-
["claude", "-p", "
|
|
172
|
+
["claude", "-p", f"/generate-changelog {last_tag}"],
|
|
190
173
|
capture_output=True,
|
|
191
174
|
text=True,
|
|
192
175
|
cwd=get_root_dir(),
|
sunstone/dataframe.py
CHANGED
|
@@ -323,7 +323,7 @@ class DataFrame:
|
|
|
323
323
|
path_or_buf: File path for the output CSV.
|
|
324
324
|
slug: Dataset slug (required in relaxed mode if not registered).
|
|
325
325
|
name: Dataset name (required in relaxed mode if not registered).
|
|
326
|
-
publish:
|
|
326
|
+
publish: bool = False,
|
|
327
327
|
**kwargs: Additional arguments passed to pandas.to_csv.
|
|
328
328
|
|
|
329
329
|
Raises:
|
|
@@ -366,6 +366,9 @@ class DataFrame:
|
|
|
366
366
|
# Record the operation
|
|
367
367
|
self.lineage.add_operation(f"to_csv({dataset.slug})")
|
|
368
368
|
|
|
369
|
+
# Persist lineage metadata to datasets.yaml
|
|
370
|
+
manager.update_output_lineage(slug=dataset.slug, lineage=self.lineage, strict=self.strict_mode)
|
|
371
|
+
|
|
369
372
|
def _infer_field_schema(self) -> List[FieldSchema]:
|
|
370
373
|
"""
|
|
371
374
|
Infer field schema from the DataFrame.
|
|
@@ -510,13 +513,13 @@ class DataFrame:
|
|
|
510
513
|
project_path=self.lineage.project_path,
|
|
511
514
|
)
|
|
512
515
|
|
|
513
|
-
def _wrap_result(self, result: Any, operation: str =
|
|
516
|
+
def _wrap_result(self, result: Any, operation: Optional[str] = None) -> Any:
|
|
514
517
|
"""
|
|
515
518
|
Wrap a pandas result in a Sunstone DataFrame if applicable.
|
|
516
519
|
|
|
517
520
|
Args:
|
|
518
521
|
result: The result from a pandas operation.
|
|
519
|
-
operation: Name of the operation performed.
|
|
522
|
+
operation: Name of the operation performed. If None, no operation is recorded.
|
|
520
523
|
|
|
521
524
|
Returns:
|
|
522
525
|
Wrapped DataFrame if result is a DataFrame, otherwise the result.
|
|
@@ -527,7 +530,8 @@ class DataFrame:
|
|
|
527
530
|
operations=self.lineage.operations.copy(),
|
|
528
531
|
project_path=self.lineage.project_path,
|
|
529
532
|
)
|
|
530
|
-
|
|
533
|
+
if operation is not None:
|
|
534
|
+
new_lineage.add_operation(operation)
|
|
531
535
|
|
|
532
536
|
return DataFrame(
|
|
533
537
|
data=result,
|
|
@@ -537,6 +541,28 @@ class DataFrame:
|
|
|
537
541
|
)
|
|
538
542
|
return result
|
|
539
543
|
|
|
544
|
+
# Methods that don't represent meaningful data transformations
|
|
545
|
+
# These return DataFrames but shouldn't be tracked in lineage
|
|
546
|
+
_NON_TRACKING_METHODS = frozenset(
|
|
547
|
+
{
|
|
548
|
+
# Copy operations - same data, no transformation
|
|
549
|
+
"copy",
|
|
550
|
+
# Index operations - same data, different index
|
|
551
|
+
"reset_index",
|
|
552
|
+
"set_index",
|
|
553
|
+
"reindex",
|
|
554
|
+
# Type conversions without data change
|
|
555
|
+
"astype",
|
|
556
|
+
"infer_objects",
|
|
557
|
+
# Column/index renaming - same data, different labels
|
|
558
|
+
"rename",
|
|
559
|
+
"rename_axis",
|
|
560
|
+
# Reshaping without data loss
|
|
561
|
+
"T",
|
|
562
|
+
"transpose",
|
|
563
|
+
}
|
|
564
|
+
)
|
|
565
|
+
|
|
540
566
|
def __getattr__(self, name: str) -> Any:
|
|
541
567
|
"""
|
|
542
568
|
Delegate attribute access to the underlying pandas DataFrame.
|
|
@@ -557,11 +583,14 @@ class DataFrame:
|
|
|
557
583
|
|
|
558
584
|
def wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
559
585
|
result = attr(*args, **kwargs)
|
|
586
|
+
# Don't track non-transforming methods
|
|
587
|
+
if name in DataFrame._NON_TRACKING_METHODS:
|
|
588
|
+
return self._wrap_result(result, operation=None)
|
|
560
589
|
return self._wrap_result(result, operation=f"{name}")
|
|
561
590
|
|
|
562
591
|
return wrapper
|
|
563
592
|
|
|
564
|
-
return self._wrap_result(attr, operation=
|
|
593
|
+
return self._wrap_result(attr, operation=None) # Don't track attribute access
|
|
565
594
|
|
|
566
595
|
def __getitem__(self, key: Any) -> Any:
|
|
567
596
|
"""
|
|
@@ -574,7 +603,9 @@ class DataFrame:
|
|
|
574
603
|
The item from the underlying DataFrame, wrapped if it's a DataFrame.
|
|
575
604
|
"""
|
|
576
605
|
result = self.data[key]
|
|
577
|
-
|
|
606
|
+
# Don't track __getitem__ as an operation - it's just column/row access
|
|
607
|
+
# not a meaningful transformation
|
|
608
|
+
return self._wrap_result(result, operation=None)
|
|
578
609
|
|
|
579
610
|
def __setitem__(self, key: Any, value: Any) -> None:
|
|
580
611
|
"""
|
|
@@ -585,7 +616,8 @@ class DataFrame:
|
|
|
585
616
|
value: Value to assign.
|
|
586
617
|
"""
|
|
587
618
|
self.data[key] = value
|
|
588
|
-
|
|
619
|
+
# Track column assignment in lineage
|
|
620
|
+
self.lineage.add_operation(f"__setitem__({key!r})")
|
|
589
621
|
|
|
590
622
|
def __repr__(self) -> str:
|
|
591
623
|
"""String representation of the DataFrame."""
|
sunstone/datasets.py
CHANGED
|
@@ -4,19 +4,27 @@ Parser and manager for datasets.yaml files.
|
|
|
4
4
|
|
|
5
5
|
import ipaddress
|
|
6
6
|
import logging
|
|
7
|
+
import os
|
|
7
8
|
import socket
|
|
9
|
+
import tempfile
|
|
8
10
|
from pathlib import Path
|
|
9
11
|
from typing import Any, Dict, List, Optional, Union
|
|
10
12
|
from urllib.parse import urljoin, urlparse
|
|
11
13
|
|
|
12
14
|
import requests
|
|
13
|
-
import yaml
|
|
15
|
+
from ruamel.yaml import YAML
|
|
14
16
|
|
|
15
17
|
from .exceptions import DatasetNotFoundError, DatasetValidationError
|
|
16
|
-
from .lineage import DatasetMetadata, FieldSchema, Source, SourceLocation
|
|
18
|
+
from .lineage import DatasetMetadata, FieldSchema, LineageMetadata, Source, SourceLocation
|
|
17
19
|
|
|
18
20
|
logger = logging.getLogger(__name__)
|
|
19
21
|
|
|
22
|
+
# Configure ruamel.yaml for round-trip parsing (preserves comments) with proper indentation
|
|
23
|
+
_yaml = YAML()
|
|
24
|
+
_yaml.preserve_quotes = True
|
|
25
|
+
_yaml.default_flow_style = False
|
|
26
|
+
_yaml.indent(mapping=2, sequence=4, offset=2)
|
|
27
|
+
|
|
20
28
|
|
|
21
29
|
def _is_public_url(url: str) -> bool:
|
|
22
30
|
"""
|
|
@@ -109,7 +117,7 @@ class DatasetsManager:
|
|
|
109
117
|
def _load(self) -> None:
|
|
110
118
|
"""Load and parse the datasets.yaml file."""
|
|
111
119
|
with open(self.datasets_file, "r") as f:
|
|
112
|
-
self._data =
|
|
120
|
+
self._data = _yaml.load(f) or {}
|
|
113
121
|
|
|
114
122
|
if "inputs" not in self._data:
|
|
115
123
|
self._data["inputs"] = []
|
|
@@ -119,7 +127,7 @@ class DatasetsManager:
|
|
|
119
127
|
def _save(self) -> None:
|
|
120
128
|
"""Save the current data back to datasets.yaml."""
|
|
121
129
|
with open(self.datasets_file, "w") as f:
|
|
122
|
-
|
|
130
|
+
_yaml.dump(self._data, f)
|
|
123
131
|
|
|
124
132
|
def _parse_source_location(self, loc_data: Dict[str, Any]) -> SourceLocation:
|
|
125
133
|
"""Parse source location data from YAML."""
|
|
@@ -372,6 +380,92 @@ class DatasetsManager:
|
|
|
372
380
|
|
|
373
381
|
raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
|
|
374
382
|
|
|
383
|
+
def update_output_lineage(self, slug: str, lineage: LineageMetadata, strict: bool = False) -> None:
|
|
384
|
+
"""
|
|
385
|
+
Update lineage metadata for an output dataset.
|
|
386
|
+
|
|
387
|
+
In strict mode, validates that the lineage matches what would be written
|
|
388
|
+
without modifying the file. In relaxed mode, updates the file with lineage.
|
|
389
|
+
|
|
390
|
+
Args:
|
|
391
|
+
slug: The slug of the output dataset to update.
|
|
392
|
+
lineage: The lineage metadata to persist.
|
|
393
|
+
strict: If True, validate without modifying. If False, update the file.
|
|
394
|
+
|
|
395
|
+
Raises:
|
|
396
|
+
DatasetNotFoundError: If the dataset doesn't exist.
|
|
397
|
+
DatasetValidationError: In strict mode, if lineage differs from what's in the file.
|
|
398
|
+
"""
|
|
399
|
+
# Find the output dataset
|
|
400
|
+
dataset_idx = None
|
|
401
|
+
for i, dataset_data in enumerate(self._data["outputs"]):
|
|
402
|
+
if dataset_data["slug"] == slug:
|
|
403
|
+
dataset_idx = i
|
|
404
|
+
break
|
|
405
|
+
|
|
406
|
+
if dataset_idx is None:
|
|
407
|
+
raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
|
|
408
|
+
|
|
409
|
+
# Build lineage metadata to add
|
|
410
|
+
lineage_data: dict[str, Any] = {}
|
|
411
|
+
|
|
412
|
+
if lineage.sources:
|
|
413
|
+
lineage_data["sources"] = [
|
|
414
|
+
{
|
|
415
|
+
"slug": src.slug,
|
|
416
|
+
"name": src.name,
|
|
417
|
+
}
|
|
418
|
+
for src in lineage.sources
|
|
419
|
+
]
|
|
420
|
+
|
|
421
|
+
if lineage.operations:
|
|
422
|
+
lineage_data["operations"] = lineage.operations.copy()
|
|
423
|
+
|
|
424
|
+
if lineage.created_at:
|
|
425
|
+
lineage_data["created_at"] = lineage.created_at.isoformat()
|
|
426
|
+
|
|
427
|
+
# Create a copy of the data with updated lineage
|
|
428
|
+
updated_data = self._data.copy()
|
|
429
|
+
updated_data["outputs"] = [dict(d) for d in self._data["outputs"]]
|
|
430
|
+
updated_data["outputs"][dataset_idx] = dict(self._data["outputs"][dataset_idx])
|
|
431
|
+
|
|
432
|
+
# Add or update lineage in the copy
|
|
433
|
+
if lineage_data:
|
|
434
|
+
updated_data["outputs"][dataset_idx]["lineage"] = lineage_data
|
|
435
|
+
|
|
436
|
+
# Write to temp file
|
|
437
|
+
temp_fd, temp_path = tempfile.mkstemp(suffix=".yaml", prefix="datasets_", dir=self.project_path)
|
|
438
|
+
|
|
439
|
+
try:
|
|
440
|
+
with os.fdopen(temp_fd, "w") as f:
|
|
441
|
+
_yaml.dump(updated_data, f)
|
|
442
|
+
|
|
443
|
+
if strict:
|
|
444
|
+
# In strict mode, check if files differ
|
|
445
|
+
import filecmp
|
|
446
|
+
|
|
447
|
+
if not filecmp.cmp(self.datasets_file, temp_path, shallow=False):
|
|
448
|
+
# Files differ - this is an error in strict mode
|
|
449
|
+
os.unlink(temp_path)
|
|
450
|
+
raise DatasetValidationError(
|
|
451
|
+
f"In strict mode, lineage metadata for '{slug}' would be updated in datasets.yaml. "
|
|
452
|
+
f"Expected lineage is already present in the file, but found differences."
|
|
453
|
+
)
|
|
454
|
+
else:
|
|
455
|
+
# Files are the same - clean up temp file
|
|
456
|
+
os.unlink(temp_path)
|
|
457
|
+
else:
|
|
458
|
+
# In relaxed mode, replace the file
|
|
459
|
+
os.replace(temp_path, self.datasets_file)
|
|
460
|
+
# Reload the data
|
|
461
|
+
self._load()
|
|
462
|
+
|
|
463
|
+
except Exception:
|
|
464
|
+
# Clean up temp file on error
|
|
465
|
+
if os.path.exists(temp_path):
|
|
466
|
+
os.unlink(temp_path)
|
|
467
|
+
raise
|
|
468
|
+
|
|
375
469
|
def get_absolute_path(self, location: str) -> Path:
|
|
376
470
|
"""
|
|
377
471
|
Get the absolute path for a dataset location.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sunstone-py
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Python library for managing datasets with lineage tracking in Sunstone projects
|
|
5
5
|
Author-email: Sunstone Institute <stig@sunstone.institute>
|
|
6
6
|
License: MIT
|
|
@@ -24,6 +24,7 @@ Requires-Dist: google-auth>=2.43.0
|
|
|
24
24
|
Requires-Dist: pandas>=2.0.0
|
|
25
25
|
Requires-Dist: pyyaml>=6.0
|
|
26
26
|
Requires-Dist: requests>=2.31.0
|
|
27
|
+
Requires-Dist: ruamel-yaml>=0.18
|
|
27
28
|
Dynamic: license-file
|
|
28
29
|
|
|
29
30
|
# sunstone-py
|
|
@@ -324,14 +325,14 @@ uv run pytest
|
|
|
324
325
|
### Type Checking
|
|
325
326
|
|
|
326
327
|
```bash
|
|
327
|
-
uv run mypy
|
|
328
|
+
uv run mypy
|
|
328
329
|
```
|
|
329
330
|
|
|
330
331
|
### Linting and Formatting
|
|
331
332
|
|
|
332
333
|
```bash
|
|
333
|
-
uv run ruff check
|
|
334
|
-
uv run ruff format
|
|
334
|
+
uv run ruff check
|
|
335
|
+
uv run ruff format
|
|
335
336
|
```
|
|
336
337
|
|
|
337
338
|
## About Sunstone Institute
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
sunstone/__init__.py,sha256=LC0ZtmxP26eXPLKejbg7UStcHOnE_lwttNTL4m3F4yM,2032
|
|
2
|
+
sunstone/_release.py,sha256=_yjAl_vZQ_5IYr0ugPlqtmUvsGnyGDx7LyiI_2HToVM,14649
|
|
3
|
+
sunstone/dataframe.py,sha256=UJgQx7auiNb6hSIvhB8EQs2afu-7S22xdWL5DZUr29g,23602
|
|
4
|
+
sunstone/datasets.py,sha256=LdHk3Vkfc7QH2VxhSskRCm9wUFSkldCmgS_1c2KDAPA,21142
|
|
5
|
+
sunstone/exceptions.py,sha256=fiixXazur3LtQGy21bGEaSr356DObFcYxQJ3FvOxNec,623
|
|
6
|
+
sunstone/lineage.py,sha256=B9GKMu5-v8Izos5G40K_EvsCPJL3Z2Tg1T_Fc7ezSMI,5240
|
|
7
|
+
sunstone/pandas.py,sha256=CLEqIIgTbMmpH73TPy_vDUPxQa37Hpmqn4r6No8PJwo,8188
|
|
8
|
+
sunstone/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
sunstone/validation.py,sha256=1356vcUc72a1zGBUe9Xjrcb5h41Xo53PaK2nnQ_FuSM,8286
|
|
10
|
+
sunstone_py-0.5.1.dist-info/licenses/LICENSE,sha256=pB6VuR4QRjwjMjy8RSNGho-N1SUdu07ntIhT5lrhkzU,1078
|
|
11
|
+
sunstone_py-0.5.1.dist-info/METADATA,sha256=DMLR03NMB5_t14rsBo4GtqY0oQFHnKQtbdUGEfxFcq8,9563
|
|
12
|
+
sunstone_py-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
+
sunstone_py-0.5.1.dist-info/entry_points.txt,sha256=0h6E88rH9a_503BAzXvFPR-UfmkrRFjcOf29DXgJNjk,51
|
|
14
|
+
sunstone_py-0.5.1.dist-info/top_level.txt,sha256=A2fW-7JO10rlx_L28Bc4FVvWt2R8kgvS8_TGPBhQp3c,9
|
|
15
|
+
sunstone_py-0.5.1.dist-info/RECORD,,
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
sunstone/__init__.py,sha256=LC0ZtmxP26eXPLKejbg7UStcHOnE_lwttNTL4m3F4yM,2032
|
|
2
|
-
sunstone/_release.py,sha256=FXqmg9MtMRW9-1DUXrO0PgViUTgUXnMFOx_HUC2n854,15264
|
|
3
|
-
sunstone/dataframe.py,sha256=3wP91L0J3Ptgg41tCRPm84UxTJsj4fy2aBCmCu15qoE,22312
|
|
4
|
-
sunstone/datasets.py,sha256=rrakdvgX7EOCWWWrm8wDqOqXIRqaq-KNUqjrwsm66OI,17590
|
|
5
|
-
sunstone/exceptions.py,sha256=fiixXazur3LtQGy21bGEaSr356DObFcYxQJ3FvOxNec,623
|
|
6
|
-
sunstone/lineage.py,sha256=B9GKMu5-v8Izos5G40K_EvsCPJL3Z2Tg1T_Fc7ezSMI,5240
|
|
7
|
-
sunstone/pandas.py,sha256=CLEqIIgTbMmpH73TPy_vDUPxQa37Hpmqn4r6No8PJwo,8188
|
|
8
|
-
sunstone/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
-
sunstone/validation.py,sha256=1356vcUc72a1zGBUe9Xjrcb5h41Xo53PaK2nnQ_FuSM,8286
|
|
10
|
-
sunstone_py-0.4.2.dist-info/licenses/LICENSE,sha256=pB6VuR4QRjwjMjy8RSNGho-N1SUdu07ntIhT5lrhkzU,1078
|
|
11
|
-
sunstone_py-0.4.2.dist-info/METADATA,sha256=ApYL9p7R7ibk6e3A4T43x9t5Cg2HzwiRBAwfmaIdFa0,9569
|
|
12
|
-
sunstone_py-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
-
sunstone_py-0.4.2.dist-info/entry_points.txt,sha256=0h6E88rH9a_503BAzXvFPR-UfmkrRFjcOf29DXgJNjk,51
|
|
14
|
-
sunstone_py-0.4.2.dist-info/top_level.txt,sha256=A2fW-7JO10rlx_L28Bc4FVvWt2R8kgvS8_TGPBhQp3c,9
|
|
15
|
-
sunstone_py-0.4.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|