sunstone-py 0.4.2__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sunstone/_release.py CHANGED
@@ -164,29 +164,12 @@ def get_last_tag() -> str | None:
164
164
  def generate_changelog_from_git() -> str:
165
165
  """Generate changelog entries from git commits since last tag using Claude."""
166
166
  last_tag = get_last_tag()
167
- if last_tag:
168
- commit_range = f"{last_tag}..HEAD"
169
- else:
170
- commit_range = "HEAD"
171
-
172
- # Get commits since last tag
173
- result = run_git("log", commit_range, "--pretty=format:%s")
174
- if result.returncode != 0 or not result.stdout.strip():
175
- return ""
176
-
177
- commits = result.stdout.strip()
178
-
179
- prompt = f"""Convert these git commit messages into Keep a Changelog format entries.
180
- Categorize under: Added, Changed, Fixed, Removed, Security (only include categories that apply).
181
- Be concise. Skip merge commits, version bump commits, and release commits.
182
- Output ONLY the markdown entries with ### headers for categories, nothing else.
183
-
184
- Commits:
185
- {commits}"""
167
+ if last_tag is None:
168
+ last_tag = "HEAD~1"
186
169
 
187
170
  print("Generating changelog entries with Claude...")
188
171
  claude_result = subprocess.run(
189
- ["claude", "-p", "--model=haiku", prompt],
172
+ ["claude", "-p", f"/generate-changelog {last_tag}"],
190
173
  capture_output=True,
191
174
  text=True,
192
175
  cwd=get_root_dir(),
sunstone/dataframe.py CHANGED
@@ -323,7 +323,7 @@ class DataFrame:
323
323
  path_or_buf: File path for the output CSV.
324
324
  slug: Dataset slug (required in relaxed mode if not registered).
325
325
  name: Dataset name (required in relaxed mode if not registered).
326
- publish: Whether to publish the dataset.
326
+ publish: bool = False,
327
327
  **kwargs: Additional arguments passed to pandas.to_csv.
328
328
 
329
329
  Raises:
@@ -366,6 +366,9 @@ class DataFrame:
366
366
  # Record the operation
367
367
  self.lineage.add_operation(f"to_csv({dataset.slug})")
368
368
 
369
+ # Persist lineage metadata to datasets.yaml
370
+ manager.update_output_lineage(slug=dataset.slug, lineage=self.lineage, strict=self.strict_mode)
371
+
369
372
  def _infer_field_schema(self) -> List[FieldSchema]:
370
373
  """
371
374
  Infer field schema from the DataFrame.
@@ -510,13 +513,13 @@ class DataFrame:
510
513
  project_path=self.lineage.project_path,
511
514
  )
512
515
 
513
- def _wrap_result(self, result: Any, operation: str = "pandas_operation") -> Any:
516
+ def _wrap_result(self, result: Any, operation: Optional[str] = None) -> Any:
514
517
  """
515
518
  Wrap a pandas result in a Sunstone DataFrame if applicable.
516
519
 
517
520
  Args:
518
521
  result: The result from a pandas operation.
519
- operation: Name of the operation performed.
522
+ operation: Name of the operation performed. If None, no operation is recorded.
520
523
 
521
524
  Returns:
522
525
  Wrapped DataFrame if result is a DataFrame, otherwise the result.
@@ -527,7 +530,8 @@ class DataFrame:
527
530
  operations=self.lineage.operations.copy(),
528
531
  project_path=self.lineage.project_path,
529
532
  )
530
- new_lineage.add_operation(operation)
533
+ if operation is not None:
534
+ new_lineage.add_operation(operation)
531
535
 
532
536
  return DataFrame(
533
537
  data=result,
@@ -537,6 +541,28 @@ class DataFrame:
537
541
  )
538
542
  return result
539
543
 
544
+ # Methods that don't represent meaningful data transformations
545
+ # These return DataFrames but shouldn't be tracked in lineage
546
+ _NON_TRACKING_METHODS = frozenset(
547
+ {
548
+ # Copy operations - same data, no transformation
549
+ "copy",
550
+ # Index operations - same data, different index
551
+ "reset_index",
552
+ "set_index",
553
+ "reindex",
554
+ # Type conversions without data change
555
+ "astype",
556
+ "infer_objects",
557
+ # Column/index renaming - same data, different labels
558
+ "rename",
559
+ "rename_axis",
560
+ # Reshaping without data loss
561
+ "T",
562
+ "transpose",
563
+ }
564
+ )
565
+
540
566
  def __getattr__(self, name: str) -> Any:
541
567
  """
542
568
  Delegate attribute access to the underlying pandas DataFrame.
@@ -557,11 +583,14 @@ class DataFrame:
557
583
 
558
584
  def wrapper(*args: Any, **kwargs: Any) -> Any:
559
585
  result = attr(*args, **kwargs)
586
+ # Don't track non-transforming methods
587
+ if name in DataFrame._NON_TRACKING_METHODS:
588
+ return self._wrap_result(result, operation=None)
560
589
  return self._wrap_result(result, operation=f"{name}")
561
590
 
562
591
  return wrapper
563
592
 
564
- return self._wrap_result(attr, operation=f"access_attribute_{name}")
593
+ return self._wrap_result(attr, operation=None) # Don't track attribute access
565
594
 
566
595
  def __getitem__(self, key: Any) -> Any:
567
596
  """
@@ -574,7 +603,9 @@ class DataFrame:
574
603
  The item from the underlying DataFrame, wrapped if it's a DataFrame.
575
604
  """
576
605
  result = self.data[key]
577
- return self._wrap_result(result, operation="__getitem__")
606
+ # Don't track __getitem__ as an operation - it's just column/row access
607
+ # not a meaningful transformation
608
+ return self._wrap_result(result, operation=None)
578
609
 
579
610
  def __setitem__(self, key: Any, value: Any) -> None:
580
611
  """
@@ -585,7 +616,8 @@ class DataFrame:
585
616
  value: Value to assign.
586
617
  """
587
618
  self.data[key] = value
588
- self.lineage.add_operation("__setitem__")
619
+ # Track column assignment in lineage
620
+ self.lineage.add_operation(f"__setitem__({key!r})")
589
621
 
590
622
  def __repr__(self) -> str:
591
623
  """String representation of the DataFrame."""
sunstone/datasets.py CHANGED
@@ -4,19 +4,27 @@ Parser and manager for datasets.yaml files.
4
4
 
5
5
  import ipaddress
6
6
  import logging
7
+ import os
7
8
  import socket
9
+ import tempfile
8
10
  from pathlib import Path
9
11
  from typing import Any, Dict, List, Optional, Union
10
12
  from urllib.parse import urljoin, urlparse
11
13
 
12
14
  import requests
13
- import yaml
15
+ from ruamel.yaml import YAML
14
16
 
15
17
  from .exceptions import DatasetNotFoundError, DatasetValidationError
16
- from .lineage import DatasetMetadata, FieldSchema, Source, SourceLocation
18
+ from .lineage import DatasetMetadata, FieldSchema, LineageMetadata, Source, SourceLocation
17
19
 
18
20
  logger = logging.getLogger(__name__)
19
21
 
22
+ # Configure ruamel.yaml for round-trip parsing (preserves comments) with proper indentation
23
+ _yaml = YAML()
24
+ _yaml.preserve_quotes = True
25
+ _yaml.default_flow_style = False
26
+ _yaml.indent(mapping=2, sequence=4, offset=2)
27
+
20
28
 
21
29
  def _is_public_url(url: str) -> bool:
22
30
  """
@@ -109,7 +117,7 @@ class DatasetsManager:
109
117
  def _load(self) -> None:
110
118
  """Load and parse the datasets.yaml file."""
111
119
  with open(self.datasets_file, "r") as f:
112
- self._data = yaml.safe_load(f) or {}
120
+ self._data = _yaml.load(f) or {}
113
121
 
114
122
  if "inputs" not in self._data:
115
123
  self._data["inputs"] = []
@@ -119,7 +127,7 @@ class DatasetsManager:
119
127
  def _save(self) -> None:
120
128
  """Save the current data back to datasets.yaml."""
121
129
  with open(self.datasets_file, "w") as f:
122
- yaml.dump(self._data, f, default_flow_style=False, sort_keys=False)
130
+ _yaml.dump(self._data, f)
123
131
 
124
132
  def _parse_source_location(self, loc_data: Dict[str, Any]) -> SourceLocation:
125
133
  """Parse source location data from YAML."""
@@ -372,6 +380,92 @@ class DatasetsManager:
372
380
 
373
381
  raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
374
382
 
383
+ def update_output_lineage(self, slug: str, lineage: LineageMetadata, strict: bool = False) -> None:
384
+ """
385
+ Update lineage metadata for an output dataset.
386
+
387
+ In strict mode, validates that the lineage matches what would be written
388
+ without modifying the file. In relaxed mode, updates the file with lineage.
389
+
390
+ Args:
391
+ slug: The slug of the output dataset to update.
392
+ lineage: The lineage metadata to persist.
393
+ strict: If True, validate without modifying. If False, update the file.
394
+
395
+ Raises:
396
+ DatasetNotFoundError: If the dataset doesn't exist.
397
+ DatasetValidationError: In strict mode, if lineage differs from what's in the file.
398
+ """
399
+ # Find the output dataset
400
+ dataset_idx = None
401
+ for i, dataset_data in enumerate(self._data["outputs"]):
402
+ if dataset_data["slug"] == slug:
403
+ dataset_idx = i
404
+ break
405
+
406
+ if dataset_idx is None:
407
+ raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
408
+
409
+ # Build lineage metadata to add
410
+ lineage_data: dict[str, Any] = {}
411
+
412
+ if lineage.sources:
413
+ lineage_data["sources"] = [
414
+ {
415
+ "slug": src.slug,
416
+ "name": src.name,
417
+ }
418
+ for src in lineage.sources
419
+ ]
420
+
421
+ if lineage.operations:
422
+ lineage_data["operations"] = lineage.operations.copy()
423
+
424
+ if lineage.created_at:
425
+ lineage_data["created_at"] = lineage.created_at.isoformat()
426
+
427
+ # Create a copy of the data with updated lineage
428
+ updated_data = self._data.copy()
429
+ updated_data["outputs"] = [dict(d) for d in self._data["outputs"]]
430
+ updated_data["outputs"][dataset_idx] = dict(self._data["outputs"][dataset_idx])
431
+
432
+ # Add or update lineage in the copy
433
+ if lineage_data:
434
+ updated_data["outputs"][dataset_idx]["lineage"] = lineage_data
435
+
436
+ # Write to temp file
437
+ temp_fd, temp_path = tempfile.mkstemp(suffix=".yaml", prefix="datasets_", dir=self.project_path)
438
+
439
+ try:
440
+ with os.fdopen(temp_fd, "w") as f:
441
+ _yaml.dump(updated_data, f)
442
+
443
+ if strict:
444
+ # In strict mode, check if files differ
445
+ import filecmp
446
+
447
+ if not filecmp.cmp(self.datasets_file, temp_path, shallow=False):
448
+ # Files differ - this is an error in strict mode
449
+ os.unlink(temp_path)
450
+ raise DatasetValidationError(
451
+ f"In strict mode, lineage metadata for '{slug}' would be updated in datasets.yaml. "
452
+ f"Expected lineage is already present in the file, but found differences."
453
+ )
454
+ else:
455
+ # Files are the same - clean up temp file
456
+ os.unlink(temp_path)
457
+ else:
458
+ # In relaxed mode, replace the file
459
+ os.replace(temp_path, self.datasets_file)
460
+ # Reload the data
461
+ self._load()
462
+
463
+ except Exception:
464
+ # Clean up temp file on error
465
+ if os.path.exists(temp_path):
466
+ os.unlink(temp_path)
467
+ raise
468
+
375
469
  def get_absolute_path(self, location: str) -> Path:
376
470
  """
377
471
  Get the absolute path for a dataset location.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sunstone-py
3
- Version: 0.4.2
3
+ Version: 0.5.1
4
4
  Summary: Python library for managing datasets with lineage tracking in Sunstone projects
5
5
  Author-email: Sunstone Institute <stig@sunstone.institute>
6
6
  License: MIT
@@ -24,6 +24,7 @@ Requires-Dist: google-auth>=2.43.0
24
24
  Requires-Dist: pandas>=2.0.0
25
25
  Requires-Dist: pyyaml>=6.0
26
26
  Requires-Dist: requests>=2.31.0
27
+ Requires-Dist: ruamel-yaml>=0.18
27
28
  Dynamic: license-file
28
29
 
29
30
  # sunstone-py
@@ -324,14 +325,14 @@ uv run pytest
324
325
  ### Type Checking
325
326
 
326
327
  ```bash
327
- uv run mypy src/sunstone
328
+ uv run mypy
328
329
  ```
329
330
 
330
331
  ### Linting and Formatting
331
332
 
332
333
  ```bash
333
- uv run ruff check src/sunstone
334
- uv run ruff format src/sunstone
334
+ uv run ruff check
335
+ uv run ruff format
335
336
  ```
336
337
 
337
338
  ## About Sunstone Institute
@@ -0,0 +1,15 @@
1
+ sunstone/__init__.py,sha256=LC0ZtmxP26eXPLKejbg7UStcHOnE_lwttNTL4m3F4yM,2032
2
+ sunstone/_release.py,sha256=_yjAl_vZQ_5IYr0ugPlqtmUvsGnyGDx7LyiI_2HToVM,14649
3
+ sunstone/dataframe.py,sha256=UJgQx7auiNb6hSIvhB8EQs2afu-7S22xdWL5DZUr29g,23602
4
+ sunstone/datasets.py,sha256=LdHk3Vkfc7QH2VxhSskRCm9wUFSkldCmgS_1c2KDAPA,21142
5
+ sunstone/exceptions.py,sha256=fiixXazur3LtQGy21bGEaSr356DObFcYxQJ3FvOxNec,623
6
+ sunstone/lineage.py,sha256=B9GKMu5-v8Izos5G40K_EvsCPJL3Z2Tg1T_Fc7ezSMI,5240
7
+ sunstone/pandas.py,sha256=CLEqIIgTbMmpH73TPy_vDUPxQa37Hpmqn4r6No8PJwo,8188
8
+ sunstone/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ sunstone/validation.py,sha256=1356vcUc72a1zGBUe9Xjrcb5h41Xo53PaK2nnQ_FuSM,8286
10
+ sunstone_py-0.5.1.dist-info/licenses/LICENSE,sha256=pB6VuR4QRjwjMjy8RSNGho-N1SUdu07ntIhT5lrhkzU,1078
11
+ sunstone_py-0.5.1.dist-info/METADATA,sha256=DMLR03NMB5_t14rsBo4GtqY0oQFHnKQtbdUGEfxFcq8,9563
12
+ sunstone_py-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
+ sunstone_py-0.5.1.dist-info/entry_points.txt,sha256=0h6E88rH9a_503BAzXvFPR-UfmkrRFjcOf29DXgJNjk,51
14
+ sunstone_py-0.5.1.dist-info/top_level.txt,sha256=A2fW-7JO10rlx_L28Bc4FVvWt2R8kgvS8_TGPBhQp3c,9
15
+ sunstone_py-0.5.1.dist-info/RECORD,,
@@ -1,15 +0,0 @@
1
- sunstone/__init__.py,sha256=LC0ZtmxP26eXPLKejbg7UStcHOnE_lwttNTL4m3F4yM,2032
2
- sunstone/_release.py,sha256=FXqmg9MtMRW9-1DUXrO0PgViUTgUXnMFOx_HUC2n854,15264
3
- sunstone/dataframe.py,sha256=3wP91L0J3Ptgg41tCRPm84UxTJsj4fy2aBCmCu15qoE,22312
4
- sunstone/datasets.py,sha256=rrakdvgX7EOCWWWrm8wDqOqXIRqaq-KNUqjrwsm66OI,17590
5
- sunstone/exceptions.py,sha256=fiixXazur3LtQGy21bGEaSr356DObFcYxQJ3FvOxNec,623
6
- sunstone/lineage.py,sha256=B9GKMu5-v8Izos5G40K_EvsCPJL3Z2Tg1T_Fc7ezSMI,5240
7
- sunstone/pandas.py,sha256=CLEqIIgTbMmpH73TPy_vDUPxQa37Hpmqn4r6No8PJwo,8188
8
- sunstone/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- sunstone/validation.py,sha256=1356vcUc72a1zGBUe9Xjrcb5h41Xo53PaK2nnQ_FuSM,8286
10
- sunstone_py-0.4.2.dist-info/licenses/LICENSE,sha256=pB6VuR4QRjwjMjy8RSNGho-N1SUdu07ntIhT5lrhkzU,1078
11
- sunstone_py-0.4.2.dist-info/METADATA,sha256=ApYL9p7R7ibk6e3A4T43x9t5Cg2HzwiRBAwfmaIdFa0,9569
12
- sunstone_py-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- sunstone_py-0.4.2.dist-info/entry_points.txt,sha256=0h6E88rH9a_503BAzXvFPR-UfmkrRFjcOf29DXgJNjk,51
14
- sunstone_py-0.4.2.dist-info/top_level.txt,sha256=A2fW-7JO10rlx_L28Bc4FVvWt2R8kgvS8_TGPBhQp3c,9
15
- sunstone_py-0.4.2.dist-info/RECORD,,