sunstone-py 0.5.1__tar.gz → 0.5.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/PKG-INFO +3 -5
  2. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/pyproject.toml +20 -13
  3. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/_release.py +23 -12
  4. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/dataframe.py +16 -89
  5. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/datasets.py +28 -6
  6. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/lineage.py +37 -27
  7. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/PKG-INFO +3 -5
  8. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/tests/test_dataframe.py +169 -28
  9. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/tests/test_lineage_persistence.py +4 -14
  10. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/tests/test_pandas_compatibility.py +3 -2
  11. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/LICENSE +0 -0
  12. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/README.md +0 -0
  13. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/setup.cfg +0 -0
  14. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/__init__.py +0 -0
  15. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/exceptions.py +0 -0
  16. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/pandas.py +0 -0
  17. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/py.typed +0 -0
  18. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/validation.py +0 -0
  19. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/SOURCES.txt +0 -0
  20. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/dependency_links.txt +0 -0
  21. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/entry_points.txt +0 -0
  22. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/requires.txt +0 -0
  23. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/top_level.txt +0 -0
  24. {sunstone_py-0.5.1 → sunstone_py-0.5.3}/tests/test_datasets.py +0 -0
@@ -1,22 +1,20 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sunstone-py
3
- Version: 0.5.1
3
+ Version: 0.5.3
4
4
  Summary: Python library for managing datasets with lineage tracking in Sunstone projects
5
5
  Author-email: Sunstone Institute <stig@sunstone.institute>
6
6
  License: MIT
7
7
  Project-URL: Homepage, https://github.com/sunstoneinstitute/sunstone-py
8
8
  Project-URL: Documentation, https://sunstoneinstitute.github.io/sunstone-py/
9
9
  Project-URL: Repository, https://github.com/sunstoneinstitute/sunstone-py
10
- Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Development Status :: 4 - Beta
11
11
  Classifier: Intended Audience :: Science/Research
12
12
  Classifier: License :: OSI Approved :: MIT License
13
13
  Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.10
15
- Classifier: Programming Language :: Python :: 3.11
16
14
  Classifier: Programming Language :: Python :: 3.12
17
15
  Classifier: Programming Language :: Python :: 3.13
18
16
  Classifier: Programming Language :: Python :: 3.14
19
- Requires-Python: >=3.10
17
+ Requires-Python: >=3.12
20
18
  Description-Content-Type: text/markdown
21
19
  License-File: LICENSE
22
20
  Requires-Dist: frictionless>=5.18.1
@@ -1,29 +1,28 @@
1
1
  [build-system]
2
- requires = ["setuptools>=61.0", "wheel"]
2
+ requires = [
3
+ "setuptools>=61.0",
4
+ "wheel",
5
+ ]
3
6
  build-backend = "setuptools.build_meta"
4
7
 
5
8
  [project]
6
9
  name = "sunstone-py"
7
- version = "0.5.1"
10
+ version = "0.5.3"
8
11
  description = "Python library for managing datasets with lineage tracking in Sunstone projects"
9
12
  readme = "README.md"
10
- requires-python = ">=3.10"
11
- license = {text = "MIT"}
13
+ requires-python = ">=3.12"
12
14
  authors = [
13
- {name = "Sunstone Institute", email = "stig@sunstone.institute"}
15
+ { name = "Sunstone Institute", email = "stig@sunstone.institute" },
14
16
  ]
15
17
  classifiers = [
16
- "Development Status :: 3 - Alpha",
18
+ "Development Status :: 4 - Beta",
17
19
  "Intended Audience :: Science/Research",
18
20
  "License :: OSI Approved :: MIT License",
19
21
  "Programming Language :: Python :: 3",
20
- "Programming Language :: Python :: 3.10",
21
- "Programming Language :: Python :: 3.11",
22
22
  "Programming Language :: Python :: 3.12",
23
23
  "Programming Language :: Python :: 3.13",
24
24
  "Programming Language :: Python :: 3.14",
25
25
  ]
26
-
27
26
  dependencies = [
28
27
  "frictionless>=5.18.1",
29
28
  "google-auth>=2.43.0",
@@ -33,6 +32,9 @@ dependencies = [
33
32
  "ruamel-yaml>=0.18",
34
33
  ]
35
34
 
35
+ [project.license]
36
+ text = "MIT"
37
+
36
38
  [project.urls]
37
39
  Homepage = "https://github.com/sunstoneinstitute/sunstone-py"
38
40
  Documentation = "https://sunstoneinstitute.github.io/sunstone-py/"
@@ -42,21 +44,25 @@ Repository = "https://github.com/sunstoneinstitute/sunstone-py"
42
44
  release = "sunstone._release:main"
43
45
 
44
46
  [tool.setuptools.packages.find]
45
- where = ["src"]
47
+ where = [
48
+ "src",
49
+ ]
46
50
 
47
51
  [tool.setuptools.package-data]
48
- sunstone = ["py.typed"]
52
+ sunstone = [
53
+ "py.typed",
54
+ ]
49
55
 
50
56
  [tool.ruff]
51
57
  line-length = 120
52
- target-version = "py310"
58
+ target-version = "py312"
53
59
 
54
60
  [tool.ruff.format]
55
61
  quote-style = "double"
56
62
  indent-style = "space"
57
63
 
58
64
  [tool.mypy]
59
- python_version = "3.10"
65
+ python_version = "3.12"
60
66
  warn_return_any = true
61
67
  warn_unused_configs = true
62
68
  disallow_untyped_defs = true
@@ -75,6 +81,7 @@ dev = [
75
81
  "pandas-stubs>=2.3.2.250926",
76
82
  "types-pyyaml>=6.0.12.20250915",
77
83
  "markdown>=3.10",
84
+ "tomli-w>=1.2.0",
78
85
  ]
79
86
  docs = [
80
87
  "mkdocs-material>=9.5.0",
@@ -13,6 +13,17 @@ import sys
13
13
  from datetime import date
14
14
  from pathlib import Path
15
15
 
16
+ try:
17
+ import tomllib
18
+ except ModuleNotFoundError:
19
+ import tomli as tomllib # type: ignore[import-not-found,no-redef]
20
+
21
+ try:
22
+ import tomli_w
23
+ except ModuleNotFoundError:
24
+ print("Error: tomli_w not found. Install with: uv add --dev tomli-w", file=sys.stderr)
25
+ sys.exit(1)
26
+
16
27
 
17
28
  def get_root_dir() -> Path:
18
29
  """Get the root directory (where pyproject.toml lives)."""
@@ -216,12 +227,13 @@ def confirm_release(new_version: str) -> bool:
216
227
  def get_current_version() -> str:
217
228
  """Get the current version from pyproject.toml."""
218
229
  pyproject_path = get_root_dir() / "pyproject.toml"
219
- content = pyproject_path.read_text()
220
- match = re.search(r'^version\s*=\s*"([^"]+)"', content, re.MULTILINE)
221
- if not match:
230
+ with open(pyproject_path, "rb") as f:
231
+ data = tomllib.load(f)
232
+ version = data.get("project", {}).get("version")
233
+ if not version:
222
234
  print("Error: Could not find version in pyproject.toml", file=sys.stderr)
223
235
  sys.exit(1)
224
- return match.group(1)
236
+ return str(version)
225
237
 
226
238
 
227
239
  def bump_version(version: str, bump: str) -> str:
@@ -244,14 +256,13 @@ def bump_version(version: str, bump: str) -> str:
244
256
  def update_pyproject_version(new_version: str) -> None:
245
257
  """Update the version in pyproject.toml."""
246
258
  pyproject_path = get_root_dir() / "pyproject.toml"
247
- content = pyproject_path.read_text()
248
- new_content = re.sub(
249
- r'^(version\s*=\s*)"[^"]+"',
250
- f'\\1"{new_version}"',
251
- content,
252
- flags=re.MULTILINE,
253
- )
254
- pyproject_path.write_text(new_content)
259
+ with open(pyproject_path, "rb") as f:
260
+ data = tomllib.load(f)
261
+
262
+ data["project"]["version"] = new_version
263
+
264
+ with open(pyproject_path, "wb") as f:
265
+ tomli_w.dump(data, f)
255
266
 
256
267
 
257
268
  def update_changelog(new_version: str) -> None:
@@ -10,7 +10,7 @@ import pandas as pd
10
10
 
11
11
  from .datasets import DatasetsManager
12
12
  from .exceptions import DatasetNotFoundError, StrictModeError
13
- from .lineage import FieldSchema, LineageMetadata
13
+ from .lineage import FieldSchema, LineageMetadata, compute_dataframe_hash
14
14
 
15
15
  pd.options.mode.copy_on_write = True
16
16
 
@@ -196,7 +196,6 @@ class DataFrame:
196
196
  # Create lineage metadata
197
197
  lineage = LineageMetadata(project_path=str(manager.project_path))
198
198
  lineage.add_source(dataset)
199
- lineage.add_operation(f"read_dataset({dataset.slug}, format={format})")
200
199
 
201
200
  # Return wrapped DataFrame
202
201
  return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
@@ -294,7 +293,6 @@ class DataFrame:
294
293
  # Create lineage metadata
295
294
  lineage = LineageMetadata(project_path=str(manager.project_path))
296
295
  lineage.add_source(dataset)
297
- lineage.add_operation(f"read_csv({dataset.slug})")
298
296
 
299
297
  # Return wrapped DataFrame
300
298
  return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
@@ -363,11 +361,13 @@ class DataFrame:
363
361
  absolute_path.parent.mkdir(parents=True, exist_ok=True)
364
362
  self.data.to_csv(absolute_path, **kwargs)
365
363
 
366
- # Record the operation
367
- self.lineage.add_operation(f"to_csv({dataset.slug})")
364
+ # Compute content hash for change detection
365
+ content_hash = compute_dataframe_hash(self.data)
368
366
 
369
367
  # Persist lineage metadata to datasets.yaml
370
- manager.update_output_lineage(slug=dataset.slug, lineage=self.lineage, strict=self.strict_mode)
368
+ manager.update_output_lineage(
369
+ slug=dataset.slug, lineage=self.lineage, content_hash=content_hash, strict=self.strict_mode
370
+ )
371
371
 
372
372
  def _infer_field_schema(self) -> List[FieldSchema]:
373
373
  """
@@ -410,11 +410,8 @@ class DataFrame:
410
410
  # Perform the merge
411
411
  merged_data = pd.merge(self.data, right.data, **kwargs)
412
412
 
413
- # Combine lineage
413
+ # Combine lineage (sources from both DataFrames)
414
414
  merged_lineage = self.lineage.merge(right.lineage)
415
- merged_lineage.add_operation(
416
- f"merge(left={len(self.lineage.sources)} sources, right={len(right.lineage.sources)} sources)"
417
- )
418
415
 
419
416
  return DataFrame(
420
417
  data=merged_data,
@@ -437,11 +434,8 @@ class DataFrame:
437
434
  # Perform the join
438
435
  joined_data = self.data.join(other.data, **kwargs)
439
436
 
440
- # Combine lineage
437
+ # Combine lineage (sources from both DataFrames)
441
438
  joined_lineage = self.lineage.merge(other.lineage)
442
- joined_lineage.add_operation(
443
- f"join(left={len(self.lineage.sources)} sources, right={len(other.lineage.sources)} sources)"
444
- )
445
439
 
446
440
  return DataFrame(
447
441
  data=joined_data,
@@ -467,16 +461,11 @@ class DataFrame:
467
461
  # Concatenate
468
462
  concatenated_data = pd.concat(all_dfs, **kwargs)
469
463
 
470
- # Combine lineage from all DataFrames
464
+ # Combine lineage (sources from all DataFrames)
471
465
  combined_lineage = self.lineage
472
466
  for other in others:
473
467
  combined_lineage = combined_lineage.merge(other.lineage)
474
468
 
475
- combined_lineage.add_operation(
476
- f"concat({len(others) + 1} dataframes, "
477
- f"{sum(len(df.lineage.sources) for df in [self] + others)} total sources)"
478
- )
479
-
480
469
  return DataFrame(
481
470
  data=concatenated_data,
482
471
  lineage=combined_lineage,
@@ -484,42 +473,12 @@ class DataFrame:
484
473
  project_path=self.lineage.project_path,
485
474
  )
486
475
 
487
- def apply_operation(self, operation: Callable[[pd.DataFrame], pd.DataFrame], description: str) -> "DataFrame":
488
- """
489
- Apply a transformation operation to the DataFrame.
490
-
491
- Args:
492
- operation: Function that takes a pandas DataFrame and returns a DataFrame.
493
- description: Human-readable description of the operation.
494
-
495
- Returns:
496
- A new DataFrame with the operation applied and recorded in lineage.
497
- """
498
- # Apply the operation
499
- new_data = operation(self.data)
500
-
501
- # Copy lineage and add operation
502
- new_lineage = LineageMetadata(
503
- sources=self.lineage.sources.copy(),
504
- operations=self.lineage.operations.copy(),
505
- project_path=self.lineage.project_path,
506
- )
507
- new_lineage.add_operation(description)
508
-
509
- return DataFrame(
510
- data=new_data,
511
- lineage=new_lineage,
512
- strict=self.strict_mode,
513
- project_path=self.lineage.project_path,
514
- )
515
-
516
- def _wrap_result(self, result: Any, operation: Optional[str] = None) -> Any:
476
+ def _wrap_result(self, result: Any) -> Any:
517
477
  """
518
478
  Wrap a pandas result in a Sunstone DataFrame if applicable.
519
479
 
520
480
  Args:
521
481
  result: The result from a pandas operation.
522
- operation: Name of the operation performed. If None, no operation is recorded.
523
482
 
524
483
  Returns:
525
484
  Wrapped DataFrame if result is a DataFrame, otherwise the result.
@@ -527,11 +486,8 @@ class DataFrame:
527
486
  if isinstance(result, pd.DataFrame):
528
487
  new_lineage = LineageMetadata(
529
488
  sources=self.lineage.sources.copy(),
530
- operations=self.lineage.operations.copy(),
531
489
  project_path=self.lineage.project_path,
532
490
  )
533
- if operation is not None:
534
- new_lineage.add_operation(operation)
535
491
 
536
492
  return DataFrame(
537
493
  data=result,
@@ -541,28 +497,6 @@ class DataFrame:
541
497
  )
542
498
  return result
543
499
 
544
- # Methods that don't represent meaningful data transformations
545
- # These return DataFrames but shouldn't be tracked in lineage
546
- _NON_TRACKING_METHODS = frozenset(
547
- {
548
- # Copy operations - same data, no transformation
549
- "copy",
550
- # Index operations - same data, different index
551
- "reset_index",
552
- "set_index",
553
- "reindex",
554
- # Type conversions without data change
555
- "astype",
556
- "infer_objects",
557
- # Column/index renaming - same data, different labels
558
- "rename",
559
- "rename_axis",
560
- # Reshaping without data loss
561
- "T",
562
- "transpose",
563
- }
564
- )
565
-
566
500
  def __getattr__(self, name: str) -> Any:
567
501
  """
568
502
  Delegate attribute access to the underlying pandas DataFrame.
@@ -583,14 +517,11 @@ class DataFrame:
583
517
 
584
518
  def wrapper(*args: Any, **kwargs: Any) -> Any:
585
519
  result = attr(*args, **kwargs)
586
- # Don't track non-transforming methods
587
- if name in DataFrame._NON_TRACKING_METHODS:
588
- return self._wrap_result(result, operation=None)
589
- return self._wrap_result(result, operation=f"{name}")
520
+ return self._wrap_result(result)
590
521
 
591
522
  return wrapper
592
523
 
593
- return self._wrap_result(attr, operation=None) # Don't track attribute access
524
+ return self._wrap_result(attr)
594
525
 
595
526
  def __getitem__(self, key: Any) -> Any:
596
527
  """
@@ -603,9 +534,7 @@ class DataFrame:
603
534
  The item from the underlying DataFrame, wrapped if it's a DataFrame.
604
535
  """
605
536
  result = self.data[key]
606
- # Don't track __getitem__ as an operation - it's just column/row access
607
- # not a meaningful transformation
608
- return self._wrap_result(result, operation=None)
537
+ return self._wrap_result(result)
609
538
 
610
539
  def __setitem__(self, key: Any, value: Any) -> None:
611
540
  """
@@ -616,14 +545,12 @@ class DataFrame:
616
545
  value: Value to assign.
617
546
  """
618
547
  self.data[key] = value
619
- # Track column assignment in lineage
620
- self.lineage.add_operation(f"__setitem__({key!r})")
548
+ # Don't track column assignments automatically
549
+ # Users should use add_operation() for meaningful transformations
621
550
 
622
551
  def __repr__(self) -> str:
623
552
  """String representation of the DataFrame."""
624
- lineage_info = (
625
- f"\n\nLineage: {len(self.lineage.sources)} source(s), {len(self.lineage.operations)} operation(s)"
626
- )
553
+ lineage_info = f"\n\nLineage: {len(self.lineage.sources)} source(s)"
627
554
  return repr(self.data) + lineage_info
628
555
 
629
556
  def __str__(self) -> str:
@@ -380,22 +380,30 @@ class DatasetsManager:
380
380
 
381
381
  raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
382
382
 
383
- def update_output_lineage(self, slug: str, lineage: LineageMetadata, strict: bool = False) -> None:
383
+ def update_output_lineage(
384
+ self, slug: str, lineage: LineageMetadata, content_hash: str, strict: bool = False
385
+ ) -> None:
384
386
  """
385
387
  Update lineage metadata for an output dataset.
386
388
 
389
+ The timestamp is only updated when the content hash changes, preventing
390
+ unnecessary updates when the data hasn't changed.
391
+
387
392
  In strict mode, validates that the lineage matches what would be written
388
393
  without modifying the file. In relaxed mode, updates the file with lineage.
389
394
 
390
395
  Args:
391
396
  slug: The slug of the output dataset to update.
392
397
  lineage: The lineage metadata to persist.
398
+ content_hash: SHA256 hash of the DataFrame content.
393
399
  strict: If True, validate without modifying. If False, update the file.
394
400
 
395
401
  Raises:
396
402
  DatasetNotFoundError: If the dataset doesn't exist.
397
403
  DatasetValidationError: In strict mode, if lineage differs from what's in the file.
398
404
  """
405
+ from datetime import datetime
406
+
399
407
  # Find the output dataset
400
408
  dataset_idx = None
401
409
  for i, dataset_data in enumerate(self._data["outputs"]):
@@ -406,6 +414,21 @@ class DatasetsManager:
406
414
  if dataset_idx is None:
407
415
  raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
408
416
 
417
+ # Get existing lineage data if present
418
+ existing_lineage = self._data["outputs"][dataset_idx].get("lineage", {})
419
+ existing_hash = existing_lineage.get("content_hash")
420
+ existing_timestamp = existing_lineage.get("created_at")
421
+
422
+ # Determine if content has changed
423
+ content_changed = existing_hash != content_hash
424
+
425
+ # Only update timestamp if content changed
426
+ if content_changed:
427
+ timestamp = datetime.now().isoformat()
428
+ else:
429
+ # Preserve existing timestamp
430
+ timestamp = existing_timestamp
431
+
409
432
  # Build lineage metadata to add
410
433
  lineage_data: dict[str, Any] = {}
411
434
 
@@ -414,15 +437,14 @@ class DatasetsManager:
414
437
  {
415
438
  "slug": src.slug,
416
439
  "name": src.name,
440
+ "location": src.location,
417
441
  }
418
442
  for src in lineage.sources
419
443
  ]
420
444
 
421
- if lineage.operations:
422
- lineage_data["operations"] = lineage.operations.copy()
423
-
424
- if lineage.created_at:
425
- lineage_data["created_at"] = lineage.created_at.isoformat()
445
+ lineage_data["content_hash"] = content_hash
446
+ if timestamp:
447
+ lineage_data["created_at"] = timestamp
426
448
 
427
449
  # Create a copy of the data with updated lineage
428
450
  updated_data = self._data.copy()
@@ -2,9 +2,13 @@
2
2
  Lineage metadata structures for tracking data provenance.
3
3
  """
4
4
 
5
+ import hashlib
5
6
  from dataclasses import dataclass, field
6
7
  from datetime import datetime
7
- from typing import Any, Dict, List, Optional
8
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
9
+
10
+ if TYPE_CHECKING:
11
+ import pandas as pd
8
12
 
9
13
 
10
14
  @dataclass
@@ -88,23 +92,41 @@ class DatasetMetadata:
88
92
  """Type of dataset: 'input' or 'output'."""
89
93
 
90
94
 
95
+ def compute_dataframe_hash(df: "pd.DataFrame") -> str:
96
+ """
97
+ Compute a fast SHA256 hash of a pandas DataFrame's content.
98
+
99
+ Uses pickle serialization for a consistent, fast representation of the data.
100
+
101
+ Args:
102
+ df: The pandas DataFrame to hash.
103
+
104
+ Returns:
105
+ A SHA256 hex digest string representing the DataFrame content.
106
+ """
107
+ import pickle
108
+
109
+ # Use pickle protocol 5 for efficiency; hash the bytes directly
110
+ data_bytes = pickle.dumps(df, protocol=5)
111
+ return hashlib.sha256(data_bytes).hexdigest()
112
+
113
+
91
114
  @dataclass
92
115
  class LineageMetadata:
93
116
  """
94
117
  Lineage metadata tracking the provenance of data in a DataFrame.
95
118
 
96
- This tracks all source datasets that contributed to the current DataFrame,
97
- including information about transformations and operations performed.
119
+ This tracks all source datasets that contributed to the current DataFrame.
98
120
  """
99
121
 
100
122
  sources: List[DatasetMetadata] = field(default_factory=list)
101
123
  """List of source datasets that contributed to this data."""
102
124
 
103
- operations: List[str] = field(default_factory=list)
104
- """List of operations performed on the data."""
125
+ created_at: Optional[datetime] = None
126
+ """Timestamp when this lineage was last updated (content changed)."""
105
127
 
106
- created_at: datetime = field(default_factory=datetime.now)
107
- """Timestamp when this lineage was created."""
128
+ content_hash: Optional[str] = None
129
+ """SHA256 hash of the DataFrame content, used to detect changes."""
108
130
 
109
131
  project_path: Optional[str] = None
110
132
  """Path to the project directory containing datasets.yaml."""
@@ -119,15 +141,6 @@ class LineageMetadata:
119
141
  if dataset not in self.sources:
120
142
  self.sources.append(dataset)
121
143
 
122
- def add_operation(self, operation: str) -> None:
123
- """
124
- Record an operation performed on the data.
125
-
126
- Args:
127
- operation: Description of the operation.
128
- """
129
- self.operations.append(operation)
130
-
131
144
  def merge(self, other: "LineageMetadata") -> "LineageMetadata":
132
145
  """
133
146
  Merge lineage from another DataFrame.
@@ -136,12 +149,10 @@ class LineageMetadata:
136
149
  other: The other lineage metadata to merge.
137
150
 
138
151
  Returns:
139
- A new LineageMetadata with combined sources and operations.
152
+ A new LineageMetadata with combined sources.
140
153
  """
141
154
  merged = LineageMetadata(
142
155
  sources=self.sources.copy(),
143
- operations=self.operations.copy(),
144
- created_at=datetime.now(),
145
156
  project_path=self.project_path or other.project_path,
146
157
  )
147
158
 
@@ -150,9 +161,6 @@ class LineageMetadata:
150
161
  if source not in merged.sources:
151
162
  merged.sources.append(source)
152
163
 
153
- # Combine operations
154
- merged.operations.extend(other.operations)
155
-
156
164
  return merged
157
165
 
158
166
  def get_licenses(self) -> List[str]:
@@ -175,16 +183,18 @@ class LineageMetadata:
175
183
  Returns:
176
184
  Dictionary containing lineage information.
177
185
  """
178
- return {
186
+ result: Dict[str, Any] = {
179
187
  "sources": [
180
188
  {
181
- "name": src.name,
182
189
  "slug": src.slug,
190
+ "name": src.name,
183
191
  "location": src.location,
184
192
  }
185
193
  for src in self.sources
186
194
  ],
187
- "operations": self.operations,
188
- "created_at": self.created_at.isoformat(),
189
- "licenses": self.get_licenses(),
190
195
  }
196
+ if self.created_at is not None:
197
+ result["created_at"] = self.created_at.isoformat()
198
+ if self.content_hash is not None:
199
+ result["content_hash"] = self.content_hash
200
+ return result
@@ -1,22 +1,20 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sunstone-py
3
- Version: 0.5.1
3
+ Version: 0.5.3
4
4
  Summary: Python library for managing datasets with lineage tracking in Sunstone projects
5
5
  Author-email: Sunstone Institute <stig@sunstone.institute>
6
6
  License: MIT
7
7
  Project-URL: Homepage, https://github.com/sunstoneinstitute/sunstone-py
8
8
  Project-URL: Documentation, https://sunstoneinstitute.github.io/sunstone-py/
9
9
  Project-URL: Repository, https://github.com/sunstoneinstitute/sunstone-py
10
- Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Development Status :: 4 - Beta
11
11
  Classifier: Intended Audience :: Science/Research
12
12
  Classifier: License :: OSI Approved :: MIT License
13
13
  Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.10
15
- Classifier: Programming Language :: Python :: 3.11
16
14
  Classifier: Programming Language :: Python :: 3.12
17
15
  Classifier: Programming Language :: Python :: 3.13
18
16
  Classifier: Programming Language :: Python :: 3.14
19
- Requires-Python: >=3.10
17
+ Requires-Python: >=3.12
20
18
  Description-Content-Type: text/markdown
21
19
  License-File: LICENSE
22
20
  Requires-Dist: frictionless>=5.18.1
@@ -25,20 +25,19 @@ class TestDataFrameBasics:
25
25
  assert len(df.data) > 0
26
26
  assert len(df.data.columns) > 0
27
27
  assert len(df.lineage.sources) > 0
28
- assert df.lineage.operations is not None
29
28
 
30
- def test_apply_operation(self, project_path: Path) -> None:
31
- """Test applying an operation to a DataFrame."""
29
+ def test_head_preserves_lineage(self, project_path: Path) -> None:
30
+ """Test that head() preserves lineage."""
32
31
  df = sunstone.DataFrame.read_csv(
33
32
  "inputs/official_un_member_states_raw.csv",
34
33
  project_path=project_path,
35
34
  strict=False,
36
35
  )
37
36
 
38
- filtered = df.apply_operation(lambda d: d.head(10), description="Select first 10 rows")
37
+ filtered = df.head(10)
39
38
 
40
39
  assert len(filtered.data) == 10
41
- assert len(filtered.lineage.operations) > len(df.lineage.operations)
40
+ assert len(filtered.lineage.sources) == len(df.lineage.sources)
42
41
 
43
42
  def test_read_second_dataset(self, project_path: Path) -> None:
44
43
  """Test reading the same dataset twice creates separate lineage."""
@@ -68,10 +67,7 @@ class TestDataFrameMerge:
68
67
  strict=False,
69
68
  )
70
69
  # Filter to create a subset
71
- return df.apply_operation(
72
- lambda d: d[d["ISO Code"].notna()].head(50),
73
- description="Select first 50 countries with ISO codes",
74
- )
70
+ return df[df.data["ISO Code"].notna()].head(50)
75
71
 
76
72
  @pytest.fixture
77
73
  def un_members_df2(self, project_path: Path) -> Any:
@@ -82,10 +78,7 @@ class TestDataFrameMerge:
82
78
  strict=False,
83
79
  )
84
80
  # Select different columns as a second dataset
85
- return df.apply_operation(
86
- lambda d: d[["Member State", "ISO Code", "Start date"]].dropna(),
87
- description="Select subset of columns",
88
- )
81
+ return df[["Member State", "ISO Code", "Start date"]].dropna()
89
82
 
90
83
  def test_merge_dataframes(self, un_members_df1: Any, un_members_df2: Any) -> None:
91
84
  """Test merging two DataFrames."""
@@ -95,7 +88,6 @@ class TestDataFrameMerge:
95
88
  assert len(merged.data) > 0
96
89
  # Both sources come from the same file, but lineage should track them separately
97
90
  assert len(merged.lineage.sources) >= 1
98
- assert len(merged.lineage.operations) > 0
99
91
 
100
92
  def test_merge_lineage_tracking(self, un_members_df1: Any, un_members_df2: Any) -> None:
101
93
  """Test that merge properly tracks lineage."""
@@ -117,11 +109,9 @@ class TestLineageMetadata:
117
109
  project_path=project_path,
118
110
  strict=False,
119
111
  )
120
- # Apply some operations to build lineage
121
- filtered = un_members.apply_operation(
122
- lambda d: d[d["ISO Code"].notna()], description="Filter countries with ISO codes"
123
- )
124
- return filtered.apply_operation(lambda d: d.head(100), description="Select first 100 countries")
112
+ # Apply some operations
113
+ filtered = un_members[un_members.data["ISO Code"].notna()]
114
+ return filtered.head(100)
125
115
 
126
116
  def test_lineage_to_dict(self, processed_df: Any) -> None:
127
117
  """Test converting lineage to dictionary."""
@@ -129,11 +119,8 @@ class TestLineageMetadata:
129
119
 
130
120
  assert lineage_dict is not None
131
121
  assert "sources" in lineage_dict
132
- assert "operations" in lineage_dict
133
- assert "created_at" in lineage_dict
134
- assert "licenses" in lineage_dict
122
+ # created_at is only set when writing output (not when reading)
135
123
  assert len(lineage_dict["sources"]) > 0
136
- assert len(lineage_dict["operations"]) > 0
137
124
 
138
125
 
139
126
  class TestStrictMode:
@@ -172,8 +159,8 @@ class TestReadDataset:
172
159
  assert len(df.data) > 0
173
160
  assert len(df.data.columns) > 0
174
161
  assert len(df.lineage.sources) > 0
175
- # Check that the lineage operation mentions the format
176
- assert any("format=csv" in op for op in df.lineage.operations)
162
+ # Check that the source is tracked
163
+ assert df.lineage.sources[0].slug == "official-un-member-states"
177
164
 
178
165
  def test_read_dataset_with_explicit_format(self, project_path: Path) -> None:
179
166
  """Test reading a dataset with explicit format override."""
@@ -186,7 +173,7 @@ class TestReadDataset:
186
173
 
187
174
  assert df is not None
188
175
  assert len(df.data) > 0
189
- assert any("format=csv" in op for op in df.lineage.operations)
176
+ assert len(df.lineage.sources) > 0
190
177
 
191
178
  def test_read_dataset_slug_not_found(self, project_path: Path) -> None:
192
179
  """Test that reading non-existent slug raises error."""
@@ -221,5 +208,159 @@ class TestReadDataset:
221
208
 
222
209
  assert df is not None
223
210
  assert len(df.data) > 0
224
- # Should have the read_dataset operation in lineage
225
- assert any("read_dataset" in op for op in df.lineage.operations)
211
+ # Check that the source is tracked
212
+ assert len(df.lineage.sources) > 0
213
+
214
+
215
+ class TestContentHashLineage:
216
+ """Tests for content-hash based lineage tracking."""
217
+
218
+ def test_content_hash_computed_on_save(self, project_path: Path, tmp_path: Path) -> None:
219
+ """Test that content hash is computed and saved when writing output."""
220
+ import shutil
221
+
222
+ from ruamel.yaml import YAML
223
+
224
+ # Create a copy of the project in tmp_path to avoid modifying original
225
+ test_project = tmp_path / "test_project"
226
+ shutil.copytree(project_path, test_project)
227
+
228
+ df = sunstone.DataFrame.read_csv(
229
+ "inputs/official_un_member_states_raw.csv",
230
+ project_path=test_project,
231
+ strict=False,
232
+ )
233
+
234
+ # Write the output
235
+ output_path = "outputs/test_output.csv"
236
+ df.to_csv(output_path, slug="test-output", name="Test Output", index=False)
237
+
238
+ # Read the datasets.yaml and check for content_hash
239
+ yaml = YAML()
240
+ with open(test_project / "datasets.yaml") as f:
241
+ data = yaml.load(f)
242
+
243
+ # Find the output dataset
244
+ output = next((d for d in data.get("outputs", []) if d["slug"] == "test-output"), None)
245
+ assert output is not None
246
+ assert "lineage" in output
247
+ assert "content_hash" in output["lineage"]
248
+ assert "created_at" in output["lineage"]
249
+ # Hash should be a 64-character hex string (SHA256)
250
+ assert len(output["lineage"]["content_hash"]) == 64
251
+
252
+ def test_timestamp_not_updated_when_content_unchanged(self, project_path: Path, tmp_path: Path) -> None:
253
+ """Test that timestamp stays the same when saving identical content."""
254
+ import shutil
255
+ import time
256
+
257
+ from ruamel.yaml import YAML
258
+
259
+ # Create a copy of the project in tmp_path
260
+ test_project = tmp_path / "test_project"
261
+ shutil.copytree(project_path, test_project)
262
+
263
+ df = sunstone.DataFrame.read_csv(
264
+ "inputs/official_un_member_states_raw.csv",
265
+ project_path=test_project,
266
+ strict=False,
267
+ )
268
+
269
+ output_path = "outputs/stable_output.csv"
270
+
271
+ # First write
272
+ df.to_csv(output_path, slug="stable-output", name="Stable Output", index=False)
273
+
274
+ # Read the first timestamp and hash
275
+ yaml = YAML()
276
+ with open(test_project / "datasets.yaml") as f:
277
+ data1 = yaml.load(f)
278
+
279
+ output1 = next((d for d in data1.get("outputs", []) if d["slug"] == "stable-output"), None)
280
+ assert output1 is not None
281
+ first_timestamp = output1["lineage"]["created_at"]
282
+ first_hash = output1["lineage"]["content_hash"]
283
+
284
+ # Wait a bit to ensure different timestamp would be generated
285
+ time.sleep(0.1)
286
+
287
+ # Reload the manager and write again with the same data
288
+ df2 = sunstone.DataFrame.read_csv(
289
+ "inputs/official_un_member_states_raw.csv",
290
+ project_path=test_project,
291
+ strict=False,
292
+ )
293
+ df2.to_csv(output_path, slug="stable-output", name="Stable Output", index=False)
294
+
295
+ # Read the second timestamp and hash
296
+ with open(test_project / "datasets.yaml") as f:
297
+ data2 = yaml.load(f)
298
+
299
+ output2 = next((d for d in data2.get("outputs", []) if d["slug"] == "stable-output"), None)
300
+ assert output2 is not None
301
+ second_timestamp = output2["lineage"]["created_at"]
302
+ second_hash = output2["lineage"]["content_hash"]
303
+
304
+ # Hash should be the same
305
+ assert first_hash == second_hash
306
+ # Timestamp should NOT have changed since content is identical
307
+ assert first_timestamp == second_timestamp
308
+
309
+ def test_timestamp_updated_when_content_changes(self, project_path: Path, tmp_path: Path) -> None:
310
+ """Test that timestamp is updated when content actually changes."""
311
+ import shutil
312
+ import time
313
+
314
+ from ruamel.yaml import YAML
315
+
316
+ # Create a copy of the project in tmp_path
317
+ test_project = tmp_path / "test_project"
318
+ shutil.copytree(project_path, test_project)
319
+
320
+ df = sunstone.DataFrame.read_csv(
321
+ "inputs/official_un_member_states_raw.csv",
322
+ project_path=test_project,
323
+ strict=False,
324
+ )
325
+
326
+ output_path = "outputs/changing_output.csv"
327
+
328
+ # First write
329
+ df.to_csv(output_path, slug="changing-output", name="Changing Output", index=False)
330
+
331
+ # Read the first timestamp and hash
332
+ yaml = YAML()
333
+ with open(test_project / "datasets.yaml") as f:
334
+ data1 = yaml.load(f)
335
+
336
+ output1 = next((d for d in data1.get("outputs", []) if d["slug"] == "changing-output"), None)
337
+ assert output1 is not None
338
+ first_timestamp = output1["lineage"]["created_at"]
339
+ first_hash = output1["lineage"]["content_hash"]
340
+
341
+ # Wait a bit to ensure different timestamp
342
+ time.sleep(0.1)
343
+
344
+ # Modify the data and write again
345
+ df2 = sunstone.DataFrame.read_csv(
346
+ "inputs/official_un_member_states_raw.csv",
347
+ project_path=test_project,
348
+ strict=False,
349
+ )
350
+ # Actually modify the content - take only first 10 rows
351
+ df2_modified = df2.head(10)
352
+ df2_modified.to_csv(output_path, slug="changing-output", name="Changing Output", index=False)
353
+
354
+ # Read the second timestamp and hash
355
+ with open(test_project / "datasets.yaml") as f:
356
+ data2 = yaml.load(f)
357
+
358
+ output2 = next((d for d in data2.get("outputs", []) if d["slug"] == "changing-output"), None)
359
+ assert output2 is not None
360
+ second_timestamp = output2["lineage"]["created_at"]
361
+ second_hash = output2["lineage"]["content_hash"]
362
+
363
+ # Hash should be different since content changed
364
+ assert first_hash != second_hash
365
+ # Timestamp SHOULD have changed since content is different
366
+ assert first_timestamp != second_timestamp
@@ -22,26 +22,17 @@ class TestLineagePersistence:
22
22
  assert hasattr(result, "lineage")
23
23
  assert len(result.lineage.sources) == len(df.lineage.sources)
24
24
 
25
- # Check operation tracking
26
- # We expect the operation to be recorded, ideally
27
- assert any("head" in op for op in result.lineage.operations)
28
-
29
25
  def test_getitem_preserves_lineage(self, project_path: Path) -> None:
30
26
  """Test that boolean indexing/getitem returns sunstone DataFrame."""
31
27
  df = sunstone.DataFrame.read_csv(
32
28
  "inputs/official_un_member_states_raw.csv", project_path=project_path, strict=False
33
29
  )
34
30
 
35
- # Boolean masking (returns DataFrame)
36
- # Assuming 'Year' or some column exists, checking columns first
37
- # Using the columns we saw in previous turns or just slicing
38
-
39
31
  # Let's just slice columns, which returns a DataFrame
40
32
  result = df[["Member State", "ISO Code"]]
41
33
 
42
34
  assert isinstance(result, sunstone.DataFrame)
43
35
  assert len(result.lineage.sources) == len(df.lineage.sources)
44
- # Operation tracking for getitem might be tricky to name perfectly, but should exist
45
36
 
46
37
  def test_sort_values_preserves_lineage(self, project_path: Path) -> None:
47
38
  """Test that sort_values returns sunstone DataFrame."""
@@ -53,17 +44,16 @@ class TestLineagePersistence:
53
44
 
54
45
  assert isinstance(result, sunstone.DataFrame)
55
46
  assert len(result.lineage.sources) == len(df.lineage.sources)
56
- assert any("sort_values" in op for op in result.lineage.operations)
57
47
 
58
48
  def test_setitem_preserves_lineage(self, project_path: Path) -> None:
59
- """Test that in-place modification tracks lineage."""
49
+ """Test that in-place modification preserves lineage."""
60
50
  df = sunstone.DataFrame.read_csv(
61
51
  "inputs/official_un_member_states_raw.csv", project_path=project_path, strict=False
62
52
  )
63
53
 
64
- initial_ops = len(df.lineage.operations)
54
+ initial_sources = len(df.lineage.sources)
65
55
  df["NewCol"] = 1
66
56
 
67
57
  assert "NewCol" in df.data.columns
68
- assert len(df.lineage.operations) > initial_ops
69
- assert any("__setitem__" in op for op in df.lineage.operations)
58
+ # Lineage sources should be preserved after setitem
59
+ assert len(df.lineage.sources) == initial_sources
@@ -131,10 +131,11 @@ class TestSelectionAndIndexing:
131
131
  """Test setting column values like pandas."""
132
132
  # Create a copy to avoid modifying fixture
133
133
  df = sample_df.head()
134
+ initial_sources = len(df.lineage.sources)
134
135
  df["test_column"] = "test_value"
135
136
  assert "test_column" in df.columns
136
- # Lineage should track this operation
137
- assert any("setitem" in op.lower() for op in df.lineage.operations)
137
+ # Lineage sources should be preserved
138
+ assert len(df.lineage.sources) == initial_sources
138
139
 
139
140
 
140
141
  class TestDataManipulation:
File without changes
File without changes
File without changes