sunstone-py 0.5.1__tar.gz → 0.5.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/PKG-INFO +3 -5
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/pyproject.toml +20 -13
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/_release.py +23 -12
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/dataframe.py +16 -89
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/datasets.py +28 -6
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/lineage.py +37 -27
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/PKG-INFO +3 -5
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/tests/test_dataframe.py +169 -28
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/tests/test_lineage_persistence.py +4 -14
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/tests/test_pandas_compatibility.py +3 -2
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/LICENSE +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/README.md +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/setup.cfg +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/__init__.py +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/exceptions.py +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/pandas.py +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/py.typed +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone/validation.py +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/SOURCES.txt +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/dependency_links.txt +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/entry_points.txt +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/requires.txt +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/src/sunstone_py.egg-info/top_level.txt +0 -0
- {sunstone_py-0.5.1 → sunstone_py-0.5.3}/tests/test_datasets.py +0 -0
|
@@ -1,22 +1,20 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sunstone-py
|
|
3
|
-
Version: 0.5.1
|
|
3
|
+
Version: 0.5.3
|
|
4
4
|
Summary: Python library for managing datasets with lineage tracking in Sunstone projects
|
|
5
5
|
Author-email: Sunstone Institute <stig@sunstone.institute>
|
|
6
6
|
License: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/sunstoneinstitute/sunstone-py
|
|
8
8
|
Project-URL: Documentation, https://sunstoneinstitute.github.io/sunstone-py/
|
|
9
9
|
Project-URL: Repository, https://github.com/sunstoneinstitute/sunstone-py
|
|
10
|
-
Classifier: Development Status ::
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
11
|
Classifier: Intended Audience :: Science/Research
|
|
12
12
|
Classifier: License :: OSI Approved :: MIT License
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
16
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
15
|
Classifier: Programming Language :: Python :: 3.13
|
|
18
16
|
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
-
Requires-Python: >=3.10
|
|
17
|
+
Requires-Python: >=3.12
|
|
20
18
|
Description-Content-Type: text/markdown
|
|
21
19
|
License-File: LICENSE
|
|
22
20
|
Requires-Dist: frictionless>=5.18.1
|
|
@@ -1,29 +1,28 @@
|
|
|
1
1
|
[build-system]
|
|
2
|
-
requires = [
|
|
2
|
+
requires = [
|
|
3
|
+
"setuptools>=61.0",
|
|
4
|
+
"wheel",
|
|
5
|
+
]
|
|
3
6
|
build-backend = "setuptools.build_meta"
|
|
4
7
|
|
|
5
8
|
[project]
|
|
6
9
|
name = "sunstone-py"
|
|
7
|
-
version = "0.5.1"
|
|
10
|
+
version = "0.5.3"
|
|
8
11
|
description = "Python library for managing datasets with lineage tracking in Sunstone projects"
|
|
9
12
|
readme = "README.md"
|
|
10
|
-
requires-python = ">=3.10"
|
|
11
|
-
license = {text = "MIT"}
|
|
13
|
+
requires-python = ">=3.12"
|
|
12
14
|
authors = [
|
|
13
|
-
{name = "Sunstone Institute", email = "stig@sunstone.institute"}
|
|
15
|
+
{ name = "Sunstone Institute", email = "stig@sunstone.institute" },
|
|
14
16
|
]
|
|
15
17
|
classifiers = [
|
|
16
|
-
"Development Status ::
|
|
18
|
+
"Development Status :: 4 - Beta",
|
|
17
19
|
"Intended Audience :: Science/Research",
|
|
18
20
|
"License :: OSI Approved :: MIT License",
|
|
19
21
|
"Programming Language :: Python :: 3",
|
|
20
|
-
"Programming Language :: Python :: 3.10",
|
|
21
|
-
"Programming Language :: Python :: 3.11",
|
|
22
22
|
"Programming Language :: Python :: 3.12",
|
|
23
23
|
"Programming Language :: Python :: 3.13",
|
|
24
24
|
"Programming Language :: Python :: 3.14",
|
|
25
25
|
]
|
|
26
|
-
|
|
27
26
|
dependencies = [
|
|
28
27
|
"frictionless>=5.18.1",
|
|
29
28
|
"google-auth>=2.43.0",
|
|
@@ -33,6 +32,9 @@ dependencies = [
|
|
|
33
32
|
"ruamel-yaml>=0.18",
|
|
34
33
|
]
|
|
35
34
|
|
|
35
|
+
[project.license]
|
|
36
|
+
text = "MIT"
|
|
37
|
+
|
|
36
38
|
[project.urls]
|
|
37
39
|
Homepage = "https://github.com/sunstoneinstitute/sunstone-py"
|
|
38
40
|
Documentation = "https://sunstoneinstitute.github.io/sunstone-py/"
|
|
@@ -42,21 +44,25 @@ Repository = "https://github.com/sunstoneinstitute/sunstone-py"
|
|
|
42
44
|
release = "sunstone._release:main"
|
|
43
45
|
|
|
44
46
|
[tool.setuptools.packages.find]
|
|
45
|
-
where = [
|
|
47
|
+
where = [
|
|
48
|
+
"src",
|
|
49
|
+
]
|
|
46
50
|
|
|
47
51
|
[tool.setuptools.package-data]
|
|
48
|
-
sunstone = [
|
|
52
|
+
sunstone = [
|
|
53
|
+
"py.typed",
|
|
54
|
+
]
|
|
49
55
|
|
|
50
56
|
[tool.ruff]
|
|
51
57
|
line-length = 120
|
|
52
|
-
target-version = "
|
|
58
|
+
target-version = "py312"
|
|
53
59
|
|
|
54
60
|
[tool.ruff.format]
|
|
55
61
|
quote-style = "double"
|
|
56
62
|
indent-style = "space"
|
|
57
63
|
|
|
58
64
|
[tool.mypy]
|
|
59
|
-
python_version = "3.
|
|
65
|
+
python_version = "3.12"
|
|
60
66
|
warn_return_any = true
|
|
61
67
|
warn_unused_configs = true
|
|
62
68
|
disallow_untyped_defs = true
|
|
@@ -75,6 +81,7 @@ dev = [
|
|
|
75
81
|
"pandas-stubs>=2.3.2.250926",
|
|
76
82
|
"types-pyyaml>=6.0.12.20250915",
|
|
77
83
|
"markdown>=3.10",
|
|
84
|
+
"tomli-w>=1.2.0",
|
|
78
85
|
]
|
|
79
86
|
docs = [
|
|
80
87
|
"mkdocs-material>=9.5.0",
|
|
@@ -13,6 +13,17 @@ import sys
|
|
|
13
13
|
from datetime import date
|
|
14
14
|
from pathlib import Path
|
|
15
15
|
|
|
16
|
+
try:
|
|
17
|
+
import tomllib
|
|
18
|
+
except ModuleNotFoundError:
|
|
19
|
+
import tomli as tomllib # type: ignore[import-not-found,no-redef]
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
import tomli_w
|
|
23
|
+
except ModuleNotFoundError:
|
|
24
|
+
print("Error: tomli_w not found. Install with: uv add --dev tomli-w", file=sys.stderr)
|
|
25
|
+
sys.exit(1)
|
|
26
|
+
|
|
16
27
|
|
|
17
28
|
def get_root_dir() -> Path:
|
|
18
29
|
"""Get the root directory (where pyproject.toml lives)."""
|
|
@@ -216,12 +227,13 @@ def confirm_release(new_version: str) -> bool:
|
|
|
216
227
|
def get_current_version() -> str:
|
|
217
228
|
"""Get the current version from pyproject.toml."""
|
|
218
229
|
pyproject_path = get_root_dir() / "pyproject.toml"
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
230
|
+
with open(pyproject_path, "rb") as f:
|
|
231
|
+
data = tomllib.load(f)
|
|
232
|
+
version = data.get("project", {}).get("version")
|
|
233
|
+
if not version:
|
|
222
234
|
print("Error: Could not find version in pyproject.toml", file=sys.stderr)
|
|
223
235
|
sys.exit(1)
|
|
224
|
-
return
|
|
236
|
+
return str(version)
|
|
225
237
|
|
|
226
238
|
|
|
227
239
|
def bump_version(version: str, bump: str) -> str:
|
|
@@ -244,14 +256,13 @@ def bump_version(version: str, bump: str) -> str:
|
|
|
244
256
|
def update_pyproject_version(new_version: str) -> None:
|
|
245
257
|
"""Update the version in pyproject.toml."""
|
|
246
258
|
pyproject_path = get_root_dir() / "pyproject.toml"
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
pyproject_path.write_text(new_content)
|
|
259
|
+
with open(pyproject_path, "rb") as f:
|
|
260
|
+
data = tomllib.load(f)
|
|
261
|
+
|
|
262
|
+
data["project"]["version"] = new_version
|
|
263
|
+
|
|
264
|
+
with open(pyproject_path, "wb") as f:
|
|
265
|
+
tomli_w.dump(data, f)
|
|
255
266
|
|
|
256
267
|
|
|
257
268
|
def update_changelog(new_version: str) -> None:
|
|
@@ -10,7 +10,7 @@ import pandas as pd
|
|
|
10
10
|
|
|
11
11
|
from .datasets import DatasetsManager
|
|
12
12
|
from .exceptions import DatasetNotFoundError, StrictModeError
|
|
13
|
-
from .lineage import FieldSchema, LineageMetadata
|
|
13
|
+
from .lineage import FieldSchema, LineageMetadata, compute_dataframe_hash
|
|
14
14
|
|
|
15
15
|
pd.options.mode.copy_on_write = True
|
|
16
16
|
|
|
@@ -196,7 +196,6 @@ class DataFrame:
|
|
|
196
196
|
# Create lineage metadata
|
|
197
197
|
lineage = LineageMetadata(project_path=str(manager.project_path))
|
|
198
198
|
lineage.add_source(dataset)
|
|
199
|
-
lineage.add_operation(f"read_dataset({dataset.slug}, format={format})")
|
|
200
199
|
|
|
201
200
|
# Return wrapped DataFrame
|
|
202
201
|
return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
|
|
@@ -294,7 +293,6 @@ class DataFrame:
|
|
|
294
293
|
# Create lineage metadata
|
|
295
294
|
lineage = LineageMetadata(project_path=str(manager.project_path))
|
|
296
295
|
lineage.add_source(dataset)
|
|
297
|
-
lineage.add_operation(f"read_csv({dataset.slug})")
|
|
298
296
|
|
|
299
297
|
# Return wrapped DataFrame
|
|
300
298
|
return cls(data=df, lineage=lineage, strict=strict, project_path=project_path)
|
|
@@ -363,11 +361,13 @@ class DataFrame:
|
|
|
363
361
|
absolute_path.parent.mkdir(parents=True, exist_ok=True)
|
|
364
362
|
self.data.to_csv(absolute_path, **kwargs)
|
|
365
363
|
|
|
366
|
-
#
|
|
367
|
-
self.
|
|
364
|
+
# Compute content hash for change detection
|
|
365
|
+
content_hash = compute_dataframe_hash(self.data)
|
|
368
366
|
|
|
369
367
|
# Persist lineage metadata to datasets.yaml
|
|
370
|
-
manager.update_output_lineage(
|
|
368
|
+
manager.update_output_lineage(
|
|
369
|
+
slug=dataset.slug, lineage=self.lineage, content_hash=content_hash, strict=self.strict_mode
|
|
370
|
+
)
|
|
371
371
|
|
|
372
372
|
def _infer_field_schema(self) -> List[FieldSchema]:
|
|
373
373
|
"""
|
|
@@ -410,11 +410,8 @@ class DataFrame:
|
|
|
410
410
|
# Perform the merge
|
|
411
411
|
merged_data = pd.merge(self.data, right.data, **kwargs)
|
|
412
412
|
|
|
413
|
-
# Combine lineage
|
|
413
|
+
# Combine lineage (sources from both DataFrames)
|
|
414
414
|
merged_lineage = self.lineage.merge(right.lineage)
|
|
415
|
-
merged_lineage.add_operation(
|
|
416
|
-
f"merge(left={len(self.lineage.sources)} sources, right={len(right.lineage.sources)} sources)"
|
|
417
|
-
)
|
|
418
415
|
|
|
419
416
|
return DataFrame(
|
|
420
417
|
data=merged_data,
|
|
@@ -437,11 +434,8 @@ class DataFrame:
|
|
|
437
434
|
# Perform the join
|
|
438
435
|
joined_data = self.data.join(other.data, **kwargs)
|
|
439
436
|
|
|
440
|
-
# Combine lineage
|
|
437
|
+
# Combine lineage (sources from both DataFrames)
|
|
441
438
|
joined_lineage = self.lineage.merge(other.lineage)
|
|
442
|
-
joined_lineage.add_operation(
|
|
443
|
-
f"join(left={len(self.lineage.sources)} sources, right={len(other.lineage.sources)} sources)"
|
|
444
|
-
)
|
|
445
439
|
|
|
446
440
|
return DataFrame(
|
|
447
441
|
data=joined_data,
|
|
@@ -467,16 +461,11 @@ class DataFrame:
|
|
|
467
461
|
# Concatenate
|
|
468
462
|
concatenated_data = pd.concat(all_dfs, **kwargs)
|
|
469
463
|
|
|
470
|
-
# Combine lineage from all DataFrames
|
|
464
|
+
# Combine lineage (sources from all DataFrames)
|
|
471
465
|
combined_lineage = self.lineage
|
|
472
466
|
for other in others:
|
|
473
467
|
combined_lineage = combined_lineage.merge(other.lineage)
|
|
474
468
|
|
|
475
|
-
combined_lineage.add_operation(
|
|
476
|
-
f"concat({len(others) + 1} dataframes, "
|
|
477
|
-
f"{sum(len(df.lineage.sources) for df in [self] + others)} total sources)"
|
|
478
|
-
)
|
|
479
|
-
|
|
480
469
|
return DataFrame(
|
|
481
470
|
data=concatenated_data,
|
|
482
471
|
lineage=combined_lineage,
|
|
@@ -484,42 +473,12 @@ class DataFrame:
|
|
|
484
473
|
project_path=self.lineage.project_path,
|
|
485
474
|
)
|
|
486
475
|
|
|
487
|
-
def
|
|
488
|
-
"""
|
|
489
|
-
Apply a transformation operation to the DataFrame.
|
|
490
|
-
|
|
491
|
-
Args:
|
|
492
|
-
operation: Function that takes a pandas DataFrame and returns a DataFrame.
|
|
493
|
-
description: Human-readable description of the operation.
|
|
494
|
-
|
|
495
|
-
Returns:
|
|
496
|
-
A new DataFrame with the operation applied and recorded in lineage.
|
|
497
|
-
"""
|
|
498
|
-
# Apply the operation
|
|
499
|
-
new_data = operation(self.data)
|
|
500
|
-
|
|
501
|
-
# Copy lineage and add operation
|
|
502
|
-
new_lineage = LineageMetadata(
|
|
503
|
-
sources=self.lineage.sources.copy(),
|
|
504
|
-
operations=self.lineage.operations.copy(),
|
|
505
|
-
project_path=self.lineage.project_path,
|
|
506
|
-
)
|
|
507
|
-
new_lineage.add_operation(description)
|
|
508
|
-
|
|
509
|
-
return DataFrame(
|
|
510
|
-
data=new_data,
|
|
511
|
-
lineage=new_lineage,
|
|
512
|
-
strict=self.strict_mode,
|
|
513
|
-
project_path=self.lineage.project_path,
|
|
514
|
-
)
|
|
515
|
-
|
|
516
|
-
def _wrap_result(self, result: Any, operation: Optional[str] = None) -> Any:
|
|
476
|
+
def _wrap_result(self, result: Any) -> Any:
|
|
517
477
|
"""
|
|
518
478
|
Wrap a pandas result in a Sunstone DataFrame if applicable.
|
|
519
479
|
|
|
520
480
|
Args:
|
|
521
481
|
result: The result from a pandas operation.
|
|
522
|
-
operation: Name of the operation performed. If None, no operation is recorded.
|
|
523
482
|
|
|
524
483
|
Returns:
|
|
525
484
|
Wrapped DataFrame if result is a DataFrame, otherwise the result.
|
|
@@ -527,11 +486,8 @@ class DataFrame:
|
|
|
527
486
|
if isinstance(result, pd.DataFrame):
|
|
528
487
|
new_lineage = LineageMetadata(
|
|
529
488
|
sources=self.lineage.sources.copy(),
|
|
530
|
-
operations=self.lineage.operations.copy(),
|
|
531
489
|
project_path=self.lineage.project_path,
|
|
532
490
|
)
|
|
533
|
-
if operation is not None:
|
|
534
|
-
new_lineage.add_operation(operation)
|
|
535
491
|
|
|
536
492
|
return DataFrame(
|
|
537
493
|
data=result,
|
|
@@ -541,28 +497,6 @@ class DataFrame:
|
|
|
541
497
|
)
|
|
542
498
|
return result
|
|
543
499
|
|
|
544
|
-
# Methods that don't represent meaningful data transformations
|
|
545
|
-
# These return DataFrames but shouldn't be tracked in lineage
|
|
546
|
-
_NON_TRACKING_METHODS = frozenset(
|
|
547
|
-
{
|
|
548
|
-
# Copy operations - same data, no transformation
|
|
549
|
-
"copy",
|
|
550
|
-
# Index operations - same data, different index
|
|
551
|
-
"reset_index",
|
|
552
|
-
"set_index",
|
|
553
|
-
"reindex",
|
|
554
|
-
# Type conversions without data change
|
|
555
|
-
"astype",
|
|
556
|
-
"infer_objects",
|
|
557
|
-
# Column/index renaming - same data, different labels
|
|
558
|
-
"rename",
|
|
559
|
-
"rename_axis",
|
|
560
|
-
# Reshaping without data loss
|
|
561
|
-
"T",
|
|
562
|
-
"transpose",
|
|
563
|
-
}
|
|
564
|
-
)
|
|
565
|
-
|
|
566
500
|
def __getattr__(self, name: str) -> Any:
|
|
567
501
|
"""
|
|
568
502
|
Delegate attribute access to the underlying pandas DataFrame.
|
|
@@ -583,14 +517,11 @@ class DataFrame:
|
|
|
583
517
|
|
|
584
518
|
def wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
585
519
|
result = attr(*args, **kwargs)
|
|
586
|
-
|
|
587
|
-
if name in DataFrame._NON_TRACKING_METHODS:
|
|
588
|
-
return self._wrap_result(result, operation=None)
|
|
589
|
-
return self._wrap_result(result, operation=f"{name}")
|
|
520
|
+
return self._wrap_result(result)
|
|
590
521
|
|
|
591
522
|
return wrapper
|
|
592
523
|
|
|
593
|
-
return self._wrap_result(attr
|
|
524
|
+
return self._wrap_result(attr)
|
|
594
525
|
|
|
595
526
|
def __getitem__(self, key: Any) -> Any:
|
|
596
527
|
"""
|
|
@@ -603,9 +534,7 @@ class DataFrame:
|
|
|
603
534
|
The item from the underlying DataFrame, wrapped if it's a DataFrame.
|
|
604
535
|
"""
|
|
605
536
|
result = self.data[key]
|
|
606
|
-
|
|
607
|
-
# not a meaningful transformation
|
|
608
|
-
return self._wrap_result(result, operation=None)
|
|
537
|
+
return self._wrap_result(result)
|
|
609
538
|
|
|
610
539
|
def __setitem__(self, key: Any, value: Any) -> None:
|
|
611
540
|
"""
|
|
@@ -616,14 +545,12 @@ class DataFrame:
|
|
|
616
545
|
value: Value to assign.
|
|
617
546
|
"""
|
|
618
547
|
self.data[key] = value
|
|
619
|
-
#
|
|
620
|
-
|
|
548
|
+
# Don't track column assignments automatically
|
|
549
|
+
# Users should use add_operation() for meaningful transformations
|
|
621
550
|
|
|
622
551
|
def __repr__(self) -> str:
|
|
623
552
|
"""String representation of the DataFrame."""
|
|
624
|
-
lineage_info = (
|
|
625
|
-
f"\n\nLineage: {len(self.lineage.sources)} source(s), {len(self.lineage.operations)} operation(s)"
|
|
626
|
-
)
|
|
553
|
+
lineage_info = f"\n\nLineage: {len(self.lineage.sources)} source(s)"
|
|
627
554
|
return repr(self.data) + lineage_info
|
|
628
555
|
|
|
629
556
|
def __str__(self) -> str:
|
|
@@ -380,22 +380,30 @@ class DatasetsManager:
|
|
|
380
380
|
|
|
381
381
|
raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
|
|
382
382
|
|
|
383
|
-
def update_output_lineage(
|
|
383
|
+
def update_output_lineage(
|
|
384
|
+
self, slug: str, lineage: LineageMetadata, content_hash: str, strict: bool = False
|
|
385
|
+
) -> None:
|
|
384
386
|
"""
|
|
385
387
|
Update lineage metadata for an output dataset.
|
|
386
388
|
|
|
389
|
+
The timestamp is only updated when the content hash changes, preventing
|
|
390
|
+
unnecessary updates when the data hasn't changed.
|
|
391
|
+
|
|
387
392
|
In strict mode, validates that the lineage matches what would be written
|
|
388
393
|
without modifying the file. In relaxed mode, updates the file with lineage.
|
|
389
394
|
|
|
390
395
|
Args:
|
|
391
396
|
slug: The slug of the output dataset to update.
|
|
392
397
|
lineage: The lineage metadata to persist.
|
|
398
|
+
content_hash: SHA256 hash of the DataFrame content.
|
|
393
399
|
strict: If True, validate without modifying. If False, update the file.
|
|
394
400
|
|
|
395
401
|
Raises:
|
|
396
402
|
DatasetNotFoundError: If the dataset doesn't exist.
|
|
397
403
|
DatasetValidationError: In strict mode, if lineage differs from what's in the file.
|
|
398
404
|
"""
|
|
405
|
+
from datetime import datetime
|
|
406
|
+
|
|
399
407
|
# Find the output dataset
|
|
400
408
|
dataset_idx = None
|
|
401
409
|
for i, dataset_data in enumerate(self._data["outputs"]):
|
|
@@ -406,6 +414,21 @@ class DatasetsManager:
|
|
|
406
414
|
if dataset_idx is None:
|
|
407
415
|
raise DatasetNotFoundError(f"Output dataset with slug '{slug}' not found")
|
|
408
416
|
|
|
417
|
+
# Get existing lineage data if present
|
|
418
|
+
existing_lineage = self._data["outputs"][dataset_idx].get("lineage", {})
|
|
419
|
+
existing_hash = existing_lineage.get("content_hash")
|
|
420
|
+
existing_timestamp = existing_lineage.get("created_at")
|
|
421
|
+
|
|
422
|
+
# Determine if content has changed
|
|
423
|
+
content_changed = existing_hash != content_hash
|
|
424
|
+
|
|
425
|
+
# Only update timestamp if content changed
|
|
426
|
+
if content_changed:
|
|
427
|
+
timestamp = datetime.now().isoformat()
|
|
428
|
+
else:
|
|
429
|
+
# Preserve existing timestamp
|
|
430
|
+
timestamp = existing_timestamp
|
|
431
|
+
|
|
409
432
|
# Build lineage metadata to add
|
|
410
433
|
lineage_data: dict[str, Any] = {}
|
|
411
434
|
|
|
@@ -414,15 +437,14 @@ class DatasetsManager:
|
|
|
414
437
|
{
|
|
415
438
|
"slug": src.slug,
|
|
416
439
|
"name": src.name,
|
|
440
|
+
"location": src.location,
|
|
417
441
|
}
|
|
418
442
|
for src in lineage.sources
|
|
419
443
|
]
|
|
420
444
|
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
if lineage.created_at:
|
|
425
|
-
lineage_data["created_at"] = lineage.created_at.isoformat()
|
|
445
|
+
lineage_data["content_hash"] = content_hash
|
|
446
|
+
if timestamp:
|
|
447
|
+
lineage_data["created_at"] = timestamp
|
|
426
448
|
|
|
427
449
|
# Create a copy of the data with updated lineage
|
|
428
450
|
updated_data = self._data.copy()
|
|
@@ -2,9 +2,13 @@
|
|
|
2
2
|
Lineage metadata structures for tracking data provenance.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
import hashlib
|
|
5
6
|
from dataclasses import dataclass, field
|
|
6
7
|
from datetime import datetime
|
|
7
|
-
from typing import Any, Dict, List, Optional
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
import pandas as pd
|
|
8
12
|
|
|
9
13
|
|
|
10
14
|
@dataclass
|
|
@@ -88,23 +92,41 @@ class DatasetMetadata:
|
|
|
88
92
|
"""Type of dataset: 'input' or 'output'."""
|
|
89
93
|
|
|
90
94
|
|
|
95
|
+
def compute_dataframe_hash(df: "pd.DataFrame") -> str:
|
|
96
|
+
"""
|
|
97
|
+
Compute a fast SHA256 hash of a pandas DataFrame's content.
|
|
98
|
+
|
|
99
|
+
Uses pickle serialization for a consistent, fast representation of the data.
|
|
100
|
+
|
|
101
|
+
Args:
|
|
102
|
+
df: The pandas DataFrame to hash.
|
|
103
|
+
|
|
104
|
+
Returns:
|
|
105
|
+
A SHA256 hex digest string representing the DataFrame content.
|
|
106
|
+
"""
|
|
107
|
+
import pickle
|
|
108
|
+
|
|
109
|
+
# Use pickle protocol 5 for efficiency; hash the bytes directly
|
|
110
|
+
data_bytes = pickle.dumps(df, protocol=5)
|
|
111
|
+
return hashlib.sha256(data_bytes).hexdigest()
|
|
112
|
+
|
|
113
|
+
|
|
91
114
|
@dataclass
|
|
92
115
|
class LineageMetadata:
|
|
93
116
|
"""
|
|
94
117
|
Lineage metadata tracking the provenance of data in a DataFrame.
|
|
95
118
|
|
|
96
|
-
This tracks all source datasets that contributed to the current DataFrame
|
|
97
|
-
including information about transformations and operations performed.
|
|
119
|
+
This tracks all source datasets that contributed to the current DataFrame.
|
|
98
120
|
"""
|
|
99
121
|
|
|
100
122
|
sources: List[DatasetMetadata] = field(default_factory=list)
|
|
101
123
|
"""List of source datasets that contributed to this data."""
|
|
102
124
|
|
|
103
|
-
|
|
104
|
-
"""
|
|
125
|
+
created_at: Optional[datetime] = None
|
|
126
|
+
"""Timestamp when this lineage was last updated (content changed)."""
|
|
105
127
|
|
|
106
|
-
|
|
107
|
-
"""
|
|
128
|
+
content_hash: Optional[str] = None
|
|
129
|
+
"""SHA256 hash of the DataFrame content, used to detect changes."""
|
|
108
130
|
|
|
109
131
|
project_path: Optional[str] = None
|
|
110
132
|
"""Path to the project directory containing datasets.yaml."""
|
|
@@ -119,15 +141,6 @@ class LineageMetadata:
|
|
|
119
141
|
if dataset not in self.sources:
|
|
120
142
|
self.sources.append(dataset)
|
|
121
143
|
|
|
122
|
-
def add_operation(self, operation: str) -> None:
|
|
123
|
-
"""
|
|
124
|
-
Record an operation performed on the data.
|
|
125
|
-
|
|
126
|
-
Args:
|
|
127
|
-
operation: Description of the operation.
|
|
128
|
-
"""
|
|
129
|
-
self.operations.append(operation)
|
|
130
|
-
|
|
131
144
|
def merge(self, other: "LineageMetadata") -> "LineageMetadata":
|
|
132
145
|
"""
|
|
133
146
|
Merge lineage from another DataFrame.
|
|
@@ -136,12 +149,10 @@ class LineageMetadata:
|
|
|
136
149
|
other: The other lineage metadata to merge.
|
|
137
150
|
|
|
138
151
|
Returns:
|
|
139
|
-
A new LineageMetadata with combined sources
|
|
152
|
+
A new LineageMetadata with combined sources.
|
|
140
153
|
"""
|
|
141
154
|
merged = LineageMetadata(
|
|
142
155
|
sources=self.sources.copy(),
|
|
143
|
-
operations=self.operations.copy(),
|
|
144
|
-
created_at=datetime.now(),
|
|
145
156
|
project_path=self.project_path or other.project_path,
|
|
146
157
|
)
|
|
147
158
|
|
|
@@ -150,9 +161,6 @@ class LineageMetadata:
|
|
|
150
161
|
if source not in merged.sources:
|
|
151
162
|
merged.sources.append(source)
|
|
152
163
|
|
|
153
|
-
# Combine operations
|
|
154
|
-
merged.operations.extend(other.operations)
|
|
155
|
-
|
|
156
164
|
return merged
|
|
157
165
|
|
|
158
166
|
def get_licenses(self) -> List[str]:
|
|
@@ -175,16 +183,18 @@ class LineageMetadata:
|
|
|
175
183
|
Returns:
|
|
176
184
|
Dictionary containing lineage information.
|
|
177
185
|
"""
|
|
178
|
-
|
|
186
|
+
result: Dict[str, Any] = {
|
|
179
187
|
"sources": [
|
|
180
188
|
{
|
|
181
|
-
"name": src.name,
|
|
182
189
|
"slug": src.slug,
|
|
190
|
+
"name": src.name,
|
|
183
191
|
"location": src.location,
|
|
184
192
|
}
|
|
185
193
|
for src in self.sources
|
|
186
194
|
],
|
|
187
|
-
"operations": self.operations,
|
|
188
|
-
"created_at": self.created_at.isoformat(),
|
|
189
|
-
"licenses": self.get_licenses(),
|
|
190
195
|
}
|
|
196
|
+
if self.created_at is not None:
|
|
197
|
+
result["created_at"] = self.created_at.isoformat()
|
|
198
|
+
if self.content_hash is not None:
|
|
199
|
+
result["content_hash"] = self.content_hash
|
|
200
|
+
return result
|
|
@@ -1,22 +1,20 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sunstone-py
|
|
3
|
-
Version: 0.5.1
|
|
3
|
+
Version: 0.5.3
|
|
4
4
|
Summary: Python library for managing datasets with lineage tracking in Sunstone projects
|
|
5
5
|
Author-email: Sunstone Institute <stig@sunstone.institute>
|
|
6
6
|
License: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/sunstoneinstitute/sunstone-py
|
|
8
8
|
Project-URL: Documentation, https://sunstoneinstitute.github.io/sunstone-py/
|
|
9
9
|
Project-URL: Repository, https://github.com/sunstoneinstitute/sunstone-py
|
|
10
|
-
Classifier: Development Status ::
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
11
|
Classifier: Intended Audience :: Science/Research
|
|
12
12
|
Classifier: License :: OSI Approved :: MIT License
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
16
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
15
|
Classifier: Programming Language :: Python :: 3.13
|
|
18
16
|
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
-
Requires-Python: >=3.10
|
|
17
|
+
Requires-Python: >=3.12
|
|
20
18
|
Description-Content-Type: text/markdown
|
|
21
19
|
License-File: LICENSE
|
|
22
20
|
Requires-Dist: frictionless>=5.18.1
|
|
@@ -25,20 +25,19 @@ class TestDataFrameBasics:
|
|
|
25
25
|
assert len(df.data) > 0
|
|
26
26
|
assert len(df.data.columns) > 0
|
|
27
27
|
assert len(df.lineage.sources) > 0
|
|
28
|
-
assert df.lineage.operations is not None
|
|
29
28
|
|
|
30
|
-
def
|
|
31
|
-
"""Test
|
|
29
|
+
def test_head_preserves_lineage(self, project_path: Path) -> None:
|
|
30
|
+
"""Test that head() preserves lineage."""
|
|
32
31
|
df = sunstone.DataFrame.read_csv(
|
|
33
32
|
"inputs/official_un_member_states_raw.csv",
|
|
34
33
|
project_path=project_path,
|
|
35
34
|
strict=False,
|
|
36
35
|
)
|
|
37
36
|
|
|
38
|
-
filtered = df.
|
|
37
|
+
filtered = df.head(10)
|
|
39
38
|
|
|
40
39
|
assert len(filtered.data) == 10
|
|
41
|
-
assert len(filtered.lineage.
|
|
40
|
+
assert len(filtered.lineage.sources) == len(df.lineage.sources)
|
|
42
41
|
|
|
43
42
|
def test_read_second_dataset(self, project_path: Path) -> None:
|
|
44
43
|
"""Test reading the same dataset twice creates separate lineage."""
|
|
@@ -68,10 +67,7 @@ class TestDataFrameMerge:
|
|
|
68
67
|
strict=False,
|
|
69
68
|
)
|
|
70
69
|
# Filter to create a subset
|
|
71
|
-
return df.
|
|
72
|
-
lambda d: d[d["ISO Code"].notna()].head(50),
|
|
73
|
-
description="Select first 50 countries with ISO codes",
|
|
74
|
-
)
|
|
70
|
+
return df[df.data["ISO Code"].notna()].head(50)
|
|
75
71
|
|
|
76
72
|
@pytest.fixture
|
|
77
73
|
def un_members_df2(self, project_path: Path) -> Any:
|
|
@@ -82,10 +78,7 @@ class TestDataFrameMerge:
|
|
|
82
78
|
strict=False,
|
|
83
79
|
)
|
|
84
80
|
# Select different columns as a second dataset
|
|
85
|
-
return df.
|
|
86
|
-
lambda d: d[["Member State", "ISO Code", "Start date"]].dropna(),
|
|
87
|
-
description="Select subset of columns",
|
|
88
|
-
)
|
|
81
|
+
return df[["Member State", "ISO Code", "Start date"]].dropna()
|
|
89
82
|
|
|
90
83
|
def test_merge_dataframes(self, un_members_df1: Any, un_members_df2: Any) -> None:
|
|
91
84
|
"""Test merging two DataFrames."""
|
|
@@ -95,7 +88,6 @@ class TestDataFrameMerge:
|
|
|
95
88
|
assert len(merged.data) > 0
|
|
96
89
|
# Both sources come from the same file, but lineage should track them separately
|
|
97
90
|
assert len(merged.lineage.sources) >= 1
|
|
98
|
-
assert len(merged.lineage.operations) > 0
|
|
99
91
|
|
|
100
92
|
def test_merge_lineage_tracking(self, un_members_df1: Any, un_members_df2: Any) -> None:
|
|
101
93
|
"""Test that merge properly tracks lineage."""
|
|
@@ -117,11 +109,9 @@ class TestLineageMetadata:
|
|
|
117
109
|
project_path=project_path,
|
|
118
110
|
strict=False,
|
|
119
111
|
)
|
|
120
|
-
# Apply some operations
|
|
121
|
-
filtered = un_members.
|
|
122
|
-
|
|
123
|
-
)
|
|
124
|
-
return filtered.apply_operation(lambda d: d.head(100), description="Select first 100 countries")
|
|
112
|
+
# Apply some operations
|
|
113
|
+
filtered = un_members[un_members.data["ISO Code"].notna()]
|
|
114
|
+
return filtered.head(100)
|
|
125
115
|
|
|
126
116
|
def test_lineage_to_dict(self, processed_df: Any) -> None:
|
|
127
117
|
"""Test converting lineage to dictionary."""
|
|
@@ -129,11 +119,8 @@ class TestLineageMetadata:
|
|
|
129
119
|
|
|
130
120
|
assert lineage_dict is not None
|
|
131
121
|
assert "sources" in lineage_dict
|
|
132
|
-
|
|
133
|
-
assert "created_at" in lineage_dict
|
|
134
|
-
assert "licenses" in lineage_dict
|
|
122
|
+
# created_at is only set when writing output (not when reading)
|
|
135
123
|
assert len(lineage_dict["sources"]) > 0
|
|
136
|
-
assert len(lineage_dict["operations"]) > 0
|
|
137
124
|
|
|
138
125
|
|
|
139
126
|
class TestStrictMode:
|
|
@@ -172,8 +159,8 @@ class TestReadDataset:
|
|
|
172
159
|
assert len(df.data) > 0
|
|
173
160
|
assert len(df.data.columns) > 0
|
|
174
161
|
assert len(df.lineage.sources) > 0
|
|
175
|
-
# Check that the
|
|
176
|
-
assert
|
|
162
|
+
# Check that the source is tracked
|
|
163
|
+
assert df.lineage.sources[0].slug == "official-un-member-states"
|
|
177
164
|
|
|
178
165
|
def test_read_dataset_with_explicit_format(self, project_path: Path) -> None:
|
|
179
166
|
"""Test reading a dataset with explicit format override."""
|
|
@@ -186,7 +173,7 @@ class TestReadDataset:
|
|
|
186
173
|
|
|
187
174
|
assert df is not None
|
|
188
175
|
assert len(df.data) > 0
|
|
189
|
-
assert
|
|
176
|
+
assert len(df.lineage.sources) > 0
|
|
190
177
|
|
|
191
178
|
def test_read_dataset_slug_not_found(self, project_path: Path) -> None:
|
|
192
179
|
"""Test that reading non-existent slug raises error."""
|
|
@@ -221,5 +208,159 @@ class TestReadDataset:
|
|
|
221
208
|
|
|
222
209
|
assert df is not None
|
|
223
210
|
assert len(df.data) > 0
|
|
224
|
-
#
|
|
225
|
-
assert
|
|
211
|
+
# Check that the source is tracked
|
|
212
|
+
assert len(df.lineage.sources) > 0
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
class TestContentHashLineage:
|
|
216
|
+
"""Tests for content-hash based lineage tracking."""
|
|
217
|
+
|
|
218
|
+
def test_content_hash_computed_on_save(self, project_path: Path, tmp_path: Path) -> None:
|
|
219
|
+
"""Test that content hash is computed and saved when writing output."""
|
|
220
|
+
import shutil
|
|
221
|
+
|
|
222
|
+
from ruamel.yaml import YAML
|
|
223
|
+
|
|
224
|
+
# Create a copy of the project in tmp_path to avoid modifying original
|
|
225
|
+
test_project = tmp_path / "test_project"
|
|
226
|
+
shutil.copytree(project_path, test_project)
|
|
227
|
+
|
|
228
|
+
df = sunstone.DataFrame.read_csv(
|
|
229
|
+
"inputs/official_un_member_states_raw.csv",
|
|
230
|
+
project_path=test_project,
|
|
231
|
+
strict=False,
|
|
232
|
+
)
|
|
233
|
+
|
|
234
|
+
# Write the output
|
|
235
|
+
output_path = "outputs/test_output.csv"
|
|
236
|
+
df.to_csv(output_path, slug="test-output", name="Test Output", index=False)
|
|
237
|
+
|
|
238
|
+
# Read the datasets.yaml and check for content_hash
|
|
239
|
+
yaml = YAML()
|
|
240
|
+
with open(test_project / "datasets.yaml") as f:
|
|
241
|
+
data = yaml.load(f)
|
|
242
|
+
|
|
243
|
+
# Find the output dataset
|
|
244
|
+
output = next((d for d in data.get("outputs", []) if d["slug"] == "test-output"), None)
|
|
245
|
+
assert output is not None
|
|
246
|
+
assert "lineage" in output
|
|
247
|
+
assert "content_hash" in output["lineage"]
|
|
248
|
+
assert "created_at" in output["lineage"]
|
|
249
|
+
# Hash should be a 64-character hex string (SHA256)
|
|
250
|
+
assert len(output["lineage"]["content_hash"]) == 64
|
|
251
|
+
|
|
252
|
+
def test_timestamp_not_updated_when_content_unchanged(self, project_path: Path, tmp_path: Path) -> None:
|
|
253
|
+
"""Test that timestamp stays the same when saving identical content."""
|
|
254
|
+
import shutil
|
|
255
|
+
import time
|
|
256
|
+
|
|
257
|
+
from ruamel.yaml import YAML
|
|
258
|
+
|
|
259
|
+
# Create a copy of the project in tmp_path
|
|
260
|
+
test_project = tmp_path / "test_project"
|
|
261
|
+
shutil.copytree(project_path, test_project)
|
|
262
|
+
|
|
263
|
+
df = sunstone.DataFrame.read_csv(
|
|
264
|
+
"inputs/official_un_member_states_raw.csv",
|
|
265
|
+
project_path=test_project,
|
|
266
|
+
strict=False,
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
output_path = "outputs/stable_output.csv"
|
|
270
|
+
|
|
271
|
+
# First write
|
|
272
|
+
df.to_csv(output_path, slug="stable-output", name="Stable Output", index=False)
|
|
273
|
+
|
|
274
|
+
# Read the first timestamp and hash
|
|
275
|
+
yaml = YAML()
|
|
276
|
+
with open(test_project / "datasets.yaml") as f:
|
|
277
|
+
data1 = yaml.load(f)
|
|
278
|
+
|
|
279
|
+
output1 = next((d for d in data1.get("outputs", []) if d["slug"] == "stable-output"), None)
|
|
280
|
+
assert output1 is not None
|
|
281
|
+
first_timestamp = output1["lineage"]["created_at"]
|
|
282
|
+
first_hash = output1["lineage"]["content_hash"]
|
|
283
|
+
|
|
284
|
+
# Wait a bit to ensure different timestamp would be generated
|
|
285
|
+
time.sleep(0.1)
|
|
286
|
+
|
|
287
|
+
# Reload the manager and write again with the same data
|
|
288
|
+
df2 = sunstone.DataFrame.read_csv(
|
|
289
|
+
"inputs/official_un_member_states_raw.csv",
|
|
290
|
+
project_path=test_project,
|
|
291
|
+
strict=False,
|
|
292
|
+
)
|
|
293
|
+
df2.to_csv(output_path, slug="stable-output", name="Stable Output", index=False)
|
|
294
|
+
|
|
295
|
+
# Read the second timestamp and hash
|
|
296
|
+
with open(test_project / "datasets.yaml") as f:
|
|
297
|
+
data2 = yaml.load(f)
|
|
298
|
+
|
|
299
|
+
output2 = next((d for d in data2.get("outputs", []) if d["slug"] == "stable-output"), None)
|
|
300
|
+
assert output2 is not None
|
|
301
|
+
second_timestamp = output2["lineage"]["created_at"]
|
|
302
|
+
second_hash = output2["lineage"]["content_hash"]
|
|
303
|
+
|
|
304
|
+
# Hash should be the same
|
|
305
|
+
assert first_hash == second_hash
|
|
306
|
+
# Timestamp should NOT have changed since content is identical
|
|
307
|
+
assert first_timestamp == second_timestamp
|
|
308
|
+
|
|
309
|
+
def test_timestamp_updated_when_content_changes(self, project_path: Path, tmp_path: Path) -> None:
|
|
310
|
+
"""Test that timestamp is updated when content actually changes."""
|
|
311
|
+
import shutil
|
|
312
|
+
import time
|
|
313
|
+
|
|
314
|
+
from ruamel.yaml import YAML
|
|
315
|
+
|
|
316
|
+
# Create a copy of the project in tmp_path
|
|
317
|
+
test_project = tmp_path / "test_project"
|
|
318
|
+
shutil.copytree(project_path, test_project)
|
|
319
|
+
|
|
320
|
+
df = sunstone.DataFrame.read_csv(
|
|
321
|
+
"inputs/official_un_member_states_raw.csv",
|
|
322
|
+
project_path=test_project,
|
|
323
|
+
strict=False,
|
|
324
|
+
)
|
|
325
|
+
|
|
326
|
+
output_path = "outputs/changing_output.csv"
|
|
327
|
+
|
|
328
|
+
# First write
|
|
329
|
+
df.to_csv(output_path, slug="changing-output", name="Changing Output", index=False)
|
|
330
|
+
|
|
331
|
+
# Read the first timestamp and hash
|
|
332
|
+
yaml = YAML()
|
|
333
|
+
with open(test_project / "datasets.yaml") as f:
|
|
334
|
+
data1 = yaml.load(f)
|
|
335
|
+
|
|
336
|
+
output1 = next((d for d in data1.get("outputs", []) if d["slug"] == "changing-output"), None)
|
|
337
|
+
assert output1 is not None
|
|
338
|
+
first_timestamp = output1["lineage"]["created_at"]
|
|
339
|
+
first_hash = output1["lineage"]["content_hash"]
|
|
340
|
+
|
|
341
|
+
# Wait a bit to ensure different timestamp
|
|
342
|
+
time.sleep(0.1)
|
|
343
|
+
|
|
344
|
+
# Modify the data and write again
|
|
345
|
+
df2 = sunstone.DataFrame.read_csv(
|
|
346
|
+
"inputs/official_un_member_states_raw.csv",
|
|
347
|
+
project_path=test_project,
|
|
348
|
+
strict=False,
|
|
349
|
+
)
|
|
350
|
+
# Actually modify the content - take only first 10 rows
|
|
351
|
+
df2_modified = df2.head(10)
|
|
352
|
+
df2_modified.to_csv(output_path, slug="changing-output", name="Changing Output", index=False)
|
|
353
|
+
|
|
354
|
+
# Read the second timestamp and hash
|
|
355
|
+
with open(test_project / "datasets.yaml") as f:
|
|
356
|
+
data2 = yaml.load(f)
|
|
357
|
+
|
|
358
|
+
output2 = next((d for d in data2.get("outputs", []) if d["slug"] == "changing-output"), None)
|
|
359
|
+
assert output2 is not None
|
|
360
|
+
second_timestamp = output2["lineage"]["created_at"]
|
|
361
|
+
second_hash = output2["lineage"]["content_hash"]
|
|
362
|
+
|
|
363
|
+
# Hash should be different since content changed
|
|
364
|
+
assert first_hash != second_hash
|
|
365
|
+
# Timestamp SHOULD have changed since content is different
|
|
366
|
+
assert first_timestamp != second_timestamp
|
|
@@ -22,26 +22,17 @@ class TestLineagePersistence:
|
|
|
22
22
|
assert hasattr(result, "lineage")
|
|
23
23
|
assert len(result.lineage.sources) == len(df.lineage.sources)
|
|
24
24
|
|
|
25
|
-
# Check operation tracking
|
|
26
|
-
# We expect the operation to be recorded, ideally
|
|
27
|
-
assert any("head" in op for op in result.lineage.operations)
|
|
28
|
-
|
|
29
25
|
def test_getitem_preserves_lineage(self, project_path: Path) -> None:
|
|
30
26
|
"""Test that boolean indexing/getitem returns sunstone DataFrame."""
|
|
31
27
|
df = sunstone.DataFrame.read_csv(
|
|
32
28
|
"inputs/official_un_member_states_raw.csv", project_path=project_path, strict=False
|
|
33
29
|
)
|
|
34
30
|
|
|
35
|
-
# Boolean masking (returns DataFrame)
|
|
36
|
-
# Assuming 'Year' or some column exists, checking columns first
|
|
37
|
-
# Using the columns we saw in previous turns or just slicing
|
|
38
|
-
|
|
39
31
|
# Let's just slice columns, which returns a DataFrame
|
|
40
32
|
result = df[["Member State", "ISO Code"]]
|
|
41
33
|
|
|
42
34
|
assert isinstance(result, sunstone.DataFrame)
|
|
43
35
|
assert len(result.lineage.sources) == len(df.lineage.sources)
|
|
44
|
-
# Operation tracking for getitem might be tricky to name perfectly, but should exist
|
|
45
36
|
|
|
46
37
|
def test_sort_values_preserves_lineage(self, project_path: Path) -> None:
|
|
47
38
|
"""Test that sort_values returns sunstone DataFrame."""
|
|
@@ -53,17 +44,16 @@ class TestLineagePersistence:
|
|
|
53
44
|
|
|
54
45
|
assert isinstance(result, sunstone.DataFrame)
|
|
55
46
|
assert len(result.lineage.sources) == len(df.lineage.sources)
|
|
56
|
-
assert any("sort_values" in op for op in result.lineage.operations)
|
|
57
47
|
|
|
58
48
|
def test_setitem_preserves_lineage(self, project_path: Path) -> None:
|
|
59
|
-
"""Test that in-place modification
|
|
49
|
+
"""Test that in-place modification preserves lineage."""
|
|
60
50
|
df = sunstone.DataFrame.read_csv(
|
|
61
51
|
"inputs/official_un_member_states_raw.csv", project_path=project_path, strict=False
|
|
62
52
|
)
|
|
63
53
|
|
|
64
|
-
|
|
54
|
+
initial_sources = len(df.lineage.sources)
|
|
65
55
|
df["NewCol"] = 1
|
|
66
56
|
|
|
67
57
|
assert "NewCol" in df.data.columns
|
|
68
|
-
|
|
69
|
-
assert
|
|
58
|
+
# Lineage sources should be preserved after setitem
|
|
59
|
+
assert len(df.lineage.sources) == initial_sources
|
|
@@ -131,10 +131,11 @@ class TestSelectionAndIndexing:
|
|
|
131
131
|
"""Test setting column values like pandas."""
|
|
132
132
|
# Create a copy to avoid modifying fixture
|
|
133
133
|
df = sample_df.head()
|
|
134
|
+
initial_sources = len(df.lineage.sources)
|
|
134
135
|
df["test_column"] = "test_value"
|
|
135
136
|
assert "test_column" in df.columns
|
|
136
|
-
# Lineage should
|
|
137
|
-
assert
|
|
137
|
+
# Lineage sources should be preserved
|
|
138
|
+
assert len(df.lineage.sources) == initial_sources
|
|
138
139
|
|
|
139
140
|
|
|
140
141
|
class TestDataManipulation:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|