tracepipe 0.3.4__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe-0.4.1/CHANGELOG.md +162 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/PKG-INFO +18 -1
- {tracepipe-0.3.4 → tracepipe-0.4.1}/README.md +17 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/api/core.md +15 -9
- tracepipe-0.4.1/docs/changelog.md +120 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/concepts.md +22 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/row-tracing.md +64 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/index.md +4 -2
- {tracepipe-0.3.4 → tracepipe-0.4.1}/mkdocs.yml +1 -1
- {tracepipe-0.3.4 → tracepipe-0.4.1}/pyproject.toml +1 -1
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_api.py +6 -2
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_convenience_debug.py +122 -1
- tracepipe-0.4.1/tests/test_integration_scenarios.py +361 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_lineage_through_merge.py +45 -26
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_pandas_inst.py +20 -10
- tracepipe-0.4.1/tests/test_row_provenance.py +684 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/__init__.py +1 -1
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/context.py +4 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/convenience.py +130 -7
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/core.py +79 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/filter_capture.py +103 -1
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/merge_capture.py +169 -23
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/pandas_inst.py +11 -3
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/safety.py +29 -4
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/storage/lineage_store.py +92 -7
- {tracepipe-0.3.4 → tracepipe-0.4.1}/uv.lock +317 -248
- tracepipe-0.3.4/CHANGELOG.md +0 -80
- tracepipe-0.3.4/docs/changelog.md +0 -39
- tracepipe-0.3.4/examples/comprehensive_demo.py +0 -694
- tracepipe-0.3.4/examples/red_team_test.py +0 -437
- {tracepipe-0.3.4 → tracepipe-0.4.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/.github/workflows/ci.yml +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/.github/workflows/docs.yml +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/.github/workflows/release.yml +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/.gitignore +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/.pre-commit-config.yaml +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/CONTRIBUTING.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/LICENSE +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/benchmarks/README.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/benchmarks/bench_memory.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/benchmarks/bench_overhead.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/benchmarks/bench_scale.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/benchmarks/run_all.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/api/contracts.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/api/debug.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/api/index.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/contributing.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/examples/data-validation.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/examples/ml-pipeline.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/getting-started/installation.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/getting-started/modes.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/getting-started/quickstart.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/cell-provenance.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/contracts.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/health-checks.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/reports.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/snapshots.md +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/examples/demo.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/examples/ml_pipeline_demo.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/404.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/api/contracts/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/api/core/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/api/debug/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/api/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/_mkdocstrings.css +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/images/favicon.png +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/bundle.79ae519e.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/bundle.79ae519e.min.js.map +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ar.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.da.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.de.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.du.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.el.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.es.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.fi.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.fr.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.he.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.hi.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.hu.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.hy.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.it.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ja.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.jp.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.kn.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ko.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.multi.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.nl.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.no.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.pt.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ro.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ru.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.sa.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.stemmer.support.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.sv.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ta.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.te.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.th.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.tr.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.vi.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.zh.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/tinyseg.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/wordcut.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/workers/search.2c215733.min.js +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/workers/search.2c215733.min.js.map +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/stylesheets/main.484c7ddc.min.css +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/stylesheets/main.484c7ddc.min.css.map +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/stylesheets/palette.ab4e12ef.min.css +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/stylesheets/palette.ab4e12ef.min.css.map +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/changelog/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/contributing/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/examples/data-validation/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/examples/ml-pipeline/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/getting-started/installation/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/getting-started/modes/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/getting-started/quickstart/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/cell-provenance/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/concepts/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/contracts/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/health-checks/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/reports/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/row-tracing/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/snapshots/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/index.html +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/objects.inv +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/search/search_index.json +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/sitemap.xml +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/site/sitemap.xml.gz +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/__init__.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/conftest.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_concurrency.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_contracts.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_edge_cases.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_integration.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_io_operations.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_public_api.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_snapshot.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_version_matrix.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/api.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/contracts.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/debug.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/__init__.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/apply_capture.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/indexer_capture.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/series_capture.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/snapshot.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/storage/__init__.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/storage/base.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/storage/row_identity.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/utils/__init__.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/utils/value_capture.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/value_provenance.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/visualization/__init__.py +0 -0
- {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/visualization/html_export.py +0 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to TracePipe will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## 0.4.1 - 2026-02-04
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- Fully implemented `CheckResult` convenience properties (`.passed`, `.retention`, `.n_dropped`, `.n_steps`, `.drops_by_op`)
|
|
12
|
+
- Added comprehensive tests for `CheckResult` API to ensure properties work correctly
|
|
13
|
+
- Properties now properly access underlying `.facts` dictionary for all metrics
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
- Cleaned up example files and test scripts
|
|
17
|
+
|
|
18
|
+
## 0.4.0 - 2026-02-04
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
- **Full row provenance for `pd.concat(axis=0)`**: Row IDs are now preserved through concatenation
|
|
22
|
+
- Each result row maintains its original RID from the source DataFrame
|
|
23
|
+
- `ConcatMapping` tracks which source DataFrame each row came from
|
|
24
|
+
- Concat steps are now marked `FULL` completeness (previously `PARTIAL`)
|
|
25
|
+
|
|
26
|
+
- **Duplicate drop provenance in debug mode**: `drop_duplicates` now tracks which row "won"
|
|
27
|
+
- `DuplicateDropMapping` maps dropped rows to their kept representative
|
|
28
|
+
- Supports `keep='first'`, `keep='last'`, and `keep=False`
|
|
29
|
+
- Uses `hash_pandas_object` for fast, NaN-safe key comparison
|
|
30
|
+
|
|
31
|
+
- **Clean `TraceResult` API for provenance** (UX improvement):
|
|
32
|
+
- `trace.origin` — Unified origin info: `{"type": "concat", "source_df": 1}` or `{"type": "merge", "left_parent": 10, "right_parent": 20}`
|
|
33
|
+
- `trace.representative` — For dedup-dropped rows: `{"kept_rid": 42, "subset": ["key"], "keep": "first"}`
|
|
34
|
+
- No need to access internal `.store` methods — everything is in `tp.trace()` result
|
|
35
|
+
|
|
36
|
+
- **Clean `CheckResult` API** (UX improvement):
|
|
37
|
+
- `result.passed` — Alias for `.ok` (common naming convention)
|
|
38
|
+
- `result.retention` — Row retention rate (0.0-1.0) from `.facts`
|
|
39
|
+
- `result.n_dropped` — Total rows dropped
|
|
40
|
+
- `result.n_steps` — Total pipeline steps recorded
|
|
41
|
+
- `result.drops_by_op` — Drops broken down by operation name
|
|
42
|
+
- All properties are now discoverable via autocomplete
|
|
43
|
+
|
|
44
|
+
- **New data structures in `core.py`**:
|
|
45
|
+
- `ConcatMapping`: Tracks row provenance through concat operations
|
|
46
|
+
- `DuplicateDropMapping`: Tracks dropped->kept relationships in drop_duplicates
|
|
47
|
+
|
|
48
|
+
- **Comprehensive test suite**: 38 new tests in `test_row_provenance.py` covering:
|
|
49
|
+
- Concat RID preservation, ignore_index, after sort, with empty DFs, chained concats
|
|
50
|
+
- Axis=1 same-RID propagation vs different-RID PARTIAL marking
|
|
51
|
+
- `drop_duplicates` mapping correctness for `keep='first'`, `keep='last'`, and `keep=False`
|
|
52
|
+
- NaN handling parity with pandas `duplicated()`
|
|
53
|
+
- Integration: concat→merge, filter→concat, dedup→fillna lineage
|
|
54
|
+
- TraceResult `.origin` and `.representative` property tests
|
|
55
|
+
|
|
56
|
+
### Changed
|
|
57
|
+
- `wrap_concat_with_lineage` rewritten for full provenance tracking
|
|
58
|
+
- Captures source RIDs before operation
|
|
59
|
+
- Propagates existing RIDs (instead of registering new ones) for axis=0
|
|
60
|
+
- Stores positional + sorted arrays for both "explain row i" and O(log n) lookup
|
|
61
|
+
- Axis=1 propagates RIDs if all inputs match, otherwise PARTIAL
|
|
62
|
+
|
|
63
|
+
- `_capture_filter_with_mask` enhanced to store `DuplicateDropMapping` in debug mode
|
|
64
|
+
|
|
65
|
+
- `TraceResult` enhanced with `.origin` and `.representative` properties
|
|
66
|
+
- `.to_text()` now displays origin and representative info
|
|
67
|
+
- `.to_dict()` includes all provenance info
|
|
68
|
+
|
|
69
|
+
## 0.3.5 - 2026-02-03
|
|
70
|
+
|
|
71
|
+
### Fixed
|
|
72
|
+
- **DataFrame.fillna double-logging**: `df.fillna({"col": 0})` now logs exactly 1 event
|
|
73
|
+
- Previously logged both `DataFrame.fillna` and internal `__setitem__` for same change
|
|
74
|
+
- Added `wrap_pandas_transform_method` with `_in_transform_op` flag to suppress nested setitem
|
|
75
|
+
- Works for both `fillna` and `replace` operations, including `inplace=True`
|
|
76
|
+
|
|
77
|
+
### Added
|
|
78
|
+
- Known Limitations section in README documenting concat/dedup tracking gaps
|
|
79
|
+
- Test for `DataFrame.fillna` single-event logging
|
|
80
|
+
|
|
81
|
+
### Changed
|
|
82
|
+
- **Test suite hardened** with exact count assertions and multi-scenario tests:
|
|
83
|
+
- Changed 15+ assertions from `>= 1` to `== 1` for precise verification
|
|
84
|
+
- Added `test_integration_scenarios.py` with 16 new tests covering:
|
|
85
|
+
- Multi-pipeline session isolation
|
|
86
|
+
- Warning message content verification
|
|
87
|
+
- Reliability scenarios (fillna, replace, loc, merge)
|
|
88
|
+
- Cross-pipeline contamination prevention
|
|
89
|
+
|
|
90
|
+
## 0.3.4 - 2026-02-03
|
|
91
|
+
|
|
92
|
+
### Fixed
|
|
93
|
+
- **Event deduplication**: Identical events from parallel pipelines are now deduplicated
|
|
94
|
+
- When multiple DataFrames share row IDs (e.g., from `df.copy()`), same changes are recorded once
|
|
95
|
+
- Events deduplicated by `(col, old_val, new_val, operation)` signature
|
|
96
|
+
- Prevents reporting "4 events" when only 1 logical change occurred
|
|
97
|
+
|
|
98
|
+
### Added
|
|
99
|
+
- `_stable_repr()` helper for robust value comparison in deduplication
|
|
100
|
+
- Tests for cross-pipeline event deduplication behavior
|
|
101
|
+
|
|
102
|
+
## 0.3.3 - 2026-02-03
|
|
103
|
+
|
|
104
|
+
### Fixed
|
|
105
|
+
- **Double-logging bug**: `df['col'] = df['col'].fillna()` now logs exactly one event, not two
|
|
106
|
+
- Fixed duplicate capture from both `_wrap_setitem` and `wrap_series_assignment`
|
|
107
|
+
- **Merge warning scoping**: `tp.check(df)` now only shows warnings for merges in df's lineage
|
|
108
|
+
- Previously showed warnings from ALL merges in the session (cross-contamination)
|
|
109
|
+
- Now filters by tracking which merge steps produced the queried DataFrame's rows
|
|
110
|
+
|
|
111
|
+
### Added
|
|
112
|
+
- `_get_merge_stats_for_df()` helper to scope merge warnings to df's lineage
|
|
113
|
+
- Tests for double-logging prevention and merge warning scoping
|
|
114
|
+
|
|
115
|
+
## 0.3.2 - 2026-02-03
|
|
116
|
+
|
|
117
|
+
### Fixed
|
|
118
|
+
- Merge duplicate key warnings now correctly identify which table (left/right) has duplicates
|
|
119
|
+
- Previously `right_dup_rate` was mislabeled as "Right table" when it actually indicates LEFT table duplicates
|
|
120
|
+
|
|
121
|
+
## 0.3.1 - 2026-02-03
|
|
122
|
+
|
|
123
|
+
### Fixed
|
|
124
|
+
- Cell history now correctly chains through merge operations via lineage traversal
|
|
125
|
+
- `tp.why()` and `tp.trace()` show pre-merge changes for post-merge rows
|
|
126
|
+
- `enable()` resets accumulated state when called multiple times (fixes duplicate warnings in notebooks/IDEs)
|
|
127
|
+
|
|
128
|
+
### Added
|
|
129
|
+
- `get_row_history_with_lineage()` and `get_cell_history_with_lineage()` methods for lineage-aware queries
|
|
130
|
+
- `follow_lineage` parameter in `explain_value()` for opt-out of lineage traversal
|
|
131
|
+
- Integration tests for cell provenance through merge operations
|
|
132
|
+
|
|
133
|
+
## 0.3.0 - 2026-02-03
|
|
134
|
+
|
|
135
|
+
### Added
|
|
136
|
+
- MkDocs documentation site with Material theme
|
|
137
|
+
- Comprehensive API reference documentation
|
|
138
|
+
- Getting started guides and tutorials
|
|
139
|
+
- `tp.register()` API for manually registering DataFrames created before `enable()`
|
|
140
|
+
- Configurable retention threshold in `tp.check()`
|
|
141
|
+
- Ghost row capture for fallback filter paths
|
|
142
|
+
- Comprehensive test coverage for COLUMN identity mode
|
|
143
|
+
- Data quality contracts with fluent API (`tp.contract().expect_*()`)
|
|
144
|
+
- HTML report generation with `tp.report()`
|
|
145
|
+
- Snapshot and diff functionality
|
|
146
|
+
- Debug mode with cell-level tracking
|
|
147
|
+
- `tp.why()` for cell provenance
|
|
148
|
+
- `tp.trace()` for row journey
|
|
149
|
+
- Watched columns for selective tracking
|
|
150
|
+
- Ghost values capture
|
|
151
|
+
- Basic row-level lineage tracking
|
|
152
|
+
- Support for filter operations (dropna, query, boolean indexing)
|
|
153
|
+
- Support for transform operations (fillna, replace, setitem)
|
|
154
|
+
- Support for merge and join operations
|
|
155
|
+
- CI and Debug modes
|
|
156
|
+
|
|
157
|
+
### Fixed
|
|
158
|
+
- Recursion bug when accessing hidden `__tracepipe_row_id__` column in COLUMN mode
|
|
159
|
+
- Config propagation to `row_manager` and `store` components in `enable()`
|
|
160
|
+
- Retention rate calculation for multi-table pipelines with merges
|
|
161
|
+
- Export wrappers (`to_csv`, `to_parquet`) now correctly strip hidden column
|
|
162
|
+
- `_filter_op_depth` cleanup in error scenarios
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tracepipe
|
|
3
|
-
Version: 0.3.4
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: Row-level data lineage tracking for pandas pipelines
|
|
5
5
|
Project-URL: Homepage, https://github.com/tracepipe/tracepipe
|
|
6
6
|
Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
|
|
@@ -276,6 +276,23 @@ tp.enable(mode="debug") # Full lineage
|
|
|
276
276
|
|
|
277
277
|
---
|
|
278
278
|
|
|
279
|
+
## Known Limitations
|
|
280
|
+
|
|
281
|
+
TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
|
|
282
|
+
|
|
283
|
+
| Pattern | Status | Notes |
|
|
284
|
+
|---------|--------|-------|
|
|
285
|
+
| `df["col"] = df["col"].fillna(0)` | ✅ Tracked | Series + assignment |
|
|
286
|
+
| `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
|
|
287
|
+
| `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
|
|
288
|
+
| `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
|
|
289
|
+
| `pd.concat([df1, df2])` | ✅ Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
|
|
290
|
+
| `df.drop_duplicates()` | ✅ Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
|
|
291
|
+
| `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
|
|
292
|
+
| Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
|
|
293
|
+
|
|
294
|
+
---
|
|
295
|
+
|
|
279
296
|
## Contributing
|
|
280
297
|
|
|
281
298
|
```bash
|
|
@@ -207,6 +207,23 @@ tp.enable(mode="debug") # Full lineage
|
|
|
207
207
|
|
|
208
208
|
---
|
|
209
209
|
|
|
210
|
+
## Known Limitations
|
|
211
|
+
|
|
212
|
+
TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
|
|
213
|
+
|
|
214
|
+
| Pattern | Status | Notes |
|
|
215
|
+
|---------|--------|-------|
|
|
216
|
+
| `df["col"] = df["col"].fillna(0)` | ✅ Tracked | Series + assignment |
|
|
217
|
+
| `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
|
|
218
|
+
| `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
|
|
219
|
+
| `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
|
|
220
|
+
| `pd.concat([df1, df2])` | ✅ Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
|
|
221
|
+
| `df.drop_duplicates()` | ✅ Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
|
|
222
|
+
| `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
|
|
223
|
+
| Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
|
|
224
|
+
|
|
225
|
+
---
|
|
226
|
+
|
|
210
227
|
## Contributing
|
|
211
228
|
|
|
212
229
|
```bash
|
|
@@ -86,6 +86,9 @@ Manually register DataFrames for tracking.
|
|
|
86
86
|
|
|
87
87
|
Use this when DataFrames are created before `tp.enable()` is called.
|
|
88
88
|
|
|
89
|
+
!!! note "Lineage Break"
|
|
90
|
+
Calling `register()` assigns new row IDs, which breaks lineage from any prior transformations. Use it only for "entry point" DataFrames.
|
|
91
|
+
|
|
89
92
|
**Parameters:**
|
|
90
93
|
|
|
91
94
|
| Parameter | Type | Description |
|
|
@@ -161,14 +164,15 @@ Health check for a DataFrame's lineage.
|
|
|
161
164
|
|
|
162
165
|
| Attribute | Type | Description |
|
|
163
166
|
|-----------|------|-------------|
|
|
164
|
-
| `.
|
|
167
|
+
| `.ok` | `bool` | True if no FACT-level warnings |
|
|
168
|
+
| `.passed` | `bool` | Alias for `.ok` |
|
|
165
169
|
| `.mode` | `str` | Current tracking mode |
|
|
166
|
-
| `.retention` | `float` | Row retention rate (0-1) |
|
|
167
|
-
| `.n_dropped` | `int` | Total dropped
|
|
168
|
-
| `.
|
|
169
|
-
| `.
|
|
170
|
-
| `.
|
|
171
|
-
| `.
|
|
170
|
+
| `.retention` | `float \| None` | Row retention rate (0.0-1.0) |
|
|
171
|
+
| `.n_dropped` | `int` | Total rows dropped |
|
|
172
|
+
| `.n_steps` | `int` | Total pipeline steps recorded |
|
|
173
|
+
| `.drops_by_op` | `dict[str, int]` | Drops by operation name |
|
|
174
|
+
| `.warnings` | `list[CheckWarning]` | Warning objects with details |
|
|
175
|
+
| `.facts` | `dict` | Raw measured facts (for power users) |
|
|
172
176
|
|
|
173
177
|
**Example:**
|
|
174
178
|
|
|
@@ -209,9 +213,11 @@ Trace a row's journey through the pipeline.
|
|
|
209
213
|
| Attribute | Type | Description |
|
|
210
214
|
|-----------|------|-------------|
|
|
211
215
|
| `.row_id` | `int` | Internal row ID |
|
|
212
|
-
| `.
|
|
216
|
+
| `.is_alive` | `bool` | True if row exists in current DataFrame |
|
|
213
217
|
| `.events` | `list` | All events for this row |
|
|
214
|
-
| `.
|
|
218
|
+
| `.dropped_at` | `dict` | Operation that dropped the row (if it was dropped) |
|
|
219
|
+
| `.origin` | `dict` | Where row came from: `{"type": "concat", "source_df": 1}` or `{"type": "merge", "left_parent": 10, "right_parent": 20}` |
|
|
220
|
+
| `.representative` | `dict` | If dropped by dedup: `{"kept_rid": 42, "subset": [...], "keep": "first"}` |
|
|
215
221
|
|
|
216
222
|
**Example:**
|
|
217
223
|
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to TracePipe will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.4.1] - 2026-02-04
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- Fully implemented `CheckResult` convenience properties (`.passed`, `.retention`, `.n_dropped`, `.n_steps`, `.drops_by_op`)
|
|
12
|
+
- Added comprehensive tests for `CheckResult` API to ensure properties work correctly
|
|
13
|
+
- Properties now properly access underlying `.facts` dictionary for all metrics
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
- Cleaned up example files and test scripts
|
|
17
|
+
|
|
18
|
+
## [0.4.0] - 2026-02-04
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
|
|
22
|
+
- **Full row provenance for `pd.concat(axis=0)`**: Row IDs are now preserved through concatenation
|
|
23
|
+
- Each result row maintains its original RID from the source DataFrame
|
|
24
|
+
- `ConcatMapping` tracks which source DataFrame each row came from
|
|
25
|
+
- Concat steps are now marked `FULL` completeness
|
|
26
|
+
|
|
27
|
+
- **Duplicate drop provenance in debug mode**: `drop_duplicates` now tracks which row "won"
|
|
28
|
+
- `DuplicateDropMapping` maps dropped rows to their kept representative
|
|
29
|
+
- Supports `keep='first'`, `keep='last'`, and `keep=False`
|
|
30
|
+
- Uses `hash_pandas_object` for fast, NaN-safe key comparison
|
|
31
|
+
|
|
32
|
+
- **Clean `TraceResult` API for provenance**:
|
|
33
|
+
- `trace.origin` — Unified origin: `{"type": "concat", "source_df": 1}` or `{"type": "merge", ...}`
|
|
34
|
+
- `trace.representative` — For dedup drops: `{"kept_rid": 42, "subset": ["key"], "keep": "first"}`
|
|
35
|
+
- No need to access internal `.store` methods
|
|
36
|
+
|
|
37
|
+
- **Clean `CheckResult` API**:
|
|
38
|
+
- `result.passed` — Alias for `.ok`
|
|
39
|
+
- `result.retention` — Row retention rate (0.0-1.0)
|
|
40
|
+
- `result.n_dropped`, `result.n_steps`, `result.drops_by_op`
|
|
41
|
+
- All properties discoverable via autocomplete
|
|
42
|
+
|
|
43
|
+
- **Comprehensive test suite**: 38 new tests covering concat, dedup, and TraceResult API
|
|
44
|
+
|
|
45
|
+
### Changed
|
|
46
|
+
|
|
47
|
+
- `wrap_concat_with_lineage` rewritten for full provenance tracking
|
|
48
|
+
- `axis=1` concat propagates RIDs if all inputs match, otherwise PARTIAL
|
|
49
|
+
- `TraceResult` enhanced with `.origin` and `.representative` properties
|
|
50
|
+
|
|
51
|
+
## [0.3.5] - 2026-02-03
|
|
52
|
+
|
|
53
|
+
### Fixed
|
|
54
|
+
|
|
55
|
+
- **DataFrame.fillna double-logging**: `df.fillna({"col": 0})` now logs exactly 1 event
|
|
56
|
+
- Added `wrap_pandas_transform_method` with `_in_transform_op` flag
|
|
57
|
+
|
|
58
|
+
### Added
|
|
59
|
+
|
|
60
|
+
- Known Limitations section in README documenting concat/dedup tracking gaps
|
|
61
|
+
|
|
62
|
+
### Changed
|
|
63
|
+
|
|
64
|
+
- Test suite hardened with exact count assertions and multi-scenario tests
|
|
65
|
+
|
|
66
|
+
## [0.3.4] - 2026-02-03
|
|
67
|
+
|
|
68
|
+
### Fixed
|
|
69
|
+
|
|
70
|
+
- **Event deduplication**: Identical events from parallel pipelines are now deduplicated
|
|
71
|
+
|
|
72
|
+
## [0.3.3] - 2026-02-03
|
|
73
|
+
|
|
74
|
+
### Fixed
|
|
75
|
+
|
|
76
|
+
- **Double-logging bug**: `df['col'] = df['col'].fillna()` now logs exactly one event
|
|
77
|
+
- **Merge warning scoping**: `tp.check(df)` now only shows warnings for merges in df's lineage
|
|
78
|
+
|
|
79
|
+
## [0.3.2] - 2026-02-03
|
|
80
|
+
|
|
81
|
+
### Fixed
|
|
82
|
+
|
|
83
|
+
- Merge duplicate key warnings now correctly identify which table (left/right) has duplicates
|
|
84
|
+
|
|
85
|
+
## [0.3.1] - 2026-02-03
|
|
86
|
+
|
|
87
|
+
### Fixed
|
|
88
|
+
|
|
89
|
+
- Cell history now correctly chains through merge operations via lineage traversal
|
|
90
|
+
- `tp.why()` and `tp.trace()` show pre-merge changes for post-merge rows
|
|
91
|
+
- `enable()` resets accumulated state when called multiple times
|
|
92
|
+
|
|
93
|
+
### Added
|
|
94
|
+
|
|
95
|
+
- `get_row_history_with_lineage()` and `get_cell_history_with_lineage()` methods
|
|
96
|
+
|
|
97
|
+
## [0.3.0] - 2026-02-03
|
|
98
|
+
|
|
99
|
+
### Added
|
|
100
|
+
|
|
101
|
+
- MkDocs documentation site with Material theme
|
|
102
|
+
- Comprehensive API reference documentation
|
|
103
|
+
- Getting started guides and tutorials
|
|
104
|
+
- `tp.register()` API for manually registering DataFrames
|
|
105
|
+
- Configurable retention threshold in `tp.check()`
|
|
106
|
+
- Ghost row capture for fallback filter paths
|
|
107
|
+
- Data quality contracts with fluent API
|
|
108
|
+
- HTML report generation
|
|
109
|
+
- Snapshot and diff functionality
|
|
110
|
+
- Debug mode with cell-level tracking
|
|
111
|
+
- `tp.why()` for cell provenance
|
|
112
|
+
- `tp.trace()` for row journey
|
|
113
|
+
- Support for all major pandas operations
|
|
114
|
+
|
|
115
|
+
### Fixed
|
|
116
|
+
|
|
117
|
+
- Recursion bug when accessing hidden column in COLUMN mode
|
|
118
|
+
- Config propagation issues
|
|
119
|
+
- Retention rate calculation for multi-table pipelines
|
|
120
|
+
- Export wrappers correctly strip hidden column
|
|
@@ -68,6 +68,28 @@ When rows are aggregated:
|
|
|
68
68
|
grouped = df.groupby("category").sum() # GROUP event with membership
|
|
69
69
|
```
|
|
70
70
|
|
|
71
|
+
### Concat Events (v0.4+)
|
|
72
|
+
|
|
73
|
+
When DataFrames are concatenated, row IDs are preserved:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
df1 = pd.DataFrame({"a": [1, 2]}) # Rows get IDs: 0, 1
|
|
77
|
+
df2 = pd.DataFrame({"a": [3, 4]}) # Rows get IDs: 2, 3
|
|
78
|
+
|
|
79
|
+
result = pd.concat([df1, df2]) # IDs preserved: 0, 1, 2, 3
|
|
80
|
+
# TracePipe tracks which source DataFrame each row came from
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Duplicate Drop Events (v0.4+)
|
|
84
|
+
|
|
85
|
+
In debug mode, `drop_duplicates` tracks which row was kept as the representative of each dropped duplicate:
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
df = pd.DataFrame({"key": ["A", "A", "B"], "val": [1, 2, 3]})
|
|
89
|
+
df = df.drop_duplicates(subset=["key"], keep="first")
|
|
90
|
+
# The row with val=2 was dropped and mapped to its representative (the kept row with val=1)
|
|
91
|
+
```
|
|
92
|
+
|
|
71
93
|
---
|
|
72
94
|
|
|
73
95
|
## The Lineage Store
|
|
@@ -109,6 +109,70 @@ if trace.merge_parents:
|
|
|
109
109
|
print(f"Right parent: {trace.merge_parents.right}")
|
|
110
110
|
```
|
|
111
111
|
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Concat Origin Tracking (v0.4+)
|
|
115
|
+
|
|
116
|
+
When rows come from concatenated DataFrames, TracePipe tracks their source via `trace.origin`:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
df1 = pd.DataFrame({"a": [1, 2]})
|
|
120
|
+
df2 = pd.DataFrame({"a": [3, 4]})
|
|
121
|
+
result = pd.concat([df1, df2])
|
|
122
|
+
|
|
123
|
+
# Trace a row that came from df2
|
|
124
|
+
trace = tp.trace(result, row=2)
|
|
125
|
+
print(trace.origin)
|
|
126
|
+
# {"type": "concat", "source_df": 1, "step_id": 5}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
The `.origin` property returns a unified dict with:
|
|
130
|
+
|
|
131
|
+
- `type`: `"concat"`, `"merge"`, or `None` (for original rows)
|
|
132
|
+
- `source_df`: Index in the concat list (0=first DataFrame, 1=second, etc.)
|
|
133
|
+
- `step_id`: The pipeline step at which the concat or merge happened
|
|
134
|
+
|
|
135
|
+
Row IDs are preserved through `pd.concat(axis=0)`, so lineage chains correctly:
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
# Transform df1 before concat
|
|
139
|
+
df1["a"] = df1["a"].fillna(0)
|
|
140
|
+
|
|
141
|
+
result = pd.concat([df1, df2])
|
|
142
|
+
|
|
143
|
+
# Rows from df1 still have their fillna history
|
|
144
|
+
trace = tp.trace(result, row=0) # Shows fillna event from df1
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Duplicate Representative Tracking (v0.4+)
|
|
150
|
+
|
|
151
|
+
When `drop_duplicates` removes rows, TracePipe tracks which row "won" via `trace.representative`:
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
df = pd.DataFrame({
|
|
155
|
+
"key": ["A", "A", "B"],
|
|
156
|
+
"value": [100, 200, 300]
|
|
157
|
+
})
|
|
158
|
+
df = df.drop_duplicates(subset=["key"], keep="first")
|
|
159
|
+
|
|
160
|
+
# Trace the dropped row (value=200)
|
|
161
|
+
trace = tp.trace(df, row=dropped_row_id)
|
|
162
|
+
print(trace.representative)
|
|
163
|
+
# {"kept_rid": 42, "subset": ["key"], "keep": "first"}
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
The `.representative` property is only set for rows dropped by `drop_duplicates`; its `kept_rid` depends on the `keep` strategy:
|
|
167
|
+
|
|
168
|
+
| `keep` Strategy | `.representative` |
|
|
169
|
+
|-----------------|-------------------|
|
|
170
|
+
| `keep='first'` | `{"kept_rid": 42, ...}` — first occurrence kept |
|
|
171
|
+
| `keep='last'` | `{"kept_rid": 45, ...}` — last occurrence kept |
|
|
172
|
+
| `keep=False` | `{"kept_rid": None, ...}` — all duplicates removed |
|
|
173
|
+
|
|
174
|
+
This answers "why did this row disappear?" — it wasn't deleted, it was deduplicated.
|
|
175
|
+
|
|
112
176
|
## Performance Considerations
|
|
113
177
|
|
|
114
178
|
- Row tracing in CI mode is limited (no individual row IDs)
|
|
@@ -159,12 +159,14 @@ print(tp.why(df, "price", 0)) # Why price changed
|
|
|
159
159
|
|
|
160
160
|
| Operation | Tracking | Completeness |
|
|
161
161
|
|-----------|----------|--------------|
|
|
162
|
-
| `dropna`, `
|
|
163
|
-
| `
|
|
162
|
+
| `dropna`, `query`, `df[mask]` | Dropped row IDs | Full |
|
|
163
|
+
| `drop_duplicates` | Dropped→kept mapping (debug mode) | Full |
|
|
164
164
|
| `head`, `tail`, `sample` | Dropped row IDs | Full |
|
|
165
165
|
| `fillna`, `replace` | Cell diffs (watched cols) | Full |
|
|
166
166
|
| `loc[]=`, `iloc[]=`, `at[]=` | Cell diffs | Full |
|
|
167
167
|
| `merge`, `join` | Parent tracking | Full |
|
|
168
|
+
| `pd.concat(axis=0)` | Row IDs + source DataFrame | Full |
|
|
169
|
+
| `pd.concat(axis=1)` | Row IDs (if aligned) | Partial |
|
|
168
170
|
| `groupby().agg()` | Group membership | Full |
|
|
169
171
|
| `apply`, `pipe` | Output tracked | Partial |
|
|
170
172
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
site_name: TracePipe
|
|
2
2
|
site_description: Row-level data lineage tracking for pandas pipelines
|
|
3
|
-
site_url: https://
|
|
3
|
+
site_url: https://gauthierpiarrette.github.io/tracepipe/
|
|
4
4
|
repo_url: https://github.com/gauthierpiarrette/tracepipe
|
|
5
5
|
repo_name: gauthierpiarrette/tracepipe
|
|
6
6
|
edit_uri: edit/main/docs/
|
|
@@ -304,7 +304,9 @@ class TestRowLineageResult:
|
|
|
304
304
|
|
|
305
305
|
row = dbg().explain_row(0)
|
|
306
306
|
history = row.cell_history("a")
|
|
307
|
-
assert
|
|
307
|
+
assert (
|
|
308
|
+
len(history) == 1
|
|
309
|
+
), f"Single fillna should record exactly 1 change, got {len(history)}"
|
|
308
310
|
|
|
309
311
|
def test_history(self):
|
|
310
312
|
"""history() returns full history."""
|
|
@@ -487,7 +489,9 @@ class TestPreEnableDataFrameTracking:
|
|
|
487
489
|
df["a"] = df["a"] * 10
|
|
488
490
|
|
|
489
491
|
result = tracepipe.why(df, col="a", row=0)
|
|
490
|
-
assert
|
|
492
|
+
assert (
|
|
493
|
+
len(result.history) == 1
|
|
494
|
+
), f"Single multiply should record exactly 1 change, got {len(result.history)}"
|
|
491
495
|
|
|
492
496
|
def test_trace_after_register(self):
|
|
493
497
|
"""Row tracing works for registered DataFrames."""
|