tracepipe 0.3.5__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tracepipe-0.3.5 → tracepipe-0.4.1}/CHANGELOG.md +70 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/PKG-INFO +6 -9
- {tracepipe-0.3.5 → tracepipe-0.4.1}/README.md +5 -8
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/api/core.md +15 -9
- tracepipe-0.4.1/docs/changelog.md +120 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/concepts.md +22 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/row-tracing.md +64 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/index.md +4 -2
- {tracepipe-0.3.5 → tracepipe-0.4.1}/mkdocs.yml +1 -1
- {tracepipe-0.3.5 → tracepipe-0.4.1}/pyproject.toml +1 -1
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_api.py +6 -2
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_convenience_debug.py +122 -1
- tracepipe-0.4.1/tests/test_integration_scenarios.py +361 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_lineage_through_merge.py +28 -26
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_pandas_inst.py +20 -10
- tracepipe-0.4.1/tests/test_row_provenance.py +684 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/__init__.py +1 -1
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/convenience.py +130 -7
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/core.py +79 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/filter_capture.py +103 -1
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/merge_capture.py +169 -23
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/storage/lineage_store.py +92 -7
- {tracepipe-0.3.5 → tracepipe-0.4.1}/uv.lock +317 -248
- tracepipe-0.3.5/docs/changelog.md +0 -39
- tracepipe-0.3.5/examples/comprehensive_demo.py +0 -694
- tracepipe-0.3.5/examples/red_team_test.py +0 -437
- {tracepipe-0.3.5 → tracepipe-0.4.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/.github/workflows/ci.yml +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/.github/workflows/docs.yml +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/.github/workflows/release.yml +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/.gitignore +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/.pre-commit-config.yaml +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/CONTRIBUTING.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/LICENSE +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/benchmarks/README.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/benchmarks/bench_memory.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/benchmarks/bench_overhead.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/benchmarks/bench_scale.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/benchmarks/run_all.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/api/contracts.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/api/debug.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/api/index.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/contributing.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/examples/data-validation.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/examples/ml-pipeline.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/getting-started/installation.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/getting-started/modes.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/getting-started/quickstart.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/cell-provenance.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/contracts.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/health-checks.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/reports.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/snapshots.md +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/examples/demo.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/examples/ml_pipeline_demo.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/404.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/api/contracts/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/api/core/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/api/debug/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/api/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/_mkdocstrings.css +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/images/favicon.png +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/bundle.79ae519e.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/bundle.79ae519e.min.js.map +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ar.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.da.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.de.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.du.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.el.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.es.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.fi.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.fr.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.he.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.hi.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.hu.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.hy.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.it.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ja.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.jp.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.kn.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ko.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.multi.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.nl.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.no.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.pt.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ro.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ru.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.sa.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.stemmer.support.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.sv.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ta.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.te.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.th.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.tr.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.vi.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.zh.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/tinyseg.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/wordcut.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/workers/search.2c215733.min.js +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/workers/search.2c215733.min.js.map +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/stylesheets/main.484c7ddc.min.css +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/stylesheets/main.484c7ddc.min.css.map +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/stylesheets/palette.ab4e12ef.min.css +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/stylesheets/palette.ab4e12ef.min.css.map +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/changelog/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/contributing/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/examples/data-validation/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/examples/ml-pipeline/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/getting-started/installation/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/getting-started/modes/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/getting-started/quickstart/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/cell-provenance/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/concepts/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/contracts/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/health-checks/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/reports/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/row-tracing/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/snapshots/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/index.html +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/objects.inv +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/search/search_index.json +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/sitemap.xml +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/site/sitemap.xml.gz +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/__init__.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/conftest.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_concurrency.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_contracts.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_edge_cases.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_integration.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_io_operations.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_public_api.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_snapshot.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_version_matrix.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/api.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/context.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/contracts.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/debug.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/__init__.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/apply_capture.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/indexer_capture.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/pandas_inst.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/series_capture.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/safety.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/snapshot.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/storage/__init__.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/storage/base.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/storage/row_identity.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/utils/__init__.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/utils/value_capture.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/value_provenance.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/visualization/__init__.py +0 -0
- {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/visualization/html_export.py +0 -0
|
@@ -5,6 +5,67 @@ All notable changes to TracePipe will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## 0.4.1 - 2026-02-04
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- Fully implemented `CheckResult` convenience properties (`.passed`, `.retention`, `.n_dropped`, `.n_steps`, `.drops_by_op`)
|
|
12
|
+
- Added comprehensive tests for `CheckResult` API to ensure properties work correctly
|
|
13
|
+
- Properties now properly access underlying `.facts` dictionary for all metrics
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
- Cleaned up example files and test scripts
|
|
17
|
+
|
|
18
|
+
## 0.4.0 - 2026-02-04
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
- **Full row provenance for `pd.concat(axis=0)`**: Row IDs are now preserved through concatenation
|
|
22
|
+
- Each result row maintains its original RID from the source DataFrame
|
|
23
|
+
- `ConcatMapping` tracks which source DataFrame each row came from
|
|
24
|
+
- Concat steps are now marked `FULL` completeness (previously `PARTIAL`)
|
|
25
|
+
|
|
26
|
+
- **Duplicate drop provenance in debug mode**: `drop_duplicates` now tracks which row "won"
|
|
27
|
+
- `DuplicateDropMapping` maps dropped rows to their kept representative
|
|
28
|
+
- Supports `keep='first'`, `keep='last'`, and `keep=False`
|
|
29
|
+
- Uses `hash_pandas_object` for fast, NaN-safe key comparison
|
|
30
|
+
|
|
31
|
+
- **Clean `TraceResult` API for provenance** (UX improvement):
|
|
32
|
+
- `trace.origin` — Unified origin info: `{"type": "concat", "source_df": 1}` or `{"type": "merge", "left_parent": 10, "right_parent": 20}`
|
|
33
|
+
- `trace.representative` — For dedup-dropped rows: `{"kept_rid": 42, "subset": ["key"], "keep": "first"}`
|
|
34
|
+
- No need to access internal `.store` methods — everything is in `tp.trace()` result
|
|
35
|
+
|
|
36
|
+
- **Clean `CheckResult` API** (UX improvement):
|
|
37
|
+
- `result.passed` — Alias for `.ok` (common naming convention)
|
|
38
|
+
- `result.retention` — Row retention rate (0.0-1.0) from `.facts`
|
|
39
|
+
- `result.n_dropped` — Total rows dropped
|
|
40
|
+
- `result.n_steps` — Total pipeline steps recorded
|
|
41
|
+
- `result.drops_by_op` — Drops broken down by operation name
|
|
42
|
+
- All properties are now discoverable via autocomplete
|
|
43
|
+
|
|
44
|
+
- **New data structures in `core.py`**:
|
|
45
|
+
- `ConcatMapping`: Tracks row provenance through concat operations
|
|
46
|
+
- `DuplicateDropMapping`: Tracks dropped->kept relationships in drop_duplicates
|
|
47
|
+
|
|
48
|
+
- **Comprehensive test suite**: 38 new tests in `test_row_provenance.py` covering:
|
|
49
|
+
- Concat RID preservation, ignore_index, after sort, with empty DFs, chained concats
|
|
50
|
+
- Axis=1 same-RID propagation vs different-RID PARTIAL marking
|
|
51
|
+
- Drop_duplicates keep='first'/'last'/False mapping correctness
|
|
52
|
+
- NaN handling parity with pandas `duplicated()`
|
|
53
|
+
- Integration: concat→merge, filter→concat, dedup→fillna lineage
|
|
54
|
+
- TraceResult `.origin` and `.representative` property tests
|
|
55
|
+
|
|
56
|
+
### Changed
|
|
57
|
+
- `wrap_concat_with_lineage` rewritten for full provenance tracking
|
|
58
|
+
- Captures source RIDs before operation
|
|
59
|
+
- Propagates RIDs (not new registration) for axis=0
|
|
60
|
+
- Stores both positional and sorted arrays, supporting "explain row i" queries as well as O(log n) lookup
|
|
61
|
+
- Axis=1 propagates RIDs if all inputs match, otherwise PARTIAL
|
|
62
|
+
|
|
63
|
+
- `_capture_filter_with_mask` enhanced to store `DuplicateDropMapping` in debug mode
|
|
64
|
+
|
|
65
|
+
- `TraceResult` enhanced with `.origin` and `.representative` properties
|
|
66
|
+
- `.to_text()` now displays origin and representative info
|
|
67
|
+
- `.to_dict()` includes all provenance info
|
|
68
|
+
|
|
8
69
|
## 0.3.5 - 2026-02-03
|
|
9
70
|
|
|
10
71
|
### Fixed
|
|
@@ -17,6 +78,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
17
78
|
- Known Limitations section in README documenting concat/dedup tracking gaps
|
|
18
79
|
- Test for `DataFrame.fillna` single-event logging
|
|
19
80
|
|
|
81
|
+
### Changed
|
|
82
|
+
- **Test suite hardened** with exact count assertions and multi-scenario tests:
|
|
83
|
+
- Changed 15+ assertions from `>= 1` to `== 1` for precise verification
|
|
84
|
+
- Added `test_integration_scenarios.py` with 16 new tests covering:
|
|
85
|
+
- Multi-pipeline session isolation
|
|
86
|
+
- Warning message content verification
|
|
87
|
+
- Reliability scenarios (fillna, replace, loc, merge)
|
|
88
|
+
- Cross-pipeline contamination prevention
|
|
89
|
+
|
|
20
90
|
## 0.3.4 - 2026-02-03
|
|
21
91
|
|
|
22
92
|
### Fixed
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tracepipe
|
|
3
|
-
Version: 0.3.5
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: Row-level data lineage tracking for pandas pipelines
|
|
5
5
|
Project-URL: Homepage, https://github.com/tracepipe/tracepipe
|
|
6
6
|
Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
|
|
@@ -278,7 +278,7 @@ tp.enable(mode="debug") # Full lineage
|
|
|
278
278
|
|
|
279
279
|
## Known Limitations
|
|
280
280
|
|
|
281
|
-
TracePipe tracks **cell mutations**
|
|
281
|
+
TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
|
|
282
282
|
|
|
283
283
|
| Pattern | Status | Notes |
|
|
284
284
|
|---------|--------|-------|
|
|
@@ -286,13 +286,10 @@ TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merg
|
|
|
286
286
|
| `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
|
|
287
287
|
| `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
|
|
288
288
|
| `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
|
|
289
|
-
| `pd.concat([df1, df2])` |
|
|
290
|
-
| `df.drop_duplicates(
|
|
291
|
-
|
|
|
292
|
-
|
|
293
|
-
**Why?** TracePipe tracks value changes within rows, not row-selection operations. When `drop_duplicates` picks one row over another, that's a provenance decision (not a cell mutation) that isn't currently instrumented.
|
|
294
|
-
|
|
295
|
-
**Planned for 0.4**: Full row-provenance tracking for concat, drop_duplicates, and sort operations.
|
|
289
|
+
| `pd.concat([df1, df2])` | ✅ Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
|
|
290
|
+
| `df.drop_duplicates()` | ✅ Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
|
|
291
|
+
| `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
|
|
292
|
+
| Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
|
|
296
293
|
|
|
297
294
|
---
|
|
298
295
|
|
|
@@ -209,7 +209,7 @@ tp.enable(mode="debug") # Full lineage
|
|
|
209
209
|
|
|
210
210
|
## Known Limitations
|
|
211
211
|
|
|
212
|
-
TracePipe tracks **cell mutations**
|
|
212
|
+
TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
|
|
213
213
|
|
|
214
214
|
| Pattern | Status | Notes |
|
|
215
215
|
|---------|--------|-------|
|
|
@@ -217,13 +217,10 @@ TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merg
|
|
|
217
217
|
| `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
|
|
218
218
|
| `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
|
|
219
219
|
| `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
|
|
220
|
-
| `pd.concat([df1, df2])` |
|
|
221
|
-
| `df.drop_duplicates(
|
|
222
|
-
|
|
|
223
|
-
|
|
224
|
-
**Why?** TracePipe tracks value changes within rows, not row-selection operations. When `drop_duplicates` picks one row over another, that's a provenance decision (not a cell mutation) that isn't currently instrumented.
|
|
225
|
-
|
|
226
|
-
**Planned for 0.4**: Full row-provenance tracking for concat, drop_duplicates, and sort operations.
|
|
220
|
+
| `pd.concat([df1, df2])` | ✅ Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
|
|
221
|
+
| `df.drop_duplicates()` | ✅ Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
|
|
222
|
+
| `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
|
|
223
|
+
| Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
|
|
227
224
|
|
|
228
225
|
---
|
|
229
226
|
|
|
@@ -86,6 +86,9 @@ Manually register DataFrames for tracking.
|
|
|
86
86
|
|
|
87
87
|
Use this when DataFrames are created before `tp.enable()` is called.
|
|
88
88
|
|
|
89
|
+
!!! note "Lineage Break"
|
|
90
|
+
Calling `register()` assigns new row IDs, which breaks lineage from any prior transformations. Use it only for "entry point" DataFrames.
|
|
91
|
+
|
|
89
92
|
**Parameters:**
|
|
90
93
|
|
|
91
94
|
| Parameter | Type | Description |
|
|
@@ -161,14 +164,15 @@ Health check for a DataFrame's lineage.
|
|
|
161
164
|
|
|
162
165
|
| Attribute | Type | Description |
|
|
163
166
|
|-----------|------|-------------|
|
|
164
|
-
| `.
|
|
167
|
+
| `.ok` | `bool` | True if no FACT-level warnings |
|
|
168
|
+
| `.passed` | `bool` | Alias for `.ok` |
|
|
165
169
|
| `.mode` | `str` | Current tracking mode |
|
|
166
|
-
| `.retention` | `float` | Row retention rate (0-1) |
|
|
167
|
-
| `.n_dropped` | `int` | Total dropped
|
|
168
|
-
| `.
|
|
169
|
-
| `.
|
|
170
|
-
| `.
|
|
171
|
-
| `.
|
|
170
|
+
| `.retention` | `float \| None` | Row retention rate (0.0-1.0) |
|
|
171
|
+
| `.n_dropped` | `int` | Total rows dropped |
|
|
172
|
+
| `.n_steps` | `int` | Total pipeline steps recorded |
|
|
173
|
+
| `.drops_by_op` | `dict[str, int]` | Drops by operation name |
|
|
174
|
+
| `.warnings` | `list[CheckWarning]` | Warning objects with details |
|
|
175
|
+
| `.facts` | `dict` | Raw measured facts (for power users) |
|
|
172
176
|
|
|
173
177
|
**Example:**
|
|
174
178
|
|
|
@@ -209,9 +213,11 @@ Trace a row's journey through the pipeline.
|
|
|
209
213
|
| Attribute | Type | Description |
|
|
210
214
|
|-----------|------|-------------|
|
|
211
215
|
| `.row_id` | `int` | Internal row ID |
|
|
212
|
-
| `.
|
|
216
|
+
| `.is_alive` | `bool` | True if row exists in current DataFrame |
|
|
213
217
|
| `.events` | `list` | All events for this row |
|
|
214
|
-
| `.
|
|
218
|
+
| `.dropped_at` | `dict` | Operation that dropped (if dropped) |
|
|
219
|
+
| `.origin` | `dict` | Where row came from: `{"type": "concat", "source_df": 1}` or `{"type": "merge", "left_parent": 10, "right_parent": 20}` |
|
|
220
|
+
| `.representative` | `dict` | If dropped by dedup: `{"kept_rid": 42, "subset": [...], "keep": "first"}` |
|
|
215
221
|
|
|
216
222
|
**Example:**
|
|
217
223
|
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to TracePipe will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.4.1] - 2026-02-04
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- Fully implemented `CheckResult` convenience properties (`.passed`, `.retention`, `.n_dropped`, `.n_steps`, `.drops_by_op`)
|
|
12
|
+
- Added comprehensive tests for `CheckResult` API to ensure properties work correctly
|
|
13
|
+
- Properties now properly access underlying `.facts` dictionary for all metrics
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
- Cleaned up example files and test scripts
|
|
17
|
+
|
|
18
|
+
## [0.4.0] - 2026-02-04
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
|
|
22
|
+
- **Full row provenance for `pd.concat(axis=0)`**: Row IDs are now preserved through concatenation
|
|
23
|
+
- Each result row maintains its original RID from the source DataFrame
|
|
24
|
+
- `ConcatMapping` tracks which source DataFrame each row came from
|
|
25
|
+
- Concat steps are now marked `FULL` completeness
|
|
26
|
+
|
|
27
|
+
- **Duplicate drop provenance in debug mode**: `drop_duplicates` now tracks which row "won"
|
|
28
|
+
- `DuplicateDropMapping` maps dropped rows to their kept representative
|
|
29
|
+
- Supports `keep='first'`, `keep='last'`, and `keep=False`
|
|
30
|
+
- Uses `hash_pandas_object` for fast, NaN-safe key comparison
|
|
31
|
+
|
|
32
|
+
- **Clean `TraceResult` API for provenance**:
|
|
33
|
+
- `trace.origin` — Unified origin: `{"type": "concat", "source_df": 1}` or `{"type": "merge", ...}`
|
|
34
|
+
- `trace.representative` — For dedup drops: `{"kept_rid": 42, "subset": ["key"], "keep": "first"}`
|
|
35
|
+
- No need to access internal `.store` methods
|
|
36
|
+
|
|
37
|
+
- **Clean `CheckResult` API**:
|
|
38
|
+
- `result.passed` — Alias for `.ok`
|
|
39
|
+
- `result.retention` — Row retention rate (0.0-1.0)
|
|
40
|
+
- `result.n_dropped`, `result.n_steps`, `result.drops_by_op`
|
|
41
|
+
- All properties discoverable via autocomplete
|
|
42
|
+
|
|
43
|
+
- **Comprehensive test suite**: 38 new tests covering concat, dedup, and TraceResult API
|
|
44
|
+
|
|
45
|
+
### Changed
|
|
46
|
+
|
|
47
|
+
- `wrap_concat_with_lineage` rewritten for full provenance tracking
|
|
48
|
+
- `axis=1` concat propagates RIDs if all inputs match, otherwise PARTIAL
|
|
49
|
+
- `TraceResult` enhanced with `.origin` and `.representative` properties
|
|
50
|
+
|
|
51
|
+
## [0.3.5] - 2026-02-03
|
|
52
|
+
|
|
53
|
+
### Fixed
|
|
54
|
+
|
|
55
|
+
- **DataFrame.fillna double-logging**: `df.fillna({"col": 0})` now logs exactly 1 event
|
|
56
|
+
- Added `wrap_pandas_transform_method` with `_in_transform_op` flag
|
|
57
|
+
|
|
58
|
+
### Added
|
|
59
|
+
|
|
60
|
+
- Known Limitations section in README documenting concat/dedup tracking gaps
|
|
61
|
+
|
|
62
|
+
### Changed
|
|
63
|
+
|
|
64
|
+
- Test suite hardened with exact count assertions and multi-scenario tests
|
|
65
|
+
|
|
66
|
+
## [0.3.4] - 2026-02-03
|
|
67
|
+
|
|
68
|
+
### Fixed
|
|
69
|
+
|
|
70
|
+
- **Event deduplication**: Identical events from parallel pipelines are now deduplicated
|
|
71
|
+
|
|
72
|
+
## [0.3.3] - 2026-02-03
|
|
73
|
+
|
|
74
|
+
### Fixed
|
|
75
|
+
|
|
76
|
+
- **Double-logging bug**: `df['col'] = df['col'].fillna()` now logs exactly one event
|
|
77
|
+
- **Merge warning scoping**: `tp.check(df)` now only shows warnings for merges in df's lineage
|
|
78
|
+
|
|
79
|
+
## [0.3.2] - 2026-02-03
|
|
80
|
+
|
|
81
|
+
### Fixed
|
|
82
|
+
|
|
83
|
+
- Merge duplicate key warnings now correctly identify which table (left/right) has duplicates
|
|
84
|
+
|
|
85
|
+
## [0.3.1] - 2026-02-03
|
|
86
|
+
|
|
87
|
+
### Fixed
|
|
88
|
+
|
|
89
|
+
- Cell history now correctly chains through merge operations via lineage traversal
|
|
90
|
+
- `tp.why()` and `tp.trace()` show pre-merge changes for post-merge rows
|
|
91
|
+
- `enable()` resets accumulated state when called multiple times
|
|
92
|
+
|
|
93
|
+
### Added
|
|
94
|
+
|
|
95
|
+
- `get_row_history_with_lineage()` and `get_cell_history_with_lineage()` methods
|
|
96
|
+
|
|
97
|
+
## [0.3.0] - 2026-02-03
|
|
98
|
+
|
|
99
|
+
### Added
|
|
100
|
+
|
|
101
|
+
- MkDocs documentation site with Material theme
|
|
102
|
+
- Comprehensive API reference documentation
|
|
103
|
+
- Getting started guides and tutorials
|
|
104
|
+
- `tp.register()` API for manually registering DataFrames
|
|
105
|
+
- Configurable retention threshold in `tp.check()`
|
|
106
|
+
- Ghost row capture for fallback filter paths
|
|
107
|
+
- Data quality contracts with fluent API
|
|
108
|
+
- HTML report generation
|
|
109
|
+
- Snapshot and diff functionality
|
|
110
|
+
- Debug mode with cell-level tracking
|
|
111
|
+
- `tp.why()` for cell provenance
|
|
112
|
+
- `tp.trace()` for row journey
|
|
113
|
+
- Support for all major pandas operations
|
|
114
|
+
|
|
115
|
+
### Fixed
|
|
116
|
+
|
|
117
|
+
- Recursion bug when accessing hidden column in COLUMN mode
|
|
118
|
+
- Config propagation issues
|
|
119
|
+
- Retention rate calculation for multi-table pipelines
|
|
120
|
+
- Export wrappers correctly strip hidden column
|
|
@@ -68,6 +68,28 @@ When rows are aggregated:
|
|
|
68
68
|
grouped = df.groupby("category").sum() # GROUP event with membership
|
|
69
69
|
```
|
|
70
70
|
|
|
71
|
+
### Concat Events (v0.4+)
|
|
72
|
+
|
|
73
|
+
When DataFrames are concatenated, row IDs are preserved:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
df1 = pd.DataFrame({"a": [1, 2]}) # Rows get IDs: 0, 1
|
|
77
|
+
df2 = pd.DataFrame({"a": [3, 4]}) # Rows get IDs: 2, 3
|
|
78
|
+
|
|
79
|
+
result = pd.concat([df1, df2]) # IDs preserved: 0, 1, 2, 3
|
|
80
|
+
# TracePipe tracks which source DataFrame each row came from
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Duplicate Drop Events (v0.4+)
|
|
84
|
+
|
|
85
|
+
In debug mode, `drop_duplicates` tracks which row was kept as representative:
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
df = pd.DataFrame({"key": ["A", "A", "B"], "val": [1, 2, 3]})
|
|
89
|
+
df = df.drop_duplicates(subset=["key"], keep="first")
|
|
90
|
+
# Row with val=2 was dropped, mapped to representative (val=1)
|
|
91
|
+
```
|
|
92
|
+
|
|
71
93
|
---
|
|
72
94
|
|
|
73
95
|
## The Lineage Store
|
|
@@ -109,6 +109,70 @@ if trace.merge_parents:
|
|
|
109
109
|
print(f"Right parent: {trace.merge_parents.right}")
|
|
110
110
|
```
|
|
111
111
|
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Concat Origin Tracking (v0.4+)
|
|
115
|
+
|
|
116
|
+
When rows come from concatenated DataFrames, TracePipe tracks their source via `trace.origin`:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
df1 = pd.DataFrame({"a": [1, 2]})
|
|
120
|
+
df2 = pd.DataFrame({"a": [3, 4]})
|
|
121
|
+
result = pd.concat([df1, df2])
|
|
122
|
+
|
|
123
|
+
# Trace a row that came from df2
|
|
124
|
+
trace = tp.trace(result, row=2)
|
|
125
|
+
print(trace.origin)
|
|
126
|
+
# {"type": "concat", "source_df": 1, "step_id": 5}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
The `.origin` property returns a unified dict with:
|
|
130
|
+
|
|
131
|
+
- `type`: `"concat"`, `"merge"`, or `None` (for original rows)
|
|
132
|
+
- `source_df`: For concat origins, the index in the concat list (0=first DataFrame, 1=second, etc.); merge origins carry `left_parent` and `right_parent` instead
|
|
133
|
+
- `step_id`: Which pipeline step
|
|
134
|
+
|
|
135
|
+
Row IDs are preserved through `pd.concat(axis=0)`, so lineage chains correctly:
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
# Transform df1 before concat
|
|
139
|
+
df1["a"] = df1["a"].fillna(0)
|
|
140
|
+
|
|
141
|
+
result = pd.concat([df1, df2])
|
|
142
|
+
|
|
143
|
+
# Rows from df1 still have their fillna history
|
|
144
|
+
trace = tp.trace(result, row=0) # Shows fillna event from df1
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Duplicate Representative Tracking (v0.4+)
|
|
150
|
+
|
|
151
|
+
When `drop_duplicates` removes rows, TracePipe tracks which row "won" via `trace.representative`:
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
df = pd.DataFrame({
|
|
155
|
+
"key": ["A", "A", "B"],
|
|
156
|
+
"value": [100, 200, 300]
|
|
157
|
+
})
|
|
158
|
+
df = df.drop_duplicates(subset=["key"], keep="first")
|
|
159
|
+
|
|
160
|
+
# Trace the dropped row (value=200)
|
|
161
|
+
trace = tp.trace(df, row=dropped_row_id)
|
|
162
|
+
print(trace.representative)
|
|
163
|
+
# {"kept_rid": 42, "subset": ["key"], "keep": "first"}
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
The `.representative` property is only set for rows dropped by `drop_duplicates`:
|
|
167
|
+
|
|
168
|
+
| `keep` Strategy | `.representative` |
|
|
169
|
+
|-----------------|-------------------|
|
|
170
|
+
| `keep='first'` | `{"kept_rid": 42, ...}` — first occurrence kept |
|
|
171
|
+
| `keep='last'` | `{"kept_rid": 45, ...}` — last occurrence kept |
|
|
172
|
+
| `keep=False` | `{"kept_rid": None, ...}` — all duplicates removed |
|
|
173
|
+
|
|
174
|
+
This answers "why did this row disappear?" — it wasn't deleted, it was deduplicated.
|
|
175
|
+
|
|
112
176
|
## Performance Considerations
|
|
113
177
|
|
|
114
178
|
- Row tracing in CI mode is limited (no individual row IDs)
|
|
@@ -159,12 +159,14 @@ print(tp.why(df, "price", 0)) # Why price changed
|
|
|
159
159
|
|
|
160
160
|
| Operation | Tracking | Completeness |
|
|
161
161
|
|-----------|----------|--------------|
|
|
162
|
-
| `dropna`, `
|
|
163
|
-
| `
|
|
162
|
+
| `dropna`, `query`, `df[mask]` | Dropped row IDs | Full |
|
|
163
|
+
| `drop_duplicates` | Dropped→kept mapping (debug mode) | Full |
|
|
164
164
|
| `head`, `tail`, `sample` | Dropped row IDs | Full |
|
|
165
165
|
| `fillna`, `replace` | Cell diffs (watched cols) | Full |
|
|
166
166
|
| `loc[]=`, `iloc[]=`, `at[]=` | Cell diffs | Full |
|
|
167
167
|
| `merge`, `join` | Parent tracking | Full |
|
|
168
|
+
| `pd.concat(axis=0)` | Row IDs + source DataFrame | Full |
|
|
169
|
+
| `pd.concat(axis=1)` | Row IDs (if aligned) | Partial |
|
|
168
170
|
| `groupby().agg()` | Group membership | Full |
|
|
169
171
|
| `apply`, `pipe` | Output tracked | Partial |
|
|
170
172
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
site_name: TracePipe
|
|
2
2
|
site_description: Row-level data lineage tracking for pandas pipelines
|
|
3
|
-
site_url: https://
|
|
3
|
+
site_url: https://gauthierpiarrette.github.io/tracepipe/
|
|
4
4
|
repo_url: https://github.com/gauthierpiarrette/tracepipe
|
|
5
5
|
repo_name: gauthierpiarrette/tracepipe
|
|
6
6
|
edit_uri: edit/main/docs/
|
|
@@ -304,7 +304,9 @@ class TestRowLineageResult:
|
|
|
304
304
|
|
|
305
305
|
row = dbg().explain_row(0)
|
|
306
306
|
history = row.cell_history("a")
|
|
307
|
-
assert
|
|
307
|
+
assert (
|
|
308
|
+
len(history) == 1
|
|
309
|
+
), f"Single fillna should record exactly 1 change, got {len(history)}"
|
|
308
310
|
|
|
309
311
|
def test_history(self):
|
|
310
312
|
"""history() returns full history."""
|
|
@@ -487,7 +489,9 @@ class TestPreEnableDataFrameTracking:
|
|
|
487
489
|
df["a"] = df["a"] * 10
|
|
488
490
|
|
|
489
491
|
result = tracepipe.why(df, col="a", row=0)
|
|
490
|
-
assert
|
|
492
|
+
assert (
|
|
493
|
+
len(result.history) == 1
|
|
494
|
+
), f"Single multiply should record exactly 1 change, got {len(result.history)}"
|
|
491
495
|
|
|
492
496
|
def test_trace_after_register(self):
|
|
493
497
|
"""Row tracing works for registered DataFrames."""
|
|
@@ -192,6 +192,125 @@ class TestCheckResult:
|
|
|
192
192
|
assert d["facts"]["key"] == "value"
|
|
193
193
|
assert len(d["warnings"]) == 1
|
|
194
194
|
|
|
195
|
+
# === CONVENIENCE PROPERTY TESTS (v0.4+) ===
|
|
196
|
+
|
|
197
|
+
def test_passed_property_alias_for_ok(self):
|
|
198
|
+
"""passed is an alias for ok."""
|
|
199
|
+
result_ok = CheckResult(ok=True, warnings=[], facts={}, suggestions=[], mode="debug")
|
|
200
|
+
result_fail = CheckResult(ok=False, warnings=[], facts={}, suggestions=[], mode="debug")
|
|
201
|
+
assert result_ok.passed is True
|
|
202
|
+
assert result_ok.passed == result_ok.ok
|
|
203
|
+
assert result_fail.passed is False
|
|
204
|
+
assert result_fail.passed == result_fail.ok
|
|
205
|
+
|
|
206
|
+
def test_retention_property(self):
|
|
207
|
+
"""retention returns retention_rate from facts."""
|
|
208
|
+
result = CheckResult(
|
|
209
|
+
ok=True,
|
|
210
|
+
warnings=[],
|
|
211
|
+
facts={"retention_rate": 0.847},
|
|
212
|
+
suggestions=[],
|
|
213
|
+
mode="debug",
|
|
214
|
+
)
|
|
215
|
+
assert result.retention == 0.847
|
|
216
|
+
|
|
217
|
+
def test_retention_property_none_when_missing(self):
|
|
218
|
+
"""retention returns None when retention_rate not in facts."""
|
|
219
|
+
result = CheckResult(ok=True, warnings=[], facts={}, suggestions=[], mode="debug")
|
|
220
|
+
assert result.retention is None
|
|
221
|
+
|
|
222
|
+
def test_n_dropped_property(self):
|
|
223
|
+
"""n_dropped returns rows_dropped from facts."""
|
|
224
|
+
result = CheckResult(
|
|
225
|
+
ok=True,
|
|
226
|
+
warnings=[],
|
|
227
|
+
facts={"rows_dropped": 153},
|
|
228
|
+
suggestions=[],
|
|
229
|
+
mode="debug",
|
|
230
|
+
)
|
|
231
|
+
assert result.n_dropped == 153
|
|
232
|
+
|
|
233
|
+
def test_n_dropped_property_zero_default(self):
|
|
234
|
+
"""n_dropped returns 0 when rows_dropped not in facts."""
|
|
235
|
+
result = CheckResult(ok=True, warnings=[], facts={}, suggestions=[], mode="debug")
|
|
236
|
+
assert result.n_dropped == 0
|
|
237
|
+
|
|
238
|
+
def test_n_steps_property(self):
|
|
239
|
+
"""n_steps returns total_steps from facts."""
|
|
240
|
+
result = CheckResult(
|
|
241
|
+
ok=True,
|
|
242
|
+
warnings=[],
|
|
243
|
+
facts={"total_steps": 5},
|
|
244
|
+
suggestions=[],
|
|
245
|
+
mode="debug",
|
|
246
|
+
)
|
|
247
|
+
assert result.n_steps == 5
|
|
248
|
+
|
|
249
|
+
def test_drops_by_op_property(self):
|
|
250
|
+
"""drops_by_op returns the _drops_by_op dict."""
|
|
251
|
+
result = CheckResult(
|
|
252
|
+
ok=True,
|
|
253
|
+
warnings=[],
|
|
254
|
+
facts={},
|
|
255
|
+
suggestions=[],
|
|
256
|
+
mode="debug",
|
|
257
|
+
_drops_by_op={"dropna": 42, "filter": 111},
|
|
258
|
+
)
|
|
259
|
+
assert result.drops_by_op == {"dropna": 42, "filter": 111}
|
|
260
|
+
|
|
261
|
+
def test_drops_by_op_empty_default(self):
|
|
262
|
+
"""drops_by_op returns empty dict when not set."""
|
|
263
|
+
result = CheckResult(ok=True, warnings=[], facts={}, suggestions=[], mode="debug")
|
|
264
|
+
assert result.drops_by_op == {}
|
|
265
|
+
|
|
266
|
+
def test_to_dict_includes_convenience_fields(self):
|
|
267
|
+
"""to_dict includes convenience property values."""
|
|
268
|
+
result = CheckResult(
|
|
269
|
+
ok=True,
|
|
270
|
+
warnings=[],
|
|
271
|
+
facts={"retention_rate": 0.85, "rows_dropped": 15, "total_steps": 3},
|
|
272
|
+
suggestions=[],
|
|
273
|
+
mode="debug",
|
|
274
|
+
_drops_by_op={"dropna": 15},
|
|
275
|
+
)
|
|
276
|
+
d = result.to_dict()
|
|
277
|
+
assert d["passed"] is True
|
|
278
|
+
assert d["retention"] == 0.85
|
|
279
|
+
assert d["n_dropped"] == 15
|
|
280
|
+
assert d["n_steps"] == 3
|
|
281
|
+
assert d["drops_by_op"] == {"dropna": 15}
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
class TestCheckResultIntegration:
|
|
285
|
+
"""Integration tests for CheckResult convenience properties."""
|
|
286
|
+
|
|
287
|
+
def test_check_populates_convenience_properties(self):
|
|
288
|
+
"""tp.check() populates convenience properties correctly."""
|
|
289
|
+
tp.enable(mode="debug")
|
|
290
|
+
df = pd.DataFrame({"a": [1, 2, None, 4, 5]})
|
|
291
|
+
df = df.dropna()
|
|
292
|
+
|
|
293
|
+
result = tp.check(df)
|
|
294
|
+
|
|
295
|
+
# Convenience properties should be accessible
|
|
296
|
+
assert isinstance(result.passed, bool)
|
|
297
|
+
assert result.retention is None or isinstance(result.retention, float)
|
|
298
|
+
assert isinstance(result.n_dropped, int)
|
|
299
|
+
assert isinstance(result.drops_by_op, dict)
|
|
300
|
+
assert isinstance(result.n_steps, int)
|
|
301
|
+
|
|
302
|
+
def test_check_drops_by_op_populated(self):
|
|
303
|
+
"""tp.check() correctly populates drops_by_op."""
|
|
304
|
+
tp.enable(mode="debug")
|
|
305
|
+
df = pd.DataFrame({"a": [1, 2, None, 4, 5], "b": [1, 1, 2, 2, 3]})
|
|
306
|
+
df = df.dropna()
|
|
307
|
+
df = df.drop_duplicates(subset=["b"])
|
|
308
|
+
|
|
309
|
+
result = tp.check(df)
|
|
310
|
+
|
|
311
|
+
# Drops should be tracked per operation, or at least counted in n_dropped
|
|
312
|
+
assert "DataFrame.dropna" in result.drops_by_op or result.n_dropped > 0
|
|
313
|
+
|
|
195
314
|
|
|
196
315
|
# =============================================================================
|
|
197
316
|
# TraceResult TESTS
|
|
@@ -410,7 +529,9 @@ class TestWhyResult:
|
|
|
410
529
|
assert result is not None
|
|
411
530
|
assert result.column == "amount"
|
|
412
531
|
assert result.current_value == 300.0 # 200 * 1.5
|
|
413
|
-
assert
|
|
532
|
+
assert (
|
|
533
|
+
result.n_changes == 1
|
|
534
|
+
), f"Single multiply should record exactly 1 change, got {result.n_changes}"
|
|
414
535
|
|
|
415
536
|
def test_why_with_where_multiple_criteria(self):
|
|
416
537
|
"""why() with where= using multiple column criteria."""
|