tracepipe 0.3.5__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. {tracepipe-0.3.5 → tracepipe-0.4.1}/CHANGELOG.md +70 -0
  2. {tracepipe-0.3.5 → tracepipe-0.4.1}/PKG-INFO +6 -9
  3. {tracepipe-0.3.5 → tracepipe-0.4.1}/README.md +5 -8
  4. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/api/core.md +15 -9
  5. tracepipe-0.4.1/docs/changelog.md +120 -0
  6. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/concepts.md +22 -0
  7. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/row-tracing.md +64 -0
  8. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/index.md +4 -2
  9. {tracepipe-0.3.5 → tracepipe-0.4.1}/mkdocs.yml +1 -1
  10. {tracepipe-0.3.5 → tracepipe-0.4.1}/pyproject.toml +1 -1
  11. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_api.py +6 -2
  12. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_convenience_debug.py +122 -1
  13. tracepipe-0.4.1/tests/test_integration_scenarios.py +361 -0
  14. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_lineage_through_merge.py +28 -26
  15. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_pandas_inst.py +20 -10
  16. tracepipe-0.4.1/tests/test_row_provenance.py +684 -0
  17. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/__init__.py +1 -1
  18. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/convenience.py +130 -7
  19. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/core.py +79 -0
  20. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/filter_capture.py +103 -1
  21. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/merge_capture.py +169 -23
  22. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/storage/lineage_store.py +92 -7
  23. {tracepipe-0.3.5 → tracepipe-0.4.1}/uv.lock +317 -248
  24. tracepipe-0.3.5/docs/changelog.md +0 -39
  25. tracepipe-0.3.5/examples/comprehensive_demo.py +0 -694
  26. tracepipe-0.3.5/examples/red_team_test.py +0 -437
  27. {tracepipe-0.3.5 → tracepipe-0.4.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  28. {tracepipe-0.3.5 → tracepipe-0.4.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  29. {tracepipe-0.3.5 → tracepipe-0.4.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  30. {tracepipe-0.3.5 → tracepipe-0.4.1}/.github/workflows/ci.yml +0 -0
  31. {tracepipe-0.3.5 → tracepipe-0.4.1}/.github/workflows/docs.yml +0 -0
  32. {tracepipe-0.3.5 → tracepipe-0.4.1}/.github/workflows/release.yml +0 -0
  33. {tracepipe-0.3.5 → tracepipe-0.4.1}/.gitignore +0 -0
  34. {tracepipe-0.3.5 → tracepipe-0.4.1}/.pre-commit-config.yaml +0 -0
  35. {tracepipe-0.3.5 → tracepipe-0.4.1}/CONTRIBUTING.md +0 -0
  36. {tracepipe-0.3.5 → tracepipe-0.4.1}/LICENSE +0 -0
  37. {tracepipe-0.3.5 → tracepipe-0.4.1}/benchmarks/README.md +0 -0
  38. {tracepipe-0.3.5 → tracepipe-0.4.1}/benchmarks/bench_memory.py +0 -0
  39. {tracepipe-0.3.5 → tracepipe-0.4.1}/benchmarks/bench_overhead.py +0 -0
  40. {tracepipe-0.3.5 → tracepipe-0.4.1}/benchmarks/bench_scale.py +0 -0
  41. {tracepipe-0.3.5 → tracepipe-0.4.1}/benchmarks/run_all.py +0 -0
  42. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/api/contracts.md +0 -0
  43. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/api/debug.md +0 -0
  44. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/api/index.md +0 -0
  45. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/contributing.md +0 -0
  46. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/examples/data-validation.md +0 -0
  47. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/examples/ml-pipeline.md +0 -0
  48. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/getting-started/installation.md +0 -0
  49. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/getting-started/modes.md +0 -0
  50. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/getting-started/quickstart.md +0 -0
  51. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/cell-provenance.md +0 -0
  52. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/contracts.md +0 -0
  53. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/health-checks.md +0 -0
  54. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/reports.md +0 -0
  55. {tracepipe-0.3.5 → tracepipe-0.4.1}/docs/guide/snapshots.md +0 -0
  56. {tracepipe-0.3.5 → tracepipe-0.4.1}/examples/demo.py +0 -0
  57. {tracepipe-0.3.5 → tracepipe-0.4.1}/examples/ml_pipeline_demo.py +0 -0
  58. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/404.html +0 -0
  59. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/api/contracts/index.html +0 -0
  60. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/api/core/index.html +0 -0
  61. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/api/debug/index.html +0 -0
  62. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/api/index.html +0 -0
  63. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/_mkdocstrings.css +0 -0
  64. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/images/favicon.png +0 -0
  65. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/bundle.79ae519e.min.js +0 -0
  66. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/bundle.79ae519e.min.js.map +0 -0
  67. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ar.min.js +0 -0
  68. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.da.min.js +0 -0
  69. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.de.min.js +0 -0
  70. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.du.min.js +0 -0
  71. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.el.min.js +0 -0
  72. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.es.min.js +0 -0
  73. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.fi.min.js +0 -0
  74. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.fr.min.js +0 -0
  75. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.he.min.js +0 -0
  76. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.hi.min.js +0 -0
  77. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.hu.min.js +0 -0
  78. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.hy.min.js +0 -0
  79. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.it.min.js +0 -0
  80. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ja.min.js +0 -0
  81. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.jp.min.js +0 -0
  82. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.kn.min.js +0 -0
  83. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ko.min.js +0 -0
  84. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.multi.min.js +0 -0
  85. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.nl.min.js +0 -0
  86. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.no.min.js +0 -0
  87. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.pt.min.js +0 -0
  88. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ro.min.js +0 -0
  89. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ru.min.js +0 -0
  90. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.sa.min.js +0 -0
  91. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.stemmer.support.min.js +0 -0
  92. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.sv.min.js +0 -0
  93. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ta.min.js +0 -0
  94. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.te.min.js +0 -0
  95. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.th.min.js +0 -0
  96. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.tr.min.js +0 -0
  97. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.vi.min.js +0 -0
  98. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.zh.min.js +0 -0
  99. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/tinyseg.js +0 -0
  100. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/lunr/wordcut.js +0 -0
  101. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/workers/search.2c215733.min.js +0 -0
  102. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/javascripts/workers/search.2c215733.min.js.map +0 -0
  103. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/stylesheets/main.484c7ddc.min.css +0 -0
  104. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/stylesheets/main.484c7ddc.min.css.map +0 -0
  105. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/stylesheets/palette.ab4e12ef.min.css +0 -0
  106. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/assets/stylesheets/palette.ab4e12ef.min.css.map +0 -0
  107. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/changelog/index.html +0 -0
  108. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/contributing/index.html +0 -0
  109. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/examples/data-validation/index.html +0 -0
  110. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/examples/ml-pipeline/index.html +0 -0
  111. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/getting-started/installation/index.html +0 -0
  112. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/getting-started/modes/index.html +0 -0
  113. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/getting-started/quickstart/index.html +0 -0
  114. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/cell-provenance/index.html +0 -0
  115. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/concepts/index.html +0 -0
  116. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/contracts/index.html +0 -0
  117. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/health-checks/index.html +0 -0
  118. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/reports/index.html +0 -0
  119. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/row-tracing/index.html +0 -0
  120. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/guide/snapshots/index.html +0 -0
  121. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/index.html +0 -0
  122. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/objects.inv +0 -0
  123. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/search/search_index.json +0 -0
  124. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/sitemap.xml +0 -0
  125. {tracepipe-0.3.5 → tracepipe-0.4.1}/site/sitemap.xml.gz +0 -0
  126. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/__init__.py +0 -0
  127. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/conftest.py +0 -0
  128. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_concurrency.py +0 -0
  129. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_contracts.py +0 -0
  130. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_edge_cases.py +0 -0
  131. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_integration.py +0 -0
  132. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_io_operations.py +0 -0
  133. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_public_api.py +0 -0
  134. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_snapshot.py +0 -0
  135. {tracepipe-0.3.5 → tracepipe-0.4.1}/tests/test_version_matrix.py +0 -0
  136. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/api.py +0 -0
  137. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/context.py +0 -0
  138. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/contracts.py +0 -0
  139. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/debug.py +0 -0
  140. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/__init__.py +0 -0
  141. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/apply_capture.py +0 -0
  142. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/indexer_capture.py +0 -0
  143. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/pandas_inst.py +0 -0
  144. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/instrumentation/series_capture.py +0 -0
  145. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/safety.py +0 -0
  146. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/snapshot.py +0 -0
  147. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/storage/__init__.py +0 -0
  148. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/storage/base.py +0 -0
  149. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/storage/row_identity.py +0 -0
  150. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/utils/__init__.py +0 -0
  151. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/utils/value_capture.py +0 -0
  152. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/value_provenance.py +0 -0
  153. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/visualization/__init__.py +0 -0
  154. {tracepipe-0.3.5 → tracepipe-0.4.1}/tracepipe/visualization/html_export.py +0 -0
@@ -5,6 +5,67 @@ All notable changes to TracePipe will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## 0.4.1 - 2026-02-04
9
+
10
+ ### Fixed
11
+ - Fully implemented `CheckResult` convenience properties (`.passed`, `.retention`, `.n_dropped`, `.n_steps`, `.drops_by_op`)
12
+ - Added comprehensive tests for `CheckResult` API to ensure properties work correctly
13
+ - Properties now properly access underlying `.facts` dictionary for all metrics
14
+
15
+ ### Changed
16
+ - Cleaned up example files and test scripts
17
+
18
+ ## 0.4.0 - 2026-02-04
19
+
20
+ ### Added
21
+ - **Full row provenance for `pd.concat(axis=0)`**: Row IDs are now preserved through concatenation
22
+ - Each result row maintains its original RID from the source DataFrame
23
+ - `ConcatMapping` tracks which source DataFrame each row came from
24
+ - Concat steps are now marked `FULL` completeness (previously `PARTIAL`)
25
+
26
+ - **Duplicate drop provenance in debug mode**: `drop_duplicates` now tracks which row "won"
27
+ - `DuplicateDropMapping` maps dropped rows to their kept representative
28
+ - Supports `keep='first'`, `keep='last'`, and `keep=False`
29
+ - Uses `hash_pandas_object` for fast, NaN-safe key comparison
30
+
31
+ - **Clean `TraceResult` API for provenance** (UX improvement):
32
+ - `trace.origin` — Unified origin info: `{"type": "concat", "source_df": 1}` or `{"type": "merge", "left_parent": 10, "right_parent": 20}`
33
+ - `trace.representative` — For dedup-dropped rows: `{"kept_rid": 42, "subset": ["key"], "keep": "first"}`
34
+ - No need to access internal `.store` methods — everything is in the `tp.trace()` result
35
+
36
+ - **Clean `CheckResult` API** (UX improvement):
37
+ - `result.passed` — Alias for `.ok` (common naming convention)
38
+ - `result.retention` — Row retention rate (0.0-1.0) from `.facts`
39
+ - `result.n_dropped` — Total rows dropped
40
+ - `result.n_steps` — Total pipeline steps recorded
41
+ - `result.drops_by_op` — Drops broken down by operation name
42
+ - All properties are now discoverable via autocomplete
43
+
44
+ - **New data structures in `core.py`**:
45
+ - `ConcatMapping`: Tracks row provenance through concat operations
46
+ - `DuplicateDropMapping`: Tracks dropped->kept relationships in drop_duplicates
47
+
48
+ - **Comprehensive test suite**: 38 new tests in `test_row_provenance.py` covering:
49
+ - Concat RID preservation, ignore_index, after sort, with empty DFs, chained concats
50
+ - Axis=1 same-RID propagation vs different-RID PARTIAL marking
51
+ - `drop_duplicates` with `keep='first'`/`'last'`/`False` mapping correctness
52
+ - NaN handling parity with pandas `duplicated()`
53
+ - Integration: concat→merge, filter→concat, dedup→fillna lineage
54
+ - TraceResult `.origin` and `.representative` property tests
55
+
56
+ ### Changed
57
+ - `wrap_concat_with_lineage` rewritten for full provenance tracking
58
+ - Captures source RIDs before operation
59
+ - Propagates RIDs (not new registration) for axis=0
60
+ - Stores positional + sorted arrays for both "explain row i" and O(log n) lookup
61
+ - Axis=1 propagates RIDs if all inputs match, otherwise PARTIAL
62
+
63
+ - `_capture_filter_with_mask` enhanced to store `DuplicateDropMapping` in debug mode
64
+
65
+ - `TraceResult` enhanced with `.origin` and `.representative` properties
66
+ - `.to_text()` now displays origin and representative info
67
+ - `.to_dict()` includes all provenance info
68
+
8
69
  ## 0.3.5 - 2026-02-03
9
70
 
10
71
  ### Fixed
@@ -17,6 +78,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
17
78
  - Known Limitations section in README documenting concat/dedup tracking gaps
18
79
  - Test for `DataFrame.fillna` single-event logging
19
80
 
81
+ ### Changed
82
+ - **Test suite hardened** with exact count assertions and multi-scenario tests:
83
+ - Changed 15+ assertions from `>= 1` to `== 1` for precise verification
84
+ - Added `test_integration_scenarios.py` with 16 new tests covering:
85
+ - Multi-pipeline session isolation
86
+ - Warning message content verification
87
+ - Reliability scenarios (fillna, replace, loc, merge)
88
+ - Cross-pipeline contamination prevention
89
+
20
90
  ## 0.3.4 - 2026-02-03
21
91
 
22
92
  ### Fixed
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tracepipe
3
- Version: 0.3.5
3
+ Version: 0.4.1
4
4
  Summary: Row-level data lineage tracking for pandas pipelines
5
5
  Project-URL: Homepage, https://github.com/tracepipe/tracepipe
6
6
  Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
@@ -278,7 +278,7 @@ tp.enable(mode="debug") # Full lineage
278
278
 
279
279
  ## Known Limitations
280
280
 
281
- TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merge provenance** reliably. However, some patterns are not yet fully supported:
281
+ TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
282
282
 
283
283
  | Pattern | Status | Notes |
284
284
  |---------|--------|-------|
@@ -286,13 +286,10 @@ TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merg
286
286
  | `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
287
287
  | `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
288
288
  | `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
289
- | `pd.concat([df1, df2])` | ⚠️ Partial | Row IDs preserved, but no "source DataFrame" tracking |
290
- | `df.drop_duplicates(keep='last')` | ⚠️ Partial | Which row was kept is not tracked |
291
- | Sort + dedup patterns | ⚠️ Partial | "Latest record wins" logic not traced |
292
-
293
- **Why?** TracePipe tracks value changes within rows, not row-selection operations. When `drop_duplicates` picks one row over another, that's a provenance decision (not a cell mutation) that isn't currently instrumented.
294
-
295
- **Planned for 0.4**: Full row-provenance tracking for concat, drop_duplicates, and sort operations.
289
+ | `pd.concat([df1, df2])` | ✅ Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
290
+ | `df.drop_duplicates()` | ✅ Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
291
+ | `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
292
+ | Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
296
293
 
297
294
  ---
298
295
 
@@ -209,7 +209,7 @@ tp.enable(mode="debug") # Full lineage
209
209
 
210
210
  ## Known Limitations
211
211
 
212
- TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merge provenance** reliably. However, some patterns are not yet fully supported:
212
+ TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
213
213
 
214
214
  | Pattern | Status | Notes |
215
215
  |---------|--------|-------|
@@ -217,13 +217,10 @@ TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merg
217
217
  | `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
218
218
  | `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
219
219
  | `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
220
- | `pd.concat([df1, df2])` | ⚠️ Partial | Row IDs preserved, but no "source DataFrame" tracking |
221
- | `df.drop_duplicates(keep='last')` | ⚠️ Partial | Which row was kept is not tracked |
222
- | Sort + dedup patterns | ⚠️ Partial | "Latest record wins" logic not traced |
223
-
224
- **Why?** TracePipe tracks value changes within rows, not row-selection operations. When `drop_duplicates` picks one row over another, that's a provenance decision (not a cell mutation) that isn't currently instrumented.
225
-
226
- **Planned for 0.4**: Full row-provenance tracking for concat, drop_duplicates, and sort operations.
220
+ | `pd.concat([df1, df2])` | ✅ Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
221
+ | `df.drop_duplicates()` | ✅ Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
222
+ | `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
223
+ | Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
227
224
 
228
225
  ---
229
226
 
@@ -86,6 +86,9 @@ Manually register DataFrames for tracking.
86
86
 
87
87
  Use this when DataFrames are created before `tp.enable()` is called.
88
88
 
89
+ !!! note "Lineage Break"
90
+ Calling `register()` assigns new row IDs, which breaks lineage from any prior transformations. Use it only for "entry point" DataFrames.
91
+
89
92
  **Parameters:**
90
93
 
91
94
  | Parameter | Type | Description |
@@ -161,14 +164,15 @@ Health check for a DataFrame's lineage.
161
164
 
162
165
  | Attribute | Type | Description |
163
166
  |-----------|------|-------------|
164
- | `.passed` | `bool` | True if healthy |
167
+ | `.ok` | `bool` | True if no FACT-level warnings |
168
+ | `.passed` | `bool` | Alias for `.ok` |
165
169
  | `.mode` | `str` | Current tracking mode |
166
- | `.retention` | `float` | Row retention rate (0-1) |
167
- | `.n_dropped` | `int` | Total dropped rows |
168
- | `.n_changes` | `int` | Total cell changes |
169
- | `.warnings` | `list[str]` | Any warnings |
170
- | `.drops_by_op` | `dict` | Drops by operation |
171
- | `.changes_by_op` | `dict` | Changes by operation |
170
+ | `.retention` | `float \| None` | Row retention rate (0.0-1.0) |
171
+ | `.n_dropped` | `int` | Total rows dropped |
172
+ | `.n_steps` | `int` | Total pipeline steps recorded |
173
+ | `.drops_by_op` | `dict[str, int]` | Drops by operation name |
174
+ | `.warnings` | `list[CheckWarning]` | Warning objects with details |
175
+ | `.facts` | `dict` | Raw measured facts (for power users) |
172
176
 
173
177
  **Example:**
174
178
 
@@ -209,9 +213,11 @@ Trace a row's journey through the pipeline.
209
213
  | Attribute | Type | Description |
210
214
  |-----------|------|-------------|
211
215
  | `.row_id` | `int` | Internal row ID |
212
- | `.status` | `str` | `"alive"` or `"dropped"` |
216
+ | `.is_alive` | `bool` | True if row exists in current DataFrame |
213
217
  | `.events` | `list` | All events for this row |
214
- | `.dropped_by` | `str` | Operation that dropped (if dropped) |
218
+ | `.dropped_at` | `dict` | Operation that dropped the row (if dropped) |
219
+ | `.origin` | `dict` | Where row came from: `{"type": "concat", "source_df": 1}` or `{"type": "merge", "left_parent": 10, "right_parent": 20}` |
220
+ | `.representative` | `dict` | If dropped by dedup: `{"kept_rid": 42, "subset": [...], "keep": "first"}` |
215
221
 
216
222
  **Example:**
217
223
 
@@ -0,0 +1,120 @@
1
+ # Changelog
2
+
3
+ All notable changes to TracePipe will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.4.1] - 2026-02-04
9
+
10
+ ### Fixed
11
+ - Fully implemented `CheckResult` convenience properties (`.passed`, `.retention`, `.n_dropped`, `.n_steps`, `.drops_by_op`)
12
+ - Added comprehensive tests for `CheckResult` API to ensure properties work correctly
13
+ - Properties now properly access underlying `.facts` dictionary for all metrics
14
+
15
+ ### Changed
16
+ - Cleaned up example files and test scripts
17
+
18
+ ## [0.4.0] - 2026-02-04
19
+
20
+ ### Added
21
+
22
+ - **Full row provenance for `pd.concat(axis=0)`**: Row IDs are now preserved through concatenation
23
+ - Each result row maintains its original RID from the source DataFrame
24
+ - `ConcatMapping` tracks which source DataFrame each row came from
25
+ - Concat steps are now marked `FULL` completeness
26
+
27
+ - **Duplicate drop provenance in debug mode**: `drop_duplicates` now tracks which row "won"
28
+ - `DuplicateDropMapping` maps dropped rows to their kept representative
29
+ - Supports `keep='first'`, `keep='last'`, and `keep=False`
30
+ - Uses `hash_pandas_object` for fast, NaN-safe key comparison
31
+
32
+ - **Clean `TraceResult` API for provenance**:
33
+ - `trace.origin` — Unified origin: `{"type": "concat", "source_df": 1}` or `{"type": "merge", ...}`
34
+ - `trace.representative` — For dedup drops: `{"kept_rid": 42, "subset": ["key"], "keep": "first"}`
35
+ - No need to access internal `.store` methods
36
+
37
+ - **Clean `CheckResult` API**:
38
+ - `result.passed` — Alias for `.ok`
39
+ - `result.retention` — Row retention rate (0.0-1.0)
40
+ - `result.n_dropped`, `result.n_steps`, `result.drops_by_op`
41
+ - All properties discoverable via autocomplete
42
+
43
+ - **Comprehensive test suite**: 38 new tests covering concat, dedup, and TraceResult API
44
+
45
+ ### Changed
46
+
47
+ - `wrap_concat_with_lineage` rewritten for full provenance tracking
48
+ - `axis=1` concat propagates RIDs if all inputs match, otherwise PARTIAL
49
+ - `TraceResult` enhanced with `.origin` and `.representative` properties
50
+
51
+ ## [0.3.5] - 2026-02-03
52
+
53
+ ### Fixed
54
+
55
+ - **DataFrame.fillna double-logging**: `df.fillna({"col": 0})` now logs exactly 1 event
56
+ - Added `wrap_pandas_transform_method` with `_in_transform_op` flag
57
+
58
+ ### Added
59
+
60
+ - Known Limitations section in README documenting concat/dedup tracking gaps
61
+
62
+ ### Changed
63
+
64
+ - Test suite hardened with exact count assertions and multi-scenario tests
65
+
66
+ ## [0.3.4] - 2026-02-03
67
+
68
+ ### Fixed
69
+
70
+ - **Event deduplication**: Identical events from parallel pipelines are now deduplicated
71
+
72
+ ## [0.3.3] - 2026-02-03
73
+
74
+ ### Fixed
75
+
76
+ - **Double-logging bug**: `df['col'] = df['col'].fillna()` now logs exactly one event
77
+ - **Merge warning scoping**: `tp.check(df)` now only shows warnings for merges in df's lineage
78
+
79
+ ## [0.3.2] - 2026-02-03
80
+
81
+ ### Fixed
82
+
83
+ - Merge duplicate key warnings now correctly identify which table (left/right) has duplicates
84
+
85
+ ## [0.3.1] - 2026-02-03
86
+
87
+ ### Fixed
88
+
89
+ - Cell history now correctly chains through merge operations via lineage traversal
90
+ - `tp.why()` and `tp.trace()` show pre-merge changes for post-merge rows
91
+ - `enable()` resets accumulated state when called multiple times
92
+
93
+ ### Added
94
+
95
+ - `get_row_history_with_lineage()` and `get_cell_history_with_lineage()` methods
96
+
97
+ ## [0.3.0] - 2026-02-03
98
+
99
+ ### Added
100
+
101
+ - MkDocs documentation site with Material theme
102
+ - Comprehensive API reference documentation
103
+ - Getting started guides and tutorials
104
+ - `tp.register()` API for manually registering DataFrames
105
+ - Configurable retention threshold in `tp.check()`
106
+ - Ghost row capture for fallback filter paths
107
+ - Data quality contracts with fluent API
108
+ - HTML report generation
109
+ - Snapshot and diff functionality
110
+ - Debug mode with cell-level tracking
111
+ - `tp.why()` for cell provenance
112
+ - `tp.trace()` for row journey
113
+ - Support for all major pandas operations
114
+
115
+ ### Fixed
116
+
117
+ - Recursion bug when accessing hidden column in COLUMN mode
118
+ - Config propagation issues
119
+ - Retention rate calculation for multi-table pipelines
120
+ - Export wrappers correctly strip hidden column
@@ -68,6 +68,28 @@ When rows are aggregated:
68
68
  grouped = df.groupby("category").sum() # GROUP event with membership
69
69
  ```
70
70
 
71
+ ### Concat Events (v0.4+)
72
+
73
+ When DataFrames are concatenated, row IDs are preserved:
74
+
75
+ ```python
76
+ df1 = pd.DataFrame({"a": [1, 2]}) # Rows get IDs: 0, 1
77
+ df2 = pd.DataFrame({"a": [3, 4]}) # Rows get IDs: 2, 3
78
+
79
+ result = pd.concat([df1, df2]) # IDs preserved: 0, 1, 2, 3
80
+ # TracePipe tracks which source DataFrame each row came from
81
+ ```
82
+
83
+ ### Duplicate Drop Events (v0.4+)
84
+
85
+ In debug mode, `drop_duplicates` tracks which row was kept as representative:
86
+
87
+ ```python
88
+ df = pd.DataFrame({"key": ["A", "A", "B"], "val": [1, 2, 3]})
89
+ df = df.drop_duplicates(subset=["key"], keep="first")
90
+ # Row with val=2 was dropped, mapped to representative (val=1)
91
+ ```
92
+
71
93
  ---
72
94
 
73
95
  ## The Lineage Store
@@ -109,6 +109,70 @@ if trace.merge_parents:
109
109
  print(f"Right parent: {trace.merge_parents.right}")
110
110
  ```
111
111
 
112
+ ---
113
+
114
+ ## Concat Origin Tracking (v0.4+)
115
+
116
+ When rows come from concatenated DataFrames, TracePipe tracks their source via `trace.origin`:
117
+
118
+ ```python
119
+ df1 = pd.DataFrame({"a": [1, 2]})
120
+ df2 = pd.DataFrame({"a": [3, 4]})
121
+ result = pd.concat([df1, df2])
122
+
123
+ # Trace a row that came from df2
124
+ trace = tp.trace(result, row=2)
125
+ print(trace.origin)
126
+ # {"type": "concat", "source_df": 1, "step_id": 5}
127
+ ```
128
+
129
+ The `.origin` property returns a unified dict with:
130
+
131
+ - `type`: `"concat"`, `"merge"`, or `None` (for original rows)
132
+ - `source_df`: Index in the concat list (0=first DataFrame, 1=second, etc.)
133
+ - `step_id`: Which pipeline step
134
+
135
+ Row IDs are preserved through `pd.concat(axis=0)`, so lineage chains correctly:
136
+
137
+ ```python
138
+ # Transform df1 before concat
139
+ df1["a"] = df1["a"].fillna(0)
140
+
141
+ result = pd.concat([df1, df2])
142
+
143
+ # Rows from df1 still have their fillna history
144
+ trace = tp.trace(result, row=0) # Shows fillna event from df1
145
+ ```
146
+
147
+ ---
148
+
149
+ ## Duplicate Representative Tracking (v0.4+)
150
+
151
+ When `drop_duplicates` removes rows, TracePipe tracks which row "won" via `trace.representative`:
152
+
153
+ ```python
154
+ df = pd.DataFrame({
155
+ "key": ["A", "A", "B"],
156
+ "value": [100, 200, 300]
157
+ })
158
+ df = df.drop_duplicates(subset=["key"], keep="first")
159
+
160
+ # Trace the dropped row (value=200)
161
+ trace = tp.trace(df, row=dropped_row_id)
162
+ print(trace.representative)
163
+ # {"kept_rid": 42, "subset": ["key"], "keep": "first"}
164
+ ```
165
+
166
+ The `.representative` property is only set for rows dropped by `drop_duplicates`:
167
+
168
+ | `keep` Strategy | `.representative` |
169
+ |-----------------|-------------------|
170
+ | `keep='first'` | `{"kept_rid": 42, ...}` — first occurrence kept |
171
+ | `keep='last'` | `{"kept_rid": 45, ...}` — last occurrence kept |
172
+ | `keep=False` | `{"kept_rid": None, ...}` — all duplicates removed |
173
+
174
+ This answers "why did this row disappear?" — it wasn't deleted, it was deduplicated.
175
+
112
176
  ## Performance Considerations
113
177
 
114
178
  - Row tracing in CI mode is limited (no individual row IDs)
@@ -159,12 +159,14 @@ print(tp.why(df, "price", 0)) # Why price changed
159
159
 
160
160
  | Operation | Tracking | Completeness |
161
161
  |-----------|----------|--------------|
162
- | `dropna`, `drop_duplicates` | Dropped row IDs | Full |
163
- | `query`, `df[mask]` | Dropped row IDs | Full |
162
+ | `dropna`, `query`, `df[mask]` | Dropped row IDs | Full |
163
+ | `drop_duplicates` | Dropped→kept mapping (debug mode) | Full |
164
164
  | `head`, `tail`, `sample` | Dropped row IDs | Full |
165
165
  | `fillna`, `replace` | Cell diffs (watched cols) | Full |
166
166
  | `loc[]=`, `iloc[]=`, `at[]=` | Cell diffs | Full |
167
167
  | `merge`, `join` | Parent tracking | Full |
168
+ | `pd.concat(axis=0)` | Row IDs + source DataFrame | Full |
169
+ | `pd.concat(axis=1)` | Row IDs (if aligned) | Partial |
168
170
  | `groupby().agg()` | Group membership | Full |
169
171
  | `apply`, `pipe` | Output tracked | Partial |
170
172
 
@@ -1,6 +1,6 @@
1
1
  site_name: TracePipe
2
2
  site_description: Row-level data lineage tracking for pandas pipelines
3
- site_url: https://tracepipe.github.io/tracepipe/
3
+ site_url: https://gauthierpiarrette.github.io/tracepipe/
4
4
  repo_url: https://github.com/gauthierpiarrette/tracepipe
5
5
  repo_name: gauthierpiarrette/tracepipe
6
6
  edit_uri: edit/main/docs/
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "tracepipe"
7
- version = "0.3.5"
7
+ version = "0.4.1"
8
8
  description = "Row-level data lineage tracking for pandas pipelines"
9
9
  readme = "README.md"
10
10
  license = {file = "LICENSE"}
@@ -304,7 +304,9 @@ class TestRowLineageResult:
304
304
 
305
305
  row = dbg().explain_row(0)
306
306
  history = row.cell_history("a")
307
- assert len(history) >= 1
307
+ assert (
308
+ len(history) == 1
309
+ ), f"Single fillna should record exactly 1 change, got {len(history)}"
308
310
 
309
311
  def test_history(self):
310
312
  """history() returns full history."""
@@ -487,7 +489,9 @@ class TestPreEnableDataFrameTracking:
487
489
  df["a"] = df["a"] * 10
488
490
 
489
491
  result = tracepipe.why(df, col="a", row=0)
490
- assert len(result.history) >= 1
492
+ assert (
493
+ len(result.history) == 1
494
+ ), f"Single multiply should record exactly 1 change, got {len(result.history)}"
491
495
 
492
496
  def test_trace_after_register(self):
493
497
  """Row tracing works for registered DataFrames."""
@@ -192,6 +192,125 @@ class TestCheckResult:
192
192
  assert d["facts"]["key"] == "value"
193
193
  assert len(d["warnings"]) == 1
194
194
 
195
+ # === CONVENIENCE PROPERTY TESTS (v0.4+) ===
196
+
197
+ def test_passed_property_alias_for_ok(self):
198
+ """passed is an alias for ok."""
199
+ result_ok = CheckResult(ok=True, warnings=[], facts={}, suggestions=[], mode="debug")
200
+ result_fail = CheckResult(ok=False, warnings=[], facts={}, suggestions=[], mode="debug")
201
+ assert result_ok.passed is True
202
+ assert result_ok.passed == result_ok.ok
203
+ assert result_fail.passed is False
204
+ assert result_fail.passed == result_fail.ok
205
+
206
+ def test_retention_property(self):
207
+ """retention returns retention_rate from facts."""
208
+ result = CheckResult(
209
+ ok=True,
210
+ warnings=[],
211
+ facts={"retention_rate": 0.847},
212
+ suggestions=[],
213
+ mode="debug",
214
+ )
215
+ assert result.retention == 0.847
216
+
217
+ def test_retention_property_none_when_missing(self):
218
+ """retention returns None when retention_rate not in facts."""
219
+ result = CheckResult(ok=True, warnings=[], facts={}, suggestions=[], mode="debug")
220
+ assert result.retention is None
221
+
222
+ def test_n_dropped_property(self):
223
+ """n_dropped returns rows_dropped from facts."""
224
+ result = CheckResult(
225
+ ok=True,
226
+ warnings=[],
227
+ facts={"rows_dropped": 153},
228
+ suggestions=[],
229
+ mode="debug",
230
+ )
231
+ assert result.n_dropped == 153
232
+
233
+ def test_n_dropped_property_zero_default(self):
234
+ """n_dropped returns 0 when rows_dropped not in facts."""
235
+ result = CheckResult(ok=True, warnings=[], facts={}, suggestions=[], mode="debug")
236
+ assert result.n_dropped == 0
237
+
238
+ def test_n_steps_property(self):
239
+ """n_steps returns total_steps from facts."""
240
+ result = CheckResult(
241
+ ok=True,
242
+ warnings=[],
243
+ facts={"total_steps": 5},
244
+ suggestions=[],
245
+ mode="debug",
246
+ )
247
+ assert result.n_steps == 5
248
+
249
+ def test_drops_by_op_property(self):
250
+ """drops_by_op returns the _drops_by_op dict."""
251
+ result = CheckResult(
252
+ ok=True,
253
+ warnings=[],
254
+ facts={},
255
+ suggestions=[],
256
+ mode="debug",
257
+ _drops_by_op={"dropna": 42, "filter": 111},
258
+ )
259
+ assert result.drops_by_op == {"dropna": 42, "filter": 111}
260
+
261
+ def test_drops_by_op_empty_default(self):
262
+ """drops_by_op returns empty dict when not set."""
263
+ result = CheckResult(ok=True, warnings=[], facts={}, suggestions=[], mode="debug")
264
+ assert result.drops_by_op == {}
265
+
266
+ def test_to_dict_includes_convenience_fields(self):
267
+ """to_dict includes convenience property values."""
268
+ result = CheckResult(
269
+ ok=True,
270
+ warnings=[],
271
+ facts={"retention_rate": 0.85, "rows_dropped": 15, "total_steps": 3},
272
+ suggestions=[],
273
+ mode="debug",
274
+ _drops_by_op={"dropna": 15},
275
+ )
276
+ d = result.to_dict()
277
+ assert d["passed"] is True
278
+ assert d["retention"] == 0.85
279
+ assert d["n_dropped"] == 15
280
+ assert d["n_steps"] == 3
281
+ assert d["drops_by_op"] == {"dropna": 15}
282
+
283
+
284
+ class TestCheckResultIntegration:
285
+ """Integration tests for CheckResult convenience properties."""
286
+
287
+ def test_check_populates_convenience_properties(self):
288
+ """tp.check() populates convenience properties correctly."""
289
+ tp.enable(mode="debug")
290
+ df = pd.DataFrame({"a": [1, 2, None, 4, 5]})
291
+ df = df.dropna()
292
+
293
+ result = tp.check(df)
294
+
295
+ # Convenience properties should be accessible
296
+ assert isinstance(result.passed, bool)
297
+ assert result.retention is None or isinstance(result.retention, float)
298
+ assert isinstance(result.n_dropped, int)
299
+ assert isinstance(result.drops_by_op, dict)
300
+ assert isinstance(result.n_steps, int)
301
+
302
+ def test_check_drops_by_op_populated(self):
303
+ """tp.check() correctly populates drops_by_op."""
304
+ tp.enable(mode="debug")
305
+ df = pd.DataFrame({"a": [1, 2, None, 4, 5], "b": [1, 1, 2, 2, 3]})
306
+ df = df.dropna()
307
+ df = df.drop_duplicates(subset=["b"])
308
+
309
+ result = tp.check(df)
310
+
311
+ # Should have drops tracked by operation
312
+ assert "DataFrame.dropna" in result.drops_by_op or result.n_dropped > 0
313
+
195
314
 
196
315
  # =============================================================================
197
316
  # TraceResult TESTS
@@ -410,7 +529,9 @@ class TestWhyResult:
410
529
  assert result is not None
411
530
  assert result.column == "amount"
412
531
  assert result.current_value == 300.0 # 200 * 1.5
413
- assert result.n_changes >= 1
532
+ assert (
533
+ result.n_changes == 1
534
+ ), f"Single multiply should record exactly 1 change, got {result.n_changes}"
414
535
 
415
536
  def test_why_with_where_multiple_criteria(self):
416
537
  """why() with where= using multiple column criteria."""