tracepipe 0.3.4__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. tracepipe-0.4.1/CHANGELOG.md +162 -0
  2. {tracepipe-0.3.4 → tracepipe-0.4.1}/PKG-INFO +18 -1
  3. {tracepipe-0.3.4 → tracepipe-0.4.1}/README.md +17 -0
  4. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/api/core.md +15 -9
  5. tracepipe-0.4.1/docs/changelog.md +120 -0
  6. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/concepts.md +22 -0
  7. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/row-tracing.md +64 -0
  8. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/index.md +4 -2
  9. {tracepipe-0.3.4 → tracepipe-0.4.1}/mkdocs.yml +1 -1
  10. {tracepipe-0.3.4 → tracepipe-0.4.1}/pyproject.toml +1 -1
  11. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_api.py +6 -2
  12. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_convenience_debug.py +122 -1
  13. tracepipe-0.4.1/tests/test_integration_scenarios.py +361 -0
  14. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_lineage_through_merge.py +45 -26
  15. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_pandas_inst.py +20 -10
  16. tracepipe-0.4.1/tests/test_row_provenance.py +684 -0
  17. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/__init__.py +1 -1
  18. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/context.py +4 -0
  19. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/convenience.py +130 -7
  20. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/core.py +79 -0
  21. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/filter_capture.py +103 -1
  22. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/merge_capture.py +169 -23
  23. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/pandas_inst.py +11 -3
  24. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/safety.py +29 -4
  25. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/storage/lineage_store.py +92 -7
  26. {tracepipe-0.3.4 → tracepipe-0.4.1}/uv.lock +317 -248
  27. tracepipe-0.3.4/CHANGELOG.md +0 -80
  28. tracepipe-0.3.4/docs/changelog.md +0 -39
  29. tracepipe-0.3.4/examples/comprehensive_demo.py +0 -694
  30. tracepipe-0.3.4/examples/red_team_test.py +0 -437
  31. {tracepipe-0.3.4 → tracepipe-0.4.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  32. {tracepipe-0.3.4 → tracepipe-0.4.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  33. {tracepipe-0.3.4 → tracepipe-0.4.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  34. {tracepipe-0.3.4 → tracepipe-0.4.1}/.github/workflows/ci.yml +0 -0
  35. {tracepipe-0.3.4 → tracepipe-0.4.1}/.github/workflows/docs.yml +0 -0
  36. {tracepipe-0.3.4 → tracepipe-0.4.1}/.github/workflows/release.yml +0 -0
  37. {tracepipe-0.3.4 → tracepipe-0.4.1}/.gitignore +0 -0
  38. {tracepipe-0.3.4 → tracepipe-0.4.1}/.pre-commit-config.yaml +0 -0
  39. {tracepipe-0.3.4 → tracepipe-0.4.1}/CONTRIBUTING.md +0 -0
  40. {tracepipe-0.3.4 → tracepipe-0.4.1}/LICENSE +0 -0
  41. {tracepipe-0.3.4 → tracepipe-0.4.1}/benchmarks/README.md +0 -0
  42. {tracepipe-0.3.4 → tracepipe-0.4.1}/benchmarks/bench_memory.py +0 -0
  43. {tracepipe-0.3.4 → tracepipe-0.4.1}/benchmarks/bench_overhead.py +0 -0
  44. {tracepipe-0.3.4 → tracepipe-0.4.1}/benchmarks/bench_scale.py +0 -0
  45. {tracepipe-0.3.4 → tracepipe-0.4.1}/benchmarks/run_all.py +0 -0
  46. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/api/contracts.md +0 -0
  47. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/api/debug.md +0 -0
  48. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/api/index.md +0 -0
  49. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/contributing.md +0 -0
  50. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/examples/data-validation.md +0 -0
  51. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/examples/ml-pipeline.md +0 -0
  52. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/getting-started/installation.md +0 -0
  53. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/getting-started/modes.md +0 -0
  54. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/getting-started/quickstart.md +0 -0
  55. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/cell-provenance.md +0 -0
  56. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/contracts.md +0 -0
  57. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/health-checks.md +0 -0
  58. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/reports.md +0 -0
  59. {tracepipe-0.3.4 → tracepipe-0.4.1}/docs/guide/snapshots.md +0 -0
  60. {tracepipe-0.3.4 → tracepipe-0.4.1}/examples/demo.py +0 -0
  61. {tracepipe-0.3.4 → tracepipe-0.4.1}/examples/ml_pipeline_demo.py +0 -0
  62. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/404.html +0 -0
  63. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/api/contracts/index.html +0 -0
  64. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/api/core/index.html +0 -0
  65. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/api/debug/index.html +0 -0
  66. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/api/index.html +0 -0
  67. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/_mkdocstrings.css +0 -0
  68. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/images/favicon.png +0 -0
  69. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/bundle.79ae519e.min.js +0 -0
  70. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/bundle.79ae519e.min.js.map +0 -0
  71. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ar.min.js +0 -0
  72. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.da.min.js +0 -0
  73. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.de.min.js +0 -0
  74. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.du.min.js +0 -0
  75. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.el.min.js +0 -0
  76. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.es.min.js +0 -0
  77. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.fi.min.js +0 -0
  78. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.fr.min.js +0 -0
  79. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.he.min.js +0 -0
  80. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.hi.min.js +0 -0
  81. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.hu.min.js +0 -0
  82. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.hy.min.js +0 -0
  83. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.it.min.js +0 -0
  84. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ja.min.js +0 -0
  85. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.jp.min.js +0 -0
  86. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.kn.min.js +0 -0
  87. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ko.min.js +0 -0
  88. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.multi.min.js +0 -0
  89. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.nl.min.js +0 -0
  90. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.no.min.js +0 -0
  91. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.pt.min.js +0 -0
  92. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ro.min.js +0 -0
  93. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ru.min.js +0 -0
  94. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.sa.min.js +0 -0
  95. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.stemmer.support.min.js +0 -0
  96. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.sv.min.js +0 -0
  97. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.ta.min.js +0 -0
  98. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.te.min.js +0 -0
  99. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.th.min.js +0 -0
  100. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.tr.min.js +0 -0
  101. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.vi.min.js +0 -0
  102. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/min/lunr.zh.min.js +0 -0
  103. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/tinyseg.js +0 -0
  104. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/lunr/wordcut.js +0 -0
  105. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/workers/search.2c215733.min.js +0 -0
  106. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/javascripts/workers/search.2c215733.min.js.map +0 -0
  107. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/stylesheets/main.484c7ddc.min.css +0 -0
  108. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/stylesheets/main.484c7ddc.min.css.map +0 -0
  109. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/stylesheets/palette.ab4e12ef.min.css +0 -0
  110. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/assets/stylesheets/palette.ab4e12ef.min.css.map +0 -0
  111. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/changelog/index.html +0 -0
  112. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/contributing/index.html +0 -0
  113. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/examples/data-validation/index.html +0 -0
  114. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/examples/ml-pipeline/index.html +0 -0
  115. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/getting-started/installation/index.html +0 -0
  116. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/getting-started/modes/index.html +0 -0
  117. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/getting-started/quickstart/index.html +0 -0
  118. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/cell-provenance/index.html +0 -0
  119. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/concepts/index.html +0 -0
  120. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/contracts/index.html +0 -0
  121. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/health-checks/index.html +0 -0
  122. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/reports/index.html +0 -0
  123. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/row-tracing/index.html +0 -0
  124. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/guide/snapshots/index.html +0 -0
  125. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/index.html +0 -0
  126. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/objects.inv +0 -0
  127. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/search/search_index.json +0 -0
  128. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/sitemap.xml +0 -0
  129. {tracepipe-0.3.4 → tracepipe-0.4.1}/site/sitemap.xml.gz +0 -0
  130. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/__init__.py +0 -0
  131. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/conftest.py +0 -0
  132. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_concurrency.py +0 -0
  133. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_contracts.py +0 -0
  134. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_edge_cases.py +0 -0
  135. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_integration.py +0 -0
  136. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_io_operations.py +0 -0
  137. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_public_api.py +0 -0
  138. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_snapshot.py +0 -0
  139. {tracepipe-0.3.4 → tracepipe-0.4.1}/tests/test_version_matrix.py +0 -0
  140. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/api.py +0 -0
  141. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/contracts.py +0 -0
  142. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/debug.py +0 -0
  143. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/__init__.py +0 -0
  144. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/apply_capture.py +0 -0
  145. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/indexer_capture.py +0 -0
  146. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/instrumentation/series_capture.py +0 -0
  147. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/snapshot.py +0 -0
  148. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/storage/__init__.py +0 -0
  149. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/storage/base.py +0 -0
  150. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/storage/row_identity.py +0 -0
  151. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/utils/__init__.py +0 -0
  152. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/utils/value_capture.py +0 -0
  153. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/value_provenance.py +0 -0
  154. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/visualization/__init__.py +0 -0
  155. {tracepipe-0.3.4 → tracepipe-0.4.1}/tracepipe/visualization/html_export.py +0 -0
@@ -0,0 +1,162 @@
1
+ # Changelog
2
+
3
+ All notable changes to TracePipe will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## 0.4.1 - 2026-02-04
9
+
10
+ ### Fixed
11
+ - Fully implemented `CheckResult` convenience properties (`.passed`, `.retention`, `.n_dropped`, `.n_steps`, `.drops_by_op`)
12
+ - Added comprehensive tests for `CheckResult` API to ensure properties work correctly
13
+ - Properties now properly access underlying `.facts` dictionary for all metrics
14
+
15
+ ### Changed
16
+ - Cleaned up example files and test scripts
17
+
18
+ ## 0.4.0 - 2026-02-04
19
+
20
+ ### Added
21
+ - **Full row provenance for `pd.concat(axis=0)`**: Row IDs are now preserved through concatenation
22
+ - Each result row maintains its original RID from the source DataFrame
23
+ - `ConcatMapping` tracks which source DataFrame each row came from
24
+ - Concat steps are now marked with `FULL` completeness (previously `PARTIAL`)
25
+
26
+ - **Duplicate drop provenance in debug mode**: `drop_duplicates` now tracks which row "won"
27
+ - `DuplicateDropMapping` maps dropped rows to their kept representative
28
+ - Supports `keep='first'`, `keep='last'`, and `keep=False`
29
+ - Uses `hash_pandas_object` for fast, NaN-safe key comparison
30
+
31
+ - **Clean `TraceResult` API for provenance** (UX improvement):
32
+ - `trace.origin` — Unified origin info: `{"type": "concat", "source_df": 1}` or `{"type": "merge", "left_parent": 10, "right_parent": 20}`
33
+ - `trace.representative` — For dedup-dropped rows: `{"kept_rid": 42, "subset": ["key"], "keep": "first"}`
34
+ - No need to access internal `.store` methods — everything is in `tp.trace()` result
35
+
36
+ - **Clean `CheckResult` API** (UX improvement):
37
+ - `result.passed` — Alias for `.ok` (common naming convention)
38
+ - `result.retention` — Row retention rate (0.0-1.0) from `.facts`
39
+ - `result.n_dropped` — Total rows dropped
40
+ - `result.n_steps` — Total pipeline steps recorded
41
+ - `result.drops_by_op` — Drops broken down by operation name
42
+ - All properties are now discoverable via autocomplete
43
+
44
+ - **New data structures in `core.py`**:
45
+ - `ConcatMapping`: Tracks row provenance through concat operations
46
+ - `DuplicateDropMapping`: Tracks dropped->kept relationships in drop_duplicates
47
+
48
+ - **Comprehensive test suite**: 38 new tests in `test_row_provenance.py` covering:
49
+ - Concat RID preservation, ignore_index, after sort, with empty DFs, chained concats
50
+ - Axis=1 same-RID propagation vs different-RID PARTIAL marking
51
+ - Drop_duplicates keep='first'/'last'/False mapping correctness
52
+ - NaN handling parity with pandas `duplicated()`
53
+ - Integration: concat→merge, filter→concat, dedup→fillna lineage
54
+ - TraceResult `.origin` and `.representative` property tests
55
+
56
+ ### Changed
57
+ - `wrap_concat_with_lineage` rewritten for full provenance tracking
58
+ - Captures source RIDs before operation
59
+ - Propagates RIDs (not new registration) for axis=0
60
+ - Stores positional + sorted arrays for both "explain row i" and O(log n) lookup
61
+ - Axis=1 propagates RIDs if all inputs match, otherwise PARTIAL
62
+
63
+ - `_capture_filter_with_mask` enhanced to store `DuplicateDropMapping` in debug mode
64
+
65
+ - `TraceResult` enhanced with `.origin` and `.representative` properties
66
+ - `.to_text()` now displays origin and representative info
67
+ - `.to_dict()` includes all provenance info
68
+
69
+ ## 0.3.5 - 2026-02-03
70
+
71
+ ### Fixed
72
+ - **DataFrame.fillna double-logging**: `df.fillna({"col": 0})` now logs exactly 1 event
73
+ - Previously logged both `DataFrame.fillna` and internal `__setitem__` for same change
74
+ - Added `wrap_pandas_transform_method` with `_in_transform_op` flag to suppress nested setitem
75
+ - Works for both `fillna` and `replace` operations, including `inplace=True`
76
+
77
+ ### Added
78
+ - Known Limitations section in README documenting concat/dedup tracking gaps
79
+ - Test for `DataFrame.fillna` single-event logging
80
+
81
+ ### Changed
82
+ - **Test suite hardened** with exact count assertions and multi-scenario tests:
83
+ - Changed 15+ assertions from `>= 1` to `== 1` for precise verification
84
+ - Added `test_integration_scenarios.py` with 16 new tests covering:
85
+ - Multi-pipeline session isolation
86
+ - Warning message content verification
87
+ - Reliability scenarios (fillna, replace, loc, merge)
88
+ - Cross-pipeline contamination prevention
89
+
90
+ ## 0.3.4 - 2026-02-03
91
+
92
+ ### Fixed
93
+ - **Event deduplication**: Identical events from parallel pipelines are now deduplicated
94
+ - When multiple DataFrames share row IDs (e.g., from `df.copy()`), same changes are recorded once
95
+ - Events deduplicated by `(col, old_val, new_val, operation)` signature
96
+ - Prevents "4 events" when only 1 logical change occurred
97
+
98
+ ### Added
99
+ - `_stable_repr()` helper for robust value comparison in deduplication
100
+ - Tests for cross-pipeline event deduplication behavior
101
+
102
+ ## 0.3.3 - 2026-02-03
103
+
104
+ ### Fixed
105
+ - **Double-logging bug**: `df['col'] = df['col'].fillna()` now logs exactly one event, not two
106
+ - Fixed duplicate capture from both `_wrap_setitem` and `wrap_series_assignment`
107
+ - **Merge warning scoping**: `tp.check(df)` now only shows warnings for merges in df's lineage
108
+ - Previously showed warnings from ALL merges in the session (cross-contamination)
109
+ - Now filters by tracking which merge steps produced the queried DataFrame's rows
110
+
111
+ ### Added
112
+ - `_get_merge_stats_for_df()` helper to scope merge warnings to df's lineage
113
+ - Tests for double-logging prevention and merge warning scoping
114
+
115
+ ## 0.3.2 - 2026-02-03
116
+
117
+ ### Fixed
118
+ - Merge duplicate key warnings now correctly identify which table (left/right) has duplicates
119
+ - Previously `right_dup_rate` was mislabeled as "Right table" when it actually indicates LEFT table duplicates
120
+
121
+ ## 0.3.1 - 2026-02-03
122
+
123
+ ### Fixed
124
+ - Cell history now correctly chains through merge operations via lineage traversal
125
+ - `tp.why()` and `tp.trace()` show pre-merge changes for post-merge rows
126
+ - `enable()` resets accumulated state when called multiple times (fixes duplicate warnings in notebooks/IDEs)
127
+
128
+ ### Added
129
+ - `get_row_history_with_lineage()` and `get_cell_history_with_lineage()` methods for lineage-aware queries
130
+ - `follow_lineage` parameter in `explain_value()` to opt out of lineage traversal
131
+ - Integration tests for cell provenance through merge operations
132
+
133
+ ## 0.3.0 - 2026-02-03
134
+
135
+ ### Added
136
+ - MkDocs documentation site with Material theme
137
+ - Comprehensive API reference documentation
138
+ - Getting started guides and tutorials
139
+ - `tp.register()` API for manually registering DataFrames created before `enable()`
140
+ - Configurable retention threshold in `tp.check()`
141
+ - Ghost row capture for fallback filter paths
142
+ - Comprehensive test coverage for COLUMN identity mode
143
+ - Data quality contracts with fluent API (`tp.contract().expect_*()`)
144
+ - HTML report generation with `tp.report()`
145
+ - Snapshot and diff functionality
146
+ - Debug mode with cell-level tracking
147
+ - `tp.why()` for cell provenance
148
+ - `tp.trace()` for row journey
149
+ - Watched columns for selective tracking
150
+ - Ghost values capture
151
+ - Basic row-level lineage tracking
152
+ - Support for filter operations (dropna, query, boolean indexing)
153
+ - Support for transform operations (fillna, replace, setitem)
154
+ - Support for merge and join operations
155
+ - CI and Debug modes
156
+
157
+ ### Fixed
158
+ - Recursion bug when accessing hidden `__tracepipe_row_id__` column in COLUMN mode
159
+ - Config propagation to `row_manager` and `store` components in `enable()`
160
+ - Retention rate calculation for multi-table pipelines with merges
161
+ - Export wrappers (`to_csv`, `to_parquet`) now correctly strip hidden column
162
+ - `_filter_op_depth` cleanup in error scenarios
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tracepipe
3
- Version: 0.3.4
3
+ Version: 0.4.1
4
4
  Summary: Row-level data lineage tracking for pandas pipelines
5
5
  Project-URL: Homepage, https://github.com/tracepipe/tracepipe
6
6
  Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
@@ -276,6 +276,23 @@ tp.enable(mode="debug") # Full lineage
276
276
 
277
277
  ---
278
278
 
279
+ ## Known Limitations
280
+
281
+ TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
282
+
283
+ | Pattern | Status | Notes |
284
+ |---------|--------|-------|
285
+ | `df["col"] = df["col"].fillna(0)` | ✅ Tracked | Series + assignment |
286
+ | `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
287
+ | `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
288
+ | `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
289
+ | `pd.concat([df1, df2])` | ✅ Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
290
+ | `df.drop_duplicates()` | ✅ Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
291
+ | `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
292
+ | Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
293
+
294
+ ---
295
+
279
296
  ## Contributing
280
297
 
281
298
  ```bash
@@ -207,6 +207,23 @@ tp.enable(mode="debug") # Full lineage
207
207
 
208
208
  ---
209
209
 
210
+ ## Known Limitations
211
+
212
+ TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
213
+
214
+ | Pattern | Status | Notes |
215
+ |---------|--------|-------|
216
+ | `df["col"] = df["col"].fillna(0)` | ✅ Tracked | Series + assignment |
217
+ | `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
218
+ | `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
219
+ | `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
220
+ | `pd.concat([df1, df2])` | ✅ Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
221
+ | `df.drop_duplicates()` | ✅ Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
222
+ | `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
223
+ | Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
224
+
225
+ ---
226
+
210
227
  ## Contributing
211
228
 
212
229
  ```bash
@@ -86,6 +86,9 @@ Manually register DataFrames for tracking.
86
86
 
87
87
  Use this when DataFrames are created before `tp.enable()` is called.
88
88
 
89
+ !!! note "Lineage Break"
90
+ Calling `register()` assigns new row IDs, which breaks lineage from any prior transformations. Use it only for "entry point" DataFrames.
91
+
89
92
  **Parameters:**
90
93
 
91
94
  | Parameter | Type | Description |
@@ -161,14 +164,15 @@ Health check for a DataFrame's lineage.
161
164
 
162
165
  | Attribute | Type | Description |
163
166
  |-----------|------|-------------|
164
- | `.passed` | `bool` | True if healthy |
167
+ | `.ok` | `bool` | True if no FACT-level warnings |
168
+ | `.passed` | `bool` | Alias for `.ok` |
165
169
  | `.mode` | `str` | Current tracking mode |
166
- | `.retention` | `float` | Row retention rate (0-1) |
167
- | `.n_dropped` | `int` | Total dropped rows |
168
- | `.n_changes` | `int` | Total cell changes |
169
- | `.warnings` | `list[str]` | Any warnings |
170
- | `.drops_by_op` | `dict` | Drops by operation |
171
- | `.changes_by_op` | `dict` | Changes by operation |
170
+ | `.retention` | `float \| None` | Row retention rate (0.0-1.0) |
171
+ | `.n_dropped` | `int` | Total rows dropped |
172
+ | `.n_steps` | `int` | Total pipeline steps recorded |
173
+ | `.drops_by_op` | `dict[str, int]` | Drops by operation name |
174
+ | `.warnings` | `list[CheckWarning]` | Warning objects with details |
175
+ | `.facts` | `dict` | Raw measured facts (for power users) |
172
176
 
173
177
  **Example:**
174
178
 
@@ -209,9 +213,11 @@ Trace a row's journey through the pipeline.
209
213
  | Attribute | Type | Description |
210
214
  |-----------|------|-------------|
211
215
  | `.row_id` | `int` | Internal row ID |
212
- | `.status` | `str` | `"alive"` or `"dropped"` |
216
+ | `.is_alive` | `bool` | True if row exists in current DataFrame |
213
217
  | `.events` | `list` | All events for this row |
214
- | `.dropped_by` | `str` | Operation that dropped (if dropped) |
218
+ | `.dropped_at` | `dict` | Operation that dropped the row (if dropped) |
219
+ | `.origin` | `dict` | Where row came from: `{"type": "concat", "source_df": 1}` or `{"type": "merge", "left_parent": 10, "right_parent": 20}` |
220
+ | `.representative` | `dict` | If dropped by dedup: `{"kept_rid": 42, "subset": [...], "keep": "first"}` |
215
221
 
216
222
  **Example:**
217
223
 
@@ -0,0 +1,120 @@
1
+ # Changelog
2
+
3
+ All notable changes to TracePipe will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.4.1] - 2026-02-04
9
+
10
+ ### Fixed
11
+ - Fully implemented `CheckResult` convenience properties (`.passed`, `.retention`, `.n_dropped`, `.n_steps`, `.drops_by_op`)
12
+ - Added comprehensive tests for `CheckResult` API to ensure properties work correctly
13
+ - Properties now properly access underlying `.facts` dictionary for all metrics
14
+
15
+ ### Changed
16
+ - Cleaned up example files and test scripts
17
+
18
+ ## [0.4.0] - 2026-02-04
19
+
20
+ ### Added
21
+
22
+ - **Full row provenance for `pd.concat(axis=0)`**: Row IDs are now preserved through concatenation
23
+ - Each result row maintains its original RID from the source DataFrame
24
+ - `ConcatMapping` tracks which source DataFrame each row came from
25
+ - Concat steps are now marked `FULL` completeness
26
+
27
+ - **Duplicate drop provenance in debug mode**: `drop_duplicates` now tracks which row "won"
28
+ - `DuplicateDropMapping` maps dropped rows to their kept representative
29
+ - Supports `keep='first'`, `keep='last'`, and `keep=False`
30
+ - Uses `hash_pandas_object` for fast, NaN-safe key comparison
31
+
32
+ - **Clean `TraceResult` API for provenance**:
33
+ - `trace.origin` — Unified origin: `{"type": "concat", "source_df": 1}` or `{"type": "merge", ...}`
34
+ - `trace.representative` — For dedup drops: `{"kept_rid": 42, "subset": ["key"], "keep": "first"}`
35
+ - No need to access internal `.store` methods
36
+
37
+ - **Clean `CheckResult` API**:
38
+ - `result.passed` — Alias for `.ok`
39
+ - `result.retention` — Row retention rate (0.0-1.0)
40
+ - `result.n_dropped`, `result.n_steps`, `result.drops_by_op`
41
+ - All properties discoverable via autocomplete
42
+
43
+ - **Comprehensive test suite**: 38 new tests covering concat, dedup, and TraceResult API
44
+
45
+ ### Changed
46
+
47
+ - `wrap_concat_with_lineage` rewritten for full provenance tracking
48
+ - `axis=1` concat propagates RIDs if all inputs match, otherwise PARTIAL
49
+ - `TraceResult` enhanced with `.origin` and `.representative` properties
50
+
51
+ ## [0.3.5] - 2026-02-03
52
+
53
+ ### Fixed
54
+
55
+ - **DataFrame.fillna double-logging**: `df.fillna({"col": 0})` now logs exactly 1 event
56
+ - Added `wrap_pandas_transform_method` with `_in_transform_op` flag
57
+
58
+ ### Added
59
+
60
+ - Known Limitations section in README documenting concat/dedup tracking gaps
61
+
62
+ ### Changed
63
+
64
+ - Test suite hardened with exact count assertions and multi-scenario tests
65
+
66
+ ## [0.3.4] - 2026-02-03
67
+
68
+ ### Fixed
69
+
70
+ - **Event deduplication**: Identical events from parallel pipelines are now deduplicated
71
+
72
+ ## [0.3.3] - 2026-02-03
73
+
74
+ ### Fixed
75
+
76
+ - **Double-logging bug**: `df['col'] = df['col'].fillna()` now logs exactly one event
77
+ - **Merge warning scoping**: `tp.check(df)` now only shows warnings for merges in df's lineage
78
+
79
+ ## [0.3.2] - 2026-02-03
80
+
81
+ ### Fixed
82
+
83
+ - Merge duplicate key warnings now correctly identify which table (left/right) has duplicates
84
+
85
+ ## [0.3.1] - 2026-02-03
86
+
87
+ ### Fixed
88
+
89
+ - Cell history now correctly chains through merge operations via lineage traversal
90
+ - `tp.why()` and `tp.trace()` show pre-merge changes for post-merge rows
91
+ - `enable()` resets accumulated state when called multiple times
92
+
93
+ ### Added
94
+
95
+ - `get_row_history_with_lineage()` and `get_cell_history_with_lineage()` methods
96
+
97
+ ## [0.3.0] - 2026-02-03
98
+
99
+ ### Added
100
+
101
+ - MkDocs documentation site with Material theme
102
+ - Comprehensive API reference documentation
103
+ - Getting started guides and tutorials
104
+ - `tp.register()` API for manually registering DataFrames
105
+ - Configurable retention threshold in `tp.check()`
106
+ - Ghost row capture for fallback filter paths
107
+ - Data quality contracts with fluent API
108
+ - HTML report generation
109
+ - Snapshot and diff functionality
110
+ - Debug mode with cell-level tracking
111
+ - `tp.why()` for cell provenance
112
+ - `tp.trace()` for row journey
113
+ - Support for all major pandas operations
114
+
115
+ ### Fixed
116
+
117
+ - Recursion bug when accessing hidden column in COLUMN mode
118
+ - Config propagation issues
119
+ - Retention rate calculation for multi-table pipelines
120
+ - Export wrappers correctly strip hidden column
@@ -68,6 +68,28 @@ When rows are aggregated:
68
68
  grouped = df.groupby("category").sum() # GROUP event with membership
69
69
  ```
70
70
 
71
+ ### Concat Events (v0.4+)
72
+
73
+ When DataFrames are concatenated, row IDs are preserved:
74
+
75
+ ```python
76
+ df1 = pd.DataFrame({"a": [1, 2]}) # Rows get IDs: 0, 1
77
+ df2 = pd.DataFrame({"a": [3, 4]}) # Rows get IDs: 2, 3
78
+
79
+ result = pd.concat([df1, df2]) # IDs preserved: 0, 1, 2, 3
80
+ # TracePipe tracks which source DataFrame each row came from
81
+ ```
82
+
83
+ ### Duplicate Drop Events (v0.4+)
84
+
85
+ In debug mode, `drop_duplicates` tracks which row was kept as representative:
86
+
87
+ ```python
88
+ df = pd.DataFrame({"key": ["A", "A", "B"], "val": [1, 2, 3]})
89
+ df = df.drop_duplicates(subset=["key"], keep="first")
90
+ # Row with val=2 was dropped, mapped to representative (val=1)
91
+ ```
92
+
71
93
  ---
72
94
 
73
95
  ## The Lineage Store
@@ -109,6 +109,70 @@ if trace.merge_parents:
109
109
  print(f"Right parent: {trace.merge_parents.right}")
110
110
  ```
111
111
 
112
+ ---
113
+
114
+ ## Concat Origin Tracking (v0.4+)
115
+
116
+ When rows come from concatenated DataFrames, TracePipe tracks their source via `trace.origin`:
117
+
118
+ ```python
119
+ df1 = pd.DataFrame({"a": [1, 2]})
120
+ df2 = pd.DataFrame({"a": [3, 4]})
121
+ result = pd.concat([df1, df2])
122
+
123
+ # Trace a row that came from df2
124
+ trace = tp.trace(result, row=2)
125
+ print(trace.origin)
126
+ # {"type": "concat", "source_df": 1, "step_id": 5}
127
+ ```
128
+
129
+ The `.origin` property returns a unified dict with:
130
+
131
+ - `type`: `"concat"`, `"merge"`, or `None` (for original rows)
132
+ - `source_df`: Index in the concat list (0=first DataFrame, 1=second, etc.)
133
+ - `step_id`: Which pipeline step
134
+
135
+ Row IDs are preserved through `pd.concat(axis=0)`, so each row's earlier history stays attached to it after concatenation:
136
+
137
+ ```python
138
+ # Transform df1 before concat
139
+ df1["a"] = df1["a"].fillna(0)
140
+
141
+ result = pd.concat([df1, df2])
142
+
143
+ # Rows from df1 still have their fillna history
144
+ trace = tp.trace(result, row=0) # Shows fillna event from df1
145
+ ```
146
+
147
+ ---
148
+
149
+ ## Duplicate Representative Tracking (v0.4+)
150
+
151
+ When `drop_duplicates` removes rows in debug mode, TracePipe tracks which row "won" via `trace.representative`:
152
+
153
+ ```python
154
+ df = pd.DataFrame({
155
+ "key": ["A", "A", "B"],
156
+ "value": [100, 200, 300]
157
+ })
158
+ df = df.drop_duplicates(subset=["key"], keep="first")
159
+
160
+ # Trace the dropped row (value=200); `dropped_row_id` is a placeholder for that row's TracePipe row ID
161
+ trace = tp.trace(df, row=dropped_row_id)
162
+ print(trace.representative)
163
+ # {"kept_rid": 42, "subset": ["key"], "keep": "first"}
164
+ ```
165
+
166
+ The `.representative` property is only set for rows dropped by `drop_duplicates`:
167
+
168
+ | `keep` Strategy | `.representative` |
169
+ |-----------------|-------------------|
170
+ | `keep='first'` | `{"kept_rid": 42, ...}` — first occurrence kept |
171
+ | `keep='last'` | `{"kept_rid": 45, ...}` — last occurrence kept |
172
+ | `keep=False` | `{"kept_rid": None, ...}` — all duplicates removed |
173
+
174
+ This answers "why did this row disappear?" — it wasn't deleted, it was deduplicated.
175
+
112
176
  ## Performance Considerations
113
177
 
114
178
  - Row tracing in CI mode is limited (no individual row IDs)
@@ -159,12 +159,14 @@ print(tp.why(df, "price", 0)) # Why price changed
159
159
 
160
160
  | Operation | Tracking | Completeness |
161
161
  |-----------|----------|--------------|
162
- | `dropna`, `drop_duplicates` | Dropped row IDs | Full |
163
- | `query`, `df[mask]` | Dropped row IDs | Full |
162
+ | `dropna`, `query`, `df[mask]` | Dropped row IDs | Full |
163
+ | `drop_duplicates` | Dropped→kept mapping (debug mode) | Full |
164
164
  | `head`, `tail`, `sample` | Dropped row IDs | Full |
165
165
  | `fillna`, `replace` | Cell diffs (watched cols) | Full |
166
166
  | `loc[]=`, `iloc[]=`, `at[]=` | Cell diffs | Full |
167
167
  | `merge`, `join` | Parent tracking | Full |
168
+ | `pd.concat(axis=0)` | Row IDs + source DataFrame | Full |
169
+ | `pd.concat(axis=1)` | Row IDs (if aligned) | Partial |
168
170
  | `groupby().agg()` | Group membership | Full |
169
171
  | `apply`, `pipe` | Output tracked | Partial |
170
172
 
@@ -1,6 +1,6 @@
1
1
  site_name: TracePipe
2
2
  site_description: Row-level data lineage tracking for pandas pipelines
3
- site_url: https://tracepipe.github.io/tracepipe/
3
+ site_url: https://gauthierpiarrette.github.io/tracepipe/
4
4
  repo_url: https://github.com/gauthierpiarrette/tracepipe
5
5
  repo_name: gauthierpiarrette/tracepipe
6
6
  edit_uri: edit/main/docs/
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "tracepipe"
7
- version = "0.3.4"
7
+ version = "0.4.1"
8
8
  description = "Row-level data lineage tracking for pandas pipelines"
9
9
  readme = "README.md"
10
10
  license = {file = "LICENSE"}
@@ -304,7 +304,9 @@ class TestRowLineageResult:
304
304
 
305
305
  row = dbg().explain_row(0)
306
306
  history = row.cell_history("a")
307
- assert len(history) >= 1
307
+ assert (
308
+ len(history) == 1
309
+ ), f"Single fillna should record exactly 1 change, got {len(history)}"
308
310
 
309
311
  def test_history(self):
310
312
  """history() returns full history."""
@@ -487,7 +489,9 @@ class TestPreEnableDataFrameTracking:
487
489
  df["a"] = df["a"] * 10
488
490
 
489
491
  result = tracepipe.why(df, col="a", row=0)
490
- assert len(result.history) >= 1
492
+ assert (
493
+ len(result.history) == 1
494
+ ), f"Single multiply should record exactly 1 change, got {len(result.history)}"
491
495
 
492
496
  def test_trace_after_register(self):
493
497
  """Row tracing works for registered DataFrames."""