tracepipe 0.3.3__tar.gz → 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151)
  1. {tracepipe-0.3.3 → tracepipe-0.3.5}/CHANGELOG.md +24 -0
  2. {tracepipe-0.3.3 → tracepipe-0.3.5}/PKG-INFO +21 -1
  3. {tracepipe-0.3.3 → tracepipe-0.3.5}/README.md +20 -0
  4. {tracepipe-0.3.3 → tracepipe-0.3.5}/pyproject.toml +1 -1
  5. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/test_lineage_through_merge.py +68 -0
  6. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/__init__.py +1 -1
  7. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/context.py +4 -0
  8. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/instrumentation/pandas_inst.py +11 -3
  9. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/safety.py +29 -4
  10. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/storage/lineage_store.py +37 -2
  11. {tracepipe-0.3.3 → tracepipe-0.3.5}/uv.lock +1 -1
  12. {tracepipe-0.3.3 → tracepipe-0.3.5}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  13. {tracepipe-0.3.3 → tracepipe-0.3.5}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  14. {tracepipe-0.3.3 → tracepipe-0.3.5}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  15. {tracepipe-0.3.3 → tracepipe-0.3.5}/.github/workflows/ci.yml +0 -0
  16. {tracepipe-0.3.3 → tracepipe-0.3.5}/.github/workflows/docs.yml +0 -0
  17. {tracepipe-0.3.3 → tracepipe-0.3.5}/.github/workflows/release.yml +0 -0
  18. {tracepipe-0.3.3 → tracepipe-0.3.5}/.gitignore +0 -0
  19. {tracepipe-0.3.3 → tracepipe-0.3.5}/.pre-commit-config.yaml +0 -0
  20. {tracepipe-0.3.3 → tracepipe-0.3.5}/CONTRIBUTING.md +0 -0
  21. {tracepipe-0.3.3 → tracepipe-0.3.5}/LICENSE +0 -0
  22. {tracepipe-0.3.3 → tracepipe-0.3.5}/benchmarks/README.md +0 -0
  23. {tracepipe-0.3.3 → tracepipe-0.3.5}/benchmarks/bench_memory.py +0 -0
  24. {tracepipe-0.3.3 → tracepipe-0.3.5}/benchmarks/bench_overhead.py +0 -0
  25. {tracepipe-0.3.3 → tracepipe-0.3.5}/benchmarks/bench_scale.py +0 -0
  26. {tracepipe-0.3.3 → tracepipe-0.3.5}/benchmarks/run_all.py +0 -0
  27. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/api/contracts.md +0 -0
  28. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/api/core.md +0 -0
  29. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/api/debug.md +0 -0
  30. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/api/index.md +0 -0
  31. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/changelog.md +0 -0
  32. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/contributing.md +0 -0
  33. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/examples/data-validation.md +0 -0
  34. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/examples/ml-pipeline.md +0 -0
  35. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/getting-started/installation.md +0 -0
  36. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/getting-started/modes.md +0 -0
  37. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/getting-started/quickstart.md +0 -0
  38. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/guide/cell-provenance.md +0 -0
  39. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/guide/concepts.md +0 -0
  40. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/guide/contracts.md +0 -0
  41. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/guide/health-checks.md +0 -0
  42. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/guide/reports.md +0 -0
  43. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/guide/row-tracing.md +0 -0
  44. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/guide/snapshots.md +0 -0
  45. {tracepipe-0.3.3 → tracepipe-0.3.5}/docs/index.md +0 -0
  46. {tracepipe-0.3.3 → tracepipe-0.3.5}/examples/comprehensive_demo.py +0 -0
  47. {tracepipe-0.3.3 → tracepipe-0.3.5}/examples/demo.py +0 -0
  48. {tracepipe-0.3.3 → tracepipe-0.3.5}/examples/ml_pipeline_demo.py +0 -0
  49. {tracepipe-0.3.3 → tracepipe-0.3.5}/examples/red_team_test.py +0 -0
  50. {tracepipe-0.3.3 → tracepipe-0.3.5}/mkdocs.yml +0 -0
  51. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/404.html +0 -0
  52. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/api/contracts/index.html +0 -0
  53. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/api/core/index.html +0 -0
  54. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/api/debug/index.html +0 -0
  55. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/api/index.html +0 -0
  56. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/_mkdocstrings.css +0 -0
  57. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/images/favicon.png +0 -0
  58. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/bundle.79ae519e.min.js +0 -0
  59. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/bundle.79ae519e.min.js.map +0 -0
  60. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.ar.min.js +0 -0
  61. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.da.min.js +0 -0
  62. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.de.min.js +0 -0
  63. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.du.min.js +0 -0
  64. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.el.min.js +0 -0
  65. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.es.min.js +0 -0
  66. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.fi.min.js +0 -0
  67. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.fr.min.js +0 -0
  68. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.he.min.js +0 -0
  69. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.hi.min.js +0 -0
  70. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.hu.min.js +0 -0
  71. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.hy.min.js +0 -0
  72. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.it.min.js +0 -0
  73. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.ja.min.js +0 -0
  74. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.jp.min.js +0 -0
  75. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.kn.min.js +0 -0
  76. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.ko.min.js +0 -0
  77. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.multi.min.js +0 -0
  78. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.nl.min.js +0 -0
  79. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.no.min.js +0 -0
  80. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.pt.min.js +0 -0
  81. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.ro.min.js +0 -0
  82. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.ru.min.js +0 -0
  83. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.sa.min.js +0 -0
  84. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.stemmer.support.min.js +0 -0
  85. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.sv.min.js +0 -0
  86. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.ta.min.js +0 -0
  87. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.te.min.js +0 -0
  88. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.th.min.js +0 -0
  89. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.tr.min.js +0 -0
  90. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.vi.min.js +0 -0
  91. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/min/lunr.zh.min.js +0 -0
  92. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/tinyseg.js +0 -0
  93. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/lunr/wordcut.js +0 -0
  94. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/workers/search.2c215733.min.js +0 -0
  95. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/javascripts/workers/search.2c215733.min.js.map +0 -0
  96. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/stylesheets/main.484c7ddc.min.css +0 -0
  97. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/stylesheets/main.484c7ddc.min.css.map +0 -0
  98. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/stylesheets/palette.ab4e12ef.min.css +0 -0
  99. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/assets/stylesheets/palette.ab4e12ef.min.css.map +0 -0
  100. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/changelog/index.html +0 -0
  101. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/contributing/index.html +0 -0
  102. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/examples/data-validation/index.html +0 -0
  103. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/examples/ml-pipeline/index.html +0 -0
  104. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/getting-started/installation/index.html +0 -0
  105. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/getting-started/modes/index.html +0 -0
  106. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/getting-started/quickstart/index.html +0 -0
  107. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/guide/cell-provenance/index.html +0 -0
  108. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/guide/concepts/index.html +0 -0
  109. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/guide/contracts/index.html +0 -0
  110. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/guide/health-checks/index.html +0 -0
  111. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/guide/reports/index.html +0 -0
  112. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/guide/row-tracing/index.html +0 -0
  113. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/guide/snapshots/index.html +0 -0
  114. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/index.html +0 -0
  115. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/objects.inv +0 -0
  116. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/search/search_index.json +0 -0
  117. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/sitemap.xml +0 -0
  118. {tracepipe-0.3.3 → tracepipe-0.3.5}/site/sitemap.xml.gz +0 -0
  119. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/__init__.py +0 -0
  120. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/conftest.py +0 -0
  121. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/test_api.py +0 -0
  122. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/test_concurrency.py +0 -0
  123. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/test_contracts.py +0 -0
  124. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/test_convenience_debug.py +0 -0
  125. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/test_edge_cases.py +0 -0
  126. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/test_integration.py +0 -0
  127. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/test_io_operations.py +0 -0
  128. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/test_pandas_inst.py +0 -0
  129. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/test_public_api.py +0 -0
  130. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/test_snapshot.py +0 -0
  131. {tracepipe-0.3.3 → tracepipe-0.3.5}/tests/test_version_matrix.py +0 -0
  132. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/api.py +0 -0
  133. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/contracts.py +0 -0
  134. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/convenience.py +0 -0
  135. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/core.py +0 -0
  136. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/debug.py +0 -0
  137. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/instrumentation/__init__.py +0 -0
  138. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/instrumentation/apply_capture.py +0 -0
  139. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/instrumentation/filter_capture.py +0 -0
  140. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/instrumentation/indexer_capture.py +0 -0
  141. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/instrumentation/merge_capture.py +0 -0
  142. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/instrumentation/series_capture.py +0 -0
  143. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/snapshot.py +0 -0
  144. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/storage/__init__.py +0 -0
  145. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/storage/base.py +0 -0
  146. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/storage/row_identity.py +0 -0
  147. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/utils/__init__.py +0 -0
  148. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/utils/value_capture.py +0 -0
  149. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/value_provenance.py +0 -0
  150. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/visualization/__init__.py +0 -0
  151. {tracepipe-0.3.3 → tracepipe-0.3.5}/tracepipe/visualization/html_export.py +0 -0
@@ -5,6 +5,30 @@ All notable changes to TracePipe will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## 0.3.5 - 2026-02-03
9
+
10
+ ### Fixed
11
+ - **DataFrame.fillna double-logging**: `df.fillna({"col": 0})` now logs exactly 1 event
12
+ - Previously logged both `DataFrame.fillna` and the internal `__setitem__` for the same change
13
+ - Added `wrap_pandas_transform_method` with `_in_transform_op` flag to suppress nested setitem
14
+ - Works for both `fillna` and `replace` operations, including `inplace=True`
15
+
16
+ ### Added
17
+ - Known Limitations section in README documenting concat/dedup tracking gaps
18
+ - Test for `DataFrame.fillna` single-event logging
19
+
20
+ ## 0.3.4 - 2026-02-03
21
+
22
+ ### Fixed
23
+ - **Event deduplication**: Identical events from parallel pipelines are now deduplicated
24
+ - When multiple DataFrames share row IDs (e.g., from `df.copy()`), same changes are recorded once
25
+ - Events deduplicated by `(col, old_val, new_val, operation)` signature
26
+ - Prevents "4 events" when only 1 logical change occurred
27
+
28
+ ### Added
29
+ - `_stable_repr()` helper for robust value comparison in deduplication
30
+ - Tests for cross-pipeline event deduplication behavior
31
+
8
32
  ## 0.3.3 - 2026-02-03
9
33
 
10
34
  ### Fixed
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tracepipe
3
- Version: 0.3.3
3
+ Version: 0.3.5
4
4
  Summary: Row-level data lineage tracking for pandas pipelines
5
5
  Project-URL: Homepage, https://github.com/tracepipe/tracepipe
6
6
  Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
@@ -276,6 +276,26 @@ tp.enable(mode="debug") # Full lineage
276
276
 
277
277
  ---
278
278
 
279
+ ## Known Limitations
280
+
281
+ TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merge provenance** reliably. However, some patterns are not yet fully supported:
282
+
283
+ | Pattern | Status | Notes |
284
+ |---------|--------|-------|
285
+ | `df["col"] = df["col"].fillna(0)` | ✅ Tracked | Series + assignment |
286
+ | `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
287
+ | `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
288
+ | `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
289
+ | `pd.concat([df1, df2])` | ⚠️ Partial | Row IDs preserved, but no "source DataFrame" tracking |
290
+ | `df.drop_duplicates(keep='last')` | ⚠️ Partial | Which row was kept is not tracked |
291
+ | Sort + dedup patterns | ⚠️ Partial | "Latest record wins" logic not traced |
292
+
293
+ **Why?** TracePipe tracks value changes within rows, not row-selection operations. When `drop_duplicates` picks one row over another, that's a provenance decision (not a cell mutation) that isn't currently instrumented.
294
+
295
+ **Planned for 0.4**: Full row-provenance tracking for concat, drop_duplicates, and sort operations.
296
+
297
+ ---
298
+
279
299
  ## Contributing
280
300
 
281
301
  ```bash
@@ -207,6 +207,26 @@ tp.enable(mode="debug") # Full lineage
207
207
 
208
208
  ---
209
209
 
210
+ ## Known Limitations
211
+
212
+ TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merge provenance** reliably. However, some patterns are not yet fully supported:
213
+
214
+ | Pattern | Status | Notes |
215
+ |---------|--------|-------|
216
+ | `df["col"] = df["col"].fillna(0)` | ✅ Tracked | Series + assignment |
217
+ | `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
218
+ | `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
219
+ | `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
220
+ | `pd.concat([df1, df2])` | ⚠️ Partial | Row IDs preserved, but no "source DataFrame" tracking |
221
+ | `df.drop_duplicates(keep='last')` | ⚠️ Partial | Which row was kept is not tracked |
222
+ | Sort + dedup patterns | ⚠️ Partial | "Latest record wins" logic not traced |
223
+
224
+ **Why?** TracePipe tracks value changes within rows, not row-selection operations. When `drop_duplicates` picks one row over another, that's a provenance decision (not a cell mutation) that isn't currently instrumented.
225
+
226
+ **Planned for 0.4**: Full row-provenance tracking for concat, drop_duplicates, and sort operations.
227
+
228
+ ---
229
+
210
230
  ## Contributing
211
231
 
212
232
  ```bash
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "tracepipe"
7
- version = "0.3.3"
7
+ version = "0.3.5"
8
8
  description = "Row-level data lineage tracking for pandas pipelines"
9
9
  readme = "README.md"
10
10
  license = {file = "LICENSE"}
@@ -249,6 +249,23 @@ class TestFillnaTrackingVerification:
249
249
  result.n_changes >= 1
250
250
  ), f"DataFrame.fillna should be tracked, got {result.n_changes} changes"
251
251
 
252
+ def test_dataframe_fillna_logs_once_not_twice(self):
253
+ """df.fillna({'col': val}) should log exactly 1 event, not 2.
254
+
255
+ Previously, both DataFrame.fillna and the internal __setitem__ were
256
+ recording the same change, causing double-logging.
257
+ """
258
+ tp.enable(mode="debug", watch=["a"])
259
+
260
+ df = pd.DataFrame({"a": [1.0, None, 3.0]})
261
+ df = df.fillna({"a": 0})
262
+
263
+ result = tp.why(df, col="a", row=1)
264
+ assert result.n_changes == 1, (
265
+ f"DataFrame.fillna should log exactly 1 event, got {result.n_changes}. "
266
+ f"Double-logging bug if > 1. History: {result.history}"
267
+ )
268
+
252
269
  def test_loc_assignment_tracked(self):
253
270
  """df.loc[mask, col] = val should be tracked."""
254
271
  tp.enable(mode="debug", watch=["a"])
@@ -350,6 +367,57 @@ class TestNoDoubleLogging:
350
367
  f"History: {result.history}"
351
368
  )
352
369
 
370
+ def test_cross_pipeline_identical_change_deduplication(self):
371
+ """Identical changes from parallel pipelines should be deduplicated.
372
+
373
+ When multiple pipelines from the same source do the SAME transformation
374
+ (e.g., both do fillna(0)), the event should only appear once.
375
+ """
376
+ tp.enable(mode="debug", watch=["income"])
377
+
378
+ # Source data with a row that has None income
379
+ customers = pd.DataFrame({"id": ["A", "B"], "income": [None, 100.0]})
380
+
381
+ # Two parallel pipelines doing the SAME transformation
382
+ df1 = customers.copy()
383
+ df1["income"] = df1["income"].fillna(0) # Records None -> 0
384
+
385
+ df2 = customers.copy()
386
+ df2["income"] = df2["income"].fillna(0) # Records SAME None -> 0
387
+
388
+ # Query df1 - should deduplicate identical events
389
+ result1 = tp.why(df1, col="income", row=0)
390
+ assert result1.n_changes == 1, (
391
+ f"Identical events should be deduplicated. Got {result1.n_changes}. "
392
+ f"History: {result1.history}"
393
+ )
394
+
395
+ def test_cross_pipeline_different_changes_preserved(self):
396
+ """Different changes from parallel pipelines should NOT be deduplicated.
397
+
398
+ When pipelines do DIFFERENT transformations on the same row,
399
+ both events should be visible (this is expected behavior - row IDs
400
+ are shared, so history includes all changes to that row ID).
401
+ """
402
+ tp.enable(mode="debug", watch=["income"])
403
+
404
+ customers = pd.DataFrame({"id": ["A", "B"], "income": [None, 100.0]})
405
+
406
+ # Two pipelines doing DIFFERENT transformations
407
+ df1 = customers.copy()
408
+ df1["income"] = df1["income"].fillna(0) # None -> 0
409
+
410
+ df2 = customers.copy()
411
+ df2["income"] = df2["income"].fillna(99) # None -> 99
412
+
413
+ # Query df1 - since row ID is shared, both changes are visible
414
+ # This is expected: deduplication only removes IDENTICAL events
415
+ result1 = tp.why(df1, col="income", row=0)
416
+ assert result1.n_changes == 2, (
417
+ f"Different changes should both be visible. Got {result1.n_changes}. "
418
+ f"History: {result1.history}"
419
+ )
420
+
353
421
 
354
422
  class TestMergeWarningScoping:
355
423
  """Tests for merge warnings being scoped to df's lineage."""
@@ -81,7 +81,7 @@ from .core import TracePipeConfig, TracePipeMode
81
81
  from .snapshot import DiffResult, Snapshot, diff, snapshot
82
82
 
83
83
  # === VERSION ===
84
- __version__ = "0.3.3"
84
+ __version__ = "0.3.5"
85
85
 
86
86
  # === MINIMAL __all__ ===
87
87
  __all__ = [
@@ -63,6 +63,10 @@ class TracePipeContext:
63
63
  # When > 0, __getitem__[mask] skips capture (parent op will capture)
64
64
  self._filter_op_depth: int = 0
65
65
 
66
+ # Transform operation tracking (prevents double-counting fillna/replace)
67
+ # When > 0, __setitem__ skips capture (transform op will capture)
68
+ self._in_transform_op: int = 0
69
+
66
70
  # GroupBy state stack (supports nesting)
67
71
  self._groupby_stack: list[dict] = []
68
72
 
@@ -22,6 +22,7 @@ from ..safety import (
22
22
  get_caller_info,
23
23
  wrap_pandas_method,
24
24
  wrap_pandas_method_inplace,
25
+ wrap_pandas_transform_method,
25
26
  )
26
27
  from ..utils.value_capture import find_changed_indices_vectorized
27
28
  from .apply_capture import instrument_apply_pipe, uninstrument_apply_pipe
@@ -554,6 +555,9 @@ def _wrap_setitem(original):
554
555
 
555
556
  Captures BEFORE state for existing columns, then executes assignment,
556
557
  then records the diff with actual old/new values.
558
+
559
+ Skips recording when inside a transform operation (fillna, replace) to
560
+ avoid double-counting cell changes - the transform wrapper will capture.
557
561
  """
558
562
 
559
563
  @wraps(original)
@@ -565,7 +569,9 @@ def _wrap_setitem(original):
565
569
  is_new_column = False
566
570
  should_track = False
567
571
 
568
- if ctx.enabled and isinstance(key, str):
572
+ # Skip tracking if we're inside a transform operation (fillna, replace)
573
+ # Those operations will capture the change themselves
574
+ if ctx.enabled and isinstance(key, str) and ctx._in_transform_op == 0:
569
575
  if key in ctx.watched_columns:
570
576
  should_track = True
571
577
  if key in self.columns:
@@ -771,13 +777,15 @@ def instrument_pandas():
771
777
  wrapped = wrap_filter_method(method_name, original)
772
778
  setattr(pd.DataFrame, method_name, wrapped)
773
779
 
774
- # === DataFrame transform methods (with inplace support) ===
780
+ # === DataFrame transform methods (fillna, replace) ===
781
+ # These use wrap_pandas_transform_method to suppress __setitem__ recording
782
+ # during the transform, avoiding double-counting cell changes
775
783
  transform_methods = ["fillna", "replace"]
776
784
  for method_name in transform_methods:
777
785
  if hasattr(pd.DataFrame, method_name):
778
786
  original = getattr(pd.DataFrame, method_name)
779
787
  _originals[f"DataFrame.{method_name}"] = original
780
- wrapped = wrap_pandas_method_inplace(method_name, original, _capture_transform)
788
+ wrapped = wrap_pandas_transform_method(method_name, original, _capture_transform)
781
789
  setattr(pd.DataFrame, method_name, wrapped)
782
790
 
783
791
  # === astype (no inplace) ===
@@ -103,7 +103,7 @@ def _make_wrapper(
103
103
  method_name: Name for error messages
104
104
  original_method: The original pandas method
105
105
  capture_func: func(self, args, kwargs, result, ctx, method_name)
106
- mode: "standard", "filter", or "inplace"
106
+ mode: "standard", "filter", "inplace", or "transform"
107
107
  """
108
108
 
109
109
  @wraps(original_method)
@@ -112,10 +112,21 @@ def _make_wrapper(
112
112
 
113
113
  # === PRE-EXECUTION SETUP ===
114
114
  before_snapshot = None
115
+ is_inplace = kwargs.get("inplace", False)
115
116
 
116
117
  if mode == "filter" and ctx.enabled:
117
118
  ctx._filter_op_depth += 1
118
- elif mode == "inplace" and ctx.enabled and kwargs.get("inplace", False):
119
+ elif mode == "transform" and ctx.enabled:
120
+ # Suppress __setitem__ recording during transform ops (fillna, replace)
121
+ # to avoid double-counting the same cell change
122
+ ctx._in_transform_op += 1
123
+ # Also handle inplace for transform operations
124
+ if is_inplace:
125
+ try:
126
+ before_snapshot = self.copy()
127
+ except Exception:
128
+ pass
129
+ elif mode == "inplace" and ctx.enabled and is_inplace:
119
130
  try:
120
131
  before_snapshot = self.copy()
121
132
  except Exception:
@@ -127,15 +138,18 @@ def _make_wrapper(
127
138
  finally:
128
139
  if mode == "filter" and ctx.enabled:
129
140
  ctx._filter_op_depth -= 1
141
+ elif mode == "transform" and ctx.enabled:
142
+ ctx._in_transform_op -= 1
130
143
 
131
144
  # === CAPTURE LINEAGE (SIDE EFFECT) ===
132
145
  # Skip capture if we're inside a filter operation (prevents recursion during export)
133
146
  if ctx.enabled and ctx._filter_op_depth == 0:
134
147
  try:
135
- if mode == "inplace" and kwargs.get("inplace", False):
148
+ # Handle inplace for both "inplace" and "transform" modes
149
+ if (mode == "inplace" or mode == "transform") and is_inplace:
136
150
  if before_snapshot is not None:
137
151
  capture_func(before_snapshot, args, kwargs, self, ctx, method_name)
138
- elif mode == "inplace" and result is not None:
152
+ elif (mode == "inplace" or mode == "transform") and result is not None:
139
153
  capture_func(self, args, kwargs, result, ctx, method_name)
140
154
  else:
141
155
  capture_func(self, args, kwargs, result, ctx, method_name)
@@ -176,3 +190,14 @@ def wrap_pandas_method_inplace(
176
190
  ) -> Callable:
177
191
  """Wrap a pandas method that supports inplace=True."""
178
192
  return _make_wrapper(method_name, original_method, capture_func, mode="inplace")
193
+
194
+
195
+ def wrap_pandas_transform_method(
196
+ method_name: str, original_method: Callable, capture_func: Callable
197
+ ) -> Callable:
198
+ """Wrap a pandas transform method (fillna, replace) that may trigger internal setitem.
199
+
200
+ These methods modify column values and pandas internally uses setitem.
201
+ We suppress setitem recording during these ops to avoid double-counting.
202
+ """
203
+ return _make_wrapper(method_name, original_method, capture_func, mode="transform")
@@ -32,6 +32,22 @@ from ..core import (
32
32
  from ..utils.value_capture import capture_typed_value
33
33
 
34
34
 
35
+ def _stable_repr(val) -> str:
36
+ """Create a stable string representation for deduplication.
37
+
38
+ Handles NaN, None, and other values that don't compare equal to themselves.
39
+ """
40
+ if val is None:
41
+ return "None"
42
+ # Handle NaN (which doesn't equal itself)
43
+ try:
44
+ if isinstance(val, float) and val != val: # NaN check
45
+ return "NaN"
46
+ except (TypeError, ValueError):
47
+ pass
48
+ return repr(val)
49
+
50
+
35
51
  class InMemoryLineageStore:
36
52
  """
37
53
  Columnar storage for lineage data using Structure of Arrays (SoA).
@@ -556,12 +572,15 @@ class InMemoryLineageStore:
556
572
  Follows merge lineage recursively to build complete cell provenance.
557
573
  This is essential for tracking changes that happened before merge operations.
558
574
 
575
+ Deduplicates events by (col, old_val, new_val, operation) signature to prevent
576
+ cross-pipeline contamination when multiple DataFrames share row IDs.
577
+
559
578
  Args:
560
579
  row_id: Row ID to trace
561
580
  max_depth: Maximum merge depth to follow (prevents infinite loops)
562
581
 
563
582
  Returns:
564
- List of events in chronological order, including parent row events.
583
+ List of UNIQUE events in chronological order, including parent row events.
565
584
  """
566
585
  visited: set[int] = set()
567
586
 
@@ -589,7 +608,23 @@ class InMemoryLineageStore:
589
608
  # Sort by step_id to ensure chronological order across lineage
590
609
  all_events.sort(key=lambda e: e["step_id"])
591
610
 
592
- return all_events
611
+ # Deduplicate by (col, old_val, new_val, operation) signature
612
+ # This prevents cross-pipeline contamination when multiple DataFrames
613
+ # share the same row IDs (e.g., df.copy() followed by parallel transforms)
614
+ seen_signatures: set[tuple] = set()
615
+ unique_events = []
616
+ for event in all_events:
617
+ sig = (
618
+ event.get("col"),
619
+ _stable_repr(event.get("old_val")),
620
+ _stable_repr(event.get("new_val")),
621
+ event.get("operation"),
622
+ )
623
+ if sig not in seen_signatures:
624
+ seen_signatures.add(sig)
625
+ unique_events.append(event)
626
+
627
+ return unique_events
593
628
 
594
629
  def get_cell_history_with_lineage(
595
630
  self, row_id: int, column: str, max_depth: int = 10
@@ -2051,7 +2051,7 @@ wheels = [
2051
2051
 
2052
2052
  [[package]]
2053
2053
  name = "tracepipe"
2054
- version = "0.3.2"
2054
+ version = "0.3.4"
2055
2055
  source = { editable = "." }
2056
2056
  dependencies = [
2057
2057
  { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes