tracepipe 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. {tracepipe-0.4.1 → tracepipe-0.4.2}/PKG-INFO +1 -1
  2. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/changelog.md +21 -0
  3. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/getting-started/quickstart.md +12 -1
  4. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/guide/cell-provenance.md +10 -2
  5. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/guide/row-tracing.md +22 -8
  6. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/guide/snapshots.md +31 -11
  7. {tracepipe-0.4.1 → tracepipe-0.4.2}/pyproject.toml +1 -1
  8. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_convenience_debug.py +11 -3
  9. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_public_api.py +6 -1
  10. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_row_provenance.py +2 -4
  11. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/__init__.py +1 -1
  12. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/convenience.py +131 -12
  13. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/debug.py +40 -0
  14. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/snapshot.py +87 -2
  15. tracepipe-0.4.1/CHANGELOG.md +0 -162
  16. {tracepipe-0.4.1 → tracepipe-0.4.2}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  17. {tracepipe-0.4.1 → tracepipe-0.4.2}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  18. {tracepipe-0.4.1 → tracepipe-0.4.2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  19. {tracepipe-0.4.1 → tracepipe-0.4.2}/.github/workflows/ci.yml +0 -0
  20. {tracepipe-0.4.1 → tracepipe-0.4.2}/.github/workflows/docs.yml +0 -0
  21. {tracepipe-0.4.1 → tracepipe-0.4.2}/.github/workflows/release.yml +0 -0
  22. {tracepipe-0.4.1 → tracepipe-0.4.2}/.gitignore +0 -0
  23. {tracepipe-0.4.1 → tracepipe-0.4.2}/.pre-commit-config.yaml +0 -0
  24. {tracepipe-0.4.1 → tracepipe-0.4.2}/CONTRIBUTING.md +0 -0
  25. {tracepipe-0.4.1 → tracepipe-0.4.2}/LICENSE +0 -0
  26. {tracepipe-0.4.1 → tracepipe-0.4.2}/README.md +0 -0
  27. {tracepipe-0.4.1 → tracepipe-0.4.2}/benchmarks/README.md +0 -0
  28. {tracepipe-0.4.1 → tracepipe-0.4.2}/benchmarks/bench_memory.py +0 -0
  29. {tracepipe-0.4.1 → tracepipe-0.4.2}/benchmarks/bench_overhead.py +0 -0
  30. {tracepipe-0.4.1 → tracepipe-0.4.2}/benchmarks/bench_scale.py +0 -0
  31. {tracepipe-0.4.1 → tracepipe-0.4.2}/benchmarks/run_all.py +0 -0
  32. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/api/contracts.md +0 -0
  33. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/api/core.md +0 -0
  34. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/api/debug.md +0 -0
  35. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/api/index.md +0 -0
  36. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/contributing.md +0 -0
  37. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/examples/data-validation.md +0 -0
  38. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/examples/ml-pipeline.md +0 -0
  39. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/getting-started/installation.md +0 -0
  40. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/getting-started/modes.md +0 -0
  41. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/guide/concepts.md +0 -0
  42. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/guide/contracts.md +0 -0
  43. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/guide/health-checks.md +0 -0
  44. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/guide/reports.md +0 -0
  45. {tracepipe-0.4.1 → tracepipe-0.4.2}/docs/index.md +0 -0
  46. {tracepipe-0.4.1 → tracepipe-0.4.2}/examples/demo.py +0 -0
  47. {tracepipe-0.4.1 → tracepipe-0.4.2}/examples/ml_pipeline_demo.py +0 -0
  48. {tracepipe-0.4.1 → tracepipe-0.4.2}/mkdocs.yml +0 -0
  49. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/404.html +0 -0
  50. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/api/contracts/index.html +0 -0
  51. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/api/core/index.html +0 -0
  52. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/api/debug/index.html +0 -0
  53. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/api/index.html +0 -0
  54. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/_mkdocstrings.css +0 -0
  55. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/images/favicon.png +0 -0
  56. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/bundle.79ae519e.min.js +0 -0
  57. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/bundle.79ae519e.min.js.map +0 -0
  58. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.ar.min.js +0 -0
  59. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.da.min.js +0 -0
  60. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.de.min.js +0 -0
  61. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.du.min.js +0 -0
  62. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.el.min.js +0 -0
  63. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.es.min.js +0 -0
  64. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.fi.min.js +0 -0
  65. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.fr.min.js +0 -0
  66. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.he.min.js +0 -0
  67. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.hi.min.js +0 -0
  68. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.hu.min.js +0 -0
  69. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.hy.min.js +0 -0
  70. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.it.min.js +0 -0
  71. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.ja.min.js +0 -0
  72. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.jp.min.js +0 -0
  73. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.kn.min.js +0 -0
  74. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.ko.min.js +0 -0
  75. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.multi.min.js +0 -0
  76. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.nl.min.js +0 -0
  77. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.no.min.js +0 -0
  78. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.pt.min.js +0 -0
  79. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.ro.min.js +0 -0
  80. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.ru.min.js +0 -0
  81. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.sa.min.js +0 -0
  82. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.stemmer.support.min.js +0 -0
  83. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.sv.min.js +0 -0
  84. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.ta.min.js +0 -0
  85. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.te.min.js +0 -0
  86. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.th.min.js +0 -0
  87. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.tr.min.js +0 -0
  88. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.vi.min.js +0 -0
  89. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/min/lunr.zh.min.js +0 -0
  90. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/tinyseg.js +0 -0
  91. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/lunr/wordcut.js +0 -0
  92. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/workers/search.2c215733.min.js +0 -0
  93. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/javascripts/workers/search.2c215733.min.js.map +0 -0
  94. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/stylesheets/main.484c7ddc.min.css +0 -0
  95. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/stylesheets/main.484c7ddc.min.css.map +0 -0
  96. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/stylesheets/palette.ab4e12ef.min.css +0 -0
  97. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/assets/stylesheets/palette.ab4e12ef.min.css.map +0 -0
  98. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/changelog/index.html +0 -0
  99. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/contributing/index.html +0 -0
  100. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/examples/data-validation/index.html +0 -0
  101. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/examples/ml-pipeline/index.html +0 -0
  102. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/getting-started/installation/index.html +0 -0
  103. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/getting-started/modes/index.html +0 -0
  104. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/getting-started/quickstart/index.html +0 -0
  105. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/guide/cell-provenance/index.html +0 -0
  106. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/guide/concepts/index.html +0 -0
  107. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/guide/contracts/index.html +0 -0
  108. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/guide/health-checks/index.html +0 -0
  109. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/guide/reports/index.html +0 -0
  110. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/guide/row-tracing/index.html +0 -0
  111. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/guide/snapshots/index.html +0 -0
  112. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/index.html +0 -0
  113. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/objects.inv +0 -0
  114. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/search/search_index.json +0 -0
  115. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/sitemap.xml +0 -0
  116. {tracepipe-0.4.1 → tracepipe-0.4.2}/site/sitemap.xml.gz +0 -0
  117. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/__init__.py +0 -0
  118. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/conftest.py +0 -0
  119. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_api.py +0 -0
  120. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_concurrency.py +0 -0
  121. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_contracts.py +0 -0
  122. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_edge_cases.py +0 -0
  123. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_integration.py +0 -0
  124. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_integration_scenarios.py +0 -0
  125. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_io_operations.py +0 -0
  126. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_lineage_through_merge.py +0 -0
  127. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_pandas_inst.py +0 -0
  128. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_snapshot.py +0 -0
  129. {tracepipe-0.4.1 → tracepipe-0.4.2}/tests/test_version_matrix.py +0 -0
  130. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/api.py +0 -0
  131. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/context.py +0 -0
  132. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/contracts.py +0 -0
  133. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/core.py +0 -0
  134. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/instrumentation/__init__.py +0 -0
  135. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/instrumentation/apply_capture.py +0 -0
  136. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/instrumentation/filter_capture.py +0 -0
  137. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/instrumentation/indexer_capture.py +0 -0
  138. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/instrumentation/merge_capture.py +0 -0
  139. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/instrumentation/pandas_inst.py +0 -0
  140. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/instrumentation/series_capture.py +0 -0
  141. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/safety.py +0 -0
  142. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/storage/__init__.py +0 -0
  143. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/storage/base.py +0 -0
  144. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/storage/lineage_store.py +0 -0
  145. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/storage/row_identity.py +0 -0
  146. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/utils/__init__.py +0 -0
  147. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/utils/value_capture.py +0 -0
  148. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/value_provenance.py +0 -0
  149. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/visualization/__init__.py +0 -0
  150. {tracepipe-0.4.1 → tracepipe-0.4.2}/tracepipe/visualization/html_export.py +0 -0
  151. {tracepipe-0.4.1 → tracepipe-0.4.2}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tracepipe
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Row-level data lineage tracking for pandas pipelines
5
5
  Project-URL: Homepage, https://github.com/tracepipe/tracepipe
6
6
  Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
@@ -5,6 +5,27 @@ All notable changes to TracePipe will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.4.2] - 2026-02-04
9
+
10
+ ### Fixed
11
+ - **`CheckResult` change tracking**: Added `n_changes` and `changes_by_op` properties in debug mode to track value changes across pipeline steps
12
+ - **`TraceResult` status fields**: Added `status`, `dropped_by`, and `dropped_at_step` properties for clearer dropped row analysis
13
+ - **`DiffResult` completeness**: Added `cells_changed`, `changes_by_column`, `rows_unchanged`, and `changed_rows` for detailed snapshot comparison
14
+ - **Ghost value API**: Implemented `dbg.get_ghost_values(row_id)` for retrieving last known values of dropped rows
15
+ - **Merge provenance**: `trace.origin` and `trace.merge_origin` now properly populated for merged rows
16
+ - **Documentation alignment**: All documented APIs now match actual implementation with comprehensive test coverage
17
+
18
+ ### Changed
19
+ - **`tp.trace()` API enhancement**: Added `row_id=` parameter for explicit internal row ID tracking
20
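+ - `row=` now refers to the DataFrame positional index (0-based; negative values count from the end). If the DataFrame is not tracked, the value is still used as a row ID (legacy behavior)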
21
+ - `row_id=` refers to TracePipe's internal row identifier (stable across operations)
22
+ - Supports tracing dropped rows by ID: `tp.trace(df, row_id=42)`
23
+ - **`tp.why()` API enhancement**: Added `row_id=` parameter matching `tp.trace()` signature
24
+
25
+ ### Added
26
+ - Comprehensive test suite (`test_doc_api_alignment.py`) with 27 tests validating documented API features
27
+ - Better error messages for out-of-bounds row access
28
+
8
29
  ## [0.4.1] - 2026-02-04
9
30
 
10
31
  ### Fixed
@@ -43,7 +43,7 @@ Output:
43
43
  TracePipe Check: [OK] Pipeline healthy
44
44
  Mode: debug
45
45
 
46
- Retention: 2/4 (50.0%)
46
+ Retention: 50%
47
47
  Dropped: 2 rows
48
48
  • DataFrame.dropna: 1
49
49
  • DataFrame.__getitem__[mask]: 1
@@ -52,6 +52,17 @@ Value changes: 2 cells
52
52
  • DataFrame.__setitem__[total]: 2
53
53
  ```
54
54
 
55
+ The `CheckResult` object provides convenient properties:
56
+
57
+ ```python
58
+ result.passed # True/False
59
+ result.retention # 0.5 (row retention rate)
60
+ result.n_dropped # 2 (total dropped rows)
61
+ result.drops_by_op # {"DataFrame.dropna": 1, ...}
62
+ result.n_changes # 2 (cell changes, debug mode only)
63
+ result.changes_by_op # {"DataFrame.__setitem__[total]": 2}
64
+ ```
65
+
55
66
  ## 4. Trace a Row's Journey
56
67
 
57
68
  ```python
@@ -140,9 +140,17 @@ For dropped rows, you can still query their last known values:
140
140
  ```python
141
141
  dbg = tp.debug.inspect()
142
142
 
143
- # Get ghost values for a dropped row
143
+ # Get ghost values for a specific dropped row
144
144
  dropped_rid = list(dbg.dropped_rows())[0]
145
145
  ghost = dbg.get_ghost_values(dropped_rid)
146
-
147
146
  print(f"Last known values: {ghost}")
147
+ # {"age": 25, "salary": 50000}
148
+
149
+ # Or get all ghost rows as a DataFrame
150
+ ghost_df = dbg.ghost_rows()
151
+ print(ghost_df)
152
+ # DataFrame with __tp_row_id__, __tp_dropped_by__, and watched columns
148
153
  ```
154
+
155
+ The `get_ghost_values(row_id)` method returns a dict mapping column names to
156
+ their last known values, or `None` if the row wasn't found in ghost storage.
@@ -20,12 +20,15 @@ Output:
20
20
  Row 42 Journey:
21
21
  Status: [OK] Alive
22
22
 
23
- Events: 3
24
- [SURVIVED] DataFrame.dropna
23
+ Events: 1
25
24
  [MODIFIED] DataFrame.fillna: income
26
- [SURVIVED] DataFrame.__getitem__[mask]
27
25
  ```
28
26
 
27
+ !!! note "Event Recording"
28
+ TracePipe records MODIFIED events for cells that change in watched columns.
29
+ Rows that pass through operations unchanged are not recorded as separate events
30
+ (they are implicitly "survived"). Drop events are recorded for filtered rows.
31
+
29
32
  ## The TraceResult Object
30
33
 
31
34
  ```python
@@ -34,12 +37,17 @@ trace = tp.trace(df, row=0)
34
37
  # Access fields
35
38
  trace.row_id # int: internal row ID
36
39
  trace.status # str: "alive" or "dropped"
37
- trace.events # list[TraceEvent]: all events
40
+ trace.is_alive # bool: True if row still exists
41
+ trace.events # list[dict]: all events for this row
38
42
 
39
43
  # For dropped rows
40
44
  trace.dropped_by # str: operation that dropped the row
41
45
  trace.dropped_at_step # int: step number
42
46
 
47
+ # Provenance (v0.4+)
48
+ trace.origin # dict: {"type": "concat"|"merge", ...} or None
49
+ trace.representative # dict: for dedup-dropped rows, which row was kept
50
+
43
51
  # Export
44
52
  trace.to_dict() # dict representation
45
53
  ```
@@ -74,10 +82,16 @@ tp.trace(df, where={"email": None})
74
82
 
75
83
  | Event Type | Description |
76
84
  |------------|-------------|
77
- | `SURVIVED` | Row passed through operation unchanged |
78
- | `MODIFIED` | One or more cells changed |
79
- | `DROPPED` | Row was removed |
80
- | `CREATED` | Row first appeared (e.g., from merge) |
85
+ | `MODIFIED` | One or more cells changed in watched columns |
86
+ | `DROPPED` | Row was removed by a filter operation |
87
+
88
+ !!! note "Design Note"
89
+ TracePipe does not explicitly record "SURVIVED" events because they would
90
+ create excessive noise for most pipelines. Instead, rows that exist in the
91
+ final DataFrame are implicitly considered to have survived all operations.
92
+
93
+ If you need to know which operations a row passed through, check the
94
+ `steps` list via `tp.debug.inspect().steps`.
81
95
 
82
96
  ## Tracing Dropped Rows
83
97
 
@@ -34,14 +34,17 @@ Output:
34
34
 
35
35
  ```
36
36
  Snapshot Diff:
37
- Rows: 1000 847 (-153)
38
- Columns: ['id', 'price', 'qty'] → ['id', 'price', 'qty'] (unchanged)
37
+ - 153 rows removed
38
+ ! 153 new drops
39
39
 
40
40
  Changes:
41
- - 153 rows removed
42
- - 847 cells modified in 'price'
41
+ - 847 cells modified
42
+ price: 847
43
43
  ```
44
44
 
45
+ !!! tip "Enabling Cell-Level Diff"
46
+ To see cell-level changes, create snapshots with `include_values=True`.
47
+
45
48
  ## The Snapshot Object
46
49
 
47
50
  ```python
@@ -64,21 +67,38 @@ snapshot.data # DataFrame copy (if captured)
64
67
  ```python
65
68
  diff = tp.diff(before, after)
66
69
 
67
- # Access fields
68
- diff.rows_added # int: new rows
69
- diff.rows_removed # int: removed rows
70
- diff.rows_unchanged # int: unchanged rows
71
- diff.cells_changed # int: modified cells
70
+ # Row-level changes (always available)
71
+ diff.rows_added # set[int]: IDs of new rows
72
+ diff.rows_removed # set[int]: IDs of removed rows
73
+ diff.new_drops # set[int]: newly dropped row IDs
74
+ diff.recovered_rows # set[int]: rows that were dropped but now exist
72
75
 
73
76
  # Column changes
74
77
  diff.columns_added # list[str]: new columns
75
78
  diff.columns_removed # list[str]: removed columns
76
79
 
77
- # Detailed changes (if both snapshots have data)
78
- diff.changed_rows # set[int]: IDs of changed rows
80
+ # Cell-level changes (requires include_values=True on both snapshots)
81
+ diff.cells_changed # int: total modified cells
82
+ diff.changed_rows # set[int]: IDs of rows with value changes
79
83
  diff.changes_by_column # dict: {col: count}
84
+
85
+ # Stats changes
86
+ diff.stats_changes # dict: {col: {metric: (old, new)}}
87
+ diff.drops_delta # dict: {operation: delta_count}
80
88
  ```
81
89
 
90
+ !!! note "Cell-Level Diff Requirements"
91
+ To get `cells_changed` and `changes_by_column`, both snapshots must be
92
+ created with `include_values=True`:
93
+
94
+ ```python
95
+ before = tp.snapshot(df, include_values=True)
96
+ # ... operations ...
97
+ after = tp.snapshot(df, include_values=True)
98
+ diff = tp.diff(before, after)
99
+ print(f"{diff.cells_changed} cells modified")
100
+ ```
101
+
82
102
  ## Options
83
103
 
84
104
  ### Include Data
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "tracepipe"
7
- version = "0.4.1"
7
+ version = "0.4.2"
8
8
  description = "Row-level data lineage tracking for pandas pipelines"
9
9
  readme = "README.md"
10
10
  license = {file = "LICENSE"}
@@ -333,10 +333,14 @@ class TestTraceResult:
333
333
  tp.enable(mode="debug")
334
334
  df = pd.DataFrame({"a": [1, None, 3]})
335
335
  df = df.dropna()
336
- result = tp.trace(df, row=1)
336
+ # Use row_id parameter to trace a dropped row by its internal ID
337
+ dbg = tp.debug.inspect()
338
+ dropped = dbg.dropped_rows()
339
+ assert len(dropped) >= 1
340
+ result = tp.trace(df, row_id=dropped[0])
337
341
  assert result.is_alive is False
338
342
  text = str(result)
339
- assert "Dropped" in text or "X" in text
343
+ assert "Dropped" in text or "DROPPED" in text
340
344
 
341
345
  def test_trace_with_events(self):
342
346
  """TraceResult shows events when cell is modified."""
@@ -386,7 +390,11 @@ class TestTraceResult:
386
390
  tp.enable(mode="debug", watch=["a"])
387
391
  df = pd.DataFrame({"a": [1, 2, 3]})
388
392
  df = df.head(1) # Drop rows 1 and 2
389
- result = tp.trace(df, row=1) # Trace dropped row
393
+ # Use row_id parameter to trace a dropped row
394
+ dbg = tp.debug.inspect()
395
+ dropped = dbg.dropped_rows()
396
+ assert len(dropped) >= 1
397
+ result = tp.trace(df, row_id=dropped[0]) # Trace dropped row by ID
390
398
  # Dropped row should have ghost values in debug mode
391
399
  text = result.to_text(verbose=True)
392
400
  assert result.is_alive is False
@@ -100,8 +100,13 @@ class TestTrace:
100
100
  tp.enable(mode="debug")
101
101
  df = pd.DataFrame({"a": [1, None, 3]})
102
102
  df = df.dropna()
103
- result = tp.trace(df, row=1)
103
+ # Use row_id parameter to trace dropped row
104
+ dbg = tp.debug.inspect()
105
+ dropped = dbg.dropped_rows()
106
+ assert len(dropped) >= 1
107
+ result = tp.trace(df, row_id=dropped[0])
104
108
  assert result is not None
109
+ assert result.is_alive is False
105
110
 
106
111
  def test_trace_with_where(self):
107
112
  """trace() with where clause."""
@@ -569,10 +569,8 @@ class TestTraceResultOriginProperty:
569
569
 
570
570
  result = df1.merge(df2, on="key")
571
571
 
572
- # Use the actual row_id from the result DataFrame
573
- ctx = get_context()
574
- result_rids = ctx.row_manager.get_ids_array(result)
575
- trace = tp.trace(result, row=result_rids[0])
572
+ # Use row=0 to trace the first row in the result DataFrame
573
+ trace = tp.trace(result, row=0)
576
574
 
577
575
  # Should have merge origin
578
576
  assert trace.origin is not None
@@ -81,7 +81,7 @@ from .core import TracePipeConfig, TracePipeMode
81
81
  from .snapshot import DiffResult, Snapshot, diff, snapshot
82
82
 
83
83
  # === VERSION ===
84
- __version__ = "0.4.1"
84
+ __version__ = "0.4.2"
85
85
 
86
86
  # === MINIMAL __all__ ===
87
87
  __all__ = [
@@ -60,6 +60,8 @@ class CheckResult:
60
60
  .retention - Row retention rate (0.0-1.0)
61
61
  .n_dropped - Total rows dropped
62
62
  .drops_by_op - Drops broken down by operation
63
+ .n_changes - Total cell-level changes (debug mode only)
64
+ .changes_by_op - Changes broken down by operation (debug mode only)
63
65
  """
64
66
 
65
67
  ok: bool
@@ -69,6 +71,9 @@ class CheckResult:
69
71
  mode: str
70
72
  # Internal: store drops_by_op so we don't need to recompute
71
73
  _drops_by_op: dict[str, int] = field(default_factory=dict)
74
+ # Internal: store cell change counts (debug mode only)
75
+ _n_changes: int = 0
76
+ _changes_by_op: dict[str, int] = field(default_factory=dict)
72
77
 
73
78
  # === CONVENIENCE PROPERTIES ===
74
79
 
@@ -97,6 +102,16 @@ class CheckResult:
97
102
  """Total pipeline steps recorded."""
98
103
  return self.facts.get("total_steps", 0)
99
104
 
105
+ @property
106
+ def n_changes(self) -> int:
107
+ """Total cell-level changes (debug mode only, 0 if not tracked)."""
108
+ return self._n_changes
109
+
110
+ @property
111
+ def changes_by_op(self) -> dict[str, int]:
112
+ """Cell changes broken down by operation (debug mode only)."""
113
+ return self._changes_by_op
114
+
100
115
  # === EXISTING PROPERTIES ===
101
116
 
102
117
  @property
@@ -127,6 +142,20 @@ class CheckResult:
127
142
  lines.append(f"TracePipe Check: {status}")
128
143
  lines.append(f" Mode: {self.mode}")
129
144
 
145
+ # Always show key metrics in compact form
146
+ if self.retention is not None:
147
+ lines.append(f"\nRetention: {int(self.retention * 100)}%")
148
+ if self.n_dropped > 0:
149
+ lines.append(f"Dropped: {self.n_dropped} rows")
150
+ if self.drops_by_op:
151
+ for op, count in list(self.drops_by_op.items())[:5]:
152
+ lines.append(f" • {op}: {count}")
153
+ if self.n_changes > 0:
154
+ lines.append(f"\nValue changes: {self.n_changes} cells")
155
+ if self.changes_by_op:
156
+ for op, count in list(self.changes_by_op.items())[:5]:
157
+ lines.append(f" • {op}: {count}")
158
+
130
159
  if verbose and self.facts:
131
160
  lines.append("\n Measured facts:")
132
161
  for k, v in self.facts.items():
@@ -158,6 +187,8 @@ class CheckResult:
158
187
  "n_dropped": self.n_dropped,
159
188
  "n_steps": self.n_steps,
160
189
  "drops_by_op": self.drops_by_op,
190
+ "n_changes": self.n_changes,
191
+ "changes_by_op": self.changes_by_op,
161
192
  "facts": self.facts,
162
193
  "suggestions": self.suggestions,
163
194
  "warnings": [
@@ -191,6 +222,7 @@ class TraceResult:
191
222
  Events are in CHRONOLOGICAL order (oldest->newest).
192
223
 
193
224
  Key attributes:
225
+ status: "alive" or "dropped" (string representation)
194
226
  origin: Where this row came from (concat, merge, or original)
195
227
  representative: If dropped by dedup, which row was kept instead
196
228
  """
@@ -207,6 +239,27 @@ class TraceResult:
207
239
  # v0.4+ provenance
208
240
  concat_origin: dict[str, Any] | None = None
209
241
  dedup_representative: dict[str, Any] | None = None
242
+ # Steps this row survived (for SURVIVED event generation)
243
+ _survived_steps: list[dict[str, Any]] = field(default_factory=list)
244
+
245
+ @property
246
+ def status(self) -> str:
247
+ """Row status as string: 'alive' or 'dropped'."""
248
+ return "alive" if self.is_alive else "dropped"
249
+
250
+ @property
251
+ def dropped_by(self) -> str | None:
252
+ """Operation that dropped this row, or None if alive."""
253
+ if self.dropped_at:
254
+ return self.dropped_at.get("operation")
255
+ return None
256
+
257
+ @property
258
+ def dropped_at_step(self) -> int | None:
259
+ """Step number where this row was dropped, or None if alive."""
260
+ if self.dropped_at:
261
+ return self.dropped_at.get("step_id")
262
+ return None
210
263
 
211
264
  @property
212
265
  def n_events(self) -> int:
@@ -258,8 +311,10 @@ class TraceResult:
258
311
  """Export to dictionary."""
259
312
  return {
260
313
  "row_id": self.row_id,
314
+ "status": self.status,
261
315
  "is_alive": self.is_alive,
262
316
  "dropped_at": self.dropped_at,
317
+ "dropped_by": self.dropped_at.get("operation") if self.dropped_at else None,
263
318
  "origin": self.origin,
264
319
  "representative": self.representative,
265
320
  "n_events": self.n_events,
@@ -280,10 +335,11 @@ class TraceResult:
280
335
 
281
336
  lines = [f"Row {self.row_id} Journey:"]
282
337
 
338
+ # Status line matches documentation format
283
339
  if self.is_alive:
284
340
  lines.append(" Status: [OK] Alive")
285
341
  else:
286
- lines.append(" Status: [X] Dropped")
342
+ lines.append(" Status: [DROPPED]")
287
343
  if self.dropped_at:
288
344
  lines.append(
289
345
  f" at step {self.dropped_at['step_id']}: {self.dropped_at['operation']}"
@@ -579,6 +635,21 @@ def check(
579
635
  if count > 1000:
580
636
  suggestions.append(f"'{op}' dropped {count} rows - review if intentional")
581
637
 
638
+ # === CELL CHANGES (debug mode only) ===
639
+ n_changes = 0
640
+ changes_by_op: dict[str, int] = {}
641
+ if ctx.config.mode == TracePipeMode.DEBUG:
642
+ # Count non-drop diffs (cell-level changes)
643
+ step_map = {s.step_id: s.operation for s in ctx.store.steps}
644
+ for i in range(len(ctx.store.diff_step_ids)):
645
+ col = ctx.store.diff_cols[i]
646
+ if col != "__row__": # Skip drop events
647
+ n_changes += 1
648
+ step_id = ctx.store.diff_step_ids[i]
649
+ op = step_map.get(step_id, "unknown")
650
+ changes_by_op[op] = changes_by_op.get(op, 0) + 1
651
+ facts["n_changes"] = n_changes
652
+
582
653
  ok = len([w for w in warnings_list if w.severity == "fact"]) == 0
583
654
 
584
655
  return CheckResult(
@@ -588,6 +659,8 @@ def check(
588
659
  suggestions=suggestions,
589
660
  mode=ctx.config.mode.value,
590
661
  _drops_by_op=drops_by_op,
662
+ _n_changes=n_changes,
663
+ _changes_by_op=changes_by_op,
591
664
  )
592
665
 
593
666
 
@@ -595,6 +668,7 @@ def trace(
595
668
  df: pd.DataFrame,
596
669
  *,
597
670
  row: int | None = None,
671
+ row_id: int | None = None,
598
672
  where: dict[str, Any] | None = None,
599
673
  include_ghost: bool = True,
600
674
  ) -> TraceResult | list[TraceResult]:
@@ -603,7 +677,8 @@ def trace(
603
677
 
604
678
  Args:
605
679
  df: DataFrame to search in
606
- row: Row ID (if known)
680
+ row: Row position in the current DataFrame (0-based; negative values count from the end)
681
+ row_id: Internal row ID (use for tracing dropped rows)
607
682
  where: Selector dict, e.g. {"customer_id": "C123"}
608
683
  include_ghost: Include last-known values for dropped rows
609
684
 
@@ -612,8 +687,14 @@ def trace(
612
687
  Use print(result) for pretty output, result.to_dict() for data.
613
688
 
614
689
  Examples:
615
- result = tp.trace(df, row=5)
616
- print(result)
690
+ # Trace by position in current DataFrame
691
+ result = tp.trace(df, row=0) # First row
692
+
693
+ # Trace by internal row ID (for dropped rows)
694
+ dropped = tp.debug.inspect().dropped_rows()
695
+ result = tp.trace(df, row_id=dropped[0])
696
+
697
+ # Trace by business key
617
698
  tp.trace(df, where={"customer_id": "C123"})
618
699
  """
619
700
  ctx = get_context()
@@ -624,12 +705,30 @@ def trace(
624
705
  pass
625
706
 
626
707
  # Resolve row IDs
627
- if row is not None:
628
- row_ids = [row]
708
+ if row_id is not None:
709
+ # Direct row ID specified - use as-is
710
+ row_ids = [row_id]
711
+ elif row is not None:
712
+ # row= is a DataFrame index position (0-based), not a row ID
713
+ # Convert to actual row ID using the DataFrame's registered IDs
714
+ rids = ctx.row_manager.get_ids_array(df)
715
+ if rids is not None:
716
+ # Handle negative indexing
717
+ if row < 0:
718
+ row = len(rids) + row
719
+ if 0 <= row < len(rids):
720
+ row_ids = [int(rids[row])]
721
+ else:
722
+ raise ValueError(
723
+ f"Row index {row} out of bounds for DataFrame with {len(rids)} rows"
724
+ )
725
+ else:
726
+ # DataFrame not tracked - use row as-is (legacy behavior)
727
+ row_ids = [row]
629
728
  elif where is not None:
630
729
  row_ids = _resolve_where(df, where, ctx)
631
730
  else:
632
- raise ValueError("Must provide 'row' or 'where'")
731
+ raise ValueError("Must provide 'row', 'row_id', or 'where'")
633
732
 
634
733
  results = []
635
734
  for rid in row_ids:
@@ -644,6 +743,7 @@ def why(
644
743
  *,
645
744
  col: str,
646
745
  row: int | None = None,
746
+ row_id: int | None = None,
647
747
  where: dict[str, Any] | None = None,
648
748
  ) -> WhyResult | list[WhyResult]:
649
749
  """
@@ -652,7 +752,8 @@ def why(
652
752
  Args:
653
753
  df: DataFrame to search in
654
754
  col: Column name to trace
655
- row: Row ID (if known)
755
+ row: Row position in the current DataFrame (0-based; negative values count from the end)
756
+ row_id: Internal row ID (use for cells in dropped rows)
656
757
  where: Selector dict, e.g. {"customer_id": "C123"}
657
758
 
658
759
  Returns:
@@ -660,7 +761,7 @@ def why(
660
761
  Use print(result) for pretty output, result.to_dict() for data.
661
762
 
662
763
  Examples:
663
- result = tp.why(df, col="amount", row=5)
764
+ result = tp.why(df, col="amount", row=0) # First row
664
765
  print(result)
665
766
  tp.why(df, col="email", where={"user_id": "U123"})
666
767
  """
@@ -676,12 +777,30 @@ def why(
676
777
  )
677
778
 
678
779
  # Resolve row IDs
679
- if row is not None:
680
- row_ids = [row]
780
+ if row_id is not None:
781
+ # Direct row ID specified - use as-is
782
+ row_ids = [row_id]
783
+ elif row is not None:
784
+ # row= is a DataFrame index position (0-based), not a row ID
785
+ # Convert to actual row ID using the DataFrame's registered IDs
786
+ rids = ctx.row_manager.get_ids_array(df)
787
+ if rids is not None:
788
+ # Handle negative indexing
789
+ if row < 0:
790
+ row = len(rids) + row
791
+ if 0 <= row < len(rids):
792
+ row_ids = [int(rids[row])]
793
+ else:
794
+ raise ValueError(
795
+ f"Row index {row} out of bounds for DataFrame with {len(rids)} rows"
796
+ )
797
+ else:
798
+ # DataFrame not tracked - use row as-is (legacy behavior)
799
+ row_ids = [row]
681
800
  elif where is not None:
682
801
  row_ids = _resolve_where(df, where, ctx)
683
802
  else:
684
- raise ValueError("Must provide 'row' or 'where'")
803
+ raise ValueError("Must provide 'row', 'row_id', or 'where'")
685
804
 
686
805
  results = []
687
806
  for rid in row_ids:
@@ -179,6 +179,46 @@ class DebugInspector:
179
179
  ctx = get_context()
180
180
  return ctx.row_manager.get_ghost_rows(limit=limit)
181
181
 
182
+ def get_ghost_values(self, row_id: int) -> dict[str, Any] | None:
183
+ """
184
+ Get last-known values for a specific dropped row (DEBUG mode only).
185
+
186
+ Args:
187
+ row_id: The row ID to look up
188
+
189
+ Returns:
190
+ Dict mapping column names to their last known values,
191
+ or None if the row was not found in ghost storage.
192
+
193
+ Example:
194
+ dbg = tp.debug.inspect()
195
+ dropped_rid = list(dbg.dropped_rows())[0]
196
+ ghost = dbg.get_ghost_values(dropped_rid)
197
+ print(f"Last known values: {ghost}")
198
+ """
199
+ ctx = get_context()
200
+ ghost_df = ctx.row_manager.get_ghost_rows(limit=100000)
201
+
202
+ if ghost_df.empty or "__tp_row_id__" not in ghost_df.columns:
203
+ return None
204
+
205
+ row_match = ghost_df[ghost_df["__tp_row_id__"] == row_id]
206
+ if row_match.empty:
207
+ return None
208
+
209
+ # Convert to dict and remove internal columns
210
+ result = row_match.iloc[0].to_dict()
211
+ internal_cols = [
212
+ "__tp_row_id__",
213
+ "__tp_dropped_by__",
214
+ "__tp_dropped_step__",
215
+ "__tp_original_position__",
216
+ ]
217
+ for col in internal_cols:
218
+ result.pop(col, None)
219
+
220
+ return result
221
+
182
222
  def stats(self) -> dict:
183
223
  """Get comprehensive tracking statistics."""
184
224
  ctx = get_context()