tracepipe-0.3.0-py3-none-any.whl → tracepipe-0.3.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +1 -1
- tracepipe/api.py +54 -4
- tracepipe/convenience.py +17 -7
- tracepipe/storage/lineage_store.py +63 -1
- tracepipe/value_provenance.py +32 -24
- tracepipe-0.3.2.dist-info/METADATA +308 -0
- {tracepipe-0.3.0.dist-info → tracepipe-0.3.2.dist-info}/RECORD +9 -9
- tracepipe-0.3.0.dist-info/METADATA +0 -575
- {tracepipe-0.3.0.dist-info → tracepipe-0.3.2.dist-info}/WHEEL +0 -0
- {tracepipe-0.3.0.dist-info → tracepipe-0.3.2.dist-info}/licenses/LICENSE +0 -0
tracepipe/__init__.py
CHANGED
tracepipe/api.py
CHANGED
|
@@ -75,12 +75,18 @@ def enable(
|
|
|
75
75
|
# Custom configuration
|
|
76
76
|
tp.enable(mode="ci", merge_provenance=True)
|
|
77
77
|
"""
|
|
78
|
+
ctx = get_context()
|
|
79
|
+
|
|
80
|
+
# If already enabled, reset accumulated state to prevent duplicate warnings/stats
|
|
81
|
+
# This handles the common case of re-running scripts in notebooks/IDEs
|
|
82
|
+
if ctx.enabled:
|
|
83
|
+
_reset_accumulated_state(ctx)
|
|
84
|
+
|
|
78
85
|
# Get or create config
|
|
79
86
|
# If config is provided explicitly, use it
|
|
80
87
|
# Otherwise, start with existing context config (if any) or create new default
|
|
81
88
|
if config is None:
|
|
82
|
-
|
|
83
|
-
config = existing_ctx.config # Use existing config as base
|
|
89
|
+
config = ctx.config # Use existing config as base
|
|
84
90
|
|
|
85
91
|
# Handle mode
|
|
86
92
|
if mode is not None:
|
|
@@ -115,14 +121,14 @@ def enable(
|
|
|
115
121
|
ctx = TracePipeContext(config=config, backend=backend, identity=identity)
|
|
116
122
|
set_context(ctx)
|
|
117
123
|
else:
|
|
118
|
-
ctx = get_context()
|
|
119
124
|
ctx.config = config
|
|
120
125
|
# Also update config in row_manager and store (they may have their own references)
|
|
121
126
|
ctx.row_manager.config = config
|
|
122
127
|
ctx.store.config = config
|
|
123
128
|
|
|
124
|
-
# Add watched columns
|
|
129
|
+
# Add watched columns (reset first if re-enabling to avoid stale watches)
|
|
125
130
|
if watch:
|
|
131
|
+
ctx.watched_columns.clear()
|
|
126
132
|
ctx.watched_columns.update(watch)
|
|
127
133
|
|
|
128
134
|
if not ctx.enabled:
|
|
@@ -132,6 +138,50 @@ def enable(
|
|
|
132
138
|
return _get_module()
|
|
133
139
|
|
|
134
140
|
|
|
141
|
+
def _reset_accumulated_state(ctx: TracePipeContext) -> None:
|
|
142
|
+
"""
|
|
143
|
+
Reset accumulated lineage state without disabling instrumentation.
|
|
144
|
+
|
|
145
|
+
Called when enable() is invoked on an already-enabled context to prevent
|
|
146
|
+
state accumulation across multiple script runs in the same Python process.
|
|
147
|
+
"""
|
|
148
|
+
store = ctx.store
|
|
149
|
+
|
|
150
|
+
# Clear merge stats (prevents duplicate warnings)
|
|
151
|
+
if hasattr(store, "merge_stats"):
|
|
152
|
+
store.merge_stats.clear()
|
|
153
|
+
|
|
154
|
+
# Clear bulk drops
|
|
155
|
+
if hasattr(store, "bulk_drops"):
|
|
156
|
+
store.bulk_drops.clear()
|
|
157
|
+
|
|
158
|
+
# Clear steps
|
|
159
|
+
if hasattr(store, "_steps"):
|
|
160
|
+
store._steps.clear()
|
|
161
|
+
|
|
162
|
+
# Clear in-memory diffs
|
|
163
|
+
if hasattr(store, "_clear_in_memory"):
|
|
164
|
+
store._clear_in_memory()
|
|
165
|
+
|
|
166
|
+
# Reset step counter
|
|
167
|
+
if hasattr(store, "_step_counter"):
|
|
168
|
+
store._step_counter = 0
|
|
169
|
+
|
|
170
|
+
# Clear merge mappings
|
|
171
|
+
if hasattr(store, "merge_mappings"):
|
|
172
|
+
store.merge_mappings.clear()
|
|
173
|
+
|
|
174
|
+
# Clear aggregation mappings
|
|
175
|
+
if hasattr(store, "aggregation_mappings"):
|
|
176
|
+
store.aggregation_mappings.clear()
|
|
177
|
+
|
|
178
|
+
# Reset row identity manager
|
|
179
|
+
ctx.row_manager.clear()
|
|
180
|
+
|
|
181
|
+
# Clear watched columns (will be re-added if watch param provided)
|
|
182
|
+
ctx.watched_columns.clear()
|
|
183
|
+
|
|
184
|
+
|
|
135
185
|
def disable() -> types.ModuleType:
|
|
136
186
|
"""
|
|
137
187
|
Disable TracePipe and restore original pandas methods.
|
tracepipe/convenience.py
CHANGED
|
@@ -385,22 +385,27 @@ def check(
|
|
|
385
385
|
)
|
|
386
386
|
)
|
|
387
387
|
|
|
388
|
-
|
|
388
|
+
# Note on dup_rate semantics:
|
|
389
|
+
# - left_dup_rate = fraction of LEFT rows appearing >1 times in result
|
|
390
|
+
# This happens when RIGHT table has duplicate join keys
|
|
391
|
+
# - right_dup_rate = fraction of RIGHT rows appearing >1 times in result
|
|
392
|
+
# This happens when LEFT table has duplicate join keys
|
|
393
|
+
if stats.right_dup_rate > 0.01:
|
|
389
394
|
warnings_list.append(
|
|
390
395
|
CheckWarning(
|
|
391
396
|
category="duplicate_keys",
|
|
392
397
|
severity="fact",
|
|
393
|
-
message=f"Left table has {stats.left_dup_rate:.1%} duplicate join keys",
|
|
394
|
-
details={"step_id": step_id, "dup_rate": stats.left_dup_rate},
|
|
398
|
+
message=f"Left table has {stats.right_dup_rate:.1%} duplicate join keys",
|
|
399
|
+
details={"step_id": step_id, "dup_rate": stats.right_dup_rate},
|
|
395
400
|
)
|
|
396
401
|
)
|
|
397
|
-
if stats.right_dup_rate > 0.01:
|
|
402
|
+
if stats.left_dup_rate > 0.01:
|
|
398
403
|
warnings_list.append(
|
|
399
404
|
CheckWarning(
|
|
400
405
|
category="duplicate_keys",
|
|
401
406
|
severity="fact",
|
|
402
|
-
message=f"Right table has {stats.right_dup_rate:.1%} duplicate join keys",
|
|
403
|
-
details={"step_id": step_id, "dup_rate": stats.right_dup_rate},
|
|
407
|
+
message=f"Right table has {stats.left_dup_rate:.1%} duplicate join keys",
|
|
408
|
+
details={"step_id": step_id, "dup_rate": stats.left_dup_rate},
|
|
404
409
|
)
|
|
405
410
|
)
|
|
406
411
|
|
|
@@ -733,9 +738,14 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
|
|
|
733
738
|
store = ctx.store
|
|
734
739
|
|
|
735
740
|
drop_event = store.get_drop_event(row_id)
|
|
736
|
-
history = store.get_row_history(row_id)
|
|
737
741
|
merge_origin = store.get_merge_origin(row_id)
|
|
738
742
|
|
|
743
|
+
# Use lineage-aware history to include pre-merge parent events
|
|
744
|
+
if hasattr(store, "get_row_history_with_lineage"):
|
|
745
|
+
history = store.get_row_history_with_lineage(row_id)
|
|
746
|
+
else:
|
|
747
|
+
history = store.get_row_history(row_id)
|
|
748
|
+
|
|
739
749
|
dropped_at = None
|
|
740
750
|
if drop_event:
|
|
741
751
|
dropped_at = {
|
|
@@ -485,6 +485,9 @@ class InMemoryLineageStore:
|
|
|
485
485
|
|
|
486
486
|
CONTRACT: Returned list has monotonically increasing step_id.
|
|
487
487
|
Convenience layer may reverse for display.
|
|
488
|
+
|
|
489
|
+
Note: This returns only direct events for this row_id.
|
|
490
|
+
Use get_row_history_with_lineage() to include pre-merge parent history.
|
|
488
491
|
"""
|
|
489
492
|
step_map = {s.step_id: s for s in self._steps}
|
|
490
493
|
events = []
|
|
@@ -546,6 +549,65 @@ class InMemoryLineageStore:
|
|
|
546
549
|
|
|
547
550
|
return events
|
|
548
551
|
|
|
552
|
+
def get_row_history_with_lineage(self, row_id: int, max_depth: int = 10) -> list[dict]:
|
|
553
|
+
"""
|
|
554
|
+
Get row history including pre-merge parent history.
|
|
555
|
+
|
|
556
|
+
Follows merge lineage recursively to build complete cell provenance.
|
|
557
|
+
This is essential for tracking changes that happened before merge operations.
|
|
558
|
+
|
|
559
|
+
Args:
|
|
560
|
+
row_id: Row ID to trace
|
|
561
|
+
max_depth: Maximum merge depth to follow (prevents infinite loops)
|
|
562
|
+
|
|
563
|
+
Returns:
|
|
564
|
+
List of events in chronological order, including parent row events.
|
|
565
|
+
"""
|
|
566
|
+
visited: set[int] = set()
|
|
567
|
+
|
|
568
|
+
def _collect_history(rid: int, depth: int) -> list[dict]:
|
|
569
|
+
if depth > max_depth or rid in visited:
|
|
570
|
+
return []
|
|
571
|
+
visited.add(rid)
|
|
572
|
+
|
|
573
|
+
events = []
|
|
574
|
+
|
|
575
|
+
# Check if this row came from a merge
|
|
576
|
+
origin = self.get_merge_origin(rid)
|
|
577
|
+
if origin and origin["left_parent"] is not None:
|
|
578
|
+
# Recursively get parent's history first (chronological order)
|
|
579
|
+
parent_events = _collect_history(origin["left_parent"], depth + 1)
|
|
580
|
+
events.extend(parent_events)
|
|
581
|
+
|
|
582
|
+
# Add this row's direct events
|
|
583
|
+
events.extend(self.get_row_history(rid))
|
|
584
|
+
|
|
585
|
+
return events
|
|
586
|
+
|
|
587
|
+
all_events = _collect_history(row_id, 0)
|
|
588
|
+
|
|
589
|
+
# Sort by step_id to ensure chronological order across lineage
|
|
590
|
+
all_events.sort(key=lambda e: e["step_id"])
|
|
591
|
+
|
|
592
|
+
return all_events
|
|
593
|
+
|
|
594
|
+
def get_cell_history_with_lineage(
|
|
595
|
+
self, row_id: int, column: str, max_depth: int = 10
|
|
596
|
+
) -> list[dict]:
|
|
597
|
+
"""
|
|
598
|
+
Get cell history for a specific column, including pre-merge parent history.
|
|
599
|
+
|
|
600
|
+
Args:
|
|
601
|
+
row_id: Row ID to trace
|
|
602
|
+
column: Column name to filter events for
|
|
603
|
+
max_depth: Maximum merge depth to follow
|
|
604
|
+
|
|
605
|
+
Returns:
|
|
606
|
+
List of events for this column in chronological order.
|
|
607
|
+
"""
|
|
608
|
+
all_events = self.get_row_history_with_lineage(row_id, max_depth)
|
|
609
|
+
return [e for e in all_events if e["col"] == column]
|
|
610
|
+
|
|
549
611
|
def get_dropped_rows(self, step_id: Optional[int] = None) -> list[int]:
|
|
550
612
|
"""Get all dropped row IDs, optionally filtered by step."""
|
|
551
613
|
if step_id is not None:
|
|
@@ -648,7 +710,7 @@ class InMemoryLineageStore:
|
|
|
648
710
|
diffs = list(self._iter_all_diffs())
|
|
649
711
|
|
|
650
712
|
data = {
|
|
651
|
-
"tracepipe_version": "0.3.
|
|
713
|
+
"tracepipe_version": "0.3.2",
|
|
652
714
|
"export_timestamp": time.time(),
|
|
653
715
|
"total_diffs": len(diffs),
|
|
654
716
|
"total_steps": len(self._steps),
|
tracepipe/value_provenance.py
CHANGED
|
@@ -19,7 +19,6 @@ from typing import Any, Optional
|
|
|
19
19
|
import pandas as pd
|
|
20
20
|
|
|
21
21
|
from .context import get_context
|
|
22
|
-
from .core import ChangeType
|
|
23
22
|
|
|
24
23
|
|
|
25
24
|
@dataclass
|
|
@@ -96,7 +95,12 @@ class ValueHistory:
|
|
|
96
95
|
}
|
|
97
96
|
|
|
98
97
|
|
|
99
|
-
def explain_value(
|
|
98
|
+
def explain_value(
|
|
99
|
+
row_id: int,
|
|
100
|
+
column: str,
|
|
101
|
+
df: Optional[pd.DataFrame] = None,
|
|
102
|
+
follow_lineage: bool = True,
|
|
103
|
+
) -> ValueHistory:
|
|
100
104
|
"""
|
|
101
105
|
Get complete history of a specific cell's value.
|
|
102
106
|
|
|
@@ -104,6 +108,7 @@ def explain_value(row_id: int, column: str, df: Optional[pd.DataFrame] = None) -
|
|
|
104
108
|
row_id: Row ID to trace
|
|
105
109
|
column: Column name
|
|
106
110
|
df: Optional DataFrame for current value lookup
|
|
111
|
+
follow_lineage: If True, include pre-merge parent history (default: True)
|
|
107
112
|
|
|
108
113
|
Returns:
|
|
109
114
|
ValueHistory with all changes to this cell
|
|
@@ -121,35 +126,38 @@ def explain_value(row_id: int, column: str, df: Optional[pd.DataFrame] = None) -
|
|
|
121
126
|
if len(matches) > 0 and column in df.columns:
|
|
122
127
|
current_value = df.iloc[matches[0]][column]
|
|
123
128
|
|
|
124
|
-
# Collect
|
|
129
|
+
# Collect events - use lineage-aware method if requested
|
|
130
|
+
if follow_lineage and hasattr(store, "get_cell_history_with_lineage"):
|
|
131
|
+
# Get cell history including pre-merge parent history
|
|
132
|
+
raw_events = store.get_cell_history_with_lineage(row_id, column)
|
|
133
|
+
else:
|
|
134
|
+
# Fallback to direct row_id lookup only
|
|
135
|
+
raw_events = [e for e in store.get_row_history(row_id) if e["col"] == column]
|
|
136
|
+
|
|
137
|
+
# Convert to ValueEvent objects
|
|
125
138
|
events = []
|
|
126
|
-
step_map = {s.step_id: s for s in store.steps}
|
|
127
139
|
became_null_at = None
|
|
128
140
|
became_null_by = None
|
|
129
141
|
|
|
130
|
-
for diff in
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
change_type=ChangeType(diff["change_type"]).name,
|
|
141
|
-
timestamp=step.timestamp if step else 0,
|
|
142
|
-
code_location=(
|
|
143
|
-
f"{step.code_file}:{step.code_line}" if step and step.code_file else None
|
|
144
|
-
),
|
|
145
|
-
)
|
|
142
|
+
for diff in raw_events:
|
|
143
|
+
events.append(
|
|
144
|
+
ValueEvent(
|
|
145
|
+
step_id=diff["step_id"],
|
|
146
|
+
operation=diff.get("operation", "unknown"),
|
|
147
|
+
old_value=diff["old_val"],
|
|
148
|
+
new_value=diff["new_val"],
|
|
149
|
+
change_type=diff.get("change_type", "UNKNOWN"),
|
|
150
|
+
timestamp=diff.get("timestamp", 0) or 0,
|
|
151
|
+
code_location=diff.get("code_location"),
|
|
146
152
|
)
|
|
153
|
+
)
|
|
147
154
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
155
|
+
# Track when value became null
|
|
156
|
+
if became_null_at is None and pd.isna(diff["new_val"]) and not pd.isna(diff["old_val"]):
|
|
157
|
+
became_null_at = diff["step_id"]
|
|
158
|
+
became_null_by = diff.get("operation", "unknown")
|
|
152
159
|
|
|
160
|
+
# Events should already be sorted by step_id from lineage method
|
|
153
161
|
events.sort(key=lambda e: e.step_id)
|
|
154
162
|
|
|
155
163
|
return ValueHistory(
|
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tracepipe
|
|
3
|
+
Version: 0.3.2
|
|
4
|
+
Summary: Row-level data lineage tracking for pandas pipelines
|
|
5
|
+
Project-URL: Homepage, https://github.com/tracepipe/tracepipe
|
|
6
|
+
Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
|
|
7
|
+
Project-URL: Repository, https://github.com/tracepipe/tracepipe.git
|
|
8
|
+
Project-URL: Issues, https://github.com/tracepipe/tracepipe/issues
|
|
9
|
+
Project-URL: Changelog, https://tracepipe.github.io/tracepipe/changelog/
|
|
10
|
+
Author: Gauthier Piarrette
|
|
11
|
+
License: MIT License
|
|
12
|
+
|
|
13
|
+
Copyright (c) 2026 Gauthier Piarrette
|
|
14
|
+
|
|
15
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
16
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
17
|
+
in the Software without restriction, including without limitation the rights
|
|
18
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
19
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
20
|
+
furnished to do so, subject to the following conditions:
|
|
21
|
+
|
|
22
|
+
The above copyright notice and this permission notice shall be included in all
|
|
23
|
+
copies or substantial portions of the Software.
|
|
24
|
+
|
|
25
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
26
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
27
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
28
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
29
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
30
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
31
|
+
SOFTWARE.
|
|
32
|
+
License-File: LICENSE
|
|
33
|
+
Keywords: data-engineering,data-lineage,data-quality,debugging,observability,pandas
|
|
34
|
+
Classifier: Development Status :: 4 - Beta
|
|
35
|
+
Classifier: Intended Audience :: Developers
|
|
36
|
+
Classifier: Intended Audience :: Science/Research
|
|
37
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
38
|
+
Classifier: Operating System :: OS Independent
|
|
39
|
+
Classifier: Programming Language :: Python :: 3
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
43
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
44
|
+
Classifier: Topic :: Scientific/Engineering
|
|
45
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
46
|
+
Requires-Python: >=3.9
|
|
47
|
+
Requires-Dist: numpy>=1.20.0
|
|
48
|
+
Requires-Dist: pandas>=1.5.0
|
|
49
|
+
Provides-Extra: all
|
|
50
|
+
Requires-Dist: psutil>=5.9.0; extra == 'all'
|
|
51
|
+
Requires-Dist: pyarrow>=10.0.0; extra == 'all'
|
|
52
|
+
Provides-Extra: arrow
|
|
53
|
+
Requires-Dist: pyarrow>=10.0.0; extra == 'arrow'
|
|
54
|
+
Provides-Extra: dev
|
|
55
|
+
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
56
|
+
Requires-Dist: pre-commit>=3.5.0; extra == 'dev'
|
|
57
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
58
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
59
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
60
|
+
Requires-Dist: taskipy>=1.12.0; extra == 'dev'
|
|
61
|
+
Provides-Extra: docs
|
|
62
|
+
Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
|
|
63
|
+
Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
|
|
64
|
+
Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
|
|
65
|
+
Requires-Dist: pymdown-extensions>=10.0.0; extra == 'docs'
|
|
66
|
+
Provides-Extra: memory
|
|
67
|
+
Requires-Dist: psutil>=5.9.0; extra == 'memory'
|
|
68
|
+
Description-Content-Type: text/markdown
|
|
69
|
+
|
|
70
|
+
<div align="center">
|
|
71
|
+
|
|
72
|
+
# TracePipe
|
|
73
|
+
|
|
74
|
+
### Row-level data lineage for pandas pipelines
|
|
75
|
+
|
|
76
|
+
**Know exactly where every row went, why values changed, and how your data transformed.**
|
|
77
|
+
|
|
78
|
+
[PyPI version](https://pypi.org/project/tracepipe/)
|
|
79
|
+
[Supported Python versions](https://pypi.org/project/tracepipe/)
|
|
80
|
+
[CI status](https://github.com/gauthierpiarrette/tracepipe/actions/workflows/ci.yml)
|
|
81
|
+
[Code coverage](https://codecov.io/gh/gauthierpiarrette/tracepipe)
|
|
82
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
83
|
+
[Documentation](https://gauthierpiarrette.github.io/tracepipe/)
|
|
84
|
+
|
|
85
|
+
[Getting Started](#getting-started) · [Documentation](https://gauthierpiarrette.github.io/tracepipe/) · [Examples](#real-world-example)
|
|
86
|
+
|
|
87
|
+
</div>
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Why TracePipe?
|
|
92
|
+
|
|
93
|
+
Data pipelines are black boxes. Rows vanish. Values change. You're left guessing.
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
df = pd.read_csv("customers.csv")
|
|
97
|
+
df = df.dropna() # Some rows disappear
|
|
98
|
+
df = df.merge(regions, on="zip") # New rows appear, some vanish
|
|
99
|
+
df["income"] = df["income"].fillna(0) # Values change silently
|
|
100
|
+
df = df[df["age"] >= 18] # More rows gone
|
|
101
|
+
# What happened to customer C-789? 🤷
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**TracePipe gives you the complete audit trail — zero code changes required.**
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Getting Started
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
pip install tracepipe
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
import tracepipe as tp
|
|
116
|
+
import pandas as pd
|
|
117
|
+
|
|
118
|
+
tp.enable(mode="debug", watch=["income"])
|
|
119
|
+
|
|
120
|
+
df = pd.read_csv("customers.csv")
|
|
121
|
+
df = df.dropna()
|
|
122
|
+
df["income"] = df["income"].fillna(0)
|
|
123
|
+
df = df[df["age"] >= 18]
|
|
124
|
+
|
|
125
|
+
tp.check(df) # See what happened
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
```
|
|
129
|
+
TracePipe Check: [OK] Pipeline healthy
|
|
130
|
+
|
|
131
|
+
Retention: 847/1000 (84.7%)
|
|
132
|
+
Dropped: 153 rows
|
|
133
|
+
• DataFrame.dropna: 42
|
|
134
|
+
• DataFrame.__getitem__[mask]: 111
|
|
135
|
+
|
|
136
|
+
Value changes: 23 cells modified
|
|
137
|
+
• DataFrame.fillna: 23 (income)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
That's it. **One import, full visibility.**
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## Core API
|
|
145
|
+
|
|
146
|
+
| Function | What it does |
|
|
147
|
+
|----------|--------------|
|
|
148
|
+
| `tp.enable()` | Start tracking |
|
|
149
|
+
| `tp.check(df)` | Health check — retention, drops, changes |
|
|
150
|
+
| `tp.trace(df, where={"id": "C-789"})` | Follow a row's complete journey |
|
|
151
|
+
| `tp.why(df, col="income", row=5)` | Explain why a cell has its current value |
|
|
152
|
+
| `tp.report(df, "audit.html")` | Export interactive HTML report |
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Key Features
|
|
157
|
+
|
|
158
|
+
<table>
|
|
159
|
+
<tr>
|
|
160
|
+
<td width="50%">
|
|
161
|
+
|
|
162
|
+
### 🔍 Zero-Code Instrumentation
|
|
163
|
+
TracePipe patches pandas at runtime. Your existing code works unchanged.
|
|
164
|
+
|
|
165
|
+
### 📊 Complete Provenance
|
|
166
|
+
Track drops, transforms, merges, and cell-level changes with before/after values.
|
|
167
|
+
|
|
168
|
+
</td>
|
|
169
|
+
<td width="50%">
|
|
170
|
+
|
|
171
|
+
### 🎯 Business-Key Lookups
|
|
172
|
+
Find rows by their values: `tp.trace(df, where={"email": "alice@example.com"})`
|
|
173
|
+
|
|
174
|
+
### ⚡ Production-Ready
|
|
175
|
+
1.0-2.8x overhead (varies by operation). Tested on DataFrames up to 1M rows.
|
|
176
|
+
|
|
177
|
+
</td>
|
|
178
|
+
</tr>
|
|
179
|
+
</table>
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## Real-World Example
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
import tracepipe as tp
|
|
187
|
+
import pandas as pd
|
|
188
|
+
|
|
189
|
+
tp.enable(mode="debug", watch=["age", "income", "label"])
|
|
190
|
+
|
|
191
|
+
# Load and clean
|
|
192
|
+
df = pd.read_csv("training_data.csv")
|
|
193
|
+
df = df.dropna(subset=["label"])
|
|
194
|
+
df["income"] = df["income"].fillna(df["income"].median())
|
|
195
|
+
df = df[df["age"] >= 18]
|
|
196
|
+
|
|
197
|
+
# Audit
|
|
198
|
+
print(tp.check(df))
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
```
|
|
202
|
+
Retention: 8234/10000 (82.3%)
|
|
203
|
+
Dropped: 1766 rows
|
|
204
|
+
• DataFrame.dropna: 423
|
|
205
|
+
• DataFrame.__getitem__[mask]: 1343
|
|
206
|
+
|
|
207
|
+
Value changes: 892 cells
|
|
208
|
+
• DataFrame.fillna: 892 (income)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
```python
|
|
212
|
+
# Why does this customer have a filled income?
|
|
213
|
+
tp.why(df, col="income", where={"customer_id": "C-789"})
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
```
|
|
217
|
+
Cell History: row 156, column 'income'
|
|
218
|
+
Current value: 45000.0
|
|
219
|
+
[i] Was null at step 1 (later recovered)
|
|
220
|
+
|
|
221
|
+
History (1 change):
|
|
222
|
+
None -> 45000.0
|
|
223
|
+
by: DataFrame.fillna
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
## Two Modes
|
|
229
|
+
|
|
230
|
+
| Mode | Use Case | What's Tracked |
|
|
231
|
+
|------|----------|----------------|
|
|
232
|
+
| **CI** (default) | Production pipelines | Step counts, retention rates, merge warnings |
|
|
233
|
+
| **Debug** | Development | Full row history, cell diffs, merge parents, group membership |
|
|
234
|
+
|
|
235
|
+
```python
|
|
236
|
+
tp.enable(mode="ci") # Lightweight
|
|
237
|
+
tp.enable(mode="debug") # Full lineage
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
## What's Tracked
|
|
243
|
+
|
|
244
|
+
| Operation | Coverage |
|
|
245
|
+
|-----------|----------|
|
|
246
|
+
| `dropna`, `drop_duplicates`, `query`, `df[mask]` | ✅ Full |
|
|
247
|
+
| `fillna`, `replace`, `loc[]=`, `iloc[]=` | ✅ Full (cell diffs) |
|
|
248
|
+
| `merge`, `join` | ✅ Full (parent tracking) |
|
|
249
|
+
| `groupby().agg()` | ✅ Full (group membership) |
|
|
250
|
+
| `sort_values`, `head`, `tail`, `sample` | ✅ Full |
|
|
251
|
+
| `apply`, `pipe` | ⚠️ Partial |
|
|
252
|
+
|
|
253
|
+
---
|
|
254
|
+
|
|
255
|
+
## Data Quality Contracts
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
(tp.contract()
|
|
259
|
+
.expect_unique("customer_id")
|
|
260
|
+
.expect_no_nulls("email")
|
|
261
|
+
.expect_retention(min_rate=0.9)
|
|
262
|
+
.check(df)
|
|
263
|
+
.raise_if_failed())
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
---
|
|
267
|
+
|
|
268
|
+
## Documentation
|
|
269
|
+
|
|
270
|
+
📚 **[Full Documentation](https://gauthierpiarrette.github.io/tracepipe/)**
|
|
271
|
+
|
|
272
|
+
- [Quickstart](https://gauthierpiarrette.github.io/tracepipe/getting-started/quickstart/)
|
|
273
|
+
- [User Guide](https://gauthierpiarrette.github.io/tracepipe/guide/concepts/)
|
|
274
|
+
- [API Reference](https://gauthierpiarrette.github.io/tracepipe/api/)
|
|
275
|
+
- [Examples](https://gauthierpiarrette.github.io/tracepipe/examples/ml-pipeline/)
|
|
276
|
+
|
|
277
|
+
---
|
|
278
|
+
|
|
279
|
+
## Contributing
|
|
280
|
+
|
|
281
|
+
```bash
|
|
282
|
+
git clone https://github.com/gauthierpiarrette/tracepipe.git
|
|
283
|
+
cd tracepipe
|
|
284
|
+
pip install -e ".[dev]"
|
|
285
|
+
pytest tests/ -v
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
See [CONTRIBUTING](https://gauthierpiarrette.github.io/tracepipe/contributing/) for guidelines.
|
|
289
|
+
|
|
290
|
+
---
|
|
291
|
+
|
|
292
|
+
## License
|
|
293
|
+
|
|
294
|
+
MIT License. See [LICENSE](LICENSE).
|
|
295
|
+
|
|
296
|
+
---
|
|
297
|
+
|
|
298
|
+
<div align="center">
|
|
299
|
+
|
|
300
|
+
**Stop guessing where your rows went.**
|
|
301
|
+
|
|
302
|
+
```bash
|
|
303
|
+
pip install tracepipe
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
⭐ Star us on GitHub if TracePipe helps your data work!
|
|
307
|
+
|
|
308
|
+
</div>
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
tracepipe/__init__.py,sha256=
|
|
2
|
-
tracepipe/api.py,sha256=
|
|
1
|
+
tracepipe/__init__.py,sha256=MuwxV2mU4XxHqab62vQxaDAlhMvRCgUCmr_YU9R16ss,3342
|
|
2
|
+
tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
|
|
3
3
|
tracepipe/context.py,sha256=_povLpqa5wd_ESHt5hbSmWTSMTF3nUfeutEQo4RMK2E,3856
|
|
4
4
|
tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
|
|
5
|
-
tracepipe/convenience.py,sha256=
|
|
5
|
+
tracepipe/convenience.py,sha256=SZGcSOKPjAeJ9udPP_Fa_zTZY5GeDX61W6uftMwafjc,26563
|
|
6
6
|
tracepipe/core.py,sha256=kAXks694rR0Z4tD7Gyty0TyJGWx2whsSdteYYpHuazo,8010
|
|
7
7
|
tracepipe/debug.py,sha256=6t2GKVZLwn7SJLhrStE9qsmTiVIHATTE3jJPQ2DYtnc,10140
|
|
8
8
|
tracepipe/safety.py,sha256=jTBZv4QGDJfnZETsSZeMKbdOUtGXk-_XkmllhnGWM-M,5537
|
|
9
9
|
tracepipe/snapshot.py,sha256=OLREzE1_LkWITluG_Bqeb7Y4pAKb8Lb3zJEF3cxnloU,13967
|
|
10
|
-
tracepipe/value_provenance.py,sha256=
|
|
10
|
+
tracepipe/value_provenance.py,sha256=ogky6aOaZ-6K2uNBQxlXpmCeuvK434Hisj30zesRTd8,9330
|
|
11
11
|
tracepipe/instrumentation/__init__.py,sha256=pd0n6Z9m_V3gcBv097cXWFOZEzAP9sAq1jjQnNRrDZ8,222
|
|
12
12
|
tracepipe/instrumentation/apply_capture.py,sha256=cMThWzNXqWQENuMrCGTne1hO6fqaQFV7zJYNpsPTW4w,14463
|
|
13
13
|
tracepipe/instrumentation/filter_capture.py,sha256=onlYLU5bBZSM3WmxM2AFHfktnlx7ReG-brEn5eZ_N10,15830
|
|
@@ -17,13 +17,13 @@ tracepipe/instrumentation/pandas_inst.py,sha256=2YSoju9ml2PjLOYzsx8MHH1iqhjgnXHb
|
|
|
17
17
|
tracepipe/instrumentation/series_capture.py,sha256=N1Cf-pQDh23qQLLd8DNsxbcaD-91sTJkRd5AnccKZGE,10649
|
|
18
18
|
tracepipe/storage/__init__.py,sha256=pGFMfbIgIi2kofVPwYDqe2HTYMYJoabiGjTq77pYi-g,348
|
|
19
19
|
tracepipe/storage/base.py,sha256=7DV_-rp37DjBMr9B1w85hLVYhC8OQShk2PcEhT-n4tE,4894
|
|
20
|
-
tracepipe/storage/lineage_store.py,sha256=
|
|
20
|
+
tracepipe/storage/lineage_store.py,sha256=swMMf59isoCQZHaezCmquA-0R5iGNH3eGWjc9d9LGmo,27392
|
|
21
21
|
tracepipe/storage/row_identity.py,sha256=HBU0gTTJlFtFTcAdUCKuX-c9cHa0lo3CDIodDPDgOzA,17161
|
|
22
22
|
tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,182
|
|
23
23
|
tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
|
|
24
24
|
tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
|
|
25
25
|
tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
|
|
26
|
-
tracepipe-0.3.
|
|
27
|
-
tracepipe-0.3.
|
|
28
|
-
tracepipe-0.3.
|
|
29
|
-
tracepipe-0.3.
|
|
26
|
+
tracepipe-0.3.2.dist-info/METADATA,sha256=ik5FLmADKLqj25TprTnJPi21SW4EJ88mBTG-aQ4p-gc,9152
|
|
27
|
+
tracepipe-0.3.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
28
|
+
tracepipe-0.3.2.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
|
|
29
|
+
tracepipe-0.3.2.dist-info/RECORD,,
|
|
@@ -1,575 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: tracepipe
|
|
3
|
-
Version: 0.3.0
|
|
4
|
-
Summary: Row-level data lineage tracking for pandas pipelines
|
|
5
|
-
Project-URL: Homepage, https://github.com/tracepipe/tracepipe
|
|
6
|
-
Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
|
|
7
|
-
Project-URL: Repository, https://github.com/tracepipe/tracepipe.git
|
|
8
|
-
Project-URL: Issues, https://github.com/tracepipe/tracepipe/issues
|
|
9
|
-
Project-URL: Changelog, https://tracepipe.github.io/tracepipe/changelog/
|
|
10
|
-
Author: Gauthier Piarrette
|
|
11
|
-
License: MIT License
|
|
12
|
-
|
|
13
|
-
Copyright (c) 2026 Gauthier Piarrette
|
|
14
|
-
|
|
15
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
16
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
17
|
-
in the Software without restriction, including without limitation the rights
|
|
18
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
19
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
20
|
-
furnished to do so, subject to the following conditions:
|
|
21
|
-
|
|
22
|
-
The above copyright notice and this permission notice shall be included in all
|
|
23
|
-
copies or substantial portions of the Software.
|
|
24
|
-
|
|
25
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
26
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
27
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
28
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
29
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
30
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
31
|
-
SOFTWARE.
|
|
32
|
-
License-File: LICENSE
|
|
33
|
-
Keywords: data-engineering,data-lineage,data-quality,debugging,observability,pandas
|
|
34
|
-
Classifier: Development Status :: 4 - Beta
|
|
35
|
-
Classifier: Intended Audience :: Developers
|
|
36
|
-
Classifier: Intended Audience :: Science/Research
|
|
37
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
38
|
-
Classifier: Operating System :: OS Independent
|
|
39
|
-
Classifier: Programming Language :: Python :: 3
|
|
40
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
41
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
42
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
43
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
44
|
-
Classifier: Topic :: Scientific/Engineering
|
|
45
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
46
|
-
Requires-Python: >=3.9
|
|
47
|
-
Requires-Dist: numpy>=1.20.0
|
|
48
|
-
Requires-Dist: pandas>=1.5.0
|
|
49
|
-
Provides-Extra: all
|
|
50
|
-
Requires-Dist: psutil>=5.9.0; extra == 'all'
|
|
51
|
-
Requires-Dist: pyarrow>=10.0.0; extra == 'all'
|
|
52
|
-
Provides-Extra: arrow
|
|
53
|
-
Requires-Dist: pyarrow>=10.0.0; extra == 'arrow'
|
|
54
|
-
Provides-Extra: dev
|
|
55
|
-
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
56
|
-
Requires-Dist: pre-commit>=3.5.0; extra == 'dev'
|
|
57
|
-
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
58
|
-
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
59
|
-
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
60
|
-
Requires-Dist: taskipy>=1.12.0; extra == 'dev'
|
|
61
|
-
Provides-Extra: docs
|
|
62
|
-
Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
|
|
63
|
-
Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
|
|
64
|
-
Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
|
|
65
|
-
Requires-Dist: pymdown-extensions>=10.0.0; extra == 'docs'
|
|
66
|
-
Provides-Extra: memory
|
|
67
|
-
Requires-Dist: psutil>=5.9.0; extra == 'memory'
|
|
68
|
-
Description-Content-Type: text/markdown
|
|
69
|
-
|
|
70
|
-
# TracePipe
|
|
71
|
-
|
|
72
|
-
**Row-level data lineage for pandas pipelines.**
|
|
73
|
-
|
|
74
|
-
TracePipe automatically tracks what happens to every row and cell in your DataFrame — drops, transformations, merges, and value changes. Zero code changes required.
|
|
75
|
-
|
|
76
|
-
[](https://pypi.org/project/tracepipe/)
|
|
77
|
-
[](https://pypi.org/project/tracepipe/)
|
|
78
|
-
[](https://github.com/gauthierpiarrette/tracepipe/actions/workflows/ci.yml)
|
|
79
|
-
[](https://codecov.io/gh/gauthierpiarrette/tracepipe)
|
|
80
|
-
[](https://opensource.org/licenses/MIT)
|
|
81
|
-
[](https://gauthierpiarrette.github.io/tracepipe/)
|
|
82
|
-
|
|
83
|
-
---
|
|
84
|
-
|
|
85
|
-
## The Problem
|
|
86
|
-
|
|
87
|
-
Data pipelines are black boxes. When something goes wrong, you're left asking:
|
|
88
|
-
|
|
89
|
-
- **"Where did row X go?"** — Dropped somewhere, but which step?
|
|
90
|
-
- **"Why is this value wrong?"** — It was fine in the source, what changed it?
|
|
91
|
-
- **"How did these rows get merged?"** — Which parent records combined?
|
|
92
|
-
- **"Why are there nulls here?"** — When did they appear?
|
|
93
|
-
|
|
94
|
-
```python
|
|
95
|
-
df = pd.read_csv("customers.csv")
|
|
96
|
-
df = df.dropna() # Some rows disappear
|
|
97
|
-
df = df.merge(regions, on="zip") # New rows appear, some vanish
|
|
98
|
-
df["income"] = df["income"].fillna(0) # Values change silently
|
|
99
|
-
df = df[df["age"] >= 18] # More rows gone
|
|
100
|
-
# What actually happened to customer C-789?
|
|
101
|
-
```
|
|
102
|
-
|
|
103
|
-
Traditional debugging means `print()` statements, manual diffs, and guesswork. **TracePipe gives you the complete audit trail.**
|
|
104
|
-
|
|
105
|
-
---
|
|
106
|
-
|
|
107
|
-
## The Solution
|
|
108
|
-
|
|
109
|
-
```python
|
|
110
|
-
import tracepipe as tp
|
|
111
|
-
import pandas as pd
|
|
112
|
-
|
|
113
|
-
tp.enable(mode="debug", watch=["income", "score"])
|
|
114
|
-
|
|
115
|
-
df = pd.read_csv("customers.csv")
|
|
116
|
-
df = df.dropna()
|
|
117
|
-
df["income"] = df["income"].fillna(0)
|
|
118
|
-
df = df.merge(segments, on="customer_id")
|
|
119
|
-
df = df[df["age"] >= 18]
|
|
120
|
-
|
|
121
|
-
# Pipeline health check
|
|
122
|
-
print(tp.check(df))
|
|
123
|
-
```
|
|
124
|
-
```
|
|
125
|
-
TracePipe Check: [OK] Pipeline healthy
|
|
126
|
-
Mode: debug
|
|
127
|
-
|
|
128
|
-
Retention: 847/1000 (84.7%)
|
|
129
|
-
Dropped: 153 rows
|
|
130
|
-
• DataFrame.dropna: 42
|
|
131
|
-
• DataFrame.__getitem__[mask]: 111
|
|
132
|
-
|
|
133
|
-
Value changes: 23 cells
|
|
134
|
-
• DataFrame.fillna: 23 (income)
|
|
135
|
-
```
|
|
136
|
-
|
|
137
|
-
```python
|
|
138
|
-
# Why did this customer's income change?
|
|
139
|
-
print(tp.why(df, col="income", where={"customer_id": "C-789"}))
|
|
140
|
-
```
|
|
141
|
-
```
|
|
142
|
-
Cell History: row 42, column 'income'
|
|
143
|
-
Current value: 0.0
|
|
144
|
-
[i] Was null at step 1 (later recovered)
|
|
145
|
-
by: DataFrame.fillna
|
|
146
|
-
|
|
147
|
-
History (1 change):
|
|
148
|
-
None -> 0.0
|
|
149
|
-
by: DataFrame.fillna
|
|
150
|
-
```
|
|
151
|
-
|
|
152
|
-
**One import. Complete audit trail.**
|
|
153
|
-
|
|
154
|
-
---
|
|
155
|
-
|
|
156
|
-
## Installation
|
|
157
|
-
|
|
158
|
-
```bash
|
|
159
|
-
pip install tracepipe
|
|
160
|
-
```
|
|
161
|
-
|
|
162
|
-
---
|
|
163
|
-
|
|
164
|
-
## Quick Start
|
|
165
|
-
|
|
166
|
-
### 1. Enable tracking
|
|
167
|
-
|
|
168
|
-
```python
|
|
169
|
-
import tracepipe as tp
|
|
170
|
-
|
|
171
|
-
tp.enable(mode="debug", watch=["price", "quantity"]) # Track specific columns
|
|
172
|
-
```
|
|
173
|
-
|
|
174
|
-
### 2. Run your pipeline normally
|
|
175
|
-
|
|
176
|
-
```python
|
|
177
|
-
df = pd.DataFrame({
|
|
178
|
-
"product": ["A", "B", "C", "D"],
|
|
179
|
-
"price": [10.0, None, 30.0, 40.0],
|
|
180
|
-
"quantity": [5, 10, 0, 8]
|
|
181
|
-
})
|
|
182
|
-
|
|
183
|
-
df = df.dropna() # Drops row B
|
|
184
|
-
df = df[df["quantity"] > 0] # Drops row C
|
|
185
|
-
df["total"] = df["price"] * df["quantity"]
|
|
186
|
-
```
|
|
187
|
-
|
|
188
|
-
### 3. Inspect the lineage
|
|
189
|
-
|
|
190
|
-
```python
|
|
191
|
-
# Health check - see drops AND changes
|
|
192
|
-
print(tp.check(df))
|
|
193
|
-
```
|
|
194
|
-
```
|
|
195
|
-
TracePipe Check: [OK] Pipeline healthy
|
|
196
|
-
Mode: debug
|
|
197
|
-
|
|
198
|
-
Retention: 2/4 (50.0%)
|
|
199
|
-
Dropped: 2 rows
|
|
200
|
-
• DataFrame.dropna: 1
|
|
201
|
-
• DataFrame.__getitem__[mask]: 1
|
|
202
|
-
|
|
203
|
-
Value changes: 2 cells
|
|
204
|
-
• DataFrame.__setitem__[total]: 2
|
|
205
|
-
```
|
|
206
|
-
|
|
207
|
-
```python
|
|
208
|
-
# Trace a specific row's full journey
|
|
209
|
-
print(tp.trace(df, where={"product": "A"}))
|
|
210
|
-
```
|
|
211
|
-
```
|
|
212
|
-
Row 0 Journey:
|
|
213
|
-
Status: [OK] Alive
|
|
214
|
-
|
|
215
|
-
Events: 1
|
|
216
|
-
[MODIFIED] DataFrame.__setitem__[total]: total
|
|
217
|
-
```
|
|
218
|
-
|
|
219
|
-
```python
|
|
220
|
-
# Explain why a specific cell has its current value
|
|
221
|
-
print(tp.why(df, col="total", row=0))
|
|
222
|
-
```
|
|
223
|
-
```
|
|
224
|
-
Cell History: row 0, column 'total'
|
|
225
|
-
Current value: 50.0
|
|
226
|
-
|
|
227
|
-
History (1 change):
|
|
228
|
-
None -> 50.0
|
|
229
|
-
by: DataFrame.__setitem__[total]
|
|
230
|
-
```
|
|
231
|
-
|
|
232
|
-
---
|
|
233
|
-
|
|
234
|
-
## Key Features
|
|
235
|
-
|
|
236
|
-
### 🔍 Zero-Code Instrumentation
|
|
237
|
-
|
|
238
|
-
TracePipe monkey-patches pandas at runtime. Your existing code works unchanged:
|
|
239
|
-
|
|
240
|
-
```python
|
|
241
|
-
tp.enable()
|
|
242
|
-
# Your existing pipeline runs exactly as before
|
|
243
|
-
# TracePipe silently records everything
|
|
244
|
-
tp.disable()
|
|
245
|
-
```
|
|
246
|
-
|
|
247
|
-
### 📊 Rich Provenance Data
|
|
248
|
-
|
|
249
|
-
Track everything that happens in your pipeline:
|
|
250
|
-
|
|
251
|
-
| Question | Answer |
|
|
252
|
-
|----------|--------|
|
|
253
|
-
| Which rows were dropped? | `tp.check(df)` shows retention by operation |
|
|
254
|
-
| Why did this value change? | `tp.why(df, col="amount", row=5)` shows before/after |
|
|
255
|
-
| What's this row's history? | `tp.trace(df, row=0)` shows full journey |
|
|
256
|
-
| Where did these rows merge from? | Merge parent tracking in debug mode |
|
|
257
|
-
| Which rows grouped together? | `tp.debug.inspect().explain_group("A")` |
|
|
258
|
-
| When did nulls appear? | `tp.why()` flags null introduction |
|
|
259
|
-
|
|
260
|
-
### 🎯 Business-Key Lookups
|
|
261
|
-
|
|
262
|
-
Find rows by their values, not internal IDs:
|
|
263
|
-
|
|
264
|
-
```python
|
|
265
|
-
# Find by business key
|
|
266
|
-
tp.trace(df, where={"customer_id": "C-12345"})
|
|
267
|
-
tp.trace(df, where={"email": "alice@example.com"})
|
|
268
|
-
|
|
269
|
-
# Find rows where a column is null
|
|
270
|
-
tp.why(df, col="email", where={"email": None})
|
|
271
|
-
```
|
|
272
|
-
|
|
273
|
-
### 📈 Production-Ready Performance
|
|
274
|
-
|
|
275
|
-
| Operation | Overhead | Notes |
|
|
276
|
-
|-----------|----------|-------|
|
|
277
|
-
| Filter (dropna, query) | 1.4-1.9x | Acceptable |
|
|
278
|
-
| Transform (fillna, replace) | 1.0-1.2x | Minimal |
|
|
279
|
-
| GroupBy | 1.0-1.2x | Minimal |
|
|
280
|
-
| Sort | 1.4x | Optimized |
|
|
281
|
-
| Scalar access (at/iat) | <1ms added | Fixed overhead |
|
|
282
|
-
|
|
283
|
-
Tested on DataFrames up to 1M rows with linear scaling.
|
|
284
|
-
|
|
285
|
-
### 🔒 Safety First
|
|
286
|
-
|
|
287
|
-
TracePipe never modifies your data or affects computation results:
|
|
288
|
-
|
|
289
|
-
```python
|
|
290
|
-
# Original pandas method ALWAYS runs first
|
|
291
|
-
# Lineage capture happens after, and failures are non-fatal
|
|
292
|
-
result = df.dropna() # Guaranteed to work, even if tracking fails
|
|
293
|
-
```
|
|
294
|
-
|
|
295
|
-
---
|
|
296
|
-
|
|
297
|
-
## Two Modes
|
|
298
|
-
|
|
299
|
-
### CI Mode (Default)
|
|
300
|
-
Lightweight tracking for production pipelines:
|
|
301
|
-
- Step counts and retention rates
|
|
302
|
-
- Dropped row detection
|
|
303
|
-
- Merge mismatch warnings
|
|
304
|
-
- **No per-row provenance** (fast)
|
|
305
|
-
|
|
306
|
-
```python
|
|
307
|
-
tp.enable(mode="ci")
|
|
308
|
-
```
|
|
309
|
-
|
|
310
|
-
### Debug Mode
|
|
311
|
-
Full lineage for development and debugging:
|
|
312
|
-
- Complete row-level history
|
|
313
|
-
- Cell change tracking with before/after values
|
|
314
|
-
- GroupBy membership
|
|
315
|
-
- Merge parent tracking
|
|
316
|
-
|
|
317
|
-
```python
|
|
318
|
-
tp.enable(mode="debug", watch=["price", "amount"])
|
|
319
|
-
```
|
|
320
|
-
|
|
321
|
-
---
|
|
322
|
-
|
|
323
|
-
## API Reference
|
|
324
|
-
|
|
325
|
-
### Core Functions (5)
|
|
326
|
-
|
|
327
|
-
| Function | Purpose |
|
|
328
|
-
|----------|---------|
|
|
329
|
-
| `tp.enable(mode, watch)` | Start tracking |
|
|
330
|
-
| `tp.check(df)` | Health check with retention stats |
|
|
331
|
-
| `tp.trace(df, row, where)` | Trace a row's journey |
|
|
332
|
-
| `tp.why(df, col, row, where)` | Explain why a cell changed |
|
|
333
|
-
| `tp.report(df, path)` | Export HTML report |
|
|
334
|
-
|
|
335
|
-
### Control Functions
|
|
336
|
-
|
|
337
|
-
| Function | Purpose |
|
|
338
|
-
|----------|---------|
|
|
339
|
-
| `tp.disable()` | Stop tracking |
|
|
340
|
-
| `tp.reset()` | Clear all lineage data |
|
|
341
|
-
| `tp.stage(name)` | Label pipeline stages |
|
|
342
|
-
|
|
343
|
-
### Debug Namespace
|
|
344
|
-
|
|
345
|
-
For power users who need raw access:
|
|
346
|
-
|
|
347
|
-
```python
|
|
348
|
-
dbg = tp.debug.inspect()
|
|
349
|
-
dbg.steps # All recorded operations
|
|
350
|
-
dbg.dropped_rows() # Set of dropped row IDs
|
|
351
|
-
dbg.explain_row(42) # Raw lineage for row 42
|
|
352
|
-
dbg.stats() # Memory and tracking stats
|
|
353
|
-
dbg.export("json", "lineage.json")
|
|
354
|
-
```
|
|
355
|
-
|
|
356
|
-
---
|
|
357
|
-
|
|
358
|
-
## Data Quality Contracts
|
|
359
|
-
|
|
360
|
-
Validate your pipeline with fluent assertions:
|
|
361
|
-
|
|
362
|
-
```python
|
|
363
|
-
result = (tp.contract()
|
|
364
|
-
.expect_unique("customer_id")
|
|
365
|
-
.expect_no_nulls("email")
|
|
366
|
-
.expect_retention(min_rate=0.9)
|
|
367
|
-
.check(df))
|
|
368
|
-
|
|
369
|
-
result.raise_if_failed() # Raises if any contract violated
|
|
370
|
-
```
|
|
371
|
-
|
|
372
|
-
---
|
|
373
|
-
|
|
374
|
-
## Snapshots & Diff
|
|
375
|
-
|
|
376
|
-
Compare DataFrame states:
|
|
377
|
-
|
|
378
|
-
```python
|
|
379
|
-
before = tp.snapshot(df)
|
|
380
|
-
|
|
381
|
-
# ... transformations ...
|
|
382
|
-
|
|
383
|
-
after = tp.snapshot(df)
|
|
384
|
-
diff = tp.diff(before, after)
|
|
385
|
-
|
|
386
|
-
print(f"Rows added: {diff.rows_added}")
|
|
387
|
-
print(f"Rows removed: {diff.rows_removed}")
|
|
388
|
-
print(f"Cells changed: {diff.cells_changed}")
|
|
389
|
-
```
|
|
390
|
-
|
|
391
|
-
---
|
|
392
|
-
|
|
393
|
-
## HTML Reports
|
|
394
|
-
|
|
395
|
-
Generate interactive lineage reports:
|
|
396
|
-
|
|
397
|
-
```python
|
|
398
|
-
tp.report(df, "pipeline_audit.html")
|
|
399
|
-
```
|
|
400
|
-
|
|
401
|
-
Opens a visual dashboard showing:
|
|
402
|
-
- Pipeline flow diagram
|
|
403
|
-
- Retention funnel
|
|
404
|
-
- Dropped rows by operation
|
|
405
|
-
- Cell change history
|
|
406
|
-
|
|
407
|
-
---
|
|
408
|
-
|
|
409
|
-
## What's Tracked
|
|
410
|
-
|
|
411
|
-
| Operation | Tracking | Completeness |
|
|
412
|
-
|-----------|----------|--------------|
|
|
413
|
-
| `dropna`, `drop_duplicates` | Dropped row IDs | FULL |
|
|
414
|
-
| `query`, `df[mask]` | Dropped row IDs | FULL |
|
|
415
|
-
| `head`, `tail`, `sample` | Dropped row IDs | FULL |
|
|
416
|
-
| `fillna`, `replace` | Cell diffs (watched cols) | FULL |
|
|
417
|
-
| `loc[]=`, `iloc[]=`, `at[]=` | Cell diffs | FULL |
|
|
418
|
-
| `merge`, `join` | Parent tracking | FULL |
|
|
419
|
-
| `groupby().agg()` | Group membership | FULL |
|
|
420
|
-
| `sort_values` | Reorder tracking | FULL |
|
|
421
|
-
| `apply`, `pipe` | Output tracked | PARTIAL |
|
|
422
|
-
|
|
423
|
-
---
|
|
424
|
-
|
|
425
|
-
## Limitations
|
|
426
|
-
|
|
427
|
-
TracePipe tracks pandas operations, not arbitrary Python code:
|
|
428
|
-
|
|
429
|
-
| Limitation | Workaround |
|
|
430
|
-
|------------|------------|
|
|
431
|
-
| Direct NumPy array modification | Use pandas methods |
|
|
432
|
-
| Mutable objects in cells (lists, dicts) | Use immutable types |
|
|
433
|
-
| Custom C extensions | Wrap with pandas operations |
|
|
434
|
-
|
|
435
|
-
---
|
|
436
|
-
|
|
437
|
-
## Example: ML Pipeline Audit
|
|
438
|
-
|
|
439
|
-
```python
|
|
440
|
-
import tracepipe as tp
|
|
441
|
-
import pandas as pd
|
|
442
|
-
import numpy as np
|
|
443
|
-
|
|
444
|
-
tp.enable(mode="debug", watch=["age", "income", "label"])
|
|
445
|
-
|
|
446
|
-
# Load and clean
|
|
447
|
-
df = pd.read_csv("training_data.csv")
|
|
448
|
-
df = df.dropna(subset=["label"])
|
|
449
|
-
df["income"] = df["income"].fillna(df["income"].median())
|
|
450
|
-
df = df[df["age"] >= 18]
|
|
451
|
-
|
|
452
|
-
# Feature engineering
|
|
453
|
-
df["age_bucket"] = pd.cut(df["age"], bins=[18, 30, 50, 100])
|
|
454
|
-
df["log_income"] = np.log1p(df["income"])
|
|
455
|
-
|
|
456
|
-
# Audit the pipeline
|
|
457
|
-
print(tp.check(df))
|
|
458
|
-
```
|
|
459
|
-
```
|
|
460
|
-
TracePipe Check: [OK] Pipeline healthy
|
|
461
|
-
Mode: debug
|
|
462
|
-
|
|
463
|
-
Retention: 8234/10000 (82.3%)
|
|
464
|
-
Dropped: 1766 rows
|
|
465
|
-
• DataFrame.dropna: 423
|
|
466
|
-
• DataFrame.__getitem__[mask]: 1343
|
|
467
|
-
|
|
468
|
-
Value changes: 892 cells
|
|
469
|
-
• DataFrame.fillna: 892 (income)
|
|
470
|
-
```
|
|
471
|
-
|
|
472
|
-
```python
|
|
473
|
-
# Why did this customer's income change?
|
|
474
|
-
print(tp.why(df, col="income", where={"customer_id": "C-789"}))
|
|
475
|
-
```
|
|
476
|
-
```
|
|
477
|
-
Cell History: row 156, column 'income'
|
|
478
|
-
Current value: 45000.0
|
|
479
|
-
[i] Was null at step 1 (later recovered)
|
|
480
|
-
by: DataFrame.fillna
|
|
481
|
-
|
|
482
|
-
History (1 change):
|
|
483
|
-
None -> 45000.0
|
|
484
|
-
by: DataFrame.fillna
|
|
485
|
-
```
|
|
486
|
-
|
|
487
|
-
```python
|
|
488
|
-
# Full journey of a specific row
|
|
489
|
-
print(tp.trace(df, where={"customer_id": "C-789"}))
|
|
490
|
-
```
|
|
491
|
-
```
|
|
492
|
-
Row 156 Journey:
|
|
493
|
-
Status: [OK] Alive
|
|
494
|
-
|
|
495
|
-
Events: 3
|
|
496
|
-
[MODIFIED] DataFrame.fillna: income
|
|
497
|
-
[MODIFIED] pd.cut: age_bucket
|
|
498
|
-
[MODIFIED] DataFrame.__setitem__[log_income]: log_income
|
|
499
|
-
```
|
|
500
|
-
|
|
501
|
-
---
|
|
502
|
-
|
|
503
|
-
## Benchmarks
|
|
504
|
-
|
|
505
|
-
Run on MacBook Pro M1, pandas 2.0, Python 3.11:
|
|
506
|
-
|
|
507
|
-
### Overhead (10K rows, median of 10 runs)
|
|
508
|
-
|
|
509
|
-
| Operation | Baseline | With TracePipe | Overhead |
|
|
510
|
-
|-----------|----------|----------------|----------|
|
|
511
|
-
| dropna | 0.9ms | 1.7ms | 1.9x |
|
|
512
|
-
| query | 2.1ms | 3.0ms | 1.4x |
|
|
513
|
-
| fillna | 0.4ms | 0.4ms | 1.0x |
|
|
514
|
-
| groupby.sum | 1.2ms | 1.2ms | 1.0x |
|
|
515
|
-
| merge | 4.5ms | 12.6ms | 2.8x |
|
|
516
|
-
| sort_values | 1.1ms | 1.5ms | 1.4x |
|
|
517
|
-
|
|
518
|
-
### Scale (filter + dropna pipeline)
|
|
519
|
-
|
|
520
|
-
| Rows | Time | Throughput |
|
|
521
|
-
|------|------|------------|
|
|
522
|
-
| 10K | 5ms | 2M rows/sec |
|
|
523
|
-
| 100K | 35ms | 2.8M rows/sec |
|
|
524
|
-
| 1M | 320ms | 3.1M rows/sec |
|
|
525
|
-
|
|
526
|
-
### Memory
|
|
527
|
-
|
|
528
|
-
- Base overhead: ~40 bytes per tracked diff
|
|
529
|
-
- Typical pipeline: 2-3x memory vs baseline
|
|
530
|
-
- Spillover to disk available for large pipelines
|
|
531
|
-
|
|
532
|
-
---
|
|
533
|
-
|
|
534
|
-
## Documentation
|
|
535
|
-
|
|
536
|
-
📚 **[Full Documentation](https://gauthierpiarrette.github.io/tracepipe/)**
|
|
537
|
-
|
|
538
|
-
- [Getting Started](https://gauthierpiarrette.github.io/tracepipe/getting-started/quickstart/)
|
|
539
|
-
- [User Guide](https://gauthierpiarrette.github.io/tracepipe/guide/concepts/)
|
|
540
|
-
- [API Reference](https://gauthierpiarrette.github.io/tracepipe/api/)
|
|
541
|
-
- [Examples](https://gauthierpiarrette.github.io/tracepipe/examples/ml-pipeline/)
|
|
542
|
-
|
|
543
|
-
---
|
|
544
|
-
|
|
545
|
-
## Contributing
|
|
546
|
-
|
|
547
|
-
```bash
|
|
548
|
-
git clone https://github.com/gauthierpiarrette/tracepipe.git
|
|
549
|
-
cd tracepipe
|
|
550
|
-
pip install -e ".[dev]"
|
|
551
|
-
|
|
552
|
-
# Run tests
|
|
553
|
-
pytest tests/ -v
|
|
554
|
-
|
|
555
|
-
# Run linting
|
|
556
|
-
ruff check tracepipe/ tests/
|
|
557
|
-
|
|
558
|
-
# Run benchmarks
|
|
559
|
-
python benchmarks/run_all.py
|
|
560
|
-
```
|
|
561
|
-
|
|
562
|
-
See [CONTRIBUTING](https://gauthierpiarrette.github.io/tracepipe/contributing/) for detailed guidelines.
|
|
563
|
-
|
|
564
|
-
---
|
|
565
|
-
|
|
566
|
-
## License
|
|
567
|
-
|
|
568
|
-
MIT License. See [LICENSE](LICENSE) for details.
|
|
569
|
-
|
|
570
|
-
---
|
|
571
|
-
|
|
572
|
-
<p align="center">
|
|
573
|
-
<b>Stop guessing where your rows went.</b><br>
|
|
574
|
-
<code>pip install tracepipe</code>
|
|
575
|
-
</p>
|
|
File without changes
|
|
File without changes
|