tracepipe 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tracepipe/__init__.py CHANGED
@@ -81,7 +81,7 @@ from .core import TracePipeConfig, TracePipeMode
81
81
  from .snapshot import DiffResult, Snapshot, diff, snapshot
82
82
 
83
83
  # === VERSION ===
84
- __version__ = "0.3.0"
84
+ __version__ = "0.3.1"
85
85
 
86
86
  # === MINIMAL __all__ ===
87
87
  __all__ = [
tracepipe/api.py CHANGED
@@ -75,12 +75,18 @@ def enable(
75
75
  # Custom configuration
76
76
  tp.enable(mode="ci", merge_provenance=True)
77
77
  """
78
+ ctx = get_context()
79
+
80
+ # If already enabled, reset accumulated state to prevent duplicate warnings/stats
81
+ # This handles the common case of re-running scripts in notebooks/IDEs
82
+ if ctx.enabled:
83
+ _reset_accumulated_state(ctx)
84
+
78
85
  # Get or create config
79
86
  # If config is provided explicitly, use it
80
87
  # Otherwise, start with existing context config (if any) or create new default
81
88
  if config is None:
82
- existing_ctx = get_context()
83
- config = existing_ctx.config # Use existing config as base
89
+ config = ctx.config # Use existing config as base
84
90
 
85
91
  # Handle mode
86
92
  if mode is not None:
@@ -115,14 +121,14 @@ def enable(
115
121
  ctx = TracePipeContext(config=config, backend=backend, identity=identity)
116
122
  set_context(ctx)
117
123
  else:
118
- ctx = get_context()
119
124
  ctx.config = config
120
125
  # Also update config in row_manager and store (they may have their own references)
121
126
  ctx.row_manager.config = config
122
127
  ctx.store.config = config
123
128
 
124
- # Add watched columns
129
+ # Add watched columns (reset first if re-enabling to avoid stale watches)
125
130
  if watch:
131
+ ctx.watched_columns.clear()
126
132
  ctx.watched_columns.update(watch)
127
133
 
128
134
  if not ctx.enabled:
@@ -132,6 +138,50 @@ def enable(
132
138
  return _get_module()
133
139
 
134
140
 
141
def _reset_accumulated_state(ctx: TracePipeContext) -> None:
    """
    Wipe the lineage data gathered so far while leaving instrumentation on.

    enable() calls this when the context is already enabled, so that running
    the same script again in one Python process (notebooks, IDEs) does not
    pile new records on top of old ones or repeat merge warnings.

    Store attributes are only touched when the store actually has them; the
    row manager and the watched-column set on the context are always cleared.

    Args:
        ctx: Context whose store, row manager and watched columns are reset.
    """
    lineage_store = ctx.store

    def _clear_if_present(*attr_names: str) -> None:
        # Empty each named container in place, skipping names the store lacks.
        for attr_name in attr_names:
            if hasattr(lineage_store, attr_name):
                getattr(lineage_store, attr_name).clear()

    # Merge stats (source of duplicate warnings), bulk drops, recorded steps.
    _clear_if_present("merge_stats", "bulk_drops", "_steps")

    # In-memory diffs are dropped through the store's own hook.
    if hasattr(lineage_store, "_clear_in_memory"):
        lineage_store._clear_in_memory()

    # Step numbering starts over from zero.
    if hasattr(lineage_store, "_step_counter"):
        lineage_store._step_counter = 0

    # Merge and aggregation mappings.
    _clear_if_present("merge_mappings", "aggregation_mappings")

    # Row identities and watched columns live on the context itself; enable()
    # adds watched columns back when it is given a ``watch`` argument.
    ctx.row_manager.clear()
    ctx.watched_columns.clear()
183
+
184
+
135
185
  def disable() -> types.ModuleType:
136
186
  """
137
187
  Disable TracePipe and restore original pandas methods.
tracepipe/convenience.py CHANGED
@@ -733,9 +733,14 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
733
733
  store = ctx.store
734
734
 
735
735
  drop_event = store.get_drop_event(row_id)
736
- history = store.get_row_history(row_id)
737
736
  merge_origin = store.get_merge_origin(row_id)
738
737
 
738
+ # Use lineage-aware history to include pre-merge parent events
739
+ if hasattr(store, "get_row_history_with_lineage"):
740
+ history = store.get_row_history_with_lineage(row_id)
741
+ else:
742
+ history = store.get_row_history(row_id)
743
+
739
744
  dropped_at = None
740
745
  if drop_event:
741
746
  dropped_at = {
@@ -485,6 +485,9 @@ class InMemoryLineageStore:
485
485
 
486
486
  CONTRACT: Returned list has monotonically increasing step_id.
487
487
  Convenience layer may reverse for display.
488
+
489
+ Note: This returns only direct events for this row_id.
490
+ Use get_row_history_with_lineage() to include pre-merge parent history.
488
491
  """
489
492
  step_map = {s.step_id: s for s in self._steps}
490
493
  events = []
@@ -546,6 +549,65 @@ class InMemoryLineageStore:
546
549
 
547
550
  return events
548
551
 
552
def get_row_history_with_lineage(self, row_id: int, max_depth: int = 10) -> list[dict]:
    """
    Return a row's events together with those of its pre-merge ancestors.

    Starting at ``row_id``, the merge origin of each row is looked up and its
    ``left_parent`` link is followed upwards. The direct history of every row
    on that chain is gathered and the combined list is ordered by ``step_id``.

    Args:
        row_id: Row ID to trace.
        max_depth: How many parent links may be followed above ``row_id``.
            A negative value yields an empty list.

    Returns:
        Events of the row and its ancestors, sorted by ascending ``step_id``.
    """
    # Walk up the merge chain first, remembering each row exactly once so a
    # cyclic parent link cannot loop forever.
    lineage: list[int] = []
    seen: set[int] = set()
    current = row_id
    depth = 0
    while depth <= max_depth and current not in seen:
        seen.add(current)
        lineage.append(current)

        origin = self.get_merge_origin(current)
        if not origin or origin["left_parent"] is None:
            break  # not produced by a merge, or no left parent recorded
        current = origin["left_parent"]
        depth += 1

    # Oldest ancestor first, so events sharing a step_id keep ancestor-first
    # order under the stable sort below.
    combined = []
    for ancestor_id in reversed(lineage):
        combined.extend(self.get_row_history(ancestor_id))

    return sorted(combined, key=lambda event: event["step_id"])
593
+
594
def get_cell_history_with_lineage(
    self, row_id: int, column: str, max_depth: int = 10
) -> list[dict]:
    """
    Return the lineage-aware history of one cell.

    Fetches the row's history through get_row_history_with_lineage() and
    keeps only the events whose "col" entry equals ``column``.

    Args:
        row_id: Row ID to trace.
        column: Column name the returned events must refer to.
        max_depth: Passed through as the merge depth limit.

    Returns:
        Matching events, in the order the lineage-aware history returns them.
    """

    def _touches_column(event: dict) -> bool:
        return event["col"] == column

    lineage_events = self.get_row_history_with_lineage(row_id, max_depth)
    return list(filter(_touches_column, lineage_events))
610
+
549
611
  def get_dropped_rows(self, step_id: Optional[int] = None) -> list[int]:
550
612
  """Get all dropped row IDs, optionally filtered by step."""
551
613
  if step_id is not None:
@@ -648,7 +710,7 @@ class InMemoryLineageStore:
648
710
  diffs = list(self._iter_all_diffs())
649
711
 
650
712
  data = {
651
- "tracepipe_version": "0.3.0",
713
+ "tracepipe_version": "0.3.1",
652
714
  "export_timestamp": time.time(),
653
715
  "total_diffs": len(diffs),
654
716
  "total_steps": len(self._steps),
@@ -19,7 +19,6 @@ from typing import Any, Optional
19
19
  import pandas as pd
20
20
 
21
21
  from .context import get_context
22
- from .core import ChangeType
23
22
 
24
23
 
25
24
  @dataclass
@@ -96,7 +95,12 @@ class ValueHistory:
96
95
  }
97
96
 
98
97
 
99
- def explain_value(row_id: int, column: str, df: Optional[pd.DataFrame] = None) -> ValueHistory:
98
+ def explain_value(
99
+ row_id: int,
100
+ column: str,
101
+ df: Optional[pd.DataFrame] = None,
102
+ follow_lineage: bool = True,
103
+ ) -> ValueHistory:
100
104
  """
101
105
  Get complete history of a specific cell's value.
102
106
 
@@ -104,6 +108,7 @@ def explain_value(row_id: int, column: str, df: Optional[pd.DataFrame] = None) -
104
108
  row_id: Row ID to trace
105
109
  column: Column name
106
110
  df: Optional DataFrame for current value lookup
111
+ follow_lineage: If True, include pre-merge parent history (default: True)
107
112
 
108
113
  Returns:
109
114
  ValueHistory with all changes to this cell
@@ -121,35 +126,38 @@ def explain_value(row_id: int, column: str, df: Optional[pd.DataFrame] = None) -
121
126
  if len(matches) > 0 and column in df.columns:
122
127
  current_value = df.iloc[matches[0]][column]
123
128
 
124
- # Collect all events for this cell
129
+ # Collect events - use lineage-aware method if requested
130
+ if follow_lineage and hasattr(store, "get_cell_history_with_lineage"):
131
+ # Get cell history including pre-merge parent history
132
+ raw_events = store.get_cell_history_with_lineage(row_id, column)
133
+ else:
134
+ # Fallback to direct row_id lookup only
135
+ raw_events = [e for e in store.get_row_history(row_id) if e["col"] == column]
136
+
137
+ # Convert to ValueEvent objects
125
138
  events = []
126
- step_map = {s.step_id: s for s in store.steps}
127
139
  became_null_at = None
128
140
  became_null_by = None
129
141
 
130
- for diff in store._iter_all_diffs():
131
- if diff["row_id"] == row_id and diff["col"] == column:
132
- step = step_map.get(diff["step_id"])
133
-
134
- events.append(
135
- ValueEvent(
136
- step_id=diff["step_id"],
137
- operation=step.operation if step else "unknown",
138
- old_value=diff["old_val"],
139
- new_value=diff["new_val"],
140
- change_type=ChangeType(diff["change_type"]).name,
141
- timestamp=step.timestamp if step else 0,
142
- code_location=(
143
- f"{step.code_file}:{step.code_line}" if step and step.code_file else None
144
- ),
145
- )
142
+ for diff in raw_events:
143
+ events.append(
144
+ ValueEvent(
145
+ step_id=diff["step_id"],
146
+ operation=diff.get("operation", "unknown"),
147
+ old_value=diff["old_val"],
148
+ new_value=diff["new_val"],
149
+ change_type=diff.get("change_type", "UNKNOWN"),
150
+ timestamp=diff.get("timestamp", 0) or 0,
151
+ code_location=diff.get("code_location"),
146
152
  )
153
+ )
147
154
 
148
- # Track when value became null
149
- if became_null_at is None and pd.isna(diff["new_val"]) and not pd.isna(diff["old_val"]):
150
- became_null_at = diff["step_id"]
151
- became_null_by = step.operation if step else "unknown"
155
+ # Track when value became null
156
+ if became_null_at is None and pd.isna(diff["new_val"]) and not pd.isna(diff["old_val"]):
157
+ became_null_at = diff["step_id"]
158
+ became_null_by = diff.get("operation", "unknown")
152
159
 
160
+ # Defensive sort: both history sources are expected to return events already ordered by step_id
153
161
  events.sort(key=lambda e: e.step_id)
154
162
 
155
163
  return ValueHistory(
@@ -0,0 +1,308 @@
1
+ Metadata-Version: 2.4
2
+ Name: tracepipe
3
+ Version: 0.3.1
4
+ Summary: Row-level data lineage tracking for pandas pipelines
5
+ Project-URL: Homepage, https://github.com/tracepipe/tracepipe
6
+ Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
7
+ Project-URL: Repository, https://github.com/tracepipe/tracepipe.git
8
+ Project-URL: Issues, https://github.com/tracepipe/tracepipe/issues
9
+ Project-URL: Changelog, https://tracepipe.github.io/tracepipe/changelog/
10
+ Author: Gauthier Piarrette
11
+ License: MIT License
12
+
13
+ Copyright (c) 2026 Gauthier Piarrette
14
+
15
+ Permission is hereby granted, free of charge, to any person obtaining a copy
16
+ of this software and associated documentation files (the "Software"), to deal
17
+ in the Software without restriction, including without limitation the rights
18
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19
+ copies of the Software, and to permit persons to whom the Software is
20
+ furnished to do so, subject to the following conditions:
21
+
22
+ The above copyright notice and this permission notice shall be included in all
23
+ copies or substantial portions of the Software.
24
+
25
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31
+ SOFTWARE.
32
+ License-File: LICENSE
33
+ Keywords: data-engineering,data-lineage,data-quality,debugging,observability,pandas
34
+ Classifier: Development Status :: 4 - Beta
35
+ Classifier: Intended Audience :: Developers
36
+ Classifier: Intended Audience :: Science/Research
37
+ Classifier: License :: OSI Approved :: MIT License
38
+ Classifier: Operating System :: OS Independent
39
+ Classifier: Programming Language :: Python :: 3
40
+ Classifier: Programming Language :: Python :: 3.9
41
+ Classifier: Programming Language :: Python :: 3.10
42
+ Classifier: Programming Language :: Python :: 3.11
43
+ Classifier: Programming Language :: Python :: 3.12
44
+ Classifier: Topic :: Scientific/Engineering
45
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
46
+ Requires-Python: >=3.9
47
+ Requires-Dist: numpy>=1.20.0
48
+ Requires-Dist: pandas>=1.5.0
49
+ Provides-Extra: all
50
+ Requires-Dist: psutil>=5.9.0; extra == 'all'
51
+ Requires-Dist: pyarrow>=10.0.0; extra == 'all'
52
+ Provides-Extra: arrow
53
+ Requires-Dist: pyarrow>=10.0.0; extra == 'arrow'
54
+ Provides-Extra: dev
55
+ Requires-Dist: black>=23.0.0; extra == 'dev'
56
+ Requires-Dist: pre-commit>=3.5.0; extra == 'dev'
57
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
58
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
59
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
60
+ Requires-Dist: taskipy>=1.12.0; extra == 'dev'
61
+ Provides-Extra: docs
62
+ Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
63
+ Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
64
+ Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
65
+ Requires-Dist: pymdown-extensions>=10.0.0; extra == 'docs'
66
+ Provides-Extra: memory
67
+ Requires-Dist: psutil>=5.9.0; extra == 'memory'
68
+ Description-Content-Type: text/markdown
69
+
70
+ <div align="center">
71
+
72
+ # TracePipe
73
+
74
+ ### Row-level data lineage for pandas pipelines
75
+
76
+ **Know exactly where every row went, why values changed, and how your data transformed.**
77
+
78
+ [![PyPI version](https://img.shields.io/pypi/v/tracepipe.svg)](https://pypi.org/project/tracepipe/)
79
+ [![Python 3.9+](https://img.shields.io/pypi/pyversions/tracepipe.svg)](https://pypi.org/project/tracepipe/)
80
+ [![CI](https://github.com/gauthierpiarrette/tracepipe/actions/workflows/ci.yml/badge.svg)](https://github.com/gauthierpiarrette/tracepipe/actions/workflows/ci.yml)
81
+ [![codecov](https://codecov.io/gh/gauthierpiarrette/tracepipe/branch/main/graph/badge.svg)](https://codecov.io/gh/gauthierpiarrette/tracepipe)
82
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
83
+ [![Docs](https://img.shields.io/badge/docs-mkdocs-blue.svg)](https://gauthierpiarrette.github.io/tracepipe/)
84
+
85
+ [Getting Started](#getting-started) · [Documentation](https://gauthierpiarrette.github.io/tracepipe/) · [Examples](#real-world-example)
86
+
87
+ </div>
88
+
89
+ ---
90
+
91
+ ## Why TracePipe?
92
+
93
+ Data pipelines are black boxes. Rows vanish. Values change. You're left guessing.
94
+
95
+ ```python
96
+ df = pd.read_csv("customers.csv")
97
+ df = df.dropna() # Some rows disappear
98
+ df = df.merge(regions, on="zip") # New rows appear, some vanish
99
+ df["income"] = df["income"].fillna(0) # Values change silently
100
+ df = df[df["age"] >= 18] # More rows gone
101
+ # What happened to customer C-789? 🤷
102
+ ```
103
+
104
+ **TracePipe gives you the complete audit trail — zero code changes required.**
105
+
106
+ ---
107
+
108
+ ## Getting Started
109
+
110
+ ```bash
111
+ pip install tracepipe
112
+ ```
113
+
114
+ ```python
115
+ import tracepipe as tp
116
+ import pandas as pd
117
+
118
+ tp.enable(mode="debug", watch=["income"])
119
+
120
+ df = pd.read_csv("customers.csv")
121
+ df = df.dropna()
122
+ df["income"] = df["income"].fillna(0)
123
+ df = df[df["age"] >= 18]
124
+
125
+ tp.check(df) # See what happened
126
+ ```
127
+
128
+ ```
129
+ TracePipe Check: [OK] Pipeline healthy
130
+
131
+ Retention: 847/1000 (84.7%)
132
+ Dropped: 153 rows
133
+ • DataFrame.dropna: 42
134
+ • DataFrame.__getitem__[mask]: 111
135
+
136
+ Value changes: 23 cells modified
137
+ • DataFrame.fillna: 23 (income)
138
+ ```
139
+
140
+ That's it. **One import, full visibility.**
141
+
142
+ ---
143
+
144
+ ## Core API
145
+
146
+ | Function | What it does |
147
+ |----------|--------------|
148
+ | `tp.enable()` | Start tracking |
149
+ | `tp.check(df)` | Health check — retention, drops, changes |
150
+ | `tp.trace(df, where={"id": "C-789"})` | Follow a row's complete journey |
151
+ | `tp.why(df, col="income", row=5)` | Explain why a cell has its current value |
152
+ | `tp.report(df, "audit.html")` | Export interactive HTML report |
153
+
154
+ ---
155
+
156
+ ## Key Features
157
+
158
+ <table>
159
+ <tr>
160
+ <td width="50%">
161
+
162
+ ### 🔍 Zero-Code Instrumentation
163
+ TracePipe patches pandas at runtime. Your existing code works unchanged.
164
+
165
+ ### 📊 Complete Provenance
166
+ Track drops, transforms, merges, and cell-level changes with before/after values.
167
+
168
+ </td>
169
+ <td width="50%">
170
+
171
+ ### 🎯 Business-Key Lookups
172
+ Find rows by their values: `tp.trace(df, where={"email": "alice@example.com"})`
173
+
174
+ ### ⚡ Production-Ready
175
+ 1.0-2.8x overhead (varies by operation). Tested on DataFrames up to 1M rows.
176
+
177
+ </td>
178
+ </tr>
179
+ </table>
180
+
181
+ ---
182
+
183
+ ## Real-World Example
184
+
185
+ ```python
186
+ import tracepipe as tp
187
+ import pandas as pd
188
+
189
+ tp.enable(mode="debug", watch=["age", "income", "label"])
190
+
191
+ # Load and clean
192
+ df = pd.read_csv("training_data.csv")
193
+ df = df.dropna(subset=["label"])
194
+ df["income"] = df["income"].fillna(df["income"].median())
195
+ df = df[df["age"] >= 18]
196
+
197
+ # Audit
198
+ print(tp.check(df))
199
+ ```
200
+
201
+ ```
202
+ Retention: 8234/10000 (82.3%)
203
+ Dropped: 1766 rows
204
+ • DataFrame.dropna: 423
205
+ • DataFrame.__getitem__[mask]: 1343
206
+
207
+ Value changes: 892 cells
208
+ • DataFrame.fillna: 892 (income)
209
+ ```
210
+
211
+ ```python
212
+ # Why does this customer have a filled income?
213
+ tp.why(df, col="income", where={"customer_id": "C-789"})
214
+ ```
215
+
216
+ ```
217
+ Cell History: row 156, column 'income'
218
+ Current value: 45000.0
219
+ [i] Was null at step 1 (later recovered)
220
+
221
+ History (1 change):
222
+ None -> 45000.0
223
+ by: DataFrame.fillna
224
+ ```
225
+
226
+ ---
227
+
228
+ ## Two Modes
229
+
230
+ | Mode | Use Case | What's Tracked |
231
+ |------|----------|----------------|
232
+ | **CI** (default) | Production pipelines | Step counts, retention rates, merge warnings |
233
+ | **Debug** | Development | Full row history, cell diffs, merge parents, group membership |
234
+
235
+ ```python
236
+ tp.enable(mode="ci") # Lightweight
237
+ tp.enable(mode="debug") # Full lineage
238
+ ```
239
+
240
+ ---
241
+
242
+ ## What's Tracked
243
+
244
+ | Operation | Coverage |
245
+ |-----------|----------|
246
+ | `dropna`, `drop_duplicates`, `query`, `df[mask]` | ✅ Full |
247
+ | `fillna`, `replace`, `loc[]=`, `iloc[]=` | ✅ Full (cell diffs) |
248
+ | `merge`, `join` | ✅ Full (parent tracking) |
249
+ | `groupby().agg()` | ✅ Full (group membership) |
250
+ | `sort_values`, `head`, `tail`, `sample` | ✅ Full |
251
+ | `apply`, `pipe` | ⚠️ Partial |
252
+
253
+ ---
254
+
255
+ ## Data Quality Contracts
256
+
257
+ ```python
258
+ (tp.contract()
259
+ .expect_unique("customer_id")
260
+ .expect_no_nulls("email")
261
+ .expect_retention(min_rate=0.9)
262
+ .check(df)
263
+ .raise_if_failed())
264
+ ```
265
+
266
+ ---
267
+
268
+ ## Documentation
269
+
270
+ 📚 **[Full Documentation](https://gauthierpiarrette.github.io/tracepipe/)**
271
+
272
+ - [Quickstart](https://gauthierpiarrette.github.io/tracepipe/getting-started/quickstart/)
273
+ - [User Guide](https://gauthierpiarrette.github.io/tracepipe/guide/concepts/)
274
+ - [API Reference](https://gauthierpiarrette.github.io/tracepipe/api/)
275
+ - [Examples](https://gauthierpiarrette.github.io/tracepipe/examples/ml-pipeline/)
276
+
277
+ ---
278
+
279
+ ## Contributing
280
+
281
+ ```bash
282
+ git clone https://github.com/gauthierpiarrette/tracepipe.git
283
+ cd tracepipe
284
+ pip install -e ".[dev]"
285
+ pytest tests/ -v
286
+ ```
287
+
288
+ See [CONTRIBUTING](https://gauthierpiarrette.github.io/tracepipe/contributing/) for guidelines.
289
+
290
+ ---
291
+
292
+ ## License
293
+
294
+ MIT License. See [LICENSE](LICENSE).
295
+
296
+ ---
297
+
298
+ <div align="center">
299
+
300
+ **Stop guessing where your rows went.**
301
+
302
+ ```bash
303
+ pip install tracepipe
304
+ ```
305
+
306
+ ⭐ Star us on GitHub if TracePipe helps your data work!
307
+
308
+ </div>
@@ -1,13 +1,13 @@
1
- tracepipe/__init__.py,sha256=ZO6-yKMpguohwQLSRovuJoakb7kN1ZveSBwlGwhC-ho,3342
2
- tracepipe/api.py,sha256=KFO0NYRaGqRevbNyFSCFK4ryhFwdixFtUnTeNabwb6o,11862
1
+ tracepipe/__init__.py,sha256=fni87eEsE4pup32e9zJkn3UCIrlufMlUigiQVTdx7rA,3342
2
+ tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
3
3
  tracepipe/context.py,sha256=_povLpqa5wd_ESHt5hbSmWTSMTF3nUfeutEQo4RMK2E,3856
4
4
  tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
5
- tracepipe/convenience.py,sha256=9F4rLx7AGWwNPKhuJMZD-6PG-QiZq0_mzfmnoU28x6U,26036
5
+ tracepipe/convenience.py,sha256=v-FE95yD_QQvGnhfQIzJYHMMDdAC37F3jQDOMDcazyY,26234
6
6
  tracepipe/core.py,sha256=kAXks694rR0Z4tD7Gyty0TyJGWx2whsSdteYYpHuazo,8010
7
7
  tracepipe/debug.py,sha256=6t2GKVZLwn7SJLhrStE9qsmTiVIHATTE3jJPQ2DYtnc,10140
8
8
  tracepipe/safety.py,sha256=jTBZv4QGDJfnZETsSZeMKbdOUtGXk-_XkmllhnGWM-M,5537
9
9
  tracepipe/snapshot.py,sha256=OLREzE1_LkWITluG_Bqeb7Y4pAKb8Lb3zJEF3cxnloU,13967
10
- tracepipe/value_provenance.py,sha256=cCNDvMduYiFkTzfam5EpBNZI54RL4OtMLP6xNaM00ec,9092
10
+ tracepipe/value_provenance.py,sha256=ogky6aOaZ-6K2uNBQxlXpmCeuvK434Hisj30zesRTd8,9330
11
11
  tracepipe/instrumentation/__init__.py,sha256=pd0n6Z9m_V3gcBv097cXWFOZEzAP9sAq1jjQnNRrDZ8,222
12
12
  tracepipe/instrumentation/apply_capture.py,sha256=cMThWzNXqWQENuMrCGTne1hO6fqaQFV7zJYNpsPTW4w,14463
13
13
  tracepipe/instrumentation/filter_capture.py,sha256=onlYLU5bBZSM3WmxM2AFHfktnlx7ReG-brEn5eZ_N10,15830
@@ -17,13 +17,13 @@ tracepipe/instrumentation/pandas_inst.py,sha256=2YSoju9ml2PjLOYzsx8MHH1iqhjgnXHb
17
17
  tracepipe/instrumentation/series_capture.py,sha256=N1Cf-pQDh23qQLLd8DNsxbcaD-91sTJkRd5AnccKZGE,10649
18
18
  tracepipe/storage/__init__.py,sha256=pGFMfbIgIi2kofVPwYDqe2HTYMYJoabiGjTq77pYi-g,348
19
19
  tracepipe/storage/base.py,sha256=7DV_-rp37DjBMr9B1w85hLVYhC8OQShk2PcEhT-n4tE,4894
20
- tracepipe/storage/lineage_store.py,sha256=KPN-OZOgkZeiIptodQst-Obp9krcuE7Erpc9NX53jKw,25148
20
+ tracepipe/storage/lineage_store.py,sha256=81EAZnSGcwUDxRyDovBYz3zsM0YvBKI3ZLOnnrf_Mak,27392
21
21
  tracepipe/storage/row_identity.py,sha256=HBU0gTTJlFtFTcAdUCKuX-c9cHa0lo3CDIodDPDgOzA,17161
22
22
  tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,182
23
23
  tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
24
24
  tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
25
25
  tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
26
- tracepipe-0.3.0.dist-info/METADATA,sha256=oEiGG2V8ya2J3ZKYU_oAfLIqYrZdgwqBRaKup44U-Uw,15478
27
- tracepipe-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
- tracepipe-0.3.0.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
29
- tracepipe-0.3.0.dist-info/RECORD,,
26
+ tracepipe-0.3.1.dist-info/METADATA,sha256=mzz_nA2UhB14XhzKWlQGBnNWHCWr2wJHkYm1FzH1c1g,9152
27
+ tracepipe-0.3.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
+ tracepipe-0.3.1.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
29
+ tracepipe-0.3.1.dist-info/RECORD,,
@@ -1,575 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: tracepipe
3
- Version: 0.3.0
4
- Summary: Row-level data lineage tracking for pandas pipelines
5
- Project-URL: Homepage, https://github.com/tracepipe/tracepipe
6
- Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
7
- Project-URL: Repository, https://github.com/tracepipe/tracepipe.git
8
- Project-URL: Issues, https://github.com/tracepipe/tracepipe/issues
9
- Project-URL: Changelog, https://tracepipe.github.io/tracepipe/changelog/
10
- Author: Gauthier Piarrette
11
- License: MIT License
12
-
13
- Copyright (c) 2026 Gauthier Piarrette
14
-
15
- Permission is hereby granted, free of charge, to any person obtaining a copy
16
- of this software and associated documentation files (the "Software"), to deal
17
- in the Software without restriction, including without limitation the rights
18
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19
- copies of the Software, and to permit persons to whom the Software is
20
- furnished to do so, subject to the following conditions:
21
-
22
- The above copyright notice and this permission notice shall be included in all
23
- copies or substantial portions of the Software.
24
-
25
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31
- SOFTWARE.
32
- License-File: LICENSE
33
- Keywords: data-engineering,data-lineage,data-quality,debugging,observability,pandas
34
- Classifier: Development Status :: 4 - Beta
35
- Classifier: Intended Audience :: Developers
36
- Classifier: Intended Audience :: Science/Research
37
- Classifier: License :: OSI Approved :: MIT License
38
- Classifier: Operating System :: OS Independent
39
- Classifier: Programming Language :: Python :: 3
40
- Classifier: Programming Language :: Python :: 3.9
41
- Classifier: Programming Language :: Python :: 3.10
42
- Classifier: Programming Language :: Python :: 3.11
43
- Classifier: Programming Language :: Python :: 3.12
44
- Classifier: Topic :: Scientific/Engineering
45
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
46
- Requires-Python: >=3.9
47
- Requires-Dist: numpy>=1.20.0
48
- Requires-Dist: pandas>=1.5.0
49
- Provides-Extra: all
50
- Requires-Dist: psutil>=5.9.0; extra == 'all'
51
- Requires-Dist: pyarrow>=10.0.0; extra == 'all'
52
- Provides-Extra: arrow
53
- Requires-Dist: pyarrow>=10.0.0; extra == 'arrow'
54
- Provides-Extra: dev
55
- Requires-Dist: black>=23.0.0; extra == 'dev'
56
- Requires-Dist: pre-commit>=3.5.0; extra == 'dev'
57
- Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
58
- Requires-Dist: pytest>=7.0.0; extra == 'dev'
59
- Requires-Dist: ruff>=0.1.0; extra == 'dev'
60
- Requires-Dist: taskipy>=1.12.0; extra == 'dev'
61
- Provides-Extra: docs
62
- Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
63
- Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
64
- Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
65
- Requires-Dist: pymdown-extensions>=10.0.0; extra == 'docs'
66
- Provides-Extra: memory
67
- Requires-Dist: psutil>=5.9.0; extra == 'memory'
68
- Description-Content-Type: text/markdown
69
-
70
- # TracePipe
71
-
72
- **Row-level data lineage for pandas pipelines.**
73
-
74
- TracePipe automatically tracks what happens to every row and cell in your DataFrame — drops, transformations, merges, and value changes. Zero code changes required.
75
-
76
- [![PyPI version](https://img.shields.io/pypi/v/tracepipe.svg)](https://pypi.org/project/tracepipe/)
77
- [![Python 3.9+](https://img.shields.io/pypi/pyversions/tracepipe.svg)](https://pypi.org/project/tracepipe/)
78
- [![CI](https://github.com/gauthierpiarrette/tracepipe/actions/workflows/ci.yml/badge.svg)](https://github.com/gauthierpiarrette/tracepipe/actions/workflows/ci.yml)
79
- [![codecov](https://codecov.io/gh/gauthierpiarrette/tracepipe/branch/main/graph/badge.svg)](https://codecov.io/gh/gauthierpiarrette/tracepipe)
80
- [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
81
- [![Documentation](https://img.shields.io/badge/docs-mkdocs-blue.svg)](https://gauthierpiarrette.github.io/tracepipe/)
82
-
83
- ---
84
-
85
- ## The Problem
86
-
87
- Data pipelines are black boxes. When something goes wrong, you're left asking:
88
-
89
- - **"Where did row X go?"** — Dropped somewhere, but which step?
90
- - **"Why is this value wrong?"** — It was fine in the source, what changed it?
91
- - **"How did these rows get merged?"** — Which parent records combined?
92
- - **"Why are there nulls here?"** — When did they appear?
93
-
94
- ```python
95
- df = pd.read_csv("customers.csv")
96
- df = df.dropna() # Some rows disappear
97
- df = df.merge(regions, on="zip") # New rows appear, some vanish
98
- df["income"] = df["income"].fillna(0) # Values change silently
99
- df = df[df["age"] >= 18] # More rows gone
100
- # What actually happened to customer C-789?
101
- ```
102
-
103
- Traditional debugging means `print()` statements, manual diffs, and guesswork. **TracePipe gives you the complete audit trail.**
104
-
105
- ---
106
-
107
- ## The Solution
108
-
109
- ```python
110
- import tracepipe as tp
111
- import pandas as pd
112
-
113
- tp.enable(mode="debug", watch=["income", "score"])
114
-
115
- df = pd.read_csv("customers.csv")
116
- df = df.dropna()
117
- df["income"] = df["income"].fillna(0)
118
- df = df.merge(segments, on="customer_id")
119
- df = df[df["age"] >= 18]
120
-
121
- # Pipeline health check
122
- print(tp.check(df))
123
- ```
124
- ```
125
- TracePipe Check: [OK] Pipeline healthy
126
- Mode: debug
127
-
128
- Retention: 847/1000 (84.7%)
129
- Dropped: 153 rows
130
- • DataFrame.dropna: 42
131
- • DataFrame.__getitem__[mask]: 111
132
-
133
- Value changes: 23 cells
134
- • DataFrame.fillna: 23 (income)
135
- ```
136
-
137
- ```python
138
- # Why did this customer's income change?
139
- print(tp.why(df, col="income", where={"customer_id": "C-789"}))
140
- ```
141
- ```
142
- Cell History: row 42, column 'income'
143
- Current value: 0.0
144
- [i] Was null at step 1 (later recovered)
145
- by: DataFrame.fillna
146
-
147
- History (1 change):
148
- None -> 0.0
149
- by: DataFrame.fillna
150
- ```
151
-
152
- **One import. Complete audit trail.**
153
-
154
- ---
155
-
156
- ## Installation
157
-
158
- ```bash
159
- pip install tracepipe
160
- ```
161
-
162
- ---
163
-
164
- ## Quick Start
165
-
166
- ### 1. Enable tracking
167
-
168
- ```python
169
- import tracepipe as tp
170
-
171
- tp.enable(mode="debug", watch=["price", "quantity"]) # Track specific columns
172
- ```
173
-
174
- ### 2. Run your pipeline normally
175
-
176
- ```python
177
- df = pd.DataFrame({
178
- "product": ["A", "B", "C", "D"],
179
- "price": [10.0, None, 30.0, 40.0],
180
- "quantity": [5, 10, 0, 8]
181
- })
182
-
183
- df = df.dropna() # Drops row B
184
- df = df[df["quantity"] > 0] # Drops row C
185
- df["total"] = df["price"] * df["quantity"]
186
- ```
187
-
188
- ### 3. Inspect the lineage
189
-
190
- ```python
191
- # Health check - see drops AND changes
192
- print(tp.check(df))
193
- ```
194
- ```
195
- TracePipe Check: [OK] Pipeline healthy
196
- Mode: debug
197
-
198
- Retention: 2/4 (50.0%)
199
- Dropped: 2 rows
200
- • DataFrame.dropna: 1
201
- • DataFrame.__getitem__[mask]: 1
202
-
203
- Value changes: 2 cells
204
- • DataFrame.__setitem__[total]: 2
205
- ```
206
-
207
- ```python
208
- # Trace a specific row's full journey
209
- print(tp.trace(df, where={"product": "A"}))
210
- ```
211
- ```
212
- Row 0 Journey:
213
- Status: [OK] Alive
214
-
215
- Events: 1
216
- [MODIFIED] DataFrame.__setitem__[total]: total
217
- ```
218
-
219
- ```python
220
- # Explain why a specific cell has its current value
221
- print(tp.why(df, col="total", row=0))
222
- ```
223
- ```
224
- Cell History: row 0, column 'total'
225
- Current value: 50.0
226
-
227
- History (1 change):
228
- None -> 50.0
229
- by: DataFrame.__setitem__[total]
230
- ```
231
-
232
- ---
233
-
234
- ## Key Features
235
-
236
- ### 🔍 Zero-Code Instrumentation
237
-
238
- TracePipe monkey-patches pandas at runtime. Your existing code works unchanged:
239
-
240
- ```python
241
- tp.enable()
242
- # Your existing pipeline runs exactly as before
243
- # TracePipe silently records everything
244
- tp.disable()
245
- ```
246
-
247
- ### 📊 Rich Provenance Data
248
-
249
- Track everything that happens in your pipeline:
250
-
251
- | Question | Answer |
252
- |----------|--------|
253
- | Which rows were dropped? | `tp.check(df)` shows retention by operation |
254
- | Why did this value change? | `tp.why(df, col="amount", row=5)` shows before/after |
255
- | What's this row's history? | `tp.trace(df, row=0)` shows full journey |
256
- | Where did these rows merge from? | Merge parent tracking in debug mode |
257
- | Which rows grouped together? | `tp.debug.inspect().explain_group("A")` |
258
- | When did nulls appear? | `tp.why()` flags null introduction |
259
-
260
- ### 🎯 Business-Key Lookups
261
-
262
- Find rows by their values, not internal IDs:
263
-
264
- ```python
265
- # Find by business key
266
- tp.trace(df, where={"customer_id": "C-12345"})
267
- tp.trace(df, where={"email": "alice@example.com"})
268
-
269
- # Find rows where a column is null
270
- tp.why(df, col="email", where={"email": None})
271
- ```
272
-
273
- ### 📈 Production-Ready Performance
274
-
275
- | Operation | Overhead | Notes |
276
- |-----------|----------|-------|
277
- | Filter (dropna, query) | 1.4-1.9x | Acceptable |
278
- | Transform (fillna, replace) | 1.0-1.2x | Minimal |
279
- | GroupBy | 1.0-1.2x | Minimal |
280
- | Sort | 1.4x | Optimized |
281
- | Scalar access (at/iat) | <1ms added | Fixed overhead |
282
-
283
- Tested on DataFrames up to 1M rows with linear scaling.
284
-
285
- ### 🔒 Safety First
286
-
287
- TracePipe never modifies your data or affects computation results:
288
-
289
- ```python
290
- # Original pandas method ALWAYS runs first
291
- # Lineage capture happens after, and failures are non-fatal
292
- result = df.dropna() # Guaranteed to work, even if tracking fails
293
- ```
294
-
295
- ---
296
-
297
- ## Two Modes
298
-
299
- ### CI Mode (Default)
300
- Lightweight tracking for production pipelines:
301
- - Step counts and retention rates
302
- - Dropped row detection
303
- - Merge mismatch warnings
304
- - **No per-row provenance** (fast)
305
-
306
- ```python
307
- tp.enable(mode="ci")
308
- ```
309
-
310
- ### Debug Mode
311
- Full lineage for development and debugging:
312
- - Complete row-level history
313
- - Cell change tracking with before/after values
314
- - GroupBy membership
315
- - Merge parent tracking
316
-
317
- ```python
318
- tp.enable(mode="debug", watch=["price", "amount"])
319
- ```
320
-
321
- ---
322
-
323
- ## API Reference
324
-
325
- ### Core Functions (5)
326
-
327
- | Function | Purpose |
328
- |----------|---------|
329
- | `tp.enable(mode, watch)` | Start tracking |
330
- | `tp.check(df)` | Health check with retention stats |
331
- | `tp.trace(df, row, where)` | Trace a row's journey |
332
- | `tp.why(df, col, row, where)` | Explain why a cell changed |
333
- | `tp.report(df, path)` | Export HTML report |
334
-
335
- ### Control Functions
336
-
337
- | Function | Purpose |
338
- |----------|---------|
339
- | `tp.disable()` | Stop tracking |
340
- | `tp.reset()` | Clear all lineage data |
341
- | `tp.stage(name)` | Label pipeline stages |
342
-
343
- ### Debug Namespace
344
-
345
- For power users who need raw access:
346
-
347
- ```python
348
- dbg = tp.debug.inspect()
349
- dbg.steps # All recorded operations
350
- dbg.dropped_rows() # Set of dropped row IDs
351
- dbg.explain_row(42) # Raw lineage for row 42
352
- dbg.stats() # Memory and tracking stats
353
- dbg.export("json", "lineage.json")
354
- ```
355
-
356
- ---
357
-
358
- ## Data Quality Contracts
359
-
360
- Validate your pipeline with fluent assertions:
361
-
362
- ```python
363
- result = (tp.contract()
364
- .expect_unique("customer_id")
365
- .expect_no_nulls("email")
366
- .expect_retention(min_rate=0.9)
367
- .check(df))
368
-
369
- result.raise_if_failed() # Raises if any contract is violated
370
- ```
371
-
372
- ---
373
-
374
- ## Snapshots & Diff
375
-
376
- Compare DataFrame states:
377
-
378
- ```python
379
- before = tp.snapshot(df)
380
-
381
- # ... transformations ...
382
-
383
- after = tp.snapshot(df)
384
- diff = tp.diff(before, after)
385
-
386
- print(f"Rows added: {diff.rows_added}")
387
- print(f"Rows removed: {diff.rows_removed}")
388
- print(f"Cells changed: {diff.cells_changed}")
389
- ```
390
-
391
- ---
392
-
393
- ## HTML Reports
394
-
395
- Generate interactive lineage reports:
396
-
397
- ```python
398
- tp.report(df, "pipeline_audit.html")
399
- ```
400
-
401
- Opens a visual dashboard showing:
402
- - Pipeline flow diagram
403
- - Retention funnel
404
- - Dropped rows by operation
405
- - Cell change history
406
-
407
- ---
408
-
409
- ## What's Tracked
410
-
411
- | Operation | Tracking | Completeness |
412
- |-----------|----------|--------------|
413
- | `dropna`, `drop_duplicates` | Dropped row IDs | FULL |
414
- | `query`, `df[mask]` | Dropped row IDs | FULL |
415
- | `head`, `tail`, `sample` | Dropped row IDs | FULL |
416
- | `fillna`, `replace` | Cell diffs (watched cols) | FULL |
417
- | `loc[]=`, `iloc[]=`, `at[]=` | Cell diffs | FULL |
418
- | `merge`, `join` | Parent tracking | FULL |
419
- | `groupby().agg()` | Group membership | FULL |
420
- | `sort_values` | Reorder tracking | FULL |
421
- | `apply`, `pipe` | Output tracked | PARTIAL |
422
-
423
- ---
424
-
425
- ## Limitations
426
-
427
- TracePipe tracks pandas operations, not arbitrary Python code:
428
-
429
- | Limitation | Workaround |
430
- |------------|------------|
431
- | Direct NumPy array modification | Use pandas methods |
432
- | Mutable objects in cells (lists, dicts) | Use immutable types |
433
- | Custom C extensions | Wrap with pandas operations |
434
-
435
- ---
436
-
437
- ## Example: ML Pipeline Audit
438
-
439
- ```python
440
- import tracepipe as tp
441
- import pandas as pd
442
- import numpy as np
443
-
444
- tp.enable(mode="debug", watch=["age", "income", "label"])
445
-
446
- # Load and clean
447
- df = pd.read_csv("training_data.csv")
448
- df = df.dropna(subset=["label"])
449
- df["income"] = df["income"].fillna(df["income"].median())
450
- df = df[df["age"] >= 18]
451
-
452
- # Feature engineering
453
- df["age_bucket"] = pd.cut(df["age"], bins=[18, 30, 50, 100])
454
- df["log_income"] = np.log1p(df["income"])
455
-
456
- # Audit the pipeline
457
- print(tp.check(df))
458
- ```
459
- ```
460
- TracePipe Check: [OK] Pipeline healthy
461
- Mode: debug
462
-
463
- Retention: 8234/10000 (82.3%)
464
- Dropped: 1766 rows
465
- • DataFrame.dropna: 423
466
- • DataFrame.__getitem__[mask]: 1343
467
-
468
- Value changes: 892 cells
469
- • DataFrame.fillna: 892 (income)
470
- ```
471
-
472
- ```python
473
- # Why did this customer's income change?
474
- print(tp.why(df, col="income", where={"customer_id": "C-789"}))
475
- ```
476
- ```
477
- Cell History: row 156, column 'income'
478
- Current value: 45000.0
479
- [i] Was null at step 1 (later recovered)
480
- by: DataFrame.fillna
481
-
482
- History (1 change):
483
- None -> 45000.0
484
- by: DataFrame.fillna
485
- ```
486
-
487
- ```python
488
- # Full journey of a specific row
489
- print(tp.trace(df, where={"customer_id": "C-789"}))
490
- ```
491
- ```
492
- Row 156 Journey:
493
- Status: [OK] Alive
494
-
495
- Events: 3
496
- [MODIFIED] DataFrame.fillna: income
497
- [MODIFIED] pd.cut: age_bucket
498
- [MODIFIED] DataFrame.__setitem__[log_income]: log_income
499
- ```
500
-
501
- ---
502
-
503
- ## Benchmarks
504
-
505
- Run on MacBook Pro M1, pandas 2.0, Python 3.11:
506
-
507
- ### Overhead (10K rows, median of 10 runs)
508
-
509
- | Operation | Baseline | With TracePipe | Overhead |
510
- |-----------|----------|----------------|----------|
511
- | dropna | 0.9ms | 1.7ms | 1.9x |
512
- | query | 2.1ms | 3.0ms | 1.4x |
513
- | fillna | 0.4ms | 0.4ms | 1.0x |
514
- | groupby.sum | 1.2ms | 1.2ms | 1.0x |
515
- | merge | 4.5ms | 12.6ms | 2.8x |
516
- | sort_values | 1.1ms | 1.5ms | 1.4x |
517
-
518
- ### Scale (filter + dropna pipeline)
519
-
520
- | Rows | Time | Throughput |
521
- |------|------|------------|
522
- | 10K | 5ms | 2M rows/sec |
523
- | 100K | 35ms | 2.8M rows/sec |
524
- | 1M | 320ms | 3.1M rows/sec |
525
-
526
- ### Memory
527
-
528
- - Base overhead: ~40 bytes per tracked diff
529
- - Typical pipeline: 2-3x memory vs baseline
530
- - Spillover to disk available for large pipelines
531
-
532
- ---
533
-
534
- ## Documentation
535
-
536
- 📚 **[Full Documentation](https://gauthierpiarrette.github.io/tracepipe/)**
537
-
538
- - [Getting Started](https://gauthierpiarrette.github.io/tracepipe/getting-started/quickstart/)
539
- - [User Guide](https://gauthierpiarrette.github.io/tracepipe/guide/concepts/)
540
- - [API Reference](https://gauthierpiarrette.github.io/tracepipe/api/)
541
- - [Examples](https://gauthierpiarrette.github.io/tracepipe/examples/ml-pipeline/)
542
-
543
- ---
544
-
545
- ## Contributing
546
-
547
- ```bash
548
- git clone https://github.com/gauthierpiarrette/tracepipe.git
549
- cd tracepipe
550
- pip install -e ".[dev]"
551
-
552
- # Run tests
553
- pytest tests/ -v
554
-
555
- # Run linting
556
- ruff check tracepipe/ tests/
557
-
558
- # Run benchmarks
559
- python benchmarks/run_all.py
560
- ```
561
-
562
- See [CONTRIBUTING](https://gauthierpiarrette.github.io/tracepipe/contributing/) for detailed guidelines.
563
-
564
- ---
565
-
566
- ## License
567
-
568
- MIT License. See [LICENSE](LICENSE) for details.
569
-
570
- ---
571
-
572
- <p align="center">
573
- <b>Stop guessing where your rows went.</b><br>
574
- <code>pip install tracepipe</code>
575
- </p>