tracepipe 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +117 -78
- tracepipe/api.py +168 -331
- tracepipe/context.py +21 -1
- tracepipe/contracts.py +473 -0
- tracepipe/convenience.py +812 -0
- tracepipe/core.py +174 -17
- tracepipe/debug.py +325 -0
- tracepipe/instrumentation/apply_capture.py +453 -0
- tracepipe/instrumentation/filter_capture.py +468 -0
- tracepipe/instrumentation/indexer_capture.py +813 -0
- tracepipe/instrumentation/merge_capture.py +434 -0
- tracepipe/instrumentation/pandas_inst.py +66 -183
- tracepipe/instrumentation/series_capture.py +331 -0
- tracepipe/safety.py +3 -3
- tracepipe/snapshot.py +420 -0
- tracepipe/storage/base.py +7 -3
- tracepipe/storage/lineage_store.py +190 -47
- tracepipe/storage/row_identity.py +366 -104
- tracepipe/value_provenance.py +301 -0
- tracepipe/visualization/html_export.py +22 -7
- tracepipe-0.3.0.dist-info/METADATA +575 -0
- tracepipe-0.3.0.dist-info/RECORD +29 -0
- tracepipe-0.2.0.dist-info/METADATA +0 -508
- tracepipe-0.2.0.dist-info/RECORD +0 -19
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/WHEEL +0 -0
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,575 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tracepipe
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Row-level data lineage tracking for pandas pipelines
|
|
5
|
+
Project-URL: Homepage, https://github.com/tracepipe/tracepipe
|
|
6
|
+
Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
|
|
7
|
+
Project-URL: Repository, https://github.com/tracepipe/tracepipe.git
|
|
8
|
+
Project-URL: Issues, https://github.com/tracepipe/tracepipe/issues
|
|
9
|
+
Project-URL: Changelog, https://tracepipe.github.io/tracepipe/changelog/
|
|
10
|
+
Author: Gauthier Piarrette
|
|
11
|
+
License: MIT License
|
|
12
|
+
|
|
13
|
+
Copyright (c) 2026 Gauthier Piarrette
|
|
14
|
+
|
|
15
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
16
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
17
|
+
in the Software without restriction, including without limitation the rights
|
|
18
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
19
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
20
|
+
furnished to do so, subject to the following conditions:
|
|
21
|
+
|
|
22
|
+
The above copyright notice and this permission notice shall be included in all
|
|
23
|
+
copies or substantial portions of the Software.
|
|
24
|
+
|
|
25
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
26
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
27
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
28
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
29
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
30
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
31
|
+
SOFTWARE.
|
|
32
|
+
License-File: LICENSE
|
|
33
|
+
Keywords: data-engineering,data-lineage,data-quality,debugging,observability,pandas
|
|
34
|
+
Classifier: Development Status :: 4 - Beta
|
|
35
|
+
Classifier: Intended Audience :: Developers
|
|
36
|
+
Classifier: Intended Audience :: Science/Research
|
|
37
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
38
|
+
Classifier: Operating System :: OS Independent
|
|
39
|
+
Classifier: Programming Language :: Python :: 3
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
43
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
44
|
+
Classifier: Topic :: Scientific/Engineering
|
|
45
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
46
|
+
Requires-Python: >=3.9
|
|
47
|
+
Requires-Dist: numpy>=1.20.0
|
|
48
|
+
Requires-Dist: pandas>=1.5.0
|
|
49
|
+
Provides-Extra: all
|
|
50
|
+
Requires-Dist: psutil>=5.9.0; extra == 'all'
|
|
51
|
+
Requires-Dist: pyarrow>=10.0.0; extra == 'all'
|
|
52
|
+
Provides-Extra: arrow
|
|
53
|
+
Requires-Dist: pyarrow>=10.0.0; extra == 'arrow'
|
|
54
|
+
Provides-Extra: dev
|
|
55
|
+
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
56
|
+
Requires-Dist: pre-commit>=3.5.0; extra == 'dev'
|
|
57
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
58
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
59
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
60
|
+
Requires-Dist: taskipy>=1.12.0; extra == 'dev'
|
|
61
|
+
Provides-Extra: docs
|
|
62
|
+
Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
|
|
63
|
+
Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
|
|
64
|
+
Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
|
|
65
|
+
Requires-Dist: pymdown-extensions>=10.0.0; extra == 'docs'
|
|
66
|
+
Provides-Extra: memory
|
|
67
|
+
Requires-Dist: psutil>=5.9.0; extra == 'memory'
|
|
68
|
+
Description-Content-Type: text/markdown
|
|
69
|
+
|
|
70
|
+
# TracePipe
|
|
71
|
+
|
|
72
|
+
**Row-level data lineage for pandas pipelines.**
|
|
73
|
+
|
|
74
|
+
TracePipe automatically tracks what happens to every row and cell in your DataFrame — drops, transformations, merges, and value changes. Zero code changes required.
|
|
75
|
+
|
|
76
|
+
[![PyPI version](https://img.shields.io/pypi/v/tracepipe.svg)](https://pypi.org/project/tracepipe/)
|
|
77
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/tracepipe.svg)](https://pypi.org/project/tracepipe/)
|
|
78
|
+
[![CI](https://github.com/gauthierpiarrette/tracepipe/actions/workflows/ci.yml/badge.svg)](https://github.com/gauthierpiarrette/tracepipe/actions/workflows/ci.yml)
|
|
79
|
+
[![Coverage](https://codecov.io/gh/gauthierpiarrette/tracepipe/graph/badge.svg)](https://codecov.io/gh/gauthierpiarrette/tracepipe)
|
|
80
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
81
|
+
[![Documentation](https://img.shields.io/badge/docs-online-blue.svg)](https://gauthierpiarrette.github.io/tracepipe/)
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## The Problem
|
|
86
|
+
|
|
87
|
+
Data pipelines are black boxes. When something goes wrong, you're left asking:
|
|
88
|
+
|
|
89
|
+
- **"Where did row X go?"** — Dropped somewhere, but which step?
|
|
90
|
+
- **"Why is this value wrong?"** — It was fine in the source, what changed it?
|
|
91
|
+
- **"How did these rows get merged?"** — Which parent records combined?
|
|
92
|
+
- **"Why are there nulls here?"** — When did they appear?
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
df = pd.read_csv("customers.csv")
|
|
96
|
+
df = df.dropna() # Some rows disappear
|
|
97
|
+
df = df.merge(regions, on="zip") # New rows appear, some vanish
|
|
98
|
+
df["income"] = df["income"].fillna(0) # Values change silently
|
|
99
|
+
df = df[df["age"] >= 18] # More rows gone
|
|
100
|
+
# What actually happened to customer C-789?
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Traditional debugging means `print()` statements, manual diffs, and guesswork. **TracePipe gives you the complete audit trail.**
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## The Solution
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
import tracepipe as tp
|
|
111
|
+
import pandas as pd
|
|
112
|
+
|
|
113
|
+
tp.enable(mode="debug", watch=["income", "score"])
|
|
114
|
+
|
|
115
|
+
df = pd.read_csv("customers.csv")
|
|
116
|
+
df = df.dropna(subset=["customer_id"])  # Drop only rows missing the key, so null incomes survive to be filled
|
|
117
|
+
df["income"] = df["income"].fillna(0)
|
|
118
|
+
df = df.merge(segments, on="customer_id")
|
|
119
|
+
df = df[df["age"] >= 18]
|
|
120
|
+
|
|
121
|
+
# Pipeline health check
|
|
122
|
+
print(tp.check(df))
|
|
123
|
+
```
|
|
124
|
+
```
|
|
125
|
+
TracePipe Check: [OK] Pipeline healthy
|
|
126
|
+
Mode: debug
|
|
127
|
+
|
|
128
|
+
Retention: 847/1000 (84.7%)
|
|
129
|
+
Dropped: 153 rows
|
|
130
|
+
• DataFrame.dropna: 42
|
|
131
|
+
• DataFrame.__getitem__[mask]: 111
|
|
132
|
+
|
|
133
|
+
Value changes: 23 cells modified
|
|
134
|
+
• DataFrame.fillna: 23 (income)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
# Why did this customer's income change?
|
|
139
|
+
print(tp.why(df, col="income", where={"customer_id": "C-789"}))
|
|
140
|
+
```
|
|
141
|
+
```
|
|
142
|
+
Cell History: row 42, column 'income'
|
|
143
|
+
Current value: 0.0
|
|
144
|
+
[i] Was null at step 1 (later recovered)
|
|
145
|
+
by: DataFrame.fillna
|
|
146
|
+
|
|
147
|
+
History (1 change):
|
|
148
|
+
None -> 0.0
|
|
149
|
+
by: DataFrame.fillna
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
**One import. Complete audit trail.**
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Installation
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
pip install tracepipe
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## Quick Start
|
|
165
|
+
|
|
166
|
+
### 1. Enable tracking
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
import tracepipe as tp
|
|
170
|
+
|
|
171
|
+
tp.enable(mode="debug", watch=["price", "quantity"]) # Track specific columns
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### 2. Run your pipeline normally
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
df = pd.DataFrame({
|
|
178
|
+
"product": ["A", "B", "C", "D"],
|
|
179
|
+
"price": [10.0, None, 30.0, 40.0],
|
|
180
|
+
"quantity": [5, 10, 0, 8]
|
|
181
|
+
})
|
|
182
|
+
|
|
183
|
+
df = df.dropna() # Drops row B
|
|
184
|
+
df = df[df["quantity"] > 0] # Drops row C
|
|
185
|
+
df["total"] = df["price"] * df["quantity"]
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### 3. Inspect the lineage
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
# Health check - see drops AND changes
|
|
192
|
+
print(tp.check(df))
|
|
193
|
+
```
|
|
194
|
+
```
|
|
195
|
+
TracePipe Check: [OK] Pipeline healthy
|
|
196
|
+
Mode: debug
|
|
197
|
+
|
|
198
|
+
Retention: 2/4 (50.0%)
|
|
199
|
+
Dropped: 2 rows
|
|
200
|
+
• DataFrame.dropna: 1
|
|
201
|
+
• DataFrame.__getitem__[mask]: 1
|
|
202
|
+
|
|
203
|
+
Value changes: 2 cells
|
|
204
|
+
• DataFrame.__setitem__[total]: 2
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
# Trace a specific row's full journey
|
|
209
|
+
print(tp.trace(df, where={"product": "A"}))
|
|
210
|
+
```
|
|
211
|
+
```
|
|
212
|
+
Row 0 Journey:
|
|
213
|
+
Status: [OK] Alive
|
|
214
|
+
|
|
215
|
+
Events: 1
|
|
216
|
+
[MODIFIED] DataFrame.__setitem__[total]: total
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
# Explain why a specific cell has its current value
|
|
221
|
+
print(tp.why(df, col="total", row=0))
|
|
222
|
+
```
|
|
223
|
+
```
|
|
224
|
+
Cell History: row 0, column 'total'
|
|
225
|
+
Current value: 50.0
|
|
226
|
+
|
|
227
|
+
History (1 change):
|
|
228
|
+
None -> 50.0
|
|
229
|
+
by: DataFrame.__setitem__[total]
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## Key Features
|
|
235
|
+
|
|
236
|
+
### 🔍 Zero-Code Instrumentation
|
|
237
|
+
|
|
238
|
+
TracePipe monkey-patches pandas at runtime. Your existing code works unchanged:
|
|
239
|
+
|
|
240
|
+
```python
|
|
241
|
+
tp.enable()
|
|
242
|
+
# Your existing pipeline runs exactly as before
|
|
243
|
+
# TracePipe silently records everything
|
|
244
|
+
tp.disable()
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### 📊 Rich Provenance Data
|
|
248
|
+
|
|
249
|
+
Track everything that happens in your pipeline:
|
|
250
|
+
|
|
251
|
+
| Question | Answer |
|
|
252
|
+
|----------|--------|
|
|
253
|
+
| Which rows were dropped? | `tp.check(df)` shows retention by operation |
|
|
254
|
+
| Why did this value change? | `tp.why(df, col="amount", row=5)` shows before/after |
|
|
255
|
+
| What's this row's history? | `tp.trace(df, row=0)` shows full journey |
|
|
256
|
+
| Where did these rows merge from? | Merge parent tracking in debug mode |
|
|
257
|
+
| Which rows grouped together? | `tp.debug.inspect().explain_group("A")` |
|
|
258
|
+
| When did nulls appear? | `tp.why()` flags null introduction |
|
|
259
|
+
|
|
260
|
+
### 🎯 Business-Key Lookups
|
|
261
|
+
|
|
262
|
+
Find rows by their values, not internal IDs:
|
|
263
|
+
|
|
264
|
+
```python
|
|
265
|
+
# Find by business key
|
|
266
|
+
tp.trace(df, where={"customer_id": "C-12345"})
|
|
267
|
+
tp.trace(df, where={"email": "alice@example.com"})
|
|
268
|
+
|
|
269
|
+
# Find rows where a column is null
|
|
270
|
+
tp.why(df, col="email", where={"email": None})
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
### 📈 Production-Ready Performance
|
|
274
|
+
|
|
275
|
+
| Operation | Overhead | Notes |
|
|
276
|
+
|-----------|----------|-------|
|
|
277
|
+
| Filter (dropna, query) | 1.4-1.9x | Acceptable |
|
|
278
|
+
| Transform (fillna, replace) | 1.0-1.2x | Minimal |
|
|
279
|
+
| GroupBy | 1.0-1.2x | Minimal |
|
|
280
|
+
| Sort | 1.4x | Optimized |
|
|
281
|
+
| Scalar access (at/iat) | <1ms added | Fixed overhead |
|
|
282
|
+
|
|
283
|
+
Tested on DataFrames up to 1M rows with linear scaling.
|
|
284
|
+
|
|
285
|
+
### 🔒 Safety First
|
|
286
|
+
|
|
287
|
+
TracePipe never modifies your data or affects computation results:
|
|
288
|
+
|
|
289
|
+
```python
|
|
290
|
+
# Original pandas method ALWAYS runs first
|
|
291
|
+
# Lineage capture happens after, and failures are non-fatal
|
|
292
|
+
result = df.dropna() # Guaranteed to work, even if tracking fails
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
---
|
|
296
|
+
|
|
297
|
+
## Two Modes
|
|
298
|
+
|
|
299
|
+
### CI Mode (Default)
|
|
300
|
+
Lightweight tracking for production pipelines:
|
|
301
|
+
- Step counts and retention rates
|
|
302
|
+
- Dropped row detection
|
|
303
|
+
- Merge mismatch warnings
|
|
304
|
+
- **No per-row provenance** (fast)
|
|
305
|
+
|
|
306
|
+
```python
|
|
307
|
+
tp.enable(mode="ci")
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
### Debug Mode
|
|
311
|
+
Full lineage for development and debugging:
|
|
312
|
+
- Complete row-level history
|
|
313
|
+
- Cell change tracking with before/after values
|
|
314
|
+
- GroupBy membership
|
|
315
|
+
- Merge parent tracking
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
tp.enable(mode="debug", watch=["price", "amount"])
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
---
|
|
322
|
+
|
|
323
|
+
## API Reference
|
|
324
|
+
|
|
325
|
+
### Core Functions (5)
|
|
326
|
+
|
|
327
|
+
| Function | Purpose |
|
|
328
|
+
|----------|---------|
|
|
329
|
+
| `tp.enable(mode, watch)` | Start tracking |
|
|
330
|
+
| `tp.check(df)` | Health check with retention stats |
|
|
331
|
+
| `tp.trace(df, row, where)` | Trace a row's journey |
|
|
332
|
+
| `tp.why(df, col, row, where)` | Explain why a cell changed |
|
|
333
|
+
| `tp.report(df, path)` | Export HTML report |
|
|
334
|
+
|
|
335
|
+
### Control Functions
|
|
336
|
+
|
|
337
|
+
| Function | Purpose |
|
|
338
|
+
|----------|---------|
|
|
339
|
+
| `tp.disable()` | Stop tracking |
|
|
340
|
+
| `tp.reset()` | Clear all lineage data |
|
|
341
|
+
| `tp.stage(name)` | Label pipeline stages |
|
|
342
|
+
|
|
343
|
+
### Debug Namespace
|
|
344
|
+
|
|
345
|
+
For power users who need raw access:
|
|
346
|
+
|
|
347
|
+
```python
|
|
348
|
+
dbg = tp.debug.inspect()
|
|
349
|
+
dbg.steps # All recorded operations
|
|
350
|
+
dbg.dropped_rows() # Set of dropped row IDs
|
|
351
|
+
dbg.explain_row(42) # Raw lineage for row 42
|
|
352
|
+
dbg.stats() # Memory and tracking stats
|
|
353
|
+
dbg.export("json", "lineage.json")
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
---
|
|
357
|
+
|
|
358
|
+
## Data Quality Contracts
|
|
359
|
+
|
|
360
|
+
Validate your pipeline with fluent assertions:
|
|
361
|
+
|
|
362
|
+
```python
|
|
363
|
+
result = (tp.contract()
|
|
364
|
+
.expect_unique("customer_id")
|
|
365
|
+
.expect_no_nulls("email")
|
|
366
|
+
.expect_retention(min_rate=0.9)
|
|
367
|
+
.check(df))
|
|
368
|
+
|
|
369
|
+
result.raise_if_failed() # Raises if any contract violated
|
|
370
|
+
```
|
|
371
|
+
|
|
372
|
+
---
|
|
373
|
+
|
|
374
|
+
## Snapshots & Diff
|
|
375
|
+
|
|
376
|
+
Compare DataFrame states:
|
|
377
|
+
|
|
378
|
+
```python
|
|
379
|
+
before = tp.snapshot(df)
|
|
380
|
+
|
|
381
|
+
# ... transformations ...
|
|
382
|
+
|
|
383
|
+
after = tp.snapshot(df)
|
|
384
|
+
diff = tp.diff(before, after)
|
|
385
|
+
|
|
386
|
+
print(f"Rows added: {diff.rows_added}")
|
|
387
|
+
print(f"Rows removed: {diff.rows_removed}")
|
|
388
|
+
print(f"Cells changed: {diff.cells_changed}")
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
---
|
|
392
|
+
|
|
393
|
+
## HTML Reports
|
|
394
|
+
|
|
395
|
+
Generate interactive lineage reports:
|
|
396
|
+
|
|
397
|
+
```python
|
|
398
|
+
tp.report(df, "pipeline_audit.html")
|
|
399
|
+
```
|
|
400
|
+
|
|
401
|
+
The generated report is a visual dashboard showing:
|
|
402
|
+
- Pipeline flow diagram
|
|
403
|
+
- Retention funnel
|
|
404
|
+
- Dropped rows by operation
|
|
405
|
+
- Cell change history
|
|
406
|
+
|
|
407
|
+
---
|
|
408
|
+
|
|
409
|
+
## What's Tracked
|
|
410
|
+
|
|
411
|
+
| Operation | Tracking | Completeness |
|
|
412
|
+
|-----------|----------|--------------|
|
|
413
|
+
| `dropna`, `drop_duplicates` | Dropped row IDs | FULL |
|
|
414
|
+
| `query`, `df[mask]` | Dropped row IDs | FULL |
|
|
415
|
+
| `head`, `tail`, `sample` | Dropped row IDs | FULL |
|
|
416
|
+
| `fillna`, `replace` | Cell diffs (watched cols) | FULL |
|
|
417
|
+
| `loc[]=`, `iloc[]=`, `at[]=` | Cell diffs | FULL |
|
|
418
|
+
| `merge`, `join` | Parent tracking | FULL |
|
|
419
|
+
| `groupby().agg()` | Group membership | FULL |
|
|
420
|
+
| `sort_values` | Reorder tracking | FULL |
|
|
421
|
+
| `apply`, `pipe` | Output tracked | PARTIAL |
|
|
422
|
+
|
|
423
|
+
---
|
|
424
|
+
|
|
425
|
+
## Limitations
|
|
426
|
+
|
|
427
|
+
TracePipe tracks pandas operations, not arbitrary Python code:
|
|
428
|
+
|
|
429
|
+
| Limitation | Workaround |
|
|
430
|
+
|------------|------------|
|
|
431
|
+
| Direct NumPy array modification | Use pandas methods |
|
|
432
|
+
| Mutable objects in cells (lists, dicts) | Use immutable types |
|
|
433
|
+
| Custom C extensions | Wrap with pandas operations |
|
|
434
|
+
|
|
435
|
+
---
|
|
436
|
+
|
|
437
|
+
## Example: ML Pipeline Audit
|
|
438
|
+
|
|
439
|
+
```python
|
|
440
|
+
import tracepipe as tp
|
|
441
|
+
import pandas as pd
|
|
442
|
+
import numpy as np
|
|
443
|
+
|
|
444
|
+
tp.enable(mode="debug", watch=["age", "income", "label"])
|
|
445
|
+
|
|
446
|
+
# Load and clean
|
|
447
|
+
df = pd.read_csv("training_data.csv")
|
|
448
|
+
df = df.dropna(subset=["label"])
|
|
449
|
+
df["income"] = df["income"].fillna(df["income"].median())
|
|
450
|
+
df = df[df["age"] >= 18]
|
|
451
|
+
|
|
452
|
+
# Feature engineering
|
|
453
|
+
df["age_bucket"] = pd.cut(df["age"], bins=[18, 30, 50, 100], include_lowest=True)  # include_lowest keeps age == 18 in the first bucket
|
|
454
|
+
df["log_income"] = np.log1p(df["income"])
|
|
455
|
+
|
|
456
|
+
# Audit the pipeline
|
|
457
|
+
print(tp.check(df))
|
|
458
|
+
```
|
|
459
|
+
```
|
|
460
|
+
TracePipe Check: [OK] Pipeline healthy
|
|
461
|
+
Mode: debug
|
|
462
|
+
|
|
463
|
+
Retention: 8234/10000 (82.3%)
|
|
464
|
+
Dropped: 1766 rows
|
|
465
|
+
• DataFrame.dropna: 423
|
|
466
|
+
• DataFrame.__getitem__[mask]: 1343
|
|
467
|
+
|
|
468
|
+
Value changes: 892 cells
|
|
469
|
+
• DataFrame.fillna: 892 (income)
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
```python
|
|
473
|
+
# Where did this customer's income value come from?
|
|
474
|
+
print(tp.why(df, col="income", where={"customer_id": "C-789"}))
|
|
475
|
+
```
|
|
476
|
+
```
|
|
477
|
+
Cell History: row 156, column 'income'
|
|
478
|
+
Current value: 45000.0
|
|
479
|
+
[i] Was null at step 1 (later recovered)
|
|
480
|
+
by: DataFrame.fillna
|
|
481
|
+
|
|
482
|
+
History (1 change):
|
|
483
|
+
None -> 45000.0
|
|
484
|
+
by: DataFrame.fillna
|
|
485
|
+
```
|
|
486
|
+
|
|
487
|
+
```python
|
|
488
|
+
# Full journey of a specific row
|
|
489
|
+
print(tp.trace(df, where={"customer_id": "C-789"}))
|
|
490
|
+
```
|
|
491
|
+
```
|
|
492
|
+
Row 156 Journey:
|
|
493
|
+
Status: [OK] Alive
|
|
494
|
+
|
|
495
|
+
Events: 3
|
|
496
|
+
[MODIFIED] DataFrame.fillna: income
|
|
497
|
+
[MODIFIED] pd.cut: age_bucket
|
|
498
|
+
[MODIFIED] DataFrame.__setitem__[log_income]: log_income
|
|
499
|
+
```
|
|
500
|
+
|
|
501
|
+
---
|
|
502
|
+
|
|
503
|
+
## Benchmarks
|
|
504
|
+
|
|
505
|
+
Run on MacBook Pro M1, pandas 2.0, Python 3.11:
|
|
506
|
+
|
|
507
|
+
### Overhead (10K rows, median of 10 runs)
|
|
508
|
+
|
|
509
|
+
| Operation | Baseline | With TracePipe | Overhead |
|
|
510
|
+
|-----------|----------|----------------|----------|
|
|
511
|
+
| dropna | 0.9ms | 1.7ms | 1.9x |
|
|
512
|
+
| query | 2.1ms | 3.0ms | 1.4x |
|
|
513
|
+
| fillna | 0.4ms | 0.4ms | 1.0x |
|
|
514
|
+
| groupby.sum | 1.2ms | 1.2ms | 1.0x |
|
|
515
|
+
| merge | 4.5ms | 12.6ms | 2.8x |
|
|
516
|
+
| sort_values | 1.1ms | 1.5ms | 1.4x |
|
|
517
|
+
|
|
518
|
+
### Scale (filter + dropna pipeline)
|
|
519
|
+
|
|
520
|
+
| Rows | Time | Throughput |
|
|
521
|
+
|------|------|------------|
|
|
522
|
+
| 10K | 5ms | 2M rows/sec |
|
|
523
|
+
| 100K | 35ms | 2.8M rows/sec |
|
|
524
|
+
| 1M | 320ms | 3.1M rows/sec |
|
|
525
|
+
|
|
526
|
+
### Memory
|
|
527
|
+
|
|
528
|
+
- Base overhead: ~40 bytes per tracked diff
|
|
529
|
+
- Typical pipeline: 2-3x memory vs baseline
|
|
530
|
+
- Spillover to disk available for large pipelines
|
|
531
|
+
|
|
532
|
+
---
|
|
533
|
+
|
|
534
|
+
## Documentation
|
|
535
|
+
|
|
536
|
+
📚 **[Full Documentation](https://gauthierpiarrette.github.io/tracepipe/)**
|
|
537
|
+
|
|
538
|
+
- [Getting Started](https://gauthierpiarrette.github.io/tracepipe/getting-started/quickstart/)
|
|
539
|
+
- [User Guide](https://gauthierpiarrette.github.io/tracepipe/guide/concepts/)
|
|
540
|
+
- [API Reference](https://gauthierpiarrette.github.io/tracepipe/api/)
|
|
541
|
+
- [Examples](https://gauthierpiarrette.github.io/tracepipe/examples/ml-pipeline/)
|
|
542
|
+
|
|
543
|
+
---
|
|
544
|
+
|
|
545
|
+
## Contributing
|
|
546
|
+
|
|
547
|
+
```bash
|
|
548
|
+
git clone https://github.com/gauthierpiarrette/tracepipe.git
|
|
549
|
+
cd tracepipe
|
|
550
|
+
pip install -e ".[dev]"
|
|
551
|
+
|
|
552
|
+
# Run tests
|
|
553
|
+
pytest tests/ -v
|
|
554
|
+
|
|
555
|
+
# Run linting
|
|
556
|
+
ruff check tracepipe/ tests/
|
|
557
|
+
|
|
558
|
+
# Run benchmarks
|
|
559
|
+
python benchmarks/run_all.py
|
|
560
|
+
```
|
|
561
|
+
|
|
562
|
+
See [CONTRIBUTING](https://gauthierpiarrette.github.io/tracepipe/contributing/) for detailed guidelines.
|
|
563
|
+
|
|
564
|
+
---
|
|
565
|
+
|
|
566
|
+
## License
|
|
567
|
+
|
|
568
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
|
569
|
+
|
|
570
|
+
---
|
|
571
|
+
|
|
572
|
+
<p align="center">
|
|
573
|
+
<b>Stop guessing where your rows went.</b><br>
|
|
574
|
+
<code>pip install tracepipe</code>
|
|
575
|
+
</p>
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
tracepipe/__init__.py,sha256=ZO6-yKMpguohwQLSRovuJoakb7kN1ZveSBwlGwhC-ho,3342
|
|
2
|
+
tracepipe/api.py,sha256=KFO0NYRaGqRevbNyFSCFK4ryhFwdixFtUnTeNabwb6o,11862
|
|
3
|
+
tracepipe/context.py,sha256=_povLpqa5wd_ESHt5hbSmWTSMTF3nUfeutEQo4RMK2E,3856
|
|
4
|
+
tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
|
|
5
|
+
tracepipe/convenience.py,sha256=9F4rLx7AGWwNPKhuJMZD-6PG-QiZq0_mzfmnoU28x6U,26036
|
|
6
|
+
tracepipe/core.py,sha256=kAXks694rR0Z4tD7Gyty0TyJGWx2whsSdteYYpHuazo,8010
|
|
7
|
+
tracepipe/debug.py,sha256=6t2GKVZLwn7SJLhrStE9qsmTiVIHATTE3jJPQ2DYtnc,10140
|
|
8
|
+
tracepipe/safety.py,sha256=jTBZv4QGDJfnZETsSZeMKbdOUtGXk-_XkmllhnGWM-M,5537
|
|
9
|
+
tracepipe/snapshot.py,sha256=OLREzE1_LkWITluG_Bqeb7Y4pAKb8Lb3zJEF3cxnloU,13967
|
|
10
|
+
tracepipe/value_provenance.py,sha256=cCNDvMduYiFkTzfam5EpBNZI54RL4OtMLP6xNaM00ec,9092
|
|
11
|
+
tracepipe/instrumentation/__init__.py,sha256=pd0n6Z9m_V3gcBv097cXWFOZEzAP9sAq1jjQnNRrDZ8,222
|
|
12
|
+
tracepipe/instrumentation/apply_capture.py,sha256=cMThWzNXqWQENuMrCGTne1hO6fqaQFV7zJYNpsPTW4w,14463
|
|
13
|
+
tracepipe/instrumentation/filter_capture.py,sha256=onlYLU5bBZSM3WmxM2AFHfktnlx7ReG-brEn5eZ_N10,15830
|
|
14
|
+
tracepipe/instrumentation/indexer_capture.py,sha256=1ATCeJ-uNA1uGiSbgnUx0wdVsIlZGHeUBaFJPXgFQNg,28440
|
|
15
|
+
tracepipe/instrumentation/merge_capture.py,sha256=Eze-PTrn7IXxZRZBYX9R13mOY3diWKAkjp4z-wa1tEk,13349
|
|
16
|
+
tracepipe/instrumentation/pandas_inst.py,sha256=2YSoju9ml2PjLOYzsx8MHH1iqhjgnXHbIidnF0JDpaY,29546
|
|
17
|
+
tracepipe/instrumentation/series_capture.py,sha256=N1Cf-pQDh23qQLLd8DNsxbcaD-91sTJkRd5AnccKZGE,10649
|
|
18
|
+
tracepipe/storage/__init__.py,sha256=pGFMfbIgIi2kofVPwYDqe2HTYMYJoabiGjTq77pYi-g,348
|
|
19
|
+
tracepipe/storage/base.py,sha256=7DV_-rp37DjBMr9B1w85hLVYhC8OQShk2PcEhT-n4tE,4894
|
|
20
|
+
tracepipe/storage/lineage_store.py,sha256=KPN-OZOgkZeiIptodQst-Obp9krcuE7Erpc9NX53jKw,25148
|
|
21
|
+
tracepipe/storage/row_identity.py,sha256=HBU0gTTJlFtFTcAdUCKuX-c9cHa0lo3CDIodDPDgOzA,17161
|
|
22
|
+
tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,182
|
|
23
|
+
tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
|
|
24
|
+
tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
|
|
25
|
+
tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
|
|
26
|
+
tracepipe-0.3.0.dist-info/METADATA,sha256=oEiGG2V8ya2J3ZKYU_oAfLIqYrZdgwqBRaKup44U-Uw,15478
|
|
27
|
+
tracepipe-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
28
|
+
tracepipe-0.3.0.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
|
|
29
|
+
tracepipe-0.3.0.dist-info/RECORD,,
|