tracepipe 0.3.5__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +1 -1
- tracepipe/convenience.py +261 -19
- tracepipe/core.py +79 -0
- tracepipe/debug.py +40 -0
- tracepipe/instrumentation/filter_capture.py +103 -1
- tracepipe/instrumentation/merge_capture.py +169 -23
- tracepipe/snapshot.py +87 -2
- tracepipe/storage/lineage_store.py +92 -7
- {tracepipe-0.3.5.dist-info → tracepipe-0.4.2.dist-info}/METADATA +6 -9
- {tracepipe-0.3.5.dist-info → tracepipe-0.4.2.dist-info}/RECORD +12 -12
- {tracepipe-0.3.5.dist-info → tracepipe-0.4.2.dist-info}/WHEEL +0 -0
- {tracepipe-0.3.5.dist-info → tracepipe-0.4.2.dist-info}/licenses/LICENSE +0 -0
tracepipe/__init__.py
CHANGED
tracepipe/convenience.py
CHANGED

@@ -54,6 +54,14 @@ class CheckResult:
 
     Separates FACTS (observed, high confidence) from HEURISTICS (inferred).
     .ok is True only if there are no FACT-level warnings.
+
+    Key properties for quick access:
+        .passed - Alias for .ok (common naming convention)
+        .retention - Row retention rate (0.0-1.0)
+        .n_dropped - Total rows dropped
+        .drops_by_op - Drops broken down by operation
+        .n_changes - Total cell-level changes (debug mode only)
+        .changes_by_op - Changes broken down by operation (debug mode only)
     """
 
     ok: bool

@@ -61,6 +69,50 @@
     facts: dict[str, Any]
     suggestions: list[str]
     mode: str
+    # Internal: store drops_by_op so we don't need to recompute
+    _drops_by_op: dict[str, int] = field(default_factory=dict)
+    # Internal: store cell change counts (debug mode only)
+    _n_changes: int = 0
+    _changes_by_op: dict[str, int] = field(default_factory=dict)
+
+    # === CONVENIENCE PROPERTIES ===
+
+    @property
+    def passed(self) -> bool:
+        """Alias for .ok (matches common naming convention)."""
+        return self.ok
+
+    @property
+    def retention(self) -> float | None:
+        """Row retention rate (0.0-1.0), or None if not computed."""
+        return self.facts.get("retention_rate")
+
+    @property
+    def n_dropped(self) -> int:
+        """Total number of rows dropped."""
+        return self.facts.get("rows_dropped", 0)
+
+    @property
+    def drops_by_op(self) -> dict[str, int]:
+        """Drops broken down by operation name."""
+        return self._drops_by_op
+
+    @property
+    def n_steps(self) -> int:
+        """Total pipeline steps recorded."""
+        return self.facts.get("total_steps", 0)
+
+    @property
+    def n_changes(self) -> int:
+        """Total cell-level changes (debug mode only, 0 if not tracked)."""
+        return self._n_changes
+
+    @property
+    def changes_by_op(self) -> dict[str, int]:
+        """Cell changes broken down by operation (debug mode only)."""
+        return self._changes_by_op
+
+    # === EXISTING PROPERTIES ===
 
     @property
     def has_warnings(self) -> bool:

@@ -90,6 +142,20 @@ class CheckResult:
         lines.append(f"TracePipe Check: {status}")
         lines.append(f" Mode: {self.mode}")
 
+        # Always show key metrics in compact form
+        if self.retention is not None:
+            lines.append(f"\nRetention: {int(self.retention * 100)}%")
+        if self.n_dropped > 0:
+            lines.append(f"Dropped: {self.n_dropped} rows")
+        if self.drops_by_op:
+            for op, count in list(self.drops_by_op.items())[:5]:
+                lines.append(f" • {op}: {count}")
+        if self.n_changes > 0:
+            lines.append(f"\nValue changes: {self.n_changes} cells")
+        if self.changes_by_op:
+            for op, count in list(self.changes_by_op.items())[:5]:
+                lines.append(f" • {op}: {count}")
+
         if verbose and self.facts:
             lines.append("\n Measured facts:")
             for k, v in self.facts.items():

@@ -115,7 +181,14 @@ class CheckResult:
         """Export to dictionary."""
         return {
             "ok": self.ok,
+            "passed": self.passed,
             "mode": self.mode,
+            "retention": self.retention,
+            "n_dropped": self.n_dropped,
+            "n_steps": self.n_steps,
+            "drops_by_op": self.drops_by_op,
+            "n_changes": self.n_changes,
+            "changes_by_op": self.changes_by_op,
             "facts": self.facts,
             "suggestions": self.suggestions,
             "warnings": [

@@ -147,6 +220,11 @@ class TraceResult:
 
     Answers: "What happened to this row?"
     Events are in CHRONOLOGICAL order (oldest->newest).
+
+    Key attributes:
+        status: "alive" or "dropped" (string representation)
+        origin: Where this row came from (concat, merge, or original)
+        representative: If dropped by dedup, which row was kept instead
     """
 
     row_id: int

@@ -158,22 +236,93 @@ class TraceResult:
     # Mode enforcement
     supported: bool = True
     unsupported_reason: str | None = None
+    # v0.4+ provenance
+    concat_origin: dict[str, Any] | None = None
+    dedup_representative: dict[str, Any] | None = None
+    # Steps this row survived (for SURVIVED event generation)
+    _survived_steps: list[dict[str, Any]] = field(default_factory=list)
+
+    @property
+    def status(self) -> str:
+        """Row status as string: 'alive' or 'dropped'."""
+        return "alive" if self.is_alive else "dropped"
+
+    @property
+    def dropped_by(self) -> str | None:
+        """Operation that dropped this row, or None if alive."""
+        if self.dropped_at:
+            return self.dropped_at.get("operation")
+        return None
+
+    @property
+    def dropped_at_step(self) -> int | None:
+        """Step number where this row was dropped, or None if alive."""
+        if self.dropped_at:
+            return self.dropped_at.get("step_id")
+        return None
 
     @property
     def n_events(self) -> int:
         return len(self.events)
 
+    @property
+    def origin(self) -> dict[str, Any] | None:
+        """
+        Unified origin info: where did this row come from?
+
+        Returns dict with 'type' key:
+        - {"type": "concat", "source_df": 1, "step_id": 5}
+        - {"type": "merge", "left_parent": 10, "right_parent": 20, "step_id": 3}
+        - None if original row (not from concat/merge)
+        """
+        if self.concat_origin:
+            return {
+                "type": "concat",
+                "source_df": self.concat_origin.get("source_index"),
+                "step_id": self.concat_origin.get("step_id"),
+            }
+        if self.merge_origin:
+            return {
+                "type": "merge",
+                "left_parent": self.merge_origin.get("left_parent"),
+                "right_parent": self.merge_origin.get("right_parent"),
+                "step_id": self.merge_origin.get("step_id"),
+            }
+        return None
+
+    @property
+    def representative(self) -> dict[str, Any] | None:
+        """
+        If dropped by drop_duplicates, which row was kept instead?
+
+        Returns:
+            {"kept_rid": 42, "subset": ["key"], "keep": "first"} or None
+            kept_rid is None if keep=False (all duplicates dropped)
+        """
+        if not self.dedup_representative:
+            return None
+        return {
+            "kept_rid": self.dedup_representative.get("kept_rid"),
+            "subset": self.dedup_representative.get("subset_columns"),
+            "keep": self.dedup_representative.get("keep_strategy"),
+        }
+
     def to_dict(self) -> dict:
         """Export to dictionary."""
         return {
             "row_id": self.row_id,
+            "status": self.status,
             "is_alive": self.is_alive,
             "dropped_at": self.dropped_at,
-            "
+            "dropped_by": self.dropped_at.get("operation") if self.dropped_at else None,
+            "origin": self.origin,
+            "representative": self.representative,
             "n_events": self.n_events,
             "events": self.events,
             "ghost_values": self.ghost_values,
             "supported": self.supported,
+            # Keep legacy fields for backwards compatibility
+            "merge_origin": self.merge_origin,
         }
 
     def __repr__(self) -> str:

@@ -186,19 +335,38 @@ class TraceResult:
 
         lines = [f"Row {self.row_id} Journey:"]
 
+        # Status line matches documentation format
         if self.is_alive:
             lines.append(" Status: [OK] Alive")
         else:
-            lines.append(" Status: [
+            lines.append(" Status: [DROPPED]")
             if self.dropped_at:
                 lines.append(
                     f" at step {self.dropped_at['step_id']}: {self.dropped_at['operation']}"
                 )
 
-
-
-
-
+        # Display unified origin info
+        origin = self.origin
+        if origin:
+            if origin["type"] == "merge":
+                left = origin.get("left_parent", "?")
+                right = origin.get("right_parent", "?")
+                lines.append(f" Origin: merge of row {left} (left) + row {right} (right)")
+            elif origin["type"] == "concat":
+                src = origin.get("source_df", "?")
+                lines.append(f" Origin: concat from DataFrame #{src}")
+
+        # Display dedup representative if dropped by dedup
+        if self.representative:
+            kept = self.representative.get("kept_rid")
+            subset = self.representative.get("subset")
+            keep = self.representative.get("keep", "first")
+            if kept is not None:
+                subset_str = f" (key: {subset})" if subset else ""
+                lines.append(f" Replaced by: row {kept}{subset_str} [keep={keep}]")
+            else:
+                subset_str = f" on {subset}" if subset else ""
+                lines.append(f" Dropped: all duplicates removed{subset_str} [keep=False]")
 
         if len(self.events) == 0:
             lines.append("\n Events: 0 (no changes to watched columns)")

@@ -462,11 +630,26 @@ def check(
         )
     )
 
-
-    for op, count in
+    drops_by_op = ctx.store.get_dropped_by_step()
+    for op, count in drops_by_op.items():
         if count > 1000:
             suggestions.append(f"'{op}' dropped {count} rows - review if intentional")
 
+    # === CELL CHANGES (debug mode only) ===
+    n_changes = 0
+    changes_by_op: dict[str, int] = {}
+    if ctx.config.mode == TracePipeMode.DEBUG:
+        # Count non-drop diffs (cell-level changes)
+        step_map = {s.step_id: s.operation for s in ctx.store.steps}
+        for i in range(len(ctx.store.diff_step_ids)):
+            col = ctx.store.diff_cols[i]
+            if col != "__row__":  # Skip drop events
+                n_changes += 1
+                step_id = ctx.store.diff_step_ids[i]
+                op = step_map.get(step_id, "unknown")
+                changes_by_op[op] = changes_by_op.get(op, 0) + 1
+    facts["n_changes"] = n_changes
+
     ok = len([w for w in warnings_list if w.severity == "fact"]) == 0
 
     return CheckResult(

@@ -475,6 +658,9 @@ def check(
         facts=facts,
         suggestions=suggestions,
         mode=ctx.config.mode.value,
+        _drops_by_op=drops_by_op,
+        _n_changes=n_changes,
+        _changes_by_op=changes_by_op,
     )
 
 
@@ -482,6 +668,7 @@ def trace(
     df: pd.DataFrame,
     *,
     row: int | None = None,
+    row_id: int | None = None,
     where: dict[str, Any] | None = None,
     include_ghost: bool = True,
 ) -> TraceResult | list[TraceResult]:

@@ -490,7 +677,8 @@
 
     Args:
         df: DataFrame to search in
-        row: Row
+        row: Row position (0-based index into current DataFrame)
+        row_id: Internal row ID (use for tracing dropped rows)
        where: Selector dict, e.g. {"customer_id": "C123"}
        include_ghost: Include last-known values for dropped rows
 
@@ -499,8 +687,14 @@
     Use print(result) for pretty output, result.to_dict() for data.
 
     Examples:
-
-
+        # Trace by position in current DataFrame
+        result = tp.trace(df, row=0)  # First row
+
+        # Trace by internal row ID (for dropped rows)
+        dropped = tp.debug.inspect().dropped_rows()
+        result = tp.trace(df, row_id=dropped[0])
+
+        # Trace by business key
         tp.trace(df, where={"customer_id": "C123"})
     """
     ctx = get_context()

@@ -511,12 +705,30 @@
         pass
 
     # Resolve row IDs
-    if
-
+    if row_id is not None:
+        # Direct row ID specified - use as-is
+        row_ids = [row_id]
+    elif row is not None:
+        # row= is a DataFrame index position (0-based), not a row ID
+        # Convert to actual row ID using the DataFrame's registered IDs
+        rids = ctx.row_manager.get_ids_array(df)
+        if rids is not None:
+            # Handle negative indexing
+            if row < 0:
+                row = len(rids) + row
+            if 0 <= row < len(rids):
+                row_ids = [int(rids[row])]
+            else:
+                raise ValueError(
+                    f"Row index {row} out of bounds for DataFrame with {len(rids)} rows"
+                )
+        else:
+            # DataFrame not tracked - use row as-is (legacy behavior)
+            row_ids = [row]
     elif where is not None:
         row_ids = _resolve_where(df, where, ctx)
     else:
-        raise ValueError("Must provide 'row' or 'where'")
+        raise ValueError("Must provide 'row', 'row_id', or 'where'")
 
     results = []
     for rid in row_ids:

@@ -531,6 +743,7 @@ def why(
     *,
     col: str,
     row: int | None = None,
+    row_id: int | None = None,
     where: dict[str, Any] | None = None,
 ) -> WhyResult | list[WhyResult]:
     """

@@ -539,7 +752,8 @@
     Args:
         df: DataFrame to search in
         col: Column name to trace
-        row: Row
+        row: Row position (0-based index into current DataFrame)
+        row_id: Internal row ID (use for cells in dropped rows)
         where: Selector dict, e.g. {"customer_id": "C123"}
 
     Returns:

@@ -547,7 +761,7 @@
     Use print(result) for pretty output, result.to_dict() for data.
 
     Examples:
-        result = tp.why(df, col="amount", row=
+        result = tp.why(df, col="amount", row=0)  # First row
         print(result)
         tp.why(df, col="email", where={"user_id": "U123"})
     """

@@ -563,12 +777,30 @@
         )
 
     # Resolve row IDs
-    if
-
+    if row_id is not None:
+        # Direct row ID specified - use as-is
+        row_ids = [row_id]
+    elif row is not None:
+        # row= is a DataFrame index position (0-based), not a row ID
+        # Convert to actual row ID using the DataFrame's registered IDs
+        rids = ctx.row_manager.get_ids_array(df)
+        if rids is not None:
+            # Handle negative indexing
+            if row < 0:
+                row = len(rids) + row
+            if 0 <= row < len(rids):
+                row_ids = [int(rids[row])]
+            else:
+                raise ValueError(
+                    f"Row index {row} out of bounds for DataFrame with {len(rids)} rows"
+                )
+        else:
+            # DataFrame not tracked - use row as-is (legacy behavior)
+            row_ids = [row]
     elif where is not None:
         row_ids = _resolve_where(df, where, ctx)
     else:
-        raise ValueError("Must provide 'row' or 'where'")
+        raise ValueError("Must provide 'row', 'row_id', or 'where'")
 
     results = []
     for rid in row_ids:

@@ -787,6 +1019,14 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
     drop_event = store.get_drop_event(row_id)
     merge_origin = store.get_merge_origin(row_id)
 
+    # v0.4+ provenance: concat origin and dedup representative
+    concat_origin = None
+    dedup_representative = None
+    if hasattr(store, "get_concat_origin"):
+        concat_origin = store.get_concat_origin(row_id)
+    if hasattr(store, "get_duplicate_representative"):
+        dedup_representative = store.get_duplicate_representative(row_id)
+
     # Use lineage-aware history to include pre-merge parent events
     if hasattr(store, "get_row_history_with_lineage"):
         history = store.get_row_history_with_lineage(row_id)

@@ -823,6 +1063,8 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
         merge_origin=merge_origin,
         events=history,
         ghost_values=ghost_values,
+        concat_origin=concat_origin,
+        dedup_representative=dedup_representative,
     )
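The convenience.py changes above add summary properties to CheckResult and a row_id selector to trace()/why(). The following is an illustrative sketch of how the 0.4.2 surface could be exercised, not part of the diff; the DataFrame, column names, and the assumption that check() accepts the final DataFrame are mine (the exact check() signature is not shown here):

    import pandas as pd
    import tracepipe as tp

    tp.enable(mode="debug")  # debug mode also populates n_changes / changes_by_op

    df = pd.DataFrame({"customer_id": ["C1", "C2", "C2"], "amount": [10.0, None, 7.5]})
    df = df.dropna(subset=["amount"]).drop_duplicates(subset=["customer_id"])

    result = tp.check(df)
    print(result.passed, result.retention, result.n_dropped)  # new CheckResult aliases
    print(result.drops_by_op)                                 # drops broken down by operation

    # Trace by position, by business key, or (new in 0.4) by internal row ID
    alive = tp.trace(df, row=0)
    dropped_ids = list(tp.debug.inspect().dropped_rows())
    if dropped_ids:
        gone = tp.trace(df, row_id=dropped_ids[0])
        print(gone.status, gone.dropped_by, gone.representative)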
tracepipe/core.py
CHANGED

@@ -277,3 +277,82 @@ class MergeStats:
     left_dup_rate: float  # -1 if not computed
     right_dup_rate: float  # -1 if not computed
     how: str
+
+
+@dataclass
+class ConcatMapping:
+    """
+    Mapping for pd.concat operations preserving row lineage.
+
+    For axis=0 concat, each result row comes from exactly one source DataFrame.
+    Arrays are stored in both positional order (for "explain row i") and
+    sorted order (for O(log n) RID lookup).
+
+    Invariants:
+    - out_rids and source_indices have same length
+    - out_rids_sorted and out_pos_sorted are always paired (both set or both None)
+    - out_rids_sorted is monotonically increasing
+    """
+
+    step_id: int
+
+    # Positional arrays (match result row order)
+    out_rids: Any  # numpy array, len = len(result)
+    source_indices: Any  # numpy array, which source DF (0, 1, 2...) each row came from
+
+    # Sorted arrays (for O(log n) lookup by RID)
+    out_rids_sorted: Any  # numpy array, SORTED
+    out_pos_sorted: Any  # numpy array, original positions aligned with out_rids_sorted
+
+    # Metadata
+    source_shapes: list[tuple] = field(default_factory=list)
+
+    def __post_init__(self):
+        """Validate invariants."""
+        import numpy as np
+
+        if self.out_rids_sorted is not None and self.out_pos_sorted is not None:
+            if len(self.out_rids_sorted) != len(self.out_pos_sorted):
+                raise ValueError("out_rids_sorted and out_pos_sorted must have same length")
+            # Verify monotonic (debug check)
+            if len(self.out_rids_sorted) > 1:
+                assert np.all(
+                    self.out_rids_sorted[:-1] <= self.out_rids_sorted[1:]
+                ), "out_rids_sorted must be monotonically increasing"
+
+
+@dataclass
+class DuplicateDropMapping:
+    """
+    Mapping for drop_duplicates provenance (debug mode only).
+
+    Tracks which rows were dropped and which "representative" row they lost to.
+    Arrays are sorted by dropped_rids for O(log n) lookup.
+
+    For keep='first': dropped rows map to first occurrence
+    For keep='last': dropped rows map to last occurrence
+    For keep=False: dropped rows have kept_rids=-1 (no representative)
+    """
+
+    step_id: int
+
+    # Sorted arrays for O(log n) lookup
+    dropped_rids: Any  # numpy array, SORTED dropped row IDs
+    kept_rids: Any  # numpy array, representative RID for each dropped row (-1 if none)
+
+    # Metadata
+    subset_columns: Optional[tuple[str, ...]] = None
+    keep_strategy: str = "first"
+
+    def __post_init__(self):
+        """Validate invariants."""
+        import numpy as np
+
+        if self.dropped_rids is not None and self.kept_rids is not None:
+            if len(self.dropped_rids) != len(self.kept_rids):
+                raise ValueError("dropped_rids and kept_rids must have same length")
+            # Verify sorted
+            if len(self.dropped_rids) > 1:
+                assert np.all(
+                    self.dropped_rids[:-1] <= self.dropped_rids[1:]
+                ), "dropped_rids must be sorted"
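Both new dataclasses keep a RID-sorted copy of their arrays so that later lookups can binary-search instead of scanning. A self-contained toy sketch of that lookup pattern with numpy.searchsorted (the array values are made up; this is not TracePipe code):

    import numpy as np

    # Sorted dropped row IDs and, aligned with them, the representative each lost to
    dropped_rids = np.array([3, 7, 12, 40], dtype=np.int64)
    kept_rids = np.array([1, 1, 10, 38], dtype=np.int64)

    def representative_for(rid):
        # O(log n): locate rid in the sorted array, then read the aligned kept RID
        i = np.searchsorted(dropped_rids, rid)
        if i < len(dropped_rids) and dropped_rids[i] == rid:
            return int(kept_rids[i])
        return None

    assert representative_for(12) == 10
    assert representative_for(5) is None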
tracepipe/debug.py
CHANGED

@@ -179,6 +179,46 @@ class DebugInspector:
         ctx = get_context()
         return ctx.row_manager.get_ghost_rows(limit=limit)
 
+    def get_ghost_values(self, row_id: int) -> dict[str, Any] | None:
+        """
+        Get last-known values for a specific dropped row (DEBUG mode only).
+
+        Args:
+            row_id: The row ID to look up
+
+        Returns:
+            Dict mapping column names to their last known values,
+            or None if the row was not found in ghost storage.
+
+        Example:
+            dbg = tp.debug.inspect()
+            dropped_rid = list(dbg.dropped_rows())[0]
+            ghost = dbg.get_ghost_values(dropped_rid)
+            print(f"Last known values: {ghost}")
+        """
+        ctx = get_context()
+        ghost_df = ctx.row_manager.get_ghost_rows(limit=100000)
+
+        if ghost_df.empty or "__tp_row_id__" not in ghost_df.columns:
+            return None
+
+        row_match = ghost_df[ghost_df["__tp_row_id__"] == row_id]
+        if row_match.empty:
+            return None
+
+        # Convert to dict and remove internal columns
+        result = row_match.iloc[0].to_dict()
+        internal_cols = [
+            "__tp_row_id__",
+            "__tp_dropped_by__",
+            "__tp_dropped_step__",
+            "__tp_original_position__",
+        ]
+        for col in internal_cols:
+            result.pop(col, None)
+
+        return result
+
     def stats(self) -> dict:
         """Get comprehensive tracking statistics."""
         ctx = get_context()
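get_ghost_values() complements the existing ghost-row listing. A short usage sketch (illustrative, not part of the diff), assuming a pipeline has already dropped rows while DEBUG mode was active:

    import tracepipe as tp

    dbg = tp.debug.inspect()
    for rid in list(dbg.dropped_rows())[:3]:   # look at the first few dropped rows
        ghost = dbg.get_ghost_values(rid)      # last-known values, or None if not stored
        if ghost is not None:
            print(f"row {rid}: {ghost}")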
tracepipe/instrumentation/filter_capture.py
CHANGED

@@ -24,7 +24,7 @@ import numpy as np
 import pandas as pd
 
 from ..context import TracePipeContext, get_context
-from ..core import CompletenessLevel
+from ..core import CompletenessLevel, DuplicateDropMapping
 from ..safety import TracePipeWarning, get_caller_info
 
 # ============ MASK DERIVATION FUNCTIONS ============

@@ -97,6 +97,95 @@ def derive_drop_duplicates_mask(
     return kept_mask.values, completeness
 
 
+def derive_drop_duplicates_provenance(
+    df: pd.DataFrame,
+    source_rids: np.ndarray,
+    subset: Optional[list[str]],
+    keep: str,
+) -> Optional[DuplicateDropMapping]:
+    """
+    Derive dropped->kept mapping for drop_duplicates (debug mode only).
+
+    Uses hash_pandas_object for NaN-safe, fast key comparison.
+    Uses vectorized groupby min/max for representative selection.
+
+    Args:
+        df: Source DataFrame
+        source_rids: Row IDs for each row in df
+        subset: Columns to consider for duplicates (None = all)
+        keep: 'first', 'last', or False
+
+    Returns:
+        DuplicateDropMapping if any rows were dropped, else None.
+    """
+    n = len(df)
+    if n == 0:
+        return None
+
+    # Determine columns to hash
+    if subset is None:
+        hash_df = df
+        valid_cols = tuple(df.columns)
+    else:
+        valid_cols = tuple(c for c in subset if c in df.columns)
+        if not valid_cols:
+            return None
+        hash_df = df[list(valid_cols)]
+
+    # Use hash_pandas_object for fast, NaN-safe key hashing
+    try:
+        h = pd.util.hash_pandas_object(hash_df, index=False)
+        codes, _ = pd.factorize(h, sort=False)
+    except Exception:
+        # Fallback: can't hash, skip provenance
+        return None
+
+    # Compute kept mask using pandas (ground truth)
+    kept_mask = ~df.duplicated(subset=list(valid_cols) if valid_cols else None, keep=keep)
+    dropped_mask = ~kept_mask.values
+
+    if not dropped_mask.any():
+        return None  # No duplicates dropped
+
+    dropped_positions = np.where(dropped_mask)[0]
+    dropped_rids = source_rids[dropped_positions]
+
+    # Find representative positions using vectorized groupby min/max
+    positions = np.arange(n, dtype=np.int64)
+
+    if keep == "first":
+        # Representative = first occurrence of each group
+        rep_pos = pd.Series(positions).groupby(codes).min().to_numpy()
+    elif keep == "last":
+        # Representative = last occurrence of each group
+        rep_pos = pd.Series(positions).groupby(codes).max().to_numpy()
+    else:
+        # keep=False: no representative (all duplicates dropped)
+        rep_pos = None
+
+    # Build kept_rids array
+    if rep_pos is not None:
+        dropped_codes = codes[dropped_positions]
+        kept_positions = rep_pos[dropped_codes]
+        kept_rids = source_rids[kept_positions]
+    else:
+        # keep=False: no representative
+        kept_rids = np.full(len(dropped_rids), -1, dtype=np.int64)
+
+    # Sort by dropped_rids for O(log n) lookup
+    sort_order = np.argsort(dropped_rids)
+    dropped_rids_sorted = dropped_rids[sort_order].copy()
+    kept_rids_sorted = kept_rids[sort_order].copy()
+
+    return DuplicateDropMapping(
+        step_id=-1,  # Will be set by caller
+        dropped_rids=dropped_rids_sorted,
+        kept_rids=kept_rids_sorted,
+        subset_columns=valid_cols if valid_cols else None,
+        keep_strategy=str(keep),
+    )
+
+
 def derive_query_mask(
     df: pd.DataFrame, args: tuple, kwargs: dict
 ) -> tuple[Optional[np.ndarray], CompletenessLevel]:

@@ -257,12 +346,19 @@ def _capture_filter_with_mask(
     kept_mask: Optional[np.ndarray] = None
     positions: Optional[np.ndarray] = None
     completeness = CompletenessLevel.FULL
+    dedup_mapping: Optional[DuplicateDropMapping] = None
 
     if method_name == "dropna":
         kept_mask, completeness = derive_dropna_mask(source_df, args, kwargs)
 
     elif method_name == "drop_duplicates":
         kept_mask, completeness = derive_drop_duplicates_mask(source_df, args, kwargs)
+        # Compute provenance mapping in debug mode
+        dedup_mapping = None
+        if ctx.config.should_capture_merge_provenance:
+            subset = kwargs.get("subset", None)
+            keep = kwargs.get("keep", "first")
+            dedup_mapping = derive_drop_duplicates_provenance(source_df, source_rids, subset, keep)
 
     elif method_name == "query":
         kept_mask, completeness = derive_query_mask(source_df, args, kwargs)

@@ -359,6 +455,12 @@ def _capture_filter_with_mask(
         watched_columns=ctx.watched_columns,
     )
 
+    # === RECORD DROP_DUPLICATES PROVENANCE (debug mode) ===
+    if method_name == "drop_duplicates" and dedup_mapping is not None:
+        # Update step_id in the mapping and store it
+        dedup_mapping.step_id = step_id
+        store.duplicate_drop_mappings.append(dedup_mapping)
+
 
 def _propagate_by_index_fallback(
     row_mgr, source_df: pd.DataFrame, result_df: pd.DataFrame
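The representative selection in derive_drop_duplicates_provenance boils down to: hash the key columns, group row positions by hash, and take the first (or last) position per group as the kept row. A standalone sketch of that idea on a toy frame, using only pandas/numpy (the RIDs are invented; TracePipe assigns its own):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": ["a", "b", "a", "a"], "val": [1, 2, 3, 4]})
    source_rids = np.array([100, 101, 102, 103])

    h = pd.util.hash_pandas_object(df[["key"]], index=False)
    codes, _ = pd.factorize(h, sort=False)                    # group id per row
    kept_mask = ~df.duplicated(subset=["key"], keep="first")  # pandas is the ground truth

    positions = np.arange(len(df), dtype=np.int64)
    rep_pos = pd.Series(positions).groupby(codes).min().to_numpy()  # first occurrence per group

    dropped_positions = np.where(~kept_mask.values)[0]        # rows 2 and 3
    kept_positions = rep_pos[codes[dropped_positions]]        # both map back to row 0
    print({int(d): int(k) for d, k in zip(source_rids[dropped_positions],
                                          source_rids[kept_positions])})
    # {102: 100, 103: 100}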
tracepipe/instrumentation/merge_capture.py
CHANGED

@@ -14,7 +14,7 @@ import numpy as np
 import pandas as pd
 
 from ..context import get_context
-from ..core import CompletenessLevel, MergeMapping, MergeStats
+from ..core import CompletenessLevel, ConcatMapping, MergeMapping, MergeStats
 from ..safety import TracePipeWarning, get_caller_info
 
 

@@ -382,53 +382,199 @@ def wrap_join_with_lineage(original_join):
 def wrap_concat_with_lineage(original_concat):
     """
     Wrap pd.concat with lineage capture.
+
+    For axis=0 (vertical concat):
+    - Preserves row IDs from source DataFrames (FULL provenance)
+    - Tracks which source DataFrame each row came from
+
+    For axis=1 (horizontal concat):
+    - Propagates RIDs if all inputs have identical RID arrays
+    - Otherwise marks as PARTIAL
     """
 
     @wraps(original_concat)
     def wrapper(objs, *args, **kwargs):
         ctx = get_context()
 
-        result = original_concat(objs, *args, **kwargs)
-
         if not ctx.enabled:
-            return
+            return original_concat(objs, *args, **kwargs)
+
+        axis = kwargs.get("axis", 0)
+
+        # === BEFORE: Capture source RIDs from all tracked DataFrames ===
+        source_data = []  # [(rids_copy, shape, original_index), ...]
+        try:
+            objs_list = list(objs) if hasattr(objs, "__iter__") else [objs]
+        except TypeError:
+            objs_list = [objs]
+
+        for i, obj in enumerate(objs_list):
+            if isinstance(obj, pd.DataFrame) and len(obj) > 0:
+                rids = ctx.row_manager.get_ids_array(obj)
+                if rids is None:
+                    rids = ctx.row_manager.register(obj)
+                # IMPORTANT: Make a copy to avoid mutation issues
+                source_data.append((rids.copy(), obj.shape, i))
+
+        # === RUN ORIGINAL ===
+        try:
+            result = original_concat(objs_list, *args, **kwargs)
+        except Exception:
+            raise  # Don't store mapping on failure
 
         if not isinstance(result, pd.DataFrame):
             return result
 
-
-
-
+        row_mgr = ctx.row_manager
+        store = ctx.store
+        code_file, code_line = get_caller_info(skip_frames=2)
 
-
-
+        # Compute input shapes for step metadata
+        input_shapes = [sd[1] for sd in source_data]
 
-
+        # === AXIS=0: Vertical concat with FULL provenance ===
+        if axis == 0 and source_data:
+            return _concat_axis0_with_provenance(
+                result, source_data, input_shapes, code_file, code_line, ctx
+            )
 
-
-
-
-
-
+        # === AXIS=1: Horizontal concat ===
+        elif axis == 1 and source_data:
+            return _concat_axis1_with_provenance(
+                result, source_data, input_shapes, code_file, code_line, ctx
+            )
 
+        # === FALLBACK: Unknown axis or no source data ===
+        else:
+            row_mgr.register(result)
             store.append_step(
                 operation="pd.concat",
                 stage=ctx.current_stage,
                 code_file=code_file,
                 code_line=code_line,
                 params={
-                    "axis":
-                    "n_inputs": len(
+                    "axis": axis,
+                    "n_inputs": len(source_data),
                 },
                 input_shape=tuple(input_shapes) if input_shapes else None,
                 output_shape=result.shape,
-                completeness=CompletenessLevel.PARTIAL,
+                completeness=CompletenessLevel.PARTIAL,
             )
-
-
-
-
+            return result
+
+    return wrapper
+
+
+def _concat_axis0_with_provenance(result, source_data, input_shapes, code_file, code_line, ctx):
+    """
+    Handle axis=0 concat with FULL row provenance.
+
+    Preserves source RIDs and tracks which source DF each row came from.
+    """
+    row_mgr = ctx.row_manager
+    store = ctx.store
 
+    # Build concatenated RID array and source index array
+    all_rids = np.concatenate([sd[0] for sd in source_data])
+    all_source_idx = np.concatenate(
+        [np.full(len(sd[0]), sd[2], dtype=np.int32) for sd in source_data]
+    )
+
+    # Validate: length must match result
+    if len(all_rids) != len(result):
+        # Mismatch - some objects contributed differently (empty DFs, Series, etc.)
+        # Degrade gracefully to PARTIAL
+        row_mgr.register(result)
+        store.append_step(
+            operation="pd.concat",
+            stage=ctx.current_stage,
+            code_file=code_file,
+            code_line=code_line,
+            params={
+                "axis": 0,
+                "n_inputs": len(source_data),
+                "_length_mismatch": True,
+            },
+            input_shape=tuple(input_shapes) if input_shapes else None,
+            output_shape=result.shape,
+            completeness=CompletenessLevel.PARTIAL,
+        )
         return result
 
-
+    # Propagate RIDs to result (preserving lineage!)
+    row_mgr.set_result_rids(result, all_rids.copy())
+
+    # Build sorted arrays for O(log n) lookup
+    sort_order = np.argsort(all_rids)
+    out_rids_sorted = all_rids[sort_order].copy()
+    out_pos_sorted = sort_order.copy()
+
+    # Record step with FULL completeness
+    step_id = store.append_step(
+        operation="pd.concat",
+        stage=ctx.current_stage,
+        code_file=code_file,
+        code_line=code_line,
+        params={
+            "axis": 0,
+            "n_inputs": len(source_data),
+        },
+        input_shape=tuple(input_shapes) if input_shapes else None,
+        output_shape=result.shape,
+        completeness=CompletenessLevel.FULL,
+    )
+
+    # Store mapping
+    mapping = ConcatMapping(
+        step_id=step_id,
+        out_rids=all_rids.copy(),
+        source_indices=all_source_idx.copy(),
+        out_rids_sorted=out_rids_sorted,
+        out_pos_sorted=out_pos_sorted,
+        source_shapes=list(input_shapes),
+    )
+    store.concat_mappings.append(mapping)
+
+    return result
+
+
+def _concat_axis1_with_provenance(result, source_data, input_shapes, code_file, code_line, ctx):
+    """
+    Handle axis=1 concat with best-effort provenance.
+
+    If all inputs have identical RID arrays, propagate them (FULL).
+    Otherwise, mark as PARTIAL and register new RIDs.
+    """
+    row_mgr = ctx.row_manager
+    store = ctx.store
+
+    # Check if all inputs have the same RIDs in same order
+    first_rids = source_data[0][0]
+    all_same = all(
+        len(sd[0]) == len(first_rids) and np.array_equal(sd[0], first_rids) for sd in source_data
+    )
+
+    if all_same and len(first_rids) == len(result):
+        # All inputs have identical RIDs - propagate them
+        row_mgr.set_result_rids(result, first_rids.copy())
+        completeness = CompletenessLevel.FULL
+    else:
+        # Misaligned or different RIDs - register new RIDs
+        row_mgr.register(result)
+        completeness = CompletenessLevel.PARTIAL
+
+    store.append_step(
+        operation="pd.concat",
+        stage=ctx.current_stage,
+        code_file=code_file,
+        code_line=code_line,
+        params={
+            "axis": 1,
+            "n_inputs": len(source_data),
+        },
+        input_shape=tuple(input_shapes) if input_shapes else None,
+        output_shape=result.shape,
+        completeness=completeness,
+    )
+
+    return result
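In practice the concat wrapper means a vertical pd.concat no longer breaks row lineage: rows keep their IDs and remember which input frame they came from. A hedged end-to-end sketch (frame contents and the printed origin dict are illustrative only; where= may return a list when several rows match):

    import pandas as pd
    import tracepipe as tp

    tp.enable(mode="debug")

    a = pd.DataFrame({"key": ["a1", "a2"], "amount": [1.0, 2.0]})
    b = pd.DataFrame({"key": ["b1"], "amount": [3.0]})

    combined = pd.concat([a, b])                      # axis=0: RIDs preserved, FULL provenance
    filtered = combined[combined["amount"] > 1.5]

    res = tp.trace(filtered, where={"key": "b1"})
    first = res[0] if isinstance(res, list) else res
    print(first.origin)  # expected shape: {"type": "concat", "source_df": 1, "step_id": ...}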
tracepipe/snapshot.py
CHANGED

@@ -25,7 +25,7 @@ Usage:
 
 import json
 import time
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Optional
 

@@ -297,6 +297,20 @@ class DiffResult:
     recovered_rows: set[int]
     drops_delta: dict[str, int]  # op -> change in count
     stats_changes: dict[str, dict[str, Any]]  # col -> {metric: (old, new)}
+    # Column changes
+    columns_added: list[str] = field(default_factory=list)
+    columns_removed: list[str] = field(default_factory=list)
+    # Cell-level changes (only populated if both snapshots have include_values=True)
+    cells_changed: int = 0  # Total modified cells
+    changed_rows: set[int] = field(default_factory=set)  # IDs of rows with value changes
+    changes_by_column: dict[str, int] = field(default_factory=dict)  # col -> count
+
+    @property
+    def rows_unchanged(self) -> int:
+        """Number of rows that exist in both snapshots (may have value changes)."""
+        # This is computed from the rows that weren't added or removed
+        # Note: This is an estimate based on the smaller snapshot
+        return 0  # Will be set during diff computation
 
     def __repr__(self) -> str:
         lines = ["Snapshot Diff:"]

@@ -310,6 +324,18 @@ class DiffResult:
         if self.recovered_rows:
             lines.append(f" * {len(self.recovered_rows)} recovered")
 
+        if self.columns_added:
+            lines.append(f" Columns added: {', '.join(self.columns_added)}")
+        if self.columns_removed:
+            lines.append(f" Columns removed: {', '.join(self.columns_removed)}")
+
+        if self.cells_changed > 0:
+            lines.append("\n Changes:")
+            lines.append(f" - {self.cells_changed} cells modified")
+            if self.changes_by_column:
+                for col, count in sorted(self.changes_by_column.items(), key=lambda x: -x[1])[:5]:
+                    lines.append(f" {col}: {count}")
+
         if self.drops_delta:
             lines.append(" Drop changes by operation:")
             for op, delta in sorted(self.drops_delta.items(), key=lambda x: -abs(x[1])):

@@ -339,6 +365,9 @@ class DiffResult:
             or self.recovered_rows
             or self.drops_delta
             or self.stats_changes
+            or self.columns_added
+            or self.columns_removed
+            or self.cells_changed
         )
 
     def to_dict(self) -> dict:

@@ -350,6 +379,11 @@ class DiffResult:
             "recovered_rows": list(self.recovered_rows),
             "drops_delta": self.drops_delta,
             "stats_changes": self.stats_changes,
+            "columns_added": self.columns_added,
+            "columns_removed": self.columns_removed,
+            "cells_changed": self.cells_changed,
+            "changed_rows": list(self.changed_rows),
+            "changes_by_column": self.changes_by_column,
         }
 
 

@@ -359,6 +393,9 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
 
     Note: Cross-run diff is SUMMARY-ONLY unless keys are stored.
     Row-level comparison only works within same session (same RID assignment).
+
+    For cell-level diff (cells_changed, changes_by_column), both snapshots
+    must have been created with include_values=True.
     """
     rows_added = current.row_ids - baseline.row_ids
     rows_removed = baseline.row_ids - current.row_ids

@@ -375,9 +412,15 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
         if old != new:
             drops_delta[op] = new - old
 
+    # Column changes
+    baseline_cols = set(baseline.column_stats.keys())
+    current_cols = set(current.column_stats.keys())
+    columns_added = sorted(current_cols - baseline_cols)
+    columns_removed = sorted(baseline_cols - current_cols)
+
     # Stats changes
     stats_changes: dict[str, dict[str, Any]] = {}
-    all_cols =
+    all_cols = baseline_cols | current_cols
     for col in all_cols:
         old_stats = baseline.column_stats.get(col)
         new_stats = current.column_stats.get(col)

@@ -396,6 +439,43 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
         if changes:
             stats_changes[col] = changes
 
+    # Cell-level changes (only if both snapshots have watched data)
+    cells_changed = 0
+    changed_rows: set[int] = set()
+    changes_by_column: dict[str, int] = {}
+
+    if baseline.watched_data is not None and current.watched_data is not None:
+        # Find common rows and columns
+        common_rows = baseline.row_ids & current.row_ids
+        common_cols = set(baseline.watched_data.columns) & set(current.watched_data.columns)
+
+        for rid in common_rows:
+            for col in common_cols:
+                old_val = baseline.watched_data.get_value(int(rid), col)
+                new_val = current.watched_data.get_value(int(rid), col)
+
+                # Compare values (handle NaN)
+                values_equal = False
+                if old_val is None and new_val is None:
+                    values_equal = True
+                elif old_val is not None and new_val is not None:
+                    try:
+                        # Handle NaN comparison
+                        if isinstance(old_val, float) and isinstance(new_val, float):
+                            if old_val != old_val and new_val != new_val:  # Both NaN
+                                values_equal = True
+                            else:
+                                values_equal = old_val == new_val
+                        else:
+                            values_equal = old_val == new_val
+                    except (TypeError, ValueError):
+                        values_equal = str(old_val) == str(new_val)
+
+                if not values_equal:
+                    cells_changed += 1
+                    changed_rows.add(rid)
+                    changes_by_column[col] = changes_by_column.get(col, 0) + 1
+
     return DiffResult(
         rows_added=rows_added,
         rows_removed=rows_removed,

@@ -403,6 +483,11 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
         recovered_rows=recovered_rows,
         drops_delta=drops_delta,
         stats_changes=stats_changes,
+        columns_added=columns_added,
+        columns_removed=columns_removed,
+        cells_changed=cells_changed,
+        changed_rows=changed_rows,
+        changes_by_column=changes_by_column,
     )
 
 
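The cell comparison added to diff() treats two NaNs as equal and falls back to string comparison when values cannot be compared directly. A self-contained restatement of that rule (the helper name is mine, not part of the module):

    import math

    def cell_values_equal(old, new) -> bool:
        # NaN-safe equality used when counting changed cells
        if old is None and new is None:
            return True
        if old is None or new is None:
            return False
        try:
            if isinstance(old, float) and isinstance(new, float):
                if math.isnan(old) and math.isnan(new):
                    return True
                return old == new
            return bool(old == new)
        except (TypeError, ValueError):
            return str(old) == str(new)

    assert cell_values_equal(float("nan"), float("nan"))
    assert not cell_values_equal(1.0, 2.0)
    assert cell_values_equal("x", "x")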
tracepipe/storage/lineage_store.py
CHANGED

@@ -22,6 +22,8 @@ from ..core import (
     AggregationMapping,
     ChangeType,
     CompletenessLevel,
+    ConcatMapping,
+    DuplicateDropMapping,
     LineageGap,
     LineageGaps,
     MergeMapping,

@@ -100,6 +102,12 @@ class InMemoryLineageStore:
         self.merge_mappings: list[MergeMapping] = []
         self.merge_stats: list[tuple[int, MergeStats]] = []
 
+        # === CONCAT TRACKING ===
+        self.concat_mappings: list[ConcatMapping] = []
+
+        # === DUPLICATE DROP TRACKING (debug mode) ===
+        self.duplicate_drop_mappings: list[DuplicateDropMapping] = []
+
         # === AGGREGATION MAPPINGS ===
         self.aggregation_mappings: list[AggregationMapping] = []
 

@@ -361,6 +369,74 @@ class InMemoryLineageStore:
             return [(sid, s) for sid, s in self.merge_stats if sid == step_id]
         return list(self.merge_stats)
 
+    # === CONCAT LOOKUP (O(log n) via searchsorted) ===
+
+    def _binary_search_mapping(
+        self, sorted_rids: Optional[np.ndarray], target_rid: int
+    ) -> Optional[int]:
+        """
+        Return index in sorted array, or None if not found.
+
+        Robust to None/empty arrays and dtype mismatches.
+        """
+        if sorted_rids is None or len(sorted_rids) == 0:
+            return None
+
+        target = np.int64(target_rid)
+        i = np.searchsorted(sorted_rids, target)
+
+        if i < len(sorted_rids) and sorted_rids[i] == target:
+            return int(i)
+        return None
+
+    def get_concat_origin(self, row_id: int) -> Optional[dict]:
+        """
+        Get which source DataFrame a row came from in a concat.
+
+        Uses binary search (O(log n)) on sorted RIDs.
+
+        Returns:
+            {step_id, source_index, source_shape, position} if found, else None.
+        """
+        for mapping in self.concat_mappings:
+            idx = self._binary_search_mapping(mapping.out_rids_sorted, row_id)
+            if idx is not None:
+                pos = int(mapping.out_pos_sorted[idx])
+                source_idx = int(mapping.source_indices[pos])
+                return {
+                    "step_id": mapping.step_id,
+                    "source_index": source_idx,
+                    "source_shape": (
+                        mapping.source_shapes[source_idx]
+                        if source_idx < len(mapping.source_shapes)
+                        else None
+                    ),
+                    "position": pos,
+                }
+        return None
+
+    # === DUPLICATE DROP LOOKUP (O(log n) via searchsorted) ===
+
+    def get_duplicate_representative(self, row_id: int) -> Optional[dict]:
+        """
+        Get which row replaced this one in drop_duplicates.
+
+        Returns:
+            {step_id, kept_rid, subset_columns, keep_strategy} if found, else None.
+            kept_rid is -1 if keep=False (no representative).
+        """
+        for mapping in self.duplicate_drop_mappings:
+            idx = self._binary_search_mapping(mapping.dropped_rids, row_id)
+            if idx is not None:
+                kept = int(mapping.kept_rids[idx])
+                return {
+                    "step_id": mapping.step_id,
+                    "kept_rid": kept if kept >= 0 else None,
+                    "subset_columns": mapping.subset_columns,
+                    "keep_strategy": mapping.keep_strategy,
+                }
+        return None
+
     # === MEMORY MANAGEMENT ===
 
     def _check_memory_and_spill(self) -> None:

@@ -567,17 +643,17 @@ class InMemoryLineageStore:
 
     def get_row_history_with_lineage(self, row_id: int, max_depth: int = 10) -> list[dict]:
         """
-        Get row history including pre-merge parent history.
+        Get row history including pre-merge and pre-concat parent history.
 
-        Follows merge lineage recursively to build complete cell provenance.
-        This is essential for tracking changes that happened before merge operations.
+        Follows merge and concat lineage recursively to build complete cell provenance.
+        This is essential for tracking changes that happened before merge/concat operations.
 
         Deduplicates events by (col, old_val, new_val, operation) signature to prevent
         cross-pipeline contamination when multiple DataFrames share row IDs.
 
         Args:
            row_id: Row ID to trace
-            max_depth: Maximum
+            max_depth: Maximum lineage depth to follow (prevents infinite loops)
 
         Returns:
             List of UNIQUE events in chronological order, including parent row events.

@@ -592,12 +668,21 @@
             events = []
 
             # Check if this row came from a merge
-
-            if
+            merge_origin = self.get_merge_origin(rid)
+            if merge_origin and merge_origin["left_parent"] is not None:
                 # Recursively get parent's history first (chronological order)
-                parent_events = _collect_history(
+                parent_events = _collect_history(merge_origin["left_parent"], depth + 1)
                 events.extend(parent_events)
 
+            # Check if this row came from a concat
+            # For concat, parent_rid == rid (identity mapping), so we don't recurse
+            # But we record the concat step for completeness
+            concat_origin = self.get_concat_origin(rid)
+            if concat_origin:
+                # Concat preserves RIDs, so the "parent" is the same RID
+                # The concat step itself is recorded in the step events
+                pass
+
             # Add this row's direct events
             events.extend(self.get_row_history(rid))
 
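A toy illustration of the dual-array layout the concat lookup relies on: the sorted copy answers "is this RID in the mapping?" in O(log n), and out_pos_sorted points back into the positional arrays to recover which source frame the row came from (all numbers invented; this is not TracePipe code):

    import numpy as np

    out_rids = np.array([10, 11, 50, 51, 52])        # result row order after concat
    source_indices = np.array([0, 0, 1, 1, 1])       # input DataFrame index per result row

    sort_order = np.argsort(out_rids)
    out_rids_sorted = out_rids[sort_order]
    out_pos_sorted = sort_order

    def concat_source_for(rid):
        i = np.searchsorted(out_rids_sorted, rid)
        if i < len(out_rids_sorted) and out_rids_sorted[i] == rid:
            pos = int(out_pos_sorted[i])             # position in the concat result
            return int(source_indices[pos])          # which source DataFrame it came from
        return None

    assert concat_source_for(51) == 1
    assert concat_source_for(99) is None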
{tracepipe-0.3.5.dist-info → tracepipe-0.4.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tracepipe
-Version: 0.3.5
+Version: 0.4.2
 Summary: Row-level data lineage tracking for pandas pipelines
 Project-URL: Homepage, https://github.com/tracepipe/tracepipe
 Project-URL: Documentation, https://tracepipe.github.io/tracepipe/

@@ -278,7 +278,7 @@ tp.enable(mode="debug") # Full lineage
 
 ## Known Limitations
 
-TracePipe tracks **cell mutations**
+TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
 
 | Pattern | Status | Notes |
 |---------|--------|-------|

@@ -286,13 +286,10 @@ TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merg
 | `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
 | `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
 | `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
-| `pd.concat([df1, df2])` |
-| `df.drop_duplicates(
-
-
-**Why?** TracePipe tracks value changes within rows, not row-selection operations. When `drop_duplicates` picks one row over another, that's a provenance decision (not a cell mutation) that isn't currently instrumented.
-
-**Planned for 0.4**: Full row-provenance tracking for concat, drop_duplicates, and sort operations.
+| `pd.concat([df1, df2])` | ✅ Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
+| `df.drop_duplicates()` | ✅ Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
+| `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
+| Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
 
 ---
 
{tracepipe-0.3.5.dist-info → tracepipe-0.4.2.dist-info}/RECORD
CHANGED

@@ -1,29 +1,29 @@
-tracepipe/__init__.py,sha256=
+tracepipe/__init__.py,sha256=cocA8ETqC1IGgDCXvxue9M4QVzIt8C981b6NTf9BXQ4,3342
 tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
 tracepipe/context.py,sha256=DvwAZGZbLDJ4xoqmS1VnZOBbOI8ZIIErsY9W6GEbFSM,4051
 tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
-tracepipe/convenience.py,sha256=
-tracepipe/core.py,sha256=
-tracepipe/debug.py,sha256=
+tracepipe/convenience.py,sha256=ALRtVn6tLfa7Ks7d9hKVJfhLjOLuyFgxTwSoUL0BgHY,38241
+tracepipe/core.py,sha256=O1QFJFTSszxKY1pVR1XLrdA0zez7t5KQLjyq6V36ARk,10883
+tracepipe/debug.py,sha256=S3ga3rVHjDSV4OctkF5uEAQlzjOxFJO8RGC81awGboA,11397
 tracepipe/safety.py,sha256=UpzhQj31Dij-DjgT9mY_jPrUpVfcA51gDI2fUos4IUA,6694
-tracepipe/snapshot.py,sha256=
+tracepipe/snapshot.py,sha256=kvW8be1EAAsyHefXxJPgIQAAYT_FwK167SMxeQcsra4,17921
 tracepipe/value_provenance.py,sha256=ogky6aOaZ-6K2uNBQxlXpmCeuvK434Hisj30zesRTd8,9330
 tracepipe/instrumentation/__init__.py,sha256=pd0n6Z9m_V3gcBv097cXWFOZEzAP9sAq1jjQnNRrDZ8,222
 tracepipe/instrumentation/apply_capture.py,sha256=cMThWzNXqWQENuMrCGTne1hO6fqaQFV7zJYNpsPTW4w,14463
-tracepipe/instrumentation/filter_capture.py,sha256=
+tracepipe/instrumentation/filter_capture.py,sha256=aN8-Ev6kbDR8f9A9JVy236VK0iqNxpMvki3pbtUkBYQ,19445
 tracepipe/instrumentation/indexer_capture.py,sha256=1ATCeJ-uNA1uGiSbgnUx0wdVsIlZGHeUBaFJPXgFQNg,28440
-tracepipe/instrumentation/merge_capture.py,sha256=
+tracepipe/instrumentation/merge_capture.py,sha256=zqa6SY5YLbr-N7PPTdE6TYKyJIZcPqT02d1Ifvi3Jdw,18359
 tracepipe/instrumentation/pandas_inst.py,sha256=h8RlfwYkYwuftCyBYIETdwHxVCzQM1SBBrbYP7SyjJ8,30047
 tracepipe/instrumentation/series_capture.py,sha256=i7FiA2ndEzS6duIj5y-a7SDfIMl2cCY_jGC1tmG7TGU,11271
 tracepipe/storage/__init__.py,sha256=pGFMfbIgIi2kofVPwYDqe2HTYMYJoabiGjTq77pYi-g,348
 tracepipe/storage/base.py,sha256=7DV_-rp37DjBMr9B1w85hLVYhC8OQShk2PcEhT-n4tE,4894
-tracepipe/storage/lineage_store.py,sha256=
+tracepipe/storage/lineage_store.py,sha256=1enRmDgnVjxW8Pu7WMHJ8WPnnbm-HsAm4e1dKsTvnIc,31943
 tracepipe/storage/row_identity.py,sha256=HBU0gTTJlFtFTcAdUCKuX-c9cHa0lo3CDIodDPDgOzA,17161
 tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,182
 tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
 tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
 tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
-tracepipe-0.
-tracepipe-0.
-tracepipe-0.
-tracepipe-0.
+tracepipe-0.4.2.dist-info/METADATA,sha256=0nMQRfqFJCg1DMGjWzW_nlFcWMM-q8T4LfoqkMcYmAQ,10067
+tracepipe-0.4.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+tracepipe-0.4.2.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
+tracepipe-0.4.2.dist-info/RECORD,,

{tracepipe-0.3.5.dist-info → tracepipe-0.4.2.dist-info}/WHEEL
File without changes

{tracepipe-0.3.5.dist-info → tracepipe-0.4.2.dist-info}/licenses/LICENSE
File without changes