tracepipe-0.3.5-py3-none-any.whl → tracepipe-0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tracepipe/__init__.py CHANGED
@@ -81,7 +81,7 @@ from .core import TracePipeConfig, TracePipeMode
81
81
  from .snapshot import DiffResult, Snapshot, diff, snapshot
82
82
 
83
83
  # === VERSION ===
84
- __version__ = "0.3.5"
84
+ __version__ = "0.4.2"
85
85
 
86
86
  # === MINIMAL __all__ ===
87
87
  __all__ = [
tracepipe/convenience.py CHANGED
@@ -54,6 +54,14 @@ class CheckResult:
54
54
 
55
55
  Separates FACTS (observed, high confidence) from HEURISTICS (inferred).
56
56
  .ok is True only if there are no FACT-level warnings.
57
+
58
+ Key properties for quick access:
59
+ .passed - Alias for .ok (common naming convention)
60
+ .retention - Row retention rate (0.0-1.0)
61
+ .n_dropped - Total rows dropped
62
+ .drops_by_op - Drops broken down by operation
63
+ .n_changes - Total cell-level changes (debug mode only)
64
+ .changes_by_op - Changes broken down by operation (debug mode only)
57
65
  """
58
66
 
59
67
  ok: bool
@@ -61,6 +69,50 @@ class CheckResult:
61
69
  facts: dict[str, Any]
62
70
  suggestions: list[str]
63
71
  mode: str
72
+ # Internal: store drops_by_op so we don't need to recompute
73
+ _drops_by_op: dict[str, int] = field(default_factory=dict)
74
+ # Internal: store cell change counts (debug mode only)
75
+ _n_changes: int = 0
76
+ _changes_by_op: dict[str, int] = field(default_factory=dict)
77
+
78
+ # === CONVENIENCE PROPERTIES ===
79
+
80
+ @property
81
+ def passed(self) -> bool:
82
+ """Alias for .ok (matches common naming convention)."""
83
+ return self.ok
84
+
85
+ @property
86
+ def retention(self) -> float | None:
87
+ """Row retention rate (0.0-1.0), or None if not computed."""
88
+ return self.facts.get("retention_rate")
89
+
90
+ @property
91
+ def n_dropped(self) -> int:
92
+ """Total number of rows dropped."""
93
+ return self.facts.get("rows_dropped", 0)
94
+
95
+ @property
96
+ def drops_by_op(self) -> dict[str, int]:
97
+ """Drops broken down by operation name."""
98
+ return self._drops_by_op
99
+
100
+ @property
101
+ def n_steps(self) -> int:
102
+ """Total pipeline steps recorded."""
103
+ return self.facts.get("total_steps", 0)
104
+
105
+ @property
106
+ def n_changes(self) -> int:
107
+ """Total cell-level changes (debug mode only, 0 if not tracked)."""
108
+ return self._n_changes
109
+
110
+ @property
111
+ def changes_by_op(self) -> dict[str, int]:
112
+ """Cell changes broken down by operation (debug mode only)."""
113
+ return self._changes_by_op
114
+
115
+ # === EXISTING PROPERTIES ===
64
116
 
65
117
  @property
66
118
  def has_warnings(self) -> bool:
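A minimal sketch of the new quick-access surface (illustrative pipeline; `tp.check` is assumed to be the package-level export of the `check()` function shown later in this file, and `tp.enable(mode="debug")` is taken from the README excerpt in METADATA below):

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")

df = pd.DataFrame({"customer_id": ["C1", "C2", None], "amount": [10.0, None, 3.5]})
df = df.dropna(subset=["customer_id"])   # drops one row

result = tp.check(df)        # check()'s full signature is abridged in this diff
print(result.passed)         # alias for result.ok
print(result.retention)      # facts["retention_rate"], or None if not computed
print(result.n_dropped)      # facts["rows_dropped"]
print(result.drops_by_op)    # e.g. {"dropna": 1}
print(result.n_changes)      # cell-level changes; stays 0 outside debug mode
```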
@@ -90,6 +142,20 @@ class CheckResult:
90
142
  lines.append(f"TracePipe Check: {status}")
91
143
  lines.append(f" Mode: {self.mode}")
92
144
 
145
+ # Always show key metrics in compact form
146
+ if self.retention is not None:
147
+ lines.append(f"\nRetention: {int(self.retention * 100)}%")
148
+ if self.n_dropped > 0:
149
+ lines.append(f"Dropped: {self.n_dropped} rows")
150
+ if self.drops_by_op:
151
+ for op, count in list(self.drops_by_op.items())[:5]:
152
+ lines.append(f" • {op}: {count}")
153
+ if self.n_changes > 0:
154
+ lines.append(f"\nValue changes: {self.n_changes} cells")
155
+ if self.changes_by_op:
156
+ for op, count in list(self.changes_by_op.items())[:5]:
157
+ lines.append(f" • {op}: {count}")
158
+
93
159
  if verbose and self.facts:
94
160
  lines.append("\n Measured facts:")
95
161
  for k, v in self.facts.items():
@@ -115,7 +181,14 @@ class CheckResult:
115
181
  """Export to dictionary."""
116
182
  return {
117
183
  "ok": self.ok,
184
+ "passed": self.passed,
118
185
  "mode": self.mode,
186
+ "retention": self.retention,
187
+ "n_dropped": self.n_dropped,
188
+ "n_steps": self.n_steps,
189
+ "drops_by_op": self.drops_by_op,
190
+ "n_changes": self.n_changes,
191
+ "changes_by_op": self.changes_by_op,
119
192
  "facts": self.facts,
120
193
  "suggestions": self.suggestions,
121
194
  "warnings": [
@@ -147,6 +220,11 @@ class TraceResult:
147
220
 
148
221
  Answers: "What happened to this row?"
149
222
  Events are in CHRONOLOGICAL order (oldest->newest).
223
+
224
+ Key attributes:
225
+ status: "alive" or "dropped" (string representation)
226
+ origin: Where this row came from (concat, merge, or original)
227
+ representative: If dropped by dedup, which row was kept instead
150
228
  """
151
229
 
152
230
  row_id: int
@@ -158,22 +236,93 @@ class TraceResult:
158
236
  # Mode enforcement
159
237
  supported: bool = True
160
238
  unsupported_reason: str | None = None
239
+ # v0.4+ provenance
240
+ concat_origin: dict[str, Any] | None = None
241
+ dedup_representative: dict[str, Any] | None = None
242
+ # Steps this row survived (for SURVIVED event generation)
243
+ _survived_steps: list[dict[str, Any]] = field(default_factory=list)
244
+
245
+ @property
246
+ def status(self) -> str:
247
+ """Row status as string: 'alive' or 'dropped'."""
248
+ return "alive" if self.is_alive else "dropped"
249
+
250
+ @property
251
+ def dropped_by(self) -> str | None:
252
+ """Operation that dropped this row, or None if alive."""
253
+ if self.dropped_at:
254
+ return self.dropped_at.get("operation")
255
+ return None
256
+
257
+ @property
258
+ def dropped_at_step(self) -> int | None:
259
+ """Step number where this row was dropped, or None if alive."""
260
+ if self.dropped_at:
261
+ return self.dropped_at.get("step_id")
262
+ return None
161
263
 
162
264
  @property
163
265
  def n_events(self) -> int:
164
266
  return len(self.events)
165
267
 
268
+ @property
269
+ def origin(self) -> dict[str, Any] | None:
270
+ """
271
+ Unified origin info: where did this row come from?
272
+
273
+ Returns dict with 'type' key:
274
+ - {"type": "concat", "source_df": 1, "step_id": 5}
275
+ - {"type": "merge", "left_parent": 10, "right_parent": 20, "step_id": 3}
276
+ - None if original row (not from concat/merge)
277
+ """
278
+ if self.concat_origin:
279
+ return {
280
+ "type": "concat",
281
+ "source_df": self.concat_origin.get("source_index"),
282
+ "step_id": self.concat_origin.get("step_id"),
283
+ }
284
+ if self.merge_origin:
285
+ return {
286
+ "type": "merge",
287
+ "left_parent": self.merge_origin.get("left_parent"),
288
+ "right_parent": self.merge_origin.get("right_parent"),
289
+ "step_id": self.merge_origin.get("step_id"),
290
+ }
291
+ return None
292
+
293
+ @property
294
+ def representative(self) -> dict[str, Any] | None:
295
+ """
296
+ If dropped by drop_duplicates, which row was kept instead?
297
+
298
+ Returns:
299
+ {"kept_rid": 42, "subset": ["key"], "keep": "first"} or None
300
+ kept_rid is None if keep=False (all duplicates dropped)
301
+ """
302
+ if not self.dedup_representative:
303
+ return None
304
+ return {
305
+ "kept_rid": self.dedup_representative.get("kept_rid"),
306
+ "subset": self.dedup_representative.get("subset_columns"),
307
+ "keep": self.dedup_representative.get("keep_strategy"),
308
+ }
309
+
166
310
  def to_dict(self) -> dict:
167
311
  """Export to dictionary."""
168
312
  return {
169
313
  "row_id": self.row_id,
314
+ "status": self.status,
170
315
  "is_alive": self.is_alive,
171
316
  "dropped_at": self.dropped_at,
172
- "merge_origin": self.merge_origin,
317
+ "dropped_by": self.dropped_at.get("operation") if self.dropped_at else None,
318
+ "origin": self.origin,
319
+ "representative": self.representative,
173
320
  "n_events": self.n_events,
174
321
  "events": self.events,
175
322
  "ghost_values": self.ghost_values,
176
323
  "supported": self.supported,
324
+ # Keep legacy fields for backwards compatibility
325
+ "merge_origin": self.merge_origin,
177
326
  }
178
327
 
179
328
  def __repr__(self) -> str:
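A short sketch of how the new `TraceResult` surface reads for a single row (illustrative data; the comments show expected shapes rather than guaranteed output):

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")

df = pd.DataFrame({"key": ["a", None, "b"]}).dropna(subset=["key"])

r = tp.trace(df, row=0)
print(r.status)            # "alive" or "dropped"
print(r.dropped_by)        # operation name for dropped rows, None otherwise
print(r.origin)            # {"type": "concat"/"merge", ...} or None for original rows
print(r.representative)    # dedup info for rows dropped by drop_duplicates, else None
print(r.to_dict()["merge_origin"])   # legacy key is still exported for backwards compatibility
```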
@@ -186,19 +335,38 @@ class TraceResult:
186
335
 
187
336
  lines = [f"Row {self.row_id} Journey:"]
188
337
 
338
+ # Status line matches documentation format
189
339
  if self.is_alive:
190
340
  lines.append(" Status: [OK] Alive")
191
341
  else:
192
- lines.append(" Status: [X] Dropped")
342
+ lines.append(" Status: [DROPPED]")
193
343
  if self.dropped_at:
194
344
  lines.append(
195
345
  f" at step {self.dropped_at['step_id']}: {self.dropped_at['operation']}"
196
346
  )
197
347
 
198
- if self.merge_origin:
199
- left = self.merge_origin.get("left_parent", "?")
200
- right = self.merge_origin.get("right_parent", "?")
201
- lines.append(f" Origin: merge of row {left} (left) + row {right} (right)")
348
+ # Display unified origin info
349
+ origin = self.origin
350
+ if origin:
351
+ if origin["type"] == "merge":
352
+ left = origin.get("left_parent", "?")
353
+ right = origin.get("right_parent", "?")
354
+ lines.append(f" Origin: merge of row {left} (left) + row {right} (right)")
355
+ elif origin["type"] == "concat":
356
+ src = origin.get("source_df", "?")
357
+ lines.append(f" Origin: concat from DataFrame #{src}")
358
+
359
+ # Display dedup representative if dropped by dedup
360
+ if self.representative:
361
+ kept = self.representative.get("kept_rid")
362
+ subset = self.representative.get("subset")
363
+ keep = self.representative.get("keep", "first")
364
+ if kept is not None:
365
+ subset_str = f" (key: {subset})" if subset else ""
366
+ lines.append(f" Replaced by: row {kept}{subset_str} [keep={keep}]")
367
+ else:
368
+ subset_str = f" on {subset}" if subset else ""
369
+ lines.append(f" Dropped: all duplicates removed{subset_str} [keep=False]")
202
370
 
203
371
  if len(self.events) == 0:
204
372
  lines.append("\n Events: 0 (no changes to watched columns)")
@@ -462,11 +630,26 @@ def check(
462
630
  )
463
631
  )
464
632
 
465
- drops_by_step = ctx.store.get_dropped_by_step()
466
- for op, count in drops_by_step.items():
633
+ drops_by_op = ctx.store.get_dropped_by_step()
634
+ for op, count in drops_by_op.items():
467
635
  if count > 1000:
468
636
  suggestions.append(f"'{op}' dropped {count} rows - review if intentional")
469
637
 
638
+ # === CELL CHANGES (debug mode only) ===
639
+ n_changes = 0
640
+ changes_by_op: dict[str, int] = {}
641
+ if ctx.config.mode == TracePipeMode.DEBUG:
642
+ # Count non-drop diffs (cell-level changes)
643
+ step_map = {s.step_id: s.operation for s in ctx.store.steps}
644
+ for i in range(len(ctx.store.diff_step_ids)):
645
+ col = ctx.store.diff_cols[i]
646
+ if col != "__row__": # Skip drop events
647
+ n_changes += 1
648
+ step_id = ctx.store.diff_step_ids[i]
649
+ op = step_map.get(step_id, "unknown")
650
+ changes_by_op[op] = changes_by_op.get(op, 0) + 1
651
+ facts["n_changes"] = n_changes
652
+
470
653
  ok = len([w for w in warnings_list if w.severity == "fact"]) == 0
471
654
 
472
655
  return CheckResult(
@@ -475,6 +658,9 @@ def check(
475
658
  facts=facts,
476
659
  suggestions=suggestions,
477
660
  mode=ctx.config.mode.value,
661
+ _drops_by_op=drops_by_op,
662
+ _n_changes=n_changes,
663
+ _changes_by_op=changes_by_op,
478
664
  )
479
665
 
480
666
 
@@ -482,6 +668,7 @@ def trace(
482
668
  df: pd.DataFrame,
483
669
  *,
484
670
  row: int | None = None,
671
+ row_id: int | None = None,
485
672
  where: dict[str, Any] | None = None,
486
673
  include_ghost: bool = True,
487
674
  ) -> TraceResult | list[TraceResult]:
@@ -490,7 +677,8 @@ def trace(
490
677
 
491
678
  Args:
492
679
  df: DataFrame to search in
493
- row: Row ID (if known)
680
+ row: Row position (0-based index into current DataFrame)
681
+ row_id: Internal row ID (use for tracing dropped rows)
494
682
  where: Selector dict, e.g. {"customer_id": "C123"}
495
683
  include_ghost: Include last-known values for dropped rows
496
684
 
@@ -499,8 +687,14 @@ def trace(
499
687
  Use print(result) for pretty output, result.to_dict() for data.
500
688
 
501
689
  Examples:
502
- result = tp.trace(df, row=5)
503
- print(result)
690
+ # Trace by position in current DataFrame
691
+ result = tp.trace(df, row=0) # First row
692
+
693
+ # Trace by internal row ID (for dropped rows)
694
+ dropped = tp.debug.inspect().dropped_rows()
695
+ result = tp.trace(df, row_id=dropped[0])
696
+
697
+ # Trace by business key
504
698
  tp.trace(df, where={"customer_id": "C123"})
505
699
  """
506
700
  ctx = get_context()
@@ -511,12 +705,30 @@ def trace(
511
705
  pass
512
706
 
513
707
  # Resolve row IDs
514
- if row is not None:
515
- row_ids = [row]
708
+ if row_id is not None:
709
+ # Direct row ID specified - use as-is
710
+ row_ids = [row_id]
711
+ elif row is not None:
712
+ # row= is a DataFrame index position (0-based), not a row ID
713
+ # Convert to actual row ID using the DataFrame's registered IDs
714
+ rids = ctx.row_manager.get_ids_array(df)
715
+ if rids is not None:
716
+ # Handle negative indexing
717
+ if row < 0:
718
+ row = len(rids) + row
719
+ if 0 <= row < len(rids):
720
+ row_ids = [int(rids[row])]
721
+ else:
722
+ raise ValueError(
723
+ f"Row index {row} out of bounds for DataFrame with {len(rids)} rows"
724
+ )
725
+ else:
726
+ # DataFrame not tracked - use row as-is (legacy behavior)
727
+ row_ids = [row]
516
728
  elif where is not None:
517
729
  row_ids = _resolve_where(df, where, ctx)
518
730
  else:
519
- raise ValueError("Must provide 'row' or 'where'")
731
+ raise ValueError("Must provide 'row', 'row_id', or 'where'")
520
732
 
521
733
  results = []
522
734
  for rid in row_ids:
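The resolution logic above changes what `row=` means: it is now a 0-based position in the tracked DataFrame (negative positions allowed), while `row_id=` takes an internal RID directly and is the way to reach rows that have already been filtered out. A hedged sketch:

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")

df = pd.DataFrame({"key": ["a", None, "b"]}).dropna(subset=["key"])

first = tp.trace(df, row=0)     # position 0, resolved to its internal row ID
last = tp.trace(df, row=-1)     # negative indexing counts from the end

# Dropped rows are no longer addressable by position, so use their RIDs.
dropped = tp.debug.inspect().dropped_rows()
if dropped:
    gone = tp.trace(df, row_id=list(dropped)[0])
    print(gone.status)          # "dropped"
```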
@@ -531,6 +743,7 @@ def why(
531
743
  *,
532
744
  col: str,
533
745
  row: int | None = None,
746
+ row_id: int | None = None,
534
747
  where: dict[str, Any] | None = None,
535
748
  ) -> WhyResult | list[WhyResult]:
536
749
  """
@@ -539,7 +752,8 @@ def why(
539
752
  Args:
540
753
  df: DataFrame to search in
541
754
  col: Column name to trace
542
- row: Row ID (if known)
755
+ row: Row position (0-based index into current DataFrame)
756
+ row_id: Internal row ID (use for cells in dropped rows)
543
757
  where: Selector dict, e.g. {"customer_id": "C123"}
544
758
 
545
759
  Returns:
@@ -547,7 +761,7 @@ def why(
547
761
  Use print(result) for pretty output, result.to_dict() for data.
548
762
 
549
763
  Examples:
550
- result = tp.why(df, col="amount", row=5)
764
+ result = tp.why(df, col="amount", row=0) # First row
551
765
  print(result)
552
766
  tp.why(df, col="email", where={"user_id": "U123"})
553
767
  """
@@ -563,12 +777,30 @@ def why(
563
777
  )
564
778
 
565
779
  # Resolve row IDs
566
- if row is not None:
567
- row_ids = [row]
780
+ if row_id is not None:
781
+ # Direct row ID specified - use as-is
782
+ row_ids = [row_id]
783
+ elif row is not None:
784
+ # row= is a DataFrame index position (0-based), not a row ID
785
+ # Convert to actual row ID using the DataFrame's registered IDs
786
+ rids = ctx.row_manager.get_ids_array(df)
787
+ if rids is not None:
788
+ # Handle negative indexing
789
+ if row < 0:
790
+ row = len(rids) + row
791
+ if 0 <= row < len(rids):
792
+ row_ids = [int(rids[row])]
793
+ else:
794
+ raise ValueError(
795
+ f"Row index {row} out of bounds for DataFrame with {len(rids)} rows"
796
+ )
797
+ else:
798
+ # DataFrame not tracked - use row as-is (legacy behavior)
799
+ row_ids = [row]
568
800
  elif where is not None:
569
801
  row_ids = _resolve_where(df, where, ctx)
570
802
  else:
571
- raise ValueError("Must provide 'row' or 'where'")
803
+ raise ValueError("Must provide 'row', 'row_id', or 'where'")
572
804
 
573
805
  results = []
574
806
  for rid in row_ids:
@@ -787,6 +1019,14 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
787
1019
  drop_event = store.get_drop_event(row_id)
788
1020
  merge_origin = store.get_merge_origin(row_id)
789
1021
 
1022
+ # v0.4+ provenance: concat origin and dedup representative
1023
+ concat_origin = None
1024
+ dedup_representative = None
1025
+ if hasattr(store, "get_concat_origin"):
1026
+ concat_origin = store.get_concat_origin(row_id)
1027
+ if hasattr(store, "get_duplicate_representative"):
1028
+ dedup_representative = store.get_duplicate_representative(row_id)
1029
+
790
1030
  # Use lineage-aware history to include pre-merge parent events
791
1031
  if hasattr(store, "get_row_history_with_lineage"):
792
1032
  history = store.get_row_history_with_lineage(row_id)
@@ -823,6 +1063,8 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
823
1063
  merge_origin=merge_origin,
824
1064
  events=history,
825
1065
  ghost_values=ghost_values,
1066
+ concat_origin=concat_origin,
1067
+ dedup_representative=dedup_representative,
826
1068
  )
827
1069
 
828
1070
 
tracepipe/core.py CHANGED
@@ -277,3 +277,82 @@ class MergeStats:
277
277
  left_dup_rate: float # -1 if not computed
278
278
  right_dup_rate: float # -1 if not computed
279
279
  how: str
280
+
281
+
282
+ @dataclass
283
+ class ConcatMapping:
284
+ """
285
+ Mapping for pd.concat operations preserving row lineage.
286
+
287
+ For axis=0 concat, each result row comes from exactly one source DataFrame.
288
+ Arrays are stored in both positional order (for "explain row i") and
289
+ sorted order (for O(log n) RID lookup).
290
+
291
+ Invariants:
292
+ - out_rids and source_indices have same length
293
+ - out_rids_sorted and out_pos_sorted are always paired (both set or both None)
294
+ - out_rids_sorted is monotonically increasing
295
+ """
296
+
297
+ step_id: int
298
+
299
+ # Positional arrays (match result row order)
300
+ out_rids: Any # numpy array, len = len(result)
301
+ source_indices: Any # numpy array, which source DF (0, 1, 2...) each row came from
302
+
303
+ # Sorted arrays (for O(log n) lookup by RID)
304
+ out_rids_sorted: Any # numpy array, SORTED
305
+ out_pos_sorted: Any # numpy array, original positions aligned with out_rids_sorted
306
+
307
+ # Metadata
308
+ source_shapes: list[tuple] = field(default_factory=list)
309
+
310
+ def __post_init__(self):
311
+ """Validate invariants."""
312
+ import numpy as np
313
+
314
+ if self.out_rids_sorted is not None and self.out_pos_sorted is not None:
315
+ if len(self.out_rids_sorted) != len(self.out_pos_sorted):
316
+ raise ValueError("out_rids_sorted and out_pos_sorted must have same length")
317
+ # Verify monotonic (debug check)
318
+ if len(self.out_rids_sorted) > 1:
319
+ assert np.all(
320
+ self.out_rids_sorted[:-1] <= self.out_rids_sorted[1:]
321
+ ), "out_rids_sorted must be monotonically increasing"
322
+
323
+
324
+ @dataclass
325
+ class DuplicateDropMapping:
326
+ """
327
+ Mapping for drop_duplicates provenance (debug mode only).
328
+
329
+ Tracks which rows were dropped and which "representative" row they lost to.
330
+ Arrays are sorted by dropped_rids for O(log n) lookup.
331
+
332
+ For keep='first': dropped rows map to first occurrence
333
+ For keep='last': dropped rows map to last occurrence
334
+ For keep=False: dropped rows have kept_rids=-1 (no representative)
335
+ """
336
+
337
+ step_id: int
338
+
339
+ # Sorted arrays for O(log n) lookup
340
+ dropped_rids: Any # numpy array, SORTED dropped row IDs
341
+ kept_rids: Any # numpy array, representative RID for each dropped row (-1 if none)
342
+
343
+ # Metadata
344
+ subset_columns: Optional[tuple[str, ...]] = None
345
+ keep_strategy: str = "first"
346
+
347
+ def __post_init__(self):
348
+ """Validate invariants."""
349
+ import numpy as np
350
+
351
+ if self.dropped_rids is not None and self.kept_rids is not None:
352
+ if len(self.dropped_rids) != len(self.kept_rids):
353
+ raise ValueError("dropped_rids and kept_rids must have same length")
354
+ # Verify sorted
355
+ if len(self.dropped_rids) > 1:
356
+ assert np.all(
357
+ self.dropped_rids[:-1] <= self.dropped_rids[1:]
358
+ ), "dropped_rids must be sorted"
tracepipe/debug.py CHANGED
@@ -179,6 +179,46 @@ class DebugInspector:
179
179
  ctx = get_context()
180
180
  return ctx.row_manager.get_ghost_rows(limit=limit)
181
181
 
182
+ def get_ghost_values(self, row_id: int) -> dict[str, Any] | None:
183
+ """
184
+ Get last-known values for a specific dropped row (DEBUG mode only).
185
+
186
+ Args:
187
+ row_id: The row ID to look up
188
+
189
+ Returns:
190
+ Dict mapping column names to their last known values,
191
+ or None if the row was not found in ghost storage.
192
+
193
+ Example:
194
+ dbg = tp.debug.inspect()
195
+ dropped_rid = list(dbg.dropped_rows())[0]
196
+ ghost = dbg.get_ghost_values(dropped_rid)
197
+ print(f"Last known values: {ghost}")
198
+ """
199
+ ctx = get_context()
200
+ ghost_df = ctx.row_manager.get_ghost_rows(limit=100000)
201
+
202
+ if ghost_df.empty or "__tp_row_id__" not in ghost_df.columns:
203
+ return None
204
+
205
+ row_match = ghost_df[ghost_df["__tp_row_id__"] == row_id]
206
+ if row_match.empty:
207
+ return None
208
+
209
+ # Convert to dict and remove internal columns
210
+ result = row_match.iloc[0].to_dict()
211
+ internal_cols = [
212
+ "__tp_row_id__",
213
+ "__tp_dropped_by__",
214
+ "__tp_dropped_step__",
215
+ "__tp_original_position__",
216
+ ]
217
+ for col in internal_cols:
218
+ result.pop(col, None)
219
+
220
+ return result
221
+
182
222
  def stats(self) -> dict:
183
223
  """Get comprehensive tracking statistics."""
184
224
  ctx = get_context()
tracepipe/instrumentation/filter_capture.py CHANGED
@@ -24,7 +24,7 @@ import numpy as np
24
24
  import pandas as pd
25
25
 
26
26
  from ..context import TracePipeContext, get_context
27
- from ..core import CompletenessLevel
27
+ from ..core import CompletenessLevel, DuplicateDropMapping
28
28
  from ..safety import TracePipeWarning, get_caller_info
29
29
 
30
30
  # ============ MASK DERIVATION FUNCTIONS ============
@@ -97,6 +97,95 @@ def derive_drop_duplicates_mask(
97
97
  return kept_mask.values, completeness
98
98
 
99
99
 
100
+ def derive_drop_duplicates_provenance(
101
+ df: pd.DataFrame,
102
+ source_rids: np.ndarray,
103
+ subset: Optional[list[str]],
104
+ keep: str,
105
+ ) -> Optional[DuplicateDropMapping]:
106
+ """
107
+ Derive dropped->kept mapping for drop_duplicates (debug mode only).
108
+
109
+ Uses hash_pandas_object for NaN-safe, fast key comparison.
110
+ Uses vectorized groupby min/max for representative selection.
111
+
112
+ Args:
113
+ df: Source DataFrame
114
+ source_rids: Row IDs for each row in df
115
+ subset: Columns to consider for duplicates (None = all)
116
+ keep: 'first', 'last', or False
117
+
118
+ Returns:
119
+ DuplicateDropMapping if any rows were dropped, else None.
120
+ """
121
+ n = len(df)
122
+ if n == 0:
123
+ return None
124
+
125
+ # Determine columns to hash
126
+ if subset is None:
127
+ hash_df = df
128
+ valid_cols = tuple(df.columns)
129
+ else:
130
+ valid_cols = tuple(c for c in subset if c in df.columns)
131
+ if not valid_cols:
132
+ return None
133
+ hash_df = df[list(valid_cols)]
134
+
135
+ # Use hash_pandas_object for fast, NaN-safe key hashing
136
+ try:
137
+ h = pd.util.hash_pandas_object(hash_df, index=False)
138
+ codes, _ = pd.factorize(h, sort=False)
139
+ except Exception:
140
+ # Fallback: can't hash, skip provenance
141
+ return None
142
+
143
+ # Compute kept mask using pandas (ground truth)
144
+ kept_mask = ~df.duplicated(subset=list(valid_cols) if valid_cols else None, keep=keep)
145
+ dropped_mask = ~kept_mask.values
146
+
147
+ if not dropped_mask.any():
148
+ return None # No duplicates dropped
149
+
150
+ dropped_positions = np.where(dropped_mask)[0]
151
+ dropped_rids = source_rids[dropped_positions]
152
+
153
+ # Find representative positions using vectorized groupby min/max
154
+ positions = np.arange(n, dtype=np.int64)
155
+
156
+ if keep == "first":
157
+ # Representative = first occurrence of each group
158
+ rep_pos = pd.Series(positions).groupby(codes).min().to_numpy()
159
+ elif keep == "last":
160
+ # Representative = last occurrence of each group
161
+ rep_pos = pd.Series(positions).groupby(codes).max().to_numpy()
162
+ else:
163
+ # keep=False: no representative (all duplicates dropped)
164
+ rep_pos = None
165
+
166
+ # Build kept_rids array
167
+ if rep_pos is not None:
168
+ dropped_codes = codes[dropped_positions]
169
+ kept_positions = rep_pos[dropped_codes]
170
+ kept_rids = source_rids[kept_positions]
171
+ else:
172
+ # keep=False: no representative
173
+ kept_rids = np.full(len(dropped_rids), -1, dtype=np.int64)
174
+
175
+ # Sort by dropped_rids for O(log n) lookup
176
+ sort_order = np.argsort(dropped_rids)
177
+ dropped_rids_sorted = dropped_rids[sort_order].copy()
178
+ kept_rids_sorted = kept_rids[sort_order].copy()
179
+
180
+ return DuplicateDropMapping(
181
+ step_id=-1, # Will be set by caller
182
+ dropped_rids=dropped_rids_sorted,
183
+ kept_rids=kept_rids_sorted,
184
+ subset_columns=valid_cols if valid_cols else None,
185
+ keep_strategy=str(keep),
186
+ )
187
+
188
+
100
189
  def derive_query_mask(
101
190
  df: pd.DataFrame, args: tuple, kwargs: dict
102
191
  ) -> tuple[Optional[np.ndarray], CompletenessLevel]:
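The representative selection above boils down to two vectorised steps: `hash_pandas_object` plus `factorize` assigns each row a duplicate-group code, and a groupby min/max over positions picks the surviving occurrence per group. A toy reproduction outside TracePipe:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "b", "a", "a"], "val": [1, 2, 3, 4]})
source_rids = np.array([100, 101, 102, 103], dtype=np.int64)

# NaN-safe, dtype-agnostic group code per row.
h = pd.util.hash_pandas_object(df[["key"]], index=False)
codes, _ = pd.factorize(h, sort=False)

kept_mask = ~df.duplicated(subset=["key"], keep="first")   # pandas remains the ground truth
dropped_pos = np.where(~kept_mask.to_numpy())[0]           # positions 2 and 3

positions = np.arange(len(df), dtype=np.int64)
rep_pos = pd.Series(positions).groupby(codes).min().to_numpy()   # first occurrence per group

kept_rids = source_rids[rep_pos[codes[dropped_pos]]]
print({int(d): int(k) for d, k in zip(source_rids[dropped_pos], kept_rids)})   # {102: 100, 103: 100}
```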
@@ -257,12 +346,19 @@ def _capture_filter_with_mask(
257
346
  kept_mask: Optional[np.ndarray] = None
258
347
  positions: Optional[np.ndarray] = None
259
348
  completeness = CompletenessLevel.FULL
349
+ dedup_mapping: Optional[DuplicateDropMapping] = None
260
350
 
261
351
  if method_name == "dropna":
262
352
  kept_mask, completeness = derive_dropna_mask(source_df, args, kwargs)
263
353
 
264
354
  elif method_name == "drop_duplicates":
265
355
  kept_mask, completeness = derive_drop_duplicates_mask(source_df, args, kwargs)
356
+ # Compute provenance mapping in debug mode
357
+ dedup_mapping = None
358
+ if ctx.config.should_capture_merge_provenance:
359
+ subset = kwargs.get("subset", None)
360
+ keep = kwargs.get("keep", "first")
361
+ dedup_mapping = derive_drop_duplicates_provenance(source_df, source_rids, subset, keep)
266
362
 
267
363
  elif method_name == "query":
268
364
  kept_mask, completeness = derive_query_mask(source_df, args, kwargs)
@@ -359,6 +455,12 @@ def _capture_filter_with_mask(
359
455
  watched_columns=ctx.watched_columns,
360
456
  )
361
457
 
458
+ # === RECORD DROP_DUPLICATES PROVENANCE (debug mode) ===
459
+ if method_name == "drop_duplicates" and dedup_mapping is not None:
460
+ # Update step_id in the mapping and store it
461
+ dedup_mapping.step_id = step_id
462
+ store.duplicate_drop_mappings.append(dedup_mapping)
463
+
362
464
 
363
465
  def _propagate_by_index_fallback(
364
466
  row_mgr, source_df: pd.DataFrame, result_df: pd.DataFrame
tracepipe/instrumentation/merge_capture.py CHANGED
@@ -14,7 +14,7 @@ import numpy as np
14
14
  import pandas as pd
15
15
 
16
16
  from ..context import get_context
17
- from ..core import CompletenessLevel, MergeMapping, MergeStats
17
+ from ..core import CompletenessLevel, ConcatMapping, MergeMapping, MergeStats
18
18
  from ..safety import TracePipeWarning, get_caller_info
19
19
 
20
20
 
@@ -382,53 +382,199 @@ def wrap_join_with_lineage(original_join):
382
382
  def wrap_concat_with_lineage(original_concat):
383
383
  """
384
384
  Wrap pd.concat with lineage capture.
385
+
386
+ For axis=0 (vertical concat):
387
+ - Preserves row IDs from source DataFrames (FULL provenance)
388
+ - Tracks which source DataFrame each row came from
389
+
390
+ For axis=1 (horizontal concat):
391
+ - Propagates RIDs if all inputs have identical RID arrays
392
+ - Otherwise marks as PARTIAL
385
393
  """
386
394
 
387
395
  @wraps(original_concat)
388
396
  def wrapper(objs, *args, **kwargs):
389
397
  ctx = get_context()
390
398
 
391
- result = original_concat(objs, *args, **kwargs)
392
-
393
399
  if not ctx.enabled:
394
- return result
400
+ return original_concat(objs, *args, **kwargs)
401
+
402
+ axis = kwargs.get("axis", 0)
403
+
404
+ # === BEFORE: Capture source RIDs from all tracked DataFrames ===
405
+ source_data = [] # [(rids_copy, shape, original_index), ...]
406
+ try:
407
+ objs_list = list(objs) if hasattr(objs, "__iter__") else [objs]
408
+ except TypeError:
409
+ objs_list = [objs]
410
+
411
+ for i, obj in enumerate(objs_list):
412
+ if isinstance(obj, pd.DataFrame) and len(obj) > 0:
413
+ rids = ctx.row_manager.get_ids_array(obj)
414
+ if rids is None:
415
+ rids = ctx.row_manager.register(obj)
416
+ # IMPORTANT: Make a copy to avoid mutation issues
417
+ source_data.append((rids.copy(), obj.shape, i))
418
+
419
+ # === RUN ORIGINAL ===
420
+ try:
421
+ result = original_concat(objs_list, *args, **kwargs)
422
+ except Exception:
423
+ raise # Don't store mapping on failure
395
424
 
396
425
  if not isinstance(result, pd.DataFrame):
397
426
  return result
398
427
 
399
- try:
400
- row_mgr = ctx.row_manager
401
- store = ctx.store
428
+ row_mgr = ctx.row_manager
429
+ store = ctx.store
430
+ code_file, code_line = get_caller_info(skip_frames=2)
402
431
 
403
- # Register result
404
- row_mgr.register(result)
432
+ # Compute input shapes for step metadata
433
+ input_shapes = [sd[1] for sd in source_data]
405
434
 
406
- code_file, code_line = get_caller_info(skip_frames=2)
435
+ # === AXIS=0: Vertical concat with FULL provenance ===
436
+ if axis == 0 and source_data:
437
+ return _concat_axis0_with_provenance(
438
+ result, source_data, input_shapes, code_file, code_line, ctx
439
+ )
407
440
 
408
- # Compute input shapes
409
- input_shapes = []
410
- for obj in objs:
411
- if hasattr(obj, "shape"):
412
- input_shapes.append(obj.shape)
441
+ # === AXIS=1: Horizontal concat ===
442
+ elif axis == 1 and source_data:
443
+ return _concat_axis1_with_provenance(
444
+ result, source_data, input_shapes, code_file, code_line, ctx
445
+ )
413
446
 
447
+ # === FALLBACK: Unknown axis or no source data ===
448
+ else:
449
+ row_mgr.register(result)
414
450
  store.append_step(
415
451
  operation="pd.concat",
416
452
  stage=ctx.current_stage,
417
453
  code_file=code_file,
418
454
  code_line=code_line,
419
455
  params={
420
- "axis": kwargs.get("axis", 0),
421
- "n_inputs": len(objs) if hasattr(objs, "__len__") else 1,
456
+ "axis": axis,
457
+ "n_inputs": len(source_data),
422
458
  },
423
459
  input_shape=tuple(input_shapes) if input_shapes else None,
424
460
  output_shape=result.shape,
425
- completeness=CompletenessLevel.PARTIAL, # Concat resets lineage
461
+ completeness=CompletenessLevel.PARTIAL,
426
462
  )
427
- except Exception as e:
428
- if ctx.config.strict_mode:
429
- raise
430
- warnings.warn(f"TracePipe: Concat capture failed: {e}", TracePipeWarning)
463
+ return result
464
+
465
+ return wrapper
466
+
467
+
468
+ def _concat_axis0_with_provenance(result, source_data, input_shapes, code_file, code_line, ctx):
469
+ """
470
+ Handle axis=0 concat with FULL row provenance.
471
+
472
+ Preserves source RIDs and tracks which source DF each row came from.
473
+ """
474
+ row_mgr = ctx.row_manager
475
+ store = ctx.store
431
476
 
477
+ # Build concatenated RID array and source index array
478
+ all_rids = np.concatenate([sd[0] for sd in source_data])
479
+ all_source_idx = np.concatenate(
480
+ [np.full(len(sd[0]), sd[2], dtype=np.int32) for sd in source_data]
481
+ )
482
+
483
+ # Validate: length must match result
484
+ if len(all_rids) != len(result):
485
+ # Mismatch - some objects contributed differently (empty DFs, Series, etc.)
486
+ # Degrade gracefully to PARTIAL
487
+ row_mgr.register(result)
488
+ store.append_step(
489
+ operation="pd.concat",
490
+ stage=ctx.current_stage,
491
+ code_file=code_file,
492
+ code_line=code_line,
493
+ params={
494
+ "axis": 0,
495
+ "n_inputs": len(source_data),
496
+ "_length_mismatch": True,
497
+ },
498
+ input_shape=tuple(input_shapes) if input_shapes else None,
499
+ output_shape=result.shape,
500
+ completeness=CompletenessLevel.PARTIAL,
501
+ )
432
502
  return result
433
503
 
434
- return wrapper
504
+ # Propagate RIDs to result (preserving lineage!)
505
+ row_mgr.set_result_rids(result, all_rids.copy())
506
+
507
+ # Build sorted arrays for O(log n) lookup
508
+ sort_order = np.argsort(all_rids)
509
+ out_rids_sorted = all_rids[sort_order].copy()
510
+ out_pos_sorted = sort_order.copy()
511
+
512
+ # Record step with FULL completeness
513
+ step_id = store.append_step(
514
+ operation="pd.concat",
515
+ stage=ctx.current_stage,
516
+ code_file=code_file,
517
+ code_line=code_line,
518
+ params={
519
+ "axis": 0,
520
+ "n_inputs": len(source_data),
521
+ },
522
+ input_shape=tuple(input_shapes) if input_shapes else None,
523
+ output_shape=result.shape,
524
+ completeness=CompletenessLevel.FULL,
525
+ )
526
+
527
+ # Store mapping
528
+ mapping = ConcatMapping(
529
+ step_id=step_id,
530
+ out_rids=all_rids.copy(),
531
+ source_indices=all_source_idx.copy(),
532
+ out_rids_sorted=out_rids_sorted,
533
+ out_pos_sorted=out_pos_sorted,
534
+ source_shapes=list(input_shapes),
535
+ )
536
+ store.concat_mappings.append(mapping)
537
+
538
+ return result
539
+
540
+
541
+ def _concat_axis1_with_provenance(result, source_data, input_shapes, code_file, code_line, ctx):
542
+ """
543
+ Handle axis=1 concat with best-effort provenance.
544
+
545
+ If all inputs have identical RID arrays, propagate them (FULL).
546
+ Otherwise, mark as PARTIAL and register new RIDs.
547
+ """
548
+ row_mgr = ctx.row_manager
549
+ store = ctx.store
550
+
551
+ # Check if all inputs have the same RIDs in same order
552
+ first_rids = source_data[0][0]
553
+ all_same = all(
554
+ len(sd[0]) == len(first_rids) and np.array_equal(sd[0], first_rids) for sd in source_data
555
+ )
556
+
557
+ if all_same and len(first_rids) == len(result):
558
+ # All inputs have identical RIDs - propagate them
559
+ row_mgr.set_result_rids(result, first_rids.copy())
560
+ completeness = CompletenessLevel.FULL
561
+ else:
562
+ # Misaligned or different RIDs - register new RIDs
563
+ row_mgr.register(result)
564
+ completeness = CompletenessLevel.PARTIAL
565
+
566
+ store.append_step(
567
+ operation="pd.concat",
568
+ stage=ctx.current_stage,
569
+ code_file=code_file,
570
+ code_line=code_line,
571
+ params={
572
+ "axis": 1,
573
+ "n_inputs": len(source_data),
574
+ },
575
+ input_shape=tuple(input_shapes) if input_shapes else None,
576
+ output_shape=result.shape,
577
+ completeness=completeness,
578
+ )
579
+
580
+ return result
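Together, the two helpers above mean an axis=0 concat no longer resets lineage: source RIDs flow into the result and each row remembers which input frame it came from. A hedged end-to-end sketch (illustrative data; assumes an enabled session so this `pd.concat` is the wrapped one):

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")

jan = pd.DataFrame({"customer_id": ["C1", "C2"], "amount": [10, 20]})
feb = pd.DataFrame({"customer_id": ["C3"], "amount": [30]})

combined = pd.concat([jan, feb], ignore_index=True)   # axis=0 -> FULL provenance

# The last row should report that it came from the second input frame (index 1).
r = tp.trace(combined, row=-1)
print(r.origin)   # expected shape: {"type": "concat", "source_df": 1, "step_id": ...}
```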
tracepipe/snapshot.py CHANGED
@@ -25,7 +25,7 @@ Usage:
25
25
 
26
26
  import json
27
27
  import time
28
- from dataclasses import dataclass
28
+ from dataclasses import dataclass, field
29
29
  from pathlib import Path
30
30
  from typing import Any, Optional
31
31
 
@@ -297,6 +297,20 @@ class DiffResult:
297
297
  recovered_rows: set[int]
298
298
  drops_delta: dict[str, int] # op -> change in count
299
299
  stats_changes: dict[str, dict[str, Any]] # col -> {metric: (old, new)}
300
+ # Column changes
301
+ columns_added: list[str] = field(default_factory=list)
302
+ columns_removed: list[str] = field(default_factory=list)
303
+ # Cell-level changes (only populated if both snapshots have include_values=True)
304
+ cells_changed: int = 0 # Total modified cells
305
+ changed_rows: set[int] = field(default_factory=set) # IDs of rows with value changes
306
+ changes_by_column: dict[str, int] = field(default_factory=dict) # col -> count
307
+
308
+ @property
309
+ def rows_unchanged(self) -> int:
310
+ """Number of rows that exist in both snapshots (may have value changes)."""
311
+ # This is computed from the rows that weren't added or removed
312
+ # Note: This is an estimate based on the smaller snapshot
313
+ return 0 # Will be set during diff computation
300
314
 
301
315
  def __repr__(self) -> str:
302
316
  lines = ["Snapshot Diff:"]
@@ -310,6 +324,18 @@ class DiffResult:
310
324
  if self.recovered_rows:
311
325
  lines.append(f" * {len(self.recovered_rows)} recovered")
312
326
 
327
+ if self.columns_added:
328
+ lines.append(f" Columns added: {', '.join(self.columns_added)}")
329
+ if self.columns_removed:
330
+ lines.append(f" Columns removed: {', '.join(self.columns_removed)}")
331
+
332
+ if self.cells_changed > 0:
333
+ lines.append("\n Changes:")
334
+ lines.append(f" - {self.cells_changed} cells modified")
335
+ if self.changes_by_column:
336
+ for col, count in sorted(self.changes_by_column.items(), key=lambda x: -x[1])[:5]:
337
+ lines.append(f" {col}: {count}")
338
+
313
339
  if self.drops_delta:
314
340
  lines.append(" Drop changes by operation:")
315
341
  for op, delta in sorted(self.drops_delta.items(), key=lambda x: -abs(x[1])):
@@ -339,6 +365,9 @@ class DiffResult:
339
365
  or self.recovered_rows
340
366
  or self.drops_delta
341
367
  or self.stats_changes
368
+ or self.columns_added
369
+ or self.columns_removed
370
+ or self.cells_changed
342
371
  )
343
372
 
344
373
  def to_dict(self) -> dict:
@@ -350,6 +379,11 @@ class DiffResult:
350
379
  "recovered_rows": list(self.recovered_rows),
351
380
  "drops_delta": self.drops_delta,
352
381
  "stats_changes": self.stats_changes,
382
+ "columns_added": self.columns_added,
383
+ "columns_removed": self.columns_removed,
384
+ "cells_changed": self.cells_changed,
385
+ "changed_rows": list(self.changed_rows),
386
+ "changes_by_column": self.changes_by_column,
353
387
  }
354
388
 
355
389
 
@@ -359,6 +393,9 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
359
393
 
360
394
  Note: Cross-run diff is SUMMARY-ONLY unless keys are stored.
361
395
  Row-level comparison only works within same session (same RID assignment).
396
+
397
+ For cell-level diff (cells_changed, changes_by_column), both snapshots
398
+ must have been created with include_values=True.
362
399
  """
363
400
  rows_added = current.row_ids - baseline.row_ids
364
401
  rows_removed = baseline.row_ids - current.row_ids
@@ -375,9 +412,15 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
375
412
  if old != new:
376
413
  drops_delta[op] = new - old
377
414
 
415
+ # Column changes
416
+ baseline_cols = set(baseline.column_stats.keys())
417
+ current_cols = set(current.column_stats.keys())
418
+ columns_added = sorted(current_cols - baseline_cols)
419
+ columns_removed = sorted(baseline_cols - current_cols)
420
+
378
421
  # Stats changes
379
422
  stats_changes: dict[str, dict[str, Any]] = {}
380
- all_cols = set(baseline.column_stats.keys()) | set(current.column_stats.keys())
423
+ all_cols = baseline_cols | current_cols
381
424
  for col in all_cols:
382
425
  old_stats = baseline.column_stats.get(col)
383
426
  new_stats = current.column_stats.get(col)
@@ -396,6 +439,43 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
396
439
  if changes:
397
440
  stats_changes[col] = changes
398
441
 
442
+ # Cell-level changes (only if both snapshots have watched data)
443
+ cells_changed = 0
444
+ changed_rows: set[int] = set()
445
+ changes_by_column: dict[str, int] = {}
446
+
447
+ if baseline.watched_data is not None and current.watched_data is not None:
448
+ # Find common rows and columns
449
+ common_rows = baseline.row_ids & current.row_ids
450
+ common_cols = set(baseline.watched_data.columns) & set(current.watched_data.columns)
451
+
452
+ for rid in common_rows:
453
+ for col in common_cols:
454
+ old_val = baseline.watched_data.get_value(int(rid), col)
455
+ new_val = current.watched_data.get_value(int(rid), col)
456
+
457
+ # Compare values (handle NaN)
458
+ values_equal = False
459
+ if old_val is None and new_val is None:
460
+ values_equal = True
461
+ elif old_val is not None and new_val is not None:
462
+ try:
463
+ # Handle NaN comparison
464
+ if isinstance(old_val, float) and isinstance(new_val, float):
465
+ if old_val != old_val and new_val != new_val: # Both NaN
466
+ values_equal = True
467
+ else:
468
+ values_equal = old_val == new_val
469
+ else:
470
+ values_equal = old_val == new_val
471
+ except (TypeError, ValueError):
472
+ values_equal = str(old_val) == str(new_val)
473
+
474
+ if not values_equal:
475
+ cells_changed += 1
476
+ changed_rows.add(rid)
477
+ changes_by_column[col] = changes_by_column.get(col, 0) + 1
478
+
399
479
  return DiffResult(
400
480
  rows_added=rows_added,
401
481
  rows_removed=rows_removed,
@@ -403,6 +483,11 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
403
483
  recovered_rows=recovered_rows,
404
484
  drops_delta=drops_delta,
405
485
  stats_changes=stats_changes,
486
+ columns_added=columns_added,
487
+ columns_removed=columns_removed,
488
+ cells_changed=cells_changed,
489
+ changed_rows=changed_rows,
490
+ changes_by_column=changes_by_column,
406
491
  )
407
492
 
408
493
 
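A sketch of the cell-level diff in use. Per the note in `diff()`, both snapshots need value capture; the `include_values=True` keyword below is taken from that note, so treat the exact `snapshot()` signature as an assumption:

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")

df = pd.DataFrame({"email": ["a@x.com", None], "score": [1.0, 2.0]})
baseline = tp.snapshot(df, include_values=True)     # keyword assumed, see note above

df["email"] = df["email"].fillna("unknown@x.com")   # one cell changes
df["bonus"] = df["score"] * 0.1                     # one column added

current = tp.snapshot(df, include_values=True)
d = tp.diff(baseline, current)

print(d.columns_added)        # expected: ["bonus"]
print(d.cells_changed)        # 1 if "email" is captured in watched_data, else 0
print(d.changes_by_column)    # e.g. {"email": 1}
print(d.to_dict())            # JSON-friendly export including the new fields
```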
tracepipe/storage/lineage_store.py CHANGED
@@ -22,6 +22,8 @@ from ..core import (
22
22
  AggregationMapping,
23
23
  ChangeType,
24
24
  CompletenessLevel,
25
+ ConcatMapping,
26
+ DuplicateDropMapping,
25
27
  LineageGap,
26
28
  LineageGaps,
27
29
  MergeMapping,
@@ -100,6 +102,12 @@ class InMemoryLineageStore:
100
102
  self.merge_mappings: list[MergeMapping] = []
101
103
  self.merge_stats: list[tuple[int, MergeStats]] = []
102
104
 
105
+ # === CONCAT TRACKING ===
106
+ self.concat_mappings: list[ConcatMapping] = []
107
+
108
+ # === DUPLICATE DROP TRACKING (debug mode) ===
109
+ self.duplicate_drop_mappings: list[DuplicateDropMapping] = []
110
+
103
111
  # === AGGREGATION MAPPINGS ===
104
112
  self.aggregation_mappings: list[AggregationMapping] = []
105
113
 
@@ -361,6 +369,74 @@ class InMemoryLineageStore:
361
369
  return [(sid, s) for sid, s in self.merge_stats if sid == step_id]
362
370
  return list(self.merge_stats)
363
371
 
372
+ # === CONCAT LOOKUP (O(log n) via searchsorted) ===
373
+
374
+ def _binary_search_mapping(
375
+ self, sorted_rids: Optional[np.ndarray], target_rid: int
376
+ ) -> Optional[int]:
377
+ """
378
+ Return index in sorted array, or None if not found.
379
+
380
+ Robust to None/empty arrays and dtype mismatches.
381
+ """
382
+ if sorted_rids is None or len(sorted_rids) == 0:
383
+ return None
384
+
385
+ target = np.int64(target_rid)
386
+ i = np.searchsorted(sorted_rids, target)
387
+
388
+ if i < len(sorted_rids) and sorted_rids[i] == target:
389
+ return int(i)
390
+ return None
391
+
392
+ def get_concat_origin(self, row_id: int) -> Optional[dict]:
393
+ """
394
+ Get which source DataFrame a row came from in a concat.
395
+
396
+ Uses binary search (O(log n)) on sorted RIDs.
397
+
398
+ Returns:
399
+ {step_id, source_index, source_shape, position} if found, else None.
400
+ """
401
+ for mapping in self.concat_mappings:
402
+ idx = self._binary_search_mapping(mapping.out_rids_sorted, row_id)
403
+ if idx is not None:
404
+ pos = int(mapping.out_pos_sorted[idx])
405
+ source_idx = int(mapping.source_indices[pos])
406
+ return {
407
+ "step_id": mapping.step_id,
408
+ "source_index": source_idx,
409
+ "source_shape": (
410
+ mapping.source_shapes[source_idx]
411
+ if source_idx < len(mapping.source_shapes)
412
+ else None
413
+ ),
414
+ "position": pos,
415
+ }
416
+ return None
417
+
418
+ # === DUPLICATE DROP LOOKUP (O(log n) via searchsorted) ===
419
+
420
+ def get_duplicate_representative(self, row_id: int) -> Optional[dict]:
421
+ """
422
+ Get which row replaced this one in drop_duplicates.
423
+
424
+ Returns:
425
+ {step_id, kept_rid, subset_columns, keep_strategy} if found, else None.
426
+ kept_rid is -1 if keep=False (no representative).
427
+ """
428
+ for mapping in self.duplicate_drop_mappings:
429
+ idx = self._binary_search_mapping(mapping.dropped_rids, row_id)
430
+ if idx is not None:
431
+ kept = int(mapping.kept_rids[idx])
432
+ return {
433
+ "step_id": mapping.step_id,
434
+ "kept_rid": kept if kept >= 0 else None,
435
+ "subset_columns": mapping.subset_columns,
436
+ "keep_strategy": mapping.keep_strategy,
437
+ }
438
+ return None
439
+
364
440
  # === MEMORY MANAGEMENT ===
365
441
 
366
442
  def _check_memory_and_spill(self) -> None:
@@ -567,17 +643,17 @@ class InMemoryLineageStore:
567
643
 
568
644
  def get_row_history_with_lineage(self, row_id: int, max_depth: int = 10) -> list[dict]:
569
645
  """
570
- Get row history including pre-merge parent history.
646
+ Get row history including pre-merge and pre-concat parent history.
571
647
 
572
- Follows merge lineage recursively to build complete cell provenance.
573
- This is essential for tracking changes that happened before merge operations.
648
+ Follows merge and concat lineage recursively to build complete cell provenance.
649
+ This is essential for tracking changes that happened before merge/concat operations.
574
650
 
575
651
  Deduplicates events by (col, old_val, new_val, operation) signature to prevent
576
652
  cross-pipeline contamination when multiple DataFrames share row IDs.
577
653
 
578
654
  Args:
579
655
  row_id: Row ID to trace
580
- max_depth: Maximum merge depth to follow (prevents infinite loops)
656
+ max_depth: Maximum lineage depth to follow (prevents infinite loops)
581
657
 
582
658
  Returns:
583
659
  List of UNIQUE events in chronological order, including parent row events.
@@ -592,12 +668,21 @@ class InMemoryLineageStore:
592
668
  events = []
593
669
 
594
670
  # Check if this row came from a merge
595
- origin = self.get_merge_origin(rid)
596
- if origin and origin["left_parent"] is not None:
671
+ merge_origin = self.get_merge_origin(rid)
672
+ if merge_origin and merge_origin["left_parent"] is not None:
597
673
  # Recursively get parent's history first (chronological order)
598
- parent_events = _collect_history(origin["left_parent"], depth + 1)
674
+ parent_events = _collect_history(merge_origin["left_parent"], depth + 1)
599
675
  events.extend(parent_events)
600
676
 
677
+ # Check if this row came from a concat
678
+ # For concat, parent_rid == rid (identity mapping), so we don't recurse
679
+ # But we record the concat step for completeness
680
+ concat_origin = self.get_concat_origin(rid)
681
+ if concat_origin:
682
+ # Concat preserves RIDs, so the "parent" is the same RID
683
+ # The concat step itself is recorded in the step events
684
+ pass
685
+
601
686
  # Add this row's direct events
602
687
  events.extend(self.get_row_history(rid))
603
688
 
tracepipe-0.3.5.dist-info/METADATA → tracepipe-0.4.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tracepipe
3
- Version: 0.3.5
3
+ Version: 0.4.2
4
4
  Summary: Row-level data lineage tracking for pandas pipelines
5
5
  Project-URL: Homepage, https://github.com/tracepipe/tracepipe
6
6
  Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
@@ -278,7 +278,7 @@ tp.enable(mode="debug") # Full lineage
278
278
 
279
279
  ## Known Limitations
280
280
 
281
- TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merge provenance** reliably. However, some patterns are not yet fully supported:
281
+ TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
282
282
 
283
283
  | Pattern | Status | Notes |
284
284
  |---------|--------|-------|
@@ -286,13 +286,10 @@ TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merg
286
286
  | `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
287
287
  | `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
288
288
  | `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
289
- | `pd.concat([df1, df2])` | ⚠️ Partial | Row IDs preserved, but no "source DataFrame" tracking |
290
- | `df.drop_duplicates(keep='last')` | ⚠️ Partial | Which row was kept is not tracked |
291
- | Sort + dedup patterns | ⚠️ Partial | "Latest record wins" logic not traced |
292
-
293
- **Why?** TracePipe tracks value changes within rows, not row-selection operations. When `drop_duplicates` picks one row over another, that's a provenance decision (not a cell mutation) that isn't currently instrumented.
294
-
295
- **Planned for 0.4**: Full row-provenance tracking for concat, drop_duplicates, and sort operations.
289
+ | `pd.concat([df1, df2])` | Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
290
+ | `df.drop_duplicates()` | Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
291
+ | `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
292
+ | Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
296
293
 
297
294
  ---
298
295
 
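A hedged sketch of the newly tracked `drop_duplicates` provenance from the user-facing API (illustrative data; debug mode required per the table above):

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
deduped = df.drop_duplicates(subset=["key"], keep="first")

# The second "a" row was dropped; in debug mode its representative is recorded.
dropped = tp.debug.inspect().dropped_rows()
if dropped:
    r = tp.trace(deduped, row_id=list(dropped)[0])
    print(r.status)           # "dropped"
    print(r.dropped_by)       # "drop_duplicates"
    print(r.representative)   # e.g. {"kept_rid": <RID of the first "a">, "subset": ("key",), "keep": "first"}
```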
tracepipe-0.3.5.dist-info/RECORD → tracepipe-0.4.2.dist-info/RECORD CHANGED
@@ -1,29 +1,29 @@
1
- tracepipe/__init__.py,sha256=HK7i2rACJQdbyz5oMZ4z-xo9xJbS0cUqbS2AK6uMHJU,3342
1
+ tracepipe/__init__.py,sha256=cocA8ETqC1IGgDCXvxue9M4QVzIt8C981b6NTf9BXQ4,3342
2
2
  tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
3
3
  tracepipe/context.py,sha256=DvwAZGZbLDJ4xoqmS1VnZOBbOI8ZIIErsY9W6GEbFSM,4051
4
4
  tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
5
- tracepipe/convenience.py,sha256=KuDz_ZzNivVG1SS8Srr3plu4CTwFmNhYL4rk3vV6cbE,28421
6
- tracepipe/core.py,sha256=kAXks694rR0Z4tD7Gyty0TyJGWx2whsSdteYYpHuazo,8010
7
- tracepipe/debug.py,sha256=6t2GKVZLwn7SJLhrStE9qsmTiVIHATTE3jJPQ2DYtnc,10140
5
+ tracepipe/convenience.py,sha256=ALRtVn6tLfa7Ks7d9hKVJfhLjOLuyFgxTwSoUL0BgHY,38241
6
+ tracepipe/core.py,sha256=O1QFJFTSszxKY1pVR1XLrdA0zez7t5KQLjyq6V36ARk,10883
7
+ tracepipe/debug.py,sha256=S3ga3rVHjDSV4OctkF5uEAQlzjOxFJO8RGC81awGboA,11397
8
8
  tracepipe/safety.py,sha256=UpzhQj31Dij-DjgT9mY_jPrUpVfcA51gDI2fUos4IUA,6694
9
- tracepipe/snapshot.py,sha256=OLREzE1_LkWITluG_Bqeb7Y4pAKb8Lb3zJEF3cxnloU,13967
9
+ tracepipe/snapshot.py,sha256=kvW8be1EAAsyHefXxJPgIQAAYT_FwK167SMxeQcsra4,17921
10
10
  tracepipe/value_provenance.py,sha256=ogky6aOaZ-6K2uNBQxlXpmCeuvK434Hisj30zesRTd8,9330
11
11
  tracepipe/instrumentation/__init__.py,sha256=pd0n6Z9m_V3gcBv097cXWFOZEzAP9sAq1jjQnNRrDZ8,222
12
12
  tracepipe/instrumentation/apply_capture.py,sha256=cMThWzNXqWQENuMrCGTne1hO6fqaQFV7zJYNpsPTW4w,14463
13
- tracepipe/instrumentation/filter_capture.py,sha256=onlYLU5bBZSM3WmxM2AFHfktnlx7ReG-brEn5eZ_N10,15830
13
+ tracepipe/instrumentation/filter_capture.py,sha256=aN8-Ev6kbDR8f9A9JVy236VK0iqNxpMvki3pbtUkBYQ,19445
14
14
  tracepipe/instrumentation/indexer_capture.py,sha256=1ATCeJ-uNA1uGiSbgnUx0wdVsIlZGHeUBaFJPXgFQNg,28440
15
- tracepipe/instrumentation/merge_capture.py,sha256=Eze-PTrn7IXxZRZBYX9R13mOY3diWKAkjp4z-wa1tEk,13349
15
+ tracepipe/instrumentation/merge_capture.py,sha256=zqa6SY5YLbr-N7PPTdE6TYKyJIZcPqT02d1Ifvi3Jdw,18359
16
16
  tracepipe/instrumentation/pandas_inst.py,sha256=h8RlfwYkYwuftCyBYIETdwHxVCzQM1SBBrbYP7SyjJ8,30047
17
17
  tracepipe/instrumentation/series_capture.py,sha256=i7FiA2ndEzS6duIj5y-a7SDfIMl2cCY_jGC1tmG7TGU,11271
18
18
  tracepipe/storage/__init__.py,sha256=pGFMfbIgIi2kofVPwYDqe2HTYMYJoabiGjTq77pYi-g,348
19
19
  tracepipe/storage/base.py,sha256=7DV_-rp37DjBMr9B1w85hLVYhC8OQShk2PcEhT-n4tE,4894
20
- tracepipe/storage/lineage_store.py,sha256=KhGri2uC_O_43fUivFGEHY6KBDHd1I0O_PPd_KD3L4M,28683
20
+ tracepipe/storage/lineage_store.py,sha256=1enRmDgnVjxW8Pu7WMHJ8WPnnbm-HsAm4e1dKsTvnIc,31943
21
21
  tracepipe/storage/row_identity.py,sha256=HBU0gTTJlFtFTcAdUCKuX-c9cHa0lo3CDIodDPDgOzA,17161
22
22
  tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,182
23
23
  tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
24
24
  tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
25
25
  tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
26
- tracepipe-0.3.5.dist-info/METADATA,sha256=bWidBs8nMW6T6oah8xQum_IjdP7Y1J1inDAn-gfHUCg,10288
27
- tracepipe-0.3.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
- tracepipe-0.3.5.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
29
- tracepipe-0.3.5.dist-info/RECORD,,
26
+ tracepipe-0.4.2.dist-info/METADATA,sha256=0nMQRfqFJCg1DMGjWzW_nlFcWMM-q8T4LfoqkMcYmAQ,10067
27
+ tracepipe-0.4.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
+ tracepipe-0.4.2.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
29
+ tracepipe-0.4.2.dist-info/RECORD,,