tracepipe 0.3.5-py3-none-any.whl → 0.4.1-py3-none-any.whl

tracepipe/__init__.py CHANGED
@@ -81,7 +81,7 @@ from .core import TracePipeConfig, TracePipeMode
81
81
  from .snapshot import DiffResult, Snapshot, diff, snapshot
82
82
 
83
83
  # === VERSION ===
84
- __version__ = "0.3.5"
84
+ __version__ = "0.4.1"
85
85
 
86
86
  # === MINIMAL __all__ ===
87
87
  __all__ = [
tracepipe/convenience.py CHANGED
@@ -54,6 +54,12 @@ class CheckResult:
54
54
 
55
55
  Separates FACTS (observed, high confidence) from HEURISTICS (inferred).
56
56
  .ok is True only if there are no FACT-level warnings.
57
+
58
+ Key properties for quick access:
59
+ .passed - Alias for .ok (common naming convention)
60
+ .retention - Row retention rate (0.0-1.0)
61
+ .n_dropped - Total rows dropped
62
+ .drops_by_op - Drops broken down by operation
57
63
  """
58
64
 
59
65
  ok: bool
@@ -61,6 +67,37 @@ class CheckResult:
61
67
  facts: dict[str, Any]
62
68
  suggestions: list[str]
63
69
  mode: str
70
+ # Internal: store drops_by_op so we don't need to recompute
71
+ _drops_by_op: dict[str, int] = field(default_factory=dict)
72
+
73
+ # === CONVENIENCE PROPERTIES ===
74
+
75
+ @property
76
+ def passed(self) -> bool:
77
+ """Alias for .ok (matches common naming convention)."""
78
+ return self.ok
79
+
80
+ @property
81
+ def retention(self) -> float | None:
82
+ """Row retention rate (0.0-1.0), or None if not computed."""
83
+ return self.facts.get("retention_rate")
84
+
85
+ @property
86
+ def n_dropped(self) -> int:
87
+ """Total number of rows dropped."""
88
+ return self.facts.get("rows_dropped", 0)
89
+
90
+ @property
91
+ def drops_by_op(self) -> dict[str, int]:
92
+ """Drops broken down by operation name."""
93
+ return self._drops_by_op
94
+
95
+ @property
96
+ def n_steps(self) -> int:
97
+ """Total pipeline steps recorded."""
98
+ return self.facts.get("total_steps", 0)
99
+
100
+ # === EXISTING PROPERTIES ===
64
101
 
65
102
  @property
66
103
  def has_warnings(self) -> bool:
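A sketch of how the new convenience properties might be read from a `CheckResult`; the zero-argument `tp.check()` call below is an assumption (only the properties themselves appear in this hunk), so treat it as illustrative usage rather than the library's documented API.

```python
import tracepipe as tp

# Hypothetical: obtain a CheckResult; the exact check() signature is not shown in this diff
result = tp.check()

if not result.passed:                       # .passed is an alias for .ok
    print("rows dropped:", result.n_dropped)
    if result.retention is not None:        # None when retention was not computed
        print(f"retention: {result.retention:.1%}")
    for op, n in result.drops_by_op.items():
        print(f"  {op}: {n} rows dropped")
```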
@@ -115,7 +152,12 @@ class CheckResult:
115
152
  """Export to dictionary."""
116
153
  return {
117
154
  "ok": self.ok,
155
+ "passed": self.passed,
118
156
  "mode": self.mode,
157
+ "retention": self.retention,
158
+ "n_dropped": self.n_dropped,
159
+ "n_steps": self.n_steps,
160
+ "drops_by_op": self.drops_by_op,
119
161
  "facts": self.facts,
120
162
  "suggestions": self.suggestions,
121
163
  "warnings": [
@@ -147,6 +189,10 @@ class TraceResult:
147
189
 
148
190
  Answers: "What happened to this row?"
149
191
  Events are in CHRONOLOGICAL order (oldest->newest).
192
+
193
+ Key attributes:
194
+ origin: Where this row came from (concat, merge, or original)
195
+ representative: If dropped by dedup, which row was kept instead
150
196
  """
151
197
 
152
198
  row_id: int
@@ -158,22 +204,70 @@ class TraceResult:
158
204
  # Mode enforcement
159
205
  supported: bool = True
160
206
  unsupported_reason: str | None = None
207
+ # v0.4+ provenance
208
+ concat_origin: dict[str, Any] | None = None
209
+ dedup_representative: dict[str, Any] | None = None
161
210
 
162
211
  @property
163
212
  def n_events(self) -> int:
164
213
  return len(self.events)
165
214
 
215
+ @property
216
+ def origin(self) -> dict[str, Any] | None:
217
+ """
218
+ Unified origin info: where did this row come from?
219
+
220
+ Returns dict with 'type' key:
221
+ - {"type": "concat", "source_df": 1, "step_id": 5}
222
+ - {"type": "merge", "left_parent": 10, "right_parent": 20, "step_id": 3}
223
+ - None if original row (not from concat/merge)
224
+ """
225
+ if self.concat_origin:
226
+ return {
227
+ "type": "concat",
228
+ "source_df": self.concat_origin.get("source_index"),
229
+ "step_id": self.concat_origin.get("step_id"),
230
+ }
231
+ if self.merge_origin:
232
+ return {
233
+ "type": "merge",
234
+ "left_parent": self.merge_origin.get("left_parent"),
235
+ "right_parent": self.merge_origin.get("right_parent"),
236
+ "step_id": self.merge_origin.get("step_id"),
237
+ }
238
+ return None
239
+
240
+ @property
241
+ def representative(self) -> dict[str, Any] | None:
242
+ """
243
+ If dropped by drop_duplicates, which row was kept instead?
244
+
245
+ Returns:
246
+ {"kept_rid": 42, "subset": ["key"], "keep": "first"} or None
247
+ kept_rid is None if keep=False (all duplicates dropped)
248
+ """
249
+ if not self.dedup_representative:
250
+ return None
251
+ return {
252
+ "kept_rid": self.dedup_representative.get("kept_rid"),
253
+ "subset": self.dedup_representative.get("subset_columns"),
254
+ "keep": self.dedup_representative.get("keep_strategy"),
255
+ }
256
+
166
257
  def to_dict(self) -> dict:
167
258
  """Export to dictionary."""
168
259
  return {
169
260
  "row_id": self.row_id,
170
261
  "is_alive": self.is_alive,
171
262
  "dropped_at": self.dropped_at,
172
- "merge_origin": self.merge_origin,
263
+ "origin": self.origin,
264
+ "representative": self.representative,
173
265
  "n_events": self.n_events,
174
266
  "events": self.events,
175
267
  "ghost_values": self.ghost_values,
176
268
  "supported": self.supported,
269
+ # Keep legacy fields for backwards compatibility
270
+ "merge_origin": self.merge_origin,
177
271
  }
178
272
 
179
273
  def __repr__(self) -> str:
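A sketch of consuming `.origin` and `.representative`; the `tp.trace(42)` call is hypothetical (the public `trace()` signature is outside this hunk), but the dict shapes follow the docstrings above.

```python
import tracepipe as tp

# Hypothetical: obtain a TraceResult for row 42; the public trace() signature is not shown here
tr = tp.trace(42)

if tr.origin is not None:
    if tr.origin["type"] == "concat":
        print("came from source DataFrame", tr.origin["source_df"])
    elif tr.origin["type"] == "merge":
        print("merged from rows", tr.origin["left_parent"], "and", tr.origin["right_parent"])

if tr.representative is not None:
    if tr.representative["kept_rid"] is not None:
        print("dropped in favour of row", tr.representative["kept_rid"])
    else:
        print("dropped with keep=False (no representative)")
```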
@@ -195,10 +289,28 @@ class TraceResult:
195
289
  f" at step {self.dropped_at['step_id']}: {self.dropped_at['operation']}"
196
290
  )
197
291
 
198
- if self.merge_origin:
199
- left = self.merge_origin.get("left_parent", "?")
200
- right = self.merge_origin.get("right_parent", "?")
201
- lines.append(f" Origin: merge of row {left} (left) + row {right} (right)")
292
+ # Display unified origin info
293
+ origin = self.origin
294
+ if origin:
295
+ if origin["type"] == "merge":
296
+ left = origin.get("left_parent", "?")
297
+ right = origin.get("right_parent", "?")
298
+ lines.append(f" Origin: merge of row {left} (left) + row {right} (right)")
299
+ elif origin["type"] == "concat":
300
+ src = origin.get("source_df", "?")
301
+ lines.append(f" Origin: concat from DataFrame #{src}")
302
+
303
+ # Display dedup representative if dropped by dedup
304
+ if self.representative:
305
+ kept = self.representative.get("kept_rid")
306
+ subset = self.representative.get("subset")
307
+ keep = self.representative.get("keep", "first")
308
+ if kept is not None:
309
+ subset_str = f" (key: {subset})" if subset else ""
310
+ lines.append(f" Replaced by: row {kept}{subset_str} [keep={keep}]")
311
+ else:
312
+ subset_str = f" on {subset}" if subset else ""
313
+ lines.append(f" Dropped: all duplicates removed{subset_str} [keep=False]")
202
314
 
203
315
  if len(self.events) == 0:
204
316
  lines.append("\n Events: 0 (no changes to watched columns)")
@@ -462,8 +574,8 @@ def check(
462
574
  )
463
575
  )
464
576
 
465
- drops_by_step = ctx.store.get_dropped_by_step()
466
- for op, count in drops_by_step.items():
577
+ drops_by_op = ctx.store.get_dropped_by_step()
578
+ for op, count in drops_by_op.items():
467
579
  if count > 1000:
468
580
  suggestions.append(f"'{op}' dropped {count} rows - review if intentional")
469
581
 
@@ -475,6 +587,7 @@ def check(
475
587
  facts=facts,
476
588
  suggestions=suggestions,
477
589
  mode=ctx.config.mode.value,
590
+ _drops_by_op=drops_by_op,
478
591
  )
479
592
 
480
593
 
@@ -787,6 +900,14 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
787
900
  drop_event = store.get_drop_event(row_id)
788
901
  merge_origin = store.get_merge_origin(row_id)
789
902
 
903
+ # v0.4+ provenance: concat origin and dedup representative
904
+ concat_origin = None
905
+ dedup_representative = None
906
+ if hasattr(store, "get_concat_origin"):
907
+ concat_origin = store.get_concat_origin(row_id)
908
+ if hasattr(store, "get_duplicate_representative"):
909
+ dedup_representative = store.get_duplicate_representative(row_id)
910
+
790
911
  # Use lineage-aware history to include pre-merge parent events
791
912
  if hasattr(store, "get_row_history_with_lineage"):
792
913
  history = store.get_row_history_with_lineage(row_id)
@@ -823,6 +944,8 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
823
944
  merge_origin=merge_origin,
824
945
  events=history,
825
946
  ghost_values=ghost_values,
947
+ concat_origin=concat_origin,
948
+ dedup_representative=dedup_representative,
826
949
  )
827
950
 
828
951
 
tracepipe/core.py CHANGED
@@ -277,3 +277,82 @@ class MergeStats:
277
277
  left_dup_rate: float # -1 if not computed
278
278
  right_dup_rate: float # -1 if not computed
279
279
  how: str
280
+
281
+
282
+ @dataclass
283
+ class ConcatMapping:
284
+ """
285
+ Mapping for pd.concat operations preserving row lineage.
286
+
287
+ For axis=0 concat, each result row comes from exactly one source DataFrame.
288
+ Arrays are stored in both positional order (for "explain row i") and
289
+ sorted order (for O(log n) RID lookup).
290
+
291
+ Invariants:
292
+ - out_rids and source_indices have same length
293
+ - out_rids_sorted and out_pos_sorted are always paired (both set or both None)
294
+ - out_rids_sorted is monotonically increasing
295
+ """
296
+
297
+ step_id: int
298
+
299
+ # Positional arrays (match result row order)
300
+ out_rids: Any # numpy array, len = len(result)
301
+ source_indices: Any # numpy array, which source DF (0, 1, 2...) each row came from
302
+
303
+ # Sorted arrays (for O(log n) lookup by RID)
304
+ out_rids_sorted: Any # numpy array, SORTED
305
+ out_pos_sorted: Any # numpy array, original positions aligned with out_rids_sorted
306
+
307
+ # Metadata
308
+ source_shapes: list[tuple] = field(default_factory=list)
309
+
310
+ def __post_init__(self):
311
+ """Validate invariants."""
312
+ import numpy as np
313
+
314
+ if self.out_rids_sorted is not None and self.out_pos_sorted is not None:
315
+ if len(self.out_rids_sorted) != len(self.out_pos_sorted):
316
+ raise ValueError("out_rids_sorted and out_pos_sorted must have same length")
317
+ # Verify monotonic (debug check)
318
+ if len(self.out_rids_sorted) > 1:
319
+ assert np.all(
320
+ self.out_rids_sorted[:-1] <= self.out_rids_sorted[1:]
321
+ ), "out_rids_sorted must be monotonically increasing"
322
+
323
+
324
+ @dataclass
325
+ class DuplicateDropMapping:
326
+ """
327
+ Mapping for drop_duplicates provenance (debug mode only).
328
+
329
+ Tracks which rows were dropped and which "representative" row they lost to.
330
+ Arrays are sorted by dropped_rids for O(log n) lookup.
331
+
332
+ For keep='first': dropped rows map to first occurrence
333
+ For keep='last': dropped rows map to last occurrence
334
+ For keep=False: dropped rows have kept_rids=-1 (no representative)
335
+ """
336
+
337
+ step_id: int
338
+
339
+ # Sorted arrays for O(log n) lookup
340
+ dropped_rids: Any # numpy array, SORTED dropped row IDs
341
+ kept_rids: Any # numpy array, representative RID for each dropped row (-1 if none)
342
+
343
+ # Metadata
344
+ subset_columns: Optional[tuple[str, ...]] = None
345
+ keep_strategy: str = "first"
346
+
347
+ def __post_init__(self):
348
+ """Validate invariants."""
349
+ import numpy as np
350
+
351
+ if self.dropped_rids is not None and self.kept_rids is not None:
352
+ if len(self.dropped_rids) != len(self.kept_rids):
353
+ raise ValueError("dropped_rids and kept_rids must have same length")
354
+ # Verify sorted
355
+ if len(self.dropped_rids) > 1:
356
+ assert np.all(
357
+ self.dropped_rids[:-1] <= self.dropped_rids[1:]
358
+ ), "dropped_rids must be sorted"
tracepipe/instrumentation/filter_capture.py CHANGED
@@ -24,7 +24,7 @@ import numpy as np
24
24
  import pandas as pd
25
25
 
26
26
  from ..context import TracePipeContext, get_context
27
- from ..core import CompletenessLevel
27
+ from ..core import CompletenessLevel, DuplicateDropMapping
28
28
  from ..safety import TracePipeWarning, get_caller_info
29
29
 
30
30
  # ============ MASK DERIVATION FUNCTIONS ============
@@ -97,6 +97,95 @@ def derive_drop_duplicates_mask(
97
97
  return kept_mask.values, completeness
98
98
 
99
99
 
100
+ def derive_drop_duplicates_provenance(
101
+ df: pd.DataFrame,
102
+ source_rids: np.ndarray,
103
+ subset: Optional[list[str]],
104
+ keep: str,
105
+ ) -> Optional[DuplicateDropMapping]:
106
+ """
107
+ Derive dropped->kept mapping for drop_duplicates (debug mode only).
108
+
109
+ Uses hash_pandas_object for NaN-safe, fast key comparison.
110
+ Uses vectorized groupby min/max for representative selection.
111
+
112
+ Args:
113
+ df: Source DataFrame
114
+ source_rids: Row IDs for each row in df
115
+ subset: Columns to consider for duplicates (None = all)
116
+ keep: 'first', 'last', or False
117
+
118
+ Returns:
119
+ DuplicateDropMapping if any rows were dropped, else None.
120
+ """
121
+ n = len(df)
122
+ if n == 0:
123
+ return None
124
+
125
+ # Determine columns to hash
126
+ if subset is None:
127
+ hash_df = df
128
+ valid_cols = tuple(df.columns)
129
+ else:
130
+ valid_cols = tuple(c for c in subset if c in df.columns)
131
+ if not valid_cols:
132
+ return None
133
+ hash_df = df[list(valid_cols)]
134
+
135
+ # Use hash_pandas_object for fast, NaN-safe key hashing
136
+ try:
137
+ h = pd.util.hash_pandas_object(hash_df, index=False)
138
+ codes, _ = pd.factorize(h, sort=False)
139
+ except Exception:
140
+ # Fallback: can't hash, skip provenance
141
+ return None
142
+
143
+ # Compute kept mask using pandas (ground truth)
144
+ kept_mask = ~df.duplicated(subset=list(valid_cols) if valid_cols else None, keep=keep)
145
+ dropped_mask = ~kept_mask.values
146
+
147
+ if not dropped_mask.any():
148
+ return None # No duplicates dropped
149
+
150
+ dropped_positions = np.where(dropped_mask)[0]
151
+ dropped_rids = source_rids[dropped_positions]
152
+
153
+ # Find representative positions using vectorized groupby min/max
154
+ positions = np.arange(n, dtype=np.int64)
155
+
156
+ if keep == "first":
157
+ # Representative = first occurrence of each group
158
+ rep_pos = pd.Series(positions).groupby(codes).min().to_numpy()
159
+ elif keep == "last":
160
+ # Representative = last occurrence of each group
161
+ rep_pos = pd.Series(positions).groupby(codes).max().to_numpy()
162
+ else:
163
+ # keep=False: no representative (all duplicates dropped)
164
+ rep_pos = None
165
+
166
+ # Build kept_rids array
167
+ if rep_pos is not None:
168
+ dropped_codes = codes[dropped_positions]
169
+ kept_positions = rep_pos[dropped_codes]
170
+ kept_rids = source_rids[kept_positions]
171
+ else:
172
+ # keep=False: no representative
173
+ kept_rids = np.full(len(dropped_rids), -1, dtype=np.int64)
174
+
175
+ # Sort by dropped_rids for O(log n) lookup
176
+ sort_order = np.argsort(dropped_rids)
177
+ dropped_rids_sorted = dropped_rids[sort_order].copy()
178
+ kept_rids_sorted = kept_rids[sort_order].copy()
179
+
180
+ return DuplicateDropMapping(
181
+ step_id=-1, # Will be set by caller
182
+ dropped_rids=dropped_rids_sorted,
183
+ kept_rids=kept_rids_sorted,
184
+ subset_columns=valid_cols if valid_cols else None,
185
+ keep_strategy=str(keep),
186
+ )
187
+
188
+
100
189
  def derive_query_mask(
101
190
  df: pd.DataFrame, args: tuple, kwargs: dict
102
191
  ) -> tuple[Optional[np.ndarray], CompletenessLevel]:
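A standalone sketch of the representative-selection trick used in `derive_drop_duplicates_provenance` above, on a made-up frame; it follows the same hash/factorize/groupby steps with plain pandas and numpy.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "b", "a", "b", "a"]})
source_rids = np.array([100, 101, 102, 103, 104], dtype=np.int64)

# NaN-safe group keys: hash each row of the subset, then factorize into dense group codes
h = pd.util.hash_pandas_object(df[["key"]], index=False)
codes, _ = pd.factorize(h, sort=False)

# keep='first': the representative of each group is its earliest position
positions = np.arange(len(df), dtype=np.int64)
rep_pos = pd.Series(positions).groupby(codes).min().to_numpy()

dropped_positions = np.where(df.duplicated(subset=["key"], keep="first").to_numpy())[0]
kept_rids = source_rids[rep_pos[codes[dropped_positions]]]

print({int(d): int(k) for d, k in zip(source_rids[dropped_positions], kept_rids)})
# -> {102: 100, 103: 101, 104: 100}
```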
@@ -257,12 +346,19 @@ def _capture_filter_with_mask(
257
346
  kept_mask: Optional[np.ndarray] = None
258
347
  positions: Optional[np.ndarray] = None
259
348
  completeness = CompletenessLevel.FULL
349
+ dedup_mapping: Optional[DuplicateDropMapping] = None
260
350
 
261
351
  if method_name == "dropna":
262
352
  kept_mask, completeness = derive_dropna_mask(source_df, args, kwargs)
263
353
 
264
354
  elif method_name == "drop_duplicates":
265
355
  kept_mask, completeness = derive_drop_duplicates_mask(source_df, args, kwargs)
356
+ # Compute provenance mapping in debug mode
357
+ dedup_mapping = None
358
+ if ctx.config.should_capture_merge_provenance:
359
+ subset = kwargs.get("subset", None)
360
+ keep = kwargs.get("keep", "first")
361
+ dedup_mapping = derive_drop_duplicates_provenance(source_df, source_rids, subset, keep)
266
362
 
267
363
  elif method_name == "query":
268
364
  kept_mask, completeness = derive_query_mask(source_df, args, kwargs)
@@ -359,6 +455,12 @@ def _capture_filter_with_mask(
359
455
  watched_columns=ctx.watched_columns,
360
456
  )
361
457
 
458
+ # === RECORD DROP_DUPLICATES PROVENANCE (debug mode) ===
459
+ if method_name == "drop_duplicates" and dedup_mapping is not None:
460
+ # Update step_id in the mapping and store it
461
+ dedup_mapping.step_id = step_id
462
+ store.duplicate_drop_mappings.append(dedup_mapping)
463
+
362
464
 
363
465
  def _propagate_by_index_fallback(
364
466
  row_mgr, source_df: pd.DataFrame, result_df: pd.DataFrame
tracepipe/instrumentation/merge_capture.py CHANGED
@@ -14,7 +14,7 @@ import numpy as np
14
14
  import pandas as pd
15
15
 
16
16
  from ..context import get_context
17
- from ..core import CompletenessLevel, MergeMapping, MergeStats
17
+ from ..core import CompletenessLevel, ConcatMapping, MergeMapping, MergeStats
18
18
  from ..safety import TracePipeWarning, get_caller_info
19
19
 
20
20
 
@@ -382,53 +382,199 @@ def wrap_join_with_lineage(original_join):
382
382
  def wrap_concat_with_lineage(original_concat):
383
383
  """
384
384
  Wrap pd.concat with lineage capture.
385
+
386
+ For axis=0 (vertical concat):
387
+ - Preserves row IDs from source DataFrames (FULL provenance)
388
+ - Tracks which source DataFrame each row came from
389
+
390
+ For axis=1 (horizontal concat):
391
+ - Propagates RIDs if all inputs have identical RID arrays
392
+ - Otherwise marks as PARTIAL
385
393
  """
386
394
 
387
395
  @wraps(original_concat)
388
396
  def wrapper(objs, *args, **kwargs):
389
397
  ctx = get_context()
390
398
 
391
- result = original_concat(objs, *args, **kwargs)
392
-
393
399
  if not ctx.enabled:
394
- return result
400
+ return original_concat(objs, *args, **kwargs)
401
+
402
+ axis = kwargs.get("axis", 0)
403
+
404
+ # === BEFORE: Capture source RIDs from all tracked DataFrames ===
405
+ source_data = [] # [(rids_copy, shape, original_index), ...]
406
+ try:
407
+ objs_list = list(objs) if hasattr(objs, "__iter__") else [objs]
408
+ except TypeError:
409
+ objs_list = [objs]
410
+
411
+ for i, obj in enumerate(objs_list):
412
+ if isinstance(obj, pd.DataFrame) and len(obj) > 0:
413
+ rids = ctx.row_manager.get_ids_array(obj)
414
+ if rids is None:
415
+ rids = ctx.row_manager.register(obj)
416
+ # IMPORTANT: Make a copy to avoid mutation issues
417
+ source_data.append((rids.copy(), obj.shape, i))
418
+
419
+ # === RUN ORIGINAL ===
420
+ try:
421
+ result = original_concat(objs_list, *args, **kwargs)
422
+ except Exception:
423
+ raise # Don't store mapping on failure
395
424
 
396
425
  if not isinstance(result, pd.DataFrame):
397
426
  return result
398
427
 
399
- try:
400
- row_mgr = ctx.row_manager
401
- store = ctx.store
428
+ row_mgr = ctx.row_manager
429
+ store = ctx.store
430
+ code_file, code_line = get_caller_info(skip_frames=2)
402
431
 
403
- # Register result
404
- row_mgr.register(result)
432
+ # Compute input shapes for step metadata
433
+ input_shapes = [sd[1] for sd in source_data]
405
434
 
406
- code_file, code_line = get_caller_info(skip_frames=2)
435
+ # === AXIS=0: Vertical concat with FULL provenance ===
436
+ if axis == 0 and source_data:
437
+ return _concat_axis0_with_provenance(
438
+ result, source_data, input_shapes, code_file, code_line, ctx
439
+ )
407
440
 
408
- # Compute input shapes
409
- input_shapes = []
410
- for obj in objs:
411
- if hasattr(obj, "shape"):
412
- input_shapes.append(obj.shape)
441
+ # === AXIS=1: Horizontal concat ===
442
+ elif axis == 1 and source_data:
443
+ return _concat_axis1_with_provenance(
444
+ result, source_data, input_shapes, code_file, code_line, ctx
445
+ )
413
446
 
447
+ # === FALLBACK: Unknown axis or no source data ===
448
+ else:
449
+ row_mgr.register(result)
414
450
  store.append_step(
415
451
  operation="pd.concat",
416
452
  stage=ctx.current_stage,
417
453
  code_file=code_file,
418
454
  code_line=code_line,
419
455
  params={
420
- "axis": kwargs.get("axis", 0),
421
- "n_inputs": len(objs) if hasattr(objs, "__len__") else 1,
456
+ "axis": axis,
457
+ "n_inputs": len(source_data),
422
458
  },
423
459
  input_shape=tuple(input_shapes) if input_shapes else None,
424
460
  output_shape=result.shape,
425
- completeness=CompletenessLevel.PARTIAL, # Concat resets lineage
461
+ completeness=CompletenessLevel.PARTIAL,
426
462
  )
427
- except Exception as e:
428
- if ctx.config.strict_mode:
429
- raise
430
- warnings.warn(f"TracePipe: Concat capture failed: {e}", TracePipeWarning)
463
+ return result
464
+
465
+ return wrapper
466
+
467
+
468
+ def _concat_axis0_with_provenance(result, source_data, input_shapes, code_file, code_line, ctx):
469
+ """
470
+ Handle axis=0 concat with FULL row provenance.
471
+
472
+ Preserves source RIDs and tracks which source DF each row came from.
473
+ """
474
+ row_mgr = ctx.row_manager
475
+ store = ctx.store
431
476
 
477
+ # Build concatenated RID array and source index array
478
+ all_rids = np.concatenate([sd[0] for sd in source_data])
479
+ all_source_idx = np.concatenate(
480
+ [np.full(len(sd[0]), sd[2], dtype=np.int32) for sd in source_data]
481
+ )
482
+
483
+ # Validate: length must match result
484
+ if len(all_rids) != len(result):
485
+ # Mismatch - some objects contributed differently (empty DFs, Series, etc.)
486
+ # Degrade gracefully to PARTIAL
487
+ row_mgr.register(result)
488
+ store.append_step(
489
+ operation="pd.concat",
490
+ stage=ctx.current_stage,
491
+ code_file=code_file,
492
+ code_line=code_line,
493
+ params={
494
+ "axis": 0,
495
+ "n_inputs": len(source_data),
496
+ "_length_mismatch": True,
497
+ },
498
+ input_shape=tuple(input_shapes) if input_shapes else None,
499
+ output_shape=result.shape,
500
+ completeness=CompletenessLevel.PARTIAL,
501
+ )
432
502
  return result
433
503
 
434
- return wrapper
504
+ # Propagate RIDs to result (preserving lineage!)
505
+ row_mgr.set_result_rids(result, all_rids.copy())
506
+
507
+ # Build sorted arrays for O(log n) lookup
508
+ sort_order = np.argsort(all_rids)
509
+ out_rids_sorted = all_rids[sort_order].copy()
510
+ out_pos_sorted = sort_order.copy()
511
+
512
+ # Record step with FULL completeness
513
+ step_id = store.append_step(
514
+ operation="pd.concat",
515
+ stage=ctx.current_stage,
516
+ code_file=code_file,
517
+ code_line=code_line,
518
+ params={
519
+ "axis": 0,
520
+ "n_inputs": len(source_data),
521
+ },
522
+ input_shape=tuple(input_shapes) if input_shapes else None,
523
+ output_shape=result.shape,
524
+ completeness=CompletenessLevel.FULL,
525
+ )
526
+
527
+ # Store mapping
528
+ mapping = ConcatMapping(
529
+ step_id=step_id,
530
+ out_rids=all_rids.copy(),
531
+ source_indices=all_source_idx.copy(),
532
+ out_rids_sorted=out_rids_sorted,
533
+ out_pos_sorted=out_pos_sorted,
534
+ source_shapes=list(input_shapes),
535
+ )
536
+ store.concat_mappings.append(mapping)
537
+
538
+ return result
539
+
540
+
541
+ def _concat_axis1_with_provenance(result, source_data, input_shapes, code_file, code_line, ctx):
542
+ """
543
+ Handle axis=1 concat with best-effort provenance.
544
+
545
+ If all inputs have identical RID arrays, propagate them (FULL).
546
+ Otherwise, mark as PARTIAL and register new RIDs.
547
+ """
548
+ row_mgr = ctx.row_manager
549
+ store = ctx.store
550
+
551
+ # Check if all inputs have the same RIDs in same order
552
+ first_rids = source_data[0][0]
553
+ all_same = all(
554
+ len(sd[0]) == len(first_rids) and np.array_equal(sd[0], first_rids) for sd in source_data
555
+ )
556
+
557
+ if all_same and len(first_rids) == len(result):
558
+ # All inputs have identical RIDs - propagate them
559
+ row_mgr.set_result_rids(result, first_rids.copy())
560
+ completeness = CompletenessLevel.FULL
561
+ else:
562
+ # Misaligned or different RIDs - register new RIDs
563
+ row_mgr.register(result)
564
+ completeness = CompletenessLevel.PARTIAL
565
+
566
+ store.append_step(
567
+ operation="pd.concat",
568
+ stage=ctx.current_stage,
569
+ code_file=code_file,
570
+ code_line=code_line,
571
+ params={
572
+ "axis": 1,
573
+ "n_inputs": len(source_data),
574
+ },
575
+ input_shape=tuple(input_shapes) if input_shapes else None,
576
+ output_shape=result.shape,
577
+ completeness=completeness,
578
+ )
579
+
580
+ return result
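A condensed sketch of the axis=0 bookkeeping above, with invented RIDs; the row-manager and store calls are omitted, so this only shows how the positional RID and source-index arrays line up with the concat result.

```python
import numpy as np
import pandas as pd

df1 = pd.DataFrame({"x": [1, 2]})
df2 = pd.DataFrame({"x": [3]})
# Hypothetical RIDs a row manager might already have assigned to each input frame
source_data = [(np.array([0, 1], dtype=np.int64), df1.shape, 0),
               (np.array([2], dtype=np.int64), df2.shape, 1)]

result = pd.concat([df1, df2])

all_rids = np.concatenate([rids for rids, _, _ in source_data])
all_source_idx = np.concatenate(
    [np.full(len(rids), idx, dtype=np.int32) for rids, _, idx in source_data]
)

# FULL provenance is only claimed when the bookkeeping matches the result length
assert len(all_rids) == len(result)
print(all_rids.tolist())        # [0, 1, 2] -- result rows keep their source RIDs
print(all_source_idx.tolist())  # [0, 0, 1] -- which input frame each result row came from
```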
tracepipe/storage/lineage_store.py CHANGED
@@ -22,6 +22,8 @@ from ..core import (
22
22
  AggregationMapping,
23
23
  ChangeType,
24
24
  CompletenessLevel,
25
+ ConcatMapping,
26
+ DuplicateDropMapping,
25
27
  LineageGap,
26
28
  LineageGaps,
27
29
  MergeMapping,
@@ -100,6 +102,12 @@ class InMemoryLineageStore:
100
102
  self.merge_mappings: list[MergeMapping] = []
101
103
  self.merge_stats: list[tuple[int, MergeStats]] = []
102
104
 
105
+ # === CONCAT TRACKING ===
106
+ self.concat_mappings: list[ConcatMapping] = []
107
+
108
+ # === DUPLICATE DROP TRACKING (debug mode) ===
109
+ self.duplicate_drop_mappings: list[DuplicateDropMapping] = []
110
+
103
111
  # === AGGREGATION MAPPINGS ===
104
112
  self.aggregation_mappings: list[AggregationMapping] = []
105
113
 
@@ -361,6 +369,74 @@ class InMemoryLineageStore:
361
369
  return [(sid, s) for sid, s in self.merge_stats if sid == step_id]
362
370
  return list(self.merge_stats)
363
371
 
372
+ # === CONCAT LOOKUP (O(log n) via searchsorted) ===
373
+
374
+ def _binary_search_mapping(
375
+ self, sorted_rids: Optional[np.ndarray], target_rid: int
376
+ ) -> Optional[int]:
377
+ """
378
+ Return index in sorted array, or None if not found.
379
+
380
+ Robust to None/empty arrays and dtype mismatches.
381
+ """
382
+ if sorted_rids is None or len(sorted_rids) == 0:
383
+ return None
384
+
385
+ target = np.int64(target_rid)
386
+ i = np.searchsorted(sorted_rids, target)
387
+
388
+ if i < len(sorted_rids) and sorted_rids[i] == target:
389
+ return int(i)
390
+ return None
391
+
392
+ def get_concat_origin(self, row_id: int) -> Optional[dict]:
393
+ """
394
+ Get which source DataFrame a row came from in a concat.
395
+
396
+ Uses binary search (O(log n)) on sorted RIDs.
397
+
398
+ Returns:
399
+ {step_id, source_index, source_shape, position} if found, else None.
400
+ """
401
+ for mapping in self.concat_mappings:
402
+ idx = self._binary_search_mapping(mapping.out_rids_sorted, row_id)
403
+ if idx is not None:
404
+ pos = int(mapping.out_pos_sorted[idx])
405
+ source_idx = int(mapping.source_indices[pos])
406
+ return {
407
+ "step_id": mapping.step_id,
408
+ "source_index": source_idx,
409
+ "source_shape": (
410
+ mapping.source_shapes[source_idx]
411
+ if source_idx < len(mapping.source_shapes)
412
+ else None
413
+ ),
414
+ "position": pos,
415
+ }
416
+ return None
417
+
418
+ # === DUPLICATE DROP LOOKUP (O(log n) via searchsorted) ===
419
+
420
+ def get_duplicate_representative(self, row_id: int) -> Optional[dict]:
421
+ """
422
+ Get which row replaced this one in drop_duplicates.
423
+
424
+ Returns:
425
+ {step_id, kept_rid, subset_columns, keep_strategy} if found, else None.
426
+ kept_rid is -1 if keep=False (no representative).
427
+ """
428
+ for mapping in self.duplicate_drop_mappings:
429
+ idx = self._binary_search_mapping(mapping.dropped_rids, row_id)
430
+ if idx is not None:
431
+ kept = int(mapping.kept_rids[idx])
432
+ return {
433
+ "step_id": mapping.step_id,
434
+ "kept_rid": kept if kept >= 0 else None,
435
+ "subset_columns": mapping.subset_columns,
436
+ "keep_strategy": mapping.keep_strategy,
437
+ }
438
+ return None
439
+
364
440
  # === MEMORY MANAGEMENT ===
365
441
 
366
442
  def _check_memory_and_spill(self) -> None:
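A small sketch of the lookup and -1 sentinel handling described in `get_duplicate_representative`, using an invented mapping; it mirrors the searchsorted logic of `_binary_search_mapping` without the store class.

```python
import numpy as np

# Hypothetical DuplicateDropMapping contents: RID 7 lost to RID 3, RID 9 was dropped with keep=False
dropped_rids = np.array([7, 9], dtype=np.int64)   # sorted
kept_rids = np.array([3, -1], dtype=np.int64)

def representative_for(rid: int):
    i = np.searchsorted(dropped_rids, np.int64(rid))
    if i < len(dropped_rids) and dropped_rids[i] == rid:
        kept = int(kept_rids[i])
        return kept if kept >= 0 else None   # -1 sentinel -> no representative (keep=False)
    return None

print(representative_for(7))   # 3
print(representative_for(9))   # None (all duplicates dropped)
print(representative_for(5))   # None (never dropped by dedup)
```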
@@ -567,17 +643,17 @@ class InMemoryLineageStore:
567
643
 
568
644
  def get_row_history_with_lineage(self, row_id: int, max_depth: int = 10) -> list[dict]:
569
645
  """
570
- Get row history including pre-merge parent history.
646
+ Get row history including pre-merge and pre-concat parent history.
571
647
 
572
- Follows merge lineage recursively to build complete cell provenance.
573
- This is essential for tracking changes that happened before merge operations.
648
+ Follows merge and concat lineage recursively to build complete cell provenance.
649
+ This is essential for tracking changes that happened before merge/concat operations.
574
650
 
575
651
  Deduplicates events by (col, old_val, new_val, operation) signature to prevent
576
652
  cross-pipeline contamination when multiple DataFrames share row IDs.
577
653
 
578
654
  Args:
579
655
  row_id: Row ID to trace
580
- max_depth: Maximum merge depth to follow (prevents infinite loops)
656
+ max_depth: Maximum lineage depth to follow (prevents infinite loops)
581
657
 
582
658
  Returns:
583
659
  List of UNIQUE events in chronological order, including parent row events.
@@ -592,12 +668,21 @@ class InMemoryLineageStore:
592
668
  events = []
593
669
 
594
670
  # Check if this row came from a merge
595
- origin = self.get_merge_origin(rid)
596
- if origin and origin["left_parent"] is not None:
671
+ merge_origin = self.get_merge_origin(rid)
672
+ if merge_origin and merge_origin["left_parent"] is not None:
597
673
  # Recursively get parent's history first (chronological order)
598
- parent_events = _collect_history(origin["left_parent"], depth + 1)
674
+ parent_events = _collect_history(merge_origin["left_parent"], depth + 1)
599
675
  events.extend(parent_events)
600
676
 
677
+ # Check if this row came from a concat
678
+ # For concat, parent_rid == rid (identity mapping), so we don't recurse
679
+ # But we record the concat step for completeness
680
+ concat_origin = self.get_concat_origin(rid)
681
+ if concat_origin:
682
+ # Concat preserves RIDs, so the "parent" is the same RID
683
+ # The concat step itself is recorded in the step events
684
+ pass
685
+
601
686
  # Add this row's direct events
602
687
  events.extend(self.get_row_history(rid))
603
688
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tracepipe
3
- Version: 0.3.5
3
+ Version: 0.4.1
4
4
  Summary: Row-level data lineage tracking for pandas pipelines
5
5
  Project-URL: Homepage, https://github.com/tracepipe/tracepipe
6
6
  Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
@@ -278,7 +278,7 @@ tp.enable(mode="debug") # Full lineage
278
278
 
279
279
  ## Known Limitations
280
280
 
281
- TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merge provenance** reliably. However, some patterns are not yet fully supported:
281
+ TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
282
282
 
283
283
  | Pattern | Status | Notes |
284
284
  |---------|--------|-------|
@@ -286,13 +286,10 @@ TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merg
286
286
  | `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
287
287
  | `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
288
288
  | `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
289
- | `pd.concat([df1, df2])` | ⚠️ Partial | Row IDs preserved, but no "source DataFrame" tracking |
290
- | `df.drop_duplicates(keep='last')` | ⚠️ Partial | Which row was kept is not tracked |
291
- | Sort + dedup patterns | ⚠️ Partial | "Latest record wins" logic not traced |
292
-
293
- **Why?** TracePipe tracks value changes within rows, not row-selection operations. When `drop_duplicates` picks one row over another, that's a provenance decision (not a cell mutation) that isn't currently instrumented.
294
-
295
- **Planned for 0.4**: Full row-provenance tracking for concat, drop_duplicates, and sort operations.
289
+ | `pd.concat([df1, df2])` | ✅ Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
290
+ | `df.drop_duplicates()` | ✅ Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
291
+ | `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
292
+ | Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
296
293
 
297
294
  ---
298
295
 
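To illustrate the updated table, a short end-to-end sketch: `tp.enable(mode="debug")` appears in the README above, while the `tp.trace()` call and its signature are assumptions, and the printed dict shapes follow the `TraceResult` docstrings in this release.

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")  # full lineage, as shown in the README above

a = pd.DataFrame({"user_id": [1, 2]})
b = pd.DataFrame({"user_id": [2, 3]})
df = pd.concat([a, b])                       # concat provenance: source DataFrame per row
df = df.drop_duplicates(subset="user_id")    # dedup provenance: dropped row -> kept row

# Hypothetical inspection call; the exact public trace() signature is not part of this diff
tr = tp.trace(2)
print(tr.origin)          # e.g. {"type": "concat", "source_df": 1, "step_id": ...}
print(tr.representative)  # e.g. {"kept_rid": ..., "subset": ("user_id",), "keep": "first"}
```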
@@ -1,29 +1,29 @@
1
- tracepipe/__init__.py,sha256=HK7i2rACJQdbyz5oMZ4z-xo9xJbS0cUqbS2AK6uMHJU,3342
1
+ tracepipe/__init__.py,sha256=VOQFGsfVlTngxxdDSgOOd7X2KJt1l4fjKDH4NeizYEg,3342
2
2
  tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
3
3
  tracepipe/context.py,sha256=DvwAZGZbLDJ4xoqmS1VnZOBbOI8ZIIErsY9W6GEbFSM,4051
4
4
  tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
5
- tracepipe/convenience.py,sha256=KuDz_ZzNivVG1SS8Srr3plu4CTwFmNhYL4rk3vV6cbE,28421
6
- tracepipe/core.py,sha256=kAXks694rR0Z4tD7Gyty0TyJGWx2whsSdteYYpHuazo,8010
5
+ tracepipe/convenience.py,sha256=nJ7Fy8riQVLXHOn1IFWtSpnmhHlyPt1hhantkOLKJs0,33141
6
+ tracepipe/core.py,sha256=O1QFJFTSszxKY1pVR1XLrdA0zez7t5KQLjyq6V36ARk,10883
7
7
  tracepipe/debug.py,sha256=6t2GKVZLwn7SJLhrStE9qsmTiVIHATTE3jJPQ2DYtnc,10140
8
8
  tracepipe/safety.py,sha256=UpzhQj31Dij-DjgT9mY_jPrUpVfcA51gDI2fUos4IUA,6694
9
9
  tracepipe/snapshot.py,sha256=OLREzE1_LkWITluG_Bqeb7Y4pAKb8Lb3zJEF3cxnloU,13967
10
10
  tracepipe/value_provenance.py,sha256=ogky6aOaZ-6K2uNBQxlXpmCeuvK434Hisj30zesRTd8,9330
11
11
  tracepipe/instrumentation/__init__.py,sha256=pd0n6Z9m_V3gcBv097cXWFOZEzAP9sAq1jjQnNRrDZ8,222
12
12
  tracepipe/instrumentation/apply_capture.py,sha256=cMThWzNXqWQENuMrCGTne1hO6fqaQFV7zJYNpsPTW4w,14463
13
- tracepipe/instrumentation/filter_capture.py,sha256=onlYLU5bBZSM3WmxM2AFHfktnlx7ReG-brEn5eZ_N10,15830
13
+ tracepipe/instrumentation/filter_capture.py,sha256=aN8-Ev6kbDR8f9A9JVy236VK0iqNxpMvki3pbtUkBYQ,19445
14
14
  tracepipe/instrumentation/indexer_capture.py,sha256=1ATCeJ-uNA1uGiSbgnUx0wdVsIlZGHeUBaFJPXgFQNg,28440
15
- tracepipe/instrumentation/merge_capture.py,sha256=Eze-PTrn7IXxZRZBYX9R13mOY3diWKAkjp4z-wa1tEk,13349
15
+ tracepipe/instrumentation/merge_capture.py,sha256=zqa6SY5YLbr-N7PPTdE6TYKyJIZcPqT02d1Ifvi3Jdw,18359
16
16
  tracepipe/instrumentation/pandas_inst.py,sha256=h8RlfwYkYwuftCyBYIETdwHxVCzQM1SBBrbYP7SyjJ8,30047
17
17
  tracepipe/instrumentation/series_capture.py,sha256=i7FiA2ndEzS6duIj5y-a7SDfIMl2cCY_jGC1tmG7TGU,11271
18
18
  tracepipe/storage/__init__.py,sha256=pGFMfbIgIi2kofVPwYDqe2HTYMYJoabiGjTq77pYi-g,348
19
19
  tracepipe/storage/base.py,sha256=7DV_-rp37DjBMr9B1w85hLVYhC8OQShk2PcEhT-n4tE,4894
20
- tracepipe/storage/lineage_store.py,sha256=KhGri2uC_O_43fUivFGEHY6KBDHd1I0O_PPd_KD3L4M,28683
20
+ tracepipe/storage/lineage_store.py,sha256=1enRmDgnVjxW8Pu7WMHJ8WPnnbm-HsAm4e1dKsTvnIc,31943
21
21
  tracepipe/storage/row_identity.py,sha256=HBU0gTTJlFtFTcAdUCKuX-c9cHa0lo3CDIodDPDgOzA,17161
22
22
  tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,182
23
23
  tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
24
24
  tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
25
25
  tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
26
- tracepipe-0.3.5.dist-info/METADATA,sha256=bWidBs8nMW6T6oah8xQum_IjdP7Y1J1inDAn-gfHUCg,10288
27
- tracepipe-0.3.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
- tracepipe-0.3.5.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
29
- tracepipe-0.3.5.dist-info/RECORD,,
26
+ tracepipe-0.4.1.dist-info/METADATA,sha256=kF2jBdGhKt-9YGR5VdFyb85jZj3Tgc26FbL9JxRLkhc,10067
27
+ tracepipe-0.4.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
+ tracepipe-0.4.1.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
29
+ tracepipe-0.4.1.dist-info/RECORD,,