tracepipe 0.3.5-py3-none-any.whl → 0.4.1-py3-none-any.whl

tracepipe/__init__.py CHANGED
@@ -81,7 +81,7 @@ from .core import TracePipeConfig, TracePipeMode
81
81
  from .snapshot import DiffResult, Snapshot, diff, snapshot
82
82
 
83
83
  # === VERSION ===
84
- __version__ = "0.3.5"
84
+ __version__ = "0.4.1"
85
85
 
86
86
  # === MINIMAL __all__ ===
87
87
  __all__ = [
tracepipe/convenience.py CHANGED
@@ -54,6 +54,12 @@ class CheckResult:
54
54
 
55
55
  Separates FACTS (observed, high confidence) from HEURISTICS (inferred).
56
56
  .ok is True only if there are no FACT-level warnings.
57
+
58
+ Key properties for quick access:
59
+ .passed - Alias for .ok (common naming convention)
60
+ .retention - Row retention rate (0.0-1.0)
61
+ .n_dropped - Total rows dropped
62
+ .drops_by_op - Drops broken down by operation
57
63
  """
58
64
 
59
65
  ok: bool
@@ -61,6 +67,37 @@ class CheckResult:
61
67
  facts: dict[str, Any]
62
68
  suggestions: list[str]
63
69
  mode: str
70
+ # Internal: store drops_by_op so we don't need to recompute
71
+ _drops_by_op: dict[str, int] = field(default_factory=dict)
72
+
73
+ # === CONVENIENCE PROPERTIES ===
74
+
75
+ @property
76
+ def passed(self) -> bool:
77
+ """Alias for .ok (matches common naming convention)."""
78
+ return self.ok
79
+
80
+ @property
81
+ def retention(self) -> float | None:
82
+ """Row retention rate (0.0-1.0), or None if not computed."""
83
+ return self.facts.get("retention_rate")
84
+
85
+ @property
86
+ def n_dropped(self) -> int:
87
+ """Total number of rows dropped."""
88
+ return self.facts.get("rows_dropped", 0)
89
+
90
+ @property
91
+ def drops_by_op(self) -> dict[str, int]:
92
+ """Drops broken down by operation name."""
93
+ return self._drops_by_op
94
+
95
+ @property
96
+ def n_steps(self) -> int:
97
+ """Total pipeline steps recorded."""
98
+ return self.facts.get("total_steps", 0)
99
+
100
+ # === EXISTING PROPERTIES ===
64
101
 
65
102
  @property
66
103
  def has_warnings(self) -> bool:
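A sketch of how the new convenience properties might be read from a `CheckResult`; the zero-argument `tp.check()` call below is an assumption (only the properties themselves appear in this hunk), so treat it as illustrative usage rather than the library's documented API.

```python
import tracepipe as tp

# Hypothetical: obtain a CheckResult; the exact check() signature is not shown in this diff
result = tp.check()

if not result.passed:                       # .passed is an alias for .ok
    print("rows dropped:", result.n_dropped)
    if result.retention is not None:        # None when retention was not computed
        print(f"retention: {result.retention:.1%}")
    for op, n in result.drops_by_op.items():
        print(f"  {op}: {n} rows dropped")
```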
@@ -115,7 +152,12 @@ class CheckResult:
115
152
  """Export to dictionary."""
116
153
  return {
117
154
  "ok": self.ok,
155
+ "passed": self.passed,
118
156
  "mode": self.mode,
157
+ "retention": self.retention,
158
+ "n_dropped": self.n_dropped,
159
+ "n_steps": self.n_steps,
160
+ "drops_by_op": self.drops_by_op,
119
161
  "facts": self.facts,
120
162
  "suggestions": self.suggestions,
121
163
  "warnings": [
@@ -147,6 +189,10 @@ class TraceResult:
147
189
 
148
190
  Answers: "What happened to this row?"
149
191
  Events are in CHRONOLOGICAL order (oldest->newest).
192
+
193
+ Key attributes:
194
+ origin: Where this row came from (concat, merge, or original)
195
+ representative: If dropped by dedup, which row was kept instead
150
196
  """
151
197
 
152
198
  row_id: int
@@ -158,22 +204,70 @@ class TraceResult:
158
204
  # Mode enforcement
159
205
  supported: bool = True
160
206
  unsupported_reason: str | None = None
207
+ # v0.4+ provenance
208
+ concat_origin: dict[str, Any] | None = None
209
+ dedup_representative: dict[str, Any] | None = None
161
210
 
162
211
  @property
163
212
  def n_events(self) -> int:
164
213
  return len(self.events)
165
214
 
215
+ @property
216
+ def origin(self) -> dict[str, Any] | None:
217
+ """
218
+ Unified origin info: where did this row come from?
219
+
220
+ Returns dict with 'type' key:
221
+ - {"type": "concat", "source_df": 1, "step_id": 5}
222
+ - {"type": "merge", "left_parent": 10, "right_parent": 20, "step_id": 3}
223
+ - None if original row (not from concat/merge)
224
+ """
225
+ if self.concat_origin:
226
+ return {
227
+ "type": "concat",
228
+ "source_df": self.concat_origin.get("source_index"),
229
+ "step_id": self.concat_origin.get("step_id"),
230
+ }
231
+ if self.merge_origin:
232
+ return {
233
+ "type": "merge",
234
+ "left_parent": self.merge_origin.get("left_parent"),
235
+ "right_parent": self.merge_origin.get("right_parent"),
236
+ "step_id": self.merge_origin.get("step_id"),
237
+ }
238
+ return None
239
+
240
+ @property
241
+ def representative(self) -> dict[str, Any] | None:
242
+ """
243
+ If dropped by drop_duplicates, which row was kept instead?
244
+
245
+ Returns:
246
+ {"kept_rid": 42, "subset": ["key"], "keep": "first"} or None
247
+ kept_rid is None if keep=False (all duplicates dropped)
248
+ """
249
+ if not self.dedup_representative:
250
+ return None
251
+ return {
252
+ "kept_rid": self.dedup_representative.get("kept_rid"),
253
+ "subset": self.dedup_representative.get("subset_columns"),
254
+ "keep": self.dedup_representative.get("keep_strategy"),
255
+ }
256
+
166
257
  def to_dict(self) -> dict:
167
258
  """Export to dictionary."""
168
259
  return {
169
260
  "row_id": self.row_id,
170
261
  "is_alive": self.is_alive,
171
262
  "dropped_at": self.dropped_at,
172
- "merge_origin": self.merge_origin,
263
+ "origin": self.origin,
264
+ "representative": self.representative,
173
265
  "n_events": self.n_events,
174
266
  "events": self.events,
175
267
  "ghost_values": self.ghost_values,
176
268
  "supported": self.supported,
269
+ # Keep legacy fields for backwards compatibility
270
+ "merge_origin": self.merge_origin,
177
271
  }
178
272
 
179
273
  def __repr__(self) -> str:
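A sketch of consuming `.origin` and `.representative`; the `tp.trace(42)` call is hypothetical (the public `trace()` signature is outside this hunk), but the dict shapes follow the docstrings above.

```python
import tracepipe as tp

# Hypothetical: obtain a TraceResult for row 42; the public trace() signature is not shown here
tr = tp.trace(42)

if tr.origin is not None:
    if tr.origin["type"] == "concat":
        print("came from source DataFrame", tr.origin["source_df"])
    elif tr.origin["type"] == "merge":
        print("merged from rows", tr.origin["left_parent"], "and", tr.origin["right_parent"])

if tr.representative is not None:
    if tr.representative["kept_rid"] is not None:
        print("dropped in favour of row", tr.representative["kept_rid"])
    else:
        print("dropped with keep=False (no representative)")
```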
@@ -195,10 +289,28 @@ class TraceResult:
195
289
  f" at step {self.dropped_at['step_id']}: {self.dropped_at['operation']}"
196
290
  )
197
291
 
198
- if self.merge_origin:
199
- left = self.merge_origin.get("left_parent", "?")
200
- right = self.merge_origin.get("right_parent", "?")
201
- lines.append(f" Origin: merge of row {left} (left) + row {right} (right)")
292
+ # Display unified origin info
293
+ origin = self.origin
294
+ if origin:
295
+ if origin["type"] == "merge":
296
+ left = origin.get("left_parent", "?")
297
+ right = origin.get("right_parent", "?")
298
+ lines.append(f" Origin: merge of row {left} (left) + row {right} (right)")
299
+ elif origin["type"] == "concat":
300
+ src = origin.get("source_df", "?")
301
+ lines.append(f" Origin: concat from DataFrame #{src}")
302
+
303
+ # Display dedup representative if dropped by dedup
304
+ if self.representative:
305
+ kept = self.representative.get("kept_rid")
306
+ subset = self.representative.get("subset")
307
+ keep = self.representative.get("keep", "first")
308
+ if kept is not None:
309
+ subset_str = f" (key: {subset})" if subset else ""
310
+ lines.append(f" Replaced by: row {kept}{subset_str} [keep={keep}]")
311
+ else:
312
+ subset_str = f" on {subset}" if subset else ""
313
+ lines.append(f" Dropped: all duplicates removed{subset_str} [keep=False]")
202
314
 
203
315
  if len(self.events) == 0:
204
316
  lines.append("\n Events: 0 (no changes to watched columns)")
@@ -462,8 +574,8 @@ def check(
462
574
  )
463
575
  )
464
576
 
465
- drops_by_step = ctx.store.get_dropped_by_step()
466
- for op, count in drops_by_step.items():
577
+ drops_by_op = ctx.store.get_dropped_by_step()
578
+ for op, count in drops_by_op.items():
467
579
  if count > 1000:
468
580
  suggestions.append(f"'{op}' dropped {count} rows - review if intentional")
469
581
 
@@ -475,6 +587,7 @@ def check(
475
587
  facts=facts,
476
588
  suggestions=suggestions,
477
589
  mode=ctx.config.mode.value,
590
+ _drops_by_op=drops_by_op,
478
591
  )
479
592
 
480
593
 
@@ -787,6 +900,14 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
787
900
  drop_event = store.get_drop_event(row_id)
788
901
  merge_origin = store.get_merge_origin(row_id)
789
902
 
903
+ # v0.4+ provenance: concat origin and dedup representative
904
+ concat_origin = None
905
+ dedup_representative = None
906
+ if hasattr(store, "get_concat_origin"):
907
+ concat_origin = store.get_concat_origin(row_id)
908
+ if hasattr(store, "get_duplicate_representative"):
909
+ dedup_representative = store.get_duplicate_representative(row_id)
910
+
790
911
  # Use lineage-aware history to include pre-merge parent events
791
912
  if hasattr(store, "get_row_history_with_lineage"):
792
913
  history = store.get_row_history_with_lineage(row_id)
@@ -823,6 +944,8 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
823
944
  merge_origin=merge_origin,
824
945
  events=history,
825
946
  ghost_values=ghost_values,
947
+ concat_origin=concat_origin,
948
+ dedup_representative=dedup_representative,
826
949
  )
827
950
 
828
951
 
tracepipe/core.py CHANGED
@@ -277,3 +277,82 @@ class MergeStats:
277
277
  left_dup_rate: float # -1 if not computed
278
278
  right_dup_rate: float # -1 if not computed
279
279
  how: str
280
+
281
+
282
+ @dataclass
283
+ class ConcatMapping:
284
+ """
285
+ Mapping for pd.concat operations preserving row lineage.
286
+
287
+ For axis=0 concat, each result row comes from exactly one source DataFrame.
288
+ Arrays are stored in both positional order (for "explain row i") and
289
+ sorted order (for O(log n) RID lookup).
290
+
291
+ Invariants:
292
+ - out_rids and source_indices have same length
293
+ - out_rids_sorted and out_pos_sorted are always paired (both set or both None)
294
+ - out_rids_sorted is monotonically increasing
295
+ """
296
+
297
+ step_id: int
298
+
299
+ # Positional arrays (match result row order)
300
+ out_rids: Any # numpy array, len = len(result)
301
+ source_indices: Any # numpy array, which source DF (0, 1, 2...) each row came from
302
+
303
+ # Sorted arrays (for O(log n) lookup by RID)
304
+ out_rids_sorted: Any # numpy array, SORTED
305
+ out_pos_sorted: Any # numpy array, original positions aligned with out_rids_sorted
306
+
307
+ # Metadata
308
+ source_shapes: list[tuple] = field(default_factory=list)
309
+
310
+ def __post_init__(self):
311
+ """Validate invariants."""
312
+ import numpy as np
313
+
314
+ if self.out_rids_sorted is not None and self.out_pos_sorted is not None:
315
+ if len(self.out_rids_sorted) != len(self.out_pos_sorted):
316
+ raise ValueError("out_rids_sorted and out_pos_sorted must have same length")
317
+ # Verify monotonic (debug check)
318
+ if len(self.out_rids_sorted) > 1:
319
+ assert np.all(
320
+ self.out_rids_sorted[:-1] <= self.out_rids_sorted[1:]
321
+ ), "out_rids_sorted must be monotonically increasing"
322
+
323
+
324
+ @dataclass
325
+ class DuplicateDropMapping:
326
+ """
327
+ Mapping for drop_duplicates provenance (debug mode only).
328
+
329
+ Tracks which rows were dropped and which "representative" row they lost to.
330
+ Arrays are sorted by dropped_rids for O(log n) lookup.
331
+
332
+ For keep='first': dropped rows map to first occurrence
333
+ For keep='last': dropped rows map to last occurrence
334
+ For keep=False: dropped rows have kept_rids=-1 (no representative)
335
+ """
336
+
337
+ step_id: int
338
+
339
+ # Sorted arrays for O(log n) lookup
340
+ dropped_rids: Any # numpy array, SORTED dropped row IDs
341
+ kept_rids: Any # numpy array, representative RID for each dropped row (-1 if none)
342
+
343
+ # Metadata
344
+ subset_columns: Optional[tuple[str, ...]] = None
345
+ keep_strategy: str = "first"
346
+
347
+ def __post_init__(self):
348
+ """Validate invariants."""
349
+ import numpy as np
350
+
351
+ if self.dropped_rids is not None and self.kept_rids is not None:
352
+ if len(self.dropped_rids) != len(self.kept_rids):
353
+ raise ValueError("dropped_rids and kept_rids must have same length")
354
+ # Verify sorted
355
+ if len(self.dropped_rids) > 1:
356
+ assert np.all(
357
+ self.dropped_rids[:-1] <= self.dropped_rids[1:]
358
+ ), "dropped_rids must be sorted"
tracepipe/instrumentation/filter_capture.py CHANGED
@@ -24,7 +24,7 @@ import numpy as np
24
24
  import pandas as pd
25
25
 
26
26
  from ..context import TracePipeContext, get_context
27
- from ..core import CompletenessLevel
27
+ from ..core import CompletenessLevel, DuplicateDropMapping
28
28
  from ..safety import TracePipeWarning, get_caller_info
29
29
 
30
30
  # ============ MASK DERIVATION FUNCTIONS ============
@@ -97,6 +97,95 @@ def derive_drop_duplicates_mask(
97
97
  return kept_mask.values, completeness
98
98
 
99
99
 
100
+ def derive_drop_duplicates_provenance(
101
+ df: pd.DataFrame,
102
+ source_rids: np.ndarray,
103
+ subset: Optional[list[str]],
104
+ keep: str,
105
+ ) -> Optional[DuplicateDropMapping]:
106
+ """
107
+ Derive dropped->kept mapping for drop_duplicates (debug mode only).
108
+
109
+ Uses hash_pandas_object for NaN-safe, fast key comparison.
110
+ Uses vectorized groupby min/max for representative selection.
111
+
112
+ Args:
113
+ df: Source DataFrame
114
+ source_rids: Row IDs for each row in df
115
+ subset: Columns to consider for duplicates (None = all)
116
+ keep: 'first', 'last', or False
117
+
118
+ Returns:
119
+ DuplicateDropMapping if any rows were dropped, else None.
120
+ """
121
+ n = len(df)
122
+ if n == 0:
123
+ return None
124
+
125
+ # Determine columns to hash
126
+ if subset is None:
127
+ hash_df = df
128
+ valid_cols = tuple(df.columns)
129
+ else:
130
+ valid_cols = tuple(c for c in subset if c in df.columns)
131
+ if not valid_cols:
132
+ return None
133
+ hash_df = df[list(valid_cols)]
134
+
135
+ # Use hash_pandas_object for fast, NaN-safe key hashing
136
+ try:
137
+ h = pd.util.hash_pandas_object(hash_df, index=False)
138
+ codes, _ = pd.factorize(h, sort=False)
139
+ except Exception:
140
+ # Fallback: can't hash, skip provenance
141
+ return None
142
+
143
+ # Compute kept mask using pandas (ground truth)
144
+ kept_mask = ~df.duplicated(subset=list(valid_cols) if valid_cols else None, keep=keep)
145
+ dropped_mask = ~kept_mask.values
146
+
147
+ if not dropped_mask.any():
148
+ return None # No duplicates dropped
149
+
150
+ dropped_positions = np.where(dropped_mask)[0]
151
+ dropped_rids = source_rids[dropped_positions]
152
+
153
+ # Find representative positions using vectorized groupby min/max
154
+ positions = np.arange(n, dtype=np.int64)
155
+
156
+ if keep == "first":
157
+ # Representative = first occurrence of each group
158
+ rep_pos = pd.Series(positions).groupby(codes).min().to_numpy()
159
+ elif keep == "last":
160
+ # Representative = last occurrence of each group
161
+ rep_pos = pd.Series(positions).groupby(codes).max().to_numpy()
162
+ else:
163
+ # keep=False: no representative (all duplicates dropped)
164
+ rep_pos = None
165
+
166
+ # Build kept_rids array
167
+ if rep_pos is not None:
168
+ dropped_codes = codes[dropped_positions]
169
+ kept_positions = rep_pos[dropped_codes]
170
+ kept_rids = source_rids[kept_positions]
171
+ else:
172
+ # keep=False: no representative
173
+ kept_rids = np.full(len(dropped_rids), -1, dtype=np.int64)
174
+
175
+ # Sort by dropped_rids for O(log n) lookup
176
+ sort_order = np.argsort(dropped_rids)
177
+ dropped_rids_sorted = dropped_rids[sort_order].copy()
178
+ kept_rids_sorted = kept_rids[sort_order].copy()
179
+
180
+ return DuplicateDropMapping(
181
+ step_id=-1, # Will be set by caller
182
+ dropped_rids=dropped_rids_sorted,
183
+ kept_rids=kept_rids_sorted,
184
+ subset_columns=valid_cols if valid_cols else None,
185
+ keep_strategy=str(keep),
186
+ )
187
+
188
+
100
189
  def derive_query_mask(
101
190
  df: pd.DataFrame, args: tuple, kwargs: dict
102
191
  ) -> tuple[Optional[np.ndarray], CompletenessLevel]:
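A standalone sketch of the representative-selection trick used in `derive_drop_duplicates_provenance` above, on a made-up frame; it follows the same hash/factorize/groupby steps with plain pandas and numpy.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "b", "a", "b", "a"]})
source_rids = np.array([100, 101, 102, 103, 104], dtype=np.int64)

# NaN-safe group keys: hash each row of the subset, then factorize into dense group codes
h = pd.util.hash_pandas_object(df[["key"]], index=False)
codes, _ = pd.factorize(h, sort=False)

# keep='first': the representative of each group is its earliest position
positions = np.arange(len(df), dtype=np.int64)
rep_pos = pd.Series(positions).groupby(codes).min().to_numpy()

dropped_positions = np.where(df.duplicated(subset=["key"], keep="first").to_numpy())[0]
kept_rids = source_rids[rep_pos[codes[dropped_positions]]]

print({int(d): int(k) for d, k in zip(source_rids[dropped_positions], kept_rids)})
# -> {102: 100, 103: 101, 104: 100}
```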
@@ -257,12 +346,19 @@ def _capture_filter_with_mask(
257
346
  kept_mask: Optional[np.ndarray] = None
258
347
  positions: Optional[np.ndarray] = None
259
348
  completeness = CompletenessLevel.FULL
349
+ dedup_mapping: Optional[DuplicateDropMapping] = None
260
350
 
261
351
  if method_name == "dropna":
262
352
  kept_mask, completeness = derive_dropna_mask(source_df, args, kwargs)
263
353
 
264
354
  elif method_name == "drop_duplicates":
265
355
  kept_mask, completeness = derive_drop_duplicates_mask(source_df, args, kwargs)
356
+ # Compute provenance mapping in debug mode
357
+ dedup_mapping = None
358
+ if ctx.config.should_capture_merge_provenance:
359
+ subset = kwargs.get("subset", None)
360
+ keep = kwargs.get("keep", "first")
361
+ dedup_mapping = derive_drop_duplicates_provenance(source_df, source_rids, subset, keep)
266
362
 
267
363
  elif method_name == "query":
268
364
  kept_mask, completeness = derive_query_mask(source_df, args, kwargs)
@@ -359,6 +455,12 @@ def _capture_filter_with_mask(
359
455
  watched_columns=ctx.watched_columns,
360
456
  )
361
457
 
458
+ # === RECORD DROP_DUPLICATES PROVENANCE (debug mode) ===
459
+ if method_name == "drop_duplicates" and dedup_mapping is not None:
460
+ # Update step_id in the mapping and store it
461
+ dedup_mapping.step_id = step_id
462
+ store.duplicate_drop_mappings.append(dedup_mapping)
463
+
362
464
 
363
465
  def _propagate_by_index_fallback(
364
466
  row_mgr, source_df: pd.DataFrame, result_df: pd.DataFrame
tracepipe/instrumentation/merge_capture.py CHANGED
@@ -14,7 +14,7 @@ import numpy as np
14
14
  import pandas as pd
15
15
 
16
16
  from ..context import get_context
17
- from ..core import CompletenessLevel, MergeMapping, MergeStats
17
+ from ..core import CompletenessLevel, ConcatMapping, MergeMapping, MergeStats
18
18
  from ..safety import TracePipeWarning, get_caller_info
19
19
 
20
20
 
@@ -382,53 +382,199 @@ def wrap_join_with_lineage(original_join):
382
382
  def wrap_concat_with_lineage(original_concat):
383
383
  """
384
384
  Wrap pd.concat with lineage capture.
385
+
386
+ For axis=0 (vertical concat):
387
+ - Preserves row IDs from source DataFrames (FULL provenance)
388
+ - Tracks which source DataFrame each row came from
389
+
390
+ For axis=1 (horizontal concat):
391
+ - Propagates RIDs if all inputs have identical RID arrays
392
+ - Otherwise marks as PARTIAL
385
393
  """
386
394
 
387
395
  @wraps(original_concat)
388
396
  def wrapper(objs, *args, **kwargs):
389
397
  ctx = get_context()
390
398
 
391
- result = original_concat(objs, *args, **kwargs)
392
-
393
399
  if not ctx.enabled:
394
- return result
400
+ return original_concat(objs, *args, **kwargs)
401
+
402
+ axis = kwargs.get("axis", 0)
403
+
404
+ # === BEFORE: Capture source RIDs from all tracked DataFrames ===
405
+ source_data = [] # [(rids_copy, shape, original_index), ...]
406
+ try:
407
+ objs_list = list(objs) if hasattr(objs, "__iter__") else [objs]
408
+ except TypeError:
409
+ objs_list = [objs]
410
+
411
+ for i, obj in enumerate(objs_list):
412
+ if isinstance(obj, pd.DataFrame) and len(obj) > 0:
413
+ rids = ctx.row_manager.get_ids_array(obj)
414
+ if rids is None:
415
+ rids = ctx.row_manager.register(obj)
416
+ # IMPORTANT: Make a copy to avoid mutation issues
417
+ source_data.append((rids.copy(), obj.shape, i))
418
+
419
+ # === RUN ORIGINAL ===
420
+ try:
421
+ result = original_concat(objs_list, *args, **kwargs)
422
+ except Exception:
423
+ raise # Don't store mapping on failure
395
424
 
396
425
  if not isinstance(result, pd.DataFrame):
397
426
  return result
398
427
 
399
- try:
400
- row_mgr = ctx.row_manager
401
- store = ctx.store
428
+ row_mgr = ctx.row_manager
429
+ store = ctx.store
430
+ code_file, code_line = get_caller_info(skip_frames=2)
402
431
 
403
- # Register result
404
- row_mgr.register(result)
432
+ # Compute input shapes for step metadata
433
+ input_shapes = [sd[1] for sd in source_data]
405
434
 
406
- code_file, code_line = get_caller_info(skip_frames=2)
435
+ # === AXIS=0: Vertical concat with FULL provenance ===
436
+ if axis == 0 and source_data:
437
+ return _concat_axis0_with_provenance(
438
+ result, source_data, input_shapes, code_file, code_line, ctx
439
+ )
407
440
 
408
- # Compute input shapes
409
- input_shapes = []
410
- for obj in objs:
411
- if hasattr(obj, "shape"):
412
- input_shapes.append(obj.shape)
441
+ # === AXIS=1: Horizontal concat ===
442
+ elif axis == 1 and source_data:
443
+ return _concat_axis1_with_provenance(
444
+ result, source_data, input_shapes, code_file, code_line, ctx
445
+ )
413
446
 
447
+ # === FALLBACK: Unknown axis or no source data ===
448
+ else:
449
+ row_mgr.register(result)
414
450
  store.append_step(
415
451
  operation="pd.concat",
416
452
  stage=ctx.current_stage,
417
453
  code_file=code_file,
418
454
  code_line=code_line,
419
455
  params={
420
- "axis": kwargs.get("axis", 0),
421
- "n_inputs": len(objs) if hasattr(objs, "__len__") else 1,
456
+ "axis": axis,
457
+ "n_inputs": len(source_data),
422
458
  },
423
459
  input_shape=tuple(input_shapes) if input_shapes else None,
424
460
  output_shape=result.shape,
425
- completeness=CompletenessLevel.PARTIAL, # Concat resets lineage
461
+ completeness=CompletenessLevel.PARTIAL,
426
462
  )
427
- except Exception as e:
428
- if ctx.config.strict_mode:
429
- raise
430
- warnings.warn(f"TracePipe: Concat capture failed: {e}", TracePipeWarning)
463
+ return result
464
+
465
+ return wrapper
466
+
467
+
468
+ def _concat_axis0_with_provenance(result, source_data, input_shapes, code_file, code_line, ctx):
469
+ """
470
+ Handle axis=0 concat with FULL row provenance.
471
+
472
+ Preserves source RIDs and tracks which source DF each row came from.
473
+ """
474
+ row_mgr = ctx.row_manager
475
+ store = ctx.store
431
476
 
477
+ # Build concatenated RID array and source index array
478
+ all_rids = np.concatenate([sd[0] for sd in source_data])
479
+ all_source_idx = np.concatenate(
480
+ [np.full(len(sd[0]), sd[2], dtype=np.int32) for sd in source_data]
481
+ )
482
+
483
+ # Validate: length must match result
484
+ if len(all_rids) != len(result):
485
+ # Mismatch - some objects contributed differently (empty DFs, Series, etc.)
486
+ # Degrade gracefully to PARTIAL
487
+ row_mgr.register(result)
488
+ store.append_step(
489
+ operation="pd.concat",
490
+ stage=ctx.current_stage,
491
+ code_file=code_file,
492
+ code_line=code_line,
493
+ params={
494
+ "axis": 0,
495
+ "n_inputs": len(source_data),
496
+ "_length_mismatch": True,
497
+ },
498
+ input_shape=tuple(input_shapes) if input_shapes else None,
499
+ output_shape=result.shape,
500
+ completeness=CompletenessLevel.PARTIAL,
501
+ )
432
502
  return result
433
503
 
434
- return wrapper
504
+ # Propagate RIDs to result (preserving lineage!)
505
+ row_mgr.set_result_rids(result, all_rids.copy())
506
+
507
+ # Build sorted arrays for O(log n) lookup
508
+ sort_order = np.argsort(all_rids)
509
+ out_rids_sorted = all_rids[sort_order].copy()
510
+ out_pos_sorted = sort_order.copy()
511
+
512
+ # Record step with FULL completeness
513
+ step_id = store.append_step(
514
+ operation="pd.concat",
515
+ stage=ctx.current_stage,
516
+ code_file=code_file,
517
+ code_line=code_line,
518
+ params={
519
+ "axis": 0,
520
+ "n_inputs": len(source_data),
521
+ },
522
+ input_shape=tuple(input_shapes) if input_shapes else None,
523
+ output_shape=result.shape,
524
+ completeness=CompletenessLevel.FULL,
525
+ )
526
+
527
+ # Store mapping
528
+ mapping = ConcatMapping(
529
+ step_id=step_id,
530
+ out_rids=all_rids.copy(),
531
+ source_indices=all_source_idx.copy(),
532
+ out_rids_sorted=out_rids_sorted,
533
+ out_pos_sorted=out_pos_sorted,
534
+ source_shapes=list(input_shapes),
535
+ )
536
+ store.concat_mappings.append(mapping)
537
+
538
+ return result
539
+
540
+
541
+ def _concat_axis1_with_provenance(result, source_data, input_shapes, code_file, code_line, ctx):
542
+ """
543
+ Handle axis=1 concat with best-effort provenance.
544
+
545
+ If all inputs have identical RID arrays, propagate them (FULL).
546
+ Otherwise, mark as PARTIAL and register new RIDs.
547
+ """
548
+ row_mgr = ctx.row_manager
549
+ store = ctx.store
550
+
551
+ # Check if all inputs have the same RIDs in same order
552
+ first_rids = source_data[0][0]
553
+ all_same = all(
554
+ len(sd[0]) == len(first_rids) and np.array_equal(sd[0], first_rids) for sd in source_data
555
+ )
556
+
557
+ if all_same and len(first_rids) == len(result):
558
+ # All inputs have identical RIDs - propagate them
559
+ row_mgr.set_result_rids(result, first_rids.copy())
560
+ completeness = CompletenessLevel.FULL
561
+ else:
562
+ # Misaligned or different RIDs - register new RIDs
563
+ row_mgr.register(result)
564
+ completeness = CompletenessLevel.PARTIAL
565
+
566
+ store.append_step(
567
+ operation="pd.concat",
568
+ stage=ctx.current_stage,
569
+ code_file=code_file,
570
+ code_line=code_line,
571
+ params={
572
+ "axis": 1,
573
+ "n_inputs": len(source_data),
574
+ },
575
+ input_shape=tuple(input_shapes) if input_shapes else None,
576
+ output_shape=result.shape,
577
+ completeness=completeness,
578
+ )
579
+
580
+ return result
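A condensed sketch of the axis=0 bookkeeping above, with invented RIDs; the row-manager and store calls are omitted, so this only shows how the positional RID and source-index arrays line up with the concat result.

```python
import numpy as np
import pandas as pd

df1 = pd.DataFrame({"x": [1, 2]})
df2 = pd.DataFrame({"x": [3]})
# Hypothetical RIDs a row manager might already have assigned to each input frame
source_data = [(np.array([0, 1], dtype=np.int64), df1.shape, 0),
               (np.array([2], dtype=np.int64), df2.shape, 1)]

result = pd.concat([df1, df2])

all_rids = np.concatenate([rids for rids, _, _ in source_data])
all_source_idx = np.concatenate(
    [np.full(len(rids), idx, dtype=np.int32) for rids, _, idx in source_data]
)

# FULL provenance is only claimed when the bookkeeping matches the result length
assert len(all_rids) == len(result)
print(all_rids.tolist())        # [0, 1, 2] -- result rows keep their source RIDs
print(all_source_idx.tolist())  # [0, 0, 1] -- which input frame each result row came from
```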
tracepipe/storage/lineage_store.py CHANGED
@@ -22,6 +22,8 @@ from ..core import (
22
22
  AggregationMapping,
23
23
  ChangeType,
24
24
  CompletenessLevel,
25
+ ConcatMapping,
26
+ DuplicateDropMapping,
25
27
  LineageGap,
26
28
  LineageGaps,
27
29
  MergeMapping,
@@ -100,6 +102,12 @@ class InMemoryLineageStore:
100
102
  self.merge_mappings: list[MergeMapping] = []
101
103
  self.merge_stats: list[tuple[int, MergeStats]] = []
102
104
 
105
+ # === CONCAT TRACKING ===
106
+ self.concat_mappings: list[ConcatMapping] = []
107
+
108
+ # === DUPLICATE DROP TRACKING (debug mode) ===
109
+ self.duplicate_drop_mappings: list[DuplicateDropMapping] = []
110
+
103
111
  # === AGGREGATION MAPPINGS ===
104
112
  self.aggregation_mappings: list[AggregationMapping] = []
105
113
 
@@ -361,6 +369,74 @@ class InMemoryLineageStore:
361
369
  return [(sid, s) for sid, s in self.merge_stats if sid == step_id]
362
370
  return list(self.merge_stats)
363
371
 
372
+ # === CONCAT LOOKUP (O(log n) via searchsorted) ===
373
+
374
+ def _binary_search_mapping(
375
+ self, sorted_rids: Optional[np.ndarray], target_rid: int
376
+ ) -> Optional[int]:
377
+ """
378
+ Return index in sorted array, or None if not found.
379
+
380
+ Robust to None/empty arrays and dtype mismatches.
381
+ """
382
+ if sorted_rids is None or len(sorted_rids) == 0:
383
+ return None
384
+
385
+ target = np.int64(target_rid)
386
+ i = np.searchsorted(sorted_rids, target)
387
+
388
+ if i < len(sorted_rids) and sorted_rids[i] == target:
389
+ return int(i)
390
+ return None
391
+
392
+ def get_concat_origin(self, row_id: int) -> Optional[dict]:
393
+ """
394
+ Get which source DataFrame a row came from in a concat.
395
+
396
+ Uses binary search (O(log n)) on sorted RIDs.
397
+
398
+ Returns:
399
+ {step_id, source_index, source_shape, position} if found, else None.
400
+ """
401
+ for mapping in self.concat_mappings:
402
+ idx = self._binary_search_mapping(mapping.out_rids_sorted, row_id)
403
+ if idx is not None:
404
+ pos = int(mapping.out_pos_sorted[idx])
405
+ source_idx = int(mapping.source_indices[pos])
406
+ return {
407
+ "step_id": mapping.step_id,
408
+ "source_index": source_idx,
409
+ "source_shape": (
410
+ mapping.source_shapes[source_idx]
411
+ if source_idx < len(mapping.source_shapes)
412
+ else None
413
+ ),
414
+ "position": pos,
415
+ }
416
+ return None
417
+
418
+ # === DUPLICATE DROP LOOKUP (O(log n) via searchsorted) ===
419
+
420
+ def get_duplicate_representative(self, row_id: int) -> Optional[dict]:
421
+ """
422
+ Get which row replaced this one in drop_duplicates.
423
+
424
+ Returns:
425
+ {step_id, kept_rid, subset_columns, keep_strategy} if found, else None.
426
+ kept_rid is -1 if keep=False (no representative).
427
+ """
428
+ for mapping in self.duplicate_drop_mappings:
429
+ idx = self._binary_search_mapping(mapping.dropped_rids, row_id)
430
+ if idx is not None:
431
+ kept = int(mapping.kept_rids[idx])
432
+ return {
433
+ "step_id": mapping.step_id,
434
+ "kept_rid": kept if kept >= 0 else None,
435
+ "subset_columns": mapping.subset_columns,
436
+ "keep_strategy": mapping.keep_strategy,
437
+ }
438
+ return None
439
+
364
440
  # === MEMORY MANAGEMENT ===
365
441
 
366
442
  def _check_memory_and_spill(self) -> None:
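A small sketch of the lookup and -1 sentinel handling described in `get_duplicate_representative`, using an invented mapping; it mirrors the searchsorted logic of `_binary_search_mapping` without the store class.

```python
import numpy as np

# Hypothetical DuplicateDropMapping contents: RID 7 lost to RID 3, RID 9 was dropped with keep=False
dropped_rids = np.array([7, 9], dtype=np.int64)   # sorted
kept_rids = np.array([3, -1], dtype=np.int64)

def representative_for(rid: int):
    i = np.searchsorted(dropped_rids, np.int64(rid))
    if i < len(dropped_rids) and dropped_rids[i] == rid:
        kept = int(kept_rids[i])
        return kept if kept >= 0 else None   # -1 sentinel -> no representative (keep=False)
    return None

print(representative_for(7))   # 3
print(representative_for(9))   # None (all duplicates dropped)
print(representative_for(5))   # None (never dropped by dedup)
```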
@@ -567,17 +643,17 @@ class InMemoryLineageStore:
567
643
 
568
644
  def get_row_history_with_lineage(self, row_id: int, max_depth: int = 10) -> list[dict]:
569
645
  """
570
- Get row history including pre-merge parent history.
646
+ Get row history including pre-merge and pre-concat parent history.
571
647
 
572
- Follows merge lineage recursively to build complete cell provenance.
573
- This is essential for tracking changes that happened before merge operations.
648
+ Follows merge and concat lineage recursively to build complete cell provenance.
649
+ This is essential for tracking changes that happened before merge/concat operations.
574
650
 
575
651
  Deduplicates events by (col, old_val, new_val, operation) signature to prevent
576
652
  cross-pipeline contamination when multiple DataFrames share row IDs.
577
653
 
578
654
  Args:
579
655
  row_id: Row ID to trace
580
- max_depth: Maximum merge depth to follow (prevents infinite loops)
656
+ max_depth: Maximum lineage depth to follow (prevents infinite loops)
581
657
 
582
658
  Returns:
583
659
  List of UNIQUE events in chronological order, including parent row events.
@@ -592,12 +668,21 @@ class InMemoryLineageStore:
592
668
  events = []
593
669
 
594
670
  # Check if this row came from a merge
595
- origin = self.get_merge_origin(rid)
596
- if origin and origin["left_parent"] is not None:
671
+ merge_origin = self.get_merge_origin(rid)
672
+ if merge_origin and merge_origin["left_parent"] is not None:
597
673
  # Recursively get parent's history first (chronological order)
598
- parent_events = _collect_history(origin["left_parent"], depth + 1)
674
+ parent_events = _collect_history(merge_origin["left_parent"], depth + 1)
599
675
  events.extend(parent_events)
600
676
 
677
+ # Check if this row came from a concat
678
+ # For concat, parent_rid == rid (identity mapping), so we don't recurse
679
+ # But we record the concat step for completeness
680
+ concat_origin = self.get_concat_origin(rid)
681
+ if concat_origin:
682
+ # Concat preserves RIDs, so the "parent" is the same RID
683
+ # The concat step itself is recorded in the step events
684
+ pass
685
+
601
686
  # Add this row's direct events
602
687
  events.extend(self.get_row_history(rid))
603
688
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tracepipe
3
- Version: 0.3.5
3
+ Version: 0.4.1
4
4
  Summary: Row-level data lineage tracking for pandas pipelines
5
5
  Project-URL: Homepage, https://github.com/tracepipe/tracepipe
6
6
  Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
@@ -278,7 +278,7 @@ tp.enable(mode="debug") # Full lineage
278
278
 
279
279
  ## Known Limitations
280
280
 
281
- TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merge provenance** reliably. However, some patterns are not yet fully supported:
281
+ TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
282
282
 
283
283
  | Pattern | Status | Notes |
284
284
  |---------|--------|-------|
@@ -286,13 +286,10 @@ TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merg
286
286
  | `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
287
287
  | `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
288
288
  | `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
289
- | `pd.concat([df1, df2])` | ⚠️ Partial | Row IDs preserved, but no "source DataFrame" tracking |
290
- | `df.drop_duplicates(keep='last')` | ⚠️ Partial | Which row was kept is not tracked |
291
- | Sort + dedup patterns | ⚠️ Partial | "Latest record wins" logic not traced |
292
-
293
- **Why?** TracePipe tracks value changes within rows, not row-selection operations. When `drop_duplicates` picks one row over another, that's a provenance decision (not a cell mutation) that isn't currently instrumented.
294
-
295
- **Planned for 0.4**: Full row-provenance tracking for concat, drop_duplicates, and sort operations.
289
+ | `pd.concat([df1, df2])` | ✅ Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
290
+ | `df.drop_duplicates()` | ✅ Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
291
+ | `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
292
+ | Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
296
293
 
297
294
  ---
298
295
 
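To illustrate the updated table, a short end-to-end sketch: `tp.enable(mode="debug")` appears in the README above, while the `tp.trace()` call and its signature are assumptions, and the printed dict shapes follow the `TraceResult` docstrings in this release.

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")  # full lineage, as shown in the README above

a = pd.DataFrame({"user_id": [1, 2]})
b = pd.DataFrame({"user_id": [2, 3]})
df = pd.concat([a, b])                       # concat provenance: source DataFrame per row
df = df.drop_duplicates(subset="user_id")    # dedup provenance: dropped row -> kept row

# Hypothetical inspection call; the exact public trace() signature is not part of this diff
tr = tp.trace(2)
print(tr.origin)          # e.g. {"type": "concat", "source_df": 1, "step_id": ...}
print(tr.representative)  # e.g. {"kept_rid": ..., "subset": ("user_id",), "keep": "first"}
```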
@@ -1,29 +1,29 @@
1
- tracepipe/__init__.py,sha256=HK7i2rACJQdbyz5oMZ4z-xo9xJbS0cUqbS2AK6uMHJU,3342
1
+ tracepipe/__init__.py,sha256=VOQFGsfVlTngxxdDSgOOd7X2KJt1l4fjKDH4NeizYEg,3342
2
2
  tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
3
3
  tracepipe/context.py,sha256=DvwAZGZbLDJ4xoqmS1VnZOBbOI8ZIIErsY9W6GEbFSM,4051
4
4
  tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
5
- tracepipe/convenience.py,sha256=KuDz_ZzNivVG1SS8Srr3plu4CTwFmNhYL4rk3vV6cbE,28421
6
- tracepipe/core.py,sha256=kAXks694rR0Z4tD7Gyty0TyJGWx2whsSdteYYpHuazo,8010
5
+ tracepipe/convenience.py,sha256=nJ7Fy8riQVLXHOn1IFWtSpnmhHlyPt1hhantkOLKJs0,33141
6
+ tracepipe/core.py,sha256=O1QFJFTSszxKY1pVR1XLrdA0zez7t5KQLjyq6V36ARk,10883
7
7
  tracepipe/debug.py,sha256=6t2GKVZLwn7SJLhrStE9qsmTiVIHATTE3jJPQ2DYtnc,10140
8
8
  tracepipe/safety.py,sha256=UpzhQj31Dij-DjgT9mY_jPrUpVfcA51gDI2fUos4IUA,6694
9
9
  tracepipe/snapshot.py,sha256=OLREzE1_LkWITluG_Bqeb7Y4pAKb8Lb3zJEF3cxnloU,13967
10
10
  tracepipe/value_provenance.py,sha256=ogky6aOaZ-6K2uNBQxlXpmCeuvK434Hisj30zesRTd8,9330
11
11
  tracepipe/instrumentation/__init__.py,sha256=pd0n6Z9m_V3gcBv097cXWFOZEzAP9sAq1jjQnNRrDZ8,222
12
12
  tracepipe/instrumentation/apply_capture.py,sha256=cMThWzNXqWQENuMrCGTne1hO6fqaQFV7zJYNpsPTW4w,14463
13
- tracepipe/instrumentation/filter_capture.py,sha256=onlYLU5bBZSM3WmxM2AFHfktnlx7ReG-brEn5eZ_N10,15830
13
+ tracepipe/instrumentation/filter_capture.py,sha256=aN8-Ev6kbDR8f9A9JVy236VK0iqNxpMvki3pbtUkBYQ,19445
14
14
  tracepipe/instrumentation/indexer_capture.py,sha256=1ATCeJ-uNA1uGiSbgnUx0wdVsIlZGHeUBaFJPXgFQNg,28440
15
- tracepipe/instrumentation/merge_capture.py,sha256=Eze-PTrn7IXxZRZBYX9R13mOY3diWKAkjp4z-wa1tEk,13349
15
+ tracepipe/instrumentation/merge_capture.py,sha256=zqa6SY5YLbr-N7PPTdE6TYKyJIZcPqT02d1Ifvi3Jdw,18359
16
16
  tracepipe/instrumentation/pandas_inst.py,sha256=h8RlfwYkYwuftCyBYIETdwHxVCzQM1SBBrbYP7SyjJ8,30047
17
17
  tracepipe/instrumentation/series_capture.py,sha256=i7FiA2ndEzS6duIj5y-a7SDfIMl2cCY_jGC1tmG7TGU,11271
18
18
  tracepipe/storage/__init__.py,sha256=pGFMfbIgIi2kofVPwYDqe2HTYMYJoabiGjTq77pYi-g,348
19
19
  tracepipe/storage/base.py,sha256=7DV_-rp37DjBMr9B1w85hLVYhC8OQShk2PcEhT-n4tE,4894
20
- tracepipe/storage/lineage_store.py,sha256=KhGri2uC_O_43fUivFGEHY6KBDHd1I0O_PPd_KD3L4M,28683
20
+ tracepipe/storage/lineage_store.py,sha256=1enRmDgnVjxW8Pu7WMHJ8WPnnbm-HsAm4e1dKsTvnIc,31943
21
21
  tracepipe/storage/row_identity.py,sha256=HBU0gTTJlFtFTcAdUCKuX-c9cHa0lo3CDIodDPDgOzA,17161
22
22
  tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,182
23
23
  tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
24
24
  tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
25
25
  tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
26
- tracepipe-0.3.5.dist-info/METADATA,sha256=bWidBs8nMW6T6oah8xQum_IjdP7Y1J1inDAn-gfHUCg,10288
27
- tracepipe-0.3.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
- tracepipe-0.3.5.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
29
- tracepipe-0.3.5.dist-info/RECORD,,
26
+ tracepipe-0.4.1.dist-info/METADATA,sha256=kF2jBdGhKt-9YGR5VdFyb85jZj3Tgc26FbL9JxRLkhc,10067
27
+ tracepipe-0.4.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
+ tracepipe-0.4.1.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
29
+ tracepipe-0.4.1.dist-info/RECORD,,