tracepipe-0.3.5-py3-none-any.whl → tracepipe-0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tracepipe/__init__.py CHANGED
@@ -81,7 +81,7 @@ from .core import TracePipeConfig, TracePipeMode
81
81
  from .snapshot import DiffResult, Snapshot, diff, snapshot
82
82
 
83
83
  # === VERSION ===
84
- __version__ = "0.3.5"
84
+ __version__ = "0.4.2"
85
85
 
86
86
  # === MINIMAL __all__ ===
87
87
  __all__ = [
tracepipe/convenience.py CHANGED
@@ -54,6 +54,14 @@ class CheckResult:
54
54
 
55
55
  Separates FACTS (observed, high confidence) from HEURISTICS (inferred).
56
56
  .ok is True only if there are no FACT-level warnings.
57
+
58
+ Key properties for quick access:
59
+ .passed - Alias for .ok (common naming convention)
60
+ .retention - Row retention rate (0.0-1.0)
61
+ .n_dropped - Total rows dropped
62
+ .drops_by_op - Drops broken down by operation
63
+ .n_changes - Total cell-level changes (debug mode only)
64
+ .changes_by_op - Changes broken down by operation (debug mode only)
57
65
  """
58
66
 
59
67
  ok: bool
@@ -61,6 +69,50 @@ class CheckResult:
61
69
  facts: dict[str, Any]
62
70
  suggestions: list[str]
63
71
  mode: str
72
+ # Internal: store drops_by_op so we don't need to recompute
73
+ _drops_by_op: dict[str, int] = field(default_factory=dict)
74
+ # Internal: store cell change counts (debug mode only)
75
+ _n_changes: int = 0
76
+ _changes_by_op: dict[str, int] = field(default_factory=dict)
77
+
78
+ # === CONVENIENCE PROPERTIES ===
79
+
80
+ @property
81
+ def passed(self) -> bool:
82
+ """Alias for .ok (matches common naming convention)."""
83
+ return self.ok
84
+
85
+ @property
86
+ def retention(self) -> float | None:
87
+ """Row retention rate (0.0-1.0), or None if not computed."""
88
+ return self.facts.get("retention_rate")
89
+
90
+ @property
91
+ def n_dropped(self) -> int:
92
+ """Total number of rows dropped."""
93
+ return self.facts.get("rows_dropped", 0)
94
+
95
+ @property
96
+ def drops_by_op(self) -> dict[str, int]:
97
+ """Drops broken down by operation name."""
98
+ return self._drops_by_op
99
+
100
+ @property
101
+ def n_steps(self) -> int:
102
+ """Total pipeline steps recorded."""
103
+ return self.facts.get("total_steps", 0)
104
+
105
+ @property
106
+ def n_changes(self) -> int:
107
+ """Total cell-level changes (debug mode only, 0 if not tracked)."""
108
+ return self._n_changes
109
+
110
+ @property
111
+ def changes_by_op(self) -> dict[str, int]:
112
+ """Cell changes broken down by operation (debug mode only)."""
113
+ return self._changes_by_op
114
+
115
+ # === EXISTING PROPERTIES ===
64
116
 
65
117
  @property
66
118
  def has_warnings(self) -> bool:
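A minimal sketch of the new quick-access surface (illustrative pipeline; `tp.check` is assumed to be the package-level export of the `check()` function shown later in this file, and `tp.enable(mode="debug")` is taken from the README excerpt in METADATA below):

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")

df = pd.DataFrame({"customer_id": ["C1", "C2", None], "amount": [10.0, None, 3.5]})
df = df.dropna(subset=["customer_id"])   # drops one row

result = tp.check(df)        # check()'s full signature is abridged in this diff
print(result.passed)         # alias for result.ok
print(result.retention)      # facts["retention_rate"], or None if not computed
print(result.n_dropped)      # facts["rows_dropped"]
print(result.drops_by_op)    # e.g. {"dropna": 1}
print(result.n_changes)      # cell-level changes; stays 0 outside debug mode
```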
@@ -90,6 +142,20 @@ class CheckResult:
90
142
  lines.append(f"TracePipe Check: {status}")
91
143
  lines.append(f" Mode: {self.mode}")
92
144
 
145
+ # Always show key metrics in compact form
146
+ if self.retention is not None:
147
+ lines.append(f"\nRetention: {int(self.retention * 100)}%")
148
+ if self.n_dropped > 0:
149
+ lines.append(f"Dropped: {self.n_dropped} rows")
150
+ if self.drops_by_op:
151
+ for op, count in list(self.drops_by_op.items())[:5]:
152
+ lines.append(f" • {op}: {count}")
153
+ if self.n_changes > 0:
154
+ lines.append(f"\nValue changes: {self.n_changes} cells")
155
+ if self.changes_by_op:
156
+ for op, count in list(self.changes_by_op.items())[:5]:
157
+ lines.append(f" • {op}: {count}")
158
+
93
159
  if verbose and self.facts:
94
160
  lines.append("\n Measured facts:")
95
161
  for k, v in self.facts.items():
@@ -115,7 +181,14 @@ class CheckResult:
115
181
  """Export to dictionary."""
116
182
  return {
117
183
  "ok": self.ok,
184
+ "passed": self.passed,
118
185
  "mode": self.mode,
186
+ "retention": self.retention,
187
+ "n_dropped": self.n_dropped,
188
+ "n_steps": self.n_steps,
189
+ "drops_by_op": self.drops_by_op,
190
+ "n_changes": self.n_changes,
191
+ "changes_by_op": self.changes_by_op,
119
192
  "facts": self.facts,
120
193
  "suggestions": self.suggestions,
121
194
  "warnings": [
@@ -147,6 +220,11 @@ class TraceResult:
147
220
 
148
221
  Answers: "What happened to this row?"
149
222
  Events are in CHRONOLOGICAL order (oldest->newest).
223
+
224
+ Key attributes:
225
+ status: "alive" or "dropped" (string representation)
226
+ origin: Where this row came from (concat, merge, or original)
227
+ representative: If dropped by dedup, which row was kept instead
150
228
  """
151
229
 
152
230
  row_id: int
@@ -158,22 +236,93 @@ class TraceResult:
158
236
  # Mode enforcement
159
237
  supported: bool = True
160
238
  unsupported_reason: str | None = None
239
+ # v0.4+ provenance
240
+ concat_origin: dict[str, Any] | None = None
241
+ dedup_representative: dict[str, Any] | None = None
242
+ # Steps this row survived (for SURVIVED event generation)
243
+ _survived_steps: list[dict[str, Any]] = field(default_factory=list)
244
+
245
+ @property
246
+ def status(self) -> str:
247
+ """Row status as string: 'alive' or 'dropped'."""
248
+ return "alive" if self.is_alive else "dropped"
249
+
250
+ @property
251
+ def dropped_by(self) -> str | None:
252
+ """Operation that dropped this row, or None if alive."""
253
+ if self.dropped_at:
254
+ return self.dropped_at.get("operation")
255
+ return None
256
+
257
+ @property
258
+ def dropped_at_step(self) -> int | None:
259
+ """Step number where this row was dropped, or None if alive."""
260
+ if self.dropped_at:
261
+ return self.dropped_at.get("step_id")
262
+ return None
161
263
 
162
264
  @property
163
265
  def n_events(self) -> int:
164
266
  return len(self.events)
165
267
 
268
+ @property
269
+ def origin(self) -> dict[str, Any] | None:
270
+ """
271
+ Unified origin info: where did this row come from?
272
+
273
+ Returns dict with 'type' key:
274
+ - {"type": "concat", "source_df": 1, "step_id": 5}
275
+ - {"type": "merge", "left_parent": 10, "right_parent": 20, "step_id": 3}
276
+ - None if original row (not from concat/merge)
277
+ """
278
+ if self.concat_origin:
279
+ return {
280
+ "type": "concat",
281
+ "source_df": self.concat_origin.get("source_index"),
282
+ "step_id": self.concat_origin.get("step_id"),
283
+ }
284
+ if self.merge_origin:
285
+ return {
286
+ "type": "merge",
287
+ "left_parent": self.merge_origin.get("left_parent"),
288
+ "right_parent": self.merge_origin.get("right_parent"),
289
+ "step_id": self.merge_origin.get("step_id"),
290
+ }
291
+ return None
292
+
293
+ @property
294
+ def representative(self) -> dict[str, Any] | None:
295
+ """
296
+ If dropped by drop_duplicates, which row was kept instead?
297
+
298
+ Returns:
299
+ {"kept_rid": 42, "subset": ["key"], "keep": "first"} or None
300
+ kept_rid is None if keep=False (all duplicates dropped)
301
+ """
302
+ if not self.dedup_representative:
303
+ return None
304
+ return {
305
+ "kept_rid": self.dedup_representative.get("kept_rid"),
306
+ "subset": self.dedup_representative.get("subset_columns"),
307
+ "keep": self.dedup_representative.get("keep_strategy"),
308
+ }
309
+
166
310
  def to_dict(self) -> dict:
167
311
  """Export to dictionary."""
168
312
  return {
169
313
  "row_id": self.row_id,
314
+ "status": self.status,
170
315
  "is_alive": self.is_alive,
171
316
  "dropped_at": self.dropped_at,
172
- "merge_origin": self.merge_origin,
317
+ "dropped_by": self.dropped_at.get("operation") if self.dropped_at else None,
318
+ "origin": self.origin,
319
+ "representative": self.representative,
173
320
  "n_events": self.n_events,
174
321
  "events": self.events,
175
322
  "ghost_values": self.ghost_values,
176
323
  "supported": self.supported,
324
+ # Keep legacy fields for backwards compatibility
325
+ "merge_origin": self.merge_origin,
177
326
  }
178
327
 
179
328
  def __repr__(self) -> str:
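A short sketch of how the new `TraceResult` surface reads for a single row (illustrative data; the comments show expected shapes rather than guaranteed output):

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")

df = pd.DataFrame({"key": ["a", None, "b"]}).dropna(subset=["key"])

r = tp.trace(df, row=0)
print(r.status)            # "alive" or "dropped"
print(r.dropped_by)        # operation name for dropped rows, None otherwise
print(r.origin)            # {"type": "concat"/"merge", ...} or None for original rows
print(r.representative)    # dedup info for rows dropped by drop_duplicates, else None
print(r.to_dict()["merge_origin"])   # legacy key is still exported for backwards compatibility
```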
@@ -186,19 +335,38 @@ class TraceResult:
186
335
 
187
336
  lines = [f"Row {self.row_id} Journey:"]
188
337
 
338
+ # Status line matches documentation format
189
339
  if self.is_alive:
190
340
  lines.append(" Status: [OK] Alive")
191
341
  else:
192
- lines.append(" Status: [X] Dropped")
342
+ lines.append(" Status: [DROPPED]")
193
343
  if self.dropped_at:
194
344
  lines.append(
195
345
  f" at step {self.dropped_at['step_id']}: {self.dropped_at['operation']}"
196
346
  )
197
347
 
198
- if self.merge_origin:
199
- left = self.merge_origin.get("left_parent", "?")
200
- right = self.merge_origin.get("right_parent", "?")
201
- lines.append(f" Origin: merge of row {left} (left) + row {right} (right)")
348
+ # Display unified origin info
349
+ origin = self.origin
350
+ if origin:
351
+ if origin["type"] == "merge":
352
+ left = origin.get("left_parent", "?")
353
+ right = origin.get("right_parent", "?")
354
+ lines.append(f" Origin: merge of row {left} (left) + row {right} (right)")
355
+ elif origin["type"] == "concat":
356
+ src = origin.get("source_df", "?")
357
+ lines.append(f" Origin: concat from DataFrame #{src}")
358
+
359
+ # Display dedup representative if dropped by dedup
360
+ if self.representative:
361
+ kept = self.representative.get("kept_rid")
362
+ subset = self.representative.get("subset")
363
+ keep = self.representative.get("keep", "first")
364
+ if kept is not None:
365
+ subset_str = f" (key: {subset})" if subset else ""
366
+ lines.append(f" Replaced by: row {kept}{subset_str} [keep={keep}]")
367
+ else:
368
+ subset_str = f" on {subset}" if subset else ""
369
+ lines.append(f" Dropped: all duplicates removed{subset_str} [keep=False]")
202
370
 
203
371
  if len(self.events) == 0:
204
372
  lines.append("\n Events: 0 (no changes to watched columns)")
@@ -462,11 +630,26 @@ def check(
462
630
  )
463
631
  )
464
632
 
465
- drops_by_step = ctx.store.get_dropped_by_step()
466
- for op, count in drops_by_step.items():
633
+ drops_by_op = ctx.store.get_dropped_by_step()
634
+ for op, count in drops_by_op.items():
467
635
  if count > 1000:
468
636
  suggestions.append(f"'{op}' dropped {count} rows - review if intentional")
469
637
 
638
+ # === CELL CHANGES (debug mode only) ===
639
+ n_changes = 0
640
+ changes_by_op: dict[str, int] = {}
641
+ if ctx.config.mode == TracePipeMode.DEBUG:
642
+ # Count non-drop diffs (cell-level changes)
643
+ step_map = {s.step_id: s.operation for s in ctx.store.steps}
644
+ for i in range(len(ctx.store.diff_step_ids)):
645
+ col = ctx.store.diff_cols[i]
646
+ if col != "__row__": # Skip drop events
647
+ n_changes += 1
648
+ step_id = ctx.store.diff_step_ids[i]
649
+ op = step_map.get(step_id, "unknown")
650
+ changes_by_op[op] = changes_by_op.get(op, 0) + 1
651
+ facts["n_changes"] = n_changes
652
+
470
653
  ok = len([w for w in warnings_list if w.severity == "fact"]) == 0
471
654
 
472
655
  return CheckResult(
@@ -475,6 +658,9 @@ def check(
475
658
  facts=facts,
476
659
  suggestions=suggestions,
477
660
  mode=ctx.config.mode.value,
661
+ _drops_by_op=drops_by_op,
662
+ _n_changes=n_changes,
663
+ _changes_by_op=changes_by_op,
478
664
  )
479
665
 
480
666
 
@@ -482,6 +668,7 @@ def trace(
482
668
  df: pd.DataFrame,
483
669
  *,
484
670
  row: int | None = None,
671
+ row_id: int | None = None,
485
672
  where: dict[str, Any] | None = None,
486
673
  include_ghost: bool = True,
487
674
  ) -> TraceResult | list[TraceResult]:
@@ -490,7 +677,8 @@ def trace(
490
677
 
491
678
  Args:
492
679
  df: DataFrame to search in
493
- row: Row ID (if known)
680
+ row: Row position (0-based index into current DataFrame)
681
+ row_id: Internal row ID (use for tracing dropped rows)
494
682
  where: Selector dict, e.g. {"customer_id": "C123"}
495
683
  include_ghost: Include last-known values for dropped rows
496
684
 
@@ -499,8 +687,14 @@ def trace(
499
687
  Use print(result) for pretty output, result.to_dict() for data.
500
688
 
501
689
  Examples:
502
- result = tp.trace(df, row=5)
503
- print(result)
690
+ # Trace by position in current DataFrame
691
+ result = tp.trace(df, row=0) # First row
692
+
693
+ # Trace by internal row ID (for dropped rows)
694
+ dropped = tp.debug.inspect().dropped_rows()
695
+ result = tp.trace(df, row_id=dropped[0])
696
+
697
+ # Trace by business key
504
698
  tp.trace(df, where={"customer_id": "C123"})
505
699
  """
506
700
  ctx = get_context()
@@ -511,12 +705,30 @@ def trace(
511
705
  pass
512
706
 
513
707
  # Resolve row IDs
514
- if row is not None:
515
- row_ids = [row]
708
+ if row_id is not None:
709
+ # Direct row ID specified - use as-is
710
+ row_ids = [row_id]
711
+ elif row is not None:
712
+ # row= is a DataFrame index position (0-based), not a row ID
713
+ # Convert to actual row ID using the DataFrame's registered IDs
714
+ rids = ctx.row_manager.get_ids_array(df)
715
+ if rids is not None:
716
+ # Handle negative indexing
717
+ if row < 0:
718
+ row = len(rids) + row
719
+ if 0 <= row < len(rids):
720
+ row_ids = [int(rids[row])]
721
+ else:
722
+ raise ValueError(
723
+ f"Row index {row} out of bounds for DataFrame with {len(rids)} rows"
724
+ )
725
+ else:
726
+ # DataFrame not tracked - use row as-is (legacy behavior)
727
+ row_ids = [row]
516
728
  elif where is not None:
517
729
  row_ids = _resolve_where(df, where, ctx)
518
730
  else:
519
- raise ValueError("Must provide 'row' or 'where'")
731
+ raise ValueError("Must provide 'row', 'row_id', or 'where'")
520
732
 
521
733
  results = []
522
734
  for rid in row_ids:
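The resolution logic above changes what `row=` means: it is now a 0-based position in the tracked DataFrame (negative positions allowed), while `row_id=` takes an internal RID directly and is the way to reach rows that have already been filtered out. A hedged sketch:

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")

df = pd.DataFrame({"key": ["a", None, "b"]}).dropna(subset=["key"])

first = tp.trace(df, row=0)     # position 0, resolved to its internal row ID
last = tp.trace(df, row=-1)     # negative indexing counts from the end

# Dropped rows are no longer addressable by position, so use their RIDs.
dropped = tp.debug.inspect().dropped_rows()
if dropped:
    gone = tp.trace(df, row_id=list(dropped)[0])
    print(gone.status)          # "dropped"
```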
@@ -531,6 +743,7 @@ def why(
531
743
  *,
532
744
  col: str,
533
745
  row: int | None = None,
746
+ row_id: int | None = None,
534
747
  where: dict[str, Any] | None = None,
535
748
  ) -> WhyResult | list[WhyResult]:
536
749
  """
@@ -539,7 +752,8 @@ def why(
539
752
  Args:
540
753
  df: DataFrame to search in
541
754
  col: Column name to trace
542
- row: Row ID (if known)
755
+ row: Row position (0-based index into current DataFrame)
756
+ row_id: Internal row ID (use for cells in dropped rows)
543
757
  where: Selector dict, e.g. {"customer_id": "C123"}
544
758
 
545
759
  Returns:
@@ -547,7 +761,7 @@ def why(
547
761
  Use print(result) for pretty output, result.to_dict() for data.
548
762
 
549
763
  Examples:
550
- result = tp.why(df, col="amount", row=5)
764
+ result = tp.why(df, col="amount", row=0) # First row
551
765
  print(result)
552
766
  tp.why(df, col="email", where={"user_id": "U123"})
553
767
  """
@@ -563,12 +777,30 @@ def why(
563
777
  )
564
778
 
565
779
  # Resolve row IDs
566
- if row is not None:
567
- row_ids = [row]
780
+ if row_id is not None:
781
+ # Direct row ID specified - use as-is
782
+ row_ids = [row_id]
783
+ elif row is not None:
784
+ # row= is a DataFrame index position (0-based), not a row ID
785
+ # Convert to actual row ID using the DataFrame's registered IDs
786
+ rids = ctx.row_manager.get_ids_array(df)
787
+ if rids is not None:
788
+ # Handle negative indexing
789
+ if row < 0:
790
+ row = len(rids) + row
791
+ if 0 <= row < len(rids):
792
+ row_ids = [int(rids[row])]
793
+ else:
794
+ raise ValueError(
795
+ f"Row index {row} out of bounds for DataFrame with {len(rids)} rows"
796
+ )
797
+ else:
798
+ # DataFrame not tracked - use row as-is (legacy behavior)
799
+ row_ids = [row]
568
800
  elif where is not None:
569
801
  row_ids = _resolve_where(df, where, ctx)
570
802
  else:
571
- raise ValueError("Must provide 'row' or 'where'")
803
+ raise ValueError("Must provide 'row', 'row_id', or 'where'")
572
804
 
573
805
  results = []
574
806
  for rid in row_ids:
@@ -787,6 +1019,14 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
787
1019
  drop_event = store.get_drop_event(row_id)
788
1020
  merge_origin = store.get_merge_origin(row_id)
789
1021
 
1022
+ # v0.4+ provenance: concat origin and dedup representative
1023
+ concat_origin = None
1024
+ dedup_representative = None
1025
+ if hasattr(store, "get_concat_origin"):
1026
+ concat_origin = store.get_concat_origin(row_id)
1027
+ if hasattr(store, "get_duplicate_representative"):
1028
+ dedup_representative = store.get_duplicate_representative(row_id)
1029
+
790
1030
  # Use lineage-aware history to include pre-merge parent events
791
1031
  if hasattr(store, "get_row_history_with_lineage"):
792
1032
  history = store.get_row_history_with_lineage(row_id)
@@ -823,6 +1063,8 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
823
1063
  merge_origin=merge_origin,
824
1064
  events=history,
825
1065
  ghost_values=ghost_values,
1066
+ concat_origin=concat_origin,
1067
+ dedup_representative=dedup_representative,
826
1068
  )
827
1069
 
828
1070
 
tracepipe/core.py CHANGED
@@ -277,3 +277,82 @@ class MergeStats:
277
277
  left_dup_rate: float # -1 if not computed
278
278
  right_dup_rate: float # -1 if not computed
279
279
  how: str
280
+
281
+
282
+ @dataclass
283
+ class ConcatMapping:
284
+ """
285
+ Mapping for pd.concat operations preserving row lineage.
286
+
287
+ For axis=0 concat, each result row comes from exactly one source DataFrame.
288
+ Arrays are stored in both positional order (for "explain row i") and
289
+ sorted order (for O(log n) RID lookup).
290
+
291
+ Invariants:
292
+ - out_rids and source_indices have same length
293
+ - out_rids_sorted and out_pos_sorted are always paired (both set or both None)
294
+ - out_rids_sorted is monotonically increasing
295
+ """
296
+
297
+ step_id: int
298
+
299
+ # Positional arrays (match result row order)
300
+ out_rids: Any # numpy array, len = len(result)
301
+ source_indices: Any # numpy array, which source DF (0, 1, 2...) each row came from
302
+
303
+ # Sorted arrays (for O(log n) lookup by RID)
304
+ out_rids_sorted: Any # numpy array, SORTED
305
+ out_pos_sorted: Any # numpy array, original positions aligned with out_rids_sorted
306
+
307
+ # Metadata
308
+ source_shapes: list[tuple] = field(default_factory=list)
309
+
310
+ def __post_init__(self):
311
+ """Validate invariants."""
312
+ import numpy as np
313
+
314
+ if self.out_rids_sorted is not None and self.out_pos_sorted is not None:
315
+ if len(self.out_rids_sorted) != len(self.out_pos_sorted):
316
+ raise ValueError("out_rids_sorted and out_pos_sorted must have same length")
317
+ # Verify monotonic (debug check)
318
+ if len(self.out_rids_sorted) > 1:
319
+ assert np.all(
320
+ self.out_rids_sorted[:-1] <= self.out_rids_sorted[1:]
321
+ ), "out_rids_sorted must be monotonically increasing"
322
+
323
+
324
+ @dataclass
325
+ class DuplicateDropMapping:
326
+ """
327
+ Mapping for drop_duplicates provenance (debug mode only).
328
+
329
+ Tracks which rows were dropped and which "representative" row they lost to.
330
+ Arrays are sorted by dropped_rids for O(log n) lookup.
331
+
332
+ For keep='first': dropped rows map to first occurrence
333
+ For keep='last': dropped rows map to last occurrence
334
+ For keep=False: dropped rows have kept_rids=-1 (no representative)
335
+ """
336
+
337
+ step_id: int
338
+
339
+ # Sorted arrays for O(log n) lookup
340
+ dropped_rids: Any # numpy array, SORTED dropped row IDs
341
+ kept_rids: Any # numpy array, representative RID for each dropped row (-1 if none)
342
+
343
+ # Metadata
344
+ subset_columns: Optional[tuple[str, ...]] = None
345
+ keep_strategy: str = "first"
346
+
347
+ def __post_init__(self):
348
+ """Validate invariants."""
349
+ import numpy as np
350
+
351
+ if self.dropped_rids is not None and self.kept_rids is not None:
352
+ if len(self.dropped_rids) != len(self.kept_rids):
353
+ raise ValueError("dropped_rids and kept_rids must have same length")
354
+ # Verify sorted
355
+ if len(self.dropped_rids) > 1:
356
+ assert np.all(
357
+ self.dropped_rids[:-1] <= self.dropped_rids[1:]
358
+ ), "dropped_rids must be sorted"
tracepipe/debug.py CHANGED
@@ -179,6 +179,46 @@ class DebugInspector:
179
179
  ctx = get_context()
180
180
  return ctx.row_manager.get_ghost_rows(limit=limit)
181
181
 
182
+ def get_ghost_values(self, row_id: int) -> dict[str, Any] | None:
183
+ """
184
+ Get last-known values for a specific dropped row (DEBUG mode only).
185
+
186
+ Args:
187
+ row_id: The row ID to look up
188
+
189
+ Returns:
190
+ Dict mapping column names to their last known values,
191
+ or None if the row was not found in ghost storage.
192
+
193
+ Example:
194
+ dbg = tp.debug.inspect()
195
+ dropped_rid = list(dbg.dropped_rows())[0]
196
+ ghost = dbg.get_ghost_values(dropped_rid)
197
+ print(f"Last known values: {ghost}")
198
+ """
199
+ ctx = get_context()
200
+ ghost_df = ctx.row_manager.get_ghost_rows(limit=100000)
201
+
202
+ if ghost_df.empty or "__tp_row_id__" not in ghost_df.columns:
203
+ return None
204
+
205
+ row_match = ghost_df[ghost_df["__tp_row_id__"] == row_id]
206
+ if row_match.empty:
207
+ return None
208
+
209
+ # Convert to dict and remove internal columns
210
+ result = row_match.iloc[0].to_dict()
211
+ internal_cols = [
212
+ "__tp_row_id__",
213
+ "__tp_dropped_by__",
214
+ "__tp_dropped_step__",
215
+ "__tp_original_position__",
216
+ ]
217
+ for col in internal_cols:
218
+ result.pop(col, None)
219
+
220
+ return result
221
+
182
222
  def stats(self) -> dict:
183
223
  """Get comprehensive tracking statistics."""
184
224
  ctx = get_context()
tracepipe/instrumentation/filter_capture.py CHANGED
@@ -24,7 +24,7 @@ import numpy as np
24
24
  import pandas as pd
25
25
 
26
26
  from ..context import TracePipeContext, get_context
27
- from ..core import CompletenessLevel
27
+ from ..core import CompletenessLevel, DuplicateDropMapping
28
28
  from ..safety import TracePipeWarning, get_caller_info
29
29
 
30
30
  # ============ MASK DERIVATION FUNCTIONS ============
@@ -97,6 +97,95 @@ def derive_drop_duplicates_mask(
97
97
  return kept_mask.values, completeness
98
98
 
99
99
 
100
+ def derive_drop_duplicates_provenance(
101
+ df: pd.DataFrame,
102
+ source_rids: np.ndarray,
103
+ subset: Optional[list[str]],
104
+ keep: str,
105
+ ) -> Optional[DuplicateDropMapping]:
106
+ """
107
+ Derive dropped->kept mapping for drop_duplicates (debug mode only).
108
+
109
+ Uses hash_pandas_object for NaN-safe, fast key comparison.
110
+ Uses vectorized groupby min/max for representative selection.
111
+
112
+ Args:
113
+ df: Source DataFrame
114
+ source_rids: Row IDs for each row in df
115
+ subset: Columns to consider for duplicates (None = all)
116
+ keep: 'first', 'last', or False
117
+
118
+ Returns:
119
+ DuplicateDropMapping if any rows were dropped, else None.
120
+ """
121
+ n = len(df)
122
+ if n == 0:
123
+ return None
124
+
125
+ # Determine columns to hash
126
+ if subset is None:
127
+ hash_df = df
128
+ valid_cols = tuple(df.columns)
129
+ else:
130
+ valid_cols = tuple(c for c in subset if c in df.columns)
131
+ if not valid_cols:
132
+ return None
133
+ hash_df = df[list(valid_cols)]
134
+
135
+ # Use hash_pandas_object for fast, NaN-safe key hashing
136
+ try:
137
+ h = pd.util.hash_pandas_object(hash_df, index=False)
138
+ codes, _ = pd.factorize(h, sort=False)
139
+ except Exception:
140
+ # Fallback: can't hash, skip provenance
141
+ return None
142
+
143
+ # Compute kept mask using pandas (ground truth)
144
+ kept_mask = ~df.duplicated(subset=list(valid_cols) if valid_cols else None, keep=keep)
145
+ dropped_mask = ~kept_mask.values
146
+
147
+ if not dropped_mask.any():
148
+ return None # No duplicates dropped
149
+
150
+ dropped_positions = np.where(dropped_mask)[0]
151
+ dropped_rids = source_rids[dropped_positions]
152
+
153
+ # Find representative positions using vectorized groupby min/max
154
+ positions = np.arange(n, dtype=np.int64)
155
+
156
+ if keep == "first":
157
+ # Representative = first occurrence of each group
158
+ rep_pos = pd.Series(positions).groupby(codes).min().to_numpy()
159
+ elif keep == "last":
160
+ # Representative = last occurrence of each group
161
+ rep_pos = pd.Series(positions).groupby(codes).max().to_numpy()
162
+ else:
163
+ # keep=False: no representative (all duplicates dropped)
164
+ rep_pos = None
165
+
166
+ # Build kept_rids array
167
+ if rep_pos is not None:
168
+ dropped_codes = codes[dropped_positions]
169
+ kept_positions = rep_pos[dropped_codes]
170
+ kept_rids = source_rids[kept_positions]
171
+ else:
172
+ # keep=False: no representative
173
+ kept_rids = np.full(len(dropped_rids), -1, dtype=np.int64)
174
+
175
+ # Sort by dropped_rids for O(log n) lookup
176
+ sort_order = np.argsort(dropped_rids)
177
+ dropped_rids_sorted = dropped_rids[sort_order].copy()
178
+ kept_rids_sorted = kept_rids[sort_order].copy()
179
+
180
+ return DuplicateDropMapping(
181
+ step_id=-1, # Will be set by caller
182
+ dropped_rids=dropped_rids_sorted,
183
+ kept_rids=kept_rids_sorted,
184
+ subset_columns=valid_cols if valid_cols else None,
185
+ keep_strategy=str(keep),
186
+ )
187
+
188
+
100
189
  def derive_query_mask(
101
190
  df: pd.DataFrame, args: tuple, kwargs: dict
102
191
  ) -> tuple[Optional[np.ndarray], CompletenessLevel]:
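The representative selection above boils down to two vectorised steps: `hash_pandas_object` plus `factorize` assigns each row a duplicate-group code, and a groupby min/max over positions picks the surviving occurrence per group. A toy reproduction outside TracePipe:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "b", "a", "a"], "val": [1, 2, 3, 4]})
source_rids = np.array([100, 101, 102, 103], dtype=np.int64)

# NaN-safe, dtype-agnostic group code per row.
h = pd.util.hash_pandas_object(df[["key"]], index=False)
codes, _ = pd.factorize(h, sort=False)

kept_mask = ~df.duplicated(subset=["key"], keep="first")   # pandas remains the ground truth
dropped_pos = np.where(~kept_mask.to_numpy())[0]           # positions 2 and 3

positions = np.arange(len(df), dtype=np.int64)
rep_pos = pd.Series(positions).groupby(codes).min().to_numpy()   # first occurrence per group

kept_rids = source_rids[rep_pos[codes[dropped_pos]]]
print({int(d): int(k) for d, k in zip(source_rids[dropped_pos], kept_rids)})   # {102: 100, 103: 100}
```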
@@ -257,12 +346,19 @@ def _capture_filter_with_mask(
257
346
  kept_mask: Optional[np.ndarray] = None
258
347
  positions: Optional[np.ndarray] = None
259
348
  completeness = CompletenessLevel.FULL
349
+ dedup_mapping: Optional[DuplicateDropMapping] = None
260
350
 
261
351
  if method_name == "dropna":
262
352
  kept_mask, completeness = derive_dropna_mask(source_df, args, kwargs)
263
353
 
264
354
  elif method_name == "drop_duplicates":
265
355
  kept_mask, completeness = derive_drop_duplicates_mask(source_df, args, kwargs)
356
+ # Compute provenance mapping in debug mode
357
+ dedup_mapping = None
358
+ if ctx.config.should_capture_merge_provenance:
359
+ subset = kwargs.get("subset", None)
360
+ keep = kwargs.get("keep", "first")
361
+ dedup_mapping = derive_drop_duplicates_provenance(source_df, source_rids, subset, keep)
266
362
 
267
363
  elif method_name == "query":
268
364
  kept_mask, completeness = derive_query_mask(source_df, args, kwargs)
@@ -359,6 +455,12 @@ def _capture_filter_with_mask(
359
455
  watched_columns=ctx.watched_columns,
360
456
  )
361
457
 
458
+ # === RECORD DROP_DUPLICATES PROVENANCE (debug mode) ===
459
+ if method_name == "drop_duplicates" and dedup_mapping is not None:
460
+ # Update step_id in the mapping and store it
461
+ dedup_mapping.step_id = step_id
462
+ store.duplicate_drop_mappings.append(dedup_mapping)
463
+
362
464
 
363
465
  def _propagate_by_index_fallback(
364
466
  row_mgr, source_df: pd.DataFrame, result_df: pd.DataFrame
tracepipe/instrumentation/merge_capture.py CHANGED
@@ -14,7 +14,7 @@ import numpy as np
14
14
  import pandas as pd
15
15
 
16
16
  from ..context import get_context
17
- from ..core import CompletenessLevel, MergeMapping, MergeStats
17
+ from ..core import CompletenessLevel, ConcatMapping, MergeMapping, MergeStats
18
18
  from ..safety import TracePipeWarning, get_caller_info
19
19
 
20
20
 
@@ -382,53 +382,199 @@ def wrap_join_with_lineage(original_join):
382
382
  def wrap_concat_with_lineage(original_concat):
383
383
  """
384
384
  Wrap pd.concat with lineage capture.
385
+
386
+ For axis=0 (vertical concat):
387
+ - Preserves row IDs from source DataFrames (FULL provenance)
388
+ - Tracks which source DataFrame each row came from
389
+
390
+ For axis=1 (horizontal concat):
391
+ - Propagates RIDs if all inputs have identical RID arrays
392
+ - Otherwise marks as PARTIAL
385
393
  """
386
394
 
387
395
  @wraps(original_concat)
388
396
  def wrapper(objs, *args, **kwargs):
389
397
  ctx = get_context()
390
398
 
391
- result = original_concat(objs, *args, **kwargs)
392
-
393
399
  if not ctx.enabled:
394
- return result
400
+ return original_concat(objs, *args, **kwargs)
401
+
402
+ axis = kwargs.get("axis", 0)
403
+
404
+ # === BEFORE: Capture source RIDs from all tracked DataFrames ===
405
+ source_data = [] # [(rids_copy, shape, original_index), ...]
406
+ try:
407
+ objs_list = list(objs) if hasattr(objs, "__iter__") else [objs]
408
+ except TypeError:
409
+ objs_list = [objs]
410
+
411
+ for i, obj in enumerate(objs_list):
412
+ if isinstance(obj, pd.DataFrame) and len(obj) > 0:
413
+ rids = ctx.row_manager.get_ids_array(obj)
414
+ if rids is None:
415
+ rids = ctx.row_manager.register(obj)
416
+ # IMPORTANT: Make a copy to avoid mutation issues
417
+ source_data.append((rids.copy(), obj.shape, i))
418
+
419
+ # === RUN ORIGINAL ===
420
+ try:
421
+ result = original_concat(objs_list, *args, **kwargs)
422
+ except Exception:
423
+ raise # Don't store mapping on failure
395
424
 
396
425
  if not isinstance(result, pd.DataFrame):
397
426
  return result
398
427
 
399
- try:
400
- row_mgr = ctx.row_manager
401
- store = ctx.store
428
+ row_mgr = ctx.row_manager
429
+ store = ctx.store
430
+ code_file, code_line = get_caller_info(skip_frames=2)
402
431
 
403
- # Register result
404
- row_mgr.register(result)
432
+ # Compute input shapes for step metadata
433
+ input_shapes = [sd[1] for sd in source_data]
405
434
 
406
- code_file, code_line = get_caller_info(skip_frames=2)
435
+ # === AXIS=0: Vertical concat with FULL provenance ===
436
+ if axis == 0 and source_data:
437
+ return _concat_axis0_with_provenance(
438
+ result, source_data, input_shapes, code_file, code_line, ctx
439
+ )
407
440
 
408
- # Compute input shapes
409
- input_shapes = []
410
- for obj in objs:
411
- if hasattr(obj, "shape"):
412
- input_shapes.append(obj.shape)
441
+ # === AXIS=1: Horizontal concat ===
442
+ elif axis == 1 and source_data:
443
+ return _concat_axis1_with_provenance(
444
+ result, source_data, input_shapes, code_file, code_line, ctx
445
+ )
413
446
 
447
+ # === FALLBACK: Unknown axis or no source data ===
448
+ else:
449
+ row_mgr.register(result)
414
450
  store.append_step(
415
451
  operation="pd.concat",
416
452
  stage=ctx.current_stage,
417
453
  code_file=code_file,
418
454
  code_line=code_line,
419
455
  params={
420
- "axis": kwargs.get("axis", 0),
421
- "n_inputs": len(objs) if hasattr(objs, "__len__") else 1,
456
+ "axis": axis,
457
+ "n_inputs": len(source_data),
422
458
  },
423
459
  input_shape=tuple(input_shapes) if input_shapes else None,
424
460
  output_shape=result.shape,
425
- completeness=CompletenessLevel.PARTIAL, # Concat resets lineage
461
+ completeness=CompletenessLevel.PARTIAL,
426
462
  )
427
- except Exception as e:
428
- if ctx.config.strict_mode:
429
- raise
430
- warnings.warn(f"TracePipe: Concat capture failed: {e}", TracePipeWarning)
463
+ return result
464
+
465
+ return wrapper
466
+
467
+
468
+ def _concat_axis0_with_provenance(result, source_data, input_shapes, code_file, code_line, ctx):
469
+ """
470
+ Handle axis=0 concat with FULL row provenance.
471
+
472
+ Preserves source RIDs and tracks which source DF each row came from.
473
+ """
474
+ row_mgr = ctx.row_manager
475
+ store = ctx.store
431
476
 
477
+ # Build concatenated RID array and source index array
478
+ all_rids = np.concatenate([sd[0] for sd in source_data])
479
+ all_source_idx = np.concatenate(
480
+ [np.full(len(sd[0]), sd[2], dtype=np.int32) for sd in source_data]
481
+ )
482
+
483
+ # Validate: length must match result
484
+ if len(all_rids) != len(result):
485
+ # Mismatch - some objects contributed differently (empty DFs, Series, etc.)
486
+ # Degrade gracefully to PARTIAL
487
+ row_mgr.register(result)
488
+ store.append_step(
489
+ operation="pd.concat",
490
+ stage=ctx.current_stage,
491
+ code_file=code_file,
492
+ code_line=code_line,
493
+ params={
494
+ "axis": 0,
495
+ "n_inputs": len(source_data),
496
+ "_length_mismatch": True,
497
+ },
498
+ input_shape=tuple(input_shapes) if input_shapes else None,
499
+ output_shape=result.shape,
500
+ completeness=CompletenessLevel.PARTIAL,
501
+ )
432
502
  return result
433
503
 
434
- return wrapper
504
+ # Propagate RIDs to result (preserving lineage!)
505
+ row_mgr.set_result_rids(result, all_rids.copy())
506
+
507
+ # Build sorted arrays for O(log n) lookup
508
+ sort_order = np.argsort(all_rids)
509
+ out_rids_sorted = all_rids[sort_order].copy()
510
+ out_pos_sorted = sort_order.copy()
511
+
512
+ # Record step with FULL completeness
513
+ step_id = store.append_step(
514
+ operation="pd.concat",
515
+ stage=ctx.current_stage,
516
+ code_file=code_file,
517
+ code_line=code_line,
518
+ params={
519
+ "axis": 0,
520
+ "n_inputs": len(source_data),
521
+ },
522
+ input_shape=tuple(input_shapes) if input_shapes else None,
523
+ output_shape=result.shape,
524
+ completeness=CompletenessLevel.FULL,
525
+ )
526
+
527
+ # Store mapping
528
+ mapping = ConcatMapping(
529
+ step_id=step_id,
530
+ out_rids=all_rids.copy(),
531
+ source_indices=all_source_idx.copy(),
532
+ out_rids_sorted=out_rids_sorted,
533
+ out_pos_sorted=out_pos_sorted,
534
+ source_shapes=list(input_shapes),
535
+ )
536
+ store.concat_mappings.append(mapping)
537
+
538
+ return result
539
+
540
+
541
+ def _concat_axis1_with_provenance(result, source_data, input_shapes, code_file, code_line, ctx):
542
+ """
543
+ Handle axis=1 concat with best-effort provenance.
544
+
545
+ If all inputs have identical RID arrays, propagate them (FULL).
546
+ Otherwise, mark as PARTIAL and register new RIDs.
547
+ """
548
+ row_mgr = ctx.row_manager
549
+ store = ctx.store
550
+
551
+ # Check if all inputs have the same RIDs in same order
552
+ first_rids = source_data[0][0]
553
+ all_same = all(
554
+ len(sd[0]) == len(first_rids) and np.array_equal(sd[0], first_rids) for sd in source_data
555
+ )
556
+
557
+ if all_same and len(first_rids) == len(result):
558
+ # All inputs have identical RIDs - propagate them
559
+ row_mgr.set_result_rids(result, first_rids.copy())
560
+ completeness = CompletenessLevel.FULL
561
+ else:
562
+ # Misaligned or different RIDs - register new RIDs
563
+ row_mgr.register(result)
564
+ completeness = CompletenessLevel.PARTIAL
565
+
566
+ store.append_step(
567
+ operation="pd.concat",
568
+ stage=ctx.current_stage,
569
+ code_file=code_file,
570
+ code_line=code_line,
571
+ params={
572
+ "axis": 1,
573
+ "n_inputs": len(source_data),
574
+ },
575
+ input_shape=tuple(input_shapes) if input_shapes else None,
576
+ output_shape=result.shape,
577
+ completeness=completeness,
578
+ )
579
+
580
+ return result
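Together, the two helpers above mean an axis=0 concat no longer resets lineage: source RIDs flow into the result and each row remembers which input frame it came from. A hedged end-to-end sketch (illustrative data; assumes an enabled session so this `pd.concat` is the wrapped one):

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")

jan = pd.DataFrame({"customer_id": ["C1", "C2"], "amount": [10, 20]})
feb = pd.DataFrame({"customer_id": ["C3"], "amount": [30]})

combined = pd.concat([jan, feb], ignore_index=True)   # axis=0 -> FULL provenance

# The last row should report that it came from the second input frame (index 1).
r = tp.trace(combined, row=-1)
print(r.origin)   # expected shape: {"type": "concat", "source_df": 1, "step_id": ...}
```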
tracepipe/snapshot.py CHANGED
@@ -25,7 +25,7 @@ Usage:
25
25
 
26
26
  import json
27
27
  import time
28
- from dataclasses import dataclass
28
+ from dataclasses import dataclass, field
29
29
  from pathlib import Path
30
30
  from typing import Any, Optional
31
31
 
@@ -297,6 +297,20 @@ class DiffResult:
297
297
  recovered_rows: set[int]
298
298
  drops_delta: dict[str, int] # op -> change in count
299
299
  stats_changes: dict[str, dict[str, Any]] # col -> {metric: (old, new)}
300
+ # Column changes
301
+ columns_added: list[str] = field(default_factory=list)
302
+ columns_removed: list[str] = field(default_factory=list)
303
+ # Cell-level changes (only populated if both snapshots have include_values=True)
304
+ cells_changed: int = 0 # Total modified cells
305
+ changed_rows: set[int] = field(default_factory=set) # IDs of rows with value changes
306
+ changes_by_column: dict[str, int] = field(default_factory=dict) # col -> count
307
+
308
+ @property
309
+ def rows_unchanged(self) -> int:
310
+ """Number of rows that exist in both snapshots (may have value changes)."""
311
+ # This is computed from the rows that weren't added or removed
312
+ # Note: This is an estimate based on the smaller snapshot
313
+ return 0 # Will be set during diff computation
300
314
 
301
315
  def __repr__(self) -> str:
302
316
  lines = ["Snapshot Diff:"]
@@ -310,6 +324,18 @@ class DiffResult:
310
324
  if self.recovered_rows:
311
325
  lines.append(f" * {len(self.recovered_rows)} recovered")
312
326
 
327
+ if self.columns_added:
328
+ lines.append(f" Columns added: {', '.join(self.columns_added)}")
329
+ if self.columns_removed:
330
+ lines.append(f" Columns removed: {', '.join(self.columns_removed)}")
331
+
332
+ if self.cells_changed > 0:
333
+ lines.append("\n Changes:")
334
+ lines.append(f" - {self.cells_changed} cells modified")
335
+ if self.changes_by_column:
336
+ for col, count in sorted(self.changes_by_column.items(), key=lambda x: -x[1])[:5]:
337
+ lines.append(f" {col}: {count}")
338
+
313
339
  if self.drops_delta:
314
340
  lines.append(" Drop changes by operation:")
315
341
  for op, delta in sorted(self.drops_delta.items(), key=lambda x: -abs(x[1])):
@@ -339,6 +365,9 @@ class DiffResult:
339
365
  or self.recovered_rows
340
366
  or self.drops_delta
341
367
  or self.stats_changes
368
+ or self.columns_added
369
+ or self.columns_removed
370
+ or self.cells_changed
342
371
  )
343
372
 
344
373
  def to_dict(self) -> dict:
@@ -350,6 +379,11 @@ class DiffResult:
350
379
  "recovered_rows": list(self.recovered_rows),
351
380
  "drops_delta": self.drops_delta,
352
381
  "stats_changes": self.stats_changes,
382
+ "columns_added": self.columns_added,
383
+ "columns_removed": self.columns_removed,
384
+ "cells_changed": self.cells_changed,
385
+ "changed_rows": list(self.changed_rows),
386
+ "changes_by_column": self.changes_by_column,
353
387
  }
354
388
 
355
389
 
@@ -359,6 +393,9 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
359
393
 
360
394
  Note: Cross-run diff is SUMMARY-ONLY unless keys are stored.
361
395
  Row-level comparison only works within same session (same RID assignment).
396
+
397
+ For cell-level diff (cells_changed, changes_by_column), both snapshots
398
+ must have been created with include_values=True.
362
399
  """
363
400
  rows_added = current.row_ids - baseline.row_ids
364
401
  rows_removed = baseline.row_ids - current.row_ids
@@ -375,9 +412,15 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
375
412
  if old != new:
376
413
  drops_delta[op] = new - old
377
414
 
415
+ # Column changes
416
+ baseline_cols = set(baseline.column_stats.keys())
417
+ current_cols = set(current.column_stats.keys())
418
+ columns_added = sorted(current_cols - baseline_cols)
419
+ columns_removed = sorted(baseline_cols - current_cols)
420
+
378
421
  # Stats changes
379
422
  stats_changes: dict[str, dict[str, Any]] = {}
380
- all_cols = set(baseline.column_stats.keys()) | set(current.column_stats.keys())
423
+ all_cols = baseline_cols | current_cols
381
424
  for col in all_cols:
382
425
  old_stats = baseline.column_stats.get(col)
383
426
  new_stats = current.column_stats.get(col)
@@ -396,6 +439,43 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
396
439
  if changes:
397
440
  stats_changes[col] = changes
398
441
 
442
+ # Cell-level changes (only if both snapshots have watched data)
443
+ cells_changed = 0
444
+ changed_rows: set[int] = set()
445
+ changes_by_column: dict[str, int] = {}
446
+
447
+ if baseline.watched_data is not None and current.watched_data is not None:
448
+ # Find common rows and columns
449
+ common_rows = baseline.row_ids & current.row_ids
450
+ common_cols = set(baseline.watched_data.columns) & set(current.watched_data.columns)
451
+
452
+ for rid in common_rows:
453
+ for col in common_cols:
454
+ old_val = baseline.watched_data.get_value(int(rid), col)
455
+ new_val = current.watched_data.get_value(int(rid), col)
456
+
457
+ # Compare values (handle NaN)
458
+ values_equal = False
459
+ if old_val is None and new_val is None:
460
+ values_equal = True
461
+ elif old_val is not None and new_val is not None:
462
+ try:
463
+ # Handle NaN comparison
464
+ if isinstance(old_val, float) and isinstance(new_val, float):
465
+ if old_val != old_val and new_val != new_val: # Both NaN
466
+ values_equal = True
467
+ else:
468
+ values_equal = old_val == new_val
469
+ else:
470
+ values_equal = old_val == new_val
471
+ except (TypeError, ValueError):
472
+ values_equal = str(old_val) == str(new_val)
473
+
474
+ if not values_equal:
475
+ cells_changed += 1
476
+ changed_rows.add(rid)
477
+ changes_by_column[col] = changes_by_column.get(col, 0) + 1
478
+
399
479
  return DiffResult(
400
480
  rows_added=rows_added,
401
481
  rows_removed=rows_removed,
@@ -403,6 +483,11 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
403
483
  recovered_rows=recovered_rows,
404
484
  drops_delta=drops_delta,
405
485
  stats_changes=stats_changes,
486
+ columns_added=columns_added,
487
+ columns_removed=columns_removed,
488
+ cells_changed=cells_changed,
489
+ changed_rows=changed_rows,
490
+ changes_by_column=changes_by_column,
406
491
  )
407
492
 
408
493
 
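A sketch of the cell-level diff in use. Per the note in `diff()`, both snapshots need value capture; the `include_values=True` keyword below is taken from that note, so treat the exact `snapshot()` signature as an assumption:

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")

df = pd.DataFrame({"email": ["a@x.com", None], "score": [1.0, 2.0]})
baseline = tp.snapshot(df, include_values=True)     # keyword assumed, see note above

df["email"] = df["email"].fillna("unknown@x.com")   # one cell changes
df["bonus"] = df["score"] * 0.1                     # one column added

current = tp.snapshot(df, include_values=True)
d = tp.diff(baseline, current)

print(d.columns_added)        # expected: ["bonus"]
print(d.cells_changed)        # 1 if "email" is captured in watched_data, else 0
print(d.changes_by_column)    # e.g. {"email": 1}
print(d.to_dict())            # JSON-friendly export including the new fields
```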
tracepipe/storage/lineage_store.py CHANGED
@@ -22,6 +22,8 @@ from ..core import (
22
22
  AggregationMapping,
23
23
  ChangeType,
24
24
  CompletenessLevel,
25
+ ConcatMapping,
26
+ DuplicateDropMapping,
25
27
  LineageGap,
26
28
  LineageGaps,
27
29
  MergeMapping,
@@ -100,6 +102,12 @@ class InMemoryLineageStore:
100
102
  self.merge_mappings: list[MergeMapping] = []
101
103
  self.merge_stats: list[tuple[int, MergeStats]] = []
102
104
 
105
+ # === CONCAT TRACKING ===
106
+ self.concat_mappings: list[ConcatMapping] = []
107
+
108
+ # === DUPLICATE DROP TRACKING (debug mode) ===
109
+ self.duplicate_drop_mappings: list[DuplicateDropMapping] = []
110
+
103
111
  # === AGGREGATION MAPPINGS ===
104
112
  self.aggregation_mappings: list[AggregationMapping] = []
105
113
 
@@ -361,6 +369,74 @@ class InMemoryLineageStore:
361
369
  return [(sid, s) for sid, s in self.merge_stats if sid == step_id]
362
370
  return list(self.merge_stats)
363
371
 
372
+ # === CONCAT LOOKUP (O(log n) via searchsorted) ===
373
+
374
+ def _binary_search_mapping(
375
+ self, sorted_rids: Optional[np.ndarray], target_rid: int
376
+ ) -> Optional[int]:
377
+ """
378
+ Return index in sorted array, or None if not found.
379
+
380
+ Robust to None/empty arrays and dtype mismatches.
381
+ """
382
+ if sorted_rids is None or len(sorted_rids) == 0:
383
+ return None
384
+
385
+ target = np.int64(target_rid)
386
+ i = np.searchsorted(sorted_rids, target)
387
+
388
+ if i < len(sorted_rids) and sorted_rids[i] == target:
389
+ return int(i)
390
+ return None
391
+
392
+ def get_concat_origin(self, row_id: int) -> Optional[dict]:
393
+ """
394
+ Get which source DataFrame a row came from in a concat.
395
+
396
+ Uses binary search (O(log n)) on sorted RIDs.
397
+
398
+ Returns:
399
+ {step_id, source_index, source_shape, position} if found, else None.
400
+ """
401
+ for mapping in self.concat_mappings:
402
+ idx = self._binary_search_mapping(mapping.out_rids_sorted, row_id)
403
+ if idx is not None:
404
+ pos = int(mapping.out_pos_sorted[idx])
405
+ source_idx = int(mapping.source_indices[pos])
406
+ return {
407
+ "step_id": mapping.step_id,
408
+ "source_index": source_idx,
409
+ "source_shape": (
410
+ mapping.source_shapes[source_idx]
411
+ if source_idx < len(mapping.source_shapes)
412
+ else None
413
+ ),
414
+ "position": pos,
415
+ }
416
+ return None
417
+
418
+ # === DUPLICATE DROP LOOKUP (O(log n) via searchsorted) ===
419
+
420
+ def get_duplicate_representative(self, row_id: int) -> Optional[dict]:
421
+ """
422
+ Get which row replaced this one in drop_duplicates.
423
+
424
+ Returns:
425
+ {step_id, kept_rid, subset_columns, keep_strategy} if found, else None.
426
+ kept_rid is -1 if keep=False (no representative).
427
+ """
428
+ for mapping in self.duplicate_drop_mappings:
429
+ idx = self._binary_search_mapping(mapping.dropped_rids, row_id)
430
+ if idx is not None:
431
+ kept = int(mapping.kept_rids[idx])
432
+ return {
433
+ "step_id": mapping.step_id,
434
+ "kept_rid": kept if kept >= 0 else None,
435
+ "subset_columns": mapping.subset_columns,
436
+ "keep_strategy": mapping.keep_strategy,
437
+ }
438
+ return None
439
+
364
440
  # === MEMORY MANAGEMENT ===
365
441
 
366
442
  def _check_memory_and_spill(self) -> None:
@@ -567,17 +643,17 @@ class InMemoryLineageStore:
567
643
 
568
644
  def get_row_history_with_lineage(self, row_id: int, max_depth: int = 10) -> list[dict]:
569
645
  """
570
- Get row history including pre-merge parent history.
646
+ Get row history including pre-merge and pre-concat parent history.
571
647
 
572
- Follows merge lineage recursively to build complete cell provenance.
573
- This is essential for tracking changes that happened before merge operations.
648
+ Follows merge and concat lineage recursively to build complete cell provenance.
649
+ This is essential for tracking changes that happened before merge/concat operations.
574
650
 
575
651
  Deduplicates events by (col, old_val, new_val, operation) signature to prevent
576
652
  cross-pipeline contamination when multiple DataFrames share row IDs.
577
653
 
578
654
  Args:
579
655
  row_id: Row ID to trace
580
- max_depth: Maximum merge depth to follow (prevents infinite loops)
656
+ max_depth: Maximum lineage depth to follow (prevents infinite loops)
581
657
 
582
658
  Returns:
583
659
  List of UNIQUE events in chronological order, including parent row events.
@@ -592,12 +668,21 @@ class InMemoryLineageStore:
592
668
  events = []
593
669
 
594
670
  # Check if this row came from a merge
595
- origin = self.get_merge_origin(rid)
596
- if origin and origin["left_parent"] is not None:
671
+ merge_origin = self.get_merge_origin(rid)
672
+ if merge_origin and merge_origin["left_parent"] is not None:
597
673
  # Recursively get parent's history first (chronological order)
598
- parent_events = _collect_history(origin["left_parent"], depth + 1)
674
+ parent_events = _collect_history(merge_origin["left_parent"], depth + 1)
599
675
  events.extend(parent_events)
600
676
 
677
+ # Check if this row came from a concat
678
+ # For concat, parent_rid == rid (identity mapping), so we don't recurse
679
+ # But we record the concat step for completeness
680
+ concat_origin = self.get_concat_origin(rid)
681
+ if concat_origin:
682
+ # Concat preserves RIDs, so the "parent" is the same RID
683
+ # The concat step itself is recorded in the step events
684
+ pass
685
+
601
686
  # Add this row's direct events
602
687
  events.extend(self.get_row_history(rid))
603
688
 
tracepipe-0.3.5.dist-info/METADATA → tracepipe-0.4.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tracepipe
3
- Version: 0.3.5
3
+ Version: 0.4.2
4
4
  Summary: Row-level data lineage tracking for pandas pipelines
5
5
  Project-URL: Homepage, https://github.com/tracepipe/tracepipe
6
6
  Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
@@ -278,7 +278,7 @@ tp.enable(mode="debug") # Full lineage
278
278
 
279
279
  ## Known Limitations
280
280
 
281
- TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merge provenance** reliably. However, some patterns are not yet fully supported:
281
+ TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:
282
282
 
283
283
  | Pattern | Status | Notes |
284
284
  |---------|--------|-------|
@@ -286,13 +286,10 @@ TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merg
286
286
  | `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
287
287
  | `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
288
288
  | `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
289
- | `pd.concat([df1, df2])` | ⚠️ Partial | Row IDs preserved, but no "source DataFrame" tracking |
290
- | `df.drop_duplicates(keep='last')` | ⚠️ Partial | Which row was kept is not tracked |
291
- | Sort + dedup patterns | ⚠️ Partial | "Latest record wins" logic not traced |
292
-
293
- **Why?** TracePipe tracks value changes within rows, not row-selection operations. When `drop_duplicates` picks one row over another, that's a provenance decision (not a cell mutation) that isn't currently instrumented.
294
-
295
- **Planned for 0.4**: Full row-provenance tracking for concat, drop_duplicates, and sort operations.
289
+ | `pd.concat([df1, df2])` | Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
290
+ | `df.drop_duplicates()` | Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
291
+ | `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
292
+ | Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |
296
293
 
297
294
  ---
298
295
 
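A hedged sketch of the newly tracked `drop_duplicates` provenance from the user-facing API (illustrative data; debug mode required per the table above):

```python
import pandas as pd
import tracepipe as tp

tp.enable(mode="debug")

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
deduped = df.drop_duplicates(subset=["key"], keep="first")

# The second "a" row was dropped; in debug mode its representative is recorded.
dropped = tp.debug.inspect().dropped_rows()
if dropped:
    r = tp.trace(deduped, row_id=list(dropped)[0])
    print(r.status)           # "dropped"
    print(r.dropped_by)       # "drop_duplicates"
    print(r.representative)   # e.g. {"kept_rid": <RID of the first "a">, "subset": ("key",), "keep": "first"}
```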
tracepipe-0.3.5.dist-info/RECORD → tracepipe-0.4.2.dist-info/RECORD CHANGED
@@ -1,29 +1,29 @@
1
- tracepipe/__init__.py,sha256=HK7i2rACJQdbyz5oMZ4z-xo9xJbS0cUqbS2AK6uMHJU,3342
1
+ tracepipe/__init__.py,sha256=cocA8ETqC1IGgDCXvxue9M4QVzIt8C981b6NTf9BXQ4,3342
2
2
  tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
3
3
  tracepipe/context.py,sha256=DvwAZGZbLDJ4xoqmS1VnZOBbOI8ZIIErsY9W6GEbFSM,4051
4
4
  tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
5
- tracepipe/convenience.py,sha256=KuDz_ZzNivVG1SS8Srr3plu4CTwFmNhYL4rk3vV6cbE,28421
6
- tracepipe/core.py,sha256=kAXks694rR0Z4tD7Gyty0TyJGWx2whsSdteYYpHuazo,8010
7
- tracepipe/debug.py,sha256=6t2GKVZLwn7SJLhrStE9qsmTiVIHATTE3jJPQ2DYtnc,10140
5
+ tracepipe/convenience.py,sha256=ALRtVn6tLfa7Ks7d9hKVJfhLjOLuyFgxTwSoUL0BgHY,38241
6
+ tracepipe/core.py,sha256=O1QFJFTSszxKY1pVR1XLrdA0zez7t5KQLjyq6V36ARk,10883
7
+ tracepipe/debug.py,sha256=S3ga3rVHjDSV4OctkF5uEAQlzjOxFJO8RGC81awGboA,11397
8
8
  tracepipe/safety.py,sha256=UpzhQj31Dij-DjgT9mY_jPrUpVfcA51gDI2fUos4IUA,6694
9
- tracepipe/snapshot.py,sha256=OLREzE1_LkWITluG_Bqeb7Y4pAKb8Lb3zJEF3cxnloU,13967
9
+ tracepipe/snapshot.py,sha256=kvW8be1EAAsyHefXxJPgIQAAYT_FwK167SMxeQcsra4,17921
10
10
  tracepipe/value_provenance.py,sha256=ogky6aOaZ-6K2uNBQxlXpmCeuvK434Hisj30zesRTd8,9330
11
11
  tracepipe/instrumentation/__init__.py,sha256=pd0n6Z9m_V3gcBv097cXWFOZEzAP9sAq1jjQnNRrDZ8,222
12
12
  tracepipe/instrumentation/apply_capture.py,sha256=cMThWzNXqWQENuMrCGTne1hO6fqaQFV7zJYNpsPTW4w,14463
13
- tracepipe/instrumentation/filter_capture.py,sha256=onlYLU5bBZSM3WmxM2AFHfktnlx7ReG-brEn5eZ_N10,15830
13
+ tracepipe/instrumentation/filter_capture.py,sha256=aN8-Ev6kbDR8f9A9JVy236VK0iqNxpMvki3pbtUkBYQ,19445
14
14
  tracepipe/instrumentation/indexer_capture.py,sha256=1ATCeJ-uNA1uGiSbgnUx0wdVsIlZGHeUBaFJPXgFQNg,28440
15
- tracepipe/instrumentation/merge_capture.py,sha256=Eze-PTrn7IXxZRZBYX9R13mOY3diWKAkjp4z-wa1tEk,13349
15
+ tracepipe/instrumentation/merge_capture.py,sha256=zqa6SY5YLbr-N7PPTdE6TYKyJIZcPqT02d1Ifvi3Jdw,18359
16
16
  tracepipe/instrumentation/pandas_inst.py,sha256=h8RlfwYkYwuftCyBYIETdwHxVCzQM1SBBrbYP7SyjJ8,30047
17
17
  tracepipe/instrumentation/series_capture.py,sha256=i7FiA2ndEzS6duIj5y-a7SDfIMl2cCY_jGC1tmG7TGU,11271
18
18
  tracepipe/storage/__init__.py,sha256=pGFMfbIgIi2kofVPwYDqe2HTYMYJoabiGjTq77pYi-g,348
19
19
  tracepipe/storage/base.py,sha256=7DV_-rp37DjBMr9B1w85hLVYhC8OQShk2PcEhT-n4tE,4894
20
- tracepipe/storage/lineage_store.py,sha256=KhGri2uC_O_43fUivFGEHY6KBDHd1I0O_PPd_KD3L4M,28683
20
+ tracepipe/storage/lineage_store.py,sha256=1enRmDgnVjxW8Pu7WMHJ8WPnnbm-HsAm4e1dKsTvnIc,31943
21
21
  tracepipe/storage/row_identity.py,sha256=HBU0gTTJlFtFTcAdUCKuX-c9cHa0lo3CDIodDPDgOzA,17161
22
22
  tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,182
23
23
  tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
24
24
  tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
25
25
  tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
26
- tracepipe-0.3.5.dist-info/METADATA,sha256=bWidBs8nMW6T6oah8xQum_IjdP7Y1J1inDAn-gfHUCg,10288
27
- tracepipe-0.3.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
- tracepipe-0.3.5.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
29
- tracepipe-0.3.5.dist-info/RECORD,,
26
+ tracepipe-0.4.2.dist-info/METADATA,sha256=0nMQRfqFJCg1DMGjWzW_nlFcWMM-q8T4LfoqkMcYmAQ,10067
27
+ tracepipe-0.4.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
+ tracepipe-0.4.2.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
29
+ tracepipe-0.4.2.dist-info/RECORD,,