tracepipe 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl
- tracepipe/__init__.py +1 -1
- tracepipe/convenience.py +131 -12
- tracepipe/debug.py +40 -0
- tracepipe/snapshot.py +87 -2
- {tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/METADATA +1 -1
- {tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/RECORD +8 -8
- {tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/WHEEL +0 -0
- {tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/licenses/LICENSE +0 -0
tracepipe/__init__.py
CHANGED
tracepipe/convenience.py
CHANGED
@@ -60,6 +60,8 @@ class CheckResult:
         .retention - Row retention rate (0.0-1.0)
         .n_dropped - Total rows dropped
         .drops_by_op - Drops broken down by operation
+        .n_changes - Total cell-level changes (debug mode only)
+        .changes_by_op - Changes broken down by operation (debug mode only)
     """

     ok: bool
@@ -69,6 +71,9 @@ class CheckResult:
     mode: str
     # Internal: store drops_by_op so we don't need to recompute
     _drops_by_op: dict[str, int] = field(default_factory=dict)
+    # Internal: store cell change counts (debug mode only)
+    _n_changes: int = 0
+    _changes_by_op: dict[str, int] = field(default_factory=dict)

     # === CONVENIENCE PROPERTIES ===

@@ -97,6 +102,16 @@ class CheckResult:
         """Total pipeline steps recorded."""
         return self.facts.get("total_steps", 0)

+    @property
+    def n_changes(self) -> int:
+        """Total cell-level changes (debug mode only, 0 if not tracked)."""
+        return self._n_changes
+
+    @property
+    def changes_by_op(self) -> dict[str, int]:
+        """Cell changes broken down by operation (debug mode only)."""
+        return self._changes_by_op
+
     # === EXISTING PROPERTIES ===

     @property
@@ -127,6 +142,20 @@ class CheckResult:
         lines.append(f"TracePipe Check: {status}")
         lines.append(f" Mode: {self.mode}")

+        # Always show key metrics in compact form
+        if self.retention is not None:
+            lines.append(f"\nRetention: {int(self.retention * 100)}%")
+        if self.n_dropped > 0:
+            lines.append(f"Dropped: {self.n_dropped} rows")
+        if self.drops_by_op:
+            for op, count in list(self.drops_by_op.items())[:5]:
+                lines.append(f" • {op}: {count}")
+        if self.n_changes > 0:
+            lines.append(f"\nValue changes: {self.n_changes} cells")
+        if self.changes_by_op:
+            for op, count in list(self.changes_by_op.items())[:5]:
+                lines.append(f" • {op}: {count}")
+
         if verbose and self.facts:
             lines.append("\n Measured facts:")
             for k, v in self.facts.items():
@@ -158,6 +187,8 @@ class CheckResult:
             "n_dropped": self.n_dropped,
             "n_steps": self.n_steps,
             "drops_by_op": self.drops_by_op,
+            "n_changes": self.n_changes,
+            "changes_by_op": self.changes_by_op,
             "facts": self.facts,
             "suggestions": self.suggestions,
             "warnings": [
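Read together, these hunks extend the check() summary with cell-level change counts. An illustrative sketch of how the new fields surface to a caller; `tp.check(df)` as the public entry point is an assumption here, mirroring the `tp.trace()` / `tp.why()` examples elsewhere in this diff, and the tracking setup is omitted:

    import tracepipe as tp

    # Assumes tracking was already enabled in debug mode before the pipeline ran;
    # outside debug mode the new fields simply stay at 0 / {}.
    result = tp.check(df)

    print(result.retention)        # existing: row retention rate (0.0-1.0)
    print(result.n_dropped)        # existing: total rows dropped
    print(result.n_changes)        # new in 0.4.2: total cell-level changes
    print(result.changes_by_op)    # new in 0.4.2: {operation: change count}

    payload = result.to_dict()     # now also carries "n_changes" and "changes_by_op"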
@@ -191,6 +222,7 @@ class TraceResult:
     Events are in CHRONOLOGICAL order (oldest->newest).

     Key attributes:
+        status: "alive" or "dropped" (string representation)
         origin: Where this row came from (concat, merge, or original)
         representative: If dropped by dedup, which row was kept instead
     """
@@ -207,6 +239,27 @@ class TraceResult:
     # v0.4+ provenance
     concat_origin: dict[str, Any] | None = None
     dedup_representative: dict[str, Any] | None = None
+    # Steps this row survived (for SURVIVED event generation)
+    _survived_steps: list[dict[str, Any]] = field(default_factory=list)
+
+    @property
+    def status(self) -> str:
+        """Row status as string: 'alive' or 'dropped'."""
+        return "alive" if self.is_alive else "dropped"
+
+    @property
+    def dropped_by(self) -> str | None:
+        """Operation that dropped this row, or None if alive."""
+        if self.dropped_at:
+            return self.dropped_at.get("operation")
+        return None
+
+    @property
+    def dropped_at_step(self) -> int | None:
+        """Step number where this row was dropped, or None if alive."""
+        if self.dropped_at:
+            return self.dropped_at.get("step_id")
+        return None

     @property
     def n_events(self) -> int:
@@ -258,8 +311,10 @@ class TraceResult:
         """Export to dictionary."""
         return {
             "row_id": self.row_id,
+            "status": self.status,
             "is_alive": self.is_alive,
             "dropped_at": self.dropped_at,
+            "dropped_by": self.dropped_at.get("operation") if self.dropped_at else None,
             "origin": self.origin,
             "representative": self.representative,
             "n_events": self.n_events,
@@ -280,10 +335,11 @@ class TraceResult:

         lines = [f"Row {self.row_id} Journey:"]

+        # Status line matches documentation format
         if self.is_alive:
             lines.append(" Status: [OK] Alive")
         else:
-            lines.append(" Status: [
+            lines.append(" Status: [DROPPED]")
             if self.dropped_at:
                 lines.append(
                     f" at step {self.dropped_at['step_id']}: {self.dropped_at['operation']}"
@@ -579,6 +635,21 @@ def check(
         if count > 1000:
             suggestions.append(f"'{op}' dropped {count} rows - review if intentional")

+    # === CELL CHANGES (debug mode only) ===
+    n_changes = 0
+    changes_by_op: dict[str, int] = {}
+    if ctx.config.mode == TracePipeMode.DEBUG:
+        # Count non-drop diffs (cell-level changes)
+        step_map = {s.step_id: s.operation for s in ctx.store.steps}
+        for i in range(len(ctx.store.diff_step_ids)):
+            col = ctx.store.diff_cols[i]
+            if col != "__row__":  # Skip drop events
+                n_changes += 1
+                step_id = ctx.store.diff_step_ids[i]
+                op = step_map.get(step_id, "unknown")
+                changes_by_op[op] = changes_by_op.get(op, 0) + 1
+        facts["n_changes"] = n_changes
+
     ok = len([w for w in warnings_list if w.severity == "fact"]) == 0

     return CheckResult(
@@ -588,6 +659,8 @@ def check(
         suggestions=suggestions,
         mode=ctx.config.mode.value,
         _drops_by_op=drops_by_op,
+        _n_changes=n_changes,
+        _changes_by_op=changes_by_op,
     )


@@ -595,6 +668,7 @@ def trace(
    df: pd.DataFrame,
    *,
    row: int | None = None,
+   row_id: int | None = None,
    where: dict[str, Any] | None = None,
    include_ghost: bool = True,
 ) -> TraceResult | list[TraceResult]:
@@ -603,7 +677,8 @@ def trace(

     Args:
         df: DataFrame to search in
-        row: Row
+        row: Row position (0-based index into current DataFrame)
+        row_id: Internal row ID (use for tracing dropped rows)
         where: Selector dict, e.g. {"customer_id": "C123"}
         include_ghost: Include last-known values for dropped rows

@@ -612,8 +687,14 @@ def trace(
     Use print(result) for pretty output, result.to_dict() for data.

     Examples:
-
-
+        # Trace by position in current DataFrame
+        result = tp.trace(df, row=0)  # First row
+
+        # Trace by internal row ID (for dropped rows)
+        dropped = tp.debug.inspect().dropped_rows()
+        result = tp.trace(df, row_id=dropped[0])
+
+        # Trace by business key
         tp.trace(df, where={"customer_id": "C123"})
     """
     ctx = get_context()
@@ -624,12 +705,30 @@ def trace(
         pass

     # Resolve row IDs
-    if
-
+    if row_id is not None:
+        # Direct row ID specified - use as-is
+        row_ids = [row_id]
+    elif row is not None:
+        # row= is a DataFrame index position (0-based), not a row ID
+        # Convert to actual row ID using the DataFrame's registered IDs
+        rids = ctx.row_manager.get_ids_array(df)
+        if rids is not None:
+            # Handle negative indexing
+            if row < 0:
+                row = len(rids) + row
+            if 0 <= row < len(rids):
+                row_ids = [int(rids[row])]
+            else:
+                raise ValueError(
+                    f"Row index {row} out of bounds for DataFrame with {len(rids)} rows"
+                )
+        else:
+            # DataFrame not tracked - use row as-is (legacy behavior)
+            row_ids = [row]
     elif where is not None:
         row_ids = _resolve_where(df, where, ctx)
     else:
-        raise ValueError("Must provide 'row' or 'where'")
+        raise ValueError("Must provide 'row', 'row_id', or 'where'")

     results = []
     for rid in row_ids:
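The row-ID resolution above is what makes dropped rows reachable: `row=` is interpreted as a position in the current DataFrame, while the new `row_id=` bypasses position lookup entirely. A short sketch of the intended call pattern, assuming tracking is already enabled; the data and column names are illustrative:

    import pandas as pd
    import tracepipe as tp

    df = pd.DataFrame({"customer_id": ["C1", "C2", "C3"], "amount": [10, -5, 30]})
    df = df[df["amount"] > 0]                    # one row is dropped here

    alive = tp.trace(df, row=0)                  # position 0 of the *current* DataFrame
    print(alive.status)                          # "alive"

    dropped = list(tp.debug.inspect().dropped_rows())
    ghost = tp.trace(df, row_id=dropped[0])      # reach the dropped row by internal ID
    print(ghost.status, ghost.dropped_by, ghost.dropped_at_step)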
@@ -644,6 +743,7 @@ def why(
    *,
    col: str,
    row: int | None = None,
+   row_id: int | None = None,
    where: dict[str, Any] | None = None,
 ) -> WhyResult | list[WhyResult]:
     """
@@ -652,7 +752,8 @@ def why(
     Args:
         df: DataFrame to search in
         col: Column name to trace
-        row: Row
+        row: Row position (0-based index into current DataFrame)
+        row_id: Internal row ID (use for cells in dropped rows)
         where: Selector dict, e.g. {"customer_id": "C123"}

     Returns:
@@ -660,7 +761,7 @@ def why(
     Use print(result) for pretty output, result.to_dict() for data.

     Examples:
-        result = tp.why(df, col="amount", row=
+        result = tp.why(df, col="amount", row=0)  # First row
         print(result)
         tp.why(df, col="email", where={"user_id": "U123"})
     """
@@ -676,12 +777,30 @@ def why(
     )

     # Resolve row IDs
-    if
-
+    if row_id is not None:
+        # Direct row ID specified - use as-is
+        row_ids = [row_id]
+    elif row is not None:
+        # row= is a DataFrame index position (0-based), not a row ID
+        # Convert to actual row ID using the DataFrame's registered IDs
+        rids = ctx.row_manager.get_ids_array(df)
+        if rids is not None:
+            # Handle negative indexing
+            if row < 0:
+                row = len(rids) + row
+            if 0 <= row < len(rids):
+                row_ids = [int(rids[row])]
+            else:
+                raise ValueError(
+                    f"Row index {row} out of bounds for DataFrame with {len(rids)} rows"
+                )
+        else:
+            # DataFrame not tracked - use row as-is (legacy behavior)
+            row_ids = [row]
     elif where is not None:
         row_ids = _resolve_where(df, where, ctx)
     else:
-        raise ValueError("Must provide 'row' or 'where'")
+        raise ValueError("Must provide 'row', 'row_id', or 'where'")

     results = []
     for rid in row_ids:
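why() resolves rows with the same row / row_id / where logic, so the new keyword carries over unchanged. A brief companion sketch; the column and key names are illustrative:

    history = tp.why(df, col="amount", row=0)    # cell history by position
    print(history)

    dropped = list(tp.debug.inspect().dropped_rows())
    if dropped:
        print(tp.why(df, col="amount", row_id=dropped[0]))   # cell in a dropped row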
tracepipe/debug.py
CHANGED
@@ -179,6 +179,46 @@ class DebugInspector:
         ctx = get_context()
         return ctx.row_manager.get_ghost_rows(limit=limit)

+    def get_ghost_values(self, row_id: int) -> dict[str, Any] | None:
+        """
+        Get last-known values for a specific dropped row (DEBUG mode only).
+
+        Args:
+            row_id: The row ID to look up
+
+        Returns:
+            Dict mapping column names to their last known values,
+            or None if the row was not found in ghost storage.
+
+        Example:
+            dbg = tp.debug.inspect()
+            dropped_rid = list(dbg.dropped_rows())[0]
+            ghost = dbg.get_ghost_values(dropped_rid)
+            print(f"Last known values: {ghost}")
+        """
+        ctx = get_context()
+        ghost_df = ctx.row_manager.get_ghost_rows(limit=100000)
+
+        if ghost_df.empty or "__tp_row_id__" not in ghost_df.columns:
+            return None
+
+        row_match = ghost_df[ghost_df["__tp_row_id__"] == row_id]
+        if row_match.empty:
+            return None
+
+        # Convert to dict and remove internal columns
+        result = row_match.iloc[0].to_dict()
+        internal_cols = [
+            "__tp_row_id__",
+            "__tp_dropped_by__",
+            "__tp_dropped_step__",
+            "__tp_original_position__",
+        ]
+        for col in internal_cols:
+            result.pop(col, None)
+
+        return result
+
     def stats(self) -> dict:
         """Get comprehensive tracking statistics."""
         ctx = get_context()
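get_ghost_values() closes the loop between spotting a dropped row and seeing what it last looked like. A minimal sketch, assuming DEBUG mode is active so ghost rows are retained:

    import tracepipe as tp

    dbg = tp.debug.inspect()
    for rid in list(dbg.dropped_rows())[:3]:
        ghost = dbg.get_ghost_values(rid)        # last-known values, internal columns stripped
        if ghost is not None:
            print(f"row {rid} dropped with values: {ghost}")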
tracepipe/snapshot.py
CHANGED
@@ -25,7 +25,7 @@ Usage:

 import json
 import time
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Optional

@@ -297,6 +297,20 @@ class DiffResult:
     recovered_rows: set[int]
     drops_delta: dict[str, int]  # op -> change in count
     stats_changes: dict[str, dict[str, Any]]  # col -> {metric: (old, new)}
+    # Column changes
+    columns_added: list[str] = field(default_factory=list)
+    columns_removed: list[str] = field(default_factory=list)
+    # Cell-level changes (only populated if both snapshots have include_values=True)
+    cells_changed: int = 0  # Total modified cells
+    changed_rows: set[int] = field(default_factory=set)  # IDs of rows with value changes
+    changes_by_column: dict[str, int] = field(default_factory=dict)  # col -> count
+
+    @property
+    def rows_unchanged(self) -> int:
+        """Number of rows that exist in both snapshots (may have value changes)."""
+        # This is computed from the rows that weren't added or removed
+        # Note: This is an estimate based on the smaller snapshot
+        return 0  # Will be set during diff computation

     def __repr__(self) -> str:
         lines = ["Snapshot Diff:"]
@@ -310,6 +324,18 @@ class DiffResult:
         if self.recovered_rows:
             lines.append(f" * {len(self.recovered_rows)} recovered")

+        if self.columns_added:
+            lines.append(f" Columns added: {', '.join(self.columns_added)}")
+        if self.columns_removed:
+            lines.append(f" Columns removed: {', '.join(self.columns_removed)}")
+
+        if self.cells_changed > 0:
+            lines.append("\n Changes:")
+            lines.append(f" - {self.cells_changed} cells modified")
+            if self.changes_by_column:
+                for col, count in sorted(self.changes_by_column.items(), key=lambda x: -x[1])[:5]:
+                    lines.append(f" {col}: {count}")
+
         if self.drops_delta:
             lines.append(" Drop changes by operation:")
             for op, delta in sorted(self.drops_delta.items(), key=lambda x: -abs(x[1])):
@@ -339,6 +365,9 @@ class DiffResult:
             or self.recovered_rows
             or self.drops_delta
             or self.stats_changes
+            or self.columns_added
+            or self.columns_removed
+            or self.cells_changed
         )

     def to_dict(self) -> dict:
@@ -350,6 +379,11 @@ class DiffResult:
             "recovered_rows": list(self.recovered_rows),
             "drops_delta": self.drops_delta,
             "stats_changes": self.stats_changes,
+            "columns_added": self.columns_added,
+            "columns_removed": self.columns_removed,
+            "cells_changed": self.cells_changed,
+            "changed_rows": list(self.changed_rows),
+            "changes_by_column": self.changes_by_column,
         }


@@ -359,6 +393,9 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:

     Note: Cross-run diff is SUMMARY-ONLY unless keys are stored.
     Row-level comparison only works within same session (same RID assignment).
+
+    For cell-level diff (cells_changed, changes_by_column), both snapshots
+    must have been created with include_values=True.
     """
     rows_added = current.row_ids - baseline.row_ids
     rows_removed = baseline.row_ids - current.row_ids
@@ -375,9 +412,15 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
         if old != new:
             drops_delta[op] = new - old

+    # Column changes
+    baseline_cols = set(baseline.column_stats.keys())
+    current_cols = set(current.column_stats.keys())
+    columns_added = sorted(current_cols - baseline_cols)
+    columns_removed = sorted(baseline_cols - current_cols)
+
     # Stats changes
     stats_changes: dict[str, dict[str, Any]] = {}
-    all_cols =
+    all_cols = baseline_cols | current_cols
     for col in all_cols:
         old_stats = baseline.column_stats.get(col)
         new_stats = current.column_stats.get(col)
@@ -396,6 +439,43 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
         if changes:
             stats_changes[col] = changes

+    # Cell-level changes (only if both snapshots have watched data)
+    cells_changed = 0
+    changed_rows: set[int] = set()
+    changes_by_column: dict[str, int] = {}
+
+    if baseline.watched_data is not None and current.watched_data is not None:
+        # Find common rows and columns
+        common_rows = baseline.row_ids & current.row_ids
+        common_cols = set(baseline.watched_data.columns) & set(current.watched_data.columns)
+
+        for rid in common_rows:
+            for col in common_cols:
+                old_val = baseline.watched_data.get_value(int(rid), col)
+                new_val = current.watched_data.get_value(int(rid), col)
+
+                # Compare values (handle NaN)
+                values_equal = False
+                if old_val is None and new_val is None:
+                    values_equal = True
+                elif old_val is not None and new_val is not None:
+                    try:
+                        # Handle NaN comparison
+                        if isinstance(old_val, float) and isinstance(new_val, float):
+                            if old_val != old_val and new_val != new_val:  # Both NaN
+                                values_equal = True
+                            else:
+                                values_equal = old_val == new_val
+                        else:
+                            values_equal = old_val == new_val
+                    except (TypeError, ValueError):
+                        values_equal = str(old_val) == str(new_val)
+
+                if not values_equal:
+                    cells_changed += 1
+                    changed_rows.add(rid)
+                    changes_by_column[col] = changes_by_column.get(col, 0) + 1
+
     return DiffResult(
         rows_added=rows_added,
         rows_removed=rows_removed,
@@ -403,6 +483,11 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
         recovered_rows=recovered_rows,
         drops_delta=drops_delta,
         stats_changes=stats_changes,
+        columns_added=columns_added,
+        columns_removed=columns_removed,
+        cells_changed=cells_changed,
+        changed_rows=changed_rows,
+        changes_by_column=changes_by_column,
     )
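With these changes a snapshot diff reports schema drift and value drift, not just row membership. A sketch of reading the extended result; `baseline` and `current` are assumed to be Snapshot objects captured around a pipeline stage with include_values=True (the capture API itself is not shown in this hunk):

    from tracepipe.snapshot import diff

    result = diff(baseline, current)
    print(result)                                   # __repr__ now lists column and cell changes
    print(result.columns_added, result.columns_removed)
    print(result.cells_changed, result.changes_by_column)
    payload = result.to_dict()                      # "changed_rows" is exported as a plain list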
{tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tracepipe
-Version: 0.4.1
+Version: 0.4.2
 Summary: Row-level data lineage tracking for pandas pipelines
 Project-URL: Homepage, https://github.com/tracepipe/tracepipe
 Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
{tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-tracepipe/__init__.py,sha256=
+tracepipe/__init__.py,sha256=cocA8ETqC1IGgDCXvxue9M4QVzIt8C981b6NTf9BXQ4,3342
 tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
 tracepipe/context.py,sha256=DvwAZGZbLDJ4xoqmS1VnZOBbOI8ZIIErsY9W6GEbFSM,4051
 tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
-tracepipe/convenience.py,sha256=
+tracepipe/convenience.py,sha256=ALRtVn6tLfa7Ks7d9hKVJfhLjOLuyFgxTwSoUL0BgHY,38241
 tracepipe/core.py,sha256=O1QFJFTSszxKY1pVR1XLrdA0zez7t5KQLjyq6V36ARk,10883
-tracepipe/debug.py,sha256=
+tracepipe/debug.py,sha256=S3ga3rVHjDSV4OctkF5uEAQlzjOxFJO8RGC81awGboA,11397
 tracepipe/safety.py,sha256=UpzhQj31Dij-DjgT9mY_jPrUpVfcA51gDI2fUos4IUA,6694
-tracepipe/snapshot.py,sha256=
+tracepipe/snapshot.py,sha256=kvW8be1EAAsyHefXxJPgIQAAYT_FwK167SMxeQcsra4,17921
 tracepipe/value_provenance.py,sha256=ogky6aOaZ-6K2uNBQxlXpmCeuvK434Hisj30zesRTd8,9330
 tracepipe/instrumentation/__init__.py,sha256=pd0n6Z9m_V3gcBv097cXWFOZEzAP9sAq1jjQnNRrDZ8,222
 tracepipe/instrumentation/apply_capture.py,sha256=cMThWzNXqWQENuMrCGTne1hO6fqaQFV7zJYNpsPTW4w,14463
@@ -23,7 +23,7 @@ tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,1
 tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
 tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
 tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
-tracepipe-0.4.
-tracepipe-0.4.
-tracepipe-0.4.
-tracepipe-0.4.
+tracepipe-0.4.2.dist-info/METADATA,sha256=0nMQRfqFJCg1DMGjWzW_nlFcWMM-q8T4LfoqkMcYmAQ,10067
+tracepipe-0.4.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+tracepipe-0.4.2.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
+tracepipe-0.4.2.dist-info/RECORD,,
{tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/WHEEL
File without changes
{tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/licenses/LICENSE
File without changes