tracepipe-0.2.0-py3-none-any.whl → tracepipe-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +117 -78
- tracepipe/api.py +168 -331
- tracepipe/context.py +21 -1
- tracepipe/contracts.py +473 -0
- tracepipe/convenience.py +812 -0
- tracepipe/core.py +174 -17
- tracepipe/debug.py +325 -0
- tracepipe/instrumentation/apply_capture.py +453 -0
- tracepipe/instrumentation/filter_capture.py +468 -0
- tracepipe/instrumentation/indexer_capture.py +813 -0
- tracepipe/instrumentation/merge_capture.py +434 -0
- tracepipe/instrumentation/pandas_inst.py +66 -183
- tracepipe/instrumentation/series_capture.py +331 -0
- tracepipe/safety.py +3 -3
- tracepipe/snapshot.py +420 -0
- tracepipe/storage/base.py +7 -3
- tracepipe/storage/lineage_store.py +190 -47
- tracepipe/storage/row_identity.py +366 -104
- tracepipe/value_provenance.py +301 -0
- tracepipe/visualization/html_export.py +22 -7
- tracepipe-0.3.0.dist-info/METADATA +575 -0
- tracepipe-0.3.0.dist-info/RECORD +29 -0
- tracepipe-0.2.0.dist-info/METADATA +0 -508
- tracepipe-0.2.0.dist-info/RECORD +0 -19
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/WHEEL +0 -0
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/licenses/LICENSE +0 -0
tracepipe/core.py
CHANGED
@@ -1,11 +1,19 @@
 # tracepipe/core.py
 """
 Core types, enums, and configuration for TracePipe.
+
+Design Principles:
+1. Pandas Execution is Authoritative: TracePipe never re-implements operations
+2. Trust Over Features: Mark PARTIAL when uncertain; never lie about completeness
+3. Don't Touch User Data: No DataFrame mutation by default
+4. Modes for Adoption: CI mode (fast) vs Debug mode (deep)
+5. NumPy-First: Vectorized operations; no Python loops over millions of rows
 """
 
 import os
+import time
 from dataclasses import dataclass, field
-from enum import IntEnum
+from enum import Enum, IntEnum
 from typing import Any, Optional
 
 
@@ -22,9 +30,9 @@ class CompletenessLevel(IntEnum):
     """
     Indicates how completely an operation's internals are tracked.
 
-    FULL:
-    PARTIAL: Output tracked, internals
-    UNKNOWN:
+    FULL: Complete lineage captured (dropna, drop_duplicates, boolean indexing)
+    PARTIAL: Output tracked, internals approximate (query with @var, merge in CI mode)
+    UNKNOWN: Operation not instrumented (future: uninstrumented custom ops)
    """
 
     FULL = 0
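Since CompletenessLevel is an IntEnum with FULL = 0 < PARTIAL = 1 < UNKNOWN = 2, consumers can rank steps by how trustworthy their lineage is using plain comparisons. A minimal sketch against the types in this diff (the helper name is illustrative, not part of the package):

```python
from tracepipe.core import CompletenessLevel

def flag_incomplete(steps):
    """Illustrative helper: steps whose lineage is anything less than FULL."""
    # IntEnum ordering: FULL (0) < PARTIAL (1) < UNKNOWN (2)
    return [s for s in steps if s.completeness > CompletenessLevel.FULL]
```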
@@ -32,49 +40,157 @@ class CompletenessLevel(IntEnum):
     UNKNOWN = 2
 
 
+class TracePipeMode(Enum):
+    """TracePipe operating modes."""
+
+    CI = "ci"        # Fast: stats, drops, contracts
+    DEBUG = "debug"  # Deep: merge provenance, ghost values, cell history
+
+
+class IdentityStorage(Enum):
+    """Row identity storage strategies."""
+
+    REGISTRY = "registry"  # Default: WeakKeyDictionary, no data mutation
+    COLUMN = "column"      # Opt-in: hidden column (for edge cases)
+    ATTRS = "attrs"        # Alternative: df.attrs token
+
+
 @dataclass
 class TracePipeConfig:
     """Configuration with sensible defaults."""
 
+    # Memory limits
     max_diffs_in_memory: int = 500_000
     max_diffs_per_step: int = 100_000
-    max_group_membership_size: int = 100_000
+    max_group_membership_size: int = 100_000
+
+    # Behavior options
     strict_mode: bool = False
     auto_watch: bool = False
     auto_watch_null_threshold: float = 0.01
     spillover_dir: str = ".tracepipe"
-    use_hidden_column: bool = False
     warn_on_duplicate_index: bool = True
     cleanup_spillover_on_disable: bool = True
 
+    # Mode system
+    mode: TracePipeMode = TracePipeMode.CI
+
+    # Identity storage (default to registry, not column)
+    identity_storage: IdentityStorage = IdentityStorage.REGISTRY
+
+    # Feature overrides (None = use mode default)
+    merge_provenance: Optional[bool] = None
+    ghost_row_values: Optional[bool] = None
+    cell_history: Optional[bool] = None
+
+    # Ghost row limits
+    max_ghost_rows: int = 10_000
+
+    @property
+    def should_capture_merge_provenance(self) -> bool:
+        if self.merge_provenance is not None:
+            return self.merge_provenance
+        return self.mode == TracePipeMode.DEBUG
+
+    @property
+    def should_capture_ghost_values(self) -> bool:
+        if self.ghost_row_values is not None:
+            return self.ghost_row_values
+        return self.mode == TracePipeMode.DEBUG
+
+    @property
+    def should_capture_cell_history(self) -> bool:
+        if self.cell_history is not None:
+            return self.cell_history
+        return self.mode == TracePipeMode.DEBUG
+
+    @property
+    def use_hidden_column(self) -> bool:
+        return self.identity_storage == IdentityStorage.COLUMN
+
+    @property
+    def use_attrs_token(self) -> bool:
+        return self.identity_storage == IdentityStorage.ATTRS
+
     @classmethod
     def from_env(cls) -> "TracePipeConfig":
         """Create config from environment variables."""
+        mode_str = os.environ.get("TRACEPIPE_MODE", "ci")
         return cls(
+            mode=TracePipeMode.DEBUG if mode_str == "debug" else TracePipeMode.CI,
             max_diffs_in_memory=int(os.environ.get("TRACEPIPE_MAX_DIFFS", 500_000)),
             max_diffs_per_step=int(os.environ.get("TRACEPIPE_MAX_DIFFS_PER_STEP", 100_000)),
             strict_mode=os.environ.get("TRACEPIPE_STRICT", "0") == "1",
             auto_watch=os.environ.get("TRACEPIPE_AUTO_WATCH", "0") == "1",
-            use_hidden_column=os.environ.get("TRACEPIPE_HIDDEN_COL", "0") == "1",
         )
 
 
 @dataclass
-class
-"""
+class StepEvent:
+    """
+    Stable schema for pipeline step events.
+
+    This schema is designed to be stable across versions.
+    New fields should be added as Optional with defaults.
+    """
 
     step_id: int
     operation: str
-
-
-
-
-
-
-
+    timestamp: float = field(default_factory=time.time)
+
+    # Context
+    stage: Optional[str] = None
+    code_file: Optional[str] = None
+    code_line: Optional[int] = None
+
+    # Shape tracking
+    input_shape: Optional[tuple[int, ...]] = None
+    output_shape: Optional[tuple[int, ...]] = None
+
+    # Parameters (operation-specific)
+    params: dict[str, Any] = field(default_factory=dict)
+
+    # Completeness
+    completeness: CompletenessLevel = CompletenessLevel.FULL
+
+    # Mass update tracking
     is_mass_update: bool = False
     rows_affected: int = 0
-
+
+    # Error tracking
+    error: Optional[str] = None
+    error_type: Optional[str] = None
+
+    @property
+    def code_location(self) -> Optional[str]:
+        """Human-readable code location."""
+        if self.code_file and self.code_line:
+            return f"{self.code_file}:{self.code_line}"
+        return None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Serialize to dict (for JSON export)."""
+        return {
+            "step_id": self.step_id,
+            "operation": self.operation,
+            "timestamp": self.timestamp,
+            "stage": self.stage,
+            "code_location": self.code_location,
+            "code_file": self.code_file,
+            "code_line": self.code_line,
+            "input_shape": self.input_shape,
+            "output_shape": self.output_shape,
+            "params": self.params,
+            "completeness": self.completeness.name,
+            "is_mass_update": self.is_mass_update,
+            "rows_affected": self.rows_affected,
+            "error": self.error,
+            "error_type": self.error_type,
+        }
+
+
+# Backwards compatibility alias
+StepMetadata = StepEvent
 
 
 @dataclass
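The three should_capture_* properties above encode one resolution rule: an explicit override wins, otherwise the mode decides. A short sketch exercising the dataclass exactly as defined in this hunk:

```python
from tracepipe.core import TracePipeConfig, TracePipeMode

cfg = TracePipeConfig()  # defaults to CI mode
assert cfg.should_capture_merge_provenance is False  # CI: deep capture off

cfg = TracePipeConfig(mode=TracePipeMode.DEBUG)
assert cfg.should_capture_merge_provenance is True   # DEBUG: on by default

# An explicit override always beats the mode default:
cfg = TracePipeConfig(mode=TracePipeMode.DEBUG, merge_provenance=False)
assert cfg.should_capture_merge_provenance is False
```

The same rule applies to ghost_row_values and cell_history, and setting TRACEPIPE_MODE=debug flips the default for all three via from_env().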
@@ -120,3 +236,44 @@ class LineageGaps:
             return f"1 step has limited visibility: {self.gaps[0].operation}"
         else:
             return f"{len(self.gaps)} steps have limited visibility"
+
+
+@dataclass
+class GhostRowInfo:
+    """Information about a dropped row."""
+
+    row_id: int
+    last_values: dict[str, Any]
+    dropped_by: str
+    dropped_step: int
+    original_position: int
+
+
+@dataclass
+class MergeMapping:
+    """
+    Array-based merge mapping (memory efficient).
+
+    Arrays are stored SORTED by out_rids to enable O(log n) lookup
+    via binary search instead of O(n) linear scan.
+    """
+
+    step_id: int
+    out_rids: Any           # numpy array, SORTED for binary search
+    left_parent_rids: Any   # numpy array, -1 for no match, same order as out_rids
+    right_parent_rids: Any  # numpy array, -1 for no match, same order as out_rids
+
+
+@dataclass
+class MergeStats:
+    """Merge statistics."""
+
+    left_rows: int
+    right_rows: int
+    result_rows: int
+    expansion_ratio: float
+    left_match_rate: float   # -1 if not computed
+    right_match_rate: float  # -1 if not computed
+    left_dup_rate: float     # -1 if not computed
+    right_dup_rate: float    # -1 if not computed
+    how: str
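MergeMapping's docstring promises O(log n) parent lookup because out_rids is stored sorted. The store-side lookup itself is not shown in this diff; the sketch below is a hypothetical illustration of such a binary search with NumPy (parent_rids is not a TracePipe API):

```python
import numpy as np

def parent_rids(mapping, out_rid: int):
    """Hypothetical: resolve one output row ID to its (left, right) parents."""
    # Binary search over the sorted out_rids array: O(log n)
    i = int(np.searchsorted(mapping.out_rids, out_rid))
    if i < len(mapping.out_rids) and mapping.out_rids[i] == out_rid:
        # -1 encodes "no matching parent on that side" per the field comments
        return int(mapping.left_parent_rids[i]), int(mapping.right_parent_rids[i])
    return None  # out_rid not produced by this merge step
```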
tracepipe/debug.py
ADDED
@@ -0,0 +1,325 @@
+# tracepipe/debug.py
+"""
+Debug namespace for TracePipe power users.
+
+This module provides low-level introspection and raw access to lineage data.
+For most use cases, prefer the top-level convenience API (check, trace, why, report).
+
+Usage:
+    import tracepipe as tp
+
+    # Access debug inspector
+    dbg = tp.debug.inspect()
+    dbg.steps             # All recorded steps
+    dbg.dropped_rows()    # All dropped row IDs
+    dbg.explain_row(42)   # Raw row lineage
+    dbg.export("json")    # Export lineage data
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+
+import pandas as pd
+
+from .context import get_context
+
+if TYPE_CHECKING:
+    from .api import GroupLineageResult, RowLineageResult
+    from .core import StepEvent
+
+
+@dataclass
+class DebugInspector:
+    """
+    Debug inspector providing raw access to TracePipe internals.
+
+    This is the primary entry point for power users who need
+    low-level access to lineage data.
+    """
+
+    @property
+    def enabled(self) -> bool:
+        """True if TracePipe is currently enabled."""
+        return get_context().enabled
+
+    @property
+    def mode(self) -> str:
+        """Current mode: 'ci' or 'debug'."""
+        return get_context().config.mode.value
+
+    @property
+    def steps(self) -> list[StepEvent]:
+        """All recorded pipeline steps."""
+        return get_context().store.steps
+
+    @property
+    def watched_columns(self) -> set:
+        """Currently watched columns."""
+        return get_context().watched_columns.copy()
+
+    def watch(self, *columns: str) -> DebugInspector:
+        """
+        Add columns to watch for cell-level tracking.
+
+        Args:
+            *columns: Column names to watch.
+
+        Returns:
+            Self for chaining.
+        """
+        get_context().watched_columns.update(columns)
+        return self
+
+    @property
+    def total_diffs(self) -> int:
+        """Total number of diffs (including spilled)."""
+        return get_context().store.total_diff_count
+
+    @property
+    def in_memory_diffs(self) -> int:
+        """Number of diffs currently in memory."""
+        return get_context().store.diff_count
+
+    def dropped_rows(self, step_id: int | None = None) -> list[int]:
+        """
+        Get all dropped row IDs.
+
+        Args:
+            step_id: If provided, only return drops from this step.
+
+        Returns:
+            List of dropped row IDs.
+        """
+        return get_context().store.get_dropped_rows(step_id)
+
+    def dropped_by_operation(self) -> dict:
+        """Get count of dropped rows per operation."""
+        return get_context().store.get_dropped_by_step()
+
+    def alive_rows(self) -> list[int]:
+        """Get all row IDs that are still alive (not dropped)."""
+        ctx = get_context()
+        all_registered = set(ctx.row_manager.all_registered_ids())
+        dropped = set(ctx.store.get_dropped_rows())
+        return sorted(all_registered - dropped)
+
+    def explain_row(self, row_id: int) -> RowLineageResult:
+        """
+        Get lineage for a specific row.
+
+        Returns a RowLineageResult object with:
+        - row_id: int
+        - is_alive: bool
+        - dropped_at: Optional[str]
+        - history(): List[dict]
+        - cell_history(col): List[dict]
+        - to_dict(): dict
+        """
+        from .api import RowLineageResult
+
+        return RowLineageResult(row_id, get_context())
+
+    def explain_group(self, group_key: str) -> GroupLineageResult:
+        """Get aggregation group membership."""
+        from .api import GroupLineageResult
+
+        return GroupLineageResult(group_key, get_context())
+
+    def aggregation_groups(self) -> list[str]:
+        """List all tracked aggregation groups."""
+        ctx = get_context()
+        groups = []
+        for mapping in ctx.store.aggregation_mappings:
+            groups.extend(mapping.membership.keys())
+        return groups
+
+    def merge_stats(self, step_id: int | None = None) -> list[dict]:
+        """Get merge operation statistics."""
+        ctx = get_context()
+        stats_list = ctx.store.get_merge_stats(step_id)
+        return [
+            {
+                "step_id": sid,
+                "left_rows": s.left_rows,
+                "right_rows": s.right_rows,
+                "result_rows": s.result_rows,
+                "expansion_ratio": s.expansion_ratio,
+                "left_match_rate": s.left_match_rate,
+                "right_match_rate": s.right_match_rate,
+                "how": s.how,
+            }
+            for sid, s in stats_list
+        ]
+
+    def mass_updates(self) -> list[dict]:
+        """Get operations that exceeded cell diff threshold."""
+        ctx = get_context()
+        return [
+            {
+                "step_id": s.step_id,
+                "operation": s.operation,
+                "rows_affected": s.rows_affected,
+                "stage": s.stage,
+            }
+            for s in ctx.store.steps
+            if s.is_mass_update
+        ]
+
+    def ghost_rows(self, limit: int = 1000) -> pd.DataFrame:
+        """
+        Get dropped rows with their last-known values (DEBUG mode only).
+
+        Returns DataFrame with columns:
+        - __tp_row_id__: Original row ID
+        - __tp_dropped_by__: Operation that dropped the row
+        - [watched columns]: Last known values
+        """
+        ctx = get_context()
+        return ctx.row_manager.get_ghost_rows(limit=limit)
+
+    def stats(self) -> dict:
+        """Get comprehensive tracking statistics."""
+        ctx = get_context()
+        return {
+            "enabled": ctx.enabled,
+            "mode": ctx.config.mode.value,
+            "total_steps": len(ctx.store.steps),
+            "total_diffs": ctx.store.total_diff_count,
+            "in_memory_diffs": ctx.store.diff_count,
+            "spilled_files": len(ctx.store.spilled_files),
+            "watched_columns": list(ctx.watched_columns),
+            "aggregation_groups": len(ctx.store.aggregation_mappings),
+            "merge_mappings": len(ctx.store.merge_mappings),
+            "features": {
+                "merge_provenance": ctx.config.should_capture_merge_provenance,
+                "ghost_row_values": ctx.config.should_capture_ghost_values,
+                "cell_history": ctx.config.should_capture_cell_history,
+            },
+        }
+
+    def export(self, format: str = "json", path: str | None = None) -> str | None:
+        """
+        Export lineage data.
+
+        Args:
+            format: "json" or "arrow"
+            path: File path. If None, returns JSON string (json format only).
+
+        Returns:
+            JSON string if path is None and format is "json", else None.
+        """
+        ctx = get_context()
+
+        if format == "json":
+            json_str = ctx.store.to_json()
+            if path:
+                with open(path, "w") as f:
+                    f.write(json_str)
+                return None
+            return json_str
+        elif format == "arrow":
+            if path is None:
+                raise ValueError("path is required for arrow export")
+            try:
+                import pyarrow.parquet as pq
+            except ImportError:
+                raise ImportError(
+                    "pyarrow is required for Arrow export. "
+                    "Install with: pip install tracepipe[arrow]"
+                ) from None
+            table = ctx.store.to_arrow()
+            pq.write_table(table, path)
+            return None
+        else:
+            raise ValueError(f"Unknown format: {format}. Use 'json' or 'arrow'.")
+
+    def register(self, df: pd.DataFrame) -> None:
+        """Manually register a DataFrame for tracking."""
+        ctx = get_context()
+        if ctx.enabled:
+            ctx.row_manager.register(df)
+
+    def get_row_ids(self, df: pd.DataFrame) -> Any | None:
+        """Get row IDs array for a DataFrame."""
+        ctx = get_context()
+        return ctx.row_manager.get_ids_array(df)
+
+    def __repr__(self) -> str:
+        ctx = get_context()
+        if not ctx.enabled:
+            return "<DebugInspector enabled=False>"
+        return (
+            f"<DebugInspector mode={ctx.config.mode.value} "
+            f"steps={len(ctx.store.steps)} "
+            f"diffs={ctx.store.total_diff_count}>"
+        )
+
+
+def inspect() -> DebugInspector:
+    """
+    Get a debug inspector for TracePipe internals.
+
+    Returns:
+        DebugInspector with access to steps, diffs, and raw lineage data.
+
+    Example:
+        dbg = tp.debug.inspect()
+        print(dbg.steps)
+        print(dbg.dropped_rows())
+        dbg.export("json", "lineage.json")
+    """
+    return DebugInspector()
+
+
+# Convenience aliases for common debug operations
+def export_json(path: str) -> None:
+    """Export lineage to JSON file."""
+    inspect().export("json", path)
+
+
+def export_arrow(path: str) -> None:
+    """Export lineage to Parquet file."""
+    inspect().export("arrow", path)
+
+
+def find(
+    df: pd.DataFrame,
+    *,
+    where: dict | None = None,
+    predicate=None,
+    limit: int = 50,
+) -> list[int]:
+    """
+    Find row IDs matching a selector.
+
+    This is a debug utility for discovering row IDs that can be used
+    with trace() and why(). Row IDs are internal identifiers and should
+    not be persisted across sessions.
+
+    Args:
+        df: DataFrame to search
+        where: Exact match selector, e.g. {"status": "failed"}
+        predicate: Vector predicate (df -> boolean Series)
+        limit: Maximum number of IDs to return (default 50)
+
+    Returns:
+        List of internal row IDs (for use with trace/why row= parameter)
+
+    Example:
+        rids = tp.debug.find(df, where={"status": "failed"})
+        for rid in rids[:3]:
+            print(tp.trace(df, row=rid))
+    """
+    # Import here to avoid circular imports
+    from .convenience import _resolve_predicate, _resolve_where
+
+    ctx = get_context()
+
+    if where:
+        return _resolve_where(df, where, ctx, limit=limit)
+    elif predicate:
+        return _resolve_predicate(df, predicate, ctx, limit=limit)
+    else:
+        raise ValueError("Must provide 'where' or 'predicate'")