tracepipe 0.2.0-py3-none-any.whl → 0.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +117 -78
- tracepipe/api.py +219 -332
- tracepipe/context.py +21 -1
- tracepipe/contracts.py +473 -0
- tracepipe/convenience.py +817 -0
- tracepipe/core.py +174 -17
- tracepipe/debug.py +325 -0
- tracepipe/instrumentation/apply_capture.py +453 -0
- tracepipe/instrumentation/filter_capture.py +468 -0
- tracepipe/instrumentation/indexer_capture.py +813 -0
- tracepipe/instrumentation/merge_capture.py +434 -0
- tracepipe/instrumentation/pandas_inst.py +66 -183
- tracepipe/instrumentation/series_capture.py +331 -0
- tracepipe/safety.py +3 -3
- tracepipe/snapshot.py +420 -0
- tracepipe/storage/base.py +7 -3
- tracepipe/storage/lineage_store.py +252 -47
- tracepipe/storage/row_identity.py +366 -104
- tracepipe/value_provenance.py +309 -0
- tracepipe/visualization/html_export.py +22 -7
- tracepipe-0.3.1.dist-info/METADATA +308 -0
- tracepipe-0.3.1.dist-info/RECORD +29 -0
- tracepipe-0.2.0.dist-info/METADATA +0 -508
- tracepipe-0.2.0.dist-info/RECORD +0 -19
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.1.dist-info}/WHEEL +0 -0
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.1.dist-info}/licenses/LICENSE +0 -0
tracepipe/api.py
CHANGED
@@ -1,18 +1,28 @@
 # tracepipe/api.py
 """
-
+Core API for TracePipe.
+
+This module provides the foundational enable/disable/reset functions
+and internal result classes. For user-facing functionality, see:
+- convenience.py: check(), trace(), why(), report()
+- debug.py: inspect(), export()
+- contracts.py: contract()
+- snapshot.py: snapshot(), diff()
+
+Modes:
+- CI: Fast stats and drop tracking. No merge provenance or ghost values.
+- DEBUG: Full provenance with merge origin tracking and ghost row values.
 """
 
 from __future__ import annotations
 
 import sys
 import types
+from collections.abc import Sequence
 from dataclasses import fields
 
-import pandas as pd
-
 from .context import TracePipeContext, get_context, reset_context, set_context
-from .core import LineageGaps, TracePipeConfig
+from .core import LineageGaps, TracePipeConfig, TracePipeMode
 from .instrumentation.pandas_inst import instrument_pandas, uninstrument_pandas
 from .storage.base import LineageBackend, RowIdentityStrategy
 
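The rewritten module docstring above summarizes the 0.3.1 reorganization: api.py keeps the enable/disable/reset core, and user-facing helpers move to the new submodules added in this release. A minimal orientation sketch in Python, assuming those names are re-exported at package level (the +117/-78 change to tracepipe/__init__.py suggests this, but the __init__.py hunks are not shown here):

    import tracepipe as tp

    tp.enable(mode="ci")     # fast: stats and drop tracking only
    tp.enable(mode="debug")  # full: merge provenance and ghost row values

    # User-facing entry points per the docstring (signatures not in this diff):
    #   tp.check(), tp.trace(), tp.why(), tp.report()  -> convenience.py
    #   tp.inspect(), tp.export()                      -> debug.py
    #   tp.contract()                                  -> contracts.py
    #   tp.snapshot(), tp.diff()                       -> snapshot.py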
@@ -24,48 +34,102 @@ def enable(
 
 def enable(
     config: TracePipeConfig | None = None,
+    mode: TracePipeMode | str | None = None,
+    *,
+    watch: Sequence[str] | None = None,
     auto_watch: bool = False,
     backend: LineageBackend | None = None,
     identity: RowIdentityStrategy | None = None,
+    merge_provenance: bool | None = None,
+    ghost_row_values: bool | None = None,
+    cell_history: bool | None = None,
+    sample_rate: float | None = None,
+    max_tracked_rows: int | None = None,
 ) -> types.ModuleType:
     """
     Enable TracePipe lineage tracking.
 
     Args:
-        config: Optional configuration
+        config: Optional configuration object
+        mode: Operating mode - "ci" (fast) or "debug" (full provenance)
+        watch: List of columns to watch for cell-level changes
         auto_watch: If True, automatically watch columns with nulls
-        backend: Optional custom storage backend
-        identity: Optional custom row identity strategy
+        backend: Optional custom storage backend
+        identity: Optional custom row identity strategy
+        merge_provenance: Override: capture merge parent RIDs (DEBUG default: True)
+        ghost_row_values: Override: capture last values of dropped rows
+        cell_history: Override: capture cell-level changes
+        sample_rate: Track only this fraction of rows (0.0-1.0)
+        max_tracked_rows: Maximum rows to track (for large datasets)
 
     Returns:
         The tracepipe module for fluent chaining.
 
     Examples:
-        #
-
-
-        # Fluent chaining
-        tracepipe.enable().watch("age", "salary")
+        # CI mode (fast, default)
+        tp.enable()
 
-        #
-
-        tracepipe.enable(backend=SQLiteLineageStore(config, "lineage.db"))
+        # Debug mode with watched columns
+        tp.enable(mode="debug", watch=["age", "salary"])
 
-        #
-
-        tracepipe.enable(identity=PolarsRowIdentity(config))
+        # Custom configuration
+        tp.enable(mode="ci", merge_provenance=True)
     """
+    ctx = get_context()
+
+    # If already enabled, reset accumulated state to prevent duplicate warnings/stats
+    # This handles the common case of re-running scripts in notebooks/IDEs
+    if ctx.enabled:
+        _reset_accumulated_state(ctx)
+
+    # Get or create config
+    # If config is provided explicitly, use it
+    # Otherwise, start with existing context config (if any) or create new default
+    if config is None:
+        config = ctx.config  # Use existing config as base
+
+    # Handle mode
+    if mode is not None:
+        if isinstance(mode, str):
+            mode = TracePipeMode(mode.lower())
+        config.mode = mode
+
+    # Apply feature overrides
+    if merge_provenance is not None:
+        config.merge_provenance = merge_provenance
+    if ghost_row_values is not None:
+        config.ghost_row_values = ghost_row_values
+    if cell_history is not None:
+        config.cell_history = cell_history
+
+    if auto_watch:
+        config.auto_watch = True
+
+    # Sampling config validation
+    if sample_rate is not None or max_tracked_rows is not None:
+        import warnings
+
+        warnings.warn(
+            "sample_rate and max_tracked_rows are not yet implemented. "
+            "These parameters will be ignored.",
+            UserWarning,
+            stacklevel=2,
+        )
+
     # Create context with custom backends if provided
     if backend is not None or identity is not None:
         ctx = TracePipeContext(config=config, backend=backend, identity=identity)
         set_context(ctx)
     else:
-        ctx =
-
-
+        ctx.config = config
+        # Also update config in row_manager and store (they may have their own references)
+        ctx.row_manager.config = config
+        ctx.store.config = config
 
-    if
-
+    # Add watched columns (reset first if re-enabling to avoid stale watches)
+    if watch:
+        ctx.watched_columns.clear()
+        ctx.watched_columns.update(watch)
 
     if not ctx.enabled:
         instrument_pandas()
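A usage sketch of the widened enable() signature above, drawn only from the new parameters, docstring, and body (tp is assumed to be the imported tracepipe package):

    import pandas as pd
    import tracepipe as tp

    # Mode plus keyword-only feature overrides
    tp.enable(mode="debug", watch=["age", "salary"], ghost_row_values=True)

    df = pd.DataFrame({"age": [30.0, None, 45.0], "salary": [50.0, 60.0, None]})
    df = df.dropna()

    # Re-running enable() in the same process resets accumulated state
    # instead of double-counting (the notebook re-run case handled above).
    tp.enable(mode="debug", watch=["age", "salary"])

    # Accepted but not yet implemented: emits a UserWarning and is ignored.
    tp.enable(sample_rate=0.5)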
@@ -74,14 +138,57 @@
     return _get_module()
 
 
+def _reset_accumulated_state(ctx: TracePipeContext) -> None:
+    """
+    Reset accumulated lineage state without disabling instrumentation.
+
+    Called when enable() is invoked on an already-enabled context to prevent
+    state accumulation across multiple script runs in the same Python process.
+    """
+    store = ctx.store
+
+    # Clear merge stats (prevents duplicate warnings)
+    if hasattr(store, "merge_stats"):
+        store.merge_stats.clear()
+
+    # Clear bulk drops
+    if hasattr(store, "bulk_drops"):
+        store.bulk_drops.clear()
+
+    # Clear steps
+    if hasattr(store, "_steps"):
+        store._steps.clear()
+
+    # Clear in-memory diffs
+    if hasattr(store, "_clear_in_memory"):
+        store._clear_in_memory()
+
+    # Reset step counter
+    if hasattr(store, "_step_counter"):
+        store._step_counter = 0
+
+    # Clear merge mappings
+    if hasattr(store, "merge_mappings"):
+        store.merge_mappings.clear()
+
+    # Clear aggregation mappings
+    if hasattr(store, "aggregation_mappings"):
+        store.aggregation_mappings.clear()
+
+    # Reset row identity manager
+    ctx.row_manager.clear()
+
+    # Clear watched columns (will be re-added if watch param provided)
+    ctx.watched_columns.clear()
+
+
 def disable() -> types.ModuleType:
     """
     Disable TracePipe and restore original pandas methods.
 
     Note:
         This stops tracking but preserves lineage data collected so far.
-
-        To clear all data, use reset() instead.
+        Use reset() to clear all data.
 
     Returns:
         The tracepipe module for fluent chaining.
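The new _reset_accumulated_state() helper above is what makes repeated enable() calls safe: stores, counters, mappings, and watches are cleared while pandas stays instrumented. Combined with the trimmed disable()/reset() docstrings, the lifecycle is roughly this sketch:

    import tracepipe as tp

    tp.enable()    # instruments pandas
    tp.enable()    # same process: accumulated state cleared, not duplicated
    tp.disable()   # stops tracking; lineage collected so far is kept
    tp.reset()     # clears ALL lineage state; re-enables only if tracking was on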
@@ -90,7 +197,6 @@ def disable() -> types.ModuleType:
 
     if ctx.enabled:
         uninstrument_pandas()
-        # Call cleanup if backend supports it
         if hasattr(ctx.store, "_cleanup_spillover"):
             ctx.store._cleanup_spillover()
         ctx.enabled = False
@@ -105,11 +211,6 @@ def reset() -> types.ModuleType:
     This clears ALL lineage data, steps, watched columns, and row registrations.
     If tracking was enabled, it will be re-enabled with a fresh context.
 
-    Use this when:
-    - Starting fresh in a notebook cell
-    - Running multiple independent analyses
-    - Testing
-
     Returns:
         The tracepipe module for fluent chaining.
     """
@@ -122,7 +223,6 @@ def reset() -> types.ModuleType:
     reset_context()
 
     if was_enabled:
-        # Re-enable with fresh context
         enable()
 
     return _get_module()
@@ -133,38 +233,17 @@ def configure(**kwargs) -> types.ModuleType:
     Update configuration.
 
     Args:
-        **kwargs: Configuration options to update.
-            - max_diffs_in_memory: Maximum diffs before spilling to disk
-            - max_diffs_per_step: Threshold for mass update detection
-            - max_group_membership_size: Threshold for count-only groups
-            - strict_mode: Raise exceptions on tracking errors
-            - auto_watch: Auto-watch columns with null values
-            - auto_watch_null_threshold: Null ratio threshold for auto-watch
-            - spillover_dir: Directory for spilled data
-            - use_hidden_column: Use hidden column for row tracking
-            - warn_on_duplicate_index: Warn on duplicate DataFrame index
-            - cleanup_spillover_on_disable: Clean up spilled files on disable
+        **kwargs: Configuration options to update.
 
     Returns:
         The tracepipe module for fluent chaining.
-
-    Raises:
-        ValueError: If an invalid configuration key is provided.
-
-    Examples:
-        tracepipe.configure(max_diffs_per_step=1000)
-        tracepipe.enable().configure(strict_mode=True).watch("amount")
     """
     ctx = get_context()
 
-    # Validate keys against dataclass fields
     valid_keys = {f.name for f in fields(TracePipeConfig)}
     invalid_keys = set(kwargs.keys()) - valid_keys
     if invalid_keys:
-        raise ValueError(
-            f"Invalid configuration key(s): {invalid_keys}. "
-            f"Valid keys are: {sorted(valid_keys)}"
-        )
+        raise ValueError(f"Invalid configuration key(s): {invalid_keys}")
 
     for key, value in kwargs.items():
         setattr(ctx.config, key, value)
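configure() still validates kwargs against the TracePipeConfig dataclass fields; only the error message lost its listing of valid keys. A sketch, assuming strict_mode remains a TracePipeConfig field as it was in the 0.2.0 docstring removed above:

    import tracepipe as tp

    tp.configure(strict_mode=True)  # any TracePipeConfig field name is accepted

    try:
        tp.configure(no_such_option=1)
    except ValueError as err:
        print(err)  # Invalid configuration key(s): {'no_such_option'}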
@@ -172,110 +251,76 @@ def configure(**kwargs) -> types.ModuleType:
     return _get_module()
 
 
-def
-    """
-    Add columns to watch for cell-level changes.
-
-    Args:
-        *columns: Column names to watch.
-
-    Returns:
-        The tracepipe module for fluent chaining.
+def stage(name: str):
+    """Context manager for naming pipeline stages."""
 
-
-
-
-
-    ctx = get_context()
-    ctx.watched_columns.update(columns)
-    return _get_module()
+    class StageContext:
+        def __init__(self, stage_name: str):
+            self.stage_name = stage_name
+            self.previous_stage = None
 
+        def __enter__(self):
+            ctx = get_context()
+            self.previous_stage = ctx.current_stage
+            ctx.current_stage = self.stage_name
+            return self
 
-def
-
-
+        def __exit__(self, *args):
+            ctx = get_context()
+            ctx.current_stage = self.previous_stage
 
-
-        df: DataFrame whose columns to watch.
+    return StageContext(name)
 
-    Returns:
-        The tracepipe module for fluent chaining.
 
-
-        tracepipe.watch_all(df)
+def register(*dfs) -> types.ModuleType:
     """
-
-    ctx.watched_columns.update(df.columns.tolist())
-    return _get_module()
+    Register pre-existing DataFrames for tracking.
 
-
-
-    """
-    Remove columns from watch list.
+    Use this when DataFrames were created before tp.enable() was called.
+    After registration, snapshots, ghost rows, and cell history will work.
 
     Args:
-        *
-
-    Returns:
-        The tracepipe module for fluent chaining.
-    """
-    ctx = get_context()
-    ctx.watched_columns.difference_update(columns)
-    return _get_module()
-
-
-def clear_watch() -> types.ModuleType:
-    """
-    Clear all watched columns.
+        *dfs: One or more DataFrames to register
 
     Returns:
         The tracepipe module for fluent chaining.
 
     Examples:
-
-
-
-    ctx.watched_columns.clear()
-    return _get_module()
+        # DataFrames created before enable
+        df1 = pd.DataFrame({"a": [1, 2, 3]})
+        df2 = pd.DataFrame({"b": [4, 5, 6]})
 
+        tp.enable()
+        tp.register(df1, df2)  # Now they're tracked
 
-
+        snap = tp.snapshot(df1)  # Works!
     """
-
-
-    Use this for DataFrames created before enable() was called.
+    import pandas as pd
 
-    Returns:
-        The tracepipe module for fluent chaining.
-    """
     ctx = get_context()
-    if ctx.enabled:
-        ctx.row_manager.register(df)
-    return _get_module()
-
 
-
-
+    if not ctx.enabled:
+        import warnings
 
-
-
-
-
+        warnings.warn(
+            "TracePipe is not enabled. Call tp.enable() before tp.register().",
+            UserWarning,
+            stacklevel=2,
+        )
+        return _get_module()
 
-
-
-
-        ctx.current_stage = self.stage_name
-        return self
+    for df in dfs:
+        if not isinstance(df, pd.DataFrame):
+            raise TypeError(f"Expected DataFrame, got {type(df).__name__}")
 
-
-
-    ctx.
+        # Only register if not already registered
+        if ctx.row_manager.get_ids_array(df) is None:
+            ctx.row_manager.register(df)
 
-    return
+    return _get_module()
 
 
-# ===
+# === INTERNAL RESULT CLASSES (used by debug module) ===
 
 
 class RowLineageResult:
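This hunk drops the old column-watch helpers (watch(), watch_all(), unwatch(), and clear_watch(), judging by the surviving docstring fragments) and rewrites stage() and register() in their place; watching is now configured through enable(watch=...). A combined sketch, assuming stage() and register() are re-exported at package level:

    import pandas as pd
    import tracepipe as tp

    df = pd.DataFrame({"a": [1, 2, 3]})  # created before enable()

    tp.enable(watch=["a"])               # replaces tracepipe.watch("a")
    tp.register(df)                      # late registration, per the docstring

    with tp.stage("cleaning"):           # steps in here carry stage="cleaning"
        df = df[df["a"] > 1]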
@@ -284,54 +329,82 @@ class RowLineageResult:
     def __init__(self, row_id: int, ctx: TracePipeContext):
         self.row_id = row_id
         self._ctx = ctx
-        self._history =
-        self._gaps =
+        self._history: list[dict] | None = None
+        self._gaps: LineageGaps | None = None
+        self._drop_event: dict | None = None
+        self._drop_event_checked: bool = False
+
+    def _ensure_drop_event(self) -> None:
+        if not self._drop_event_checked:
+            self._drop_event = self._ctx.store.get_drop_event(self.row_id)
+            self._drop_event_checked = True
+
+    def _ensure_history(self) -> None:
+        if self._history is None:
+            self._history = self._ctx.store.get_row_history(self.row_id)
+
+    def _ensure_gaps(self) -> None:
+        if self._gaps is None:
+            self._gaps = self._ctx.store.compute_gaps(self.row_id)
 
     @property
     def is_alive(self) -> bool:
-
-        return
+        self._ensure_drop_event()
+        return self._drop_event is None
 
     @property
     def dropped_at(self) -> str | None:
-
-
-
-                return h["operation"]
+        self._ensure_drop_event()
+        if self._drop_event is not None:
+            return self._drop_event.get("operation")
         return None
 
+    @property
+    def dropped_step_id(self) -> int | None:
+        self._ensure_drop_event()
+        if self._drop_event is not None:
+            return self._drop_event.get("step_id")
+        return None
+
+    def merge_origin(self) -> dict | None:
+        return self._ctx.store.get_merge_origin(self.row_id)
+
     def cell_history(self, column: str) -> list[dict]:
-
+        self._ensure_history()
         return [h for h in self._history if h["col"] == column]
 
     def history(self) -> list[dict]:
-
+        self._ensure_history()
         return self._history
 
     @property
     def gaps(self) -> LineageGaps:
-
+        self._ensure_gaps()
         return self._gaps
 
     @property
     def is_fully_tracked(self) -> bool:
-
+        self._ensure_gaps()
         return self._gaps.is_fully_tracked
 
     def to_dict(self) -> dict:
-
+        self._ensure_history()
+        self._ensure_gaps()
+        merge = self.merge_origin()
         return {
             "row_id": self.row_id,
             "is_alive": self.is_alive,
             "dropped_at": self.dropped_at,
+            "dropped_step_id": self.dropped_step_id,
             "is_fully_tracked": self.is_fully_tracked,
             "gaps_summary": self._gaps.summary(),
+            "merge_origin": merge,
             "history": self._history,
         }
 
     def __repr__(self):
         status = "alive" if self.is_alive else f"dropped at {self.dropped_at}"
-        return f"<RowLineage row_id={self.row_id} {status} events={len(self.
+        return f"<RowLineage row_id={self.row_id} {status} events={len(self.history())}>"
 
 
 class GroupLineageResult:
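RowLineageResult above switches from eager lookups to memoized _ensure_*() helpers, so each store query runs at most once and only when first needed. A stripped-down illustration of the idiom (generic code, not TracePipe's):

    class LazyAttr:
        # Memoize an expensive loader behind a property, mirroring
        # _ensure_history()/_ensure_gaps() in the hunk above.
        def __init__(self, loader):
            self._loader = loader
            self._value = None
            self._loaded = False

        @property
        def value(self):
            if not self._loaded:          # first access pays the query cost
                self._value = self._loader()
                self._loaded = True
            return self._value            # later accesses reuse the result

Note the separate _drop_event_checked flag in the real class: None is a meaningful answer there (the row is alive), so None cannot double as the "not loaded yet" sentinel.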
@@ -344,45 +417,25 @@ class GroupLineageResult:
 
     @property
     def row_ids(self) -> list[int]:
-        """Get list of row IDs in this group."""
         return self._info["row_ids"] if self._info else []
 
     @property
     def row_count(self) -> int:
-        """Get number of rows in this group."""
         return self._info["row_count"] if self._info else 0
 
     @property
     def is_count_only(self) -> bool:
-        """
-        True if group exceeded max_group_membership_size threshold.
-
-        When True, row_ids will be empty and only row_count is available.
-        """
         return self._info.get("is_count_only", False) if self._info else False
 
     @property
     def group_column(self) -> str | None:
-        """Get the column used for grouping."""
         return self._info["group_column"] if self._info else None
 
     @property
     def aggregation_functions(self) -> dict[str, str]:
-        """Get the aggregation functions applied."""
         return self._info["agg_functions"] if self._info else {}
 
-    def get_contributing_rows(self, limit: int = 100) -> list[RowLineageResult]:
-        """
-        Get lineage for contributing rows.
-
-        Returns empty list if is_count_only is True.
-        """
-        if self.is_count_only:
-            return []
-        return [explain(row_id) for row_id in self.row_ids[:limit]]
-
     def to_dict(self) -> dict:
-        """Export to dictionary."""
         return {
             "group_key": self.group_key,
             "group_column": self.group_column,
@@ -395,169 +448,3 @@ class GroupLineageResult:
     def __repr__(self):
         suffix = " (count only)" if self.is_count_only else ""
         return f"<GroupLineage key='{self.group_key}' rows={self.row_count}{suffix}>"
-
-
-def explain(row_id: int) -> RowLineageResult:
-    """Get lineage for a specific row."""
-    ctx = get_context()
-    return RowLineageResult(row_id, ctx)
-
-
-def explain_many(row_ids: list[int]) -> list[RowLineageResult]:
-    """
-    Get lineage for multiple rows.
-
-    Args:
-        row_ids: List of row IDs to explain.
-
-    Returns:
-        List of RowLineageResult objects.
-
-    Examples:
-        results = tracepipe.explain_many([0, 1, 2])
-        for row in results:
-            print(row.is_alive, row.dropped_at)
-    """
-    ctx = get_context()
-    return [RowLineageResult(row_id, ctx) for row_id in row_ids]
-
-
-def explain_group(group_key: str) -> GroupLineageResult:
-    """Get lineage for an aggregation group."""
-    ctx = get_context()
-    return GroupLineageResult(group_key, ctx)
-
-
-def dropped_rows(by_step: bool = False) -> list[int] | dict[str, int]:
-    """
-    Get dropped row information.
-
-    Args:
-        by_step: If False (default), return list of dropped row IDs.
-            If True, return dict mapping operation names to drop counts.
-
-    Returns:
-        List of row IDs if by_step=False, or dict of {operation: count} if by_step=True.
-
-    Examples:
-        # Get all dropped row IDs
-        dropped = tracepipe.dropped_rows()
-
-        # Get counts by operation
-        by_op = tracepipe.dropped_rows(by_step=True)
-        # {'DataFrame.dropna': 5, 'DataFrame.query': 3}
-    """
-    ctx = get_context()
-    if by_step:
-        return ctx.store.get_dropped_by_step()
-    return ctx.store.get_dropped_rows()
-
-
-def alive_rows() -> list[int]:
-    """
-    Get all row IDs that are still alive (not dropped).
-
-    Returns:
-        List of row IDs that have not been dropped.
-
-    Examples:
-        alive = tracepipe.alive_rows()
-        print(f"{len(alive)} rows survived the pipeline")
-    """
-    ctx = get_context()
-    all_registered = set(ctx.row_manager.all_registered_ids())
-    dropped = set(ctx.store.get_dropped_rows())
-    return sorted(all_registered - dropped)
-
-
-def mass_updates() -> list[dict]:
-    """Get operations that exceeded cell diff threshold."""
-    ctx = get_context()
-    return [
-        {
-            "step_id": s.step_id,
-            "operation": s.operation,
-            "rows_affected": s.rows_affected,
-            "stage": s.stage,
-        }
-        for s in ctx.store.steps
-        if s.is_mass_update
-    ]
-
-
-def steps() -> list[dict]:
-    """Get all tracked steps."""
-    ctx = get_context()
-    return [
-        {
-            "step_id": s.step_id,
-            "operation": s.operation,
-            "stage": s.stage,
-            "input_shape": s.input_shape,
-            "output_shape": s.output_shape,
-            "completeness": s.completeness.name,
-            "is_mass_update": s.is_mass_update,
-            "timestamp": s.timestamp,
-            "code_file": s.code_file,
-            "code_line": s.code_line,
-        }
-        for s in ctx.store.steps
-    ]
-
-
-def aggregation_groups() -> list[str]:
-    """List all tracked aggregation groups."""
-    ctx = get_context()
-    groups = []
-    for mapping in ctx.store.aggregation_mappings:
-        groups.extend(mapping.membership.keys())
-    return groups
-
-
-# === EXPORT ===
-
-
-def export_json(filepath: str) -> None:
-    """Export lineage to JSON file."""
-    ctx = get_context()
-    with open(filepath, "w") as f:
-        f.write(ctx.store.to_json())
-
-
-def export_arrow(filepath: str) -> None:
-    """
-    Export lineage to Parquet file.
-
-    Requires pyarrow to be installed.
-
-    Args:
-        filepath: Path to write the Parquet file.
-
-    Raises:
-        ImportError: If pyarrow is not installed.
-    """
-    try:
-        import pyarrow.parquet as pq
-    except ImportError:
-        raise ImportError(
-            "pyarrow is required for Arrow/Parquet export. "
-            "Install it with: pip install tracepipe[arrow] or pip install pyarrow"
-        ) from None
-
-    ctx = get_context()
-    table = ctx.store.to_arrow()
-    pq.write_table(table, filepath)
-
-
-def stats() -> dict:
-    """Get tracking statistics."""
-    ctx = get_context()
-    return {
-        "enabled": ctx.enabled,
-        "total_steps": len(ctx.store.steps),
-        "total_diffs": ctx.store.total_diff_count,
-        "in_memory_diffs": ctx.store.diff_count,
-        "spilled_files": len(ctx.store.spilled_files),
-        "watched_columns": list(ctx.watched_columns),
-        "aggregation_groups": len(ctx.store.aggregation_mappings),
-    }
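The final hunk removes the 0.2.0 module-level query and export helpers from api.py entirely. Per the new module docstring, their roles pass to the new submodules; the mapping below is inferred from names alone, since the replacement signatures are not part of this diff:

    # Removed from tracepipe/api.py in 0.3.1:
    #   explain(), explain_many(), explain_group(), dropped_rows(),
    #   alive_rows(), mass_updates(), steps(), aggregation_groups(),
    #   export_json(), export_arrow(), stats()
    #
    # Likely successors (per the docstring; unverified here):
    #   convenience.py: check(), trace(), why(), report()
    #   debug.py: inspect(), export()
    #   snapshot.py: snapshot(), diff()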