PyPI - tracepipe - Versions diffs - 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl - Mend

tracepipe 0.3.0 (py3-none-any.whl) → 0.3.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

tracepipe/__init__.py +1 -1
tracepipe/api.py +54 -4
tracepipe/convenience.py +17 -7
tracepipe/storage/lineage_store.py +63 -1
tracepipe/value_provenance.py +32 -24
tracepipe-0.3.2.dist-info/METADATA +308 -0
{tracepipe-0.3.0.dist-info → tracepipe-0.3.2.dist-info}/RECORD +9 -9
tracepipe-0.3.0.dist-info/METADATA +0 -575
{tracepipe-0.3.0.dist-info → tracepipe-0.3.2.dist-info}/WHEEL +0 -0
{tracepipe-0.3.0.dist-info → tracepipe-0.3.2.dist-info}/licenses/LICENSE +0 -0

tracepipe/__init__.py CHANGED Viewed

@@ -81,7 +81,7 @@ from .core import TracePipeConfig, TracePipeMode
 from .snapshot import DiffResult, Snapshot, diff, snapshot
 # === VERSION ===
-__version__ = "0.3.0"
+__version__ = "0.3.2"
 # === MINIMAL __all__ ===
 __all__ = [

tracepipe/api.py CHANGED Viewed

@@ -75,12 +75,18 @@ def enable(
         # Custom configuration
         tp.enable(mode="ci", merge_provenance=True)
     """
+    ctx = get_context()
+    # If already enabled, reset accumulated state to prevent duplicate warnings/stats
+    # This handles the common case of re-running scripts in notebooks/IDEs
+    if ctx.enabled:
+        _reset_accumulated_state(ctx)
     # Get or create config
     # If config is provided explicitly, use it
     # Otherwise, start with existing context config (if any) or create new default
     if config is None:
-        existing_ctx = get_context()
-        config = existing_ctx.config  # Use existing config as base
+        config = ctx.config  # Use existing config as base
     # Handle mode
     if mode is not None:
@@ -115,14 +121,14 @@ def enable(
         ctx = TracePipeContext(config=config, backend=backend, identity=identity)
         set_context(ctx)
     else:
-        ctx = get_context()
         ctx.config = config
         # Also update config in row_manager and store (they may have their own references)
         ctx.row_manager.config = config
         ctx.store.config = config
-    # Add watched columns
+    # Add watched columns (reset first if re-enabling to avoid stale watches)
     if watch:
+        ctx.watched_columns.clear()
         ctx.watched_columns.update(watch)
     if not ctx.enabled:
@@ -132,6 +138,50 @@ def enable(
     return _get_module()
+def _reset_accumulated_state(ctx: TracePipeContext) -> None:
+    """
+    Reset accumulated lineage state without disabling instrumentation.
+    Called when enable() is invoked on an already-enabled context to prevent
+    state accumulation across multiple script runs in the same Python process.
+    """
+    store = ctx.store
+    # Clear merge stats (prevents duplicate warnings)
+    if hasattr(store, "merge_stats"):
+        store.merge_stats.clear()
+    # Clear bulk drops
+    if hasattr(store, "bulk_drops"):
+        store.bulk_drops.clear()
+    # Clear steps
+    if hasattr(store, "_steps"):
+        store._steps.clear()
+    # Clear in-memory diffs
+    if hasattr(store, "_clear_in_memory"):
+        store._clear_in_memory()
+    # Reset step counter
+    if hasattr(store, "_step_counter"):
+        store._step_counter = 0
+    # Clear merge mappings
+    if hasattr(store, "merge_mappings"):
+        store.merge_mappings.clear()
+    # Clear aggregation mappings
+    if hasattr(store, "aggregation_mappings"):
+        store.aggregation_mappings.clear()
+    # Reset row identity manager
+    ctx.row_manager.clear()
+    # Clear watched columns (will be re-added if watch param provided)
+    ctx.watched_columns.clear()
 def disable() -> types.ModuleType:
     """
     Disable TracePipe and restore original pandas methods.

tracepipe/convenience.py CHANGED Viewed

@@ -385,22 +385,27 @@ def check(
                 )
             )
-        if stats.left_dup_rate > 0.01:
+        # Note on dup_rate semantics:
+        # - left_dup_rate = fraction of LEFT rows appearing >1 times in result
+        #   This happens when RIGHT table has duplicate join keys
+        # - right_dup_rate = fraction of RIGHT rows appearing >1 times in result
+        #   This happens when LEFT table has duplicate join keys
+        if stats.right_dup_rate > 0.01:
             warnings_list.append(
                 CheckWarning(
                     category="duplicate_keys",
                     severity="fact",
-                    message=f"Left table has {stats.left_dup_rate:.1%} duplicate join keys",
-                    details={"step_id": step_id, "dup_rate": stats.left_dup_rate},
+                    message=f"Left table has {stats.right_dup_rate:.1%} duplicate join keys",
+                    details={"step_id": step_id, "dup_rate": stats.right_dup_rate},
                 )
             )
-        if stats.right_dup_rate > 0.01:
+        if stats.left_dup_rate > 0.01:
             warnings_list.append(
                 CheckWarning(
                     category="duplicate_keys",
                     severity="fact",
-                    message=f"Right table has {stats.right_dup_rate:.1%} duplicate join keys",
-                    details={"step_id": step_id, "dup_rate": stats.right_dup_rate},
+                    message=f"Right table has {stats.left_dup_rate:.1%} duplicate join keys",
+                    details={"step_id": step_id, "dup_rate": stats.left_dup_rate},
                 )
             )
@@ -733,9 +738,14 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
     store = ctx.store
     drop_event = store.get_drop_event(row_id)
-    history = store.get_row_history(row_id)
     merge_origin = store.get_merge_origin(row_id)
+    # Use lineage-aware history to include pre-merge parent events
+    if hasattr(store, "get_row_history_with_lineage"):
+        history = store.get_row_history_with_lineage(row_id)
+    else:
+        history = store.get_row_history(row_id)
     dropped_at = None
     if drop_event:
         dropped_at = {

tracepipe/storage/lineage_store.py CHANGED Viewed

@@ -485,6 +485,9 @@ class InMemoryLineageStore:
         CONTRACT: Returned list has monotonically increasing step_id.
         Convenience layer may reverse for display.
+        Note: This returns only direct events for this row_id.
+        Use get_row_history_with_lineage() to include pre-merge parent history.
         """
         step_map = {s.step_id: s for s in self._steps}
         events = []
@@ -546,6 +549,65 @@ class InMemoryLineageStore:
         return events
+    def get_row_history_with_lineage(self, row_id: int, max_depth: int = 10) -> list[dict]:
+        """
+        Get row history including pre-merge parent history.
+        Follows merge lineage recursively to build complete cell provenance.
+        This is essential for tracking changes that happened before merge operations.
+        Args:
+            row_id: Row ID to trace
+            max_depth: Maximum merge depth to follow (prevents infinite loops)
+        Returns:
+            List of events in chronological order, including parent row events.
+        """
+        visited: set[int] = set()
+        def _collect_history(rid: int, depth: int) -> list[dict]:
+            if depth > max_depth or rid in visited:
+                return []
+            visited.add(rid)
+            events = []
+            # Check if this row came from a merge
+            origin = self.get_merge_origin(rid)
+            if origin and origin["left_parent"] is not None:
+                # Recursively get parent's history first (chronological order)
+                parent_events = _collect_history(origin["left_parent"], depth + 1)
+                events.extend(parent_events)
+            # Add this row's direct events
+            events.extend(self.get_row_history(rid))
+            return events
+        all_events = _collect_history(row_id, 0)
+        # Sort by step_id to ensure chronological order across lineage
+        all_events.sort(key=lambda e: e["step_id"])
+        return all_events
+    def get_cell_history_with_lineage(
+        self, row_id: int, column: str, max_depth: int = 10
+    ) -> list[dict]:
+        """
+        Get cell history for a specific column, including pre-merge parent history.
+        Args:
+            row_id: Row ID to trace
+            column: Column name to filter events for
+            max_depth: Maximum merge depth to follow
+        Returns:
+            List of events for this column in chronological order.
+        """
+        all_events = self.get_row_history_with_lineage(row_id, max_depth)
+        return [e for e in all_events if e["col"] == column]
     def get_dropped_rows(self, step_id: Optional[int] = None) -> list[int]:
         """Get all dropped row IDs, optionally filtered by step."""
         if step_id is not None:
@@ -648,7 +710,7 @@ class InMemoryLineageStore:
         diffs = list(self._iter_all_diffs())
         data = {
-            "tracepipe_version": "0.3.0",
+            "tracepipe_version": "0.3.2",
             "export_timestamp": time.time(),
             "total_diffs": len(diffs),
             "total_steps": len(self._steps),

tracepipe/value_provenance.py CHANGED Viewed

@@ -19,7 +19,6 @@ from typing import Any, Optional
 import pandas as pd
 from .context import get_context
-from .core import ChangeType
 @dataclass
@@ -96,7 +95,12 @@ class ValueHistory:
         }
-def explain_value(row_id: int, column: str, df: Optional[pd.DataFrame] = None) -> ValueHistory:
+def explain_value(
+    row_id: int,
+    column: str,
+    df: Optional[pd.DataFrame] = None,
+    follow_lineage: bool = True,
+) -> ValueHistory:
     """
     Get complete history of a specific cell's value.
@@ -104,6 +108,7 @@ def explain_value(row_id: int, column: str, df: Optional[pd.DataFrame] = None) -
         row_id: Row ID to trace
         column: Column name
         df: Optional DataFrame for current value lookup
+        follow_lineage: If True, include pre-merge parent history (default: True)
     Returns:
         ValueHistory with all changes to this cell
@@ -121,35 +126,38 @@ def explain_value(row_id: int, column: str, df: Optional[pd.DataFrame] = None) -
             if len(matches) > 0 and column in df.columns:
                 current_value = df.iloc[matches[0]][column]
-    # Collect all events for this cell
+    # Collect events - use lineage-aware method if requested
+    if follow_lineage and hasattr(store, "get_cell_history_with_lineage"):
+        # Get cell history including pre-merge parent history
+        raw_events = store.get_cell_history_with_lineage(row_id, column)
+    else:
+        # Fallback to direct row_id lookup only
+        raw_events = [e for e in store.get_row_history(row_id) if e["col"] == column]
+    # Convert to ValueEvent objects
     events = []
-    step_map = {s.step_id: s for s in store.steps}
     became_null_at = None
     became_null_by = None
-    for diff in store._iter_all_diffs():
-        if diff["row_id"] == row_id and diff["col"] == column:
-            step = step_map.get(diff["step_id"])
-            events.append(
-                ValueEvent(
-                    step_id=diff["step_id"],
-                    operation=step.operation if step else "unknown",
-                    old_value=diff["old_val"],
-                    new_value=diff["new_val"],
-                    change_type=ChangeType(diff["change_type"]).name,
-                    timestamp=step.timestamp if step else 0,
-                    code_location=(
-                        f"{step.code_file}:{step.code_line}" if step and step.code_file else None
-                    ),
-                )
+    for diff in raw_events:
+        events.append(
+            ValueEvent(
+                step_id=diff["step_id"],
+                operation=diff.get("operation", "unknown"),
+                old_value=diff["old_val"],
+                new_value=diff["new_val"],
+                change_type=diff.get("change_type", "UNKNOWN"),
+                timestamp=diff.get("timestamp", 0) or 0,
+                code_location=diff.get("code_location"),
             )
+        )
-            # Track when value became null
-            if became_null_at is None and pd.isna(diff["new_val"]) and not pd.isna(diff["old_val"]):
-                became_null_at = diff["step_id"]
-                became_null_by = step.operation if step else "unknown"
+        # Track when value became null
+        if became_null_at is None and pd.isna(diff["new_val"]) and not pd.isna(diff["old_val"]):
+            became_null_at = diff["step_id"]
+            became_null_by = diff.get("operation", "unknown")
+    # Events should already be sorted by step_id from lineage method
     events.sort(key=lambda e: e.step_id)
     return ValueHistory(

tracepipe-0.3.2.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,308 @@
+Metadata-Version: 2.4
+Name: tracepipe
+Version: 0.3.2
+Summary: Row-level data lineage tracking for pandas pipelines
+Project-URL: Homepage, https://github.com/tracepipe/tracepipe
+Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
+Project-URL: Repository, https://github.com/tracepipe/tracepipe.git
+Project-URL: Issues, https://github.com/tracepipe/tracepipe/issues
+Project-URL: Changelog, https://tracepipe.github.io/tracepipe/changelog/
+Author: Gauthier Piarrette
+License: MIT License
+        Copyright (c) 2026 Gauthier Piarrette
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+License-File: LICENSE
+Keywords: data-engineering,data-lineage,data-quality,debugging,observability,pandas
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.9
+Requires-Dist: numpy>=1.20.0
+Requires-Dist: pandas>=1.5.0
+Provides-Extra: all
+Requires-Dist: psutil>=5.9.0; extra == 'all'
+Requires-Dist: pyarrow>=10.0.0; extra == 'all'
+Provides-Extra: arrow
+Requires-Dist: pyarrow>=10.0.0; extra == 'arrow'
+Provides-Extra: dev
+Requires-Dist: black>=23.0.0; extra == 'dev'
+Requires-Dist: pre-commit>=3.5.0; extra == 'dev'
+Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
+Requires-Dist: pytest>=7.0.0; extra == 'dev'
+Requires-Dist: ruff>=0.1.0; extra == 'dev'
+Requires-Dist: taskipy>=1.12.0; extra == 'dev'
+Provides-Extra: docs
+Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
+Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
+Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
+Requires-Dist: pymdown-extensions>=10.0.0; extra == 'docs'
+Provides-Extra: memory
+Requires-Dist: psutil>=5.9.0; extra == 'memory'
+Description-Content-Type: text/markdown
+<div align="center">
+# TracePipe
+### Row-level data lineage for pandas pipelines
+**Know exactly where every row went, why values changed, and how your data transformed.**
+[![PyPI version](https://img.shields.io/pypi/v/tracepipe.svg)](https://pypi.org/project/tracepipe/)
+[![Python 3.9+](https://img.shields.io/pypi/pyversions/tracepipe.svg)](https://pypi.org/project/tracepipe/)
+[![CI](https://github.com/gauthierpiarrette/tracepipe/actions/workflows/ci.yml/badge.svg)](https://github.com/gauthierpiarrette/tracepipe/actions/workflows/ci.yml)
+[![codecov](https://codecov.io/gh/gauthierpiarrette/tracepipe/branch/main/graph/badge.svg)](https://codecov.io/gh/gauthierpiarrette/tracepipe)
+[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
+[![Docs](https://img.shields.io/badge/docs-mkdocs-blue.svg)](https://gauthierpiarrette.github.io/tracepipe/)
+[Getting Started](#getting-started) · [Documentation](https://gauthierpiarrette.github.io/tracepipe/) · [Examples](#real-world-example)
+</div>
+---
+## Why TracePipe?
+Data pipelines are black boxes. Rows vanish. Values change. You're left guessing.
+```python
+df = pd.read_csv("customers.csv")
+df = df.dropna()                      # Some rows disappear
+df = df.merge(regions, on="zip")      # New rows appear, some vanish
+df["income"] = df["income"].fillna(0) # Values change silently
+df = df[df["age"] >= 18]              # More rows gone
+# What happened to customer C-789? 🤷
+```
+**TracePipe gives you the complete audit trail — zero code changes required.**
+---
+## Getting Started
+```bash
+pip install tracepipe
+```
+```python
+import tracepipe as tp
+import pandas as pd
+tp.enable(mode="debug", watch=["income"])
+df = pd.read_csv("customers.csv")
+df = df.dropna()
+df["income"] = df["income"].fillna(0)
+df = df[df["age"] >= 18]
+tp.check(df)  # See what happened
+```
+```
+TracePipe Check: [OK] Pipeline healthy
+Retention: 847/1000 (84.7%)
+Dropped: 153 rows
+  • DataFrame.dropna: 42
+  • DataFrame.__getitem__[mask]: 111
+Value changes: 23 cells modified
+  • DataFrame.fillna: 23 (income)
+```
+That's it. **One import, full visibility.**
+---
+## Core API
+| Function | What it does |
+|----------|--------------|
+| `tp.enable()` | Start tracking |
+| `tp.check(df)` | Health check — retention, drops, changes |
+| `tp.trace(df, where={"id": "C-789"})` | Follow a row's complete journey |
+| `tp.why(df, col="income", row=5)` | Explain why a cell has its current value |
+| `tp.report(df, "audit.html")` | Export interactive HTML report |
+---
+## Key Features
+<table>
+<tr>
+<td width="50%">
+### 🔍 Zero-Code Instrumentation
+TracePipe patches pandas at runtime. Your existing code works unchanged.
+### 📊 Complete Provenance
+Track drops, transforms, merges, and cell-level changes with before/after values.
+</td>
+<td width="50%">
+### 🎯 Business-Key Lookups
+Find rows by their values: `tp.trace(df, where={"email": "alice@example.com"})`
+### ⚡ Production-Ready
+1.0-2.8x overhead (varies by operation). Tested on DataFrames up to 1M rows.
+</td>
+</tr>
+</table>
+---
+## Real-World Example
+```python
+import tracepipe as tp
+import pandas as pd
+tp.enable(mode="debug", watch=["age", "income", "label"])
+# Load and clean
+df = pd.read_csv("training_data.csv")
+df = df.dropna(subset=["label"])
+df["income"] = df["income"].fillna(df["income"].median())
+df = df[df["age"] >= 18]
+# Audit
+print(tp.check(df))
+```
+```
+Retention: 8234/10000 (82.3%)
+Dropped: 1766 rows
+  • DataFrame.dropna: 423
+  • DataFrame.__getitem__[mask]: 1343
+Value changes: 892 cells
+  • DataFrame.fillna: 892 (income)
+```
+```python
+# Why does this customer have a filled income?
+tp.why(df, col="income", where={"customer_id": "C-789"})
+```
+```
+Cell History: row 156, column 'income'
+  Current value: 45000.0
+  [i] Was null at step 1 (later recovered)
+  History (1 change):
+    None -> 45000.0
+      by: DataFrame.fillna
+```
+---
+## Two Modes
+| Mode | Use Case | What's Tracked |
+|------|----------|----------------|
+| **CI** (default) | Production pipelines | Step counts, retention rates, merge warnings |
+| **Debug** | Development | Full row history, cell diffs, merge parents, group membership |
+```python
+tp.enable(mode="ci")     # Lightweight
+tp.enable(mode="debug")  # Full lineage
+```
+---
+## What's Tracked
+| Operation | Coverage |
+|-----------|----------|
+| `dropna`, `drop_duplicates`, `query`, `df[mask]` | ✅ Full |
+| `fillna`, `replace`, `loc[]=`, `iloc[]=` | ✅ Full (cell diffs) |
+| `merge`, `join` | ✅ Full (parent tracking) |
+| `groupby().agg()` | ✅ Full (group membership) |
+| `sort_values`, `head`, `tail`, `sample` | ✅ Full |
+| `apply`, `pipe` | ⚠️ Partial |
+---
+## Data Quality Contracts
+```python
+(tp.contract()
+    .expect_unique("customer_id")
+    .expect_no_nulls("email")
+    .expect_retention(min_rate=0.9)
+    .check(df)
+    .raise_if_failed())
+```
+---
+## Documentation
+📚 **[Full Documentation](https://gauthierpiarrette.github.io/tracepipe/)**
+- [Quickstart](https://gauthierpiarrette.github.io/tracepipe/getting-started/quickstart/)
+- [User Guide](https://gauthierpiarrette.github.io/tracepipe/guide/concepts/)
+- [API Reference](https://gauthierpiarrette.github.io/tracepipe/api/)
+- [Examples](https://gauthierpiarrette.github.io/tracepipe/examples/ml-pipeline/)
+---
+## Contributing
+```bash
+git clone https://github.com/gauthierpiarrette/tracepipe.git
+cd tracepipe
+pip install -e ".[dev]"
+pytest tests/ -v
+```
+See [CONTRIBUTING](https://gauthierpiarrette.github.io/tracepipe/contributing/) for guidelines.
+---
+## License
+MIT License. See [LICENSE](LICENSE).
+---
+<div align="center">
+**Stop guessing where your rows went.**
+```bash
+pip install tracepipe
+```
+⭐ Star us on GitHub if TracePipe helps your data work!
+</div>

{tracepipe-0.3.0.dist-info → tracepipe-0.3.2.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
-tracepipe/__init__.py,sha256=ZO6-yKMpguohwQLSRovuJoakb7kN1ZveSBwlGwhC-ho,3342
-tracepipe/api.py,sha256=KFO0NYRaGqRevbNyFSCFK4ryhFwdixFtUnTeNabwb6o,11862
+tracepipe/__init__.py,sha256=MuwxV2mU4XxHqab62vQxaDAlhMvRCgUCmr_YU9R16ss,3342
+tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
 tracepipe/context.py,sha256=_povLpqa5wd_ESHt5hbSmWTSMTF3nUfeutEQo4RMK2E,3856
 tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
-tracepipe/convenience.py,sha256=9F4rLx7AGWwNPKhuJMZD-6PG-QiZq0_mzfmnoU28x6U,26036
+tracepipe/convenience.py,sha256=SZGcSOKPjAeJ9udPP_Fa_zTZY5GeDX61W6uftMwafjc,26563
 tracepipe/core.py,sha256=kAXks694rR0Z4tD7Gyty0TyJGWx2whsSdteYYpHuazo,8010
 tracepipe/debug.py,sha256=6t2GKVZLwn7SJLhrStE9qsmTiVIHATTE3jJPQ2DYtnc,10140
 tracepipe/safety.py,sha256=jTBZv4QGDJfnZETsSZeMKbdOUtGXk-_XkmllhnGWM-M,5537
 tracepipe/snapshot.py,sha256=OLREzE1_LkWITluG_Bqeb7Y4pAKb8Lb3zJEF3cxnloU,13967
-tracepipe/value_provenance.py,sha256=cCNDvMduYiFkTzfam5EpBNZI54RL4OtMLP6xNaM00ec,9092
+tracepipe/value_provenance.py,sha256=ogky6aOaZ-6K2uNBQxlXpmCeuvK434Hisj30zesRTd8,9330
 tracepipe/instrumentation/__init__.py,sha256=pd0n6Z9m_V3gcBv097cXWFOZEzAP9sAq1jjQnNRrDZ8,222
 tracepipe/instrumentation/apply_capture.py,sha256=cMThWzNXqWQENuMrCGTne1hO6fqaQFV7zJYNpsPTW4w,14463
 tracepipe/instrumentation/filter_capture.py,sha256=onlYLU5bBZSM3WmxM2AFHfktnlx7ReG-brEn5eZ_N10,15830
@@ -17,13 +17,13 @@ tracepipe/instrumentation/pandas_inst.py,sha256=2YSoju9ml2PjLOYzsx8MHH1iqhjgnXHb
 tracepipe/instrumentation/series_capture.py,sha256=N1Cf-pQDh23qQLLd8DNsxbcaD-91sTJkRd5AnccKZGE,10649
 tracepipe/storage/__init__.py,sha256=pGFMfbIgIi2kofVPwYDqe2HTYMYJoabiGjTq77pYi-g,348
 tracepipe/storage/base.py,sha256=7DV_-rp37DjBMr9B1w85hLVYhC8OQShk2PcEhT-n4tE,4894
-tracepipe/storage/lineage_store.py,sha256=KPN-OZOgkZeiIptodQst-Obp9krcuE7Erpc9NX53jKw,25148
+tracepipe/storage/lineage_store.py,sha256=swMMf59isoCQZHaezCmquA-0R5iGNH3eGWjc9d9LGmo,27392
 tracepipe/storage/row_identity.py,sha256=HBU0gTTJlFtFTcAdUCKuX-c9cHa0lo3CDIodDPDgOzA,17161
 tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,182
 tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
 tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
 tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
-tracepipe-0.3.0.dist-info/METADATA,sha256=oEiGG2V8ya2J3ZKYU_oAfLIqYrZdgwqBRaKup44U-Uw,15478
-tracepipe-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-tracepipe-0.3.0.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
-tracepipe-0.3.0.dist-info/RECORD,,
+tracepipe-0.3.2.dist-info/METADATA,sha256=ik5FLmADKLqj25TprTnJPi21SW4EJ88mBTG-aQ4p-gc,9152
+tracepipe-0.3.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+tracepipe-0.3.2.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
+tracepipe-0.3.2.dist-info/RECORD,,

tracepipe-0.3.0.dist-info/METADATA DELETED Viewed

@@ -1,575 +0,0 @@
-Metadata-Version: 2.4
-Name: tracepipe
-Version: 0.3.0
-Summary: Row-level data lineage tracking for pandas pipelines
-Project-URL: Homepage, https://github.com/tracepipe/tracepipe
-Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
-Project-URL: Repository, https://github.com/tracepipe/tracepipe.git
-Project-URL: Issues, https://github.com/tracepipe/tracepipe/issues
-Project-URL: Changelog, https://tracepipe.github.io/tracepipe/changelog/
-Author: Gauthier Piarrette
-License: MIT License
-        Copyright (c) 2026 Gauthier Piarrette
-        Permission is hereby granted, free of charge, to any person obtaining a copy
-        of this software and associated documentation files (the "Software"), to deal
-        in the Software without restriction, including without limitation the rights
-        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-        copies of the Software, and to permit persons to whom the Software is
-        furnished to do so, subject to the following conditions:
-        The above copyright notice and this permission notice shall be included in all
-        copies or substantial portions of the Software.
-        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-        SOFTWARE.
-License-File: LICENSE
-Keywords: data-engineering,data-lineage,data-quality,debugging,observability,pandas
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Topic :: Scientific/Engineering
-Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Requires-Python: >=3.9
-Requires-Dist: numpy>=1.20.0
-Requires-Dist: pandas>=1.5.0
-Provides-Extra: all
-Requires-Dist: psutil>=5.9.0; extra == 'all'
-Requires-Dist: pyarrow>=10.0.0; extra == 'all'
-Provides-Extra: arrow
-Requires-Dist: pyarrow>=10.0.0; extra == 'arrow'
-Provides-Extra: dev
-Requires-Dist: black>=23.0.0; extra == 'dev'
-Requires-Dist: pre-commit>=3.5.0; extra == 'dev'
-Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
-Requires-Dist: pytest>=7.0.0; extra == 'dev'
-Requires-Dist: ruff>=0.1.0; extra == 'dev'
-Requires-Dist: taskipy>=1.12.0; extra == 'dev'
-Provides-Extra: docs
-Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
-Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
-Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
-Requires-Dist: pymdown-extensions>=10.0.0; extra == 'docs'
-Provides-Extra: memory
-Requires-Dist: psutil>=5.9.0; extra == 'memory'
-Description-Content-Type: text/markdown
-# TracePipe
-**Row-level data lineage for pandas pipelines.**
-TracePipe automatically tracks what happens to every row and cell in your DataFrame — drops, transformations, merges, and value changes. Zero code changes required.
-[![PyPI version](https://img.shields.io/pypi/v/tracepipe.svg)](https://pypi.org/project/tracepipe/)
-[![Python 3.9+](https://img.shields.io/pypi/pyversions/tracepipe.svg)](https://pypi.org/project/tracepipe/)
-[![CI](https://github.com/gauthierpiarrette/tracepipe/actions/workflows/ci.yml/badge.svg)](https://github.com/gauthierpiarrette/tracepipe/actions/workflows/ci.yml)
-[![codecov](https://codecov.io/gh/gauthierpiarrette/tracepipe/branch/main/graph/badge.svg)](https://codecov.io/gh/gauthierpiarrette/tracepipe)
-[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
-[![Documentation](https://img.shields.io/badge/docs-mkdocs-blue.svg)](https://gauthierpiarrette.github.io/tracepipe/)
----
-## The Problem
-Data pipelines are black boxes. When something goes wrong, you're left asking:
-- **"Where did row X go?"** — Dropped somewhere, but which step?
-- **"Why is this value wrong?"** — It was fine in the source, what changed it?
-- **"How did these rows get merged?"** — Which parent records combined?
-- **"Why are there nulls here?"** — When did they appear?
-```python
-df = pd.read_csv("customers.csv")
-df = df.dropna()                          # Some rows disappear
-df = df.merge(regions, on="zip")          # New rows appear, some vanish
-df["income"] = df["income"].fillna(0)     # Values change silently
-df = df[df["age"] >= 18]                  # More rows gone
-# What actually happened to customer C-789?
-```
-Traditional debugging means `print()` statements, manual diffs, and guesswork. **TracePipe gives you the complete audit trail.**
----
-## The Solution
-```python
-import tracepipe as tp
-import pandas as pd
-tp.enable(mode="debug", watch=["income", "score"])
-df = pd.read_csv("customers.csv")
-df = df.dropna(subset=["customer_id"])
-df["income"] = df["income"].fillna(0)
-df = df.merge(segments, on="customer_id")
-df = df[df["age"] >= 18]
-# Pipeline health check
-print(tp.check(df))
-```
-```
-TracePipe Check: [OK] Pipeline healthy
-  Mode: debug
-Retention: 847/1000 (84.7%)
-Dropped: 153 rows
-  • DataFrame.dropna: 42
-  • DataFrame.__getitem__[mask]: 111
-Value changes: 23 cells modified
-  • DataFrame.fillna: 23 (income)
-```
-```python
-# Why did this customer's income change?
-print(tp.why(df, col="income", where={"customer_id": "C-789"}))
-```
-```
-Cell History: row 42, column 'income'
-  Current value: 0.0
-  [i] Was null at step 1 (later recovered)
-      by: DataFrame.fillna
-  History (1 change):
-    None -> 0.0
-      by: DataFrame.fillna
-```
-**One import. Complete audit trail.**
----
-## Installation
-```bash
-pip install tracepipe
-```
----
-## Quick Start
-### 1. Enable tracking
-```python
-import tracepipe as tp
-tp.enable(mode="debug", watch=["price", "quantity"])  # Track specific columns
-```
-### 2. Run your pipeline normally
-```python
-df = pd.DataFrame({
-    "product": ["A", "B", "C", "D"],
-    "price": [10.0, None, 30.0, 40.0],
-    "quantity": [5, 10, 0, 8]
-})
-df = df.dropna()                    # Drops row B
-df = df[df["quantity"] > 0]         # Drops row C
-df["total"] = df["price"] * df["quantity"]
-```
-### 3. Inspect the lineage
-```python
-# Health check - see drops AND changes
-print(tp.check(df))
-```
-```
-TracePipe Check: [OK] Pipeline healthy
-  Mode: debug
-Retention: 2/4 (50.0%)
-Dropped: 2 rows
-  • DataFrame.dropna: 1
-  • DataFrame.__getitem__[mask]: 1
-Value changes: 2 cells
-  • DataFrame.__setitem__[total]: 2
-```
-```python
-# Trace a specific row's full journey
-print(tp.trace(df, where={"product": "A"}))
-```
-```
-Row 0 Journey:
-  Status: [OK] Alive
-  Events: 1
-    [MODIFIED] DataFrame.__setitem__[total]: total
-```
-```python
-# Explain why a specific cell has its current value
-print(tp.why(df, col="total", row=0))
-```
-```
-Cell History: row 0, column 'total'
-  Current value: 50.0
-  History (1 change):
-    None -> 50.0
-      by: DataFrame.__setitem__[total]
-```
----
-## Key Features
-### 🔍 Zero-Code Instrumentation
-TracePipe monkey-patches pandas at runtime. Your existing code works unchanged:
-```python
-tp.enable()
-# Your existing pipeline runs exactly as before
-# TracePipe silently records everything
-tp.disable()
-```
-### 📊 Rich Provenance Data
-Track everything that happens in your pipeline:
-| Question | Answer |
-|----------|--------|
-| Which rows were dropped? | `tp.check(df)` shows retention by operation |
-| Why did this value change? | `tp.why(df, col="amount", row=5)` shows before/after |
-| What's this row's history? | `tp.trace(df, row=0)` shows full journey |
-| Where did these rows merge from? | Merge parent tracking in debug mode |
-| Which rows grouped together? | `tp.debug.inspect().explain_group("A")` |
-| When did nulls appear? | `tp.why()` flags null introduction |
-### 🎯 Business-Key Lookups
-Find rows by their values, not internal IDs:
-```python
-# Find by business key
-tp.trace(df, where={"customer_id": "C-12345"})
-tp.trace(df, where={"email": "alice@example.com"})
-# Find rows where a column is null
-tp.why(df, col="email", where={"email": None})
-```
-### 📈 Production-Ready Performance
-| Operation | Overhead | Notes |
-|-----------|----------|-------|
-| Filter (dropna, query) | 1.4-1.9x | Acceptable |
-| Transform (fillna, replace) | 1.0-1.2x | Minimal |
-| GroupBy | 1.0-1.2x | Minimal |
-| Sort | 1.4x | Optimized |
-| Scalar access (at/iat) | <1ms added | Fixed overhead |
-Tested on DataFrames up to 1M rows with linear scaling.
-### 🔒 Safety First
-TracePipe never modifies your data or affects computation results:
-```python
-# Original pandas method ALWAYS runs first
-# Lineage capture happens after, and failures are non-fatal
-result = df.dropna()  # Guaranteed to work, even if tracking fails
-```
----
-## Two Modes
-### CI Mode (Default)
-Lightweight tracking for production pipelines:
-- Step counts and retention rates
-- Dropped row detection
-- Merge mismatch warnings
-- **No per-row provenance** (fast)
-```python
-tp.enable(mode="ci")
-```
-### Debug Mode
-Full lineage for development and debugging:
-- Complete row-level history
-- Cell change tracking with before/after values
-- GroupBy membership
-- Merge parent tracking
-```python
-tp.enable(mode="debug", watch=["price", "amount"])
-```
----
-## API Reference
-### Core Functions (5)
-| Function | Purpose |
-|----------|---------|
-| `tp.enable(mode, watch)` | Start tracking |
-| `tp.check(df)` | Health check with retention stats |
-| `tp.trace(df, row, where)` | Trace a row's journey |
-| `tp.why(df, col, row, where)` | Explain why a cell changed |
-| `tp.report(df, path)` | Export HTML report |
-### Control Functions
-| Function | Purpose |
-|----------|---------|
-| `tp.disable()` | Stop tracking |
-| `tp.reset()` | Clear all lineage data |
-| `tp.stage(name)` | Label pipeline stages |
-### Debug Namespace
-For power users who need raw access:
-```python
-dbg = tp.debug.inspect()
-dbg.steps              # All recorded operations
-dbg.dropped_rows()     # Set of dropped row IDs
-dbg.explain_row(42)    # Raw lineage for row 42
-dbg.stats()            # Memory and tracking stats
-dbg.export("json", "lineage.json")
-```
----
-## Data Quality Contracts
-Validate your pipeline with fluent assertions:
-```python
-result = (tp.contract()
-    .expect_unique("customer_id")
-    .expect_no_nulls("email")
-    .expect_retention(min_rate=0.9)
-    .check(df))
-result.raise_if_failed()  # Raises if any contract violated
-```
----
-## Snapshots & Diff
-Compare DataFrame states:
-```python
-before = tp.snapshot(df)
-# ... transformations ...
-after = tp.snapshot(df)
-diff = tp.diff(before, after)
-print(f"Rows added: {diff.rows_added}")
-print(f"Rows removed: {diff.rows_removed}")
-print(f"Cells changed: {diff.cells_changed}")
-```
----
-## HTML Reports
-Generate interactive lineage reports:
-```python
-tp.report(df, "pipeline_audit.html")
-```
-Opens a visual dashboard showing:
-- Pipeline flow diagram
-- Retention funnel
-- Dropped rows by operation
-- Cell change history
----
-## What's Tracked
-| Operation | Tracking | Completeness |
-|-----------|----------|--------------|
-| `dropna`, `drop_duplicates` | Dropped row IDs | FULL |
-| `query`, `df[mask]` | Dropped row IDs | FULL |
-| `head`, `tail`, `sample` | Dropped row IDs | FULL |
-| `fillna`, `replace` | Cell diffs (watched cols) | FULL |
-| `loc[]=`, `iloc[]=`, `at[]=` | Cell diffs | FULL |
-| `merge`, `join` | Parent tracking | FULL |
-| `groupby().agg()` | Group membership | FULL |
-| `sort_values` | Reorder tracking | FULL |
-| `apply`, `pipe` | Output tracked | PARTIAL |
----
-## Limitations
-TracePipe tracks pandas operations, not arbitrary Python code:
-| Limitation | Workaround |
-|------------|------------|
-| Direct NumPy array modification | Use pandas methods |
-| Mutable objects in cells (lists, dicts) | Use immutable types |
-| Custom C extensions | Wrap with pandas operations |
----
-## Example: ML Pipeline Audit
-```python
-import tracepipe as tp
-import pandas as pd
-import numpy as np
-tp.enable(mode="debug", watch=["age", "income", "label"])
-# Load and clean
-df = pd.read_csv("training_data.csv")
-df = df.dropna(subset=["label"])
-df["income"] = df["income"].fillna(df["income"].median())
-df = df[df["age"] >= 18]
-# Feature engineering
-df["age_bucket"] = pd.cut(df["age"], bins=[18, 30, 50, 100])
-df["log_income"] = np.log1p(df["income"])
-# Audit the pipeline
-print(tp.check(df))
-```
-```
-TracePipe Check: [OK] Pipeline healthy
-  Mode: debug
-Retention: 8234/10000 (82.3%)
-Dropped: 1766 rows
-  • DataFrame.dropna: 423
-  • DataFrame.__getitem__[mask]: 1343
-Value changes: 892 cells
-  • DataFrame.fillna: 892 (income)
-```
-```python
-# Why did this customer's income change?
-print(tp.why(df, col="income", where={"customer_id": "C-789"}))
-```
-```
-Cell History: row 156, column 'income'
-  Current value: 45000.0
-  [i] Was null at step 1 (later recovered)
-      by: DataFrame.fillna
-  History (1 change):
-    None -> 45000.0
-      by: DataFrame.fillna
-```
-```python
-# Full journey of a specific row
-print(tp.trace(df, where={"customer_id": "C-789"}))
-```
-```
-Row 156 Journey:
-  Status: [OK] Alive
-  Events: 3
-    [MODIFIED] DataFrame.fillna: income
-    [MODIFIED] pd.cut: age_bucket
-    [MODIFIED] DataFrame.__setitem__[log_income]: log_income
-```
----
-## Benchmarks
-Run on MacBook Pro M1, pandas 2.0, Python 3.11:
-### Overhead (10K rows, median of 10 runs)
-| Operation | Baseline | With TracePipe | Overhead |
-|-----------|----------|----------------|----------|
-| dropna | 0.9ms | 1.7ms | 1.9x |
-| query | 2.1ms | 3.0ms | 1.4x |
-| fillna | 0.4ms | 0.4ms | 1.0x |
-| groupby.sum | 1.2ms | 1.2ms | 1.0x |
-| merge | 4.5ms | 12.6ms | 2.8x |
-| sort_values | 1.1ms | 1.5ms | 1.4x |
-### Scale (filter + dropna pipeline)
-| Rows | Time | Throughput |
-|------|------|------------|
-| 10K | 5ms | 2M rows/sec |
-| 100K | 35ms | 2.8M rows/sec |
-| 1M | 320ms | 3.1M rows/sec |
-### Memory
-- Base overhead: ~40 bytes per tracked diff
-- Typical pipeline: 2-3x memory vs baseline
-- Spillover to disk available for large pipelines
----
-## Documentation
-📚 **[Full Documentation](https://gauthierpiarrette.github.io/tracepipe/)**
-- [Getting Started](https://gauthierpiarrette.github.io/tracepipe/getting-started/quickstart/)
-- [User Guide](https://gauthierpiarrette.github.io/tracepipe/guide/concepts/)
-- [API Reference](https://gauthierpiarrette.github.io/tracepipe/api/)
-- [Examples](https://gauthierpiarrette.github.io/tracepipe/examples/ml-pipeline/)
----
-## Contributing
-```bash
-git clone https://github.com/gauthierpiarrette/tracepipe.git
-cd tracepipe
-pip install -e ".[dev]"
-# Run tests
-pytest tests/ -v
-# Run linting
-ruff check tracepipe/ tests/
-# Run benchmarks
-python benchmarks/run_all.py
-```
-See [CONTRIBUTING](https://gauthierpiarrette.github.io/tracepipe/contributing/) for detailed guidelines.
----
-## License
-MIT License. See [LICENSE](LICENSE) for details.
----
-<p align="center">
-  <b>Stop guessing where your rows went.</b><br>
-  <code>pip install tracepipe</code>
-</p>

{tracepipe-0.3.0.dist-info → tracepipe-0.3.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{tracepipe-0.3.0.dist-info → tracepipe-0.3.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

tracepipe 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

tracepipe 0.3.0py3-none-any.whl → 0.3.2py3-none-any.whl