PyPI - tracepipe - Versions diffs - 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl - Mend

tracepipe 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

tracepipe/__init__.py CHANGED Viewed

@@ -81,7 +81,7 @@ from .core import TracePipeConfig, TracePipeMode
 from .snapshot import DiffResult, Snapshot, diff, snapshot
 # === VERSION ===
-__version__ = "0.3.2"
+__version__ = "0.3.4"
 # === MINIMAL __all__ ===
 __all__ = [

tracepipe/convenience.py CHANGED Viewed

@@ -361,8 +361,8 @@ def check(
     facts["rows_dropped"] = len(dropped)
     facts["total_steps"] = len(ctx.store.steps)
-    # Merge statistics
-    merge_stats_list = ctx.store.get_merge_stats() if hasattr(ctx.store, "get_merge_stats") else []
+    # Merge statistics - filter to df's lineage to avoid cross-contamination
+    merge_stats_list = _get_merge_stats_for_df(df, ctx)
     for i, (step_id, stats) in enumerate(merge_stats_list):
         facts[f"merge_{i}_expansion"] = stats.expansion_ratio
@@ -658,6 +658,53 @@ def find(
 # ============ HELPERS ============
+def _get_merge_stats_for_df(df: pd.DataFrame, ctx) -> list[tuple[int, Any]]:
+    """
+    Get merge stats relevant to df's lineage only.
+    This prevents cross-contamination where check(df) would show warnings
+    from merges that produced OTHER DataFrames in the same session.
+    """
+    if not hasattr(ctx.store, "get_merge_stats"):
+        return []
+    all_stats = ctx.store.get_merge_stats()
+    if not all_stats:
+        return []
+    # Get row IDs from df
+    rids = ctx.row_manager.get_ids_array(df)
+    if rids is None:
+        return []
+    # Find which merge steps produced rows in df
+    relevant_step_ids = set()
+    # Check merge mappings to find which merges produced df's rows
+    if hasattr(ctx.store, "merge_mappings"):
+        for mapping in ctx.store.merge_mappings:
+            # Check if any of df's row IDs are in this merge's output
+            for rid in rids:
+                # Binary search in sorted out_rids
+                i = np.searchsorted(mapping.out_rids, rid)
+                if i < len(mapping.out_rids) and mapping.out_rids[i] == rid:
+                    relevant_step_ids.add(mapping.step_id)
+                    break  # Found at least one match, this merge is relevant
+    # If no merge mappings found, fall back to checking if df was just merged
+    # by seeing if it has more columns than typical (heuristic)
+    if not relevant_step_ids and all_stats:
+        # Fallback: return only the most recent merge that could have produced df
+        # This handles the case where merge_mappings aren't available
+        for step_id, stats in reversed(all_stats):
+            if stats.result_rows == len(df):
+                relevant_step_ids.add(step_id)
+                break
+    # Filter stats to relevant merges only
+    return [(sid, stats) for sid, stats in all_stats if sid in relevant_step_ids]
 def _json_safe(val: Any) -> Any:
     """Convert value to JSON-serializable form."""
     if pd.isna(val):

tracepipe/instrumentation/series_capture.py CHANGED Viewed

@@ -116,6 +116,10 @@ def wrap_series_assignment():
     """
     Wrap DataFrame.__setitem__ to capture diffs when assigning Series.
+    Note: For watched columns, _wrap_setitem (pandas_inst.py) already captures
+    the assignment. This wrapper only captures for NON-watched columns when
+    a TrackedSeries is assigned, to avoid double-logging.
     Handles:
     - df['col'] = series  (where series may have been modified)
     - df['col'] = scalar  (broadcast assignment)
@@ -127,28 +131,33 @@ def wrap_series_assignment():
     def tracked_setitem(self, key, value):
         ctx = get_context()
-        # Capture before state for watched columns
+        # For watched columns, _wrap_setitem already captures - skip to avoid double-logging
+        # We only capture here for NON-watched columns when a TrackedSeries is involved
+        should_capture_here = False
         before_values = None
         if (
             ctx.enabled
             and isinstance(key, str)
-            and key in ctx.watched_columns
             and key in self.columns
+            and key not in ctx.watched_columns  # Only capture NON-watched columns here
+            and isinstance(value, TrackedSeries)  # Only for TrackedSeries assignments
         ):
             rids = ctx.row_manager.get_ids_array(self)
             if rids is not None:
+                should_capture_here = True
                 before_values = {
                     "rids": rids.copy(),
                     "values": self[key].values.copy(),
                 }
-        # Always run original
+        # Always run original (which may be _wrap_setitem's wrapper)
         original_setitem(self, key, value)
         if not ctx.enabled:
             return
-        if before_values is None:
+        if not should_capture_here or before_values is None:
             return
         try:

tracepipe/storage/lineage_store.py CHANGED Viewed

@@ -32,6 +32,22 @@ from ..core import (
 from ..utils.value_capture import capture_typed_value
+def _stable_repr(val) -> str:
+    """Create a stable string representation for deduplication.
+    Handles NaN, None, and other values that don't compare equal to themselves.
+    """
+    if val is None:
+        return "None"
+    # Handle NaN (which doesn't equal itself)
+    try:
+        if isinstance(val, float) and val != val:  # NaN check
+            return "NaN"
+    except (TypeError, ValueError):
+        pass
+    return repr(val)
 class InMemoryLineageStore:
     """
     Columnar storage for lineage data using Structure of Arrays (SoA).
@@ -556,12 +572,15 @@ class InMemoryLineageStore:
         Follows merge lineage recursively to build complete cell provenance.
         This is essential for tracking changes that happened before merge operations.
+        Deduplicates events by (col, old_val, new_val, operation) signature to prevent
+        cross-pipeline contamination when multiple DataFrames share row IDs.
         Args:
             row_id: Row ID to trace
             max_depth: Maximum merge depth to follow (prevents infinite loops)
         Returns:
-            List of events in chronological order, including parent row events.
+            List of UNIQUE events in chronological order, including parent row events.
         """
         visited: set[int] = set()
@@ -589,7 +608,23 @@ class InMemoryLineageStore:
         # Sort by step_id to ensure chronological order across lineage
         all_events.sort(key=lambda e: e["step_id"])
-        return all_events
+        # Deduplicate by (col, old_val, new_val, operation) signature
+        # This prevents cross-pipeline contamination when multiple DataFrames
+        # share the same row IDs (e.g., df.copy() followed by parallel transforms)
+        seen_signatures: set[tuple] = set()
+        unique_events = []
+        for event in all_events:
+            sig = (
+                event.get("col"),
+                _stable_repr(event.get("old_val")),
+                _stable_repr(event.get("new_val")),
+                event.get("operation"),
+            )
+            if sig not in seen_signatures:
+                seen_signatures.add(sig)
+                unique_events.append(event)
+        return unique_events
     def get_cell_history_with_lineage(
         self, row_id: int, column: str, max_depth: int = 10

{tracepipe-0.3.2.dist-info → tracepipe-0.3.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tracepipe
-Version: 0.3.2
+Version: 0.3.4
 Summary: Row-level data lineage tracking for pandas pipelines
 Project-URL: Homepage, https://github.com/tracepipe/tracepipe
 Project-URL: Documentation, https://tracepipe.github.io/tracepipe/

{tracepipe-0.3.2.dist-info → tracepipe-0.3.4.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
-tracepipe/__init__.py,sha256=MuwxV2mU4XxHqab62vQxaDAlhMvRCgUCmr_YU9R16ss,3342
+tracepipe/__init__.py,sha256=1vKVGGc_fePrf1FNjP1R7-RPjtVnAc3Ori9QQl-E_4U,3342
 tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
 tracepipe/context.py,sha256=_povLpqa5wd_ESHt5hbSmWTSMTF3nUfeutEQo4RMK2E,3856
 tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
-tracepipe/convenience.py,sha256=SZGcSOKPjAeJ9udPP_Fa_zTZY5GeDX61W6uftMwafjc,26563
+tracepipe/convenience.py,sha256=KuDz_ZzNivVG1SS8Srr3plu4CTwFmNhYL4rk3vV6cbE,28421
 tracepipe/core.py,sha256=kAXks694rR0Z4tD7Gyty0TyJGWx2whsSdteYYpHuazo,8010
 tracepipe/debug.py,sha256=6t2GKVZLwn7SJLhrStE9qsmTiVIHATTE3jJPQ2DYtnc,10140
 tracepipe/safety.py,sha256=jTBZv4QGDJfnZETsSZeMKbdOUtGXk-_XkmllhnGWM-M,5537
@@ -14,16 +14,16 @@ tracepipe/instrumentation/filter_capture.py,sha256=onlYLU5bBZSM3WmxM2AFHfktnlx7R
 tracepipe/instrumentation/indexer_capture.py,sha256=1ATCeJ-uNA1uGiSbgnUx0wdVsIlZGHeUBaFJPXgFQNg,28440
 tracepipe/instrumentation/merge_capture.py,sha256=Eze-PTrn7IXxZRZBYX9R13mOY3diWKAkjp4z-wa1tEk,13349
 tracepipe/instrumentation/pandas_inst.py,sha256=2YSoju9ml2PjLOYzsx8MHH1iqhjgnXHbIidnF0JDpaY,29546
-tracepipe/instrumentation/series_capture.py,sha256=N1Cf-pQDh23qQLLd8DNsxbcaD-91sTJkRd5AnccKZGE,10649
+tracepipe/instrumentation/series_capture.py,sha256=i7FiA2ndEzS6duIj5y-a7SDfIMl2cCY_jGC1tmG7TGU,11271
 tracepipe/storage/__init__.py,sha256=pGFMfbIgIi2kofVPwYDqe2HTYMYJoabiGjTq77pYi-g,348
 tracepipe/storage/base.py,sha256=7DV_-rp37DjBMr9B1w85hLVYhC8OQShk2PcEhT-n4tE,4894
-tracepipe/storage/lineage_store.py,sha256=swMMf59isoCQZHaezCmquA-0R5iGNH3eGWjc9d9LGmo,27392
+tracepipe/storage/lineage_store.py,sha256=KhGri2uC_O_43fUivFGEHY6KBDHd1I0O_PPd_KD3L4M,28683
 tracepipe/storage/row_identity.py,sha256=HBU0gTTJlFtFTcAdUCKuX-c9cHa0lo3CDIodDPDgOzA,17161
 tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,182
 tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
 tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
 tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
-tracepipe-0.3.2.dist-info/METADATA,sha256=ik5FLmADKLqj25TprTnJPi21SW4EJ88mBTG-aQ4p-gc,9152
-tracepipe-0.3.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-tracepipe-0.3.2.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
-tracepipe-0.3.2.dist-info/RECORD,,
+tracepipe-0.3.4.dist-info/METADATA,sha256=DooQHiRi1HBiFK-QZPpE3PfLg43xE5Yg93kXWEdxhNY,9152
+tracepipe-0.3.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+tracepipe-0.3.4.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
+tracepipe-0.3.4.dist-info/RECORD,,

{tracepipe-0.3.2.dist-info → tracepipe-0.3.4.dist-info}/WHEEL RENAMED Viewed

File without changes

{tracepipe-0.3.2.dist-info → tracepipe-0.3.4.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

tracepipe 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

tracepipe 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl