tracepipe 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +1 -1
- tracepipe/convenience.py +49 -2
- tracepipe/instrumentation/series_capture.py +13 -4
- tracepipe/storage/lineage_store.py +37 -2
- {tracepipe-0.3.2.dist-info → tracepipe-0.3.4.dist-info}/METADATA +1 -1
- {tracepipe-0.3.2.dist-info → tracepipe-0.3.4.dist-info}/RECORD +8 -8
- {tracepipe-0.3.2.dist-info → tracepipe-0.3.4.dist-info}/WHEEL +0 -0
- {tracepipe-0.3.2.dist-info → tracepipe-0.3.4.dist-info}/licenses/LICENSE +0 -0
tracepipe/__init__.py
CHANGED
tracepipe/convenience.py
CHANGED
|
@@ -361,8 +361,8 @@ def check(
|
|
|
361
361
|
facts["rows_dropped"] = len(dropped)
|
|
362
362
|
facts["total_steps"] = len(ctx.store.steps)
|
|
363
363
|
|
|
364
|
-
# Merge statistics
|
|
365
|
-
merge_stats_list =
|
|
364
|
+
# Merge statistics - filter to df's lineage to avoid cross-contamination
|
|
365
|
+
merge_stats_list = _get_merge_stats_for_df(df, ctx)
|
|
366
366
|
|
|
367
367
|
for i, (step_id, stats) in enumerate(merge_stats_list):
|
|
368
368
|
facts[f"merge_{i}_expansion"] = stats.expansion_ratio
|
|
@@ -658,6 +658,53 @@ def find(
|
|
|
658
658
|
# ============ HELPERS ============
|
|
659
659
|
|
|
660
660
|
|
|
661
|
+
def _get_merge_stats_for_df(df: pd.DataFrame, ctx) -> list[tuple[int, Any]]:
    """
    Get merge stats relevant to df's lineage only.

    This prevents cross-contamination where check(df) would show warnings
    from merges that produced OTHER DataFrames in the same session.

    Args:
        df: DataFrame whose merge history is being inspected.
        ctx: Active context; ``ctx.store`` and ``ctx.row_manager`` are read.

    Returns:
        The ``(step_id, stats)`` pairs from ``ctx.store.get_merge_stats()``
        whose step is considered relevant to ``df``, in the store's order.
        Empty when the store exposes no merge stats, has recorded none, or
        the row manager has no row IDs for ``df``.
    """
    if not hasattr(ctx.store, "get_merge_stats"):
        return []

    all_stats = ctx.store.get_merge_stats()
    if not all_stats:
        return []

    # Row IDs currently attached to df; None means df is not tracked.
    rids = ctx.row_manager.get_ids_array(df)
    if rids is None:
        return []

    # A merge is relevant when at least one of df's row IDs is among the row
    # IDs it produced. The membership test is one vectorized call per mapping
    # (instead of one binary search per row ID) and does not depend on
    # out_rids being sorted.
    relevant_step_ids = {
        mapping.step_id
        for mapping in getattr(ctx.store, "merge_mappings", ())
        if np.isin(rids, mapping.out_rids).any()
    }

    if not relevant_step_ids:
        # Heuristic fallback for when no merge mapping matched (e.g. mappings
        # are not available): pick the most recent merge whose recorded result
        # row count equals df's row count.
        n_rows = len(df)
        for step_id, stats in reversed(all_stats):
            if stats.result_rows == n_rows:
                relevant_step_ids.add(step_id)
                break

    # Filter stats to relevant merges only, preserving the store's ordering.
    return [(sid, stats) for sid, stats in all_stats if sid in relevant_step_ids]
|
|
706
|
+
|
|
707
|
+
|
|
661
708
|
def _json_safe(val: Any) -> Any:
|
|
662
709
|
"""Convert value to JSON-serializable form."""
|
|
663
710
|
if pd.isna(val):
|
|
@@ -116,6 +116,10 @@ def wrap_series_assignment():
|
|
|
116
116
|
"""
|
|
117
117
|
Wrap DataFrame.__setitem__ to capture diffs when assigning Series.
|
|
118
118
|
|
|
119
|
+
Note: For watched columns, _wrap_setitem (pandas_inst.py) already captures
|
|
120
|
+
the assignment. This wrapper only captures for NON-watched columns when
|
|
121
|
+
a TrackedSeries is assigned, to avoid double-logging.
|
|
122
|
+
|
|
119
123
|
Handles:
|
|
120
124
|
- df['col'] = series (where series may have been modified)
|
|
121
125
|
- df['col'] = scalar (broadcast assignment)
|
|
@@ -127,28 +131,33 @@ def wrap_series_assignment():
|
|
|
127
131
|
def tracked_setitem(self, key, value):
|
|
128
132
|
ctx = get_context()
|
|
129
133
|
|
|
130
|
-
#
|
|
134
|
+
# For watched columns, _wrap_setitem already captures - skip to avoid double-logging
|
|
135
|
+
# We only capture here for NON-watched columns when a TrackedSeries is involved
|
|
136
|
+
should_capture_here = False
|
|
131
137
|
before_values = None
|
|
138
|
+
|
|
132
139
|
if (
|
|
133
140
|
ctx.enabled
|
|
134
141
|
and isinstance(key, str)
|
|
135
|
-
and key in ctx.watched_columns
|
|
136
142
|
and key in self.columns
|
|
143
|
+
and key not in ctx.watched_columns # Only capture NON-watched columns here
|
|
144
|
+
and isinstance(value, TrackedSeries) # Only for TrackedSeries assignments
|
|
137
145
|
):
|
|
138
146
|
rids = ctx.row_manager.get_ids_array(self)
|
|
139
147
|
if rids is not None:
|
|
148
|
+
should_capture_here = True
|
|
140
149
|
before_values = {
|
|
141
150
|
"rids": rids.copy(),
|
|
142
151
|
"values": self[key].values.copy(),
|
|
143
152
|
}
|
|
144
153
|
|
|
145
|
-
# Always run original
|
|
154
|
+
# Always run original (which may be _wrap_setitem's wrapper)
|
|
146
155
|
original_setitem(self, key, value)
|
|
147
156
|
|
|
148
157
|
if not ctx.enabled:
|
|
149
158
|
return
|
|
150
159
|
|
|
151
|
-
if before_values is None:
|
|
160
|
+
if not should_capture_here or before_values is None:
|
|
152
161
|
return
|
|
153
162
|
|
|
154
163
|
try:
|
|
@@ -32,6 +32,22 @@ from ..core import (
|
|
|
32
32
|
from ..utils.value_capture import capture_typed_value
|
|
33
33
|
|
|
34
34
|
|
|
35
|
+
def _stable_repr(val) -> str:
    """Return a string key for ``val`` that is safe to use for deduplication.

    ``None`` maps to ``"None"`` and any ``float`` NaN (which never compares
    equal to itself) maps to ``"NaN"``; every other value maps to its
    ``repr()``.
    """
    if val is None:
        return "None"

    try:
        # NaN is the only float that is not equal to itself.
        is_nan = bool(isinstance(val, float) and val != val)
    except (TypeError, ValueError):
        is_nan = False

    return "NaN" if is_nan else repr(val)
|
|
49
|
+
|
|
50
|
+
|
|
35
51
|
class InMemoryLineageStore:
|
|
36
52
|
"""
|
|
37
53
|
Columnar storage for lineage data using Structure of Arrays (SoA).
|
|
@@ -556,12 +572,15 @@ class InMemoryLineageStore:
|
|
|
556
572
|
Follows merge lineage recursively to build complete cell provenance.
|
|
557
573
|
This is essential for tracking changes that happened before merge operations.
|
|
558
574
|
|
|
575
|
+
Deduplicates events by (col, old_val, new_val, operation) signature to prevent
|
|
576
|
+
cross-pipeline contamination when multiple DataFrames share row IDs.
|
|
577
|
+
|
|
559
578
|
Args:
|
|
560
579
|
row_id: Row ID to trace
|
|
561
580
|
max_depth: Maximum merge depth to follow (prevents infinite loops)
|
|
562
581
|
|
|
563
582
|
Returns:
|
|
564
|
-
List of events in chronological order, including parent row events.
|
|
583
|
+
List of UNIQUE events in chronological order, including parent row events.
|
|
565
584
|
"""
|
|
566
585
|
visited: set[int] = set()
|
|
567
586
|
|
|
@@ -589,7 +608,23 @@ class InMemoryLineageStore:
|
|
|
589
608
|
# Sort by step_id to ensure chronological order across lineage
|
|
590
609
|
all_events.sort(key=lambda e: e["step_id"])
|
|
591
610
|
|
|
592
|
-
|
|
611
|
+
# Deduplicate by (col, old_val, new_val, operation) signature
|
|
612
|
+
# This prevents cross-pipeline contamination when multiple DataFrames
|
|
613
|
+
# share the same row IDs (e.g., df.copy() followed by parallel transforms)
|
|
614
|
+
seen_signatures: set[tuple] = set()
|
|
615
|
+
unique_events = []
|
|
616
|
+
for event in all_events:
|
|
617
|
+
sig = (
|
|
618
|
+
event.get("col"),
|
|
619
|
+
_stable_repr(event.get("old_val")),
|
|
620
|
+
_stable_repr(event.get("new_val")),
|
|
621
|
+
event.get("operation"),
|
|
622
|
+
)
|
|
623
|
+
if sig not in seen_signatures:
|
|
624
|
+
seen_signatures.add(sig)
|
|
625
|
+
unique_events.append(event)
|
|
626
|
+
|
|
627
|
+
return unique_events
|
|
593
628
|
|
|
594
629
|
def get_cell_history_with_lineage(
|
|
595
630
|
self, row_id: int, column: str, max_depth: int = 10
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tracepipe
|
|
3
|
-
Version: 0.3.2
|
|
3
|
+
Version: 0.3.4
|
|
4
4
|
Summary: Row-level data lineage tracking for pandas pipelines
|
|
5
5
|
Project-URL: Homepage, https://github.com/tracepipe/tracepipe
|
|
6
6
|
Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
tracepipe/__init__.py,sha256=
|
|
1
|
+
tracepipe/__init__.py,sha256=1vKVGGc_fePrf1FNjP1R7-RPjtVnAc3Ori9QQl-E_4U,3342
|
|
2
2
|
tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
|
|
3
3
|
tracepipe/context.py,sha256=_povLpqa5wd_ESHt5hbSmWTSMTF3nUfeutEQo4RMK2E,3856
|
|
4
4
|
tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
|
|
5
|
-
tracepipe/convenience.py,sha256=
|
|
5
|
+
tracepipe/convenience.py,sha256=KuDz_ZzNivVG1SS8Srr3plu4CTwFmNhYL4rk3vV6cbE,28421
|
|
6
6
|
tracepipe/core.py,sha256=kAXks694rR0Z4tD7Gyty0TyJGWx2whsSdteYYpHuazo,8010
|
|
7
7
|
tracepipe/debug.py,sha256=6t2GKVZLwn7SJLhrStE9qsmTiVIHATTE3jJPQ2DYtnc,10140
|
|
8
8
|
tracepipe/safety.py,sha256=jTBZv4QGDJfnZETsSZeMKbdOUtGXk-_XkmllhnGWM-M,5537
|
|
@@ -14,16 +14,16 @@ tracepipe/instrumentation/filter_capture.py,sha256=onlYLU5bBZSM3WmxM2AFHfktnlx7R
|
|
|
14
14
|
tracepipe/instrumentation/indexer_capture.py,sha256=1ATCeJ-uNA1uGiSbgnUx0wdVsIlZGHeUBaFJPXgFQNg,28440
|
|
15
15
|
tracepipe/instrumentation/merge_capture.py,sha256=Eze-PTrn7IXxZRZBYX9R13mOY3diWKAkjp4z-wa1tEk,13349
|
|
16
16
|
tracepipe/instrumentation/pandas_inst.py,sha256=2YSoju9ml2PjLOYzsx8MHH1iqhjgnXHbIidnF0JDpaY,29546
|
|
17
|
-
tracepipe/instrumentation/series_capture.py,sha256=
|
|
17
|
+
tracepipe/instrumentation/series_capture.py,sha256=i7FiA2ndEzS6duIj5y-a7SDfIMl2cCY_jGC1tmG7TGU,11271
|
|
18
18
|
tracepipe/storage/__init__.py,sha256=pGFMfbIgIi2kofVPwYDqe2HTYMYJoabiGjTq77pYi-g,348
|
|
19
19
|
tracepipe/storage/base.py,sha256=7DV_-rp37DjBMr9B1w85hLVYhC8OQShk2PcEhT-n4tE,4894
|
|
20
|
-
tracepipe/storage/lineage_store.py,sha256=
|
|
20
|
+
tracepipe/storage/lineage_store.py,sha256=KhGri2uC_O_43fUivFGEHY6KBDHd1I0O_PPd_KD3L4M,28683
|
|
21
21
|
tracepipe/storage/row_identity.py,sha256=HBU0gTTJlFtFTcAdUCKuX-c9cHa0lo3CDIodDPDgOzA,17161
|
|
22
22
|
tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,182
|
|
23
23
|
tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
|
|
24
24
|
tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
|
|
25
25
|
tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
|
|
26
|
-
tracepipe-0.3.
|
|
27
|
-
tracepipe-0.3.
|
|
28
|
-
tracepipe-0.3.
|
|
29
|
-
tracepipe-0.3.
|
|
26
|
+
tracepipe-0.3.4.dist-info/METADATA,sha256=DooQHiRi1HBiFK-QZPpE3PfLg43xE5Yg93kXWEdxhNY,9152
|
|
27
|
+
tracepipe-0.3.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
28
|
+
tracepipe-0.3.4.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
|
|
29
|
+
tracepipe-0.3.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|