tracepipe-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +110 -0
- tracepipe/api.py +563 -0
- tracepipe/context.py +98 -0
- tracepipe/core.py +122 -0
- tracepipe/instrumentation/__init__.py +6 -0
- tracepipe/instrumentation/pandas_inst.py +1024 -0
- tracepipe/safety.py +178 -0
- tracepipe/storage/__init__.py +13 -0
- tracepipe/storage/base.py +174 -0
- tracepipe/storage/lineage_store.py +556 -0
- tracepipe/storage/row_identity.py +217 -0
- tracepipe/utils/__init__.py +6 -0
- tracepipe/utils/value_capture.py +137 -0
- tracepipe/visualization/__init__.py +6 -0
- tracepipe/visualization/html_export.py +1335 -0
- tracepipe-0.2.0.dist-info/METADATA +508 -0
- tracepipe-0.2.0.dist-info/RECORD +19 -0
- tracepipe-0.2.0.dist-info/WHEEL +4 -0
- tracepipe-0.2.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,217 @@
# tracepipe/storage/row_identity.py
"""
Row identity tracking for pandas DataFrames.

Uses: Registry + Hidden Column fallback.
"""

import warnings
import weakref
from typing import Optional

import numpy as np
import pandas as pd

from ..core import TracePipeConfig

_TRACEPIPE_ROW_ID_COL = "__tracepipe_row_id__"


class PandasRowIdentity:
    """
    Hybrid row identity tracking for pandas DataFrames.

    Implements: RowIdentityStrategy protocol

    Handles:
    - Standard operations (filter, sort, copy)
    - reset_index(drop=True)
    - Duplicate indices (with warning)
    - Chained operations

    Future alternatives:
    - PolarsRowIdentity: Uses Polars row numbers and lazy evaluation
    - SparkRowIdentity: Uses monotonically_increasing_id() or RDD zipWithIndex
    """

    def __init__(self, config: TracePipeConfig):
        self.config = config
        self._registry: dict[int, pd.Series] = {}
        self._df_refs: weakref.WeakValueDictionary = weakref.WeakValueDictionary()
        self._next_row_id: int = 0

    def register(
        self,
        df: pd.DataFrame,
        row_ids: Optional[pd.Series] = None,
        warn_duplicate_index: bool = True,
    ) -> pd.Series:
        """
        Register a DataFrame and assign row IDs.

        Args:
            df: DataFrame to register
            row_ids: Optional pre-assigned IDs (for propagation)
            warn_duplicate_index: Warn if index has duplicates

        Returns:
            Series of row IDs aligned to df.index
        """
        # Check for duplicate index
        if warn_duplicate_index and self.config.warn_on_duplicate_index:
            if df.index.has_duplicates:
                warnings.warn(
                    "TracePipe: DataFrame has duplicate index values. "
                    "Row identity may be ambiguous for duplicates.",
                    UserWarning,
                )

        if row_ids is None:
            # Generate new sequential IDs
            new_ids = list(range(self._next_row_id, self._next_row_id + len(df)))
            self._next_row_id += len(df)
            row_ids = pd.Series(new_ids, index=df.index, dtype="int64")
        else:
            # Ensure alignment
            if not row_ids.index.equals(df.index):
                row_ids = row_ids.copy()
                row_ids.index = df.index

        obj_id = id(df)
        self._registry[obj_id] = row_ids
        self._df_refs[obj_id] = df

        # Optionally embed in DataFrame
        if self.config.use_hidden_column:
            df[_TRACEPIPE_ROW_ID_COL] = row_ids.values

        return row_ids

    def get_ids(self, df: pd.DataFrame) -> Optional[pd.Series]:
        """Get row IDs for a DataFrame."""
        # 1. Try registry (fast path)
        obj_id = id(df)
        if obj_id in self._registry:
            stored = self._registry[obj_id]
            # Verify alignment still valid
            if len(stored) == len(df) and stored.index.equals(df.index):
                return stored

        # 2. Try hidden column (fallback)
        if _TRACEPIPE_ROW_ID_COL in df.columns:
            row_ids = df[_TRACEPIPE_ROW_ID_COL].copy()
            row_ids.index = df.index
            # Re-register for future lookups
            self._registry[obj_id] = row_ids
            self._df_refs[obj_id] = df
            return row_ids

        # 3. Not tracked
        return None

    def propagate(self, source_df: pd.DataFrame, result_df: pd.DataFrame) -> Optional[pd.Series]:
        """
        Propagate row IDs from source to result DataFrame.

        Handles:
        - Filtering (fewer rows)
        - Reordering (same rows, different order)
        - Mixed operations
        """
        source_ids = self.get_ids(source_df)
        if source_ids is None:
            return None

        if result_df is source_df:
            return source_ids

        try:
            if result_df.index.equals(source_df.index):
                # Same index - direct copy
                result_ids = source_ids.copy()
            elif result_df.index.isin(source_df.index).all():
                # Result index is subset/reorder of source
                result_ids = source_ids.loc[result_df.index].copy()
            else:
                # Partial overlap or new indices
                result_ids = source_ids.reindex(result_df.index)
                # Rows not in source get new IDs
                new_mask = result_ids.isna()
                if new_mask.any():
                    new_count = new_mask.sum()
                    new_row_ids = list(range(self._next_row_id, self._next_row_id + new_count))
                    self._next_row_id += new_count
                    result_ids.loc[new_mask] = new_row_ids
                result_ids = result_ids.astype("int64")
        except Exception:
            # Fallback: positional alignment
            if len(result_df) <= len(source_df):
                result_ids = pd.Series(
                    source_ids.values[: len(result_df)], index=result_df.index, dtype="int64"
                )
            else:
                # Result is larger - assign new IDs to extras
                base_ids = list(source_ids.values)
                extra_count = len(result_df) - len(source_df)
                extra_ids = list(range(self._next_row_id, self._next_row_id + extra_count))
                self._next_row_id += extra_count
                result_ids = pd.Series(base_ids + extra_ids, index=result_df.index, dtype="int64")

        return self.register(result_df, result_ids, warn_duplicate_index=False)

    def realign_for_reset_index(
        self, original_df: pd.DataFrame, new_df: pd.DataFrame
    ) -> Optional[pd.Series]:
        """Handle reset_index(drop=True) which changes index."""
        old_ids = self.get_ids(original_df)
        if old_ids is None:
            return None

        # Same values, new index
        new_ids = pd.Series(old_ids.values, index=new_df.index, dtype="int64")
        return self.register(new_df, new_ids, warn_duplicate_index=False)

    def get_dropped_ids(self, source_df: pd.DataFrame, result_df: pd.DataFrame) -> np.ndarray:
        """
        Get row IDs that were dropped between source and result.

        Uses numpy's setdiff1d for vectorized performance (~50x faster
        than Python set operations for large DataFrames).

        Returns:
            numpy array of dropped row IDs (empty array if none dropped)
        """
        source_ids = self.get_ids(source_df)
        result_ids = self.get_ids(result_df)

        if source_ids is None:
            return np.array([], dtype="int64")
        if result_ids is None:
            return np.asarray(source_ids.values, dtype="int64")

        # Vectorized set difference - O(n log n) in C instead of O(n) in Python
        return np.setdiff1d(source_ids.values, result_ids.values)

    def strip_hidden_column(self, df: pd.DataFrame) -> pd.DataFrame:
        """Remove hidden column for export."""
        if _TRACEPIPE_ROW_ID_COL in df.columns:
            return df.drop(columns=[_TRACEPIPE_ROW_ID_COL])
        return df

    def cleanup(self) -> None:
        """Remove stale entries."""
        stale = [k for k in list(self._registry.keys()) if k not in self._df_refs]
        for k in stale:
            del self._registry[k]

    def all_registered_ids(self) -> list[int]:
        """
        Get all row IDs that have ever been registered.

        Returns:
            List of all registered row IDs.
        """
        all_ids = set()
        for row_ids in self._registry.values():
            all_ids.update(row_ids.values.tolist())
        return sorted(all_ids)
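
For orientation, and not part of the wheel itself: a minimal usage sketch of the strategy above. A SimpleNamespace stands in for TracePipeConfig (defined in tracepipe/core.py, not shown in this diff); PandasRowIdentity only reads its warn_on_duplicate_index and use_hidden_column attributes.

# Hypothetical usage sketch -- not shipped in the package.
from types import SimpleNamespace

import pandas as pd

from tracepipe.storage.row_identity import PandasRowIdentity

# Stand-in for TracePipeConfig: only the two attributes read above are needed.
config = SimpleNamespace(warn_on_duplicate_index=True, use_hidden_column=False)
identity = PandasRowIdentity(config)

df = pd.DataFrame({"x": [10, 20, 30, 40]})
identity.register(df)                          # rows receive IDs 0..3

filtered = df[df["x"] > 15]                    # pandas filter drops the first row
identity.propagate(df, filtered)               # IDs 1..3 follow the surviving rows

print(identity.get_dropped_ids(df, filtered))  # -> [0]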

@@ -0,0 +1,137 @@
# tracepipe/utils/value_capture.py
"""
Value capture and comparison utilities with complete NA handling.
"""

from typing import Any

import numpy as np
import pandas as pd

# Interned type strings (avoid allocating same strings repeatedly)
_TYPE_NULL = "null"
_TYPE_BOOL = "bool"
_TYPE_INT = "int"
_TYPE_FLOAT = "float"
_TYPE_STR = "str"
_TYPE_DATETIME = "datetime"
_TYPE_OTHER = "other"


def capture_typed_value(value: Any) -> tuple[Any, str]:
    """
    Convert value to (python_native, type_string) for storage.

    Handles:
    - None, np.nan, pd.NA, pd.NaT
    - numpy scalars (not JSON-serializable)
    - Standard Python types
    - Datetime types

    Returns:
        Tuple of (native_value, type_string)
    """
    # Handle all NA types first (pd.isna handles None, np.nan, pd.NA, pd.NaT)
    try:
        if pd.isna(value):
            return None, _TYPE_NULL
    except (ValueError, TypeError):
        # pd.isna can fail on some types (e.g., lists)
        pass

    # numpy scalar -> Python native (CRITICAL for JSON serialization)
    if hasattr(value, "item"):
        try:
            value = value.item()
        except (ValueError, AttributeError):
            pass

    # Type mapping (order matters: bool before int)
    if isinstance(value, bool):
        return value, _TYPE_BOOL
    elif isinstance(value, (int, np.integer)):
        return int(value), _TYPE_INT
    elif isinstance(value, (float, np.floating)):
        return float(value), _TYPE_FLOAT
    elif isinstance(value, str):
        return value, _TYPE_STR
    elif isinstance(value, (pd.Timestamp, np.datetime64)):
        return str(value), _TYPE_DATETIME
    else:
        # Fallback: stringify for storage
        return str(value), _TYPE_OTHER


def values_equal(a: Any, b: Any) -> bool:
    """
    Compare two values, handling NA correctly.

    pd.isna(x) == pd.isna(y) handles the case where both are NA.
    """
    try:
        a_na = pd.isna(a)
        b_na = pd.isna(b)

        if a_na and b_na:
            return True
        if a_na or b_na:
            return False
        return a == b
    except (ValueError, TypeError):
        # Fallback for unhashable types
        return str(a) == str(b)


def find_changed_indices_vectorized(old_series: pd.Series, new_series: pd.Series) -> np.ndarray:
    """
    Find indices where values changed, using vectorized operations.

    ~50-100x faster than row-by-row .loc[] access for large DataFrames.

    Args:
        old_series: Series of old values (must be aligned with new_series)
        new_series: Series of new values

    Returns:
        Boolean mask array where True indicates the value changed
    """
    old_arr = old_series.values
    new_arr = new_series.values
    n = len(old_arr)

    if n == 0:
        return np.array([], dtype=bool)

    # Vectorized NA detection
    old_na = pd.isna(old_arr)
    new_na = pd.isna(new_arr)

    # NA status changed (one is NA, other isn't)
    na_status_changed = old_na != new_na

    # For non-NA values, check if they differ
    # Handle mixed types safely by comparing element-by-element for non-NA
    both_not_na = ~old_na & ~new_na

    # Initialize values_differ as False
    values_differ = np.zeros(n, dtype=bool)

    # Only compare where both are non-NA
    non_na_indices = np.where(both_not_na)[0]
    if len(non_na_indices) > 0:
        # Try vectorized comparison first (fast path for homogeneous arrays)
        try:
            with np.errstate(invalid="ignore"):
                values_differ[non_na_indices] = old_arr[non_na_indices] != new_arr[non_na_indices]
        except (TypeError, ValueError):
            # Fallback for mixed types: element-by-element comparison
            for i in non_na_indices:
                try:
                    values_differ[i] = old_arr[i] != new_arr[i]
                except (TypeError, ValueError):
                    # Different types that can't be compared - treat as different
                    values_differ[i] = True

    changed_mask = (both_not_na & values_differ) | na_status_changed

    return changed_mask
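
Again for orientation only, not part of the package: a short sketch exercising the three helpers above on NA-bearing inputs.

# Hypothetical usage sketch -- not shipped in the package.
import numpy as np
import pandas as pd

from tracepipe.utils.value_capture import (
    capture_typed_value,
    find_changed_indices_vectorized,
    values_equal,
)

print(capture_typed_value(np.int64(7)))   # (7, 'int') -- numpy scalar unwrapped to a Python int
print(capture_typed_value(pd.NaT))        # (None, 'null')
print(values_equal(np.nan, None))         # True: both sides count as NA

old = pd.Series([1.0, np.nan, 3.0, 4.0])
new = pd.Series([1.0, np.nan, 3.5, np.nan])
print(find_changed_indices_vectorized(old, new))  # [False False  True  True]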