tracepipe-0.2.0-py3-none-any.whl → tracepipe-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +117 -78
- tracepipe/api.py +168 -331
- tracepipe/context.py +21 -1
- tracepipe/contracts.py +473 -0
- tracepipe/convenience.py +812 -0
- tracepipe/core.py +174 -17
- tracepipe/debug.py +325 -0
- tracepipe/instrumentation/apply_capture.py +453 -0
- tracepipe/instrumentation/filter_capture.py +468 -0
- tracepipe/instrumentation/indexer_capture.py +813 -0
- tracepipe/instrumentation/merge_capture.py +434 -0
- tracepipe/instrumentation/pandas_inst.py +66 -183
- tracepipe/instrumentation/series_capture.py +331 -0
- tracepipe/safety.py +3 -3
- tracepipe/snapshot.py +420 -0
- tracepipe/storage/base.py +7 -3
- tracepipe/storage/lineage_store.py +190 -47
- tracepipe/storage/row_identity.py +366 -104
- tracepipe/value_provenance.py +301 -0
- tracepipe/visualization/html_export.py +22 -7
- tracepipe-0.3.0.dist-info/METADATA +575 -0
- tracepipe-0.3.0.dist-info/RECORD +29 -0
- tracepipe-0.2.0.dist-info/METADATA +0 -508
- tracepipe-0.2.0.dist-info/RECORD +0 -19
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/WHEEL +0 -0
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,122 +1,252 @@
 # tracepipe/storage/row_identity.py
 """
-Row identity tracking
-
-
+Row identity tracking with positional propagation.
+
+Key invariants:
+- Every registered DataFrame has RIDs aligned to its index
+- Propagation is POSITIONAL, not index-label based
+- Ghost values are captured in debug mode only
+- NO DataFrame mutation by default
+
+Identity Storage Options:
+- REGISTRY (default): WeakKeyDictionary, no mutation
+- If weakref fails, auto-degrades to ATTRS with one-time warning
+- ATTRS: df.attrs token for long sessions
+- COLUMN: hidden column (opt-in only)
 """
 
-import warnings
+import logging
+import uuid
 import weakref
+from collections import OrderedDict
 from typing import Optional
 
 import numpy as np
 import pandas as pd
 
-from ..core import TracePipeConfig
+from ..core import GhostRowInfo, IdentityStorage, TracePipeConfig
+
+logger = logging.getLogger(__name__)
 
 _TRACEPIPE_ROW_ID_COL = "__tracepipe_row_id__"
+_TRACEPIPE_TOKEN_ATTR = "_tracepipe_token"
 
 
 class PandasRowIdentity:
     """
-
+    Row identity tracking with positional propagation.
 
     Implements: RowIdentityStrategy protocol
-
-    Handles:
-    - Standard operations (filter, sort, copy)
-    - reset_index(drop=True)
-    - Duplicate indices (with warning)
-    - Chained operations
-
-    Future alternatives:
-    - PolarsRowIdentity: Uses Polars row numbers and lazy evaluation
-    - SparkRowIdentity: Uses monotonically_increasing_id() or RDD zipWithIndex
     """
 
+    # Cap token registry size to prevent unbounded growth
+    MAX_TOKEN_REGISTRY_SIZE: int = 50_000
+
     def __init__(self, config: TracePipeConfig):
         self.config = config
-        self._registry: dict[int, pd.Series] = {}
-        self._df_refs: weakref.WeakValueDictionary = weakref.WeakValueDictionary()
         self._next_row_id: int = 0
 
+        # Use WeakKeyDictionary for proper GC
+        # Maps DataFrame object -> rids_array
+        self._registry: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
+
+        # Fallback for non-weakref-able DataFrames
+        # Maps token -> rids_array (token stored in df.attrs)
+        # Uses OrderedDict for FIFO eviction
+        self._token_registry: OrderedDict[str, np.ndarray] = OrderedDict()
+
+        # Ghost row storage (debug mode only)
+        self._ghost_rows: dict[int, GhostRowInfo] = {}
+
+        # Track if we've warned about weakref fallback
+        self._weakref_fallback_warned: bool = False
+
+    def _generate_token(self) -> str:
+        """Generate unique token for DataFrame identification."""
+        return uuid.uuid4().hex
+
     def register(
         self,
         df: pd.DataFrame,
-        row_ids: Optional[
+        row_ids: Optional[np.ndarray] = None,
         warn_duplicate_index: bool = True,
-    ) ->
+    ) -> np.ndarray:
         """
         Register a DataFrame and assign row IDs.
 
-        Args:
-            df: DataFrame to register
-            row_ids: Optional pre-assigned IDs (for propagation)
-            warn_duplicate_index: Warn if index has duplicates
-
         Returns:
-
+            numpy array of row IDs (int64)
         """
-        # Check for duplicate index
         if warn_duplicate_index and self.config.warn_on_duplicate_index:
             if df.index.has_duplicates:
-
-                    "TracePipe: DataFrame has duplicate index values. "
-                    "Row identity may be ambiguous for duplicates.",
-                    UserWarning,
-                )
+                logger.debug("DataFrame has duplicate index values. Row identity may be ambiguous.")
 
+        n = len(df)
         if row_ids is None:
-
-
-            self._next_row_id += len(df)
-            row_ids = pd.Series(new_ids, index=df.index, dtype="int64")
+            row_ids = np.arange(self._next_row_id, self._next_row_id + n, dtype=np.int64)
+            self._next_row_id += n
         else:
-
-
-
-
+            row_ids = np.asarray(row_ids, dtype=np.int64)
+
+        # Explicit storage selection based on config
+        if self.config.identity_storage == IdentityStorage.COLUMN:
+            # COLUMN mode: hidden column only
+            df[_TRACEPIPE_ROW_ID_COL] = row_ids
+
+        elif self.config.identity_storage == IdentityStorage.ATTRS:
+            # ATTRS mode: df.attrs token only
+            token = self._generate_token()
+            df.attrs[_TRACEPIPE_TOKEN_ATTR] = token
+            self._add_to_token_registry(token, row_ids)
+
+        else:  # REGISTRY mode (default)
+            # Try WeakKeyDictionary first
+            try:
+                self._registry[df] = row_ids
+            except TypeError:
+                # Auto-degrade to attrs silently (log at debug level)
+                if not self._weakref_fallback_warned:
+                    logger.debug(
+                        "DataFrame not weakref-able; using df.attrs fallback. "
+                        "This is safe but uses slightly more memory."
+                    )
+                    self._weakref_fallback_warned = True
+
+                token = self._generate_token()
+                df.attrs[_TRACEPIPE_TOKEN_ATTR] = token
+                self._add_to_token_registry(token, row_ids)
 
-
-        self._registry[obj_id] = row_ids
-        self._df_refs[obj_id] = df
+        return row_ids
 
-
-
-
+    def _add_to_token_registry(self, token: str, row_ids: np.ndarray) -> None:
+        """
+        Add token to registry with FIFO eviction if over cap.
 
-
+        Prevents unbounded growth of _token_registry in attrs fallback mode.
+        """
+        # Evict oldest tokens if over cap
+        while len(self._token_registry) >= self.MAX_TOKEN_REGISTRY_SIZE:
+            self._token_registry.popitem(last=False)  # Remove oldest (FIFO)
+
+        self._token_registry[token] = row_ids
 
     def get_ids(self, df: pd.DataFrame) -> Optional[pd.Series]:
-        """Get row IDs for
-
-
-
-
-
-
-
-
-
+        """Get row IDs as Series (for index-based operations)."""
+        rids = self.get_ids_array(df)
+        if rids is not None:
+            return pd.Series(rids, index=df.index, dtype=np.int64)
+        return None
+
+    def get_ids_array(self, df: pd.DataFrame) -> Optional[np.ndarray]:
+        """Get row IDs as numpy array (for vectorized operations)."""
+        # 1. Try WeakKeyDictionary (fastest, for REGISTRY mode)
+        try:
+            if df in self._registry:
+                rids = self._registry[df]
+                if len(rids) == len(df):
+                    return rids
+        except TypeError:
+            pass
+
+        # 2. Try attrs token (for ATTRS mode or REGISTRY fallback)
+        token = df.attrs.get(_TRACEPIPE_TOKEN_ATTR)
+        if token and token in self._token_registry:
+            rids = self._token_registry[token]
+            if len(rids) == len(df):
+                return rids
+
+        # 3. Try hidden column (for COLUMN mode)
+        # Access underlying data directly to avoid triggering instrumented __getitem__
         if _TRACEPIPE_ROW_ID_COL in df.columns:
-
-
-
-
-            self._df_refs[obj_id] = df
-            return row_ids
-
-        # 3. Not tracked
+            col_idx = df.columns.get_loc(_TRACEPIPE_ROW_ID_COL)
+            # Use _iget_item_cache for direct column access (bypasses instrumentation)
+            return df._get_column_array(col_idx).astype(np.int64)
+
         return None
 
+    def set_result_rids(self, result_df: pd.DataFrame, rids: np.ndarray) -> None:
+        """
+        Set RIDs for a result DataFrame (internal controlled assignment).
+
+        Uses same storage logic as register().
+        """
+        rids = np.asarray(rids, dtype=np.int64)
+
+        if self.config.identity_storage == IdentityStorage.COLUMN:
+            result_df[_TRACEPIPE_ROW_ID_COL] = rids
+
+        elif self.config.identity_storage == IdentityStorage.ATTRS:
+            token = self._generate_token()
+            result_df.attrs[_TRACEPIPE_TOKEN_ATTR] = token
+            self._add_to_token_registry(token, rids)
+
+        else:  # REGISTRY mode
+            try:
+                self._registry[result_df] = rids
+            except TypeError:
+                # Fallback to attrs (FIFO eviction)
+                token = self._generate_token()
+                result_df.attrs[_TRACEPIPE_TOKEN_ATTR] = token
+                self._add_to_token_registry(token, rids)
+
+    # ========== POSITIONAL PROPAGATION METHODS ==========
+
+    def propagate_by_mask(
+        self, source_df: pd.DataFrame, result_df: pd.DataFrame, kept_mask: np.ndarray
+    ) -> np.ndarray:
+        """
+        Propagate RIDs using boolean mask (for filter operations).
+
+        Args:
+            source_df: Original DataFrame
+            result_df: Filtered DataFrame
+            kept_mask: Boolean array where True = row kept
+
+        Returns:
+            Array of RIDs for result_df
+        """
+        source_rids = self.get_ids_array(source_df)
+        if source_rids is None:
+            return self.register(result_df)
+
+        result_rids = source_rids[kept_mask]
+        self.set_result_rids(result_df, result_rids)
+        return result_rids
+
+    def propagate_by_positions(
+        self, source_df: pd.DataFrame, result_df: pd.DataFrame, positions: np.ndarray
+    ) -> np.ndarray:
+        """
+        Propagate RIDs using position indices (for head/tail/sample).
+        """
+        source_rids = self.get_ids_array(source_df)
+        if source_rids is None:
+            return self.register(result_df)
+
+        result_rids = source_rids[positions]
+        self.set_result_rids(result_df, result_rids)
+        return result_rids
+
+    def propagate_by_permutation(
+        self, source_df: pd.DataFrame, result_df: pd.DataFrame, perm: np.ndarray
+    ) -> np.ndarray:
+        """
+        Propagate RIDs using permutation array (for sort_values).
+        """
+        source_rids = self.get_ids_array(source_df)
+        if source_rids is None:
+            return self.register(result_df)
+
+        result_rids = source_rids[perm]
+        self.set_result_rids(result_df, result_rids)
+        return result_rids
+
     def propagate(self, source_df: pd.DataFrame, result_df: pd.DataFrame) -> Optional[pd.Series]:
         """
         Propagate row IDs from source to result DataFrame.
 
-
-
-        - Reordering (same rows, different order)
-        - Mixed operations
+        Backwards compatible method - uses index-based fallback.
+        For better accuracy, use propagate_by_mask/positions/permutation.
         """
         source_ids = self.get_ids(source_df)
         if source_ids is None:
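The storage rewrite above replaces the old `id(df)`-keyed dict (plus a `WeakValueDictionary` of frames) with a `weakref.WeakKeyDictionary` and a `df.attrs` token fallback. Because plain DataFrames are not hashable, assigning one as a `WeakKeyDictionary` key raises `TypeError`, which is exactly the branch that degrades to the attrs token. A minimal sketch of that fallback chain, with local stand-ins for the registries (nothing here is tracepipe's public API):

```python
import uuid
import weakref

import numpy as np
import pandas as pd

# Local stand-ins for _registry / _token_registry from the diff above.
registry = weakref.WeakKeyDictionary()
token_registry: dict = {}
TOKEN_ATTR = "_tracepipe_token"  # mirrors _TRACEPIPE_TOKEN_ATTR

df = pd.DataFrame({"x": [10, 20, 30]})
rids = np.arange(3, dtype=np.int64)

try:
    registry[df] = rids  # DataFrames are unhashable -> TypeError
except TypeError:
    # Attrs-token fallback: tag the frame, key the RID array by token.
    token = uuid.uuid4().hex
    df.attrs[TOKEN_ATTR] = token
    token_registry[token] = rids

# Lookup mirrors get_ids_array(): token from df.attrs -> RID array.
found = token_registry.get(df.attrs.get(TOKEN_ATTR))
assert found is rids and len(found) == len(df)
```

The `MAX_TOKEN_REGISTRY_SIZE` cap with FIFO eviction keeps this fallback path bounded over long sessions, at the cost that a very old, still-live frame can lose its token entry.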
@@ -139,7 +269,9 @@ class PandasRowIdentity:
         new_mask = result_ids.isna()
         if new_mask.any():
             new_count = new_mask.sum()
-            new_row_ids =
+            new_row_ids = np.arange(
+                self._next_row_id, self._next_row_id + new_count, dtype=np.int64
+            )
             self._next_row_id += new_count
             result_ids.loc[new_mask] = new_row_ids
             result_ids = result_ids.astype("int64")
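For context, this branch runs when index-based propagation leaves gaps: rows that exist only in the result get freshly minted IDs. A toy rendering of the same logic with invented values:

```python
import numpy as np
import pandas as pd

# RIDs after an index-aligned propagation; NaN = no match in source.
result_ids = pd.Series([0.0, np.nan, 2.0, np.nan])
next_row_id = 100  # counter mirroring self._next_row_id

new_mask = result_ids.isna()
new_count = int(new_mask.sum())
new_row_ids = np.arange(next_row_id, next_row_id + new_count, dtype=np.int64)
next_row_id += new_count

result_ids.loc[new_mask] = new_row_ids  # fill gaps in positional order
result_ids = result_ids.astype("int64")
assert result_ids.tolist() == [0, 100, 2, 101]
```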
@@ -147,7 +279,9 @@
         # Fallback: positional alignment
         if len(result_df) <= len(source_df):
             result_ids = pd.Series(
-                source_ids.values[: len(result_df)],
+                source_ids.values[: len(result_df)],
+                index=result_df.index,
+                dtype="int64",
             )
         else:
             # Result is larger - assign new IDs to extras
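The two added keyword arguments are load-bearing: constructing a Series from another Series re-aligns by index label, while constructing from the raw `.values` slice keeps positional order, which is what row identity needs here. Illustrative contrast:

```python
import pandas as pd

source_ids = pd.Series([7, 8, 9], index=["a", "b", "c"])
result_index = pd.Index(["x", "y"])  # filtered result, relabeled

# Passing a Series re-aligns on labels -> no overlap -> all NaN:
by_label = pd.Series(source_ids[:2], index=result_index)
assert by_label.isna().all()

# Passing raw values keeps POSITIONAL order, as the diff does:
by_position = pd.Series(source_ids.values[:2], index=result_index, dtype="int64")
assert by_position.tolist() == [7, 8]
```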
@@ -157,40 +291,158 @@
         self._next_row_id += extra_count
         result_ids = pd.Series(base_ids + extra_ids, index=result_df.index, dtype="int64")
 
-
+        # Register the result
+        rids_array = result_ids.values.astype(np.int64)
+        self.set_result_rids(result_df, rids_array)
+        return result_ids
 
-
-        self, original_df: pd.DataFrame, new_df: pd.DataFrame
-    ) -> Optional[pd.Series]:
-        """Handle reset_index(drop=True) which changes index."""
-        old_ids = self.get_ids(original_df)
-        if old_ids is None:
-            return None
+    # ========== DROP COMPUTATION ==========
 
-
-
-
+    def compute_dropped_ids(self, source_rids: np.ndarray, result_rids: np.ndarray) -> np.ndarray:
+        """
+        Compute which RIDs were dropped.
 
-
+        Returns:
+            Array of dropped RIDs (sorted)
         """
-
+        # assume_unique=False because arrays are not guaranteed sorted
+        return np.setdiff1d(source_rids, result_rids, assume_unique=False)
 
-
-
+    def compute_dropped_with_positions(
+        self, source_rids: np.ndarray, kept_mask: np.ndarray
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Compute dropped IDs with original positions preserved.
 
         Returns:
-
+            (dropped_rids, dropped_positions) - both in original order
         """
-
-
+        dropped_mask = ~kept_mask
+        dropped_positions = np.where(dropped_mask)[0]
+        dropped_rids = source_rids[dropped_mask]
+        return dropped_rids, dropped_positions
+
+    def get_dropped_ids(self, source_df: pd.DataFrame, result_df: pd.DataFrame) -> np.ndarray:
+        """
+        Get row IDs that were dropped between source and result.
+
+        Backwards compatible method.
+        """
+        source_ids = self.get_ids_array(source_df)
+        result_ids = self.get_ids_array(result_df)
 
         if source_ids is None:
             return np.array([], dtype="int64")
         if result_ids is None:
-            return np.asarray(source_ids
+            return np.asarray(source_ids, dtype="int64")
 
-
-
+        return np.setdiff1d(source_ids, result_ids)
+
+    # ========== GHOST ROW TRACKING ==========
+
+    def capture_ghost_values(
+        self,
+        source_df: pd.DataFrame,
+        dropped_mask: np.ndarray,
+        dropped_by: str,
+        step_id: int,
+        watched_columns: set[str],
+    ) -> None:
+        """
+        Capture last-known values for dropped rows (debug mode only).
+
+        Uses vectorized extraction, not iloc per cell.
+        """
+        if not self.config.should_capture_ghost_values:
+            return
+
+        source_rids = self.get_ids_array(source_df)
+        if source_rids is None:
+            return
+
+        dropped_positions = np.where(dropped_mask)[0]
+        dropped_rids = source_rids[dropped_mask]
+
+        if len(dropped_rids) == 0:
+            return
+
+        # Limit to prevent memory explosion
+        max_ghosts = self.config.max_ghost_rows
+        if len(dropped_rids) > max_ghosts:
+            sample_idx = np.random.choice(len(dropped_rids), max_ghosts, replace=False)
+            sample_idx.sort()  # Keep relative order
+            dropped_rids = dropped_rids[sample_idx]
+            dropped_positions = dropped_positions[sample_idx]
+
+        # Determine columns to capture
+        cols_to_capture = list(watched_columns & set(source_df.columns))
+        if not cols_to_capture:
+            cols_to_capture = list(source_df.columns)[:5]
+
+        # Vectorized extraction (one slice per column)
+        values_matrix: dict[str, np.ndarray] = {}
+        for col in cols_to_capture:
+            try:
+                values_matrix[col] = source_df[col].values[dropped_positions]
+            except Exception:
+                pass
+
+        # Build ghost row info from pre-extracted values
+        for i, (rid, pos) in enumerate(zip(dropped_rids, dropped_positions)):
+            values = {col: vals[i] for col, vals in values_matrix.items()}
+
+            self._ghost_rows[int(rid)] = GhostRowInfo(
+                row_id=int(rid),
+                last_values=values,
+                dropped_by=dropped_by,
+                dropped_step=step_id,
+                original_position=int(pos),
+            )
+
+    def get_ghost_rows(self, limit: int = 1000) -> pd.DataFrame:
+        """
+        Get dropped rows with last-known values.
+
+        Returns DataFrame with:
+        - __tp_row_id__: Original row ID
+        - __tp_dropped_by__: Operation that dropped the row
+        - __tp_dropped_step__: Step ID
+        - __tp_original_position__: Position in original DataFrame
+        - [original columns]: Last known values
+        """
+        if not self._ghost_rows:
+            return pd.DataFrame()
+
+        # Sort by original position for natural order
+        sorted_ghosts = sorted(self._ghost_rows.values(), key=lambda g: g.original_position)[:limit]
+
+        rows = []
+        for info in sorted_ghosts:
+            row = {
+                "__tp_row_id__": info.row_id,
+                "__tp_dropped_by__": info.dropped_by,
+                "__tp_dropped_step__": info.dropped_step,
+                "__tp_original_position__": info.original_position,
+                **info.last_values,
+            }
+            rows.append(row)
+
+        return pd.DataFrame(rows)
+
+    # ========== CLEANUP ==========
+
+    def clear(self) -> None:
+        """Reset all state."""
+        self._registry.clear()
+        self._token_registry.clear()
+        self._ghost_rows.clear()
+        self._next_row_id = 0
+
+    def cleanup(self) -> None:
+        """Remove stale entries (backwards compatible)."""
+        # WeakKeyDictionary handles this automatically for registry.
+        # Token registry uses FIFO eviction in _add_to_token_registry.
+        pass
 
     def strip_hidden_column(self, df: pd.DataFrame) -> pd.DataFrame:
         """Remove hidden column for export."""
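The drop-computation helpers added in this hunk come in two flavors: set-based (`np.setdiff1d`, mask-free but order-destroying) and mask-based (position-preserving, which the ghost-row capture relies on for `original_position`). A small worked example of how they agree:

```python
import numpy as np

source_rids = np.array([5, 6, 7, 8], dtype=np.int64)
kept_mask = np.array([True, False, True, False])

# Mask-based: keeps original positions of the dropped rows.
dropped_mask = ~kept_mask
dropped_positions = np.where(dropped_mask)[0]  # -> array([1, 3])
dropped_rids = source_rids[dropped_mask]       # -> array([6, 8])

# Set-based: same RIDs (sorted), but positions are lost.
result_rids = source_rids[kept_mask]
assert np.array_equal(np.setdiff1d(source_rids, result_rids), dropped_rids)
assert dropped_positions.tolist() == [1, 3]
```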
@@ -198,20 +450,30 @@
             return df.drop(columns=[_TRACEPIPE_ROW_ID_COL])
         return df
 
-    def cleanup(self) -> None:
-        """Remove stale entries."""
-        stale = [k for k in list(self._registry.keys()) if k not in self._df_refs]
-        for k in stale:
-            del self._registry[k]
-
     def all_registered_ids(self) -> list[int]:
         """
         Get all row IDs that have ever been registered.
-
-        Returns:
-            List of all registered row IDs.
         """
-        all_ids = set()
-
-
+        all_ids: set[int] = set()
+
+        # From WeakKeyDictionary
+        for rids in self._registry.values():
+            all_ids.update(rids.tolist())
+
+        # From token registry
+        for rids in self._token_registry.values():
+            all_ids.update(rids.tolist())
+
         return sorted(all_ids)
+
+    def realign_for_reset_index(
+        self, original_df: pd.DataFrame, new_df: pd.DataFrame
+    ) -> Optional[pd.Series]:
+        """Handle reset_index(drop=True) which changes index."""
+        old_ids = self.get_ids_array(original_df)
+        if old_ids is None:
+            return None
+
+        # Same values, new index
+        self.set_result_rids(new_df, old_ids)
+        return pd.Series(old_ids, index=new_df.index, dtype="int64")
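`realign_for_reset_index`, now built on `get_ids_array` and `set_result_rids`, leans on one fact: `reset_index(drop=True)` relabels rows without reordering them, so the existing RID array can be reattached under the new index unchanged. In miniature:

```python
import numpy as np
import pandas as pd

old_ids = np.array([3, 1, 2], dtype=np.int64)
df = pd.DataFrame({"x": [30, 10, 20]}, index=[9, 4, 7])

new_df = df.reset_index(drop=True)  # same row order, labels 0..n-1
realigned = pd.Series(old_ids, index=new_df.index, dtype="int64")

assert realigned.index.tolist() == [0, 1, 2]
assert realigned.tolist() == [3, 1, 2]  # RIDs unchanged, new labels
```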
|