tracepipe 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +117 -78
- tracepipe/api.py +219 -332
- tracepipe/context.py +21 -1
- tracepipe/contracts.py +473 -0
- tracepipe/convenience.py +817 -0
- tracepipe/core.py +174 -17
- tracepipe/debug.py +325 -0
- tracepipe/instrumentation/apply_capture.py +453 -0
- tracepipe/instrumentation/filter_capture.py +468 -0
- tracepipe/instrumentation/indexer_capture.py +813 -0
- tracepipe/instrumentation/merge_capture.py +434 -0
- tracepipe/instrumentation/pandas_inst.py +66 -183
- tracepipe/instrumentation/series_capture.py +331 -0
- tracepipe/safety.py +3 -3
- tracepipe/snapshot.py +420 -0
- tracepipe/storage/base.py +7 -3
- tracepipe/storage/lineage_store.py +252 -47
- tracepipe/storage/row_identity.py +366 -104
- tracepipe/value_provenance.py +309 -0
- tracepipe/visualization/html_export.py +22 -7
- tracepipe-0.3.1.dist-info/METADATA +308 -0
- tracepipe-0.3.1.dist-info/RECORD +29 -0
- tracepipe-0.2.0.dist-info/METADATA +0 -508
- tracepipe-0.2.0.dist-info/RECORD +0 -19
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.1.dist-info}/WHEEL +0 -0
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,813 @@
|
|
|
1
|
+
# tracepipe/instrumentation/indexer_capture.py
|
|
2
|
+
"""
|
|
3
|
+
loc/iloc instrumentation for TracePipe.
|
|
4
|
+
|
|
5
|
+
Operations tracked:
|
|
6
|
+
| Pattern | Type | Completeness |
|
|
7
|
+
|------------------------------|------------|--------------|
|
|
8
|
+
| df.loc[mask] | Filter | FULL |
|
|
9
|
+
| df.loc[mask, 'col'] | Filter | FULL |
|
|
10
|
+
| df.iloc[0:5] | Filter | FULL |
|
|
11
|
+
| df.iloc[[1,3,5]] | Filter | FULL |
|
|
12
|
+
| df.loc[mask, 'col'] = val | Transform | FULL |
|
|
13
|
+
| df.iloc[0:5, 0] = val | Transform | FULL |
|
|
14
|
+
| df.loc[mask] = other_df | Transform | PARTIAL |
|
|
15
|
+
|
|
16
|
+
Key insight: We wrap the indexer's __getitem__ and __setitem__, not DataFrame.loc itself.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import warnings
|
|
20
|
+
from typing import Any, Optional, Union
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
import pandas as pd
|
|
24
|
+
|
|
25
|
+
from ..context import get_context
|
|
26
|
+
from ..core import ChangeType, CompletenessLevel
|
|
27
|
+
from ..safety import TracePipeWarning, get_caller_info
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class _CallableLocIndexer:
    """
    Wrapper for the indexer returned by ``df.loc(axis=...)``.

    pandas itself uses the callable form internally (for example in
    ``dropna``). Row IDs are always propagated to a DataFrame result, but a
    lineage step is only recorded when ``ctx._filter_op_depth`` is 0, so an
    enclosing tracked operation is not double-counted.
    """

    def __init__(self, indexer, parent_df: pd.DataFrame):
        # indexer: the pandas indexer obtained by calling ``df.loc(axis)``.
        # parent_df: the DataFrame that ``.loc`` was accessed on.
        self._indexer = indexer
        self._parent_df = parent_df

    def __getitem__(self, key):
        """
        Run the underlying lookup, then propagate row IDs and record drops.

        The pandas lookup always runs first and its result is always
        returned; lineage failures are re-raised only in strict mode and are
        otherwise reported as a ``TracePipeWarning``.
        """
        ctx = get_context()
        result = self._indexer[key]

        # Only DataFrame results take part in row lineage.
        if not ctx.enabled or not isinstance(result, pd.DataFrame):
            return result

        try:
            row_mgr = ctx.row_manager
            source_df = self._parent_df

            source_rids = row_mgr.get_ids_array(source_df)
            if source_rids is None:
                row_mgr.register(source_df)
                source_rids = row_mgr.get_ids_array(source_df)

            # pandas stores the axis bound by ``loc(axis=...)`` on the indexer
            # it returns. With axis 1 the key selects columns, so every row
            # survives and the key must not be interpreted as a row mask.
            if getattr(self._indexer, "axis", None) == 1:
                if len(result) == len(source_df):
                    # All rows kept: carry the source IDs over unchanged.
                    keep_all = np.ones(len(source_df), dtype=bool)
                    row_mgr.propagate_by_mask(source_df, result, keep_all)
                else:
                    row_mgr.register(result)
                return result

            # Resolve the boolean row mask once; it is reused below.
            mask = None
            if hasattr(key, "dtype") and key.dtype == bool:
                mask = key.values if isinstance(key, pd.Series) else np.asarray(key)

            if mask is not None:
                row_mgr.propagate_by_mask(source_df, result, mask)
            else:
                row_mgr.register(result)

            # Only create a step if NOT inside a filter operation; the outer
            # operation records the drops in that case.
            if ctx._filter_op_depth == 0:
                store = ctx.store
                if mask is not None:
                    dropped_rids, _ = row_mgr.compute_dropped_with_positions(source_rids, mask)
                    completeness = CompletenessLevel.FULL
                else:
                    result_rids = row_mgr.get_ids_array(result)
                    dropped_rids = row_mgr.compute_dropped_ids(source_rids, result_rids)
                    completeness = CompletenessLevel.PARTIAL

                code_file, code_line = get_caller_info(skip_frames=4)
                step_id = store.append_step(
                    operation="DataFrame.loc(axis)[]",
                    stage=ctx.current_stage,
                    code_file=code_file,
                    code_line=code_line,
                    params={},
                    input_shape=source_df.shape,
                    output_shape=result.shape,
                    completeness=completeness,
                )

                if len(dropped_rids) > 0:
                    store.append_bulk_drops(step_id, dropped_rids)
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: loc(axis)[] capture failed: {e}", TracePipeWarning)

        return result

    def __setitem__(self, key, value):
        """Pass the assignment straight through to the underlying indexer."""
        self._indexer[key] = value
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class TrackedLocIndexer:
    """
    Wrapper around pandas ``_LocIndexer`` that captures lineage.

    Usage (internal - the user never sees this):
        df.loc  # Returns TrackedLocIndexer wrapping the real _LocIndexer

    The pandas operation always runs; lineage capture is best-effort and only
    raises when ``ctx.config.strict_mode`` is set.
    """

    def __init__(self, indexer, parent_df: pd.DataFrame):
        # indexer: the real pandas loc indexer; parent_df: the frame it belongs to.
        self._indexer = indexer
        self._parent_df = parent_df

    def __getattr__(self, name):
        """Proxy any other attribute access to the underlying indexer."""
        # Looking up our own slots through the proxy would recurse forever if
        # the instance was created without __init__ (e.g. by copy/pickle).
        if name in ("_indexer", "_parent_df"):
            raise AttributeError(name)
        return getattr(self._indexer, name)

    def __call__(self, axis=None):
        """
        Support the callable form ``df.loc(axis=...)``.

        pandas uses ``self.loc(axis=axis)[mask]`` internally (e.g. in dropna),
        so the returned indexer is wrapped as well.
        """
        return _CallableLocIndexer(self._indexer(axis), self._parent_df)

    def __getitem__(self, key) -> Union[pd.DataFrame, pd.Series, Any]:
        """
        Capture filter operations via ``loc[]``.

        Handles:
        - df.loc[mask] -> DataFrame (filter)
        - df.loc[mask, 'col'] -> Series (filter + column select)
        - df.loc[mask, ['a', 'b']] -> DataFrame (filter + column select)
        - df.loc['label'] -> Row (single row access)

        Returns whatever the underlying pandas indexer returns.
        """
        ctx = get_context()

        # Always run the original first so user-visible behavior is unchanged.
        result = self._indexer[key]

        if not ctx.enabled:
            return result

        try:
            self._capture_loc_getitem(key, result, ctx)
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: loc[] capture failed: {e}", TracePipeWarning)

        return result

    def __setitem__(self, key, value) -> None:
        """
        Capture transform operations via ``loc[] = value``.

        Handles:
        - df.loc[mask, 'col'] = scalar
        - df.loc[mask, 'col'] = array
        - df.loc[mask, ['a', 'b']] = values
        - df.loc[mask] = other_df (PARTIAL - complex assignment)
        """
        ctx = get_context()

        # Snapshot watched columns before the assignment. This runs ahead of
        # the user's operation, so it gets the same strict/warn policy as the
        # post-assignment capture instead of being allowed to break it.
        before_values = None
        affected_cols = None
        if ctx.enabled and ctx.watched_columns:
            try:
                before_values, affected_cols = self._capture_before_state(key, ctx)
            except Exception as e:
                if ctx.config.strict_mode:
                    raise
                warnings.warn(f"TracePipe: loc[]= capture failed: {e}", TracePipeWarning)
                before_values, affected_cols = None, None

        # Always run the original assignment.
        self._indexer[key] = value

        if not ctx.enabled:
            return

        try:
            self._capture_loc_setitem(key, value, before_values, affected_cols, ctx)
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: loc[]= capture failed: {e}", TracePipeWarning)

    def _capture_loc_getitem(self, key, result, ctx) -> None:
        """Propagate row IDs to a DataFrame result and record dropped rows."""
        if not isinstance(result, pd.DataFrame):
            return  # Series or scalar - not a filter operation

        row_mgr = ctx.row_manager
        source_df = self._parent_df

        source_rids = row_mgr.get_ids_array(source_df)
        if source_rids is None:
            row_mgr.register(source_df)
            source_rids = row_mgr.get_ids_array(source_df)

        kept_mask, completeness = self._derive_loc_mask(key, source_df)

        # A mask that does not select exactly the rows pandas returned cannot
        # be trusted for ID propagation; fall back to the PARTIAL path.
        if kept_mask is not None and int(np.count_nonzero(kept_mask)) != len(result):
            kept_mask, completeness = None, CompletenessLevel.PARTIAL

        # Always propagate RIDs, even when no step is recorded.
        if kept_mask is not None:
            row_mgr.propagate_by_mask(source_df, result, kept_mask)
        else:
            row_mgr.register(result)

        # Skip step tracking if we're inside a filter operation (parent will track).
        if ctx._filter_op_depth > 0:
            return

        store = ctx.store

        if kept_mask is not None:
            dropped_rids, _ = row_mgr.compute_dropped_with_positions(source_rids, kept_mask)
        else:
            completeness = CompletenessLevel.PARTIAL
            result_rids = row_mgr.get_ids_array(result)
            dropped_rids = row_mgr.compute_dropped_ids(source_rids, result_rids)

        code_file, code_line = get_caller_info(skip_frames=4)
        step_id = store.append_step(
            operation="DataFrame.loc[]",
            stage=ctx.current_stage,
            code_file=code_file,
            code_line=code_line,
            params={"key_type": type(key).__name__},
            input_shape=source_df.shape,
            output_shape=result.shape,
            completeness=completeness,
        )

        if len(dropped_rids) > 0:
            store.append_bulk_drops(step_id, dropped_rids)

    def _derive_loc_mask(
        self, key, df: pd.DataFrame
    ) -> "tuple[Optional[np.ndarray], CompletenessLevel]":
        """
        Derive a boolean "row kept" mask from a loc key.

        For a tuple key ``(row_key, col_key)`` only the row part is used.

        Returns:
            ``(mask, CompletenessLevel.FULL)`` when the selected rows can be
            expressed as a mask in source order, otherwise
            ``(None, CompletenessLevel.PARTIAL)``.
        """
        row_key = key[0] if isinstance(key, tuple) and key else key
        mask = self._resolve_row_mask(row_key, df)
        if mask is None:
            return None, CompletenessLevel.PARTIAL
        return mask, CompletenessLevel.FULL

    @staticmethod
    def _resolve_row_mask(row_key, df: pd.DataFrame) -> Optional[np.ndarray]:
        """
        Translate the row part of a loc key into a boolean mask over ``df``.

        Supported keys:
        - boolean Series (aligned on the index, as pandas does), boolean
          array, or plain list of bools
        - list/array/Series/Index of labels, when they select rows in source
          order without repeats
        - label slice, including a positive step
        - single label

        Returns None when the selection cannot be represented as a mask that
        preserves row order (callables, reordered or repeated labels,
        negative-step slices, MultiIndex label keys, unknown labels).
        """
        n_rows = len(df)
        index = df.index

        # pandas calls a callable key with the frame; that result is not
        # visible here, so do not guess.
        if callable(row_key):
            return None

        # Boolean Series: pandas aligns it on the index rather than by position.
        if isinstance(row_key, pd.Series) and row_key.dtype == bool:
            if row_key.index.equals(index):
                return row_key.to_numpy()
            try:
                aligned = row_key.reindex(index)
            except ValueError:
                return None  # duplicate labels: alignment is ambiguous
            if aligned.isna().any():
                return None
            return aligned.to_numpy(dtype=bool)

        # Boolean ndarray (or any other object exposing a bool dtype).
        if hasattr(row_key, "dtype") and row_key.dtype == bool:
            mask = np.asarray(row_key)
            return mask if len(mask) == n_rows else None

        # Plain list of bools is a mask too, not a list of labels.
        if (
            isinstance(row_key, list)
            and row_key
            and all(isinstance(v, (bool, np.bool_)) for v in row_key)
        ):
            return np.array(row_key, dtype=bool) if len(row_key) == n_rows else None

        # Label lookups on a MultiIndex (partial keys, tuple labels) are not
        # resolved here.
        if isinstance(index, pd.MultiIndex):
            return None

        # Collection of labels: usable only when the labels hit rows in
        # strictly increasing position, so the result keeps source order.
        if isinstance(row_key, (list, np.ndarray, pd.Series, pd.Index)):
            try:
                positions = np.asarray(index.get_indexer_for(row_key))
            except (TypeError, ValueError, KeyError):
                return None
            if positions.size and (positions.min() < 0 or np.any(np.diff(positions) <= 0)):
                return None
            mask = np.zeros(n_rows, dtype=bool)
            mask[positions] = True
            return mask

        # Label slice: let pandas translate it to positions (inclusive stop,
        # step handling, sorted/unsorted indexes).
        if isinstance(row_key, slice):
            try:
                pos_slice = index.slice_indexer(row_key.start, row_key.stop, row_key.step)
            except (KeyError, TypeError, ValueError):
                return None
            if not isinstance(pos_slice, slice):
                return None
            if pos_slice.step is not None and pos_slice.step < 0:
                return None  # reversed result order cannot be expressed as a mask
            mask = np.zeros(n_rows, dtype=bool)
            mask[pos_slice] = True
            return mask

        # Single label.
        try:
            mask = np.asarray(index == row_key)
        except (TypeError, ValueError):
            return None
        if mask.dtype != bool or mask.shape != (n_rows,):
            return None
        return mask

    def _capture_before_state(self, key, ctx) -> "tuple[Optional[dict], Optional[list]]":
        """
        Capture values of watched columns before an assignment.

        Returns:
            ``(before, affected_cols)``. ``before`` maps each affected column
            to a dict with the ``rids``, ``values`` and ``positions`` of the
            targeted rows. ``(None, None)`` is returned when no watched
            column is affected, and ``(None, affected_cols)`` when the rows
            cannot be resolved.
        """
        col_key = key[1] if isinstance(key, tuple) and len(key) > 1 else None
        columns = self._parent_df.columns

        # Determine affected columns.
        if col_key is None:
            affected_cols = list(ctx.watched_columns & set(columns))
        elif isinstance(col_key, str):
            affected_cols = [col_key] if col_key in ctx.watched_columns else []
        elif isinstance(col_key, list):
            affected_cols = [c for c in col_key if c in ctx.watched_columns]
        else:
            affected_cols = []

        # A column created by this very assignment has no "before" value;
        # reading it would raise KeyError ahead of the user's operation.
        affected_cols = [c for c in affected_cols if c in columns]

        if not affected_cols:
            return None, None

        # Derive affected rows.
        mask, _ = self._derive_loc_mask(key, self._parent_df)
        if mask is None:
            return None, affected_cols

        rids = ctx.row_manager.get_ids_array(self._parent_df)
        if rids is None:
            return None, affected_cols

        # Positions are stored so the after-state is read from exactly the
        # same rows instead of re-deriving the mask after the assignment.
        affected_positions = np.where(mask)[0]
        before = {}
        for col in affected_cols:
            before[col] = {
                "rids": rids[affected_positions].copy(),
                "values": self._parent_df[col].values[affected_positions].copy(),
                "positions": affected_positions.copy(),
            }

        return before, affected_cols

    def _capture_loc_setitem(self, key, value, before_values, affected_cols, ctx) -> None:
        """Record a ``DataFrame.loc[]=`` step and one diff per changed cell."""
        if before_values is None or not affected_cols:
            return

        store = ctx.store

        code_file, code_line = get_caller_info(skip_frames=4)
        step_id = store.append_step(
            operation="DataFrame.loc[]=",
            stage=ctx.current_stage,
            code_file=code_file,
            code_line=code_line,
            params={"columns": affected_cols[:3]},
            input_shape=self._parent_df.shape,
            output_shape=self._parent_df.shape,
            completeness=CompletenessLevel.FULL,
        )

        from ..utils.value_capture import values_equal

        for col in affected_cols:
            if col not in before_values:
                continue

            snapshot = before_values[col]
            rids = snapshot["rids"]
            old_vals = snapshot["values"]
            new_vals = self._parent_df[col].values[snapshot["positions"]]

            # Compare cell by cell and emit a diff for every changed value.
            for rid, old_val, new_val in zip(rids, old_vals, new_vals):
                if not values_equal(old_val, new_val):
                    store.append_diff(
                        step_id=step_id,
                        row_id=int(rid),
                        col=col,
                        old_val=old_val,
                        new_val=new_val,
                        change_type=ChangeType.MODIFIED,
                    )
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
class TrackedILocIndexer:
    """
    Wrapper around pandas ``_iLocIndexer`` that captures lineage.

    Similar to TrackedLocIndexer but uses positional indexing. The pandas
    operation always runs; lineage capture is best-effort and only raises
    when ``ctx.config.strict_mode`` is set.
    """

    def __init__(self, indexer, parent_df: pd.DataFrame):
        # indexer: the real pandas iloc indexer; parent_df: the frame it belongs to.
        self._indexer = indexer
        self._parent_df = parent_df

    def __getattr__(self, name):
        """Proxy any other attribute access to the underlying indexer."""
        # Looking up our own slots through the proxy would recurse forever if
        # the instance was created without __init__ (e.g. by copy/pickle).
        if name in ("_indexer", "_parent_df"):
            raise AttributeError(name)
        return getattr(self._indexer, name)

    def __getitem__(self, key) -> Union[pd.DataFrame, pd.Series, Any]:
        """Run ``iloc[key]`` and capture it as a filter when it yields a DataFrame."""
        ctx = get_context()
        result = self._indexer[key]

        if not ctx.enabled:
            return result

        try:
            self._capture_iloc_getitem(key, result, ctx)
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: iloc[] capture failed: {e}", TracePipeWarning)

        return result

    def __setitem__(self, key, value) -> None:
        """Run ``iloc[key] = value`` and capture changes to watched columns."""
        ctx = get_context()

        # The snapshot runs ahead of the user's assignment, so it gets the
        # same strict/warn policy as the post-assignment capture.
        before_values = None
        affected_cols = None
        if ctx.enabled and ctx.watched_columns:
            try:
                before_values, affected_cols = self._capture_before_state(key, ctx)
            except Exception as e:
                if ctx.config.strict_mode:
                    raise
                warnings.warn(f"TracePipe: iloc[]= capture failed: {e}", TracePipeWarning)
                before_values, affected_cols = None, None

        self._indexer[key] = value

        if not ctx.enabled:
            return

        try:
            self._capture_iloc_setitem(key, value, before_values, affected_cols, ctx)
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: iloc[]= capture failed: {e}", TracePipeWarning)

    def _capture_iloc_getitem(self, key, result, ctx) -> None:
        """Propagate row IDs to a DataFrame result and record dropped rows."""
        if not isinstance(result, pd.DataFrame):
            return

        row_mgr = ctx.row_manager
        source_df = self._parent_df

        source_rids = row_mgr.get_ids_array(source_df)
        if source_rids is None:
            row_mgr.register(source_df)
            source_rids = row_mgr.get_ids_array(source_df)

        # Derive positions from key.
        positions = self._derive_iloc_positions(key, source_df)

        # Always propagate RIDs, even when no step is recorded.
        if positions is not None:
            row_mgr.propagate_by_positions(source_df, result, positions)
        else:
            row_mgr.register(result)

        # Skip step tracking if we're inside a filter operation (parent will track).
        if ctx._filter_op_depth > 0:
            return

        store = ctx.store

        if positions is not None:
            kept_mask = np.zeros(len(source_df), dtype=bool)
            kept_mask[positions] = True
            dropped_rids, _ = row_mgr.compute_dropped_with_positions(source_rids, kept_mask)
            completeness = CompletenessLevel.FULL
        else:
            completeness = CompletenessLevel.PARTIAL
            result_rids = row_mgr.get_ids_array(result)
            dropped_rids = row_mgr.compute_dropped_ids(source_rids, result_rids)

        code_file, code_line = get_caller_info(skip_frames=4)
        step_id = store.append_step(
            operation="DataFrame.iloc[]",
            stage=ctx.current_stage,
            code_file=code_file,
            code_line=code_line,
            params={"key_type": type(key).__name__},
            input_shape=source_df.shape,
            output_shape=result.shape,
            completeness=completeness,
        )

        if len(dropped_rids) > 0:
            store.append_bulk_drops(step_id, dropped_rids)

    def _derive_iloc_positions(self, key, df: pd.DataFrame) -> Optional[np.ndarray]:
        """
        Derive the selected row positions from an iloc key.

        For a tuple key ``(row_key, col_key)`` only the row part is used.
        Negative positions are normalised against ``len(df)``.

        Returns:
            An int64 array of positions in selection order, or None when the
            key type is not recognised (e.g. a callable).
        """
        row_key = key[0] if isinstance(key, tuple) else key
        n = len(df)

        # Single integer (Python or NumPy scalar).
        if isinstance(row_key, (int, np.integer)):
            pos = row_key if row_key >= 0 else n + row_key
            return np.array([pos], dtype=np.int64)

        # Slice.
        if isinstance(row_key, slice):
            return np.arange(*row_key.indices(n), dtype=np.int64)

        # List/array: a boolean mask must be detected before the integer
        # conversion, otherwise True/False would be read as positions 1/0.
        if isinstance(row_key, (list, np.ndarray)):
            arr = np.asarray(row_key)
            if arr.dtype == bool:
                return np.flatnonzero(arr).astype(np.int64)
            arr = np.asarray(row_key, dtype=np.int64)
            return np.where(arr < 0, n + arr, arr)

        # Any other object exposing a bool dtype.
        if hasattr(row_key, "dtype") and row_key.dtype == bool:
            return np.where(row_key)[0].astype(np.int64)

        return None

    def _capture_before_state(self, key, ctx):
        """
        Capture values of watched columns before an assignment.

        Returns:
            ``(before, affected_cols)``. ``before`` maps each affected column
            to a dict with the ``rids``, ``values`` and ``positions`` of the
            targeted rows. ``(None, None)`` is returned when no watched
            column is affected, and ``(None, affected_cols)`` when the rows
            cannot be resolved.
        """
        col_key = key[1] if isinstance(key, tuple) and len(key) > 1 else None
        columns = self._parent_df.columns

        # Determine affected columns by position.
        if col_key is None:
            affected_cols = list(ctx.watched_columns & set(columns))
        elif isinstance(col_key, (int, np.integer)):
            col_name = columns[col_key]
            affected_cols = [col_name] if col_name in ctx.watched_columns else []
        elif isinstance(col_key, (list, np.ndarray)):
            col_names = [columns[i] for i in col_key]
            affected_cols = [c for c in col_names if c in ctx.watched_columns]
        elif isinstance(col_key, slice):
            col_names = columns[col_key].tolist()
            affected_cols = [c for c in col_names if c in ctx.watched_columns]
        else:
            affected_cols = []

        if not affected_cols:
            return None, None

        positions = self._derive_iloc_positions(key, self._parent_df)
        if positions is None:
            return None, affected_cols

        rids = ctx.row_manager.get_ids_array(self._parent_df)
        if rids is None:
            return None, affected_cols

        before = {}
        for col in affected_cols:
            before[col] = {
                "rids": rids[positions].copy(),
                "values": self._parent_df[col].values[positions].copy(),
                "positions": positions.copy(),
            }

        return before, affected_cols

    def _capture_iloc_setitem(self, key, value, before_values, affected_cols, ctx) -> None:
        """Record a ``DataFrame.iloc[]=`` step and one diff per changed cell."""
        if before_values is None or not affected_cols:
            return

        store = ctx.store

        code_file, code_line = get_caller_info(skip_frames=4)
        step_id = store.append_step(
            operation="DataFrame.iloc[]=",
            stage=ctx.current_stage,
            code_file=code_file,
            code_line=code_line,
            params={"columns": affected_cols[:3]},
            input_shape=self._parent_df.shape,
            output_shape=self._parent_df.shape,
            completeness=CompletenessLevel.FULL,
        )

        from ..utils.value_capture import values_equal

        for col in affected_cols:
            if col not in before_values:
                continue

            snapshot = before_values[col]
            rids = snapshot["rids"]
            old_vals = snapshot["values"]
            # Read the after-state from the same positions captured before.
            new_vals = self._parent_df[col].values[snapshot["positions"]]

            for rid, old_val, new_val in zip(rids, old_vals, new_vals):
                if not values_equal(old_val, new_val):
                    store.append_diff(
                        step_id=step_id,
                        row_id=int(rid),
                        col=col,
                        old_val=old_val,
                        new_val=new_val,
                        change_type=ChangeType.MODIFIED,
                    )
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
# Getter functions of the original DataFrame.loc/iloc/at/iat properties.
# They stay None until instrument_indexers() saves them, and are reset to
# None by uninstrument_indexers() once the originals have been restored.
_original_loc = _original_iloc = _original_at = _original_iat = None
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
class TrackedAtIndexer:
    """
    Wrapper around pandas ``_AtIndexer`` that captures scalar assignments.

    ``.at`` is optimized for scalar access by label. Reads pass straight
    through; writes to a watched column are recorded as a step plus one diff.
    """

    def __init__(self, indexer, df):
        # indexer: the real pandas at indexer; df: the frame it belongs to.
        self._indexer = indexer
        self._df = df

    def __getitem__(self, key):
        """Pass scalar reads through untouched."""
        return self._indexer[key]

    def __setitem__(self, key, value) -> None:
        """
        Capture scalar assignment via ``at[row, col] = value``.

        The assignment itself always runs. Lineage failures are re-raised
        only in strict mode and otherwise reported as a ``TracePipeWarning``,
        matching the loc/iloc wrappers.
        """
        ctx = get_context()
        if not ctx or not ctx.enabled:
            self._indexer[key] = value
            return

        should_track = False
        old_val = None
        row_label = col = col_str = None
        try:
            row_label, col = key
            col_str = str(col)

            # Only assignments to watched columns are recorded.
            should_track = col_str in ctx.watched_columns if ctx.watched_columns else False

            # Capture before state; a missing row/column simply has no old value.
            if should_track:
                try:
                    old_val = self._df.at[row_label, col]
                except (KeyError, IndexError):
                    pass
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: at[]= capture failed: {e}", TracePipeWarning)
            should_track = False

        # Execute original.
        self._indexer[key] = value

        # Capture after state.
        if not (should_track and ctx.store):
            return

        try:
            from .pandas_inst import get_caller_info

            code_file, code_line = get_caller_info(skip_frames=2)
            step_id = ctx.store.append_step(
                operation="DataFrame.at[]=",
                stage=ctx.current_stage,
                code_file=code_file,
                code_line=code_line,
                params={"row": str(row_label), "col": col_str},
                input_shape=self._df.shape,
                output_shape=self._df.shape,
            )

            # Resolve the row ID for this label; an unresolvable label is skipped.
            try:
                row_pos = self._df.index.get_loc(row_label)
                # get_loc may return a slice or mask for non-unique labels;
                # only a single integer position maps to one row ID.
                if isinstance(row_pos, int):
                    rids = ctx.row_manager.get_ids_array(self._df)
                    if rids is None:
                        ctx.row_manager.register(self._df)
                        rids = ctx.row_manager.get_ids_array(self._df)
                    if rids is not None and row_pos < len(rids):
                        row_id = int(rids[row_pos])
                        ctx.store.append_diff(
                            step_id=step_id,
                            row_id=row_id,
                            col=col_str,
                            old_val=old_val,
                            new_val=value,
                            change_type=ChangeType.MODIFIED,
                        )
            except (KeyError, TypeError):
                pass
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: at[]= capture failed: {e}", TracePipeWarning)
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
class TrackedIAtIndexer:
    """
    Wrapper around pandas ``_iAtIndexer`` that captures scalar assignments.

    ``.iat`` is optimized for scalar access by integer position. Reads pass
    straight through; writes to a watched column are recorded as a step plus
    one diff.
    """

    def __init__(self, indexer, df):
        # indexer: the real pandas iat indexer; df: the frame it belongs to.
        self._indexer = indexer
        self._df = df

    def __getitem__(self, key):
        """Pass scalar reads through untouched."""
        return self._indexer[key]

    def __setitem__(self, key, value) -> None:
        """
        Capture scalar assignment via ``iat[row, col] = value``.

        The assignment itself always runs. Lineage failures are re-raised
        only in strict mode and otherwise reported as a ``TracePipeWarning``,
        matching the loc/iloc wrappers.
        """
        ctx = get_context()
        if not ctx or not ctx.enabled:
            self._indexer[key] = value
            return

        should_track = False
        old_val = None
        row_pos = col_pos = col_str = None
        try:
            row_pos, col_pos = key
            # Translate the column position into its label; an out-of-range
            # position falls back to its string form.
            col_str = self._df.columns[col_pos] if col_pos < len(self._df.columns) else str(col_pos)

            # Only assignments to watched columns are recorded.
            should_track = col_str in ctx.watched_columns if ctx.watched_columns else False

            # Capture before state; an invalid position simply has no old value.
            if should_track:
                try:
                    old_val = self._df.iat[row_pos, col_pos]
                except (KeyError, IndexError):
                    pass
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: iat[]= capture failed: {e}", TracePipeWarning)
            should_track = False

        # Execute original.
        self._indexer[key] = value

        # Capture after state.
        if not (should_track and ctx.store):
            return

        try:
            from .pandas_inst import get_caller_info

            code_file, code_line = get_caller_info(skip_frames=2)
            step_id = ctx.store.append_step(
                operation="DataFrame.iat[]=",
                stage=ctx.current_stage,
                code_file=code_file,
                code_line=code_line,
                params={"row": row_pos, "col": col_str},
                input_shape=self._df.shape,
                output_shape=self._df.shape,
            )

            # Resolve the row ID for this position; failures here are skipped.
            try:
                rids = ctx.row_manager.get_ids_array(self._df)
                if rids is None:
                    ctx.row_manager.register(self._df)
                    rids = ctx.row_manager.get_ids_array(self._df)
                if rids is not None and row_pos < len(rids):
                    row_id = int(rids[row_pos])
                    ctx.store.append_diff(
                        step_id=step_id,
                        row_id=row_id,
                        col=col_str,
                        old_val=old_val,
                        new_val=value,
                        change_type=ChangeType.MODIFIED,
                    )
            except (KeyError, TypeError):
                pass
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: iat[]= capture failed: {e}", TracePipeWarning)
|
|
757
|
+
|
|
758
|
+
|
|
759
|
+
def instrument_indexers():
    """
    Install the tracked indexers for loc, iloc, at and iat.

    Replaces the ``DataFrame.loc``, ``DataFrame.iloc``, ``DataFrame.at`` and
    ``DataFrame.iat`` properties with versions that wrap the original
    indexer in the matching ``Tracked*Indexer``. The original getters are
    kept in the module-level ``_original_*`` slots. Calling this again while
    already instrumented is a no-op.
    """
    global _original_loc, _original_iloc, _original_at, _original_iat

    # A saved loc getter means the wrappers are already in place.
    if _original_loc is not None:
        return

    frame_cls = pd.DataFrame

    # Remember the getter behind each original property.
    _original_loc = frame_cls.loc.fget
    _original_iloc = frame_cls.iloc.fget
    _original_at = frame_cls.at.fget
    _original_iat = frame_cls.iat.fget

    def tracked_loc(self):
        return TrackedLocIndexer(_original_loc(self), self)

    def tracked_iloc(self):
        return TrackedILocIndexer(_original_iloc(self), self)

    def tracked_at(self):
        return TrackedAtIndexer(_original_at(self), self)

    def tracked_iat(self):
        return TrackedIAtIndexer(_original_iat(self), self)

    # Swap the class-level properties for the tracked versions.
    frame_cls.loc = property(tracked_loc)
    frame_cls.iloc = property(tracked_iloc)
    frame_cls.at = property(tracked_at)
    frame_cls.iat = property(tracked_iat)
|
|
796
|
+
|
|
797
|
+
|
|
798
|
+
def uninstrument_indexers():
    """
    Restore the original loc/iloc/at/iat properties on DataFrame.

    Each accessor whose original getter was saved is re-installed as a
    property built from that getter, and its module-level slot is reset to
    None. Accessors with an empty slot are left untouched.
    """
    module_state = globals()

    for accessor in ("loc", "iloc", "at", "iat"):
        slot = f"_original_{accessor}"
        saved_getter = module_state[slot]
        if saved_getter is None:
            continue
        setattr(pd.DataFrame, accessor, property(saved_getter))
        module_state[slot] = None
|