tracepipe 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,453 @@
1
+ # tracepipe/instrumentation/apply_capture.py
2
+ """
3
+ apply() and pipe() instrumentation for TracePipe.
4
+
5
+ Challenge: User functions are opaque - we can't see what they do internally.
6
+ Solution: Capture before/after state and mark as PARTIAL completeness.
7
+
8
+ Operations tracked:
9
+ | Pattern | Tracking | Completeness |
10
+ |------------------------------|-----------------------|--------------|
11
+ | df.apply(func, axis=0) | Column-wise diffs | PARTIAL |
12
+ | df.apply(func, axis=1) | Row-wise diffs | PARTIAL |
13
+ | df.pipe(func) | Before/after snapshot | PARTIAL |
14
+ | df.transform(func) | Before/after diffs | PARTIAL |
15
+ | df.assign(**kwargs) | Per-column diffs | FULL |
16
+
17
+ Key insight: We don't try to understand the function - we just diff the result.
18
+ """
19
+
20
+ import warnings
21
+ from functools import wraps
22
+ from typing import Any, Callable
23
+
24
+ import numpy as np
25
+ import pandas as pd
26
+
27
+ from ..context import get_context
28
+ from ..core import ChangeType, CompletenessLevel
29
+ from ..safety import TracePipeWarning, get_caller_info
30
+
31
+
32
def wrap_apply():
    """
    Wrap DataFrame.apply to capture before/after diffs.

    apply() is inherently PARTIAL because we can't see inside the function.
    However, we can still track:
    - Which rows/columns changed
    - What the before/after values were

    Side effects:
        Replaces ``pd.DataFrame.apply`` with a tracking wrapper and stores the
        previous implementation on ``pd.DataFrame._tp_original_apply``.

    This function is idempotent: if ``_tp_original_apply`` is already present
    on ``pd.DataFrame`` the call returns without doing anything.
    """
    if hasattr(pd.DataFrame, "_tp_original_apply"):
        # Already instrumented. Wrapping again would capture the tracking
        # wrapper as the "original", stacking wrappers (duplicate steps) and
        # overwriting the saved reference to the real pandas implementation.
        return

    original_apply = pd.DataFrame.apply

    @wraps(original_apply)
    def tracked_apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwargs):
        ctx = get_context()

        # Tracking disabled: behave exactly like the unwrapped method.
        if not ctx.enabled:
            return original_apply(self, func, axis, raw, result_type, args, **kwargs)

        # Capture before state for watched columns
        before_values = _capture_watched_state(self, ctx)
        source_rids = ctx.row_manager.get_ids_array(self)
        if source_rids is None:
            # Source frame has no row IDs yet: register it, then re-read them.
            ctx.row_manager.register(self)
            source_rids = ctx.row_manager.get_ids_array(self)

        # Run apply
        result = original_apply(self, func, axis, raw, result_type, args, **kwargs)

        # Capture is best-effort: a failure here must not lose the user's
        # result unless strict mode asks for the exception to propagate.
        try:
            _capture_apply_result(
                source_df=self,
                result=result,
                before_values=before_values,
                source_rids=source_rids,
                axis=axis,
                func=func,
                ctx=ctx,
            )
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: apply() capture failed: {e}", TracePipeWarning)

        return result

    pd.DataFrame.apply = tracked_apply
    pd.DataFrame._tp_original_apply = original_apply
79
+
80
+
81
def wrap_pipe():
    """
    Wrap DataFrame.pipe to capture before/after state.

    pipe() passes the entire DataFrame to a function - we track the transformation.

    Side effects:
        Replaces ``pd.DataFrame.pipe`` with a tracking wrapper and stores the
        previous implementation on ``pd.DataFrame._tp_original_pipe``.

    This function is idempotent: if ``_tp_original_pipe`` is already present
    on ``pd.DataFrame`` the call returns without doing anything.
    """
    if hasattr(pd.DataFrame, "_tp_original_pipe"):
        # Already instrumented. Wrapping again would capture the tracking
        # wrapper as the "original", stacking wrappers (duplicate steps) and
        # overwriting the saved reference to the real pandas implementation.
        return

    original_pipe = pd.DataFrame.pipe

    @wraps(original_pipe)
    def tracked_pipe(self, func, *args, **kwargs):
        ctx = get_context()

        # Tracking disabled: behave exactly like the unwrapped method.
        if not ctx.enabled:
            return original_pipe(self, func, *args, **kwargs)

        # Capture before state
        before_values = _capture_watched_state(self, ctx)
        source_rids = ctx.row_manager.get_ids_array(self)
        if source_rids is None:
            # Source frame has no row IDs yet: register it, then re-read them.
            ctx.row_manager.register(self)
            source_rids = ctx.row_manager.get_ids_array(self)

        # Run pipe
        result = original_pipe(self, func, *args, **kwargs)

        # Capture is best-effort: a failure here must not lose the user's
        # result unless strict mode asks for the exception to propagate.
        try:
            _capture_pipe_result(
                source_df=self,
                result=result,
                before_values=before_values,
                source_rids=source_rids,
                func=func,
                ctx=ctx,
            )
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: pipe() capture failed: {e}", TracePipeWarning)

        return result

    pd.DataFrame.pipe = tracked_pipe
    pd.DataFrame._tp_original_pipe = original_pipe
124
+
125
+
126
def wrap_transform():
    """
    Wrap DataFrame.transform for element-wise transformations.

    Side effects:
        Replaces ``pd.DataFrame.transform`` with a tracking wrapper and stores
        the previous implementation on ``pd.DataFrame._tp_original_transform``.

    This function is idempotent: if ``_tp_original_transform`` is already
    present on ``pd.DataFrame`` the call returns without doing anything.
    """
    if hasattr(pd.DataFrame, "_tp_original_transform"):
        # Already instrumented. Wrapping again would capture the tracking
        # wrapper as the "original", stacking wrappers (duplicate steps) and
        # overwriting the saved reference to the real pandas implementation.
        return

    original_transform = pd.DataFrame.transform

    @wraps(original_transform)
    def tracked_transform(self, func, axis=0, *args, **kwargs):
        ctx = get_context()

        # Tracking disabled: behave exactly like the unwrapped method.
        if not ctx.enabled:
            return original_transform(self, func, axis, *args, **kwargs)

        before_values = _capture_watched_state(self, ctx)
        source_rids = ctx.row_manager.get_ids_array(self)
        if source_rids is None:
            # Source frame has no row IDs yet: register it, then re-read them.
            ctx.row_manager.register(self)
            source_rids = ctx.row_manager.get_ids_array(self)

        result = original_transform(self, func, axis, *args, **kwargs)

        # Capture is best-effort: a failure here must not lose the user's
        # result unless strict mode asks for the exception to propagate.
        try:
            _capture_transform_result(
                source_df=self,
                result=result,
                before_values=before_values,
                source_rids=source_rids,
                func=func,
                ctx=ctx,
            )
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: transform() capture failed: {e}", TracePipeWarning)

        return result

    pd.DataFrame.transform = tracked_transform
    pd.DataFrame._tp_original_transform = original_transform
163
+
164
+
165
def wrap_assign():
    """
    Wrap DataFrame.assign for FULL completeness tracking.

    assign() is explicit about what columns are being created/modified,
    so we can track with FULL completeness.

    Side effects:
        Replaces ``pd.DataFrame.assign`` with a tracking wrapper and stores the
        previous implementation on ``pd.DataFrame._tp_original_assign``.

    This function is idempotent: if ``_tp_original_assign`` is already present
    on ``pd.DataFrame`` the call returns without doing anything.
    """
    if hasattr(pd.DataFrame, "_tp_original_assign"):
        # Already instrumented. Wrapping again would capture the tracking
        # wrapper as the "original", stacking wrappers (duplicate steps) and
        # overwriting the saved reference to the real pandas implementation.
        return

    original_assign = pd.DataFrame.assign

    @wraps(original_assign)
    def tracked_assign(self, **kwargs):
        ctx = get_context()

        # Tracking disabled: behave exactly like the unwrapped method.
        if not ctx.enabled:
            return original_assign(self, **kwargs)

        # Capture before state for columns being modified
        before_values = {}
        source_rids = ctx.row_manager.get_ids_array(self)
        if source_rids is None:
            # Source frame has no row IDs yet: register it, then re-read them.
            ctx.row_manager.register(self)
            source_rids = ctx.row_manager.get_ids_array(self)

        # Only watched columns that already exist have a "before" value;
        # brand-new columns have nothing to diff against.
        for col in kwargs:
            if col in ctx.watched_columns and col in self.columns:
                before_values[col] = self[col].values.copy()

        # Run assign
        result = original_assign(self, **kwargs)

        # Capture is best-effort: a failure here must not lose the user's
        # result unless strict mode asks for the exception to propagate.
        try:
            _capture_assign_result(
                source_df=self,
                result=result,
                before_values=before_values,
                source_rids=source_rids,
                assigned_cols=list(kwargs.keys()),
                ctx=ctx,
            )
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: assign() capture failed: {e}", TracePipeWarning)

        return result

    pd.DataFrame.assign = tracked_assign
    pd.DataFrame._tp_original_assign = original_assign
213
+
214
+
215
+ # ============ CAPTURE HELPERS ============
216
+
217
+
218
def _capture_watched_state(df: pd.DataFrame, ctx) -> dict:
    """
    Snapshot the watched columns that exist in ``df``.

    Returns a dict mapping each name in ``ctx.watched_columns`` that is also a
    column of ``df`` to a copy of that column's ``.values`` array, so later
    changes to ``df`` do not alter the snapshot.
    """
    return {
        name: df[name].values.copy()
        for name in ctx.watched_columns
        if name in df.columns
    }
225
+
226
+
227
def _capture_apply_result(
    source_df: pd.DataFrame,
    result: Any,
    before_values: dict,
    source_rids: np.ndarray,
    axis: int,
    func: Callable,
    ctx,
) -> None:
    """
    Record a PARTIAL-completeness step for an apply() call.

    - Series result: a step is appended and nothing else is tracked.
    - DataFrame result with the same number of rows as the source: the source
      row IDs are attached to the result and watched-column diffs are captured.
    - DataFrame result with a different number of rows: the result is
      registered for new row IDs and any IDs reported as dropped are stored.
    - Any other result type: nothing is recorded.
    """
    recorder = ctx.store
    rid_registry = ctx.row_manager

    # Must be called from this function directly: skip_frames counts frames.
    code_file, code_line = get_caller_info(skip_frames=4)

    func_name = getattr(func, "__name__", "anonymous")

    # Arguments common to every step recorded for this call.
    common = dict(
        operation=f"DataFrame.apply({func_name})",
        stage=ctx.current_stage,
        code_file=code_file,
        code_line=code_line,
        input_shape=source_df.shape,
        output_shape=getattr(result, "shape", None),
        completeness=CompletenessLevel.PARTIAL,
    )

    if isinstance(result, pd.Series):
        # Result is Series - aggregation or single column
        recorder.append_step(
            params={"axis": axis, "func": func_name, "result_type": "Series"},
            **common,
        )
        return

    if not isinstance(result, pd.DataFrame):
        return

    # Result is DataFrame - may have same or different shape
    step_id = recorder.append_step(
        params={"axis": axis, "func": func_name},
        **common,
    )

    if len(result) == len(source_df):
        # Same length: propagate RIDs, then diff the watched columns.
        rid_registry.set_result_rids(result, source_rids)
        _capture_column_diffs(result, before_values, source_rids, step_id, recorder)
        return

    # Different length - register new IDs, track as filter
    fresh_rids = rid_registry.register(result)
    if fresh_rids is None:
        return
    gone = rid_registry.compute_dropped_ids(source_rids, fresh_rids)
    if len(gone) > 0:
        recorder.append_bulk_drops(step_id, gone)
283
+
284
+
285
def _capture_pipe_result(
    source_df: pd.DataFrame,
    result: Any,
    before_values: dict,
    source_rids: np.ndarray,
    func: Callable,
    ctx,
) -> None:
    """
    Record a PARTIAL-completeness step for a pipe() call.

    - DataFrame result with the same number of rows as the source: the source
      row IDs are attached to the result and watched-column diffs are captured.
    - DataFrame result with a different number of rows: the result is
      registered for new row IDs and any IDs reported as dropped are stored.
    - Any other result (e.g. scalar, dict): a step is appended with
      ``output_shape=None`` and the result's type name in ``params``.

    ``func`` may be a callable or the ``(callable, data_keyword)`` tuple form
    that ``DataFrame.pipe`` accepts; in the tuple case the callable's name is
    used for the recorded operation instead of "anonymous".
    """
    store = ctx.store
    row_mgr = ctx.row_manager

    # Must be called from this function directly: skip_frames counts frames.
    code_file, code_line = get_caller_info(skip_frames=4)

    # pipe() also accepts (callable, data_keyword); name the callable itself.
    named_target = func[0] if isinstance(func, tuple) and func else func
    func_name = getattr(named_target, "__name__", "anonymous")

    if isinstance(result, pd.DataFrame):
        step_id = store.append_step(
            operation=f"DataFrame.pipe({func_name})",
            stage=ctx.current_stage,
            code_file=code_file,
            code_line=code_line,
            params={"func": func_name},
            input_shape=source_df.shape,
            output_shape=result.shape,
            completeness=CompletenessLevel.PARTIAL,
        )

        if len(result) == len(source_df):
            # Same length - preserve RIDs and track diffs
            row_mgr.set_result_rids(result, source_rids)
            _capture_column_diffs(result, before_values, source_rids, step_id, store)
        else:
            # Different length - treat as filter
            new_rids = row_mgr.register(result)
            if new_rids is not None:
                dropped = row_mgr.compute_dropped_ids(source_rids, new_rids)
                if len(dropped) > 0:
                    store.append_bulk_drops(step_id, dropped)
    else:
        # Non-DataFrame result (e.g., scalar, dict)
        store.append_step(
            operation=f"DataFrame.pipe({func_name})",
            stage=ctx.current_stage,
            code_file=code_file,
            code_line=code_line,
            params={"func": func_name, "result_type": type(result).__name__},
            input_shape=source_df.shape,
            output_shape=None,
            completeness=CompletenessLevel.PARTIAL,
        )
335
+
336
+
337
def _capture_transform_result(
    source_df: pd.DataFrame,
    result: pd.DataFrame,
    before_values: dict,
    source_rids: np.ndarray,
    func: Callable,
    ctx,
) -> None:
    """
    Record a PARTIAL-completeness step for a transform() call.

    Appends the step, attaches the source row IDs to ``result``, and captures
    diffs for the watched columns snapshotted in ``before_values``.
    """
    # Must be called from this function directly: skip_frames counts frames.
    code_file, code_line = get_caller_info(skip_frames=4)
    func_name = getattr(func, "__name__", "anonymous")

    recorder = ctx.store
    step_details = {
        "operation": f"DataFrame.transform({func_name})",
        "stage": ctx.current_stage,
        "code_file": code_file,
        "code_line": code_line,
        "params": {"func": func_name},
        "input_shape": source_df.shape,
        "output_shape": result.shape,
        "completeness": CompletenessLevel.PARTIAL,
    }
    step_id = recorder.append_step(**step_details)

    # transform() preserves shape, so the source RIDs carry over unchanged.
    ctx.row_manager.set_result_rids(result, source_rids)
    _capture_column_diffs(result, before_values, source_rids, step_id, recorder)
366
+
367
+
368
def _capture_assign_result(
    source_df: pd.DataFrame,
    result: pd.DataFrame,
    before_values: dict,
    source_rids: np.ndarray,
    assigned_cols: list,
    ctx,
) -> None:
    """
    Record a FULL-completeness step for an assign() call.

    Appends the step (listing at most the first five assigned column names in
    ``params``), attaches the source row IDs to ``result``, and captures diffs
    for the columns snapshotted in ``before_values``.
    """
    # Must be called from this function directly: skip_frames counts frames.
    code_file, code_line = get_caller_info(skip_frames=4)

    recorder = ctx.store
    step_details = {
        "operation": "DataFrame.assign",
        "stage": ctx.current_stage,
        "code_file": code_file,
        "code_line": code_line,
        "params": {"columns": assigned_cols[:5]},
        "input_shape": source_df.shape,
        "output_shape": result.shape,
        # assign() is explicit about which columns it touches.
        "completeness": CompletenessLevel.FULL,
    }
    step_id = recorder.append_step(**step_details)

    # assign() returns new DataFrame with same rows
    ctx.row_manager.set_result_rids(result, source_rids)
    _capture_column_diffs(result, before_values, source_rids, step_id, recorder)
396
+
397
+
398
def _capture_column_diffs(
    result_df: pd.DataFrame,
    before_values: dict,
    rids: np.ndarray,
    step_id: int,
    store,
) -> None:
    """
    Append one MODIFIED diff per changed cell of each snapshotted column.

    For every column in ``before_values`` that still exists in ``result_df``,
    the old and new values are compared with
    ``find_changed_indices_vectorized``; only the positions it flags are
    visited in the Python loop. Columns missing from the result, or with no
    flagged positions, are skipped.
    """
    from ..utils.value_capture import find_changed_indices_vectorized

    for col in before_values:
        if col not in result_df.columns:
            continue

        previous = before_values[col]
        current = result_df[col].values

        # Vectorized change detection over positionally aligned Series.
        changed_mask = find_changed_indices_vectorized(
            pd.Series(previous), pd.Series(current)
        )
        if not changed_mask.any():
            continue

        # Only loop over changed indices (typically small fraction)
        for pos in np.where(changed_mask)[0]:
            store.append_diff(
                step_id=step_id,
                row_id=int(rids[pos]),
                col=col,
                old_val=previous[pos],
                new_val=current[pos],
                change_type=ChangeType.MODIFIED,
            )
437
+
438
+
439
def instrument_apply_pipe():
    """Install the apply, pipe, transform and assign wrappers, in that order."""
    for install in (wrap_apply, wrap_pipe, wrap_transform, wrap_assign):
        install()
445
+
446
+
447
def uninstrument_apply_pipe():
    """
    Restore original apply/pipe/transform/assign behavior.

    For each method with a saved ``_tp_original_<method>`` attribute on
    ``pd.DataFrame``, the saved implementation is put back and the saved
    attribute is removed. Methods without a saved original are left untouched.
    """
    not_saved = object()  # sentinel distinguishing "no attribute" from any stored value
    for method in ("apply", "pipe", "transform", "assign"):
        backup_name = f"_tp_original_{method}"
        saved = getattr(pd.DataFrame, backup_name, not_saved)
        if saved is not_saved:
            continue
        setattr(pd.DataFrame, method, saved)
        delattr(pd.DataFrame, backup_name)