tracepipe 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,468 @@
1
+ # tracepipe/instrumentation/filter_capture.py
2
+ """
3
+ Mask-first filter capture with PARTIAL fallback.
4
+
5
+ Operation Matrix:
6
+ | Operation | Mask Derivation | Completeness |
7
+ |------------------|------------------------------------------|--------------|
8
+ | dropna | ~df[subset].isna().any(axis=1) | FULL |
9
+ | drop_duplicates | ~df.duplicated(subset, keep) | FULL |
10
+ | query (simple) | df.eval(expr) | FULL |
11
+ | query (complex) | fallback | PARTIAL |
12
+ | head(n) | positions [0:n] | FULL |
13
+ | tail(n) | positions [-n:] | FULL |
14
+ | sample | result.index positions | FULL |
15
+ | __getitem__[mask]| mask directly available | FULL |
16
+ | other | index-label fallback | PARTIAL |
17
+ """
18
+
19
+ import warnings
20
+ from functools import wraps
21
+ from typing import Optional
22
+
23
+ import numpy as np
24
+ import pandas as pd
25
+
26
+ from ..context import TracePipeContext, get_context
27
+ from ..core import CompletenessLevel
28
+ from ..safety import TracePipeWarning, get_caller_info
29
+
30
+ # ============ MASK DERIVATION FUNCTIONS ============
31
+
32
+
33
def derive_dropna_mask(
    df: pd.DataFrame, args: tuple, kwargs: dict
) -> tuple[np.ndarray, CompletenessLevel]:
    """
    Derive the kept-row mask for DataFrame.dropna.

    Returns (mask, completeness) where mask[i] is True when source row i
    survives the call.  Guards against missing columns in ``subset`` and
    against column-axis drops (which never remove rows).
    """
    axis = kwargs.get("axis", 0)
    if axis != 0 and axis != "index":
        # Column-wise dropna keeps every row.
        return np.ones(len(df), dtype=bool), CompletenessLevel.FULL

    how = kwargs.get("how", "any")
    subset = kwargs.get("subset", None)
    thresh = kwargs.get("thresh", None)

    completeness = CompletenessLevel.FULL

    # Guard missing columns
    if subset is not None:
        # pandas accepts a single label for subset; normalize to a list so
        # we don't iterate a string character-by-character below.
        if isinstance(subset, str) or not hasattr(subset, "__iter__"):
            subset = [subset]
        valid_cols = [c for c in subset if c in df.columns]
        if not valid_cols:
            # No valid columns to check - keep all rows
            return np.ones(len(df), dtype=bool), CompletenessLevel.PARTIAL
        if len(valid_cols) != len(subset):
            # Some columns missing - still compute but mark partial
            completeness = CompletenessLevel.PARTIAL
        check_df = df[valid_cols]
    else:
        check_df = df

    if thresh is not None:
        # thresh takes precedence: keep rows with at least `thresh` non-NA values.
        kept_mask = check_df.notna().sum(axis=1) >= thresh
    elif how == "any":
        kept_mask = ~check_df.isna().any(axis=1)
    else:
        kept_mask = ~check_df.isna().all(axis=1)

    return kept_mask.values, completeness
74
+
75
+
76
def derive_drop_duplicates_mask(
    df: pd.DataFrame, args: tuple, kwargs: dict
) -> tuple[np.ndarray, CompletenessLevel]:
    """
    Derive the kept-row mask for DataFrame.drop_duplicates.

    Accepts ``subset`` positionally (pandas allows ``drop_duplicates(cols)``)
    or by keyword, tolerates a scalar column label, and guards against
    missing columns (marked PARTIAL).
    """
    # subset may arrive positionally: drop_duplicates(["a", "b"])
    subset = args[0] if args else kwargs.get("subset", None)
    keep = kwargs.get("keep", "first")

    completeness = CompletenessLevel.FULL

    # Guard missing columns
    if subset is not None:
        # Normalize a scalar label so we don't iterate a string's characters.
        if isinstance(subset, str) or not hasattr(subset, "__iter__"):
            subset = [subset]
        valid_cols = [c for c in subset if c in df.columns]
        if not valid_cols:
            return np.ones(len(df), dtype=bool), CompletenessLevel.PARTIAL
        if len(valid_cols) != len(subset):
            completeness = CompletenessLevel.PARTIAL
        subset = valid_cols

    kept_mask = ~df.duplicated(subset=subset, keep=keep)
    return kept_mask.values, completeness
98
+
99
+
100
def derive_query_mask(
    df: pd.DataFrame, args: tuple, kwargs: dict
) -> tuple[Optional[np.ndarray], CompletenessLevel]:
    """
    Derive the kept-row mask for DataFrame.query.

    Only simple expressions under the default engine/parser are replayed
    via ``df.eval``.  Anything that could diverge - injected namespaces,
    '@' references, backtick column names, index references - falls back
    to (None, PARTIAL).
    """
    expr = args[0] if args else kwargs.get("expr", "")

    # Non-default engine/parser makes an eval replay unreliable.
    if kwargs.get("engine") == "python" or kwargs.get("parser") == "python":
        return None, CompletenessLevel.PARTIAL

    # Caller-supplied namespaces can change what the expression resolves to.
    if kwargs.get("local_dict") is not None or kwargs.get("global_dict") is not None:
        return None, CompletenessLevel.PARTIAL

    # Expressions touching these constructs cannot be replayed safely:
    # '@' (locals), '`' (backtick columns), index references, MultiIndex levels.
    for token in ("@", "`", "index", "Index", "level_"):
        if token in expr:
            return None, CompletenessLevel.PARTIAL

    try:
        mask = df.eval(expr)
    except Exception:
        return None, CompletenessLevel.PARTIAL

    if isinstance(mask, pd.Series) and mask.dtype == bool:
        return mask.values, CompletenessLevel.FULL
    return None, CompletenessLevel.PARTIAL
143
+
144
+
145
def derive_head_positions(df: pd.DataFrame, args: tuple, kwargs: dict) -> np.ndarray:
    """
    Derive kept positions for head(n).

    Mirrors pandas semantics: a negative n keeps all but the last |n| rows
    (``df.head(-2)`` == ``df.iloc[:-2]``).  The previous clamp-to-zero
    produced an empty mask for negative n, losing every kept row.
    """
    n = args[0] if args else kwargs.get("n", 5)
    count = max(0, len(df) + n) if n < 0 else min(n, len(df))
    return np.arange(count, dtype=np.int64)
150
+
151
+
152
def derive_tail_positions(df: pd.DataFrame, args: tuple, kwargs: dict) -> np.ndarray:
    """
    Derive kept positions for tail(n).

    Mirrors pandas semantics: a negative n keeps all but the first |n| rows
    (``df.tail(-2)`` == ``df.iloc[2:]``).  The previous clamp-to-zero
    produced an empty mask for negative n, losing every kept row.
    """
    n = args[0] if args else kwargs.get("n", 5)
    count = max(0, len(df) + n) if n < 0 else min(n, len(df))
    return np.arange(len(df) - count, len(df), dtype=np.int64)
157
+
158
+
159
def derive_sample_positions(
    source_df: pd.DataFrame, result_df: pd.DataFrame
) -> tuple[Optional[np.ndarray], CompletenessLevel]:
    """
    Derive kept positions for sample() by aligning the result's index
    labels back onto the source index.

    A source index with duplicate labels makes ``get_indexer`` ambiguous
    (it returns the first match), so that case - like any unmatched label
    or alignment failure - yields (None, PARTIAL).
    """
    # Ambiguous mapping when source labels repeat.
    if source_df.index.has_duplicates:
        return None, CompletenessLevel.PARTIAL

    try:
        pos = source_df.index.get_indexer(result_df.index)
    except Exception:
        return None, CompletenessLevel.PARTIAL

    # get_indexer marks labels it could not find with -1.
    if -1 in pos:
        return None, CompletenessLevel.PARTIAL
    return pos.astype(np.int64), CompletenessLevel.FULL
179
+
180
+
181
+ # ============ UNIFIED FILTER WRAPPER ============
182
+
183
+
184
def wrap_filter_method(method_name: str, original_method):
    """
    Create a wrapper for a row-filtering DataFrame method that performs
    mask-first capture.

    The original method always runs first; capture failures never break
    the user's pipeline unless ``strict_mode`` is enabled.
    """

    @wraps(original_method)
    def wrapper(self, *args, **kwargs):
        ctx = get_context()

        # Increment filter depth so internal loc/iloc calls made while the
        # original method executes don't create extra steps.  Remember
        # whether we incremented in a local: if user code toggles
        # ctx.enabled during the call, re-checking it in `finally` would
        # leak or underflow the counter.
        bumped = ctx.enabled
        if bumped:
            ctx._filter_op_depth += 1

        try:
            # === ALWAYS RUN ORIGINAL FIRST ===
            result = original_method(self, *args, **kwargs)
        finally:
            if bumped:
                ctx._filter_op_depth -= 1

        if not ctx.enabled:
            return result

        # Only DataFrame results represent a row filter we can track.
        if not isinstance(result, pd.DataFrame):
            return result

        try:
            _capture_filter_with_mask(
                source_df=self,
                result_df=result,
                method_name=method_name,
                args=args,
                kwargs=kwargs,
                ctx=ctx,
            )
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(
                f"TracePipe: Filter capture failed for {method_name}: {e}",
                TracePipeWarning,
            )

        return result

    return wrapper
230
+
231
+
232
def _capture_filter_with_mask(
    source_df: pd.DataFrame,
    result_df: pd.DataFrame,
    method_name: str,
    args: tuple,
    kwargs: dict,
    ctx: TracePipeContext,
) -> None:
    """
    Core filter capture logic with mask-first design.

    Pipeline:
      1. derive a kept mask (or kept positions) for known operations;
      2. propagate row IDs from source to result accordingly;
      3. compute dropped IDs, record the step, record drops/ghost values.

    Unknown operations fall back to index-label matching and are marked
    PARTIAL.
    """
    row_mgr = ctx.row_manager
    store = ctx.store

    # Ensure source is registered so it has row IDs to propagate.
    source_rids = row_mgr.get_ids_array(source_df)
    if source_rids is None:
        row_mgr.register(source_df)
        source_rids = row_mgr.get_ids_array(source_df)
        if source_rids is None:
            return

    n_before = len(source_df)

    # === DERIVE MASK/POSITIONS ===
    kept_mask: Optional[np.ndarray] = None
    positions: Optional[np.ndarray] = None
    completeness = CompletenessLevel.FULL

    if method_name == "dropna":
        kept_mask, completeness = derive_dropna_mask(source_df, args, kwargs)
    elif method_name == "drop_duplicates":
        kept_mask, completeness = derive_drop_duplicates_mask(source_df, args, kwargs)
    elif method_name == "query":
        kept_mask, completeness = derive_query_mask(source_df, args, kwargs)
    elif method_name == "head":
        positions = derive_head_positions(source_df, args, kwargs)
    elif method_name == "tail":
        positions = derive_tail_positions(source_df, args, kwargs)
    elif method_name == "sample":
        positions, completeness = derive_sample_positions(source_df, result_df)
    elif method_name == "__getitem__[mask]":
        if args and hasattr(args[0], "dtype") and args[0].dtype == bool:
            key = args[0]
            kept_mask = key.values if isinstance(key, pd.Series) else np.asarray(key)

    # === PROPAGATE RIDs ===
    # Maintain kept_mask explicitly in each branch to avoid confusion.
    kept_mask_final: Optional[np.ndarray] = None
    result_rids: Optional[np.ndarray] = None

    if kept_mask is not None:
        # Mask-derived branch: kept_mask is directly available.
        result_rids = row_mgr.propagate_by_mask(source_df, result_df, kept_mask)
        kept_mask_final = kept_mask

    elif positions is not None:
        # Position-derived branch: build kept_mask from positions.
        result_rids = row_mgr.propagate_by_positions(source_df, result_df, positions)
        kept_mask_final = np.zeros(n_before, dtype=bool)
        kept_mask_final[positions] = True

    else:
        # FALLBACK: index-label matching (mark as PARTIAL).
        # Positions aren't reliable here, so leave the mask undefined.
        completeness = CompletenessLevel.PARTIAL
        result_rids = _propagate_by_index_fallback(row_mgr, source_df, result_df)
        kept_mask_final = None  # Cannot reliably determine mask

    # === COMPUTE DROPPED ===
    if kept_mask_final is not None:
        # Reliable mask: accurate dropped IDs plus their source positions.
        dropped_rids, dropped_positions = row_mgr.compute_dropped_with_positions(
            source_rids, kept_mask_final
        )
    elif result_rids is not None:
        # Fallback: set difference (correct IDs, no position info).
        dropped_rids = row_mgr.compute_dropped_ids(source_rids, result_rids)
    else:
        dropped_rids = np.array([], dtype=np.int64)

    n_dropped = len(dropped_rids)

    # === RECORD STEP (ALWAYS - even if no rows dropped) ===
    code_file, code_line = get_caller_info(skip_frames=4)
    step_id = store.append_step(
        operation=f"DataFrame.{method_name}",
        stage=ctx.current_stage,
        code_file=code_file,
        code_line=code_line,
        params=_safe_filter_params(method_name, args, kwargs),
        input_shape=source_df.shape,
        output_shape=result_df.shape,
        completeness=completeness,
    )

    # === RECORD DROPS ===
    if n_dropped > 0:
        store.append_bulk_drops(step_id, dropped_rids)

    # Capture ghost values (debug mode)
    if kept_mask_final is not None:
        row_mgr.capture_ghost_values(
            source_df=source_df,
            dropped_mask=~kept_mask_final,
            dropped_by=f"DataFrame.{method_name}",
            step_id=step_id,
            watched_columns=ctx.watched_columns,
        )
    elif result_rids is not None and n_dropped > 0:
        # Fallback: derive dropped_mask from dropped_rids with a vectorized
        # membership test (was an O(n) Python-level set lookup per row).
        dropped_mask = np.isin(source_rids, dropped_rids)
        row_mgr.capture_ghost_values(
            source_df=source_df,
            dropped_mask=dropped_mask,
            dropped_by=f"DataFrame.{method_name}",
            step_id=step_id,
            watched_columns=ctx.watched_columns,
        )
361
+
362
+
363
def _propagate_by_index_fallback(
    row_mgr, source_df: pd.DataFrame, result_df: pd.DataFrame
) -> Optional[np.ndarray]:
    """
    Best-effort RID propagation via index-label alignment.

    Used when neither a mask nor positions could be derived; callers mark
    the resulting step PARTIAL.  Result rows whose labels are absent from
    the source get freshly minted RIDs.  Any alignment failure (e.g.
    duplicate labels) registers the result as entirely new rows.
    """
    source_ids = row_mgr.get_ids(source_df)
    if source_ids is None:
        return row_mgr.register(result_df)

    try:
        aligned = source_ids.reindex(result_df.index)

        missing = aligned.isna()
        if missing.any():
            # Mint brand-new RIDs for labels not found in the source.
            count = missing.sum()
            start = row_mgr._next_row_id
            fresh = np.arange(start, start + count, dtype=np.int64)
            row_mgr._next_row_id += count
            aligned.loc[missing] = fresh

        rids = aligned.values.astype(np.int64)
        row_mgr.set_result_rids(result_df, rids)
        return rids
    except Exception:
        return row_mgr.register(result_df)
390
+
391
+
392
+ def _safe_filter_params(method_name: str, args: tuple, kwargs: dict) -> dict:
393
+ """Extract safe params for step metadata."""
394
+ params = {}
395
+
396
+ if method_name == "dropna":
397
+ params["how"] = kwargs.get("how", "any")
398
+ params["subset"] = str(kwargs.get("subset", "all"))[:50]
399
+ params["thresh"] = kwargs.get("thresh")
400
+ elif method_name == "drop_duplicates":
401
+ params["keep"] = kwargs.get("keep", "first")
402
+ params["subset"] = str(kwargs.get("subset", "all"))[:50]
403
+ elif method_name == "query":
404
+ params["expr"] = str(args[0] if args else kwargs.get("expr", ""))[:100]
405
+ elif method_name in ("head", "tail"):
406
+ params["n"] = args[0] if args else kwargs.get("n", 5)
407
+ elif method_name == "sample":
408
+ params["n"] = kwargs.get("n")
409
+ params["frac"] = kwargs.get("frac")
410
+ params["random_state"] = kwargs.get("random_state")
411
+
412
+ return params
413
+
414
+
415
+ # ============ BOOLEAN INDEXING WRAPPER ============
416
+
417
+
418
def wrap_getitem_filter(original_getitem):
    """
    Wrap DataFrame.__getitem__ so boolean-mask indexing is captured as a
    filter step.  Non-mask keys and non-DataFrame results pass through
    untouched.
    """

    @wraps(original_getitem)
    def wrapper(self, key):
        ctx = get_context()

        result = original_getitem(self, key)

        if not ctx.enabled:
            return result

        # pandas may index internally while we're inside dropna/query/etc.;
        # skip those to avoid double-counting the same filter.
        if ctx._filter_op_depth > 0:
            return result

        # Only boolean masks that yield a DataFrame are row filters.
        if not isinstance(result, pd.DataFrame):
            return result

        # Decide whether `key` looks like a boolean mask.
        if isinstance(key, (pd.Series, np.ndarray)):
            is_boolean_mask = key.dtype == bool
        elif isinstance(key, list):
            is_boolean_mask = bool(key) and isinstance(key[0], bool)
        else:
            is_boolean_mask = False

        if not is_boolean_mask:
            return result

        try:
            _capture_filter_with_mask(
                source_df=self,
                result_df=result,
                method_name="__getitem__[mask]",
                args=(key,),
                kwargs={},
                ctx=ctx,
            )
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: Boolean indexing capture failed: {e}", TracePipeWarning)

        return result

    return wrapper