tracepipe-0.2.0-py3-none-any.whl → tracepipe-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,812 @@
+ # tracepipe/convenience.py
+ """
+ Convenience API: The friendly face of TracePipe.
+
+ Six functions cover ~90% of use cases:
+     enable() - Start tracking
+     check()  - Health audit
+     why()    - Cell provenance ("why is this null?")
+     trace()  - Row journey ("what happened to this row?")
+     find()   - Row lookup (row IDs matching a selector)
+     report() - HTML export
+
+ All functions use df-first signatures for consistency:
+     tp.check(df)
+     tp.trace(df, row=5)
+     tp.why(df, col="amount", row=5)
+     tp.why(df, col="amount", where={"customer_id": "C123"})
+
+ Power users: use tp.debug.inspect(), tp.contracts.contract() directly.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Any, Callable
+
+ import numpy as np
+ import pandas as pd
+
+ from .context import get_context
+ from .core import TracePipeMode
+
+ # ============ RESULT TYPES ============
+
+
+ @dataclass
+ class CheckWarning:
+     """A single warning from check()."""
+
+     category: str  # "merge_expansion", "retention", "duplicate_keys", etc.
+     severity: str  # "fact" (measured) or "heuristic" (inferred)
+     message: str
+     details: dict[str, Any] = field(default_factory=dict)
+     fix_hint: str | None = None
+
+     def __repr__(self) -> str:
+         icon = "[!]" if self.severity == "fact" else "[?]"
+         return f"{icon} [{self.category}] {self.message}"
+
+
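+ # Usage sketch (illustrative; shows how severity drives the repr icon):
+ #
+ #     w = CheckWarning(category="retention", severity="fact",
+ #                      message="Retention is 40.0% (below 50%)")
+ #     repr(w)  # -> "[!] [retention] Retention is 40.0% (below 50%)"
+
+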
+ @dataclass
+ class CheckResult:
+     """
+     Result of check() - pipeline health audit.
+
+     Separates FACTS (observed, high confidence) from HEURISTICS (inferred).
+     .ok is True only if there are no FACT-level warnings.
+     """
+
+     ok: bool
+     warnings: list[CheckWarning]
+     facts: dict[str, Any]
+     suggestions: list[str]
+     mode: str
+
+     @property
+     def has_warnings(self) -> bool:
+         return len(self.warnings) > 0
+
+     @property
+     def fact_warnings(self) -> list[CheckWarning]:
+         return [w for w in self.warnings if w.severity == "fact"]
+
+     @property
+     def heuristic_warnings(self) -> list[CheckWarning]:
+         return [w for w in self.warnings if w.severity == "heuristic"]
+
+     def raise_if_failed(self) -> CheckResult:
+         """Raise CheckFailed if any FACT warnings (for CI). Returns self for chaining."""
+         if not self.ok:
+             raise CheckFailed(self.fact_warnings)
+         return self
+
+     def __repr__(self) -> str:
+         return self.to_text(verbose=False)
+
+     def to_text(self, verbose: bool = True) -> str:
+         """Format as text. Use verbose=True for full details."""
+         lines = []
+         status = "[OK] Pipeline healthy" if self.ok else "[WARN] Issues detected"
+         lines.append(f"TracePipe Check: {status}")
+         lines.append(f"  Mode: {self.mode}")
+
+         if verbose and self.facts:
+             lines.append("\n  Measured facts:")
+             for k, v in self.facts.items():
+                 lines.append(f"    {k}: {v}")
+
+         if self.fact_warnings:
+             lines.append("\n  Issues (confirmed):")
+             for w in self.fact_warnings:
+                 lines.append(f"    [!] {w.message}")
+                 if verbose and w.fix_hint:
+                     lines.append(f"        -> {w.fix_hint}")
+
+         if self.heuristic_warnings:
+             lines.append("\n  Suggestions (possible issues):")
+             for w in self.heuristic_warnings:
+                 lines.append(f"    [?] {w.message}")
+                 if verbose and w.fix_hint:
+                     lines.append(f"        -> {w.fix_hint}")
+
+         return "\n".join(lines)
+
+     def to_dict(self) -> dict:
+         """Export to dictionary."""
+         return {
+             "ok": self.ok,
+             "mode": self.mode,
+             "facts": self.facts,
+             "suggestions": self.suggestions,
+             "warnings": [
+                 {
+                     "category": w.category,
+                     "severity": w.severity,
+                     "message": w.message,
+                     "details": w.details,
+                     "fix_hint": w.fix_hint,
+                 }
+                 for w in self.warnings
+             ],
+         }
+
+
+ class CheckFailed(Exception):
+     """Raised by CheckResult.raise_if_failed()."""
+
+     def __init__(self, warnings: list[CheckWarning]):
+         self.warnings = warnings
+         messages = [w.message for w in warnings]
+         super().__init__(f"Check failed: {'; '.join(messages)}")
+
+
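+ # Usage sketch (illustrative; assumes ``import tracepipe as tp`` and a
+ # tracked DataFrame ``df``; tp.check() is defined further down):
+ #
+ #     result = tp.check(df)
+ #     print(result.to_text(verbose=True))  # facts + confirmed issues + hints
+ #     try:
+ #         result.raise_if_failed()         # CI gate: FACT warnings -> CheckFailed
+ #     except CheckFailed as err:
+ #         for w in err.warnings:
+ #             print(repr(w))
+
+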
+ @dataclass
+ class TraceResult:
+     """
+     Result of trace() - row journey.
+
+     Answers: "What happened to this row?"
+     Events are in CHRONOLOGICAL order (oldest->newest).
+     """
+
+     row_id: int
+     is_alive: bool
+     dropped_at: dict[str, Any] | None = None
+     merge_origin: dict[str, Any] | None = None
+     events: list[dict[str, Any]] = field(default_factory=list)
+     ghost_values: dict[str, Any] | None = None
+     # Mode enforcement
+     supported: bool = True
+     unsupported_reason: str | None = None
+
+     @property
+     def n_events(self) -> int:
+         return len(self.events)
+
+     def to_dict(self) -> dict:
+         """Export to dictionary."""
+         return {
+             "row_id": self.row_id,
+             "is_alive": self.is_alive,
+             "dropped_at": self.dropped_at,
+             "merge_origin": self.merge_origin,
+             "n_events": self.n_events,
+             "events": self.events,
+             "ghost_values": self.ghost_values,
+             "supported": self.supported,
+         }
+
+     def __repr__(self) -> str:
+         return self.to_text(verbose=False)
+
+     def to_text(self, verbose: bool = True) -> str:
+         """Format as text. Use verbose=True for full details."""
+         if not self.supported:
+             return f"TraceResult: {self.unsupported_reason}"
+
+         lines = [f"Row {self.row_id} Journey:"]
+
+         if self.is_alive:
+             lines.append("  Status: [OK] Alive")
+         else:
+             lines.append("  Status: [X] Dropped")
+             if self.dropped_at:
+                 lines.append(
+                     f"    at step {self.dropped_at['step_id']}: {self.dropped_at['operation']}"
+                 )
+
+         if self.merge_origin:
+             left = self.merge_origin.get("left_parent", "?")
+             right = self.merge_origin.get("right_parent", "?")
+             lines.append(f"  Origin: merge of row {left} (left) + row {right} (right)")
+
+         if len(self.events) == 0:
+             lines.append("\n  Events: 0 (no changes to watched columns)")
+         else:
+             lines.append(f"\n  Events: {len(self.events)}")
+             event_limit = 10 if verbose else 5
+             for event in self.events[-event_limit:]:
+                 change = event.get("change_type", "?")
+                 op = event.get("operation", "?")
+                 col = event.get("col", "")
+                 if col and col != "__row__":
+                     lines.append(f"    [{change}] {op}: {col}")
+                 else:
+                     lines.append(f"    [{change}] {op}")
+
+         if self.ghost_values:
+             lines.append("\n  Last known values:")
+             limit = 10 if verbose else 5
+             for k, v in list(self.ghost_values.items())[:limit]:
+                 lines.append(f"    {k}: {v}")
+
+         return "\n".join(lines)
+
+
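+ # Usage sketch (illustrative; ``res`` stands for a TraceResult returned by
+ # tp.trace(), defined below):
+ #
+ #     if not res.is_alive and res.dropped_at:
+ #         print(f"dropped by {res.dropped_at['operation']}")
+ #     print(res.to_text(verbose=True))  # full journey, oldest -> newest
+
+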
+ @dataclass
+ class WhyResult:
+     """
+     Result of why() - cell provenance.
+
+     Answers: "Why does this cell have this value?"
+     History is stored in CHRONOLOGICAL order (oldest->newest).
+     """
+
+     row_id: int
+     column: str
+     current_value: Any = None
+     history: list[dict[str, Any]] = field(default_factory=list)
+     became_null_at: dict[str, Any] | None = None
+     # Mode enforcement
+     supported: bool = True
+     unsupported_reason: str | None = None
+     # Value tracking
+     _current_value_known: bool = False
+
+     @property
+     def n_changes(self) -> int:
+         return len(self.history)
+
+     @property
+     def root_cause(self) -> dict[str, Any] | None:
+         """The first change (oldest)."""
+         return self.history[0] if self.history else None
+
+     @property
+     def latest_change(self) -> dict[str, Any] | None:
+         """The most recent change."""
+         return self.history[-1] if self.history else None
+
+     def to_dict(self) -> dict:
+         """JSON-serializable dict representation."""
+         return {
+             "row_id": self.row_id,
+             "column": self.column,
+             "current_value": self.current_value,
+             "n_changes": self.n_changes,
+             "history": self.history,
+             "became_null_at": self.became_null_at,
+             "supported": self.supported,
+         }
+
+     def __repr__(self) -> str:
+         return self.to_text(verbose=False)
+
+     def to_text(self, verbose: bool = True) -> str:
+         """Format as text. Use verbose=True for full details."""
+         if not self.supported:
+             return f"WhyResult: {self.unsupported_reason}"
+
+         lines = [f"Cell History: row {self.row_id}, column '{self.column}'"]
+
+         if self._current_value_known:
+             lines.append(f"  Current value: {self.current_value}")
+         else:
+             lines.append("  Current value: (provide df to see)")
+
+         if self.became_null_at:
+             # Check if null was later recovered
+             is_still_null = pd.isna(self.current_value) if self._current_value_known else True
+             if is_still_null:
+                 lines.append(f"  [!] Became null at step {self.became_null_at['step_id']}")
+             else:
+                 lines.append(
+                     f"  [i] Was null at step {self.became_null_at['step_id']} (later recovered)"
+                 )
+             lines.append(f"      by: {self.became_null_at['operation']}")
+
+         if self.history:
+             lines.append(f"\n  History ({len(self.history)} changes, most recent first):")
+             event_limit = 10 if verbose else 5
+             for event in reversed(self.history[-event_limit:]):
+                 old = event.get("old_val", "?")
+                 new = event.get("new_val", "?")
+                 op = event.get("operation", "?")
+                 loc = event.get("code_location", "")
+                 lines.append(f"    {old} -> {new}")
+                 lines.append(f"      by: {op}")
+                 if verbose and loc:
+                     lines.append(f"      at: {loc}")
+         else:
+             lines.append("\n  No changes tracked (original value)")
+
+         return "\n".join(lines)
+
+
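+ # Usage sketch (illustrative; ``res`` stands for a WhyResult returned by
+ # tp.why(), defined below):
+ #
+ #     if res.supported and res.history:
+ #         print(res.root_cause)     # oldest change: where the value came from
+ #         print(res.latest_change)  # most recent change
+ #     print(res.to_text(verbose=True))
+
+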
+ # ============ CONVENIENCE FUNCTIONS ============
+
+ # Default thresholds
+ _DEFAULT_MERGE_EXPANSION_THRESHOLD = 1.5
+ _DEFAULT_RETENTION_THRESHOLD = 0.5
+
+
+ def check(
+     df: pd.DataFrame,
+     *,
+     merge_expansion_threshold: float | None = None,
+     retention_threshold: float | None = None,
+ ) -> CheckResult:
+     """
+     Run health check on pipeline.
+
+     Args:
+         df: DataFrame to check (used for additional validation)
+         merge_expansion_threshold: Flag merges expanding beyond this ratio
+         retention_threshold: Flag if retention drops below this
+
+     Returns:
+         CheckResult with .ok, .warnings, .facts, .suggestions
+         Use print(result) for pretty output, result.to_dict() for data.
+
+     Examples:
+         result = tp.check(df)
+         print(result)  # Pretty output
+         result.raise_if_failed()  # For CI
+     """
+     ctx = get_context()
+     warnings_list: list[CheckWarning] = []
+     facts: dict[str, Any] = {}
+     suggestions: list[str] = []
+
+     merge_threshold_source = "user" if merge_expansion_threshold is not None else "default"
+     retention_threshold_source = "user" if retention_threshold is not None else "default"
+
+     # Fill defaults only when the caller passed nothing (0 is a valid threshold)
+     if merge_expansion_threshold is None:
+         merge_expansion_threshold = _DEFAULT_MERGE_EXPANSION_THRESHOLD
+     if retention_threshold is None:
+         retention_threshold = _DEFAULT_RETENTION_THRESHOLD
+
+     # === FACTS ===
+     dropped = ctx.store.get_dropped_rows()
+     facts["rows_dropped"] = len(dropped)
+     facts["total_steps"] = len(ctx.store.steps)
+
+     # Merge statistics
+     merge_stats_list = ctx.store.get_merge_stats() if hasattr(ctx.store, "get_merge_stats") else []
+
+     for i, (step_id, stats) in enumerate(merge_stats_list):
+         facts[f"merge_{i}_expansion"] = stats.expansion_ratio
+         facts[f"merge_{i}_result_rows"] = stats.result_rows
+
+         if stats.expansion_ratio > merge_expansion_threshold:
+             severity = "fact" if merge_threshold_source == "user" else "heuristic"
+             warnings_list.append(
+                 CheckWarning(
+                     category="merge_expansion",
+                     severity=severity,
+                     message=f"Merge expanded {stats.expansion_ratio:.2f}x "
+                     f"({stats.left_rows} x {stats.right_rows} -> {stats.result_rows})",
+                     details={
+                         "step_id": step_id,
+                         "expansion": stats.expansion_ratio,
+                         "how": stats.how,
+                     },
+                     fix_hint="Check for duplicate keys in join columns",
+                 )
+             )
+
+         if stats.left_dup_rate > 0.01:
+             warnings_list.append(
+                 CheckWarning(
+                     category="duplicate_keys",
+                     severity="fact",
+                     message=f"Left table has {stats.left_dup_rate:.1%} duplicate join keys",
+                     details={"step_id": step_id, "dup_rate": stats.left_dup_rate},
+                 )
+             )
+         if stats.right_dup_rate > 0.01:
+             warnings_list.append(
+                 CheckWarning(
+                     category="duplicate_keys",
+                     severity="fact",
+                     message=f"Right table has {stats.right_dup_rate:.1%} duplicate join keys",
+                     details={"step_id": step_id, "dup_rate": stats.right_dup_rate},
+                 )
+             )
+
+     # Retention rate - use max rows seen to handle multi-table pipelines
+     if ctx.store.steps:
+         max_rows_seen = 0
+         for step in ctx.store.steps:
+             # input_shape can be a single shape tuple (rows, cols) or
+             # a tuple of shapes for merge operations ((left_rows, cols), (right_rows, cols))
+             if step.input_shape:
+                 shape = step.input_shape
+                 if isinstance(shape[0], tuple):
+                     # Multiple inputs (e.g., merge) - take max of all inputs
+                     for s in shape:
+                         if isinstance(s, tuple) and len(s) >= 1:
+                             max_rows_seen = max(max_rows_seen, s[0])
+                 elif isinstance(shape[0], int):
+                     max_rows_seen = max(max_rows_seen, shape[0])
+
+             if step.output_shape and isinstance(step.output_shape[0], int):
+                 max_rows_seen = max(max_rows_seen, step.output_shape[0])
+
+         if max_rows_seen > 0:
+             current = len(df)
+             retention = current / max_rows_seen
+             facts["retention_rate"] = round(retention, 4)
+
+             if retention < retention_threshold:
+                 severity = "fact" if retention_threshold_source == "user" else "heuristic"
+                 warnings_list.append(
+                     CheckWarning(
+                         category="retention",
+                         severity=severity,
+                         message=f"Retention is {retention:.1%} (below {retention_threshold:.0%})",
+                         details={
+                             "retention": retention,
+                             "max_rows_seen": max_rows_seen,
+                             "current": current,
+                         },
+                         fix_hint="Review filter operations",
+                     )
+                 )
+
+     # === HEURISTICS ===
+     for step_id, stats in merge_stats_list:
+         if stats.how == "left" and stats.expansion_ratio > 1.0:
+             warnings_list.append(
+                 CheckWarning(
+                     category="possible_unintended_expansion",
+                     severity="heuristic",
+                     message=f"Left join expanded {stats.expansion_ratio:.2f}x - was 1:1 expected?",
+                     details={"step_id": step_id},
+                     fix_hint="If 1:1 was intended, use validate='1:1' in merge()",
+                 )
+             )
+
+     drops_by_step = ctx.store.get_dropped_by_step()
+     for op, count in drops_by_step.items():
+         if count > 1000:
+             suggestions.append(f"'{op}' dropped {count} rows - review if intentional")
+
+     ok = len([w for w in warnings_list if w.severity == "fact"]) == 0
+
+     return CheckResult(
+         ok=ok,
+         warnings=warnings_list,
+         facts=facts,
+         suggestions=suggestions,
+         mode=ctx.config.mode.value,
+     )
+
+
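+ # Usage sketch (illustrative; assumes ``import tracepipe as tp`` and a
+ # tracked ``df``). Passing a threshold explicitly upgrades that signal from
+ # "heuristic" to "fact", which is what makes raise_if_failed() trip in CI:
+ #
+ #     tp.check(df, retention_threshold=0.8).raise_if_failed()
+
+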
+ def trace(
+     df: pd.DataFrame,
+     *,
+     row: int | None = None,
+     where: dict[str, Any] | None = None,
+     include_ghost: bool = True,
+ ) -> TraceResult | list[TraceResult]:
+     """
+     Trace a row's journey through the pipeline.
+
+     Args:
+         df: DataFrame to search in
+         row: Row ID (if known)
+         where: Selector dict, e.g. {"customer_id": "C123"}
+         include_ghost: Include last-known values for dropped rows
+
+     Returns:
+         TraceResult (single row) or list[TraceResult] (if where matches multiple)
+         Use print(result) for pretty output, result.to_dict() for data.
+
+     Examples:
+         result = tp.trace(df, row=5)
+         print(result)
+         tp.trace(df, where={"customer_id": "C123"})
+     """
+     ctx = get_context()
+
+     # No mode enforcement needed here: even CI mode (no cell history)
+     # still supports basic trace via drop tracking.
+
+     # Resolve row IDs
+     if row is not None:
+         row_ids = [row]
+     elif where is not None:
+         row_ids = _resolve_where(df, where, ctx)
+     else:
+         raise ValueError("Must provide 'row' or 'where'")
+
+     results = []
+     for rid in row_ids:
+         result = _build_trace_result(rid, ctx, include_ghost)
+         results.append(result)
+
+     return results[0] if len(results) == 1 else results
+
+
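+ # Usage sketch (illustrative; assumes ``import tracepipe as tp``). Note the
+ # return type: a bare TraceResult for one match, a list for several:
+ #
+ #     hits = tp.trace(df, where={"customer_id": "C123"})
+ #     for res in (hits if isinstance(hits, list) else [hits]):
+ #         print(res)
+
+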
+ def why(
+     df: pd.DataFrame,
+     *,
+     col: str,
+     row: int | None = None,
+     where: dict[str, Any] | None = None,
+ ) -> WhyResult | list[WhyResult]:
+     """
+     Explain why a cell has its current value.
+
+     Args:
+         df: DataFrame to search in
+         col: Column name to trace
+         row: Row ID (if known)
+         where: Selector dict, e.g. {"customer_id": "C123"}
+
+     Returns:
+         WhyResult (single row) or list[WhyResult] (if where matches multiple)
+         Use print(result) for pretty output, result.to_dict() for data.
+
+     Examples:
+         result = tp.why(df, col="amount", row=5)
+         print(result)
+         tp.why(df, col="email", where={"user_id": "U123"})
+     """
+     ctx = get_context()
+
+     # Mode enforcement
+     if ctx.config.mode == TracePipeMode.CI and not ctx.config.should_capture_cell_history:
+         return WhyResult(
+             row_id=-1,
+             column=col,
+             supported=False,
+             unsupported_reason="Cell history requires mode='debug' or cell_history=True",
+         )
+
+     # Resolve row IDs
+     if row is not None:
+         row_ids = [row]
+     elif where is not None:
+         row_ids = _resolve_where(df, where, ctx)
+     else:
+         raise ValueError("Must provide 'row' or 'where'")
+
+     results = []
+     for rid in row_ids:
+         result = _build_why_result(df, rid, col, ctx)
+         results.append(result)
+
+     return results[0] if len(results) == 1 else results
+
+
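+ # Usage sketch (illustrative; assumes ``import tracepipe as tp``). In CI mode
+ # why() returns an unsupported stub rather than raising, so check the flag:
+ #
+ #     res = tp.why(df, col="amount", row=5)
+ #     if res.supported:
+ #         print(res)
+ #     else:
+ #         print(res.unsupported_reason)
+
+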
+ def report(
+     df: pd.DataFrame,
+     path: str = "tracepipe_report.html",
+     *,
+     title: str = "TracePipe Report",
+ ) -> str:
+     """
+     Generate HTML report.
+
+     Args:
+         df: Final DataFrame
+         path: Output path
+         title: Report title
+
+     Returns:
+         Path to saved report
+     """
+     try:
+         from .visualization.html_export import save as _save
+
+         _save(path, title=title)
+     except ImportError:
+         # Fallback if visualization module can't be imported
+         ctx = get_context()
+         html_content = f"""<!DOCTYPE html>
+ <html>
+ <head><title>{title}</title></head>
+ <body>
+ <h1>{title}</h1>
+ <p>Mode: {ctx.config.mode.value}</p>
+ <p>Steps: {len(ctx.store.steps)}</p>
+ <p>Rows dropped: {len(ctx.store.get_dropped_rows())}</p>
+ <p>DataFrame shape: {df.shape}</p>
+ </body>
+ </html>"""
+         with open(path, "w") as f:
+             f.write(html_content)
+
+     print(f"Report saved to: {path}")
+     return path
+
+
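+ # Usage sketch (illustrative; assumes ``import tracepipe as tp``):
+ #
+ #     out = tp.report(df, "audit.html", title="Nightly ETL audit")
+ #     # prints "Report saved to: audit.html"; returns the path for CI artifacts
+
+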
+ def find(
+     df: pd.DataFrame,
+     *,
+     where: dict[str, Any] | None = None,
+     predicate: Callable[[pd.DataFrame], pd.Series] | None = None,
+     limit: int = 10,
+ ) -> list[int]:
+     """
+     Find row IDs matching a selector.
+
+     Args:
+         df: DataFrame to search
+         where: Exact match selector
+         predicate: Vector predicate (df -> boolean Series)
+         limit: Maximum number of IDs to return
+
+     Returns:
+         List of row IDs
+
+     Examples:
+         rids = tp.find(df, where={"status": "failed"})
+         tp.trace(df, row=rids[0])
+     """
+     ctx = get_context()
+
+     if where:
+         row_ids = _resolve_where(df, where, ctx, limit=limit)
+     elif predicate:
+         row_ids = _resolve_predicate(df, predicate, ctx, limit=limit)
+     else:
+         raise ValueError("Must provide 'where' or 'predicate'")
+
+     return row_ids
+
+
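+ # Usage sketch (illustrative; assumes ``import tracepipe as tp``). The
+ # predicate form takes any vectorized condition returning a boolean Series:
+ #
+ #     rids = tp.find(df, predicate=lambda d: d["amount"] > 1000, limit=5)
+ #     for rid in rids:
+ #         print(tp.why(df, col="amount", row=rid))
+
+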
+ # ============ HELPERS ============
+
+
+ def _json_safe(val: Any) -> Any:
+     """Convert value to JSON-serializable form."""
+     # Arrays first: pd.isna() on an ndarray returns an array, whose truth
+     # value is ambiguous in the if below.
+     if isinstance(val, np.ndarray):
+         return val.tolist()
+     if pd.isna(val):
+         return None
+     if isinstance(val, (np.integer, np.floating)):
+         return val.item()
+     return val
+
+
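+ # Illustrative behavior (sketch, not a test suite):
+ #
+ #     _json_safe(np.int64(3))        # -> 3 (plain int)
+ #     _json_safe(np.float64("nan"))  # -> None
+ #     _json_safe(np.array([1, 2]))   # -> [1, 2]
+
+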
+ def _resolve_where(
+     df: pd.DataFrame,
+     where: dict[str, Any],
+     ctx,
+     limit: int | None = None,
+ ) -> list[int]:
+     """Resolve row IDs from where dict selector."""
+     rids = ctx.row_manager.get_ids_array(df)
+     if rids is None:
+         raise ValueError("DataFrame not tracked by TracePipe")
+
+     mask = np.ones(len(df), dtype=bool)
+     for col, val in where.items():
+         if col not in df.columns:
+             raise ValueError(f"Column '{col}' not in DataFrame")
+
+         series = df[col]
+         if isinstance(val, (list, tuple)):
+             col_mask = series.isin(val).to_numpy()
+         elif pd.isna(val):
+             col_mask = series.isna().to_numpy()
+         else:
+             col_mask = series.eq(val).to_numpy()
+         mask &= col_mask
+
+     matched_positions = np.where(mask)[0]
+     if len(matched_positions) == 0:
+         raise ValueError(f"No rows matched where={where}")
+
+     if limit:
+         matched_positions = matched_positions[:limit]
+
+     return rids[matched_positions].tolist()
+
+
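+ # Selector semantics (illustrative):
+ #
+ #     {"status": "failed"}             # equality (Series.eq)
+ #     {"status": ["failed", "error"]}  # membership (Series.isin)
+ #     {"email": None}                  # null match (Series.isna)
+
+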
+ def _resolve_predicate(
+     df: pd.DataFrame,
+     predicate: Callable[[pd.DataFrame], pd.Series],
+     ctx,
+     limit: int | None = None,
+ ) -> list[int]:
+     """Resolve row IDs from predicate function."""
+     rids = ctx.row_manager.get_ids_array(df)
+     if rids is None:
+         raise ValueError("DataFrame not tracked by TracePipe")
+
+     mask_series = predicate(df)
+     if not isinstance(mask_series, pd.Series):
+         raise TypeError("predicate must return pd.Series")
+     if mask_series.dtype != bool:
+         raise TypeError("predicate must return boolean Series")
+
+     mask = mask_series.to_numpy()
+     matched_positions = np.where(mask)[0]
+
+     if len(matched_positions) == 0:
+         raise ValueError("No rows matched predicate")
+
+     if limit:
+         matched_positions = matched_positions[:limit]
+
+     return rids[matched_positions].tolist()
+
+
+ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
+     """Build TraceResult for a single row."""
+     store = ctx.store
+
+     drop_event = store.get_drop_event(row_id)
+     history = store.get_row_history(row_id)
+     merge_origin = store.get_merge_origin(row_id)
+
+     dropped_at = None
+     if drop_event:
+         dropped_at = {
+             "step_id": drop_event.get("step_id"),
+             "operation": drop_event.get("operation"),
+         }
+
+     ghost_values = None
+     if include_ghost and drop_event is not None:
+         ghost_df = ctx.row_manager.get_ghost_rows(limit=10000)
+         if not ghost_df.empty and "__tp_row_id__" in ghost_df.columns:
+             ghost_row = ghost_df[ghost_df["__tp_row_id__"] == row_id]
+             if not ghost_row.empty:
+                 ghost_values = ghost_row.iloc[0].to_dict()
+                 tp_cols = [
+                     "__tp_row_id__",
+                     "__tp_dropped_by__",
+                     "__tp_dropped_step__",
+                     "__tp_original_position__",
+                 ]
+                 for col in tp_cols:
+                     ghost_values.pop(col, None)
+
+     return TraceResult(
+         row_id=row_id,
+         is_alive=drop_event is None,
+         dropped_at=dropped_at,
+         merge_origin=merge_origin,
+         events=history,
+         ghost_values=ghost_values,
+     )
+
+
+ def _build_why_result(df: pd.DataFrame, row_id: int, col: str, ctx) -> WhyResult:
+     """Build WhyResult for a single cell."""
+     from .value_provenance import explain_value
+
+     history_obj = explain_value(row_id, col, df)
+
+     current_value = None
+     current_value_known = False
+     rids = ctx.row_manager.get_ids_array(df)
+     if rids is not None:
+         pos = np.where(rids == row_id)[0]
+         if len(pos) > 0 and col in df.columns:
+             current_value = df.iloc[pos[0]][col]
+             current_value_known = True
+
+     became_null_at = None
+     if history_obj.became_null_at:
+         became_null_at = {
+             "step_id": history_obj.became_null_at,
+             "operation": history_obj.became_null_by,
+         }
+
+     result = WhyResult(
+         row_id=row_id,
+         column=col,
+         current_value=_json_safe(current_value),
+         history=[
+             {
+                 "step_id": e.step_id,
+                 "operation": e.operation,
+                 "old_val": _json_safe(e.old_value),
+                 "new_val": _json_safe(e.new_value),
+                 "change_type": e.change_type,
+                 "code_location": e.code_location,
+             }
+             for e in history_obj.events
+         ],
+         became_null_at=became_null_at,
+     )
+     result._current_value_known = current_value_known
+     return result