tracepipe 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +117 -78
- tracepipe/api.py +219 -332
- tracepipe/context.py +21 -1
- tracepipe/contracts.py +473 -0
- tracepipe/convenience.py +817 -0
- tracepipe/core.py +174 -17
- tracepipe/debug.py +325 -0
- tracepipe/instrumentation/apply_capture.py +453 -0
- tracepipe/instrumentation/filter_capture.py +468 -0
- tracepipe/instrumentation/indexer_capture.py +813 -0
- tracepipe/instrumentation/merge_capture.py +434 -0
- tracepipe/instrumentation/pandas_inst.py +66 -183
- tracepipe/instrumentation/series_capture.py +331 -0
- tracepipe/safety.py +3 -3
- tracepipe/snapshot.py +420 -0
- tracepipe/storage/base.py +7 -3
- tracepipe/storage/lineage_store.py +252 -47
- tracepipe/storage/row_identity.py +366 -104
- tracepipe/value_provenance.py +309 -0
- tracepipe/visualization/html_export.py +22 -7
- tracepipe-0.3.1.dist-info/METADATA +308 -0
- tracepipe-0.3.1.dist-info/RECORD +29 -0
- tracepipe-0.2.0.dist-info/METADATA +0 -508
- tracepipe-0.2.0.dist-info/RECORD +0 -19
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.1.dist-info}/WHEEL +0 -0
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.1.dist-info}/licenses/LICENSE +0 -0
tracepipe/convenience.py
ADDED
|
@@ -0,0 +1,817 @@
|
|
|
1
|
+
# tracepipe/convenience.py
|
|
2
|
+
"""
|
|
3
|
+
Convenience API: The friendly face of TracePipe.
|
|
4
|
+
|
|
5
|
+
5 functions for 90% of use cases:
|
|
6
|
+
enable() - Start tracking
|
|
7
|
+
check() - Health audit
|
|
8
|
+
why() - Cell provenance ("why is this null?")
|
|
9
|
+
trace() - Row journey ("what happened to this row?")
|
|
10
|
+
report() - HTML export
|
|
11
|
+
|
|
12
|
+
All functions use df-first signatures for consistency:
|
|
13
|
+
tp.check(df)
|
|
14
|
+
tp.trace(df, row=5)
|
|
15
|
+
tp.why(df, col="amount", row=5)
|
|
16
|
+
tp.why(df, col="amount", where={"customer_id": "C123"})
|
|
17
|
+
|
|
18
|
+
Power users: Use tp.debug.inspect(), tp.contracts.contract() directly.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from typing import Any, Callable
|
|
25
|
+
|
|
26
|
+
import numpy as np
|
|
27
|
+
import pandas as pd
|
|
28
|
+
|
|
29
|
+
from .context import get_context
|
|
30
|
+
from .core import TracePipeMode
|
|
31
|
+
|
|
32
|
+
# ============ RESULT TYPES ============
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class CheckWarning:
    """A single warning from check().

    severity distinguishes measured problems ("fact") from inferred
    ones ("heuristic"); __repr__ prefixes [!] or [?] accordingly.
    """

    category: str  # "merge_expansion", "retention", "duplicate_keys", etc.
    severity: str  # "fact" (measured) or "heuristic" (inferred)
    message: str
    details: dict[str, Any] = field(default_factory=dict)
    fix_hint: str | None = None

    def __repr__(self) -> str:
        if self.severity == "fact":
            marker = "[!]"
        else:
            marker = "[?]"
        return f"{marker} [{self.category}] {self.message}"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class CheckResult:
    """
    Result of check() - pipeline health audit.

    Separates FACTS (observed, high confidence) from HEURISTICS (inferred).
    .ok is True only if there are no FACT-level warnings.
    """

    ok: bool
    warnings: list[CheckWarning]
    facts: dict[str, Any]
    suggestions: list[str]
    mode: str

    @property
    def has_warnings(self) -> bool:
        return bool(self.warnings)

    @property
    def fact_warnings(self) -> list[CheckWarning]:
        return [warning for warning in self.warnings if warning.severity == "fact"]

    @property
    def heuristic_warnings(self) -> list[CheckWarning]:
        return [warning for warning in self.warnings if warning.severity == "heuristic"]

    def raise_if_failed(self) -> CheckResult:
        """Raise CheckFailed if any FACT warnings (for CI). Returns self for chaining."""
        if self.ok:
            return self
        raise CheckFailed(self.fact_warnings)

    def __repr__(self) -> str:
        return self.to_text(verbose=False)

    def to_text(self, verbose: bool = True) -> str:
        """Format as text. Use verbose=True for full details."""
        header = "[OK] Pipeline healthy" if self.ok else "[WARN] Issues detected"
        out = [f"TracePipe Check: {header}", f" Mode: {self.mode}"]

        if verbose and self.facts:
            out.append("\n Measured facts:")
            out.extend(f" {key}: {value}" for key, value in self.facts.items())

        confirmed = self.fact_warnings
        if confirmed:
            out.append("\n Issues (confirmed):")
            for warning in confirmed:
                out.append(f" [!] {warning.message}")
                if verbose and warning.fix_hint:
                    out.append(f" -> {warning.fix_hint}")

        possible = self.heuristic_warnings
        if possible:
            out.append("\n Suggestions (possible issues):")
            for warning in possible:
                out.append(f" [?] {warning.message}")
                if verbose and warning.fix_hint:
                    out.append(f" -> {warning.fix_hint}")

        return "\n".join(out)

    def to_dict(self) -> dict:
        """Export to dictionary."""
        return {
            "ok": self.ok,
            "mode": self.mode,
            "facts": self.facts,
            "suggestions": self.suggestions,
            "warnings": [
                {
                    "category": warning.category,
                    "severity": warning.severity,
                    "message": warning.message,
                    "details": warning.details,
                    "fix_hint": warning.fix_hint,
                }
                for warning in self.warnings
            ],
        }
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class CheckFailed(Exception):
    """Raised by CheckResult.raise_if_failed()."""

    def __init__(self, warnings: list[CheckWarning]):
        # Keep the structured warnings available to except-handlers.
        self.warnings = warnings
        joined = "; ".join(w.message for w in warnings)
        super().__init__(f"Check failed: {joined}")
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@dataclass
class TraceResult:
    """
    Result of trace() - row journey.

    Answers: "What happened to this row?"
    Events are in CHRONOLOGICAL order (oldest->newest).
    """

    row_id: int
    is_alive: bool
    dropped_at: dict[str, Any] | None = None
    merge_origin: dict[str, Any] | None = None
    events: list[dict[str, Any]] = field(default_factory=list)
    ghost_values: dict[str, Any] | None = None
    # Mode enforcement
    supported: bool = True
    unsupported_reason: str | None = None

    @property
    def n_events(self) -> int:
        return len(self.events)

    def to_dict(self) -> dict:
        """Export to dictionary."""
        return {
            "row_id": self.row_id,
            "is_alive": self.is_alive,
            "dropped_at": self.dropped_at,
            "merge_origin": self.merge_origin,
            "n_events": self.n_events,
            "events": self.events,
            "ghost_values": self.ghost_values,
            "supported": self.supported,
        }

    def __repr__(self) -> str:
        return self.to_text(verbose=False)

    def to_text(self, verbose: bool = True) -> str:
        """Format as text. Use verbose=True for full details."""
        if not self.supported:
            return f"TraceResult: {self.unsupported_reason}"

        out = [f"Row {self.row_id} Journey:"]

        if self.is_alive:
            out.append(" Status: [OK] Alive")
        else:
            out.append(" Status: [X] Dropped")
            if self.dropped_at:
                out.append(
                    f" at step {self.dropped_at['step_id']}: {self.dropped_at['operation']}"
                )

        if self.merge_origin:
            left = self.merge_origin.get("left_parent", "?")
            right = self.merge_origin.get("right_parent", "?")
            out.append(f" Origin: merge of row {left} (left) + row {right} (right)")

        if not self.events:
            out.append("\n Events: 0 (no changes to watched columns)")
        else:
            out.append(f"\n Events: {len(self.events)}")
            # Show only the tail of the journey; verbose doubles the cap.
            cap = 10 if verbose else 5
            for ev in self.events[-cap:]:
                kind = ev.get("change_type", "?")
                op = ev.get("operation", "?")
                column = ev.get("col", "")
                if column and column != "__row__":
                    out.append(f" [{kind}] {op}: {column}")
                else:
                    out.append(f" [{kind}] {op}")

        if self.ghost_values:
            out.append("\n Last known values:")
            cap = 10 if verbose else 5
            for key, value in list(self.ghost_values.items())[:cap]:
                out.append(f" {key}: {value}")

        return "\n".join(out)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
@dataclass
class WhyResult:
    """
    Result of why() - cell provenance.

    Answers: "Why does this cell have this value?"
    History is stored in CHRONOLOGICAL order (oldest->newest).
    """

    row_id: int
    column: str
    current_value: Any = None
    history: list[dict[str, Any]] = field(default_factory=list)
    became_null_at: dict[str, Any] | None = None
    # Mode enforcement
    supported: bool = True
    unsupported_reason: str | None = None
    # Value tracking
    _current_value_known: bool = False

    @property
    def n_changes(self) -> int:
        return len(self.history)

    @property
    def root_cause(self) -> dict[str, Any] | None:
        """The first change (oldest)."""
        if not self.history:
            return None
        return self.history[0]

    @property
    def latest_change(self) -> dict[str, Any] | None:
        """The most recent change."""
        if not self.history:
            return None
        return self.history[-1]

    def to_dict(self) -> dict:
        """JSON-serializable dict representation."""
        return {
            "row_id": self.row_id,
            "column": self.column,
            "current_value": self.current_value,
            "n_changes": self.n_changes,
            "history": self.history,
            "became_null_at": self.became_null_at,
            "supported": self.supported,
        }

    def __repr__(self) -> str:
        return self.to_text(verbose=False)

    def to_text(self, verbose: bool = True) -> str:
        """Format as text. Use verbose=True for full details."""
        if not self.supported:
            return f"WhyResult: {self.unsupported_reason}"

        out = [f"Cell History: row {self.row_id}, column '{self.column}'"]

        if self._current_value_known:
            out.append(f" Current value: {self.current_value}")
        else:
            out.append(" Current value: (provide df to see)")

        if self.became_null_at:
            # A later step may have filled the null back in; report which.
            still_null = True
            if self._current_value_known:
                still_null = pd.isna(self.current_value)
            if still_null:
                out.append(f" [!] Became null at step {self.became_null_at['step_id']}")
            else:
                out.append(
                    f" [i] Was null at step {self.became_null_at['step_id']} (later recovered)"
                )
            out.append(f" by: {self.became_null_at['operation']}")

        if not self.history:
            out.append("\n No changes tracked (original value)")
        else:
            out.append(f"\n History ({len(self.history)} changes, most recent first):")
            cap = 10 if verbose else 5
            for ev in reversed(self.history[-cap:]):
                out.append(f" {ev.get('old_val', '?')} -> {ev.get('new_val', '?')}")
                out.append(f" by: {ev.get('operation', '?')}")
                location = ev.get("code_location", "")
                if verbose and location:
                    out.append(f" at: {location}")

        return "\n".join(out)
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# ============ CONVENIENCE FUNCTIONS ============
|
|
319
|
+
|
|
320
|
+
# Default thresholds used by check() when the caller does not supply one.
# Merges whose output/input row ratio exceeds this are flagged.
_DEFAULT_MERGE_EXPANSION_THRESHOLD = 1.5
# check() warns when current rows / max rows seen falls below this fraction.
_DEFAULT_RETENTION_THRESHOLD = 0.5
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def check(
    df: pd.DataFrame,
    *,
    merge_expansion_threshold: float | None = None,
    retention_threshold: float | None = None,
) -> CheckResult:
    """
    Run health check on pipeline.

    Args:
        df: DataFrame to check (used for additional validation)
        merge_expansion_threshold: Flag merges expanding beyond this ratio
        retention_threshold: Flag if retention drops below this

    Returns:
        CheckResult with .ok, .warnings, .facts, .suggestions
        Use print(result) for pretty output, result.to_dict() for data.

    Examples:
        result = tp.check(df)
        print(result)  # Pretty output
        result.raise_if_failed()  # For CI
    """
    ctx = get_context()
    warnings_list: list[CheckWarning] = []
    facts: dict[str, Any] = {}
    suggestions: list[str] = []

    # Thresholds the caller set explicitly produce FACT-level warnings;
    # default thresholds only produce heuristics.
    merge_threshold_source = "user" if merge_expansion_threshold is not None else "default"
    retention_threshold_source = "user" if retention_threshold is not None else "default"

    # BUGFIX: use explicit None checks instead of `x or default` so an
    # explicit 0 / 0.0 threshold is honored rather than silently replaced
    # by the default (while still being attributed to the user).
    if merge_expansion_threshold is None:
        merge_expansion_threshold = _DEFAULT_MERGE_EXPANSION_THRESHOLD
    if retention_threshold is None:
        retention_threshold = _DEFAULT_RETENTION_THRESHOLD

    # === FACTS ===
    dropped = ctx.store.get_dropped_rows()
    facts["rows_dropped"] = len(dropped)
    facts["total_steps"] = len(ctx.store.steps)

    # Merge statistics (older stores may not implement get_merge_stats)
    merge_stats_list = ctx.store.get_merge_stats() if hasattr(ctx.store, "get_merge_stats") else []

    for i, (step_id, stats) in enumerate(merge_stats_list):
        facts[f"merge_{i}_expansion"] = stats.expansion_ratio
        facts[f"merge_{i}_result_rows"] = stats.result_rows

        if stats.expansion_ratio > merge_expansion_threshold:
            severity = "fact" if merge_threshold_source == "user" else "heuristic"
            warnings_list.append(
                CheckWarning(
                    category="merge_expansion",
                    severity=severity,
                    message=f"Merge expanded {stats.expansion_ratio:.2f}x "
                    f"({stats.left_rows} x {stats.right_rows} -> {stats.result_rows})",
                    details={
                        "step_id": step_id,
                        "expansion": stats.expansion_ratio,
                        "how": stats.how,
                    },
                    fix_hint="Check for duplicate keys in join columns",
                )
            )

        # Duplicate join keys are always a measured fact, not a heuristic.
        if stats.left_dup_rate > 0.01:
            warnings_list.append(
                CheckWarning(
                    category="duplicate_keys",
                    severity="fact",
                    message=f"Left table has {stats.left_dup_rate:.1%} duplicate join keys",
                    details={"step_id": step_id, "dup_rate": stats.left_dup_rate},
                )
            )
        if stats.right_dup_rate > 0.01:
            warnings_list.append(
                CheckWarning(
                    category="duplicate_keys",
                    severity="fact",
                    message=f"Right table has {stats.right_dup_rate:.1%} duplicate join keys",
                    details={"step_id": step_id, "dup_rate": stats.right_dup_rate},
                )
            )

    # Retention rate - use max rows seen to handle multi-table pipelines
    if ctx.store.steps:
        max_rows_seen = 0
        for step in ctx.store.steps:
            # input_shape can be a single shape tuple (rows, cols) or
            # a tuple of shapes for merge operations ((left_rows, cols), (right_rows, cols))
            if step.input_shape:
                shape = step.input_shape
                if isinstance(shape[0], tuple):
                    # Multiple inputs (e.g., merge) - take max of all inputs
                    for s in shape:
                        if isinstance(s, tuple) and len(s) >= 1:
                            max_rows_seen = max(max_rows_seen, s[0])
                elif isinstance(shape[0], int):
                    max_rows_seen = max(max_rows_seen, shape[0])

            if step.output_shape and isinstance(step.output_shape[0], int):
                max_rows_seen = max(max_rows_seen, step.output_shape[0])

        if max_rows_seen > 0:
            current = len(df)
            # max_rows_seen > 0 is guaranteed by the enclosing if, so the
            # division is safe without a redundant conditional.
            retention = current / max_rows_seen
            facts["retention_rate"] = round(retention, 4)

            if retention < retention_threshold:
                severity = "fact" if retention_threshold_source == "user" else "heuristic"
                warnings_list.append(
                    CheckWarning(
                        category="retention",
                        severity=severity,
                        message=f"Retention is {retention:.1%} (below {retention_threshold:.0%})",
                        details={
                            "retention": retention,
                            "max_rows_seen": max_rows_seen,
                            "current": current,
                        },
                        fix_hint="Review filter operations",
                    )
                )

    # === HEURISTICS ===
    for step_id, stats in merge_stats_list:
        if stats.how == "left" and stats.expansion_ratio > 1.0:
            warnings_list.append(
                CheckWarning(
                    category="possible_unintended_expansion",
                    severity="heuristic",
                    message=f"Left join expanded {stats.expansion_ratio:.2f}x - was 1:1 expected?",
                    details={"step_id": step_id},
                    fix_hint="If 1:1 was intended, use validate='1:1' in merge()",
                )
            )

    drops_by_step = ctx.store.get_dropped_by_step()
    for op, count in drops_by_step.items():
        if count > 1000:
            suggestions.append(f"'{op}' dropped {count} rows - review if intentional")

    # Healthy only when no FACT-level warnings exist.
    ok = not any(w.severity == "fact" for w in warnings_list)

    return CheckResult(
        ok=ok,
        warnings=warnings_list,
        facts=facts,
        suggestions=suggestions,
        mode=ctx.config.mode.value,
    )
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def trace(
    df: pd.DataFrame,
    *,
    row: int | None = None,
    where: dict[str, Any] | None = None,
    include_ghost: bool = True,
) -> TraceResult | list[TraceResult]:
    """
    Trace a row's journey through the pipeline.

    Args:
        df: DataFrame to search in
        row: Row ID (if known)
        where: Selector dict, e.g. {"customer_id": "C123"}
        include_ghost: Include last-known values for dropped rows

    Returns:
        TraceResult (single row) or List[TraceResult] (if where matches multiple)
        Use print(result) for pretty output, result.to_dict() for data.

    Examples:
        result = tp.trace(df, row=5)
        print(result)
        tp.trace(df, where={"customer_id": "C123"})
    """
    ctx = get_context()

    # Mode enforcement for deep lineage: CI mode still supports basic
    # trace (drop tracking), so this is deliberately a no-op.
    if ctx.config.mode == TracePipeMode.CI and not ctx.config.should_capture_cell_history:
        pass

    # Resolve which row ids to trace.
    if row is not None:
        target_ids = [row]
    elif where is not None:
        target_ids = _resolve_where(df, where, ctx)
    else:
        raise ValueError("Must provide 'row' or 'where'")

    traces = [_build_trace_result(rid, ctx, include_ghost) for rid in target_ids]
    return traces[0] if len(traces) == 1 else traces
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
def why(
    df: pd.DataFrame,
    *,
    col: str,
    row: int | None = None,
    where: dict[str, Any] | None = None,
) -> WhyResult | list[WhyResult]:
    """
    Explain why a cell has its current value.

    Args:
        df: DataFrame to search in
        col: Column name to trace
        row: Row ID (if known)
        where: Selector dict, e.g. {"customer_id": "C123"}

    Returns:
        WhyResult (single row) or List[WhyResult] (if where matches multiple)
        Use print(result) for pretty output, result.to_dict() for data.

    Examples:
        result = tp.why(df, col="amount", row=5)
        print(result)
        tp.why(df, col="email", where={"user_id": "U123"})
    """
    ctx = get_context()

    # Mode enforcement: cell-level history is only recorded in debug mode
    # (or when cell_history is explicitly enabled).
    if ctx.config.mode == TracePipeMode.CI and not ctx.config.should_capture_cell_history:
        return WhyResult(
            row_id=-1,
            column=col,
            supported=False,
            unsupported_reason="Cell history requires mode='debug' or cell_history=True",
        )

    # Resolve which row ids to explain.
    if row is not None:
        target_ids = [row]
    elif where is not None:
        target_ids = _resolve_where(df, where, ctx)
    else:
        raise ValueError("Must provide 'row' or 'where'")

    answers = [_build_why_result(df, rid, col, ctx) for rid in target_ids]
    return answers[0] if len(answers) == 1 else answers
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
def report(
    df: pd.DataFrame,
    path: str = "tracepipe_report.html",
    *,
    title: str = "TracePipe Report",
) -> str:
    """
    Generate HTML report.

    Args:
        df: Final DataFrame
        path: Output path
        title: Report title

    Returns:
        Path to saved report
    """
    try:
        from .visualization.html_export import save as _save

        _save(path, title=title)
    except ImportError:
        # Fallback if visualization module can't be imported: emit a
        # minimal summary page instead of failing outright.
        import html as _html

        ctx = get_context()
        # BUGFIX: escape the title so characters like & < > " cannot
        # break (or inject into) the generated HTML.
        safe_title = _html.escape(title)
        html_content = f"""<!DOCTYPE html>
<html>
<head><title>{safe_title}</title></head>
<body>
<h1>{safe_title}</h1>
<p>Mode: {ctx.config.mode.value}</p>
<p>Steps: {len(ctx.store.steps)}</p>
<p>Rows dropped: {len(ctx.store.get_dropped_rows())}</p>
<p>DataFrame shape: {df.shape}</p>
</body>
</html>"""
        # Explicit encoding so output is identical across platforms.
        with open(path, "w", encoding="utf-8") as f:
            f.write(html_content)

    print(f"Report saved to: {path}")
    return path
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def find(
    df: pd.DataFrame,
    *,
    where: dict[str, Any] | None = None,
    predicate: Callable[[pd.DataFrame], pd.Series] | None = None,
    limit: int = 10,
) -> list[int]:
    """
    Find row IDs matching a selector.

    Args:
        df: DataFrame to search
        where: Exact match selector
        predicate: Vector predicate (df -> boolean Series)
        limit: Maximum number of IDs to return

    Returns:
        List of row IDs

    Examples:
        rids = tp.find(df, where={"status": "failed"})
        tp.trace(df, row=rids[0])
    """
    ctx = get_context()

    # CONSISTENCY FIX: use `is not None` like trace()/why() do, so an
    # empty `where` dict selects rows (matches everything) instead of
    # being silently ignored by a truthiness test.
    if where is not None:
        row_ids = _resolve_where(df, where, ctx, limit=limit)
    elif predicate is not None:
        row_ids = _resolve_predicate(df, predicate, ctx, limit=limit)
    else:
        raise ValueError("Must provide 'where' or 'predicate'")

    return row_ids
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
# ============ HELPERS ============
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def _json_safe(val: Any) -> Any:
|
|
657
|
+
"""Convert value to JSON-serializable form."""
|
|
658
|
+
if pd.isna(val):
|
|
659
|
+
return None
|
|
660
|
+
if isinstance(val, (np.integer, np.floating)):
|
|
661
|
+
return val.item()
|
|
662
|
+
if isinstance(val, np.ndarray):
|
|
663
|
+
return val.tolist()
|
|
664
|
+
return val
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
def _resolve_where(
|
|
668
|
+
df: pd.DataFrame,
|
|
669
|
+
where: dict[str, Any],
|
|
670
|
+
ctx,
|
|
671
|
+
limit: int | None = None,
|
|
672
|
+
) -> list[int]:
|
|
673
|
+
"""Resolve row IDs from where dict selector."""
|
|
674
|
+
rids = ctx.row_manager.get_ids_array(df)
|
|
675
|
+
if rids is None:
|
|
676
|
+
raise ValueError("DataFrame not tracked by TracePipe")
|
|
677
|
+
|
|
678
|
+
mask = np.ones(len(df), dtype=bool)
|
|
679
|
+
for col, val in where.items():
|
|
680
|
+
if col not in df.columns:
|
|
681
|
+
raise ValueError(f"Column '{col}' not in DataFrame")
|
|
682
|
+
|
|
683
|
+
series = df[col]
|
|
684
|
+
if isinstance(val, (list, tuple)):
|
|
685
|
+
col_mask = series.isin(val).to_numpy()
|
|
686
|
+
elif pd.isna(val):
|
|
687
|
+
col_mask = series.isna().to_numpy()
|
|
688
|
+
else:
|
|
689
|
+
col_mask = series.eq(val).to_numpy()
|
|
690
|
+
mask &= col_mask
|
|
691
|
+
|
|
692
|
+
matched_positions = np.where(mask)[0]
|
|
693
|
+
if len(matched_positions) == 0:
|
|
694
|
+
raise ValueError(f"No rows matched where={where}")
|
|
695
|
+
|
|
696
|
+
if limit:
|
|
697
|
+
matched_positions = matched_positions[:limit]
|
|
698
|
+
|
|
699
|
+
return rids[matched_positions].tolist()
|
|
700
|
+
|
|
701
|
+
|
|
702
|
+
def _resolve_predicate(
|
|
703
|
+
df: pd.DataFrame,
|
|
704
|
+
predicate: Callable[[pd.DataFrame], pd.Series],
|
|
705
|
+
ctx,
|
|
706
|
+
limit: int | None = None,
|
|
707
|
+
) -> list[int]:
|
|
708
|
+
"""Resolve row IDs from predicate function."""
|
|
709
|
+
rids = ctx.row_manager.get_ids_array(df)
|
|
710
|
+
if rids is None:
|
|
711
|
+
raise ValueError("DataFrame not tracked by TracePipe")
|
|
712
|
+
|
|
713
|
+
mask_series = predicate(df)
|
|
714
|
+
if not isinstance(mask_series, pd.Series):
|
|
715
|
+
raise TypeError("predicate must return pd.Series")
|
|
716
|
+
if mask_series.dtype != bool:
|
|
717
|
+
raise TypeError("predicate must return boolean Series")
|
|
718
|
+
|
|
719
|
+
mask = mask_series.to_numpy()
|
|
720
|
+
matched_positions = np.where(mask)[0]
|
|
721
|
+
|
|
722
|
+
if len(matched_positions) == 0:
|
|
723
|
+
raise ValueError("No rows matched predicate")
|
|
724
|
+
|
|
725
|
+
if limit:
|
|
726
|
+
matched_positions = matched_positions[:limit]
|
|
727
|
+
|
|
728
|
+
return rids[matched_positions].tolist()
|
|
729
|
+
|
|
730
|
+
|
|
731
|
+
def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
    """Build TraceResult for a single row."""
    store = ctx.store

    drop_event = store.get_drop_event(row_id)
    merge_origin = store.get_merge_origin(row_id)

    # Prefer the lineage-aware history so pre-merge parent events show up.
    if hasattr(store, "get_row_history_with_lineage"):
        events = store.get_row_history_with_lineage(row_id)
    else:
        events = store.get_row_history(row_id)

    dropped_at = None
    if drop_event:
        dropped_at = {
            "step_id": drop_event.get("step_id"),
            "operation": drop_event.get("operation"),
        }

    ghost_values = None
    if include_ghost and drop_event is not None:
        ghost_df = ctx.row_manager.get_ghost_rows(limit=10000)
        if not ghost_df.empty and "__tp_row_id__" in ghost_df.columns:
            match = ghost_df[ghost_df["__tp_row_id__"] == row_id]
            if not match.empty:
                ghost_values = match.iloc[0].to_dict()
                # Strip TracePipe bookkeeping columns from the snapshot.
                for internal in (
                    "__tp_row_id__",
                    "__tp_dropped_by__",
                    "__tp_dropped_step__",
                    "__tp_original_position__",
                ):
                    ghost_values.pop(internal, None)

    return TraceResult(
        row_id=row_id,
        is_alive=drop_event is None,
        dropped_at=dropped_at,
        merge_origin=merge_origin,
        events=events,
        ghost_values=ghost_values,
    )
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
def _build_why_result(df: pd.DataFrame, row_id: int, col: str, ctx) -> WhyResult:
    """Build WhyResult for a single cell."""
    from .value_provenance import explain_value

    provenance = explain_value(row_id, col, df)

    # Look up the live value for this row id, if the row is still present.
    current_value = None
    value_known = False
    ids = ctx.row_manager.get_ids_array(df)
    if ids is not None:
        positions = np.where(ids == row_id)[0]
        if len(positions) > 0 and col in df.columns:
            current_value = df.iloc[positions[0]][col]
            value_known = True

    null_info = None
    if provenance.became_null_at:
        null_info = {
            "step_id": provenance.became_null_at,
            "operation": provenance.became_null_by,
        }

    change_log = [
        {
            "step_id": ev.step_id,
            "operation": ev.operation,
            "old_val": _json_safe(ev.old_value),
            "new_val": _json_safe(ev.new_value),
            "change_type": ev.change_type,
            "code_location": ev.code_location,
        }
        for ev in provenance.events
    ]

    result = WhyResult(
        row_id=row_id,
        column=col,
        current_value=_json_safe(current_value),
        history=change_log,
        became_null_at=null_info,
    )
    result._current_value_known = value_known
    return result
|