tracepipe 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tracepipe/safety.py CHANGED
@@ -129,7 +129,8 @@ def _make_wrapper(
         ctx._filter_op_depth -= 1
 
         # === CAPTURE LINEAGE (SIDE EFFECT) ===
-        if ctx.enabled:
+        # Skip capture if we're inside a filter operation (prevents recursion during export)
+        if ctx.enabled and ctx._filter_op_depth == 0:
             try:
                 if mode == "inplace" and kwargs.get("inplace", False):
                     if before_snapshot is not None:
@@ -141,8 +142,7 @@ def _make_wrapper(
             except Exception as e:
                 if ctx.config.strict_mode:
                     raise TracePipeError(
-                        f"Instrumentation failed for {method_name}: {e}\n"
-                        f"{traceback.format_exc()}"
+                        f"Instrumentation failed for {method_name}: {e}\n{traceback.format_exc()}"
                     ) from e
                 else:
                     warnings.warn(
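
The guard added above is a classic re-entrancy pattern: the wrapper increments a depth counter for the duration of the wrapped filter call and only captures lineage once the counter has returned to zero. A minimal standalone sketch of the idea (illustrative only; the real wrapper in tracepipe/safety.py carries snapshots, modes, and the error handling shown in the second hunk):

    # Re-entrancy guard sketch: run side effects only at the top level.
    class Ctx:
        enabled = True
        _filter_op_depth = 0

    ctx = Ctx()

    def traced_filter(rows: list) -> list:
        ctx._filter_op_depth += 1
        try:
            result = [r for r in rows if r is not None]  # stand-in for the wrapped pandas op
        finally:
            ctx._filter_op_depth -= 1  # runs before the capture check below
        # Nested calls (e.g. filters issued during export) still see depth > 0
        # here and skip capture, which is what breaks the recursion the new
        # comment in the diff describes.
        if ctx.enabled and ctx._filter_op_depth == 0:
            print("capturing lineage")
        return result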
tracepipe/snapshot.py ADDED
@@ -0,0 +1,420 @@
+# tracepipe/snapshot.py
+"""
+Pipeline state snapshots and diff functionality.
+
+Snapshots capture the current state of a pipeline for comparison,
+debugging, and regression testing.
+
+Features:
+- Row ID tracking (which rows are alive/dropped)
+- Column statistics (null rates, unique counts, min/max)
+- Watched column values (columnar storage for efficiency)
+- Cross-run comparison with summary-level diffing
+
+Usage:
+    # Capture state
+    snap = tp.snapshot(df)
+
+    # Save and load
+    snap.save("baseline.json")
+    baseline = Snapshot.load("baseline.json")
+
+    # Compare
+    result = tp.diff(baseline, current)
+"""
+
+import json
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Optional
+
+import numpy as np
+import pandas as pd
+
+from .context import get_context
+
+
+@dataclass
+class ColumnStats:
+    """Statistics for a single column."""
+
+    name: str
+    dtype: str
+    null_count: int
+    null_rate: float
+    unique_count: int
+    min_val: Any = None
+    max_val: Any = None
+    mean_val: Optional[float] = None
+
+
+@dataclass
+class WatchedColumnData:
+    """
+    Columnar storage for watched values (memory efficient).
+
+    Instead of Dict[int, Dict[str, Any]] which creates Python objects per row,
+    we store arrays that enable vectorized diffing and O(log n) lookup.
+    """
+
+    rids: np.ndarray  # Row IDs (sorted for binary search)
+    columns: list[str]  # Column names
+    values: dict[str, np.ndarray]  # col -> values array (aligned with rids)
+
+    def get_value(self, rid: int, col: str) -> Optional[Any]:
+        """Get value for a specific row/column (O(log n) lookup)."""
+        if col not in self.values:
+            return None
+        i = np.searchsorted(self.rids, rid)
+        if i < len(self.rids) and self.rids[i] == rid:
+            return self.values[col][i]
+        return None
+
+    def to_dict_view(self, limit: int = 1000) -> dict[int, dict[str, Any]]:
+        """Build dict view for small samples (for serialization)."""
+        result = {}
+        for i, rid in enumerate(self.rids[:limit]):
+            result[int(rid)] = {col: self.values[col][i] for col in self.columns}
+        return result
+
+
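
WatchedColumnData's docstring promises O(log n) lookup; that holds because capture() stores rids pre-sorted, so get_value can binary-search with np.searchsorted. A self-contained toy of the same lookup (column name and values invented for illustration):

    import numpy as np

    rids = np.array([3, 7, 12, 40])                      # sorted row IDs
    values = {"price": np.array([9.5, 1.0, 2.5, 7.0])}   # aligned with rids

    def get_value(rid, col):
        i = np.searchsorted(rids, rid)    # O(log n) binary search
        if i < len(rids) and rids[i] == rid:
            return values[col][i]
        return None                       # rid absent from this snapshot

    assert get_value(12, "price") == 2.5
    assert get_value(5, "price") is None  # 5 falls between 3 and 7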
82
+@dataclass
+class Snapshot:
+    """
+    Pipeline state snapshot.
+
+    Captures:
+    - Row IDs present
+    - Dropped row IDs with reasons
+    - Watched column values (columnar storage)
+    - Summary statistics
+    """
+
+    timestamp: float
+    row_ids: set[int]
+    dropped_ids: set[int]
+    drops_by_op: dict[str, int]
+    column_stats: dict[str, ColumnStats]
+    watched_data: Optional[WatchedColumnData]
+    total_steps: int
+    mode: str
+
+    @classmethod
+    def capture(cls, df: pd.DataFrame, include_values: bool = False) -> "Snapshot":
+        """
+        Capture current pipeline state.
+
+        Args:
+            df: Current DataFrame
+            include_values: If True, store watched column values (columnar)
+        """
+        ctx = get_context()
+
+        row_ids = set()
+        rids = ctx.row_manager.get_ids_array(df)
+        if rids is not None:
+            row_ids = set(rids.tolist())
+
+        dropped_ids = set(ctx.store.get_dropped_rows())
+        drops_by_op = ctx.store.get_dropped_by_step()
+
+        # Column stats
+        column_stats = {}
+        for col in df.columns:
+            if col.startswith("__tp"):
+                continue
+
+            null_count = df[col].isna().sum()
+            stats = ColumnStats(
+                name=col,
+                dtype=str(df[col].dtype),
+                null_count=int(null_count),
+                null_rate=null_count / len(df) if len(df) > 0 else 0,
+                unique_count=df[col].nunique(),
+            )
+
+            if pd.api.types.is_numeric_dtype(df[col]):
+                try:
+                    stats.min_val = float(df[col].min()) if not df[col].isna().all() else None
+                    stats.max_val = float(df[col].max()) if not df[col].isna().all() else None
+                    stats.mean_val = float(df[col].mean()) if not df[col].isna().all() else None
+                except (TypeError, ValueError):
+                    pass
+
+            column_stats[col] = stats
+
+        # Columnar watched values (memory efficient)
+        watched_data = None
+        if include_values and ctx.watched_columns and rids is not None:
+            cols = list(ctx.watched_columns & set(df.columns))
+            if cols:
+                # Sort RIDs for binary search
+                sort_idx = np.argsort(rids)
+                sorted_rids = rids[sort_idx]
+
+                # Extract values columnar (one array per column)
+                values = {}
+                for col in cols:
+                    values[col] = df[col].values[sort_idx]
+
+                watched_data = WatchedColumnData(
+                    rids=sorted_rids,
+                    columns=cols,
+                    values=values,
+                )
+
+        return cls(
+            timestamp=time.time(),
+            row_ids=row_ids,
+            dropped_ids=dropped_ids,
+            drops_by_op=drops_by_op,
+            column_stats=column_stats,
+            watched_data=watched_data,
+            total_steps=len(ctx.store.steps),
+            mode=ctx.config.mode.value,
+        )
+
+    def save(self, path: str) -> None:
+        """
+        Save snapshot to file.
+
+        Uses separate files for large watched values:
+        - {path}: metadata, stats, row IDs (JSON)
+        - {path}.npz: watched column arrays (if present)
+        """
+        base_path = Path(path)
+        npz_path = base_path.with_suffix(".npz")
+
+        # Metadata (always JSON)
+        data = {
+            "timestamp": self.timestamp,
+            "row_ids": list(self.row_ids),
+            "dropped_ids": list(self.dropped_ids),
+            "drops_by_op": self.drops_by_op,
+            "column_stats": {k: vars(v) for k, v in self.column_stats.items()},
+            "watched_columns": self.watched_data.columns if self.watched_data else [],
+            "has_watched_npz": self.watched_data is not None,
+            "total_steps": self.total_steps,
+            "mode": self.mode,
+        }
+        base_path.write_text(json.dumps(data, default=str))
+
+        # Save watched values as npz (efficient for large data)
+        if self.watched_data is not None:
+            arrays = {"rids": self.watched_data.rids}
+            for col, vals in self.watched_data.values.items():
+                # Sanitize column name for npz key
+                safe_col = col.replace(".", "_").replace(" ", "_")
+                arrays[f"col_{safe_col}"] = vals
+            # Save column name mapping
+            arrays["_col_names"] = np.array(
+                [col.replace(".", "_").replace(" ", "_") for col in self.watched_data.columns]
+            )
+            arrays["_col_names_original"] = np.array(self.watched_data.columns)
+            np.savez_compressed(npz_path, **arrays)
+
+    @classmethod
+    def load(cls, path: str) -> "Snapshot":
+        """
+        Load snapshot from file.
+
+        Loads watched values from npz if present.
+        """
+        base_path = Path(path)
+        npz_path = base_path.with_suffix(".npz")
+
+        data = json.loads(base_path.read_text())
+
+        column_stats = {k: ColumnStats(**v) for k, v in data["column_stats"].items()}
+
+        # Load watched data from npz if present
+        watched_data = None
+        if data.get("has_watched_npz") and npz_path.exists():
+            with np.load(npz_path, allow_pickle=True) as npz:
+                rids = npz["rids"]
+
+                # Get original column names
+                if "_col_names_original" in npz:
+                    cols = list(npz["_col_names_original"])
+                    safe_names = list(npz["_col_names"])
+                    values = {}
+                    for col, safe_col in zip(cols, safe_names):
+                        key = f"col_{safe_col}"
+                        if key in npz:
+                            values[col] = npz[key]
+                else:
+                    # Legacy format
+                    cols = data.get("watched_columns", [])
+                    values = {}
+                    for col in cols:
+                        safe_col = col.replace(".", "_").replace(" ", "_")
+                        key = f"col_{safe_col}"
+                        if key in npz:
+                            values[col] = npz[key]
+
+                if len(values) > 0:
+                    watched_data = WatchedColumnData(rids=rids, columns=cols, values=values)
+
+        return cls(
+            timestamp=data["timestamp"],
+            row_ids=set(data["row_ids"]),
+            dropped_ids=set(data["dropped_ids"]),
+            drops_by_op=data["drops_by_op"],
+            column_stats=column_stats,
+            watched_data=watched_data,
+            total_steps=data["total_steps"],
+            mode=data["mode"],
+        )
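
save() and load() agree on an npz layout where each watched column is stored under a sanitized col_<name> key, with both the sanitized and original names saved so dots and spaces round-trip. A sketch of that contract in isolation (file name and values are hypothetical):

    import numpy as np

    cols = ["user.id", "unit price"]
    arrays = {"rids": np.array([1, 2, 3])}
    for col in cols:
        safe = col.replace(".", "_").replace(" ", "_")
        arrays[f"col_{safe}"] = np.array([10, 20, 30])   # placeholder values
    arrays["_col_names"] = np.array([c.replace(".", "_").replace(" ", "_") for c in cols])
    arrays["_col_names_original"] = np.array(cols)
    np.savez_compressed("baseline.npz", **arrays)

    with np.load("baseline.npz", allow_pickle=True) as npz:
        originals = list(npz["_col_names_original"])     # ['user.id', 'unit price']
        restored = {c: npz[f"col_{s}"] for c, s in zip(originals, npz["_col_names"])}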
269
+
+    def summary(self) -> str:
+        """Human-readable summary."""
+        lines = [
+            f"Snapshot ({self.mode} mode)",
+            f"  Rows: {len(self.row_ids)}",
+            f"  Dropped: {len(self.dropped_ids)}",
+            f"  Steps: {self.total_steps}",
+            f"  Columns: {len(self.column_stats)}",
+        ]
+        if self.watched_data:
+            lines.append(f"  Watched: {len(self.watched_data.columns)} columns")
+        return "\n".join(lines)
+
+    def __repr__(self) -> str:
+        return (
+            f"<Snapshot rows={len(self.row_ids)} dropped={len(self.dropped_ids)} "
+            f"steps={self.total_steps}>"
+        )
+
+
+@dataclass
+class DiffResult:
+    """Result of comparing two snapshots."""
+
+    rows_added: set[int]
+    rows_removed: set[int]
+    new_drops: set[int]
+    recovered_rows: set[int]
+    drops_delta: dict[str, int]  # op -> change in count
+    stats_changes: dict[str, dict[str, Any]]  # col -> {metric: (old, new)}
+
+    def __repr__(self) -> str:
+        lines = ["Snapshot Diff:"]
+
+        if self.rows_added:
+            lines.append(f"  + {len(self.rows_added)} rows added")
+        if self.rows_removed:
+            lines.append(f"  - {len(self.rows_removed)} rows removed")
+        if self.new_drops:
+            lines.append(f"  ! {len(self.new_drops)} new drops")
+        if self.recovered_rows:
+            lines.append(f"  * {len(self.recovered_rows)} recovered")
+
+        if self.drops_delta:
+            lines.append("  Drop changes by operation:")
+            for op, delta in sorted(self.drops_delta.items(), key=lambda x: -abs(x[1])):
+                sign = "+" if delta > 0 else ""
+                lines.append(f"    {op}: {sign}{delta}")
+
+        if self.stats_changes:
+            lines.append("  Column stat changes:")
+            for col, changes in list(self.stats_changes.items())[:5]:
+                for metric, (old, new) in changes.items():
+                    lines.append(f"    {col}.{metric}: {old} -> {new}")
+            if len(self.stats_changes) > 5:
+                lines.append(f"    ... and {len(self.stats_changes) - 5} more")
+
+        if len(lines) == 1:
+            lines.append("  No differences")
+
+        return "\n".join(lines)
+
+    @property
+    def has_changes(self) -> bool:
+        """True if there are any differences."""
+        return bool(
+            self.rows_added
+            or self.rows_removed
+            or self.new_drops
+            or self.recovered_rows
+            or self.drops_delta
+            or self.stats_changes
+        )
+
+    def to_dict(self) -> dict:
+        """Export to dictionary."""
+        return {
+            "rows_added": list(self.rows_added),
+            "rows_removed": list(self.rows_removed),
+            "new_drops": list(self.new_drops),
+            "recovered_rows": list(self.recovered_rows),
+            "drops_delta": self.drops_delta,
+            "stats_changes": self.stats_changes,
+        }
+
+
+def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
+    """
+    Compare two snapshots.
+
+    Note: Cross-run diff is SUMMARY-ONLY unless keys are stored.
+    Row-level comparison only works within same session (same RID assignment).
+    """
+    rows_added = current.row_ids - baseline.row_ids
+    rows_removed = baseline.row_ids - current.row_ids
+
+    new_drops = current.dropped_ids - baseline.dropped_ids
+    recovered_rows = baseline.dropped_ids - current.dropped_ids
+
+    # Drops delta by operation
+    all_ops = set(baseline.drops_by_op.keys()) | set(current.drops_by_op.keys())
+    drops_delta = {}
+    for op in all_ops:
+        old = baseline.drops_by_op.get(op, 0)
+        new = current.drops_by_op.get(op, 0)
+        if old != new:
+            drops_delta[op] = new - old
+
+    # Stats changes
+    stats_changes: dict[str, dict[str, Any]] = {}
+    all_cols = set(baseline.column_stats.keys()) | set(current.column_stats.keys())
+    for col in all_cols:
+        old_stats = baseline.column_stats.get(col)
+        new_stats = current.column_stats.get(col)
+
+        if old_stats is None or new_stats is None:
+            continue
+
+        changes: dict[str, Any] = {}
+        if old_stats.null_rate != new_stats.null_rate:
+            changes["null_rate"] = (old_stats.null_rate, new_stats.null_rate)
+        if old_stats.unique_count != new_stats.unique_count:
+            changes["unique_count"] = (old_stats.unique_count, new_stats.unique_count)
+        if old_stats.dtype != new_stats.dtype:
+            changes["dtype"] = (old_stats.dtype, new_stats.dtype)
+
+        if changes:
+            stats_changes[col] = changes
+
+    return DiffResult(
+        rows_added=rows_added,
+        rows_removed=rows_removed,
+        new_drops=new_drops,
+        recovered_rows=recovered_rows,
+        drops_delta=drops_delta,
+        stats_changes=stats_changes,
+    )
+
+
+def snapshot(df: pd.DataFrame, include_values: bool = False) -> Snapshot:
+    """
+    Capture current pipeline state.
+
+    Args:
+        df: Current DataFrame
+        include_values: If True, store watched column values (columnar)
+
+    Returns:
+        Snapshot object
+    """
+    return Snapshot.capture(df, include_values)
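
Putting the new module together, the workflow from its docstring looks roughly like this (assuming, as the docstring does, that the package re-exports snapshot and diff as tp.snapshot/tp.diff; the DataFrame and file name are illustrative):

    import pandas as pd
    import tracepipe as tp
    from tracepipe.snapshot import Snapshot

    df = pd.DataFrame({"price": [1.0, None, 2.5]})

    snap = tp.snapshot(df)              # capture current pipeline state
    snap.save("baseline.json")          # writes JSON, plus .npz when values are stored

    baseline = Snapshot.load("baseline.json")
    result = tp.diff(baseline, tp.snapshot(df))
    if result.has_changes:              # summary-level comparison across runs
        print(result)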
tracepipe/storage/base.py CHANGED
@@ -137,15 +137,19 @@ class RowIdentityStrategy(Protocol):
         ...
 
     def get_ids(self, df: Any) -> Optional[Any]:
-        """Get row IDs for a DataFrame, or None if not tracked."""
+        """Get row IDs as Series for a DataFrame, or None if not tracked."""
+        ...
+
+    def get_ids_array(self, df: Any) -> Optional[Any]:
+        """Get row IDs as numpy array for a DataFrame, or None if not tracked."""
         ...
 
     def propagate(self, source_df: Any, result_df: Any) -> Optional[Any]:
         """Propagate row IDs from source to result DataFrame."""
         ...
 
-    def get_dropped_ids(self, source_df: Any, result_df: Any) -> set:
-        """Get row IDs that were dropped between source and result."""
+    def get_dropped_ids(self, source_df: Any, result_df: Any) -> Any:
+        """Get row IDs that were dropped between source and result (as numpy array)."""
         ...
 
     def strip_hidden_column(self, df: Any) -> Any:
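
For context, a minimal strategy satisfying the widened protocol might look like the sketch below. The hidden-column name and the setdiff1d-based drop detection are assumptions, not tracepipe's actual strategy, but they illustrate why get_dropped_ids now returns an array rather than a set: drops can be computed vectorized, without building Python sets per operation.

    from typing import Any, Optional
    import numpy as np
    import pandas as pd

    HIDDEN = "__tp_rid"  # hypothetical hidden-column name

    class HiddenColumnStrategy:
        def get_ids(self, df: pd.DataFrame) -> Optional[pd.Series]:
            return df[HIDDEN] if HIDDEN in df.columns else None

        def get_ids_array(self, df: pd.DataFrame) -> Optional[np.ndarray]:
            ids = self.get_ids(df)
            return ids.to_numpy() if ids is not None else None

        def get_dropped_ids(self, source_df: Any, result_df: Any) -> Any:
            src = self.get_ids_array(source_df)
            res = self.get_ids_array(result_df)
            if src is None or res is None:
                return np.array([], dtype=np.int64)
            return np.setdiff1d(src, res)  # vectorized set difference over row IDs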