tracepipe 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl
- tracepipe/__init__.py +1 -1
- tracepipe/convenience.py +131 -12
- tracepipe/debug.py +40 -0
- tracepipe/snapshot.py +87 -2
- {tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/METADATA +1 -1
- {tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/RECORD +8 -8
- {tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/WHEEL +0 -0
- {tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/licenses/LICENSE +0 -0
tracepipe/__init__.py
CHANGED
tracepipe/convenience.py
CHANGED
@@ -60,6 +60,8 @@ class CheckResult:
         .retention - Row retention rate (0.0-1.0)
         .n_dropped - Total rows dropped
         .drops_by_op - Drops broken down by operation
+        .n_changes - Total cell-level changes (debug mode only)
+        .changes_by_op - Changes broken down by operation (debug mode only)
     """

     ok: bool
@@ -69,6 +71,9 @@ class CheckResult:
     mode: str
     # Internal: store drops_by_op so we don't need to recompute
     _drops_by_op: dict[str, int] = field(default_factory=dict)
+    # Internal: store cell change counts (debug mode only)
+    _n_changes: int = 0
+    _changes_by_op: dict[str, int] = field(default_factory=dict)

     # === CONVENIENCE PROPERTIES ===

@@ -97,6 +102,16 @@ class CheckResult:
         """Total pipeline steps recorded."""
         return self.facts.get("total_steps", 0)

+    @property
+    def n_changes(self) -> int:
+        """Total cell-level changes (debug mode only, 0 if not tracked)."""
+        return self._n_changes
+
+    @property
+    def changes_by_op(self) -> dict[str, int]:
+        """Cell changes broken down by operation (debug mode only)."""
+        return self._changes_by_op
+
     # === EXISTING PROPERTIES ===

     @property
@@ -127,6 +142,20 @@ class CheckResult:
         lines.append(f"TracePipe Check: {status}")
         lines.append(f" Mode: {self.mode}")

+        # Always show key metrics in compact form
+        if self.retention is not None:
+            lines.append(f"\nRetention: {int(self.retention * 100)}%")
+        if self.n_dropped > 0:
+            lines.append(f"Dropped: {self.n_dropped} rows")
+        if self.drops_by_op:
+            for op, count in list(self.drops_by_op.items())[:5]:
+                lines.append(f" • {op}: {count}")
+        if self.n_changes > 0:
+            lines.append(f"\nValue changes: {self.n_changes} cells")
+        if self.changes_by_op:
+            for op, count in list(self.changes_by_op.items())[:5]:
+                lines.append(f" • {op}: {count}")
+
         if verbose and self.facts:
             lines.append("\n Measured facts:")
             for k, v in self.facts.items():
@@ -158,6 +187,8 @@ class CheckResult:
             "n_dropped": self.n_dropped,
             "n_steps": self.n_steps,
             "drops_by_op": self.drops_by_op,
+            "n_changes": self.n_changes,
+            "changes_by_op": self.changes_by_op,
             "facts": self.facts,
             "suggestions": self.suggestions,
             "warnings": [
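Read together, these hunks extend the check() summary with cell-level change counts. An illustrative sketch of how the new fields surface to a caller; `tp.check(df)` as the public entry point is an assumption here, mirroring the `tp.trace()` / `tp.why()` examples elsewhere in this diff, and the tracking setup is omitted:

    import tracepipe as tp

    # Assumes tracking was already enabled in debug mode before the pipeline ran;
    # outside debug mode the new fields simply stay at 0 / {}.
    result = tp.check(df)

    print(result.retention)        # existing: row retention rate (0.0-1.0)
    print(result.n_dropped)        # existing: total rows dropped
    print(result.n_changes)        # new in 0.4.2: total cell-level changes
    print(result.changes_by_op)    # new in 0.4.2: {operation: change count}

    payload = result.to_dict()     # now also carries "n_changes" and "changes_by_op"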
@@ -191,6 +222,7 @@ class TraceResult:
     Events are in CHRONOLOGICAL order (oldest->newest).

     Key attributes:
+        status: "alive" or "dropped" (string representation)
         origin: Where this row came from (concat, merge, or original)
         representative: If dropped by dedup, which row was kept instead
     """
@@ -207,6 +239,27 @@ class TraceResult:
     # v0.4+ provenance
     concat_origin: dict[str, Any] | None = None
     dedup_representative: dict[str, Any] | None = None
+    # Steps this row survived (for SURVIVED event generation)
+    _survived_steps: list[dict[str, Any]] = field(default_factory=list)
+
+    @property
+    def status(self) -> str:
+        """Row status as string: 'alive' or 'dropped'."""
+        return "alive" if self.is_alive else "dropped"
+
+    @property
+    def dropped_by(self) -> str | None:
+        """Operation that dropped this row, or None if alive."""
+        if self.dropped_at:
+            return self.dropped_at.get("operation")
+        return None
+
+    @property
+    def dropped_at_step(self) -> int | None:
+        """Step number where this row was dropped, or None if alive."""
+        if self.dropped_at:
+            return self.dropped_at.get("step_id")
+        return None

     @property
     def n_events(self) -> int:
@@ -258,8 +311,10 @@ class TraceResult:
         """Export to dictionary."""
         return {
             "row_id": self.row_id,
+            "status": self.status,
             "is_alive": self.is_alive,
             "dropped_at": self.dropped_at,
+            "dropped_by": self.dropped_at.get("operation") if self.dropped_at else None,
             "origin": self.origin,
             "representative": self.representative,
             "n_events": self.n_events,
@@ -280,10 +335,11 @@ class TraceResult:

         lines = [f"Row {self.row_id} Journey:"]

+        # Status line matches documentation format
         if self.is_alive:
             lines.append(" Status: [OK] Alive")
         else:
-            lines.append(" Status: [
+            lines.append(" Status: [DROPPED]")
             if self.dropped_at:
                 lines.append(
                     f" at step {self.dropped_at['step_id']}: {self.dropped_at['operation']}"
@@ -579,6 +635,21 @@ def check(
         if count > 1000:
             suggestions.append(f"'{op}' dropped {count} rows - review if intentional")

+    # === CELL CHANGES (debug mode only) ===
+    n_changes = 0
+    changes_by_op: dict[str, int] = {}
+    if ctx.config.mode == TracePipeMode.DEBUG:
+        # Count non-drop diffs (cell-level changes)
+        step_map = {s.step_id: s.operation for s in ctx.store.steps}
+        for i in range(len(ctx.store.diff_step_ids)):
+            col = ctx.store.diff_cols[i]
+            if col != "__row__":  # Skip drop events
+                n_changes += 1
+                step_id = ctx.store.diff_step_ids[i]
+                op = step_map.get(step_id, "unknown")
+                changes_by_op[op] = changes_by_op.get(op, 0) + 1
+        facts["n_changes"] = n_changes
+
     ok = len([w for w in warnings_list if w.severity == "fact"]) == 0

     return CheckResult(
@@ -588,6 +659,8 @@ def check(
         suggestions=suggestions,
         mode=ctx.config.mode.value,
         _drops_by_op=drops_by_op,
+        _n_changes=n_changes,
+        _changes_by_op=changes_by_op,
     )


@@ -595,6 +668,7 @@ def trace(
    df: pd.DataFrame,
    *,
    row: int | None = None,
+   row_id: int | None = None,
    where: dict[str, Any] | None = None,
    include_ghost: bool = True,
 ) -> TraceResult | list[TraceResult]:
@@ -603,7 +677,8 @@ def trace(

     Args:
         df: DataFrame to search in
-        row: Row
+        row: Row position (0-based index into current DataFrame)
+        row_id: Internal row ID (use for tracing dropped rows)
         where: Selector dict, e.g. {"customer_id": "C123"}
         include_ghost: Include last-known values for dropped rows

@@ -612,8 +687,14 @@ def trace(
     Use print(result) for pretty output, result.to_dict() for data.

     Examples:
-
-
+        # Trace by position in current DataFrame
+        result = tp.trace(df, row=0)  # First row
+
+        # Trace by internal row ID (for dropped rows)
+        dropped = tp.debug.inspect().dropped_rows()
+        result = tp.trace(df, row_id=dropped[0])
+
+        # Trace by business key
         tp.trace(df, where={"customer_id": "C123"})
     """
     ctx = get_context()
@@ -624,12 +705,30 @@ def trace(
         pass

     # Resolve row IDs
-    if
-
+    if row_id is not None:
+        # Direct row ID specified - use as-is
+        row_ids = [row_id]
+    elif row is not None:
+        # row= is a DataFrame index position (0-based), not a row ID
+        # Convert to actual row ID using the DataFrame's registered IDs
+        rids = ctx.row_manager.get_ids_array(df)
+        if rids is not None:
+            # Handle negative indexing
+            if row < 0:
+                row = len(rids) + row
+            if 0 <= row < len(rids):
+                row_ids = [int(rids[row])]
+            else:
+                raise ValueError(
+                    f"Row index {row} out of bounds for DataFrame with {len(rids)} rows"
+                )
+        else:
+            # DataFrame not tracked - use row as-is (legacy behavior)
+            row_ids = [row]
     elif where is not None:
         row_ids = _resolve_where(df, where, ctx)
     else:
-        raise ValueError("Must provide 'row' or 'where'")
+        raise ValueError("Must provide 'row', 'row_id', or 'where'")

     results = []
     for rid in row_ids:
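The row-ID resolution above is what makes dropped rows reachable: `row=` is interpreted as a position in the current DataFrame, while the new `row_id=` bypasses position lookup entirely. A short sketch of the intended call pattern, assuming tracking is already enabled; the data and column names are illustrative:

    import pandas as pd
    import tracepipe as tp

    df = pd.DataFrame({"customer_id": ["C1", "C2", "C3"], "amount": [10, -5, 30]})
    df = df[df["amount"] > 0]                    # one row is dropped here

    alive = tp.trace(df, row=0)                  # position 0 of the *current* DataFrame
    print(alive.status)                          # "alive"

    dropped = list(tp.debug.inspect().dropped_rows())
    ghost = tp.trace(df, row_id=dropped[0])      # reach the dropped row by internal ID
    print(ghost.status, ghost.dropped_by, ghost.dropped_at_step)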
@@ -644,6 +743,7 @@ def why(
    *,
    col: str,
    row: int | None = None,
+   row_id: int | None = None,
    where: dict[str, Any] | None = None,
 ) -> WhyResult | list[WhyResult]:
     """
@@ -652,7 +752,8 @@ def why(
     Args:
         df: DataFrame to search in
         col: Column name to trace
-        row: Row
+        row: Row position (0-based index into current DataFrame)
+        row_id: Internal row ID (use for cells in dropped rows)
         where: Selector dict, e.g. {"customer_id": "C123"}

     Returns:
@@ -660,7 +761,7 @@ def why(
     Use print(result) for pretty output, result.to_dict() for data.

     Examples:
-        result = tp.why(df, col="amount", row=
+        result = tp.why(df, col="amount", row=0)  # First row
         print(result)
         tp.why(df, col="email", where={"user_id": "U123"})
     """
@@ -676,12 +777,30 @@ def why(
     )

     # Resolve row IDs
-    if
-
+    if row_id is not None:
+        # Direct row ID specified - use as-is
+        row_ids = [row_id]
+    elif row is not None:
+        # row= is a DataFrame index position (0-based), not a row ID
+        # Convert to actual row ID using the DataFrame's registered IDs
+        rids = ctx.row_manager.get_ids_array(df)
+        if rids is not None:
+            # Handle negative indexing
+            if row < 0:
+                row = len(rids) + row
+            if 0 <= row < len(rids):
+                row_ids = [int(rids[row])]
+            else:
+                raise ValueError(
+                    f"Row index {row} out of bounds for DataFrame with {len(rids)} rows"
+                )
+        else:
+            # DataFrame not tracked - use row as-is (legacy behavior)
+            row_ids = [row]
     elif where is not None:
         row_ids = _resolve_where(df, where, ctx)
     else:
-        raise ValueError("Must provide 'row' or 'where'")
+        raise ValueError("Must provide 'row', 'row_id', or 'where'")

     results = []
     for rid in row_ids:
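why() resolves rows with the same row / row_id / where logic, so the new keyword carries over unchanged. A brief companion sketch; the column and key names are illustrative:

    history = tp.why(df, col="amount", row=0)    # cell history by position
    print(history)

    dropped = list(tp.debug.inspect().dropped_rows())
    if dropped:
        print(tp.why(df, col="amount", row_id=dropped[0]))   # cell in a dropped row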
tracepipe/debug.py
CHANGED
@@ -179,6 +179,46 @@ class DebugInspector:
         ctx = get_context()
         return ctx.row_manager.get_ghost_rows(limit=limit)

+    def get_ghost_values(self, row_id: int) -> dict[str, Any] | None:
+        """
+        Get last-known values for a specific dropped row (DEBUG mode only).
+
+        Args:
+            row_id: The row ID to look up
+
+        Returns:
+            Dict mapping column names to their last known values,
+            or None if the row was not found in ghost storage.
+
+        Example:
+            dbg = tp.debug.inspect()
+            dropped_rid = list(dbg.dropped_rows())[0]
+            ghost = dbg.get_ghost_values(dropped_rid)
+            print(f"Last known values: {ghost}")
+        """
+        ctx = get_context()
+        ghost_df = ctx.row_manager.get_ghost_rows(limit=100000)
+
+        if ghost_df.empty or "__tp_row_id__" not in ghost_df.columns:
+            return None
+
+        row_match = ghost_df[ghost_df["__tp_row_id__"] == row_id]
+        if row_match.empty:
+            return None
+
+        # Convert to dict and remove internal columns
+        result = row_match.iloc[0].to_dict()
+        internal_cols = [
+            "__tp_row_id__",
+            "__tp_dropped_by__",
+            "__tp_dropped_step__",
+            "__tp_original_position__",
+        ]
+        for col in internal_cols:
+            result.pop(col, None)
+
+        return result
+
     def stats(self) -> dict:
         """Get comprehensive tracking statistics."""
         ctx = get_context()
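get_ghost_values() closes the loop between spotting a dropped row and seeing what it last looked like. A minimal sketch, assuming DEBUG mode is active so ghost rows are retained:

    import tracepipe as tp

    dbg = tp.debug.inspect()
    for rid in list(dbg.dropped_rows())[:3]:
        ghost = dbg.get_ghost_values(rid)        # last-known values, internal columns stripped
        if ghost is not None:
            print(f"row {rid} dropped with values: {ghost}")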
tracepipe/snapshot.py
CHANGED
@@ -25,7 +25,7 @@ Usage:

 import json
 import time
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Optional

@@ -297,6 +297,20 @@ class DiffResult:
     recovered_rows: set[int]
     drops_delta: dict[str, int]  # op -> change in count
     stats_changes: dict[str, dict[str, Any]]  # col -> {metric: (old, new)}
+    # Column changes
+    columns_added: list[str] = field(default_factory=list)
+    columns_removed: list[str] = field(default_factory=list)
+    # Cell-level changes (only populated if both snapshots have include_values=True)
+    cells_changed: int = 0  # Total modified cells
+    changed_rows: set[int] = field(default_factory=set)  # IDs of rows with value changes
+    changes_by_column: dict[str, int] = field(default_factory=dict)  # col -> count
+
+    @property
+    def rows_unchanged(self) -> int:
+        """Number of rows that exist in both snapshots (may have value changes)."""
+        # This is computed from the rows that weren't added or removed
+        # Note: This is an estimate based on the smaller snapshot
+        return 0  # Will be set during diff computation

     def __repr__(self) -> str:
         lines = ["Snapshot Diff:"]
@@ -310,6 +324,18 @@ class DiffResult:
         if self.recovered_rows:
             lines.append(f" * {len(self.recovered_rows)} recovered")

+        if self.columns_added:
+            lines.append(f" Columns added: {', '.join(self.columns_added)}")
+        if self.columns_removed:
+            lines.append(f" Columns removed: {', '.join(self.columns_removed)}")
+
+        if self.cells_changed > 0:
+            lines.append("\n Changes:")
+            lines.append(f" - {self.cells_changed} cells modified")
+            if self.changes_by_column:
+                for col, count in sorted(self.changes_by_column.items(), key=lambda x: -x[1])[:5]:
+                    lines.append(f" {col}: {count}")
+
         if self.drops_delta:
             lines.append(" Drop changes by operation:")
             for op, delta in sorted(self.drops_delta.items(), key=lambda x: -abs(x[1])):
@@ -339,6 +365,9 @@ class DiffResult:
             or self.recovered_rows
             or self.drops_delta
             or self.stats_changes
+            or self.columns_added
+            or self.columns_removed
+            or self.cells_changed
         )

     def to_dict(self) -> dict:
@@ -350,6 +379,11 @@ class DiffResult:
             "recovered_rows": list(self.recovered_rows),
             "drops_delta": self.drops_delta,
             "stats_changes": self.stats_changes,
+            "columns_added": self.columns_added,
+            "columns_removed": self.columns_removed,
+            "cells_changed": self.cells_changed,
+            "changed_rows": list(self.changed_rows),
+            "changes_by_column": self.changes_by_column,
         }


@@ -359,6 +393,9 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:

     Note: Cross-run diff is SUMMARY-ONLY unless keys are stored.
     Row-level comparison only works within same session (same RID assignment).
+
+    For cell-level diff (cells_changed, changes_by_column), both snapshots
+    must have been created with include_values=True.
     """
     rows_added = current.row_ids - baseline.row_ids
     rows_removed = baseline.row_ids - current.row_ids
@@ -375,9 +412,15 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
         if old != new:
             drops_delta[op] = new - old

+    # Column changes
+    baseline_cols = set(baseline.column_stats.keys())
+    current_cols = set(current.column_stats.keys())
+    columns_added = sorted(current_cols - baseline_cols)
+    columns_removed = sorted(baseline_cols - current_cols)
+
     # Stats changes
     stats_changes: dict[str, dict[str, Any]] = {}
-    all_cols =
+    all_cols = baseline_cols | current_cols
     for col in all_cols:
         old_stats = baseline.column_stats.get(col)
         new_stats = current.column_stats.get(col)
@@ -396,6 +439,43 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
         if changes:
             stats_changes[col] = changes

+    # Cell-level changes (only if both snapshots have watched data)
+    cells_changed = 0
+    changed_rows: set[int] = set()
+    changes_by_column: dict[str, int] = {}
+
+    if baseline.watched_data is not None and current.watched_data is not None:
+        # Find common rows and columns
+        common_rows = baseline.row_ids & current.row_ids
+        common_cols = set(baseline.watched_data.columns) & set(current.watched_data.columns)
+
+        for rid in common_rows:
+            for col in common_cols:
+                old_val = baseline.watched_data.get_value(int(rid), col)
+                new_val = current.watched_data.get_value(int(rid), col)
+
+                # Compare values (handle NaN)
+                values_equal = False
+                if old_val is None and new_val is None:
+                    values_equal = True
+                elif old_val is not None and new_val is not None:
+                    try:
+                        # Handle NaN comparison
+                        if isinstance(old_val, float) and isinstance(new_val, float):
+                            if old_val != old_val and new_val != new_val:  # Both NaN
+                                values_equal = True
+                            else:
+                                values_equal = old_val == new_val
+                        else:
+                            values_equal = old_val == new_val
+                    except (TypeError, ValueError):
+                        values_equal = str(old_val) == str(new_val)
+
+                if not values_equal:
+                    cells_changed += 1
+                    changed_rows.add(rid)
+                    changes_by_column[col] = changes_by_column.get(col, 0) + 1
+
     return DiffResult(
         rows_added=rows_added,
         rows_removed=rows_removed,
@@ -403,6 +483,11 @@ def diff(baseline: Snapshot, current: Snapshot) -> DiffResult:
         recovered_rows=recovered_rows,
         drops_delta=drops_delta,
         stats_changes=stats_changes,
+        columns_added=columns_added,
+        columns_removed=columns_removed,
+        cells_changed=cells_changed,
+        changed_rows=changed_rows,
+        changes_by_column=changes_by_column,
     )
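With these changes a snapshot diff reports schema drift and value drift, not just row membership. A sketch of reading the extended result; `baseline` and `current` are assumed to be Snapshot objects captured around a pipeline stage with include_values=True (the capture API itself is not shown in this hunk):

    from tracepipe.snapshot import diff

    result = diff(baseline, current)
    print(result)                                   # __repr__ now lists column and cell changes
    print(result.columns_added, result.columns_removed)
    print(result.cells_changed, result.changes_by_column)
    payload = result.to_dict()                      # "changed_rows" is exported as a plain list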
{tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tracepipe
-Version: 0.4.1
+Version: 0.4.2
 Summary: Row-level data lineage tracking for pandas pipelines
 Project-URL: Homepage, https://github.com/tracepipe/tracepipe
 Project-URL: Documentation, https://tracepipe.github.io/tracepipe/
{tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-tracepipe/__init__.py,sha256=
+tracepipe/__init__.py,sha256=cocA8ETqC1IGgDCXvxue9M4QVzIt8C981b6NTf9BXQ4,3342
 tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
 tracepipe/context.py,sha256=DvwAZGZbLDJ4xoqmS1VnZOBbOI8ZIIErsY9W6GEbFSM,4051
 tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
-tracepipe/convenience.py,sha256=
+tracepipe/convenience.py,sha256=ALRtVn6tLfa7Ks7d9hKVJfhLjOLuyFgxTwSoUL0BgHY,38241
 tracepipe/core.py,sha256=O1QFJFTSszxKY1pVR1XLrdA0zez7t5KQLjyq6V36ARk,10883
-tracepipe/debug.py,sha256=
+tracepipe/debug.py,sha256=S3ga3rVHjDSV4OctkF5uEAQlzjOxFJO8RGC81awGboA,11397
 tracepipe/safety.py,sha256=UpzhQj31Dij-DjgT9mY_jPrUpVfcA51gDI2fUos4IUA,6694
-tracepipe/snapshot.py,sha256=
+tracepipe/snapshot.py,sha256=kvW8be1EAAsyHefXxJPgIQAAYT_FwK167SMxeQcsra4,17921
 tracepipe/value_provenance.py,sha256=ogky6aOaZ-6K2uNBQxlXpmCeuvK434Hisj30zesRTd8,9330
 tracepipe/instrumentation/__init__.py,sha256=pd0n6Z9m_V3gcBv097cXWFOZEzAP9sAq1jjQnNRrDZ8,222
 tracepipe/instrumentation/apply_capture.py,sha256=cMThWzNXqWQENuMrCGTne1hO6fqaQFV7zJYNpsPTW4w,14463
@@ -23,7 +23,7 @@ tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,1
 tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
 tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
 tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
-tracepipe-0.4.
-tracepipe-0.4.
-tracepipe-0.4.
-tracepipe-0.4.
+tracepipe-0.4.2.dist-info/METADATA,sha256=0nMQRfqFJCg1DMGjWzW_nlFcWMM-q8T4LfoqkMcYmAQ,10067
+tracepipe-0.4.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+tracepipe-0.4.2.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
+tracepipe-0.4.2.dist-info/RECORD,,
{tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/WHEEL
File without changes
{tracepipe-0.4.1.dist-info → tracepipe-0.4.2.dist-info}/licenses/LICENSE
File without changes