tracepipe 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +117 -78
- tracepipe/api.py +168 -331
- tracepipe/context.py +21 -1
- tracepipe/contracts.py +473 -0
- tracepipe/convenience.py +812 -0
- tracepipe/core.py +174 -17
- tracepipe/debug.py +325 -0
- tracepipe/instrumentation/apply_capture.py +453 -0
- tracepipe/instrumentation/filter_capture.py +468 -0
- tracepipe/instrumentation/indexer_capture.py +813 -0
- tracepipe/instrumentation/merge_capture.py +434 -0
- tracepipe/instrumentation/pandas_inst.py +66 -183
- tracepipe/instrumentation/series_capture.py +331 -0
- tracepipe/safety.py +3 -3
- tracepipe/snapshot.py +420 -0
- tracepipe/storage/base.py +7 -3
- tracepipe/storage/lineage_store.py +190 -47
- tracepipe/storage/row_identity.py +366 -104
- tracepipe/value_provenance.py +301 -0
- tracepipe/visualization/html_export.py +22 -7
- tracepipe-0.3.0.dist-info/METADATA +575 -0
- tracepipe-0.3.0.dist-info/RECORD +29 -0
- tracepipe-0.2.0.dist-info/METADATA +0 -508
- tracepipe-0.2.0.dist-info/RECORD +0 -19
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/WHEEL +0 -0
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/licenses/LICENSE +0 -0
tracepipe/storage/lineage_store.py

@@ -3,6 +3,11 @@
 In-memory lineage storage using Structure of Arrays (SoA) pattern.
 
 Memory: ~40 bytes/diff vs ~150 bytes with dataclass
+
+Features:
+- Merge mapping storage with O(log n) lookup via binary search
+- Sorted bulk drops for efficient drop event lookup
+- Stable API for api/convenience/visualization layers
 """
 
 import atexit
@@ -11,13 +16,17 @@ import time
 from pathlib import Path
 from typing import Any, Optional
 
+import numpy as np
+
 from ..core import (
     AggregationMapping,
     ChangeType,
     CompletenessLevel,
     LineageGap,
     LineageGaps,
-
+    MergeMapping,
+    MergeStats,
+    StepEvent,
     TracePipeConfig,
 )
 from ..utils.value_capture import capture_typed_value
@@ -29,9 +38,26 @@ class InMemoryLineageStore:
 
     Implements: LineageBackend protocol
 
-
-
-
+    STABLE INTERNAL API (used by api.py, convenience.py, visualization):
+
+    === ATTRIBUTES (read-only from outside) ===
+    steps: list[StepEvent]                     # All recorded steps
+    bulk_drops: dict[int, np.ndarray]          # step_id -> sorted dropped RIDs
+    merge_mappings: list[MergeMapping]         # Merge parent mappings (debug mode)
+    merge_stats: list[tuple[int, MergeStats]]  # (step_id, stats) pairs
+
+    === WRITE METHODS (called by instrumentation) ===
+    append_step(...) -> int                    # Returns step_id
+    append_bulk_drops(step_id, rids)           # rids will be sorted internally
+    append_diff(step_id, row_id, col, ...)     # Cell-level diff
+
+    === READ METHODS (called by api/convenience) ===
+    get_drop_event(row_id) -> Optional[dict]   # {step_id, operation}
+    get_dropped_rows() -> list[int]            # All dropped RIDs
+    get_dropped_by_step() -> dict[str, int]    # operation -> count
+    get_row_history(row_id) -> list[dict]      # Chronological events
+    get_merge_stats(step_id=None) -> list[tuple[int, MergeStats]]
+    get_merge_origin(row_id) -> Optional[dict] # {left_parent, right_parent, step_id}
     """
 
     def __init__(self, config: TracePipeConfig):
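The class docstring above pins down the read surface that api.py, convenience.py, and the visualization layer are meant to rely on. A minimal consumer sketch, assuming `store` is a populated `InMemoryLineageStore`; `pipeline_report` is a hypothetical helper, not part of tracepipe:

```python
# Hypothetical consumer of the documented read surface (not a tracepipe API).
from typing import Any


def pipeline_report(store: Any) -> dict:
    """Summarize a traced run using only the stable read methods/attributes."""
    return {
        "steps": [(s.step_id, s.operation) for s in store.steps],
        "dropped_rows": len(store.get_dropped_rows()),
        "dropped_by_operation": store.get_dropped_by_step(),
        "merge_steps": [sid for sid, _ in store.get_merge_stats()],
    }
```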
@@ -49,7 +75,14 @@
         self.diff_change_types: list[int] = []
 
         # === STEP METADATA ===
-        self._steps: list[
+        self._steps: list[StepEvent] = []
+
+        # === BULK DROPS (step_id -> SORTED numpy array) ===
+        self.bulk_drops: dict[int, np.ndarray] = {}
+
+        # === MERGE TRACKING ===
+        self.merge_mappings: list[MergeMapping] = []
+        self.merge_stats: list[tuple[int, MergeStats]] = []
 
         # === AGGREGATION MAPPINGS ===
         self.aggregation_mappings: list[AggregationMapping] = []
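The merge lookups later in this diff index into each mapping via `mapping.out_rids`, `mapping.left_parent_rids`, and `mapping.right_parent_rids`, with `out_rids` kept sorted. A rough stand-in for that record; the real `MergeMapping` lives in `tracepipe/core.py`, and treating negative values as "no parent" is an assumption based on the `>= 0` checks below:

```python
# Illustrative stand-in for MergeMapping (not the actual definition).
from dataclasses import dataclass

import numpy as np


@dataclass
class MergeMappingSketch:
    step_id: int
    out_rids: np.ndarray           # sorted output row IDs -> enables searchsorted
    left_parent_rids: np.ndarray   # parallel to out_rids
    right_parent_rids: np.ndarray  # parallel to out_rids; -1 assumed to mean "no parent"


# Parallel arrays must be reordered together so they stay aligned after sorting.
out = np.array([7, 3, 5], dtype=np.int64)
left = np.array([70, 30, 50], dtype=np.int64)
right = np.array([-1, 31, 51], dtype=np.int64)
order = np.argsort(out)
mapping = MergeMappingSketch(1, out[order], left[order], right[order])
```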
@@ -66,11 +99,12 @@
         self._col_intern: dict[str, str] = {}
         self._type_intern: dict[str, str] = {}
 
-        #
-
+        # === ATEXIT HANDLER ===
+        self._atexit_registered: bool = False
+        self._register_atexit()
 
     @property
-    def steps(self) -> list[
+    def steps(self) -> list[StepEvent]:
         """Access step metadata list."""
         return self._steps
 
@@ -150,8 +184,7 @@
         """
         Bulk append dropped rows - optimized for filter operations.
 
-
-        Typically 10-50x faster than calling append_diff() in a loop.
+        Stores dropped RIDs SORTED for O(log n) lookup via searchsorted.
 
         Args:
             step_id: Step ID for all drops
@@ -160,24 +193,28 @@
         Returns:
             Number of drops recorded
         """
-        import numpy as np
-
         n = len(dropped_row_ids)
         if n == 0:
            return 0
 
-        # Convert to
+        # Convert to sorted numpy array
         if isinstance(dropped_row_ids, np.ndarray):
-
+            sorted_rids = np.sort(dropped_row_ids.astype(np.int64))
         else:
-
+            sorted_rids = np.sort(np.array(list(dropped_row_ids), dtype=np.int64))
+
+        # Store sorted for O(log n) lookup
+        self.bulk_drops[step_id] = sorted_rids
+
+        # Also record in diff arrays for backwards compatibility
+        row_ids_list = sorted_rids.tolist()
 
         # Pre-intern the constant strings once
         col_interned = self._intern_string("__row__", self._col_intern)
         old_type_interned = self._intern_string("str", self._type_intern)
         new_type_interned = self._intern_string("null", self._type_intern)
 
-        # Bulk extend all arrays at once
+        # Bulk extend all arrays at once
         self.diff_step_ids.extend([step_id] * n)
         self.diff_row_ids.extend(row_ids_list)
         self.diff_cols.extend([col_interned] * n)
|
|
|
212
249
|
"""Append step metadata and return step_id."""
|
|
213
250
|
step_id = self.next_step_id()
|
|
214
251
|
self._steps.append(
|
|
215
|
-
|
|
252
|
+
StepEvent(
|
|
216
253
|
step_id=step_id,
|
|
217
254
|
operation=operation,
|
|
218
255
|
stage=stage,
|
|
@@ -250,6 +287,64 @@
         """Return False for mass updates exceeding threshold."""
         return affected_count <= self.config.max_diffs_per_step
 
+    # === DROP LOOKUP (O(log n) via searchsorted) ===
+
+    def get_drop_event(self, row_id: int) -> Optional[dict]:
+        """
+        Get drop event for a row from bulk_drops.
+
+        O(log n) per step via searchsorted.
+
+        Returns:
+            {step_id, operation} if dropped, else None.
+        """
+        for step_id, dropped_rids in self.bulk_drops.items():
+            # Binary search in sorted array
+            i = np.searchsorted(dropped_rids, row_id)
+            if i < len(dropped_rids) and dropped_rids[i] == row_id:
+                step = self._steps[step_id - 1] if step_id <= len(self._steps) else None
+                return {
+                    "step_id": step_id,
+                    "operation": step.operation if step else "unknown",
+                }
+        return None
+
+    def is_dropped(self, row_id: int) -> bool:
+        """Fast check if row was dropped anywhere."""
+        return self.get_drop_event(row_id) is not None
+
+    # === MERGE LOOKUP (O(log n) via searchsorted) ===
+
+    def get_merge_origin(self, row_id: int) -> Optional[dict]:
+        """
+        Get merge parent RIDs for a row.
+
+        Uses binary search (O(log n)) instead of linear scan (O(n)).
+        """
+        for mapping in self.merge_mappings:
+            # Binary search on sorted out_rids
+            i = np.searchsorted(mapping.out_rids, row_id)
+            if i < len(mapping.out_rids) and mapping.out_rids[i] == row_id:
+                left_parent = mapping.left_parent_rids[i]
+                right_parent = mapping.right_parent_rids[i]
+                return {
+                    "step_id": mapping.step_id,
+                    "left_parent": int(left_parent) if left_parent >= 0 else None,
+                    "right_parent": int(right_parent) if right_parent >= 0 else None,
+                }
+        return None
+
+    def get_merge_stats(self, step_id: Optional[int] = None) -> list[tuple[int, MergeStats]]:
+        """
+        Get merge statistics.
+
+        Returns:
+            list of (step_id, MergeStats) tuples - ALWAYS this shape for consistency.
+        """
+        if step_id is not None:
+            return [(sid, s) for sid, s in self.merge_stats if sid == step_id]
+        return list(self.merge_stats)
+
     # === MEMORY MANAGEMENT ===
 
     def _check_memory_and_spill(self) -> None:
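A hedged usage sketch of the new lookup methods, assuming `store` is a populated `InMemoryLineageStore` and `row_id` is a tracepipe-assigned row identity; `explain_row` is a hypothetical helper:

```python
# Hypothetical row-forensics helper built on the lookup methods above.
from typing import Any


def explain_row(store: Any, row_id: int) -> None:
    if store.is_dropped(row_id):
        event = store.get_drop_event(row_id)
        print(f"dropped at step {event['step_id']} by {event['operation']}")

    origin = store.get_merge_origin(row_id)
    if origin is not None:
        # parents come back as None when the stored parent RID was negative
        print(f"merge parents: {origin['left_parent']} / {origin['right_parent']}")

    for sid, stats in store.get_merge_stats():
        print(f"merge step {sid}: {stats}")
```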
@@ -307,8 +402,26 @@
         self.diff_change_types.clear()
         self._diff_count = 0
 
+    def _register_atexit(self) -> None:
+        """Register cleanup handler if not already registered."""
+        if not self._atexit_registered:
+            atexit.register(self._cleanup_spillover)
+            self._atexit_registered = True
+
+    def _unregister_atexit(self) -> None:
+        """Unregister cleanup handler."""
+        if self._atexit_registered:
+            try:
+                atexit.unregister(self._cleanup_spillover)
+            except Exception:
+                pass
+            self._atexit_registered = False
+
     def _cleanup_spillover(self) -> None:
         """Clean up spillover files on exit."""
+        # Unregister to prevent multiple calls
+        self._unregister_atexit()
+
         if not self.config.cleanup_spillover_on_disable:
             return
 
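The guard flag keeps `atexit` registration idempotent and lets an explicit cleanup unregister itself so it does not fire again at interpreter shutdown. A standalone sketch of the same pattern; the class and field names here are illustrative, not tracepipe's:

```python
# Guard-flag atexit pattern: register at most once, unregister on explicit cleanup.
import atexit


class SpilloverFiles:
    def __init__(self) -> None:
        self._registered = False
        self._register()

    def _register(self) -> None:
        if not self._registered:
            atexit.register(self.cleanup)
            self._registered = True

    def cleanup(self) -> None:
        if self._registered:
            atexit.unregister(self.cleanup)  # silently ignores unknown callbacks
            self._registered = False
        # ... remove temporary spillover files here ...


files = SpilloverFiles()
files.cleanup()  # explicit call; the atexit hook will not run it a second time
```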
@@ -367,12 +480,45 @@
         }
 
     def get_row_history(self, row_id: int) -> list[dict]:
-        """
+        """
+        Get all events for a specific row in CHRONOLOGICAL order (oldest first).
+
+        CONTRACT: Returned list has monotonically increasing step_id.
+        Convenience layer may reverse for display.
+        """
         step_map = {s.step_id: s for s in self._steps}
         events = []
 
+        # Collect from bulk_drops (sorted by step_id)
+        for step_id in sorted(self.bulk_drops.keys()):
+            dropped_rids = self.bulk_drops[step_id]
+            i = np.searchsorted(dropped_rids, row_id)
+            if i < len(dropped_rids) and dropped_rids[i] == row_id:
+                step = step_map.get(step_id)
+                events.append(
+                    {
+                        "step_id": step_id,
+                        "operation": step.operation if step else "unknown",
+                        "stage": step.stage if step else None,
+                        "col": "__row__",
+                        "old_val": "present",
+                        "old_type": "str",
+                        "new_val": None,
+                        "new_type": "null",
+                        "change_type": "DROPPED",
+                        "timestamp": step.timestamp if step else None,
+                        "completeness": step.completeness.name if step else "UNKNOWN",
+                        "code_location": (
+                            f"{step.code_file}:{step.code_line}"
+                            if step and step.code_file
+                            else None
+                        ),
+                    }
+                )
+
+        # Collect from diffs
         for diff in self._iter_all_diffs():
-            if diff["row_id"] == row_id:
+            if diff["row_id"] == row_id and diff["col"] != "__row__":
                 step = step_map.get(diff["step_id"])
                 events.append(
                     {
@@ -395,16 +541,22 @@
                     }
                 )
 
-
+        # ENFORCE: sort by step_id (chronological)
+        events.sort(key=lambda e: e["step_id"])
+
+        return events
 
     def get_dropped_rows(self, step_id: Optional[int] = None) -> list[int]:
         """Get all dropped row IDs, optionally filtered by step."""
-
+        if step_id is not None:
+            if step_id in self.bulk_drops:
+                return self.bulk_drops[step_id].tolist()
+            return []
 
-
-
-
-
+        # Collect all dropped rows
+        dropped = set()
+        for rids in self.bulk_drops.values():
+            dropped.update(rids.tolist())
 
         return sorted(dropped)
 
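A short sketch of how a display layer might consume these contracts, assuming `store` is a populated `InMemoryLineageStore`; the helper names are hypothetical:

```python
# Hypothetical display-layer helpers built on the contracts above.
from typing import Any


def newest_first_history(store: Any, row_id: int) -> list[dict]:
    history = store.get_row_history(row_id)  # oldest first, per the contract
    return list(reversed(history))           # reverse only for display


def drops_for_step(store: Any, step_id: int) -> list[int]:
    return store.get_dropped_rows(step_id=step_id)  # [] when the step dropped nothing
```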
@@ -413,10 +565,9 @@
         step_map = {s.step_id: s.operation for s in self._steps}
         counts: dict[str, int] = {}
 
-        for
-
-
-            counts[op] = counts.get(op, 0) + 1
+        for step_id, rids in self.bulk_drops.items():
+            op = step_map.get(step_id, "unknown")
+            counts[op] = counts.get(op, 0) + len(rids)
 
         return dict(sorted(counts.items(), key=lambda x: -x[1]))
 
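`get_dropped_by_step` now counts whole sorted arrays per step instead of incrementing per diff row. The returned shape, illustrated with made-up operation names and counts:

```python
# Shape of get_dropped_by_step(): operation -> dropped-row count, sorted descending.
counts = {"dropna": 120, "query": 45, "drop_duplicates": 300}
by_step = dict(sorted(counts.items(), key=lambda x: -x[1]))
assert list(by_step) == ["drop_duplicates", "dropna", "query"]
```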
@@ -458,6 +609,13 @@
         gaps = []
         row_step_ids = set()
 
+        # From bulk_drops
+        for step_id, rids in self.bulk_drops.items():
+            i = np.searchsorted(rids, row_id)
+            if i < len(rids) and rids[i] == row_id:
+                row_step_ids.add(step_id)
+
+        # From diffs
         for diff in self._iter_all_diffs():
             if diff["row_id"] == row_id:
                 row_step_ids.add(diff["step_id"])
@@ -490,28 +648,12 @@
         diffs = list(self._iter_all_diffs())
 
         data = {
-            "tracepipe_version": "0.
+            "tracepipe_version": "0.3.0",
             "export_timestamp": time.time(),
             "total_diffs": len(diffs),
             "total_steps": len(self._steps),
             "diffs": diffs,
-            "steps": [
-                {
-                    "step_id": s.step_id,
-                    "operation": s.operation,
-                    "stage": s.stage,
-                    "timestamp": s.timestamp,
-                    "code_file": s.code_file,
-                    "code_line": s.code_line,
-                    "params": s.params,
-                    "input_shape": s.input_shape,
-                    "output_shape": s.output_shape,
-                    "is_mass_update": s.is_mass_update,
-                    "rows_affected": s.rows_affected,
-                    "completeness": s.completeness.name,
-                }
-                for s in self._steps
-            ],
+            "steps": [s.to_dict() for s in self._steps],
             "aggregation_mappings": [
                 {
                     "step_id": a.step_id,
@@ -521,6 +663,7 @@
                 }
                 for a in self.aggregation_mappings
             ],
+            "merge_stats": [{"step_id": sid, **vars(stats)} for sid, stats in self.merge_stats],
         }
 
         return json.dumps(data)
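The new `merge_stats` entry relies on `vars()` flattening each stats object into a plain dict before JSON encoding. A sketch with an illustrative stand-in for `MergeStats`; the real field names live in `tracepipe/core.py`:

```python
# vars() on a plain dataclass instance yields its field dict, which merges
# cleanly with the step_id key. Field names below are illustrative only.
import json
from dataclasses import dataclass


@dataclass
class MergeStatsSketch:
    rows_left: int
    rows_right: int
    rows_out: int


merge_stats = [(4, MergeStatsSketch(rows_left=100, rows_right=80, rows_out=95))]
payload = [{"step_id": sid, **vars(stats)} for sid, stats in merge_stats]
print(json.dumps(payload))
# [{"step_id": 4, "rows_left": 100, "rows_right": 80, "rows_out": 95}]
```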