tracepipe 0.2.0-py3-none-any.whl → 0.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,11 @@
 In-memory lineage storage using Structure of Arrays (SoA) pattern.
 
 Memory: ~40 bytes/diff vs ~150 bytes with dataclass
+
+Features:
+- Merge mapping storage with O(log n) lookup via binary search
+- Sorted bulk drops for efficient drop event lookup
+- Stable API for api/convenience/visualization layers
 """
 
 import atexit
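
Reviewer note: the ~40 vs ~150 bytes/diff figure in the docstring is the usual Structure-of-Arrays argument: one Python list per field amortizes the per-object overhead that a per-diff dataclass instance pays every time. A minimal sketch of the two layouts (field names here are illustrative, not the package's actual schema):

```python
import sys
from dataclasses import dataclass

@dataclass
class DiffAoS:
    """Array-of-Structures: one object per diff, with per-instance overhead."""
    step_id: int
    row_id: int
    col: str

class DiffSoA:
    """Structure-of-Arrays: one list per field; a diff is an index position."""
    def __init__(self) -> None:
        self.step_ids: list[int] = []
        self.row_ids: list[int] = []
        self.cols: list[str] = []

    def append(self, step_id: int, row_id: int, col: str) -> None:
        self.step_ids.append(step_id)
        self.row_ids.append(row_id)
        self.cols.append(col)

# Each AoS diff pays the object header plus its __dict__; each SoA diff pays
# roughly three 8-byte list slots (the values themselves are often shared).
print(sys.getsizeof(DiffAoS(1, 2, "price")))  # instance alone, before __dict__
```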
@@ -11,13 +16,17 @@ import time
 from pathlib import Path
 from typing import Any, Optional
 
+import numpy as np
+
 from ..core import (
     AggregationMapping,
     ChangeType,
     CompletenessLevel,
     LineageGap,
     LineageGaps,
-    StepMetadata,
+    MergeMapping,
+    MergeStats,
+    StepEvent,
     TracePipeConfig,
 )
 from ..utils.value_capture import capture_typed_value
@@ -29,9 +38,26 @@ class InMemoryLineageStore:
 
     Implements: LineageBackend protocol
 
-    Future alternatives:
-    - SQLiteLineageStore: Persistent storage for long-running pipelines
-    - DeltaLakeBackend: Distributed storage for big data
+    STABLE INTERNAL API (used by api.py, convenience.py, visualization):
+
+    === ATTRIBUTES (read-only from outside) ===
+    steps: list[StepEvent]                      # All recorded steps
+    bulk_drops: dict[int, np.ndarray]           # step_id -> sorted dropped RIDs
+    merge_mappings: list[MergeMapping]          # Merge parent mappings (debug mode)
+    merge_stats: list[tuple[int, MergeStats]]   # (step_id, stats) pairs
+
+    === WRITE METHODS (called by instrumentation) ===
+    append_step(...) -> int                     # Returns step_id
+    append_bulk_drops(step_id, rids)            # rids will be sorted internally
+    append_diff(step_id, row_id, col, ...)      # Cell-level diff
+
+    === READ METHODS (called by api/convenience) ===
+    get_drop_event(row_id) -> Optional[dict]    # {step_id, operation}
+    get_dropped_rows() -> list[int]             # All dropped RIDs
+    get_dropped_by_step() -> dict[str, int]     # operation -> count
+    get_row_history(row_id) -> list[dict]       # Chronological events
+    get_merge_stats(step_id=None) -> list[tuple[int, MergeStats]]
+    get_merge_origin(row_id) -> Optional[dict]  # {left_parent, right_parent, step_id}
     """
 
     def __init__(self, config: TracePipeConfig):
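
For orientation, here is how the layers named in this docstring would talk to the store. This is a hypothetical round trip under assumptions the diff does not confirm: the module's import path, a default-constructible TracePipeConfig, and the exact keyword set of append_step (only operation and stage are visible in later hunks):

```python
import numpy as np

from tracepipe.core import TracePipeConfig          # import path assumed
from tracepipe.storage import InMemoryLineageStore  # module path assumed

store = InMemoryLineageStore(TracePipeConfig())     # default config assumed

# Write side (instrumentation): record a filter step, then its dropped rows.
step_id = store.append_step(operation="dropna", stage="clean")
store.append_bulk_drops(step_id, np.array([11, 3, 7]))  # sorted internally

# Read side (api/convenience): the documented lookups.
print(store.get_drop_event(3))      # {'step_id': ..., 'operation': 'dropna'}
print(store.get_dropped_rows())     # [3, 7, 11]
print(store.get_dropped_by_step())  # {'dropna': 3}
```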
@@ -49,7 +75,14 @@ class InMemoryLineageStore:
         self.diff_change_types: list[int] = []
 
         # === STEP METADATA ===
-        self._steps: list[StepMetadata] = []
+        self._steps: list[StepEvent] = []
+
+        # === BULK DROPS (step_id -> SORTED numpy array) ===
+        self.bulk_drops: dict[int, np.ndarray] = {}
+
+        # === MERGE TRACKING ===
+        self.merge_mappings: list[MergeMapping] = []
+        self.merge_stats: list[tuple[int, MergeStats]] = []
 
         # === AGGREGATION MAPPINGS ===
         self.aggregation_mappings: list[AggregationMapping] = []
@@ -66,11 +99,12 @@ class InMemoryLineageStore:
         self._col_intern: dict[str, str] = {}
         self._type_intern: dict[str, str] = {}
 
-        # Register cleanup on exit
-        atexit.register(self._cleanup_spillover)
+        # === ATEXIT HANDLER ===
+        self._atexit_registered: bool = False
+        self._register_atexit()
 
     @property
-    def steps(self) -> list[StepMetadata]:
+    def steps(self) -> list[StepEvent]:
         """Access step metadata list."""
         return self._steps
 
@@ -150,8 +184,7 @@ class InMemoryLineageStore:
         """
         Bulk append dropped rows - optimized for filter operations.
 
-        Uses list.extend() for O(1) amortized append instead of O(n) individual appends.
-        Typically 10-50x faster than calling append_diff() in a loop.
+        Stores dropped RIDs SORTED for O(log n) lookup via searchsorted.
 
         Args:
             step_id: Step ID for all drops
@@ -160,24 +193,28 @@ class InMemoryLineageStore:
         Returns:
             Number of drops recorded
         """
-        import numpy as np
-
         n = len(dropped_row_ids)
         if n == 0:
             return 0
 
-        # Convert to list if numpy array
+        # Convert to sorted numpy array
         if isinstance(dropped_row_ids, np.ndarray):
-            row_ids_list = dropped_row_ids.tolist()
+            sorted_rids = np.sort(dropped_row_ids.astype(np.int64))
         else:
-            row_ids_list = list(dropped_row_ids)
+            sorted_rids = np.sort(np.array(list(dropped_row_ids), dtype=np.int64))
+
+        # Store sorted for O(log n) lookup
+        self.bulk_drops[step_id] = sorted_rids
+
+        # Also record in diff arrays for backwards compatibility
+        row_ids_list = sorted_rids.tolist()
 
         # Pre-intern the constant strings once
         col_interned = self._intern_string("__row__", self._col_intern)
         old_type_interned = self._intern_string("str", self._type_intern)
         new_type_interned = self._intern_string("null", self._type_intern)
 
-        # Bulk extend all arrays at once (much faster than individual appends)
+        # Bulk extend all arrays at once
        self.diff_step_ids.extend([step_id] * n)
         self.diff_row_ids.extend(row_ids_list)
         self.diff_cols.extend([col_interned] * n)
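
The key change in append_bulk_drops: each step's dropped RIDs are now kept as a sorted int64 array, so later membership checks are binary searches instead of scans. np.searchsorted only returns an insertion index, so a hit needs both a bounds check and an equality check. The pattern in isolation:

```python
import numpy as np

dropped = np.sort(np.array([42, 7, 19], dtype=np.int64))  # -> [ 7 19 42]

def was_dropped(rids: np.ndarray, row_id: int) -> bool:
    """O(log n) membership test against a sorted RID array."""
    i = np.searchsorted(rids, row_id)  # insertion point in 0..len(rids)
    # i == len(rids) means "past the end"; rids[i] != row_id means "gap".
    return i < len(rids) and rids[i] == row_id

assert was_dropped(dropped, 19)
assert not was_dropped(dropped, 20)  # falls between 19 and 42
assert not was_dropped(dropped, 99)  # insertion point == len(rids)
```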
@@ -212,7 +249,7 @@ class InMemoryLineageStore:
         """Append step metadata and return step_id."""
         step_id = self.next_step_id()
         self._steps.append(
-            StepMetadata(
+            StepEvent(
                 step_id=step_id,
                 operation=operation,
                 stage=stage,
@@ -250,6 +287,64 @@ class InMemoryLineageStore:
         """Return False for mass updates exceeding threshold."""
         return affected_count <= self.config.max_diffs_per_step
 
+    # === DROP LOOKUP (O(log n) via searchsorted) ===
+
+    def get_drop_event(self, row_id: int) -> Optional[dict]:
+        """
+        Get drop event for a row from bulk_drops.
+
+        O(log n) per step via searchsorted.
+
+        Returns:
+            {step_id, operation} if dropped, else None.
+        """
+        for step_id, dropped_rids in self.bulk_drops.items():
+            # Binary search in sorted array
+            i = np.searchsorted(dropped_rids, row_id)
+            if i < len(dropped_rids) and dropped_rids[i] == row_id:
+                step = self._steps[step_id - 1] if step_id <= len(self._steps) else None
+                return {
+                    "step_id": step_id,
+                    "operation": step.operation if step else "unknown",
+                }
+        return None
+
+    def is_dropped(self, row_id: int) -> bool:
+        """Fast check if row was dropped anywhere."""
+        return self.get_drop_event(row_id) is not None
+
+    # === MERGE LOOKUP (O(log n) via searchsorted) ===
+
+    def get_merge_origin(self, row_id: int) -> Optional[dict]:
+        """
+        Get merge parent RIDs for a row.
+
+        Uses binary search (O(log n)) instead of linear scan (O(n)).
+        """
+        for mapping in self.merge_mappings:
+            # Binary search on sorted out_rids
+            i = np.searchsorted(mapping.out_rids, row_id)
+            if i < len(mapping.out_rids) and mapping.out_rids[i] == row_id:
+                left_parent = mapping.left_parent_rids[i]
+                right_parent = mapping.right_parent_rids[i]
+                return {
+                    "step_id": mapping.step_id,
+                    "left_parent": int(left_parent) if left_parent >= 0 else None,
+                    "right_parent": int(right_parent) if right_parent >= 0 else None,
+                }
+        return None
+
+    def get_merge_stats(self, step_id: Optional[int] = None) -> list[tuple[int, MergeStats]]:
+        """
+        Get merge statistics.
+
+        Returns:
+            list of (step_id, MergeStats) tuples - ALWAYS this shape for consistency.
+        """
+        if step_id is not None:
+            return [(sid, s) for sid, s in self.merge_stats if sid == step_id]
+        return list(self.merge_stats)
+
     # === MEMORY MANAGEMENT ===
 
     def _check_memory_and_spill(self) -> None:
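
Note how get_merge_origin maps negative parent RIDs to None: the mapping arrays evidently encode "no parent on this side" (e.g. an outer-join row) as a negative sentinel. A toy lookup against a stand-in mapping object; MergeMapping's real definition is not part of this diff, so only the four fields the method touches are mocked here:

```python
from types import SimpleNamespace

import numpy as np

# Stand-in for core.MergeMapping: just the fields get_merge_origin reads.
mapping = SimpleNamespace(
    step_id=4,
    out_rids=np.array([10, 11, 12], dtype=np.int64),         # must stay sorted
    left_parent_rids=np.array([3, 5, -1], dtype=np.int64),   # -1 = no left parent
    right_parent_rids=np.array([-1, 8, 9], dtype=np.int64),  # -1 = no right parent
)

i = np.searchsorted(mapping.out_rids, 12)  # binary search for the output row
assert mapping.out_rids[i] == 12
left, right = mapping.left_parent_rids[i], mapping.right_parent_rids[i]
origin = {
    "step_id": mapping.step_id,
    "left_parent": int(left) if left >= 0 else None,   # right-only row
    "right_parent": int(right) if right >= 0 else None,
}
assert origin == {"step_id": 4, "left_parent": None, "right_parent": 9}
```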
@@ -307,8 +402,26 @@ class InMemoryLineageStore:
         self.diff_change_types.clear()
         self._diff_count = 0
 
+    def _register_atexit(self) -> None:
+        """Register cleanup handler if not already registered."""
+        if not self._atexit_registered:
+            atexit.register(self._cleanup_spillover)
+            self._atexit_registered = True
+
+    def _unregister_atexit(self) -> None:
+        """Unregister cleanup handler."""
+        if self._atexit_registered:
+            try:
+                atexit.unregister(self._cleanup_spillover)
+            except Exception:
+                pass
+            self._atexit_registered = False
+
     def _cleanup_spillover(self) -> None:
         """Clean up spillover files on exit."""
+        # Unregister to prevent multiple calls
+        self._unregister_atexit()
+
         if not self.config.cleanup_spillover_on_disable:
             return
 
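
The _register_atexit/_unregister_atexit pair replaces 0.2.0's bare atexit.register in __init__: the flag makes registration idempotent, and _cleanup_spillover unregisters itself first so an explicit call followed by interpreter shutdown cannot run cleanup twice. A minimal sketch of the same pattern in isolation:

```python
import atexit

class Resource:
    def __init__(self) -> None:
        self._registered = False
        self._register()

    def _register(self) -> None:
        if not self._registered:  # guard: registering twice is a no-op
            atexit.register(self._cleanup)
            self._registered = True

    def _cleanup(self) -> None:
        # Drop the registration first, so calling _cleanup() explicitly and
        # then exiting the interpreter runs the body exactly once.
        if self._registered:
            atexit.unregister(self._cleanup)
            self._registered = False
        print("cleaned up once")

r = Resource()
r._cleanup()  # prints once; the atexit hook is already gone at shutdown
```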
@@ -367,12 +480,48 @@ class InMemoryLineageStore:
                 }
             )
 
-        return sorted(events, key=lambda e: e["step_id"])
+        # ENFORCE: sort by step_id (chronological)
+        events.sort(key=lambda e: e["step_id"])
+
+        return events
+
+    def get_row_history_with_lineage(self, row_id: int, max_depth: int = 10) -> list[dict]:
+        """
+        Get row history including pre-merge parent history.
+
+        Follows merge lineage recursively to build complete cell provenance.
+        This is essential for tracking changes that happened before merge operations.
+
+        Args:
+            row_id: Row ID to trace
+            max_depth: Maximum merge depth to follow (prevents infinite loops)
+
+        Returns:
+            List of events in chronological order, including parent row events.
+        """
+        visited: set[int] = set()
+
+        def _collect_history(rid: int, depth: int) -> list[dict]:
+            if depth > max_depth or rid in visited:
+                return []
+            visited.add(rid)
+
+            events = []
+
+            # Check if this row came from a merge
+            origin = self.get_merge_origin(rid)
+            if origin and origin["left_parent"] is not None:
+                # Recursively get parent's history first (chronological order)
+                parent_events = _collect_history(origin["left_parent"], depth + 1)
+                events.extend(parent_events)
+
+            # Add this row's direct events
+            events.extend(self.get_row_history(rid))
+
+            return events
+
+        all_events = _collect_history(row_id, 0)
+
+        # Sort by step_id to ensure chronological order across lineage
+        all_events.sort(key=lambda e: e["step_id"])
+
+        return all_events
+
+    def get_cell_history_with_lineage(
+        self, row_id: int, column: str, max_depth: int = 10
+    ) -> list[dict]:
+        """
+        Get cell history for a specific column, including pre-merge parent history.
+
+        Args:
+            row_id: Row ID to trace
+            column: Column name to filter events for
+            max_depth: Maximum merge depth to follow
+
+        Returns:
+            List of events for this column in chronological order.
+        """
+        all_events = self.get_row_history_with_lineage(row_id, max_depth)
+        return [e for e in all_events if e["col"] == column]
@@ -395,16 +544,81 @@ class InMemoryLineageStore:
     def get_row_history(self, row_id: int) -> list[dict]:
-        """Get all events for a specific row."""
+        """
+        Get all events for a specific row in CHRONOLOGICAL order (oldest first).
+
+        CONTRACT: Returned list has monotonically increasing step_id.
+        Convenience layer may reverse for display.
+
+        Note: This returns only direct events for this row_id.
+        Use get_row_history_with_lineage() to include pre-merge parent history.
+        """
         step_map = {s.step_id: s for s in self._steps}
         events = []
 
+        # Collect from bulk_drops (sorted by step_id)
+        for step_id in sorted(self.bulk_drops.keys()):
+            dropped_rids = self.bulk_drops[step_id]
+            i = np.searchsorted(dropped_rids, row_id)
+            if i < len(dropped_rids) and dropped_rids[i] == row_id:
+                step = step_map.get(step_id)
+                events.append(
+                    {
+                        "step_id": step_id,
+                        "operation": step.operation if step else "unknown",
+                        "stage": step.stage if step else None,
+                        "col": "__row__",
+                        "old_val": "present",
+                        "old_type": "str",
+                        "new_val": None,
+                        "new_type": "null",
+                        "change_type": "DROPPED",
+                        "timestamp": step.timestamp if step else None,
+                        "completeness": step.completeness.name if step else "UNKNOWN",
+                        "code_location": (
+                            f"{step.code_file}:{step.code_line}"
+                            if step and step.code_file
+                            else None
+                        ),
+                    }
+                )
+
+        # Collect from diffs
         for diff in self._iter_all_diffs():
-            if diff["row_id"] == row_id:
+            if diff["row_id"] == row_id and diff["col"] != "__row__":
                 step = step_map.get(diff["step_id"])
                 events.append(
                     {
 
     def get_dropped_rows(self, step_id: Optional[int] = None) -> list[int]:
         """Get all dropped row IDs, optionally filtered by step."""
-        dropped = set()
+        if step_id is not None:
+            if step_id in self.bulk_drops:
+                return self.bulk_drops[step_id].tolist()
+            return []
 
-        for diff in self._iter_all_diffs():
-            if diff["change_type"] == ChangeType.DROPPED:
-                if step_id is None or diff["step_id"] == step_id:
-                    dropped.add(diff["row_id"])
+        # Collect all dropped rows
+        dropped = set()
+        for rids in self.bulk_drops.values():
+            dropped.update(rids.tolist())
 
         return sorted(dropped)
 
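
get_row_history_with_lineage guards its recursion twice — a visited set breaks reference cycles and max_depth caps chain length — and it deliberately follows only left parents. A toy version of the same walk over a plain child-to-parent dict, showing why the visited set terminates even on cyclic input:

```python
def collect_chain(parents: dict[int, int], rid: int, max_depth: int = 10) -> list[int]:
    """Walk rid -> parent -> grandparent..., oldest first, with both guards."""
    visited: set[int] = set()

    def _walk(r: int, depth: int) -> list[int]:
        if depth > max_depth or r in visited:  # the store's two guards
            return []
        visited.add(r)
        parent = parents.get(r)
        chain = _walk(parent, depth + 1) if parent is not None else []
        return chain + [r]                     # parents first = chronological

    return _walk(rid, 0)

parents = {30: 20, 20: 10, 10: 30}  # deliberately cyclic lineage
assert collect_chain(parents, 30) == [10, 20, 30]  # visited set breaks the cycle
```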
@@ -413,10 +627,9 @@ class InMemoryLineageStore:
         step_map = {s.step_id: s.operation for s in self._steps}
         counts: dict[str, int] = {}
 
-        for diff in self._iter_all_diffs():
-            if diff["change_type"] == ChangeType.DROPPED:
-                op = step_map.get(diff["step_id"], "unknown")
-                counts[op] = counts.get(op, 0) + 1
+        for step_id, rids in self.bulk_drops.items():
+            op = step_map.get(step_id, "unknown")
+            counts[op] = counts.get(op, 0) + len(rids)
 
         return dict(sorted(counts.items(), key=lambda x: -x[1]))
 
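
get_dropped_by_step now aggregates per-step array lengths instead of scanning every diff. Its final line orders operations by descending drop count, which relies on dicts preserving insertion order (guaranteed since Python 3.7):

```python
counts = {"dropna": 12, "drop_duplicates": 40, "query": 3}
ordered = dict(sorted(counts.items(), key=lambda x: -x[1]))
assert list(ordered) == ["drop_duplicates", "dropna", "query"]
```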
@@ -458,6 +671,13 @@ class InMemoryLineageStore:
         gaps = []
         row_step_ids = set()
 
+        # From bulk_drops
+        for step_id, rids in self.bulk_drops.items():
+            i = np.searchsorted(rids, row_id)
+            if i < len(rids) and rids[i] == row_id:
+                row_step_ids.add(step_id)
+
+        # From diffs
         for diff in self._iter_all_diffs():
             if diff["row_id"] == row_id:
                 row_step_ids.add(diff["step_id"])
@@ -490,28 +710,12 @@ class InMemoryLineageStore:
         diffs = list(self._iter_all_diffs())
 
         data = {
-            "tracepipe_version": "0.2.0",
+            "tracepipe_version": "0.3.1",
             "export_timestamp": time.time(),
             "total_diffs": len(diffs),
             "total_steps": len(self._steps),
             "diffs": diffs,
-            "steps": [
-                {
-                    "step_id": s.step_id,
-                    "operation": s.operation,
-                    "stage": s.stage,
-                    "timestamp": s.timestamp,
-                    "code_file": s.code_file,
-                    "code_line": s.code_line,
-                    "params": s.params,
-                    "input_shape": s.input_shape,
-                    "output_shape": s.output_shape,
-                    "is_mass_update": s.is_mass_update,
-                    "rows_affected": s.rows_affected,
-                    "completeness": s.completeness.name,
-                }
-                for s in self._steps
-            ],
+            "steps": [s.to_dict() for s in self._steps],
             "aggregation_mappings": [
                 {
                     "step_id": a.step_id,
@@ -521,6 +725,7 @@ class InMemoryLineageStore:
                 }
                 for a in self.aggregation_mappings
             ],
+            "merge_stats": [{"step_id": sid, **vars(stats)} for sid, stats in self.merge_stats],
         }
 
         return json.dumps(data)
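
The new merge_stats entry serializes each stats object by splatting vars(stats) next to its step id — a trick that works for any object whose state lives in __dict__, including non-slots dataclasses. A sketch with a stand-in class, since MergeStats' actual fields are not visible in this diff:

```python
import json
from dataclasses import dataclass

@dataclass
class MergeStatsStub:  # stand-in; real MergeStats fields are not shown here
    rows_left: int
    rows_right: int
    rows_out: int

merge_stats = [(4, MergeStatsStub(rows_left=100, rows_right=80, rows_out=90))]

payload = [{"step_id": sid, **vars(stats)} for sid, stats in merge_stats]
print(json.dumps(payload))
# [{"step_id": 4, "rows_left": 100, "rows_right": 80, "rows_out": 90}]
# Caveat: vars() raises TypeError for slots-based objects, which have no __dict__.
```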