tracepipe-0.2.0-py3-none-any.whl → tracepipe-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,11 @@
 In-memory lineage storage using Structure of Arrays (SoA) pattern.
 
 Memory: ~40 bytes/diff vs ~150 bytes with dataclass
+
+Features:
+- Merge mapping storage with O(log n) lookup via binary search
+- Sorted bulk drops for efficient drop event lookup
+- Stable API for api/convenience/visualization layers
 """
 
 import atexit
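The memory figure above reflects the Structure-of-Arrays layout: one Python list per field instead of one object per cell diff, so per-object overhead is paid once per column rather than once per record. A minimal sketch of the idea (the `CellDiff`/`DiffColumns` names are illustrative, not classes from this package):

```python
from dataclasses import dataclass

@dataclass
class CellDiff:
    # AoS: every diff is a full object (header + per-instance storage).
    step_id: int
    row_id: int
    col: str

class DiffColumns:
    # SoA: three parallel lists; element i of each list describes diff i.
    def __init__(self) -> None:
        self.step_ids: list[int] = []
        self.row_ids: list[int] = []
        self.cols: list[str] = []

    def append(self, step_id: int, row_id: int, col: str) -> None:
        self.step_ids.append(step_id)
        self.row_ids.append(row_id)
        self.cols.append(col)
```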
@@ -11,13 +16,17 @@ import time
 from pathlib import Path
 from typing import Any, Optional
 
+import numpy as np
+
 from ..core import (
     AggregationMapping,
     ChangeType,
     CompletenessLevel,
     LineageGap,
     LineageGaps,
-    StepMetadata,
+    MergeMapping,
+    MergeStats,
+    StepEvent,
     TracePipeConfig,
 )
 from ..utils.value_capture import capture_typed_value
@@ -29,9 +38,26 @@ class InMemoryLineageStore:
 
     Implements: LineageBackend protocol
 
-    Future alternatives:
-    - SQLiteLineageStore: Persistent storage for long-running pipelines
-    - DeltaLakeBackend: Distributed storage for big data
+    STABLE INTERNAL API (used by api.py, convenience.py, visualization):
+
+    === ATTRIBUTES (read-only from outside) ===
+    steps: list[StepEvent]                      # All recorded steps
+    bulk_drops: dict[int, np.ndarray]           # step_id -> sorted dropped RIDs
+    merge_mappings: list[MergeMapping]          # Merge parent mappings (debug mode)
+    merge_stats: list[tuple[int, MergeStats]]   # (step_id, stats) pairs
+
+    === WRITE METHODS (called by instrumentation) ===
+    append_step(...) -> int                     # Returns step_id
+    append_bulk_drops(step_id, rids)            # rids will be sorted internally
+    append_diff(step_id, row_id, col, ...)      # Cell-level diff
+
+    === READ METHODS (called by api/convenience) ===
+    get_drop_event(row_id) -> Optional[dict]    # {step_id, operation}
+    get_dropped_rows() -> list[int]             # All dropped RIDs
+    get_dropped_by_step() -> dict[str, int]     # operation -> count
+    get_row_history(row_id) -> list[dict]       # Chronological events
+    get_merge_stats(step_id=None) -> list[tuple[int, MergeStats]]
+    get_merge_origin(row_id) -> Optional[dict]  # {left_parent, right_parent, step_id}
     """
 
     def __init__(self, config: TracePipeConfig):
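A hypothetical consumer of this stable surface might look as follows. The `append_step` keyword arguments and the no-argument `TracePipeConfig()` construction are assumptions based on fields visible elsewhere in this diff, not a documented signature:

```python
# Hedged usage sketch of the documented read/write surface.
store = InMemoryLineageStore(TracePipeConfig())

# append_step(...) -> int; the exact kwargs here are an assumption.
step_id = store.append_step(operation="dropna", stage="clean")

store.append_bulk_drops(step_id, [11, 3, 7])  # sorted internally

store.get_drop_event(3)   # expected: {"step_id": step_id, "operation": "dropna"}
store.get_dropped_rows()  # expected: [3, 7, 11]
store.is_dropped(99)      # expected: False
```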
@@ -49,7 +75,14 @@ class InMemoryLineageStore:
         self.diff_change_types: list[int] = []
 
         # === STEP METADATA ===
-        self._steps: list[StepMetadata] = []
+        self._steps: list[StepEvent] = []
+
+        # === BULK DROPS (step_id -> SORTED numpy array) ===
+        self.bulk_drops: dict[int, np.ndarray] = {}
+
+        # === MERGE TRACKING ===
+        self.merge_mappings: list[MergeMapping] = []
+        self.merge_stats: list[tuple[int, MergeStats]] = []
 
         # === AGGREGATION MAPPINGS ===
         self.aggregation_mappings: list[AggregationMapping] = []
@@ -66,11 +99,12 @@ class InMemoryLineageStore:
         self._col_intern: dict[str, str] = {}
         self._type_intern: dict[str, str] = {}
 
-        # Register cleanup on exit
-        atexit.register(self._cleanup_spillover)
+        # === ATEXIT HANDLER ===
+        self._atexit_registered: bool = False
+        self._register_atexit()
 
     @property
-    def steps(self) -> list[StepMetadata]:
+    def steps(self) -> list[StepEvent]:
         """Access step metadata list."""
         return self._steps
 
@@ -150,8 +184,7 @@ class InMemoryLineageStore:
         """
         Bulk append dropped rows - optimized for filter operations.
 
-        Uses list.extend() for O(1) amortized append instead of O(n) individual appends.
-        Typically 10-50x faster than calling append_diff() in a loop.
+        Stores dropped RIDs SORTED for O(log n) lookup via searchsorted.
 
         Args:
             step_id: Step ID for all drops
@@ -160,24 +193,28 @@ class InMemoryLineageStore:
         Returns:
             Number of drops recorded
         """
-        import numpy as np
-
         n = len(dropped_row_ids)
         if n == 0:
             return 0
 
-        # Convert to list if numpy array
+        # Convert to sorted numpy array
         if isinstance(dropped_row_ids, np.ndarray):
-            row_ids_list = dropped_row_ids.tolist()
+            sorted_rids = np.sort(dropped_row_ids.astype(np.int64))
         else:
-            row_ids_list = list(dropped_row_ids)
+            sorted_rids = np.sort(np.array(list(dropped_row_ids), dtype=np.int64))
+
+        # Store sorted for O(log n) lookup
+        self.bulk_drops[step_id] = sorted_rids
+
+        # Also record in diff arrays for backwards compatibility
+        row_ids_list = sorted_rids.tolist()
 
         # Pre-intern the constant strings once
         col_interned = self._intern_string("__row__", self._col_intern)
         old_type_interned = self._intern_string("str", self._type_intern)
         new_type_interned = self._intern_string("null", self._type_intern)
 
-        # Bulk extend all arrays at once (much faster than individual appends)
+        # Bulk extend all arrays at once
         self.diff_step_ids.extend([step_id] * n)
         self.diff_row_ids.extend(row_ids_list)
         self.diff_cols.extend([col_interned] * n)
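The sort-once, binary-search-later pattern that `append_bulk_drops` sets up here can be shown in isolation; this is a standalone sketch of the same numpy idiom, not code from the package:

```python
import numpy as np

# Sort once at write time...
rids = np.sort(np.array([42, 7, 19, 3], dtype=np.int64))  # -> [3, 7, 19, 42]

def contains(sorted_arr: np.ndarray, value: int) -> bool:
    # ...then every membership probe is O(log n) via binary search.
    i = np.searchsorted(sorted_arr, value)
    return i < len(sorted_arr) and sorted_arr[i] == value

assert contains(rids, 19) and not contains(rids, 20)
```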
@@ -212,7 +249,7 @@ class InMemoryLineageStore:
         """Append step metadata and return step_id."""
         step_id = self.next_step_id()
         self._steps.append(
-            StepMetadata(
+            StepEvent(
                 step_id=step_id,
                 operation=operation,
                 stage=stage,
@@ -250,6 +287,64 @@ class InMemoryLineageStore:
         """Return False for mass updates exceeding threshold."""
         return affected_count <= self.config.max_diffs_per_step
 
+    # === DROP LOOKUP (O(log n) via searchsorted) ===
+
+    def get_drop_event(self, row_id: int) -> Optional[dict]:
+        """
+        Get drop event for a row from bulk_drops.
+
+        O(log n) per step via searchsorted.
+
+        Returns:
+            {step_id, operation} if dropped, else None.
+        """
+        for step_id, dropped_rids in self.bulk_drops.items():
+            # Binary search in sorted array
+            i = np.searchsorted(dropped_rids, row_id)
+            if i < len(dropped_rids) and dropped_rids[i] == row_id:
+                step = self._steps[step_id - 1] if step_id <= len(self._steps) else None
+                return {
+                    "step_id": step_id,
+                    "operation": step.operation if step else "unknown",
+                }
+        return None
+
+    def is_dropped(self, row_id: int) -> bool:
+        """Fast check if row was dropped anywhere."""
+        return self.get_drop_event(row_id) is not None
+
+    # === MERGE LOOKUP (O(log n) via searchsorted) ===
+
+    def get_merge_origin(self, row_id: int) -> Optional[dict]:
+        """
+        Get merge parent RIDs for a row.
+
+        Uses binary search (O(log n)) instead of linear scan (O(n)).
+        """
+        for mapping in self.merge_mappings:
+            # Binary search on sorted out_rids
+            i = np.searchsorted(mapping.out_rids, row_id)
+            if i < len(mapping.out_rids) and mapping.out_rids[i] == row_id:
+                left_parent = mapping.left_parent_rids[i]
+                right_parent = mapping.right_parent_rids[i]
+                return {
+                    "step_id": mapping.step_id,
+                    "left_parent": int(left_parent) if left_parent >= 0 else None,
+                    "right_parent": int(right_parent) if right_parent >= 0 else None,
+                }
+        return None
+
+    def get_merge_stats(self, step_id: Optional[int] = None) -> list[tuple[int, MergeStats]]:
+        """
+        Get merge statistics.
+
+        Returns:
+            list of (step_id, MergeStats) tuples - ALWAYS this shape for consistency.
+        """
+        if step_id is not None:
+            return [(sid, s) for sid, s in self.merge_stats if sid == step_id]
+        return list(self.merge_stats)
+
     # === MEMORY MANAGEMENT ===
 
     def _check_memory_and_spill(self) -> None:
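`get_merge_origin` relies on each `MergeMapping` holding arrays that are aligned index-for-index, with `out_rids` sorted and `-1` as the no-parent sentinel. An illustrative stand-in with plain numpy arrays (values invented for the example):

```python
import numpy as np

# Stand-in for one MergeMapping: element i of each array describes output row i.
out_rids = np.array([10, 11, 12], dtype=np.int64)         # sorted output RIDs
left_parent_rids = np.array([1, 2, -1], dtype=np.int64)   # aligned with out_rids
right_parent_rids = np.array([5, -1, 6], dtype=np.int64)  # -1 = no parent

row_id = 11
i = np.searchsorted(out_rids, row_id)
if i < len(out_rids) and out_rids[i] == row_id:
    left, right = int(left_parent_rids[i]), int(right_parent_rids[i])
    origin = {
        "left_parent": left if left >= 0 else None,    # -> 2
        "right_parent": right if right >= 0 else None, # -> None (left-only row)
    }
```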
@@ -307,8 +402,26 @@ class InMemoryLineageStore:
         self.diff_change_types.clear()
         self._diff_count = 0
 
+    def _register_atexit(self) -> None:
+        """Register cleanup handler if not already registered."""
+        if not self._atexit_registered:
+            atexit.register(self._cleanup_spillover)
+            self._atexit_registered = True
+
+    def _unregister_atexit(self) -> None:
+        """Unregister cleanup handler."""
+        if self._atexit_registered:
+            try:
+                atexit.unregister(self._cleanup_spillover)
+            except Exception:
+                pass
+            self._atexit_registered = False
+
     def _cleanup_spillover(self) -> None:
         """Clean up spillover files on exit."""
+        # Unregister to prevent multiple calls
+        self._unregister_atexit()
+
         if not self.config.cleanup_spillover_on_disable:
             return
 
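The register/unregister pair makes the atexit hook idempotent: a flag prevents double registration, and the handler detaches itself before doing work, so a manual cleanup call plus interpreter exit cannot run the cleanup twice. A self-contained version of the same pattern (`TempFileOwner` is an illustrative name, not from the package):

```python
import atexit

class TempFileOwner:
    """Illustrative owner of temp files using the same guarded-atexit pattern."""

    def __init__(self) -> None:
        self._registered = False
        if not self._registered:
            atexit.register(self._cleanup)
            self._registered = True

    def _cleanup(self) -> None:
        # Detach first, so explicit calls and interpreter exit never overlap.
        if self._registered:
            atexit.unregister(self._cleanup)
            self._registered = False
        print("removing temp files")  # stand-in for real cleanup work
```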
@@ -367,12 +480,45 @@ class InMemoryLineageStore:
         }
 
     def get_row_history(self, row_id: int) -> list[dict]:
-        """Get all events for a specific row."""
+        """
+        Get all events for a specific row in CHRONOLOGICAL order (oldest first).
+
+        CONTRACT: Returned list has monotonically increasing step_id.
+        Convenience layer may reverse for display.
+        """
         step_map = {s.step_id: s for s in self._steps}
         events = []
 
+        # Collect from bulk_drops (sorted by step_id)
+        for step_id in sorted(self.bulk_drops.keys()):
+            dropped_rids = self.bulk_drops[step_id]
+            i = np.searchsorted(dropped_rids, row_id)
+            if i < len(dropped_rids) and dropped_rids[i] == row_id:
+                step = step_map.get(step_id)
+                events.append(
+                    {
+                        "step_id": step_id,
+                        "operation": step.operation if step else "unknown",
+                        "stage": step.stage if step else None,
+                        "col": "__row__",
+                        "old_val": "present",
+                        "old_type": "str",
+                        "new_val": None,
+                        "new_type": "null",
+                        "change_type": "DROPPED",
+                        "timestamp": step.timestamp if step else None,
+                        "completeness": step.completeness.name if step else "UNKNOWN",
+                        "code_location": (
+                            f"{step.code_file}:{step.code_line}"
+                            if step and step.code_file
+                            else None
+                        ),
+                    }
+                )
+
+        # Collect from diffs
         for diff in self._iter_all_diffs():
-            if diff["row_id"] == row_id:
+            if diff["row_id"] == row_id and diff["col"] != "__row__":
                 step = step_map.get(diff["step_id"])
                 events.append(
                     {
@@ -395,16 +541,22 @@ class InMemoryLineageStore:
                     }
                 )
 
-        return sorted(events, key=lambda e: e["step_id"])
+        # ENFORCE: sort by step_id (chronological)
+        events.sort(key=lambda e: e["step_id"])
+
+        return events
 
     def get_dropped_rows(self, step_id: Optional[int] = None) -> list[int]:
         """Get all dropped row IDs, optionally filtered by step."""
-        dropped = set()
+        if step_id is not None:
+            if step_id in self.bulk_drops:
+                return self.bulk_drops[step_id].tolist()
+            return []
 
-        for diff in self._iter_all_diffs():
-            if diff["change_type"] == ChangeType.DROPPED:
-                if step_id is None or diff["step_id"] == step_id:
-                    dropped.add(diff["row_id"])
+        # Collect all dropped rows
+        dropped = set()
+        for rids in self.bulk_drops.values():
+            dropped.update(rids.tolist())
 
         return sorted(dropped)
 
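Given a populated store, the chronological contract of `get_row_history` lets a display layer reverse the list without re-sorting. A sketch, assuming the `store` from the earlier usage example:

```python
history = store.get_row_history(row_id=3)

# Contract: step_id is monotonically increasing (oldest first).
assert all(a["step_id"] <= b["step_id"] for a, b in zip(history, history[1:]))

# A UI that wants newest-first simply reverses.
for event in reversed(history):
    print(event["step_id"], event["operation"], event["change_type"])
```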
@@ -413,10 +565,9 @@ class InMemoryLineageStore:
         step_map = {s.step_id: s.operation for s in self._steps}
         counts: dict[str, int] = {}
 
-        for diff in self._iter_all_diffs():
-            if diff["change_type"] == ChangeType.DROPPED:
-                op = step_map.get(diff["step_id"], "unknown")
-                counts[op] = counts.get(op, 0) + 1
+        for step_id, rids in self.bulk_drops.items():
+            op = step_map.get(step_id, "unknown")
+            counts[op] = counts.get(op, 0) + len(rids)
 
         return dict(sorted(counts.items(), key=lambda x: -x[1]))
 
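`get_dropped_by_step` returns its counts ordered by descending drop count. The ordering idiom in isolation, with made-up example data:

```python
counts = {"dropna": 120, "filter_outliers": 340, "dedupe": 45}  # example data

# Negating the value sorts largest-first; dict() preserves that order.
ordered = dict(sorted(counts.items(), key=lambda x: -x[1]))
print(ordered)  # {'filter_outliers': 340, 'dropna': 120, 'dedupe': 45}
```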
@@ -458,6 +609,13 @@ class InMemoryLineageStore:
         gaps = []
         row_step_ids = set()
 
+        # From bulk_drops
+        for step_id, rids in self.bulk_drops.items():
+            i = np.searchsorted(rids, row_id)
+            if i < len(rids) and rids[i] == row_id:
+                row_step_ids.add(step_id)
+
+        # From diffs
         for diff in self._iter_all_diffs():
             if diff["row_id"] == row_id:
                 row_step_ids.add(diff["step_id"])
@@ -490,28 +648,12 @@ class InMemoryLineageStore:
         diffs = list(self._iter_all_diffs())
 
         data = {
-            "tracepipe_version": "0.2.0",
+            "tracepipe_version": "0.3.0",
             "export_timestamp": time.time(),
             "total_diffs": len(diffs),
             "total_steps": len(self._steps),
             "diffs": diffs,
-            "steps": [
-                {
-                    "step_id": s.step_id,
-                    "operation": s.operation,
-                    "stage": s.stage,
-                    "timestamp": s.timestamp,
-                    "code_file": s.code_file,
-                    "code_line": s.code_line,
-                    "params": s.params,
-                    "input_shape": s.input_shape,
-                    "output_shape": s.output_shape,
-                    "is_mass_update": s.is_mass_update,
-                    "rows_affected": s.rows_affected,
-                    "completeness": s.completeness.name,
-                }
-                for s in self._steps
-            ],
+            "steps": [s.to_dict() for s in self._steps],
             "aggregation_mappings": [
                 {
                     "step_id": a.step_id,
@@ -521,6 +663,7 @@ class InMemoryLineageStore:
                 }
                 for a in self.aggregation_mappings
             ],
+            "merge_stats": [{"step_id": sid, **vars(stats)} for sid, stats in self.merge_stats],
         }
 
         return json.dumps(data)
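The new `merge_stats` entry splices each stats object's fields in next to `step_id` via `vars(stats)`, i.e. its `__dict__` (which assumes `MergeStats` is a regular, non-slots class). The resulting shape, with illustrative fields since the real `MergeStats` definition is not shown in this diff:

```python
from dataclasses import dataclass

@dataclass
class MergeStats:
    # Illustrative fields; the package's actual class may differ.
    matched: int
    left_only: int

# The splice used in the export above: vars() flattens the instance's
# fields into the same dict as step_id.
row = {"step_id": 7, **vars(MergeStats(matched=90, left_only=10))}
print(row)  # {'step_id': 7, 'matched': 90, 'left_only': 10}
```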