tracepipe-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,556 @@
+ # tracepipe/storage/lineage_store.py
+ """
+ In-memory lineage storage using Structure of Arrays (SoA) pattern.
+
+ Memory: ~40 bytes/diff vs ~150 bytes with dataclass
+ """
+
+ import atexit
+ import json
+ import time
+ from pathlib import Path
+ from typing import Any, Optional
+
+ from ..core import (
+     AggregationMapping,
+     ChangeType,
+     CompletenessLevel,
+     LineageGap,
+     LineageGaps,
+     StepMetadata,
+     TracePipeConfig,
+ )
+ from ..utils.value_capture import capture_typed_value
+
+
+ class InMemoryLineageStore:
+     """
+     Columnar storage for lineage data using Structure of Arrays (SoA).
+
+     Implements: LineageBackend protocol
+
+     Future alternatives:
+     - SQLiteLineageStore: Persistent storage for long-running pipelines
+     - DeltaLakeBackend: Distributed storage for big data
+     """
+
+     def __init__(self, config: TracePipeConfig):
+         self.config = config
+         self._spillover_dir = Path(config.spillover_dir)
+
+         # === DIFF STORAGE (Columnar) ===
+         self.diff_step_ids: list[int] = []
+         self.diff_row_ids: list[int] = []
+         self.diff_cols: list[str] = []
+         self.diff_old_vals: list[Any] = []
+         self.diff_old_types: list[str] = []
+         self.diff_new_vals: list[Any] = []
+         self.diff_new_types: list[str] = []
+         self.diff_change_types: list[int] = []
+
+         # === STEP METADATA ===
+         self._steps: list[StepMetadata] = []
+
+         # === AGGREGATION MAPPINGS ===
+         self.aggregation_mappings: list[AggregationMapping] = []
+
+         # === SPILLOVER TRACKING ===
+         self.spilled_files: list[str] = []
+
+         # === COUNTERS ===
+         self._step_counter: int = 0
+         self._diff_count: int = 0
+         self._total_diff_count: int = 0  # Including spilled
+
+         # === STRING INTERNING ===
+         self._col_intern: dict[str, str] = {}
+         self._type_intern: dict[str, str] = {}
+
+         # Register cleanup on exit
+         atexit.register(self._cleanup_spillover)
+
+     @property
+     def steps(self) -> list[StepMetadata]:
+         """Access step metadata list."""
+         return self._steps
+
+     def _intern_string(self, s: str, cache: dict[str, str]) -> str:
+         """Intern string to avoid duplicate allocations."""
+         if s not in cache:
+             cache[s] = s
+         return cache[s]
+
+     def next_step_id(self) -> int:
+         """Generate next step ID."""
+         self._step_counter += 1
+         return self._step_counter
+
+     @property
+     def diff_count(self) -> int:
+         """In-memory diff count."""
+         return self._diff_count
+
+     @property
+     def total_diff_count(self) -> int:
+         """Total diffs including spilled."""
+         return self._total_diff_count
+
+     def append_diff(
+         self,
+         step_id: int,
+         row_id: int,
+         col: str,
+         old_val: Any,
+         new_val: Any,
+         change_type: ChangeType,
+     ) -> None:
+         """Append a single diff in columnar format."""
+         old_val, old_type = capture_typed_value(old_val)
+         new_val, new_type = capture_typed_value(new_val)
+
+         self.diff_step_ids.append(step_id)
+         self.diff_row_ids.append(row_id)
+         self.diff_cols.append(self._intern_string(col, self._col_intern))
+         self.diff_old_vals.append(old_val)
+         self.diff_old_types.append(self._intern_string(old_type, self._type_intern))
+         self.diff_new_vals.append(new_val)
+         self.diff_new_types.append(self._intern_string(new_type, self._type_intern))
+         self.diff_change_types.append(int(change_type))
+
+         self._diff_count += 1
+         self._total_diff_count += 1
+
+         # Check memory every 10k diffs
+         if self._diff_count % 10_000 == 0:
+             self._check_memory_and_spill()
+
+     def append_diff_batch(
+         self, step_id: int, diffs: list[tuple], check_threshold: bool = True
+     ) -> int:
+         """
+         Batch append for performance.
+
+         Args:
+             step_id: Step ID for all diffs
+             diffs: List of (row_id, col, old_val, new_val, change_type)
+             check_threshold: If True, skip the batch (return 0) when it exceeds max_diffs_per_step
+
+         Returns:
+             Number of diffs actually appended
+         """
+         if check_threshold and len(diffs) > self.config.max_diffs_per_step:
+             return 0  # Caller should log as mass update
+
+         for row_id, col, old_val, new_val, change_type in diffs:
+             self.append_diff(step_id, row_id, col, old_val, new_val, change_type)
+
+         return len(diffs)
+
+     def append_bulk_drops(self, step_id: int, dropped_row_ids) -> int:
+         """
+         Bulk append dropped rows - optimized for filter operations.
+
+         Uses one list.extend() per column with pre-built constant lists instead of n
+         individual appends; typically 10-50x faster than calling append_diff() in a loop.
+
+         Args:
+             step_id: Step ID for all drops
+             dropped_row_ids: Array-like of row IDs that were dropped
+
+         Returns:
+             Number of drops recorded
+         """
+         import numpy as np
+
+         n = len(dropped_row_ids)
+         if n == 0:
+             return 0
+
+         # Convert to list if numpy array
+         if isinstance(dropped_row_ids, np.ndarray):
+             row_ids_list = dropped_row_ids.tolist()
+         else:
+             row_ids_list = list(dropped_row_ids)
+
+         # Pre-intern the constant strings once
+         col_interned = self._intern_string("__row__", self._col_intern)
+         old_type_interned = self._intern_string("str", self._type_intern)
+         new_type_interned = self._intern_string("null", self._type_intern)
+
+         # Bulk extend all arrays at once (much faster than individual appends)
+         self.diff_step_ids.extend([step_id] * n)
+         self.diff_row_ids.extend(row_ids_list)
+         self.diff_cols.extend([col_interned] * n)
+         self.diff_old_vals.extend(["present"] * n)
+         self.diff_old_types.extend([old_type_interned] * n)
+         self.diff_new_vals.extend([None] * n)
+         self.diff_new_types.extend([new_type_interned] * n)
+         self.diff_change_types.extend([int(ChangeType.DROPPED)] * n)
+
+         self._diff_count += n
+         self._total_diff_count += n
+
+         # Check memory threshold
+         if self._diff_count >= self.config.max_diffs_in_memory:
+             self._check_memory_and_spill()
+
+         return n
+
+     def append_step(
+         self,
+         operation: str,
+         stage: Optional[str],
+         code_file: Optional[str],
+         code_line: Optional[int],
+         params: dict[str, Any],
+         input_shape: Optional[tuple],
+         output_shape: Optional[tuple],
+         completeness: CompletenessLevel = CompletenessLevel.FULL,
+         is_mass_update: bool = False,
+         rows_affected: int = 0,
+     ) -> int:
+         """Append step metadata and return step_id."""
+         step_id = self.next_step_id()
+         self._steps.append(
+             StepMetadata(
+                 step_id=step_id,
+                 operation=operation,
+                 stage=stage,
+                 timestamp=time.time(),
+                 code_file=code_file,
+                 code_line=code_line,
+                 params=params,
+                 input_shape=input_shape,
+                 output_shape=output_shape,
+                 completeness=completeness,
+                 is_mass_update=is_mass_update,
+                 rows_affected=rows_affected,
+             )
+         )
+         return step_id
+
+     def append_aggregation(
+         self,
+         step_id: int,
+         group_column: str,
+         membership: dict[str, list[int]],
+         agg_functions: dict[str, str],
+     ) -> None:
+         """Record aggregation group membership."""
+         self.aggregation_mappings.append(
+             AggregationMapping(
+                 step_id=step_id,
+                 group_column=group_column,
+                 membership=membership,
+                 agg_functions=agg_functions,
+             )
+         )
+
+     def should_track_cell_diffs(self, affected_count: int) -> bool:
+         """Return False for mass updates exceeding threshold."""
+         return affected_count <= self.config.max_diffs_per_step
+
+     # === MEMORY MANAGEMENT ===
+
+     def _check_memory_and_spill(self) -> None:
+         """Spill to disk based on count threshold."""
+         if self._diff_count < self.config.max_diffs_in_memory:
+             return
+
+         # Optional: use psutil for real memory check
+         try:
+             import psutil
+
+             process = psutil.Process()
+             mem_mb = process.memory_info().rss / (1024 * 1024)
+             if mem_mb < 500:
+                 return
+         except ImportError:
+             pass
+
+         self._spill_to_disk()
+
+     def _spill_to_disk(self) -> None:
+         """Spill current diffs to disk and clear memory."""
+         self._spillover_dir.mkdir(exist_ok=True)
+
+         filename = f"diffs_{int(time.time() * 1000)}_{self._diff_count}.json"
+         filepath = self._spillover_dir / filename
+
+         data = {
+             "step_ids": self.diff_step_ids,
+             "row_ids": self.diff_row_ids,
+             "cols": self.diff_cols,
+             "old_vals": self.diff_old_vals,
+             "old_types": self.diff_old_types,
+             "new_vals": self.diff_new_vals,
+             "new_types": self.diff_new_types,
+             "change_types": self.diff_change_types,
+         }
+
+         with open(filepath, "w") as f:
+             json.dump(data, f)
+         self.spilled_files.append(str(filepath))
+
+         # Clear in-memory arrays
+         self._clear_in_memory()
+
+     def _clear_in_memory(self) -> None:
+         """Clear in-memory diff arrays."""
+         self.diff_step_ids.clear()
+         self.diff_row_ids.clear()
+         self.diff_cols.clear()
+         self.diff_old_vals.clear()
+         self.diff_old_types.clear()
+         self.diff_new_vals.clear()
+         self.diff_new_types.clear()
+         self.diff_change_types.clear()
+         self._diff_count = 0
+
+     def _cleanup_spillover(self) -> None:
+         """Clean up spillover files on exit."""
+         if not self.config.cleanup_spillover_on_disable:
+             return
+
+         for filepath in self.spilled_files:
+             try:
+                 Path(filepath).unlink(missing_ok=True)
+             except Exception:
+                 pass
+
+         # Try to remove directory if empty
+         try:
+             if self._spillover_dir.exists() and not any(self._spillover_dir.iterdir()):
+                 self._spillover_dir.rmdir()
+         except Exception:
+             pass
+
+     # === QUERY METHODS ===
+
+     def _iter_all_diffs(self):
+         """
+         Iterate over all diffs (spilled + in-memory) without loading all into memory.
+
+         Yields:
+             dict with step_id, row_id, col, old_val, new_val, change_type, etc.
+         """
+         # Spilled files first (older data)
+         for filepath in self.spilled_files:
+             try:
+                 with open(filepath) as f:
+                     data = json.load(f)
+                 for i in range(len(data["step_ids"])):
+                     yield {
+                         "step_id": data["step_ids"][i],
+                         "row_id": data["row_ids"][i],
+                         "col": data["cols"][i],
+                         "old_val": data["old_vals"][i],
+                         "old_type": data["old_types"][i],
+                         "new_val": data["new_vals"][i],
+                         "new_type": data["new_types"][i],
+                         "change_type": data["change_types"][i],
+                     }
+             except Exception:
+                 continue
+
+         # In-memory diffs
+         for i in range(len(self.diff_step_ids)):
+             yield {
+                 "step_id": self.diff_step_ids[i],
+                 "row_id": self.diff_row_ids[i],
+                 "col": self.diff_cols[i],
+                 "old_val": self.diff_old_vals[i],
+                 "old_type": self.diff_old_types[i],
+                 "new_val": self.diff_new_vals[i],
+                 "new_type": self.diff_new_types[i],
+                 "change_type": self.diff_change_types[i],
+             }
+
+     def get_row_history(self, row_id: int) -> list[dict]:
+         """Get all events for a specific row."""
+         step_map = {s.step_id: s for s in self._steps}
+         events = []
+
+         for diff in self._iter_all_diffs():
+             if diff["row_id"] == row_id:
+                 step = step_map.get(diff["step_id"])
+                 events.append(
+                     {
+                         "step_id": diff["step_id"],
+                         "operation": step.operation if step else "unknown",
+                         "stage": step.stage if step else None,
+                         "col": diff["col"],
+                         "old_val": diff["old_val"],
+                         "old_type": diff["old_type"],
+                         "new_val": diff["new_val"],
+                         "new_type": diff["new_type"],
+                         "change_type": ChangeType(diff["change_type"]).name,
+                         "timestamp": step.timestamp if step else None,
+                         "completeness": step.completeness.name if step else "UNKNOWN",
+                         "code_location": (
+                             f"{step.code_file}:{step.code_line}"
+                             if step and step.code_file
+                             else None
+                         ),
+                     }
+                 )
+
+         return sorted(events, key=lambda e: e["step_id"])
+
+     def get_dropped_rows(self, step_id: Optional[int] = None) -> list[int]:
+         """Get all dropped row IDs, optionally filtered by step."""
+         dropped = set()
+
+         for diff in self._iter_all_diffs():
+             if diff["change_type"] == ChangeType.DROPPED:
+                 if step_id is None or diff["step_id"] == step_id:
+                     dropped.add(diff["row_id"])
+
+         return sorted(dropped)
+
+     def get_dropped_by_step(self) -> dict[str, int]:
+         """Get count of dropped rows per operation."""
+         step_map = {s.step_id: s.operation for s in self._steps}
+         counts: dict[str, int] = {}
+
+         for diff in self._iter_all_diffs():
+             if diff["change_type"] == ChangeType.DROPPED:
+                 op = step_map.get(diff["step_id"], "unknown")
+                 counts[op] = counts.get(op, 0) + 1
+
+         return dict(sorted(counts.items(), key=lambda x: -x[1]))
+
+     def get_group_members(self, group_key: str) -> Optional[dict]:
+         """
+         Get all rows that contributed to a group.
+
+         Note: For large groups (exceeding max_group_membership_size),
+         membership is stored as count-only: [-count]. In this case,
+         row_ids will be empty and is_count_only will be True.
+         """
+         for mapping in self.aggregation_mappings:
+             if group_key in mapping.membership:
+                 member_data = mapping.membership[group_key]
+
+                 # Check for count-only marker (negative count)
+                 if len(member_data) == 1 and member_data[0] < 0:
+                     return {
+                         "group_key": group_key,
+                         "group_column": mapping.group_column,
+                         "row_ids": [],
+                         "row_count": abs(member_data[0]),
+                         "is_count_only": True,
+                         "agg_functions": mapping.agg_functions,
+                     }
+                 else:
+                     return {
+                         "group_key": group_key,
+                         "group_column": mapping.group_column,
+                         "row_ids": member_data,
+                         "row_count": len(member_data),
+                         "is_count_only": False,
+                         "agg_functions": mapping.agg_functions,
+                     }
+         return None
+
+     def compute_gaps(self, row_id: int) -> LineageGaps:
+         """Compute lineage gaps for a specific row."""
+         gaps = []
+         row_step_ids = set()
+
+         for diff in self._iter_all_diffs():
+             if diff["row_id"] == row_id:
+                 row_step_ids.add(diff["step_id"])
+
+         for step in self._steps:
+             if step.step_id in row_step_ids:
+                 if step.completeness == CompletenessLevel.PARTIAL:
+                     gaps.append(
+                         LineageGap(
+                             step_id=step.step_id,
+                             operation=step.operation,
+                             reason="Custom function - output tracked, internals unknown",
+                         )
+                     )
+                 elif step.completeness == CompletenessLevel.UNKNOWN:
+                     gaps.append(
+                         LineageGap(
+                             step_id=step.step_id,
+                             operation=step.operation,
+                             reason="Operation resets lineage (merge/concat)",
+                         )
+                     )
+
+         return LineageGaps(gaps=gaps)
+
+     # === EXPORT METHODS ===
+
+     def to_json(self) -> str:
+         """Export all data as JSON string."""
+         diffs = list(self._iter_all_diffs())
+
+         data = {
+             "tracepipe_version": "0.2.0",
+             "export_timestamp": time.time(),
+             "total_diffs": len(diffs),
+             "total_steps": len(self._steps),
+             "diffs": diffs,
+             "steps": [
+                 {
+                     "step_id": s.step_id,
+                     "operation": s.operation,
+                     "stage": s.stage,
+                     "timestamp": s.timestamp,
+                     "code_file": s.code_file,
+                     "code_line": s.code_line,
+                     "params": s.params,
+                     "input_shape": s.input_shape,
+                     "output_shape": s.output_shape,
+                     "is_mass_update": s.is_mass_update,
+                     "rows_affected": s.rows_affected,
+                     "completeness": s.completeness.name,
+                 }
+                 for s in self._steps
+             ],
+             "aggregation_mappings": [
+                 {
+                     "step_id": a.step_id,
+                     "group_column": a.group_column,
+                     "membership": a.membership,
+                     "agg_functions": a.agg_functions,
+                 }
+                 for a in self.aggregation_mappings
+             ],
+         }
+
+         return json.dumps(data)
+
+     def to_arrow(self):
+         """Convert to Arrow table (requires pyarrow)."""
+         try:
+             import pyarrow as pa
+         except ImportError:
+             raise ImportError("pyarrow required: pip install pyarrow")
+
+         # Collect into columnar format
+         step_ids, row_ids, cols = [], [], []
+         old_vals, new_vals, change_types = [], [], []
+
+         for diff in self._iter_all_diffs():
+             step_ids.append(diff["step_id"])
+             row_ids.append(diff["row_id"])
+             cols.append(diff["col"])
+             old_vals.append(str(diff["old_val"]))
+             new_vals.append(str(diff["new_val"]))
+             change_types.append(diff["change_type"])
+
+         return pa.Table.from_pydict(
+             {
+                 "step_id": pa.array(step_ids, type=pa.int32()),
+                 "row_id": pa.array(row_ids, type=pa.int64()),
+                 "col": pa.array(cols, type=pa.string()),
+                 "old_val": pa.array(old_vals, type=pa.string()),
+                 "new_val": pa.array(new_vals, type=pa.string()),
+                 "change_type": pa.array(change_types, type=pa.int8()),
+             }
+         )
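
To make the append/query API above concrete, here is a minimal usage sketch. It assumes TracePipeConfig() can be constructed with defaults for spillover_dir, max_diffs_per_step and max_diffs_in_memory, and that ChangeType defines a MODIFIED member (only DROPPED appears in this file); both are assumptions rather than facts established by this diff.

# Hypothetical usage sketch: TracePipeConfig() defaults and ChangeType.MODIFIED
# are assumed; only the InMemoryLineageStore API shown above is taken as given.
from tracepipe.core import ChangeType, TracePipeConfig
from tracepipe.storage.lineage_store import InMemoryLineageStore

store = InMemoryLineageStore(TracePipeConfig())

# Register a pipeline step, then record one cell-level change for row 7.
step_id = store.append_step(
    operation="fillna",
    stage="clean",
    code_file="pipeline.py",
    code_line=42,
    params={"column": "age", "value": 0},
    input_shape=(100, 5),
    output_shape=(100, 5),
)
store.append_diff(step_id, row_id=7, col="age", old_val=None, new_val=0,
                  change_type=ChangeType.MODIFIED)

# Replay everything recorded for row 7, ordered by step_id.
for event in store.get_row_history(7):
    print(event["operation"], event["col"], event["old_val"], "->", event["new_val"])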
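
append_bulk_drops() is the fast path a row filter would take. The sketch below pairs it with the drop queries; config defaults and row IDs are again illustrative, and the expected outputs hold only if ChangeType is an IntEnum, which the int() round-trips in this file suggest but do not guarantee.

# Hypothetical sketch of the bulk-drop path; config defaults are assumed.
import numpy as np

from tracepipe.core import TracePipeConfig
from tracepipe.storage.lineage_store import InMemoryLineageStore

store = InMemoryLineageStore(TracePipeConfig())

filter_step = store.append_step(
    operation="dropna",
    stage="clean",
    code_file="pipeline.py",
    code_line=57,
    params={"subset": ["age"]},
    input_shape=(100, 5),
    output_shape=(97, 5),
    rows_affected=3,
)
store.append_bulk_drops(filter_step, np.array([3, 4, 9]))

print(store.get_dropped_rows())     # expected [3, 4, 9] if ChangeType is an IntEnum
print(store.get_dropped_by_step())  # expected {"dropna": 3}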
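
The count-only convention documented in get_group_members() (a single negative entry, [-count], for oversized groups) is easy to miss, so here is a small sketch with assumed config defaults and illustrative group keys.

# Hypothetical sketch of group-membership lineage; config defaults are assumed.
from tracepipe.core import TracePipeConfig
from tracepipe.storage.lineage_store import InMemoryLineageStore

store = InMemoryLineageStore(TracePipeConfig())

agg_step = store.append_step(
    operation="groupby.sum",
    stage="aggregate",
    code_file="pipeline.py",
    code_line=73,
    params={"by": "city"},
    input_shape=(100, 5),
    output_shape=(2, 2),
)
store.append_aggregation(
    agg_step,
    group_column="city",
    membership={"Oslo": [0, 2, 5], "Bergen": [-1200]},  # [-1200] is the count-only marker
    agg_functions={"amount": "sum"},
)

oslo = store.get_group_members("Oslo")      # row_ids [0, 2, 5], is_count_only False
bergen = store.get_group_members("Bergen")  # row_ids [], row_count 1200, is_count_only True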
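
Finally, a sketch of the two export paths; pyarrow is an optional dependency, and an empty store still produces a valid payload.

# Hypothetical export sketch; config defaults are assumed.
import json

from tracepipe.core import TracePipeConfig
from tracepipe.storage.lineage_store import InMemoryLineageStore

store = InMemoryLineageStore(TracePipeConfig())

payload = json.loads(store.to_json())
print(payload["tracepipe_version"], payload["total_diffs"], payload["total_steps"])

try:
    table = store.to_arrow()  # requires the optional pyarrow dependency
    print(table.num_rows)
except ImportError:
    pass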