tracepipe 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tracepipe/api.py CHANGED
@@ -1,18 +1,28 @@
1
1
  # tracepipe/api.py
2
2
  """
3
- Public API for TracePipe.
3
+ Core API for TracePipe.
4
+
5
+ This module provides the foundational enable/disable/reset functions
6
+ and internal result classes. For user-facing functionality, see:
7
+ - convenience.py: check(), trace(), why(), report()
8
+ - debug.py: inspect(), export()
9
+ - contracts.py: contract()
10
+ - snapshot.py: snapshot(), diff()
11
+
12
+ Modes:
13
+ - CI: Fast stats and drop tracking. No merge provenance or ghost values.
14
+ - DEBUG: Full provenance with merge origin tracking and ghost row values.
4
15
  """
5
16
 
6
17
  from __future__ import annotations
7
18
 
8
19
  import sys
9
20
  import types
21
+ from collections.abc import Sequence
10
22
  from dataclasses import fields
11
23
 
12
- import pandas as pd
13
-
14
24
  from .context import TracePipeContext, get_context, reset_context, set_context
15
- from .core import LineageGaps, TracePipeConfig
25
+ from .core import LineageGaps, TracePipeConfig, TracePipeMode
16
26
  from .instrumentation.pandas_inst import instrument_pandas, uninstrument_pandas
17
27
  from .storage.base import LineageBackend, RowIdentityStrategy
18
28
 
@@ -24,48 +34,102 @@ def _get_module() -> types.ModuleType:
24
34
 
25
35
  def enable(
26
36
  config: TracePipeConfig | None = None,
37
+ mode: TracePipeMode | str | None = None,
38
+ *,
39
+ watch: Sequence[str] | None = None,
27
40
  auto_watch: bool = False,
28
41
  backend: LineageBackend | None = None,
29
42
  identity: RowIdentityStrategy | None = None,
43
+ merge_provenance: bool | None = None,
44
+ ghost_row_values: bool | None = None,
45
+ cell_history: bool | None = None,
46
+ sample_rate: float | None = None,
47
+ max_tracked_rows: int | None = None,
30
48
  ) -> types.ModuleType:
31
49
  """
32
50
  Enable TracePipe lineage tracking.
33
51
 
34
52
  Args:
35
- config: Optional configuration
53
+ config: Optional configuration object
54
+ mode: Operating mode - "ci" (fast) or "debug" (full provenance)
55
+ watch: List of columns to watch for cell-level changes
36
56
  auto_watch: If True, automatically watch columns with nulls
37
- backend: Optional custom storage backend (default: InMemoryLineageStore)
38
- identity: Optional custom row identity strategy (default: PandasRowIdentity)
57
+ backend: Optional custom storage backend
58
+ identity: Optional custom row identity strategy
59
+ merge_provenance: Override: capture merge parent RIDs (DEBUG default: True)
60
+ ghost_row_values: Override: capture last values of dropped rows
61
+ cell_history: Override: capture cell-level changes
62
+ sample_rate: Track only this fraction of rows (0.0-1.0)
63
+ max_tracked_rows: Maximum rows to track (for large datasets)
39
64
 
40
65
  Returns:
41
66
  The tracepipe module for fluent chaining.
42
67
 
43
68
  Examples:
44
- # Basic usage (pandas + in-memory)
45
- tracepipe.enable()
46
-
47
- # Fluent chaining
48
- tracepipe.enable().watch("age", "salary")
69
+ # CI mode (fast, default)
70
+ tp.enable()
49
71
 
50
- # With SQLite persistence (v2.1+)
51
- from tracepipe.storage.sqlite_backend import SQLiteLineageStore
52
- tracepipe.enable(backend=SQLiteLineageStore(config, "lineage.db"))
72
+ # Debug mode with watched columns
73
+ tp.enable(mode="debug", watch=["age", "salary"])
53
74
 
54
- # With Polars support (v2.1+)
55
- from tracepipe.storage.polars_identity import PolarsRowIdentity
56
- tracepipe.enable(identity=PolarsRowIdentity(config))
75
+ # Custom configuration
76
+ tp.enable(mode="ci", merge_provenance=True)
57
77
  """
78
+ ctx = get_context()
79
+
80
+ # If already enabled, reset accumulated state to prevent duplicate warnings/stats
81
+ # This handles the common case of re-running scripts in notebooks/IDEs
82
+ if ctx.enabled:
83
+ _reset_accumulated_state(ctx)
84
+
85
+ # Get or create config
86
+ # If config is provided explicitly, use it
87
+ # Otherwise, start with existing context config (if any) or create new default
88
+ if config is None:
89
+ config = ctx.config # Use existing config as base
90
+
91
+ # Handle mode
92
+ if mode is not None:
93
+ if isinstance(mode, str):
94
+ mode = TracePipeMode(mode.lower())
95
+ config.mode = mode
96
+
97
+ # Apply feature overrides
98
+ if merge_provenance is not None:
99
+ config.merge_provenance = merge_provenance
100
+ if ghost_row_values is not None:
101
+ config.ghost_row_values = ghost_row_values
102
+ if cell_history is not None:
103
+ config.cell_history = cell_history
104
+
105
+ if auto_watch:
106
+ config.auto_watch = True
107
+
108
+ # Sampling config validation
109
+ if sample_rate is not None or max_tracked_rows is not None:
110
+ import warnings
111
+
112
+ warnings.warn(
113
+ "sample_rate and max_tracked_rows are not yet implemented. "
114
+ "These parameters will be ignored.",
115
+ UserWarning,
116
+ stacklevel=2,
117
+ )
118
+
58
119
  # Create context with custom backends if provided
59
120
  if backend is not None or identity is not None:
60
121
  ctx = TracePipeContext(config=config, backend=backend, identity=identity)
61
122
  set_context(ctx)
62
123
  else:
63
- ctx = get_context()
64
- if config:
65
- ctx.config = config
124
+ ctx.config = config
125
+ # Also update config in row_manager and store (they may have their own references)
126
+ ctx.row_manager.config = config
127
+ ctx.store.config = config
66
128
 
67
- if auto_watch:
68
- ctx.config.auto_watch = True
129
+ # Add watched columns (reset first if re-enabling to avoid stale watches)
130
+ if watch:
131
+ ctx.watched_columns.clear()
132
+ ctx.watched_columns.update(watch)
69
133
 
70
134
  if not ctx.enabled:
71
135
  instrument_pandas()
@@ -74,14 +138,57 @@ def enable(
74
138
  return _get_module()
75
139
 
76
140
 
141
+ def _reset_accumulated_state(ctx: TracePipeContext) -> None:
142
+ """
143
+ Reset accumulated lineage state without disabling instrumentation.
144
+
145
+ Called when enable() is invoked on an already-enabled context to prevent
146
+ state accumulation across multiple script runs in the same Python process.
147
+ """
148
+ store = ctx.store
149
+
150
+ # Clear merge stats (prevents duplicate warnings)
151
+ if hasattr(store, "merge_stats"):
152
+ store.merge_stats.clear()
153
+
154
+ # Clear bulk drops
155
+ if hasattr(store, "bulk_drops"):
156
+ store.bulk_drops.clear()
157
+
158
+ # Clear steps
159
+ if hasattr(store, "_steps"):
160
+ store._steps.clear()
161
+
162
+ # Clear in-memory diffs
163
+ if hasattr(store, "_clear_in_memory"):
164
+ store._clear_in_memory()
165
+
166
+ # Reset step counter
167
+ if hasattr(store, "_step_counter"):
168
+ store._step_counter = 0
169
+
170
+ # Clear merge mappings
171
+ if hasattr(store, "merge_mappings"):
172
+ store.merge_mappings.clear()
173
+
174
+ # Clear aggregation mappings
175
+ if hasattr(store, "aggregation_mappings"):
176
+ store.aggregation_mappings.clear()
177
+
178
+ # Reset row identity manager
179
+ ctx.row_manager.clear()
180
+
181
+ # Clear watched columns (will be re-added if watch param provided)
182
+ ctx.watched_columns.clear()
183
+
184
+
77
185
  def disable() -> types.ModuleType:
78
186
  """
79
187
  Disable TracePipe and restore original pandas methods.
80
188
 
81
189
  Note:
82
190
  This stops tracking but preserves lineage data collected so far.
83
- You can still query explain(), dropped_rows(), etc. after disabling.
84
- To clear all data, use reset() instead.
191
+ Use reset() to clear all data.
85
192
 
86
193
  Returns:
87
194
  The tracepipe module for fluent chaining.
@@ -90,7 +197,6 @@ def disable() -> types.ModuleType:
90
197
 
91
198
  if ctx.enabled:
92
199
  uninstrument_pandas()
93
- # Call cleanup if backend supports it
94
200
  if hasattr(ctx.store, "_cleanup_spillover"):
95
201
  ctx.store._cleanup_spillover()
96
202
  ctx.enabled = False
@@ -105,11 +211,6 @@ def reset() -> types.ModuleType:
105
211
  This clears ALL lineage data, steps, watched columns, and row registrations.
106
212
  If tracking was enabled, it will be re-enabled with a fresh context.
107
213
 
108
- Use this when:
109
- - Starting fresh in a notebook cell
110
- - Running multiple independent analyses
111
- - Testing
112
-
113
214
  Returns:
114
215
  The tracepipe module for fluent chaining.
115
216
  """
@@ -122,7 +223,6 @@ def reset() -> types.ModuleType:
122
223
  reset_context()
123
224
 
124
225
  if was_enabled:
125
- # Re-enable with fresh context
126
226
  enable()
127
227
 
128
228
  return _get_module()
@@ -133,38 +233,17 @@ def configure(**kwargs) -> types.ModuleType:
133
233
  Update configuration.
134
234
 
135
235
  Args:
136
- **kwargs: Configuration options to update. Valid keys are:
137
- - max_diffs_in_memory: Maximum diffs before spilling to disk
138
- - max_diffs_per_step: Threshold for mass update detection
139
- - max_group_membership_size: Threshold for count-only groups
140
- - strict_mode: Raise exceptions on tracking errors
141
- - auto_watch: Auto-watch columns with null values
142
- - auto_watch_null_threshold: Null ratio threshold for auto-watch
143
- - spillover_dir: Directory for spilled data
144
- - use_hidden_column: Use hidden column for row tracking
145
- - warn_on_duplicate_index: Warn on duplicate DataFrame index
146
- - cleanup_spillover_on_disable: Clean up spilled files on disable
236
+ **kwargs: Configuration options to update.
147
237
 
148
238
  Returns:
149
239
  The tracepipe module for fluent chaining.
150
-
151
- Raises:
152
- ValueError: If an invalid configuration key is provided.
153
-
154
- Examples:
155
- tracepipe.configure(max_diffs_per_step=1000)
156
- tracepipe.enable().configure(strict_mode=True).watch("amount")
157
240
  """
158
241
  ctx = get_context()
159
242
 
160
- # Validate keys against dataclass fields
161
243
  valid_keys = {f.name for f in fields(TracePipeConfig)}
162
244
  invalid_keys = set(kwargs.keys()) - valid_keys
163
245
  if invalid_keys:
164
- raise ValueError(
165
- f"Invalid configuration key(s): {invalid_keys}. "
166
- f"Valid keys are: {sorted(valid_keys)}"
167
- )
246
+ raise ValueError(f"Invalid configuration key(s): {invalid_keys}")
168
247
 
169
248
  for key, value in kwargs.items():
170
249
  setattr(ctx.config, key, value)
@@ -172,110 +251,76 @@ def configure(**kwargs) -> types.ModuleType:
172
251
  return _get_module()
173
252
 
174
253
 
175
- def watch(*columns: str) -> types.ModuleType:
176
- """
177
- Add columns to watch for cell-level changes.
178
-
179
- Args:
180
- *columns: Column names to watch.
181
-
182
- Returns:
183
- The tracepipe module for fluent chaining.
254
+ def stage(name: str):
255
+ """Context manager for naming pipeline stages."""
184
256
 
185
- Examples:
186
- tracepipe.watch("age", "salary")
187
- tracepipe.enable().watch("amount").watch("price")
188
- """
189
- ctx = get_context()
190
- ctx.watched_columns.update(columns)
191
- return _get_module()
257
+ class StageContext:
258
+ def __init__(self, stage_name: str):
259
+ self.stage_name = stage_name
260
+ self.previous_stage = None
192
261
 
262
+ def __enter__(self):
263
+ ctx = get_context()
264
+ self.previous_stage = ctx.current_stage
265
+ ctx.current_stage = self.stage_name
266
+ return self
193
267
 
194
- def watch_all(df: pd.DataFrame) -> types.ModuleType:
195
- """
196
- Watch all columns in a DataFrame.
268
+ def __exit__(self, *args):
269
+ ctx = get_context()
270
+ ctx.current_stage = self.previous_stage
197
271
 
198
- Args:
199
- df: DataFrame whose columns to watch.
272
+ return StageContext(name)
200
273
 
201
- Returns:
202
- The tracepipe module for fluent chaining.
203
274
 
204
- Examples:
205
- tracepipe.watch_all(df)
275
+ def register(*dfs) -> types.ModuleType:
206
276
  """
207
- ctx = get_context()
208
- ctx.watched_columns.update(df.columns.tolist())
209
- return _get_module()
277
+ Register pre-existing DataFrames for tracking.
210
278
 
211
-
212
- def unwatch(*columns: str) -> types.ModuleType:
213
- """
214
- Remove columns from watch list.
279
+ Use this when DataFrames were created before tp.enable() was called.
280
+ After registration, snapshots, ghost rows, and cell history will work.
215
281
 
216
282
  Args:
217
- *columns: Column names to stop watching.
218
-
219
- Returns:
220
- The tracepipe module for fluent chaining.
221
- """
222
- ctx = get_context()
223
- ctx.watched_columns.difference_update(columns)
224
- return _get_module()
225
-
226
-
227
- def clear_watch() -> types.ModuleType:
228
- """
229
- Clear all watched columns.
283
+ *dfs: One or more DataFrames to register
230
284
 
231
285
  Returns:
232
286
  The tracepipe module for fluent chaining.
233
287
 
234
288
  Examples:
235
- tracepipe.clear_watch().watch("new_column")
236
- """
237
- ctx = get_context()
238
- ctx.watched_columns.clear()
239
- return _get_module()
289
+ # DataFrames created before enable
290
+ df1 = pd.DataFrame({"a": [1, 2, 3]})
291
+ df2 = pd.DataFrame({"b": [4, 5, 6]})
240
292
 
293
+ tp.enable()
294
+ tp.register(df1, df2) # Now they're tracked
241
295
 
242
- def register(df: pd.DataFrame) -> types.ModuleType:
296
+ snap = tp.snapshot(df1) # Works!
243
297
  """
244
- Manually register a DataFrame for tracking.
245
-
246
- Use this for DataFrames created before enable() was called.
298
+ import pandas as pd
247
299
 
248
- Returns:
249
- The tracepipe module for fluent chaining.
250
- """
251
300
  ctx = get_context()
252
- if ctx.enabled:
253
- ctx.row_manager.register(df)
254
- return _get_module()
255
-
256
301
 
257
- def stage(name: str):
258
- """Context manager for naming pipeline stages."""
302
+ if not ctx.enabled:
303
+ import warnings
259
304
 
260
- class StageContext:
261
- def __init__(self, stage_name: str):
262
- self.stage_name = stage_name
263
- self.previous_stage = None
305
+ warnings.warn(
306
+ "TracePipe is not enabled. Call tp.enable() before tp.register().",
307
+ UserWarning,
308
+ stacklevel=2,
309
+ )
310
+ return _get_module()
264
311
 
265
- def __enter__(self):
266
- ctx = get_context()
267
- self.previous_stage = ctx.current_stage
268
- ctx.current_stage = self.stage_name
269
- return self
312
+ for df in dfs:
313
+ if not isinstance(df, pd.DataFrame):
314
+ raise TypeError(f"Expected DataFrame, got {type(df).__name__}")
270
315
 
271
- def __exit__(self, *args):
272
- ctx = get_context()
273
- ctx.current_stage = self.previous_stage
316
+ # Only register if not already registered
317
+ if ctx.row_manager.get_ids_array(df) is None:
318
+ ctx.row_manager.register(df)
274
319
 
275
- return StageContext(name)
320
+ return _get_module()
276
321
 
277
322
 
278
- # === QUERY API ===
323
+ # === INTERNAL RESULT CLASSES (used by debug module) ===
279
324
 
280
325
 
281
326
  class RowLineageResult:
@@ -284,54 +329,82 @@ class RowLineageResult:
284
329
  def __init__(self, row_id: int, ctx: TracePipeContext):
285
330
  self.row_id = row_id
286
331
  self._ctx = ctx
287
- self._history = ctx.store.get_row_history(row_id)
288
- self._gaps = ctx.store.compute_gaps(row_id)
332
+ self._history: list[dict] | None = None
333
+ self._gaps: LineageGaps | None = None
334
+ self._drop_event: dict | None = None
335
+ self._drop_event_checked: bool = False
336
+
337
+ def _ensure_drop_event(self) -> None:
338
+ if not self._drop_event_checked:
339
+ self._drop_event = self._ctx.store.get_drop_event(self.row_id)
340
+ self._drop_event_checked = True
341
+
342
+ def _ensure_history(self) -> None:
343
+ if self._history is None:
344
+ self._history = self._ctx.store.get_row_history(self.row_id)
345
+
346
+ def _ensure_gaps(self) -> None:
347
+ if self._gaps is None:
348
+ self._gaps = self._ctx.store.compute_gaps(self.row_id)
289
349
 
290
350
  @property
291
351
  def is_alive(self) -> bool:
292
- """Return True if row was not dropped."""
293
- return not any(h["change_type"] == "DROPPED" for h in self._history)
352
+ self._ensure_drop_event()
353
+ return self._drop_event is None
294
354
 
295
355
  @property
296
356
  def dropped_at(self) -> str | None:
297
- """Return operation name where row was dropped, or None."""
298
- for h in self._history:
299
- if h["change_type"] == "DROPPED":
300
- return h["operation"]
357
+ self._ensure_drop_event()
358
+ if self._drop_event is not None:
359
+ return self._drop_event.get("operation")
301
360
  return None
302
361
 
362
+ @property
363
+ def dropped_step_id(self) -> int | None:
364
+ self._ensure_drop_event()
365
+ if self._drop_event is not None:
366
+ return self._drop_event.get("step_id")
367
+ return None
368
+
369
+ def merge_origin(self) -> dict | None:
370
+ return self._ctx.store.get_merge_origin(self.row_id)
371
+
303
372
  def cell_history(self, column: str) -> list[dict]:
304
- """Get history for a specific column."""
373
+ self._ensure_history()
305
374
  return [h for h in self._history if h["col"] == column]
306
375
 
307
376
  def history(self) -> list[dict]:
308
- """Get full history."""
377
+ self._ensure_history()
309
378
  return self._history
310
379
 
311
380
  @property
312
381
  def gaps(self) -> LineageGaps:
313
- """Get lineage gaps."""
382
+ self._ensure_gaps()
314
383
  return self._gaps
315
384
 
316
385
  @property
317
386
  def is_fully_tracked(self) -> bool:
318
- """Return True if no gaps in lineage."""
387
+ self._ensure_gaps()
319
388
  return self._gaps.is_fully_tracked
320
389
 
321
390
  def to_dict(self) -> dict:
322
- """Export to dictionary."""
391
+ self._ensure_history()
392
+ self._ensure_gaps()
393
+ merge = self.merge_origin()
323
394
  return {
324
395
  "row_id": self.row_id,
325
396
  "is_alive": self.is_alive,
326
397
  "dropped_at": self.dropped_at,
398
+ "dropped_step_id": self.dropped_step_id,
327
399
  "is_fully_tracked": self.is_fully_tracked,
328
400
  "gaps_summary": self._gaps.summary(),
401
+ "merge_origin": merge,
329
402
  "history": self._history,
330
403
  }
331
404
 
332
405
  def __repr__(self):
333
406
  status = "alive" if self.is_alive else f"dropped at {self.dropped_at}"
334
- return f"<RowLineage row_id={self.row_id} {status} events={len(self._history)}>"
407
+ return f"<RowLineage row_id={self.row_id} {status} events={len(self.history())}>"
335
408
 
336
409
 
337
410
  class GroupLineageResult:
@@ -344,45 +417,25 @@ class GroupLineageResult:
344
417
 
345
418
  @property
346
419
  def row_ids(self) -> list[int]:
347
- """Get list of row IDs in this group."""
348
420
  return self._info["row_ids"] if self._info else []
349
421
 
350
422
  @property
351
423
  def row_count(self) -> int:
352
- """Get number of rows in this group."""
353
424
  return self._info["row_count"] if self._info else 0
354
425
 
355
426
  @property
356
427
  def is_count_only(self) -> bool:
357
- """
358
- True if group exceeded max_group_membership_size threshold.
359
-
360
- When True, row_ids will be empty and only row_count is available.
361
- """
362
428
  return self._info.get("is_count_only", False) if self._info else False
363
429
 
364
430
  @property
365
431
  def group_column(self) -> str | None:
366
- """Get the column used for grouping."""
367
432
  return self._info["group_column"] if self._info else None
368
433
 
369
434
  @property
370
435
  def aggregation_functions(self) -> dict[str, str]:
371
- """Get the aggregation functions applied."""
372
436
  return self._info["agg_functions"] if self._info else {}
373
437
 
374
- def get_contributing_rows(self, limit: int = 100) -> list[RowLineageResult]:
375
- """
376
- Get lineage for contributing rows.
377
-
378
- Returns empty list if is_count_only is True.
379
- """
380
- if self.is_count_only:
381
- return []
382
- return [explain(row_id) for row_id in self.row_ids[:limit]]
383
-
384
438
  def to_dict(self) -> dict:
385
- """Export to dictionary."""
386
439
  return {
387
440
  "group_key": self.group_key,
388
441
  "group_column": self.group_column,
@@ -395,169 +448,3 @@ class GroupLineageResult:
395
448
  def __repr__(self):
396
449
  suffix = " (count only)" if self.is_count_only else ""
397
450
  return f"<GroupLineage key='{self.group_key}' rows={self.row_count}{suffix}>"
398
-
399
-
400
- def explain(row_id: int) -> RowLineageResult:
401
- """Get lineage for a specific row."""
402
- ctx = get_context()
403
- return RowLineageResult(row_id, ctx)
404
-
405
-
406
- def explain_many(row_ids: list[int]) -> list[RowLineageResult]:
407
- """
408
- Get lineage for multiple rows.
409
-
410
- Args:
411
- row_ids: List of row IDs to explain.
412
-
413
- Returns:
414
- List of RowLineageResult objects.
415
-
416
- Examples:
417
- results = tracepipe.explain_many([0, 1, 2])
418
- for row in results:
419
- print(row.is_alive, row.dropped_at)
420
- """
421
- ctx = get_context()
422
- return [RowLineageResult(row_id, ctx) for row_id in row_ids]
423
-
424
-
425
- def explain_group(group_key: str) -> GroupLineageResult:
426
- """Get lineage for an aggregation group."""
427
- ctx = get_context()
428
- return GroupLineageResult(group_key, ctx)
429
-
430
-
431
- def dropped_rows(by_step: bool = False) -> list[int] | dict[str, int]:
432
- """
433
- Get dropped row information.
434
-
435
- Args:
436
- by_step: If False (default), return list of dropped row IDs.
437
- If True, return dict mapping operation names to drop counts.
438
-
439
- Returns:
440
- List of row IDs if by_step=False, or dict of {operation: count} if by_step=True.
441
-
442
- Examples:
443
- # Get all dropped row IDs
444
- dropped = tracepipe.dropped_rows()
445
-
446
- # Get counts by operation
447
- by_op = tracepipe.dropped_rows(by_step=True)
448
- # {'DataFrame.dropna': 5, 'DataFrame.query': 3}
449
- """
450
- ctx = get_context()
451
- if by_step:
452
- return ctx.store.get_dropped_by_step()
453
- return ctx.store.get_dropped_rows()
454
-
455
-
456
- def alive_rows() -> list[int]:
457
- """
458
- Get all row IDs that are still alive (not dropped).
459
-
460
- Returns:
461
- List of row IDs that have not been dropped.
462
-
463
- Examples:
464
- alive = tracepipe.alive_rows()
465
- print(f"{len(alive)} rows survived the pipeline")
466
- """
467
- ctx = get_context()
468
- all_registered = set(ctx.row_manager.all_registered_ids())
469
- dropped = set(ctx.store.get_dropped_rows())
470
- return sorted(all_registered - dropped)
471
-
472
-
473
- def mass_updates() -> list[dict]:
474
- """Get operations that exceeded cell diff threshold."""
475
- ctx = get_context()
476
- return [
477
- {
478
- "step_id": s.step_id,
479
- "operation": s.operation,
480
- "rows_affected": s.rows_affected,
481
- "stage": s.stage,
482
- }
483
- for s in ctx.store.steps
484
- if s.is_mass_update
485
- ]
486
-
487
-
488
- def steps() -> list[dict]:
489
- """Get all tracked steps."""
490
- ctx = get_context()
491
- return [
492
- {
493
- "step_id": s.step_id,
494
- "operation": s.operation,
495
- "stage": s.stage,
496
- "input_shape": s.input_shape,
497
- "output_shape": s.output_shape,
498
- "completeness": s.completeness.name,
499
- "is_mass_update": s.is_mass_update,
500
- "timestamp": s.timestamp,
501
- "code_file": s.code_file,
502
- "code_line": s.code_line,
503
- }
504
- for s in ctx.store.steps
505
- ]
506
-
507
-
508
- def aggregation_groups() -> list[str]:
509
- """List all tracked aggregation groups."""
510
- ctx = get_context()
511
- groups = []
512
- for mapping in ctx.store.aggregation_mappings:
513
- groups.extend(mapping.membership.keys())
514
- return groups
515
-
516
-
517
- # === EXPORT ===
518
-
519
-
520
- def export_json(filepath: str) -> None:
521
- """Export lineage to JSON file."""
522
- ctx = get_context()
523
- with open(filepath, "w") as f:
524
- f.write(ctx.store.to_json())
525
-
526
-
527
- def export_arrow(filepath: str) -> None:
528
- """
529
- Export lineage to Parquet file.
530
-
531
- Requires pyarrow to be installed.
532
-
533
- Args:
534
- filepath: Path to write the Parquet file.
535
-
536
- Raises:
537
- ImportError: If pyarrow is not installed.
538
- """
539
- try:
540
- import pyarrow.parquet as pq
541
- except ImportError:
542
- raise ImportError(
543
- "pyarrow is required for Arrow/Parquet export. "
544
- "Install it with: pip install tracepipe[arrow] or pip install pyarrow"
545
- ) from None
546
-
547
- ctx = get_context()
548
- table = ctx.store.to_arrow()
549
- pq.write_table(table, filepath)
550
-
551
-
552
- def stats() -> dict:
553
- """Get tracking statistics."""
554
- ctx = get_context()
555
- return {
556
- "enabled": ctx.enabled,
557
- "total_steps": len(ctx.store.steps),
558
- "total_diffs": ctx.store.total_diff_count,
559
- "in_memory_diffs": ctx.store.diff_count,
560
- "spilled_files": len(ctx.store.spilled_files),
561
- "watched_columns": list(ctx.watched_columns),
562
- "aggregation_groups": len(ctx.store.aggregation_mappings),
563
- }