tracepipe 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
tracepipe/api.py CHANGED
@@ -1,18 +1,28 @@
 # tracepipe/api.py
 """
-Public API for TracePipe.
+Core API for TracePipe.
+
+This module provides the foundational enable/disable/reset functions
+and internal result classes. For user-facing functionality, see:
+- convenience.py: check(), trace(), why(), report()
+- debug.py: inspect(), export()
+- contracts.py: contract()
+- snapshot.py: snapshot(), diff()
+
+Modes:
+- CI: Fast stats and drop tracking. No merge provenance or ghost values.
+- DEBUG: Full provenance with merge origin tracking and ghost row values.
 """
 
 from __future__ import annotations
 
 import sys
 import types
+from collections.abc import Sequence
 from dataclasses import fields
 
-import pandas as pd
-
 from .context import TracePipeContext, get_context, reset_context, set_context
-from .core import LineageGaps, TracePipeConfig
+from .core import LineageGaps, TracePipeConfig, TracePipeMode
 from .instrumentation.pandas_inst import instrument_pandas, uninstrument_pandas
 from .storage.base import LineageBackend, RowIdentityStrategy
 
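The rewritten module docstring splits behavior across two modes. A minimal sketch of selecting each, reusing the tp alias the docstring examples use (a sketch, not part of the diff):

    import tracepipe as tp

    # CI mode: fast stats and drop tracking, no merge provenance or ghost values
    tp.enable(mode="ci")

    # DEBUG mode: full provenance, merge origin tracking, ghost row values
    tp.enable(mode="debug")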
@@ -24,48 +34,96 @@ def _get_module() -> types.ModuleType:
 
 def enable(
     config: TracePipeConfig | None = None,
+    mode: TracePipeMode | str | None = None,
+    *,
+    watch: Sequence[str] | None = None,
     auto_watch: bool = False,
     backend: LineageBackend | None = None,
     identity: RowIdentityStrategy | None = None,
+    merge_provenance: bool | None = None,
+    ghost_row_values: bool | None = None,
+    cell_history: bool | None = None,
+    sample_rate: float | None = None,
+    max_tracked_rows: int | None = None,
 ) -> types.ModuleType:
     """
     Enable TracePipe lineage tracking.
 
     Args:
-        config: Optional configuration
+        config: Optional configuration object
+        mode: Operating mode - "ci" (fast) or "debug" (full provenance)
+        watch: List of columns to watch for cell-level changes
         auto_watch: If True, automatically watch columns with nulls
-        backend: Optional custom storage backend (default: InMemoryLineageStore)
-        identity: Optional custom row identity strategy (default: PandasRowIdentity)
+        backend: Optional custom storage backend
+        identity: Optional custom row identity strategy
+        merge_provenance: Override: capture merge parent RIDs (DEBUG default: True)
+        ghost_row_values: Override: capture last values of dropped rows
+        cell_history: Override: capture cell-level changes
+        sample_rate: Track only this fraction of rows (0.0-1.0)
+        max_tracked_rows: Maximum rows to track (for large datasets)
 
     Returns:
         The tracepipe module for fluent chaining.
 
     Examples:
-        # Basic usage (pandas + in-memory)
-        tracepipe.enable()
-
-        # Fluent chaining
-        tracepipe.enable().watch("age", "salary")
+        # CI mode (fast, default)
+        tp.enable()
 
-        # With SQLite persistence (v2.1+)
-        from tracepipe.storage.sqlite_backend import SQLiteLineageStore
-        tracepipe.enable(backend=SQLiteLineageStore(config, "lineage.db"))
+        # Debug mode with watched columns
+        tp.enable(mode="debug", watch=["age", "salary"])
 
-        # With Polars support (v2.1+)
-        from tracepipe.storage.polars_identity import PolarsRowIdentity
-        tracepipe.enable(identity=PolarsRowIdentity(config))
+        # Custom configuration
+        tp.enable(mode="ci", merge_provenance=True)
     """
+    # Get or create config
+    # If config is provided explicitly, use it
+    # Otherwise, start with existing context config (if any) or create new default
+    if config is None:
+        existing_ctx = get_context()
+        config = existing_ctx.config  # Use existing config as base
+
+    # Handle mode
+    if mode is not None:
+        if isinstance(mode, str):
+            mode = TracePipeMode(mode.lower())
+        config.mode = mode
+
+    # Apply feature overrides
+    if merge_provenance is not None:
+        config.merge_provenance = merge_provenance
+    if ghost_row_values is not None:
+        config.ghost_row_values = ghost_row_values
+    if cell_history is not None:
+        config.cell_history = cell_history
+
+    if auto_watch:
+        config.auto_watch = True
+
+    # Sampling config validation
+    if sample_rate is not None or max_tracked_rows is not None:
+        import warnings
+
+        warnings.warn(
+            "sample_rate and max_tracked_rows are not yet implemented. "
+            "These parameters will be ignored.",
+            UserWarning,
+            stacklevel=2,
+        )
+
     # Create context with custom backends if provided
     if backend is not None or identity is not None:
         ctx = TracePipeContext(config=config, backend=backend, identity=identity)
         set_context(ctx)
     else:
         ctx = get_context()
-        if config:
-            ctx.config = config
+        ctx.config = config
+        # Also update config in row_manager and store (they may have their own references)
+        ctx.row_manager.config = config
+        ctx.store.config = config
 
-    if auto_watch:
-        ctx.config.auto_watch = True
+    # Add watched columns
+    if watch:
+        ctx.watched_columns.update(watch)
 
     if not ctx.enabled:
         instrument_pandas()
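Note that enable() now warns rather than raises when the unimplemented sampling knobs are passed. A short sketch of surfacing that warning in a test, assuming only what the hunk above shows:

    import warnings
    import tracepipe as tp

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        tp.enable(mode="debug", sample_rate=0.5)  # accepted but ignored in 0.3.0

    assert any("not yet implemented" in str(w.message) for w in caught)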
@@ -80,8 +138,7 @@ def disable() -> types.ModuleType:
 
     Note:
         This stops tracking but preserves lineage data collected so far.
-        You can still query explain(), dropped_rows(), etc. after disabling.
-        To clear all data, use reset() instead.
+        Use reset() to clear all data.
 
     Returns:
         The tracepipe module for fluent chaining.
@@ -90,7 +147,6 @@ def disable() -> types.ModuleType:
 
     if ctx.enabled:
         uninstrument_pandas()
-        # Call cleanup if backend supports it
         if hasattr(ctx.store, "_cleanup_spillover"):
             ctx.store._cleanup_spillover()
     ctx.enabled = False
@@ -105,11 +161,6 @@ def reset() -> types.ModuleType:
     This clears ALL lineage data, steps, watched columns, and row registrations.
     If tracking was enabled, it will be re-enabled with a fresh context.
 
-    Use this when:
-    - Starting fresh in a notebook cell
-    - Running multiple independent analyses
-    - Testing
-
     Returns:
         The tracepipe module for fluent chaining.
     """
@@ -122,7 +173,6 @@ def reset() -> types.ModuleType:
     reset_context()
 
     if was_enabled:
-        # Re-enable with fresh context
         enable()
 
     return _get_module()
@@ -133,38 +183,17 @@ def configure(**kwargs) -> types.ModuleType:
     Update configuration.
 
     Args:
-        **kwargs: Configuration options to update. Valid keys are:
-            - max_diffs_in_memory: Maximum diffs before spilling to disk
-            - max_diffs_per_step: Threshold for mass update detection
-            - max_group_membership_size: Threshold for count-only groups
-            - strict_mode: Raise exceptions on tracking errors
-            - auto_watch: Auto-watch columns with null values
-            - auto_watch_null_threshold: Null ratio threshold for auto-watch
-            - spillover_dir: Directory for spilled data
-            - use_hidden_column: Use hidden column for row tracking
-            - warn_on_duplicate_index: Warn on duplicate DataFrame index
-            - cleanup_spillover_on_disable: Clean up spilled files on disable
+        **kwargs: Configuration options to update.
 
     Returns:
         The tracepipe module for fluent chaining.
-
-    Raises:
-        ValueError: If an invalid configuration key is provided.
-
-    Examples:
-        tracepipe.configure(max_diffs_per_step=1000)
-        tracepipe.enable().configure(strict_mode=True).watch("amount")
     """
     ctx = get_context()
 
-    # Validate keys against dataclass fields
    valid_keys = {f.name for f in fields(TracePipeConfig)}
     invalid_keys = set(kwargs.keys()) - valid_keys
     if invalid_keys:
-        raise ValueError(
-            f"Invalid configuration key(s): {invalid_keys}. "
-            f"Valid keys are: {sorted(valid_keys)}"
-        )
+        raise ValueError(f"Invalid configuration key(s): {invalid_keys}")
 
     for key, value in kwargs.items():
         setattr(ctx.config, key, value)
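configure() still validates keys against the TracePipeConfig dataclass fields; it just reports less detail. The same pattern in isolation, using a hypothetical two-field DemoConfig as a stand-in:

    from dataclasses import dataclass, fields

    @dataclass
    class DemoConfig:  # hypothetical stand-in for TracePipeConfig
        strict_mode: bool = False
        auto_watch: bool = False

    def configure(cfg: DemoConfig, **kwargs) -> None:
        valid_keys = {f.name for f in fields(DemoConfig)}
        invalid_keys = set(kwargs) - valid_keys
        if invalid_keys:
            raise ValueError(f"Invalid configuration key(s): {invalid_keys}")
        for key, value in kwargs.items():
            setattr(cfg, key, value)

    configure(DemoConfig(), strict_mode=True)   # ok
    # configure(DemoConfig(), stric_mode=True)  # raises ValueError (typo caught)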
@@ -172,110 +201,76 @@ def configure(**kwargs) -> types.ModuleType:
     return _get_module()
 
 
-def watch(*columns: str) -> types.ModuleType:
-    """
-    Add columns to watch for cell-level changes.
-
-    Args:
-        *columns: Column names to watch.
+def stage(name: str):
+    """Context manager for naming pipeline stages."""
 
-    Returns:
-        The tracepipe module for fluent chaining.
+    class StageContext:
+        def __init__(self, stage_name: str):
+            self.stage_name = stage_name
+            self.previous_stage = None
 
-    Examples:
-        tracepipe.watch("age", "salary")
-        tracepipe.enable().watch("amount").watch("price")
-    """
-    ctx = get_context()
-    ctx.watched_columns.update(columns)
-    return _get_module()
+        def __enter__(self):
+            ctx = get_context()
+            self.previous_stage = ctx.current_stage
+            ctx.current_stage = self.stage_name
+            return self
 
+        def __exit__(self, *args):
+            ctx = get_context()
+            ctx.current_stage = self.previous_stage
 
-def watch_all(df: pd.DataFrame) -> types.ModuleType:
-    """
-    Watch all columns in a DataFrame.
+    return StageContext(name)
 
-    Args:
-        df: DataFrame whose columns to watch.
 
-    Returns:
-        The tracepipe module for fluent chaining.
-
-    Examples:
-        tracepipe.watch_all(df)
+def register(*dfs) -> types.ModuleType:
     """
-    ctx = get_context()
-    ctx.watched_columns.update(df.columns.tolist())
-    return _get_module()
+    Register pre-existing DataFrames for tracking.
 
-
-def unwatch(*columns: str) -> types.ModuleType:
-    """
-    Remove columns from watch list.
+    Use this when DataFrames were created before tp.enable() was called.
+    After registration, snapshots, ghost rows, and cell history will work.
 
     Args:
-        *columns: Column names to stop watching.
-
-    Returns:
-        The tracepipe module for fluent chaining.
-    """
-    ctx = get_context()
-    ctx.watched_columns.difference_update(columns)
-    return _get_module()
-
-
-def clear_watch() -> types.ModuleType:
-    """
-    Clear all watched columns.
+        *dfs: One or more DataFrames to register
 
     Returns:
         The tracepipe module for fluent chaining.
 
     Examples:
-        tracepipe.clear_watch().watch("new_column")
-    """
-    ctx = get_context()
-    ctx.watched_columns.clear()
-    return _get_module()
+        # DataFrames created before enable
+        df1 = pd.DataFrame({"a": [1, 2, 3]})
+        df2 = pd.DataFrame({"b": [4, 5, 6]})
 
+        tp.enable()
+        tp.register(df1, df2)  # Now they're tracked
 
-def register(df: pd.DataFrame) -> types.ModuleType:
+        snap = tp.snapshot(df1)  # Works!
     """
-    Manually register a DataFrame for tracking.
-
-    Use this for DataFrames created before enable() was called.
+    import pandas as pd
 
-    Returns:
-        The tracepipe module for fluent chaining.
-    """
     ctx = get_context()
-    if ctx.enabled:
-        ctx.row_manager.register(df)
-    return _get_module()
 
+    if not ctx.enabled:
+        import warnings
 
-def stage(name: str):
-    """Context manager for naming pipeline stages."""
-
-    class StageContext:
-        def __init__(self, stage_name: str):
-            self.stage_name = stage_name
-            self.previous_stage = None
+        warnings.warn(
+            "TracePipe is not enabled. Call tp.enable() before tp.register().",
+            UserWarning,
+            stacklevel=2,
+        )
+        return _get_module()
 
-        def __enter__(self):
-            ctx = get_context()
-            self.previous_stage = ctx.current_stage
-            ctx.current_stage = self.stage_name
-            return self
+    for df in dfs:
+        if not isinstance(df, pd.DataFrame):
+            raise TypeError(f"Expected DataFrame, got {type(df).__name__}")
 
-        def __exit__(self, *args):
-            ctx = get_context()
-            ctx.current_stage = self.previous_stage
+        # Only register if not already registered
+        if ctx.row_manager.get_ids_array(df) is None:
+            ctx.row_manager.register(df)
 
-    return StageContext(name)
+    return _get_module()
 
 
-# === QUERY API ===
+# === INTERNAL RESULT CLASSES (used by debug module) ===
 
 
 class RowLineageResult:
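With the watch helpers gone (watching now happens through enable(watch=...)), stage() and the variadic register() are the remaining pipeline-shaping entry points in api.py. A usage sketch combining the docstring examples above, assuming stage() and register() are re-exported at package level as the tp.* examples imply:

    import pandas as pd
    import tracepipe as tp

    df = pd.DataFrame({"age": [34, None, 51]})  # created before enable()

    tp.enable(mode="debug", watch=["age"])
    tp.register(df)  # pre-existing frames need explicit registration

    with tp.stage("clean"):  # operations inside are attributed to this stage
        df = df.dropna(subset=["age"])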
@@ -284,54 +279,82 @@ class RowLineageResult:
     def __init__(self, row_id: int, ctx: TracePipeContext):
         self.row_id = row_id
         self._ctx = ctx
-        self._history = ctx.store.get_row_history(row_id)
-        self._gaps = ctx.store.compute_gaps(row_id)
+        self._history: list[dict] | None = None
+        self._gaps: LineageGaps | None = None
+        self._drop_event: dict | None = None
+        self._drop_event_checked: bool = False
+
+    def _ensure_drop_event(self) -> None:
+        if not self._drop_event_checked:
+            self._drop_event = self._ctx.store.get_drop_event(self.row_id)
+            self._drop_event_checked = True
+
+    def _ensure_history(self) -> None:
+        if self._history is None:
+            self._history = self._ctx.store.get_row_history(self.row_id)
+
+    def _ensure_gaps(self) -> None:
+        if self._gaps is None:
+            self._gaps = self._ctx.store.compute_gaps(self.row_id)
 
     @property
     def is_alive(self) -> bool:
-        """Return True if row was not dropped."""
-        return not any(h["change_type"] == "DROPPED" for h in self._history)
+        self._ensure_drop_event()
+        return self._drop_event is None
 
     @property
     def dropped_at(self) -> str | None:
-        """Return operation name where row was dropped, or None."""
-        for h in self._history:
-            if h["change_type"] == "DROPPED":
-                return h["operation"]
+        self._ensure_drop_event()
+        if self._drop_event is not None:
+            return self._drop_event.get("operation")
         return None
 
+    @property
+    def dropped_step_id(self) -> int | None:
+        self._ensure_drop_event()
+        if self._drop_event is not None:
+            return self._drop_event.get("step_id")
+        return None
+
+    def merge_origin(self) -> dict | None:
+        return self._ctx.store.get_merge_origin(self.row_id)
+
     def cell_history(self, column: str) -> list[dict]:
-        """Get history for a specific column."""
+        self._ensure_history()
         return [h for h in self._history if h["col"] == column]
 
     def history(self) -> list[dict]:
-        """Get full history."""
+        self._ensure_history()
         return self._history
 
     @property
     def gaps(self) -> LineageGaps:
-        """Get lineage gaps."""
+        self._ensure_gaps()
         return self._gaps
 
     @property
     def is_fully_tracked(self) -> bool:
-        """Return True if no gaps in lineage."""
+        self._ensure_gaps()
         return self._gaps.is_fully_tracked
 
     def to_dict(self) -> dict:
-        """Export to dictionary."""
+        self._ensure_history()
+        self._ensure_gaps()
+        merge = self.merge_origin()
         return {
             "row_id": self.row_id,
             "is_alive": self.is_alive,
             "dropped_at": self.dropped_at,
+            "dropped_step_id": self.dropped_step_id,
             "is_fully_tracked": self.is_fully_tracked,
             "gaps_summary": self._gaps.summary(),
+            "merge_origin": merge,
             "history": self._history,
         }
 
     def __repr__(self):
         status = "alive" if self.is_alive else f"dropped at {self.dropped_at}"
-        return f"<RowLineage row_id={self.row_id} {status} events={len(self._history)}>"
+        return f"<RowLineage row_id={self.row_id} {status} events={len(self.history())}>"
 
 
 class GroupLineageResult:
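RowLineageResult no longer hits the store in __init__; each accessor lazily computes and caches its slice on first use. The separate _drop_event_checked flag exists because None is a meaningful cached result for drop events. The same pattern in a self-contained sketch, where expensive_lookup is a hypothetical stand-in for the store calls:

    class LazyResult:
        def __init__(self, key: int):
            self.key = key
            self._value: dict | None = None  # None can be a real result...
            self._checked = False            # ...so track "computed" separately

        def _ensure_value(self) -> None:
            if not self._checked:
                self._value = expensive_lookup(self.key)  # runs at most once
                self._checked = True

        @property
        def value(self) -> dict | None:
            self._ensure_value()
            return self._value

    def expensive_lookup(key: int) -> dict | None:
        # hypothetical stand-in for ctx.store.get_drop_event(...)
        return {"key": key} if key % 2 == 0 else None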
@@ -344,45 +367,25 @@ class GroupLineageResult:
 
     @property
     def row_ids(self) -> list[int]:
-        """Get list of row IDs in this group."""
         return self._info["row_ids"] if self._info else []
 
     @property
     def row_count(self) -> int:
-        """Get number of rows in this group."""
         return self._info["row_count"] if self._info else 0
 
     @property
     def is_count_only(self) -> bool:
-        """
-        True if group exceeded max_group_membership_size threshold.
-
-        When True, row_ids will be empty and only row_count is available.
-        """
         return self._info.get("is_count_only", False) if self._info else False
 
     @property
     def group_column(self) -> str | None:
-        """Get the column used for grouping."""
         return self._info["group_column"] if self._info else None
 
     @property
     def aggregation_functions(self) -> dict[str, str]:
-        """Get the aggregation functions applied."""
         return self._info["agg_functions"] if self._info else {}
 
-    def get_contributing_rows(self, limit: int = 100) -> list[RowLineageResult]:
-        """
-        Get lineage for contributing rows.
-
-        Returns empty list if is_count_only is True.
-        """
-        if self.is_count_only:
-            return []
-        return [explain(row_id) for row_id in self.row_ids[:limit]]
-
     def to_dict(self) -> dict:
-        """Export to dictionary."""
         return {
             "group_key": self.group_key,
             "group_column": self.group_column,
@@ -395,169 +398,3 @@ class GroupLineageResult:
     def __repr__(self):
         suffix = " (count only)" if self.is_count_only else ""
         return f"<GroupLineage key='{self.group_key}' rows={self.row_count}{suffix}>"
-
-
-def explain(row_id: int) -> RowLineageResult:
-    """Get lineage for a specific row."""
-    ctx = get_context()
-    return RowLineageResult(row_id, ctx)
-
-
-def explain_many(row_ids: list[int]) -> list[RowLineageResult]:
-    """
-    Get lineage for multiple rows.
-
-    Args:
-        row_ids: List of row IDs to explain.
-
-    Returns:
-        List of RowLineageResult objects.
-
-    Examples:
-        results = tracepipe.explain_many([0, 1, 2])
-        for row in results:
-            print(row.is_alive, row.dropped_at)
-    """
-    ctx = get_context()
-    return [RowLineageResult(row_id, ctx) for row_id in row_ids]
-
-
-def explain_group(group_key: str) -> GroupLineageResult:
-    """Get lineage for an aggregation group."""
-    ctx = get_context()
-    return GroupLineageResult(group_key, ctx)
-
-
-def dropped_rows(by_step: bool = False) -> list[int] | dict[str, int]:
-    """
-    Get dropped row information.
-
-    Args:
-        by_step: If False (default), return list of dropped row IDs.
-            If True, return dict mapping operation names to drop counts.
-
-    Returns:
-        List of row IDs if by_step=False, or dict of {operation: count} if by_step=True.
-
-    Examples:
-        # Get all dropped row IDs
-        dropped = tracepipe.dropped_rows()
-
-        # Get counts by operation
-        by_op = tracepipe.dropped_rows(by_step=True)
-        # {'DataFrame.dropna': 5, 'DataFrame.query': 3}
-    """
-    ctx = get_context()
-    if by_step:
-        return ctx.store.get_dropped_by_step()
-    return ctx.store.get_dropped_rows()
-
-
-def alive_rows() -> list[int]:
-    """
-    Get all row IDs that are still alive (not dropped).
-
-    Returns:
-        List of row IDs that have not been dropped.
-
-    Examples:
-        alive = tracepipe.alive_rows()
-        print(f"{len(alive)} rows survived the pipeline")
-    """
-    ctx = get_context()
-    all_registered = set(ctx.row_manager.all_registered_ids())
-    dropped = set(ctx.store.get_dropped_rows())
-    return sorted(all_registered - dropped)
-
-
-def mass_updates() -> list[dict]:
-    """Get operations that exceeded cell diff threshold."""
-    ctx = get_context()
-    return [
-        {
-            "step_id": s.step_id,
-            "operation": s.operation,
-            "rows_affected": s.rows_affected,
-            "stage": s.stage,
-        }
-        for s in ctx.store.steps
-        if s.is_mass_update
-    ]
-
-
-def steps() -> list[dict]:
-    """Get all tracked steps."""
-    ctx = get_context()
-    return [
-        {
-            "step_id": s.step_id,
-            "operation": s.operation,
-            "stage": s.stage,
-            "input_shape": s.input_shape,
-            "output_shape": s.output_shape,
-            "completeness": s.completeness.name,
-            "is_mass_update": s.is_mass_update,
-            "timestamp": s.timestamp,
-            "code_file": s.code_file,
-            "code_line": s.code_line,
-        }
-        for s in ctx.store.steps
-    ]
-
-
-def aggregation_groups() -> list[str]:
-    """List all tracked aggregation groups."""
-    ctx = get_context()
-    groups = []
-    for mapping in ctx.store.aggregation_mappings:
-        groups.extend(mapping.membership.keys())
-    return groups
-
-
-# === EXPORT ===
-
-
-def export_json(filepath: str) -> None:
-    """Export lineage to JSON file."""
-    ctx = get_context()
-    with open(filepath, "w") as f:
-        f.write(ctx.store.to_json())
-
-
-def export_arrow(filepath: str) -> None:
-    """
-    Export lineage to Parquet file.
-
-    Requires pyarrow to be installed.
-
-    Args:
-        filepath: Path to write the Parquet file.
-
-    Raises:
-        ImportError: If pyarrow is not installed.
-    """
-    try:
-        import pyarrow.parquet as pq
-    except ImportError:
-        raise ImportError(
-            "pyarrow is required for Arrow/Parquet export. "
-            "Install it with: pip install tracepipe[arrow] or pip install pyarrow"
-        ) from None
-
-    ctx = get_context()
-    table = ctx.store.to_arrow()
-    pq.write_table(table, filepath)
-
-
-def stats() -> dict:
-    """Get tracking statistics."""
-    ctx = get_context()
-    return {
-        "enabled": ctx.enabled,
-        "total_steps": len(ctx.store.steps),
-        "total_diffs": ctx.store.total_diff_count,
-        "in_memory_diffs": ctx.store.diff_count,
-        "spilled_files": len(ctx.store.spilled_files),
-        "watched_columns": list(ctx.watched_columns),
-        "aggregation_groups": len(ctx.store.aggregation_mappings),
-    }