tracepipe 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +117 -78
- tracepipe/api.py +168 -331
- tracepipe/context.py +21 -1
- tracepipe/contracts.py +473 -0
- tracepipe/convenience.py +812 -0
- tracepipe/core.py +174 -17
- tracepipe/debug.py +325 -0
- tracepipe/instrumentation/apply_capture.py +453 -0
- tracepipe/instrumentation/filter_capture.py +468 -0
- tracepipe/instrumentation/indexer_capture.py +813 -0
- tracepipe/instrumentation/merge_capture.py +434 -0
- tracepipe/instrumentation/pandas_inst.py +66 -183
- tracepipe/instrumentation/series_capture.py +331 -0
- tracepipe/safety.py +3 -3
- tracepipe/snapshot.py +420 -0
- tracepipe/storage/base.py +7 -3
- tracepipe/storage/lineage_store.py +190 -47
- tracepipe/storage/row_identity.py +366 -104
- tracepipe/value_provenance.py +301 -0
- tracepipe/visualization/html_export.py +22 -7
- tracepipe-0.3.0.dist-info/METADATA +575 -0
- tracepipe-0.3.0.dist-info/RECORD +29 -0
- tracepipe-0.2.0.dist-info/METADATA +0 -508
- tracepipe-0.2.0.dist-info/RECORD +0 -19
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/WHEEL +0 -0
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/licenses/LICENSE +0 -0
tracepipe/api.py
CHANGED
|
@@ -1,18 +1,28 @@
|
|
|
1
1
|
# tracepipe/api.py
|
|
2
2
|
"""
|
|
3
|
-
|
|
3
|
+
Core API for TracePipe.
|
|
4
|
+
|
|
5
|
+
This module provides the foundational enable/disable/reset functions
|
|
6
|
+
and internal result classes. For user-facing functionality, see:
|
|
7
|
+
- convenience.py: check(), trace(), why(), report()
|
|
8
|
+
- debug.py: inspect(), export()
|
|
9
|
+
- contracts.py: contract()
|
|
10
|
+
- snapshot.py: snapshot(), diff()
|
|
11
|
+
|
|
12
|
+
Modes:
|
|
13
|
+
- CI: Fast stats and drop tracking. No merge provenance or ghost values.
|
|
14
|
+
- DEBUG: Full provenance with merge origin tracking and ghost row values.
|
|
4
15
|
"""
|
|
5
16
|
|
|
6
17
|
from __future__ import annotations
|
|
7
18
|
|
|
8
19
|
import sys
|
|
9
20
|
import types
|
|
21
|
+
from collections.abc import Sequence
|
|
10
22
|
from dataclasses import fields
|
|
11
23
|
|
|
12
|
-
import pandas as pd
|
|
13
|
-
|
|
14
24
|
from .context import TracePipeContext, get_context, reset_context, set_context
|
|
15
|
-
from .core import LineageGaps, TracePipeConfig
|
|
25
|
+
from .core import LineageGaps, TracePipeConfig, TracePipeMode
|
|
16
26
|
from .instrumentation.pandas_inst import instrument_pandas, uninstrument_pandas
|
|
17
27
|
from .storage.base import LineageBackend, RowIdentityStrategy
|
|
18
28
|
|
|
@@ -24,48 +34,96 @@ def _get_module() -> types.ModuleType:
|
|
|
24
34
|
|
|
25
35
|
def enable(
|
|
26
36
|
config: TracePipeConfig | None = None,
|
|
37
|
+
mode: TracePipeMode | str | None = None,
|
|
38
|
+
*,
|
|
39
|
+
watch: Sequence[str] | None = None,
|
|
27
40
|
auto_watch: bool = False,
|
|
28
41
|
backend: LineageBackend | None = None,
|
|
29
42
|
identity: RowIdentityStrategy | None = None,
|
|
43
|
+
merge_provenance: bool | None = None,
|
|
44
|
+
ghost_row_values: bool | None = None,
|
|
45
|
+
cell_history: bool | None = None,
|
|
46
|
+
sample_rate: float | None = None,
|
|
47
|
+
max_tracked_rows: int | None = None,
|
|
30
48
|
) -> types.ModuleType:
|
|
31
49
|
"""
|
|
32
50
|
Enable TracePipe lineage tracking.
|
|
33
51
|
|
|
34
52
|
Args:
|
|
35
|
-
config: Optional configuration
|
|
53
|
+
config: Optional configuration object
|
|
54
|
+
mode: Operating mode - "ci" (fast) or "debug" (full provenance)
|
|
55
|
+
watch: List of columns to watch for cell-level changes
|
|
36
56
|
auto_watch: If True, automatically watch columns with nulls
|
|
37
|
-
backend: Optional custom storage backend
|
|
38
|
-
identity: Optional custom row identity strategy
|
|
57
|
+
backend: Optional custom storage backend
|
|
58
|
+
identity: Optional custom row identity strategy
|
|
59
|
+
merge_provenance: Override: capture merge parent RIDs (DEBUG default: True)
|
|
60
|
+
ghost_row_values: Override: capture last values of dropped rows
|
|
61
|
+
cell_history: Override: capture cell-level changes
|
|
62
|
+
sample_rate: Track only this fraction of rows (0.0-1.0)
|
|
63
|
+
max_tracked_rows: Maximum rows to track (for large datasets)
|
|
39
64
|
|
|
40
65
|
Returns:
|
|
41
66
|
The tracepipe module for fluent chaining.
|
|
42
67
|
|
|
43
68
|
Examples:
|
|
44
|
-
#
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
# Fluent chaining
|
|
48
|
-
tracepipe.enable().watch("age", "salary")
|
|
69
|
+
# CI mode (fast, default)
|
|
70
|
+
tp.enable()
|
|
49
71
|
|
|
50
|
-
#
|
|
51
|
-
|
|
52
|
-
tracepipe.enable(backend=SQLiteLineageStore(config, "lineage.db"))
|
|
72
|
+
# Debug mode with watched columns
|
|
73
|
+
tp.enable(mode="debug", watch=["age", "salary"])
|
|
53
74
|
|
|
54
|
-
#
|
|
55
|
-
|
|
56
|
-
tracepipe.enable(identity=PolarsRowIdentity(config))
|
|
75
|
+
# Custom configuration
|
|
76
|
+
tp.enable(mode="ci", merge_provenance=True)
|
|
57
77
|
"""
|
|
78
|
+
# Get or create config
|
|
79
|
+
# If config is provided explicitly, use it
|
|
80
|
+
# Otherwise, start with existing context config (if any) or create new default
|
|
81
|
+
if config is None:
|
|
82
|
+
existing_ctx = get_context()
|
|
83
|
+
config = existing_ctx.config # Use existing config as base
|
|
84
|
+
|
|
85
|
+
# Handle mode
|
|
86
|
+
if mode is not None:
|
|
87
|
+
if isinstance(mode, str):
|
|
88
|
+
mode = TracePipeMode(mode.lower())
|
|
89
|
+
config.mode = mode
|
|
90
|
+
|
|
91
|
+
# Apply feature overrides
|
|
92
|
+
if merge_provenance is not None:
|
|
93
|
+
config.merge_provenance = merge_provenance
|
|
94
|
+
if ghost_row_values is not None:
|
|
95
|
+
config.ghost_row_values = ghost_row_values
|
|
96
|
+
if cell_history is not None:
|
|
97
|
+
config.cell_history = cell_history
|
|
98
|
+
|
|
99
|
+
if auto_watch:
|
|
100
|
+
config.auto_watch = True
|
|
101
|
+
|
|
102
|
+
# Sampling config validation
|
|
103
|
+
if sample_rate is not None or max_tracked_rows is not None:
|
|
104
|
+
import warnings
|
|
105
|
+
|
|
106
|
+
warnings.warn(
|
|
107
|
+
"sample_rate and max_tracked_rows are not yet implemented. "
|
|
108
|
+
"These parameters will be ignored.",
|
|
109
|
+
UserWarning,
|
|
110
|
+
stacklevel=2,
|
|
111
|
+
)
|
|
112
|
+
|
|
58
113
|
# Create context with custom backends if provided
|
|
59
114
|
if backend is not None or identity is not None:
|
|
60
115
|
ctx = TracePipeContext(config=config, backend=backend, identity=identity)
|
|
61
116
|
set_context(ctx)
|
|
62
117
|
else:
|
|
63
118
|
ctx = get_context()
|
|
64
|
-
|
|
65
|
-
|
|
119
|
+
ctx.config = config
|
|
120
|
+
# Also update config in row_manager and store (they may have their own references)
|
|
121
|
+
ctx.row_manager.config = config
|
|
122
|
+
ctx.store.config = config
|
|
66
123
|
|
|
67
|
-
|
|
68
|
-
|
|
124
|
+
# Add watched columns
|
|
125
|
+
if watch:
|
|
126
|
+
ctx.watched_columns.update(watch)
|
|
69
127
|
|
|
70
128
|
if not ctx.enabled:
|
|
71
129
|
instrument_pandas()
|
|
@@ -80,8 +138,7 @@ def disable() -> types.ModuleType:
|
|
|
80
138
|
|
|
81
139
|
Note:
|
|
82
140
|
This stops tracking but preserves lineage data collected so far.
|
|
83
|
-
|
|
84
|
-
To clear all data, use reset() instead.
|
|
141
|
+
Use reset() to clear all data.
|
|
85
142
|
|
|
86
143
|
Returns:
|
|
87
144
|
The tracepipe module for fluent chaining.
|
|
@@ -90,7 +147,6 @@ def disable() -> types.ModuleType:
|
|
|
90
147
|
|
|
91
148
|
if ctx.enabled:
|
|
92
149
|
uninstrument_pandas()
|
|
93
|
-
# Call cleanup if backend supports it
|
|
94
150
|
if hasattr(ctx.store, "_cleanup_spillover"):
|
|
95
151
|
ctx.store._cleanup_spillover()
|
|
96
152
|
ctx.enabled = False
|
|
@@ -105,11 +161,6 @@ def reset() -> types.ModuleType:
|
|
|
105
161
|
This clears ALL lineage data, steps, watched columns, and row registrations.
|
|
106
162
|
If tracking was enabled, it will be re-enabled with a fresh context.
|
|
107
163
|
|
|
108
|
-
Use this when:
|
|
109
|
-
- Starting fresh in a notebook cell
|
|
110
|
-
- Running multiple independent analyses
|
|
111
|
-
- Testing
|
|
112
|
-
|
|
113
164
|
Returns:
|
|
114
165
|
The tracepipe module for fluent chaining.
|
|
115
166
|
"""
|
|
@@ -122,7 +173,6 @@ def reset() -> types.ModuleType:
|
|
|
122
173
|
reset_context()
|
|
123
174
|
|
|
124
175
|
if was_enabled:
|
|
125
|
-
# Re-enable with fresh context
|
|
126
176
|
enable()
|
|
127
177
|
|
|
128
178
|
return _get_module()
|
|
@@ -133,38 +183,17 @@ def configure(**kwargs) -> types.ModuleType:
|
|
|
133
183
|
Update configuration.
|
|
134
184
|
|
|
135
185
|
Args:
|
|
136
|
-
**kwargs: Configuration options to update.
|
|
137
|
-
- max_diffs_in_memory: Maximum diffs before spilling to disk
|
|
138
|
-
- max_diffs_per_step: Threshold for mass update detection
|
|
139
|
-
- max_group_membership_size: Threshold for count-only groups
|
|
140
|
-
- strict_mode: Raise exceptions on tracking errors
|
|
141
|
-
- auto_watch: Auto-watch columns with null values
|
|
142
|
-
- auto_watch_null_threshold: Null ratio threshold for auto-watch
|
|
143
|
-
- spillover_dir: Directory for spilled data
|
|
144
|
-
- use_hidden_column: Use hidden column for row tracking
|
|
145
|
-
- warn_on_duplicate_index: Warn on duplicate DataFrame index
|
|
146
|
-
- cleanup_spillover_on_disable: Clean up spilled files on disable
|
|
186
|
+
**kwargs: Configuration options to update.
|
|
147
187
|
|
|
148
188
|
Returns:
|
|
149
189
|
The tracepipe module for fluent chaining.
|
|
150
|
-
|
|
151
|
-
Raises:
|
|
152
|
-
ValueError: If an invalid configuration key is provided.
|
|
153
|
-
|
|
154
|
-
Examples:
|
|
155
|
-
tracepipe.configure(max_diffs_per_step=1000)
|
|
156
|
-
tracepipe.enable().configure(strict_mode=True).watch("amount")
|
|
157
190
|
"""
|
|
158
191
|
ctx = get_context()
|
|
159
192
|
|
|
160
|
-
# Validate keys against dataclass fields
|
|
161
193
|
valid_keys = {f.name for f in fields(TracePipeConfig)}
|
|
162
194
|
invalid_keys = set(kwargs.keys()) - valid_keys
|
|
163
195
|
if invalid_keys:
|
|
164
|
-
raise ValueError(
|
|
165
|
-
f"Invalid configuration key(s): {invalid_keys}. "
|
|
166
|
-
f"Valid keys are: {sorted(valid_keys)}"
|
|
167
|
-
)
|
|
196
|
+
raise ValueError(f"Invalid configuration key(s): {invalid_keys}")
|
|
168
197
|
|
|
169
198
|
for key, value in kwargs.items():
|
|
170
199
|
setattr(ctx.config, key, value)
|
|
@@ -172,110 +201,76 @@ def configure(**kwargs) -> types.ModuleType:
|
|
|
172
201
|
return _get_module()
|
|
173
202
|
|
|
174
203
|
|
|
175
|
-
def
|
|
176
|
-
"""
|
|
177
|
-
Add columns to watch for cell-level changes.
|
|
178
|
-
|
|
179
|
-
Args:
|
|
180
|
-
*columns: Column names to watch.
|
|
204
|
+
def stage(name: str):
|
|
205
|
+
"""Context manager for naming pipeline stages."""
|
|
181
206
|
|
|
182
|
-
|
|
183
|
-
|
|
207
|
+
class StageContext:
|
|
208
|
+
def __init__(self, stage_name: str):
|
|
209
|
+
self.stage_name = stage_name
|
|
210
|
+
self.previous_stage = None
|
|
184
211
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
ctx.watched_columns.update(columns)
|
|
191
|
-
return _get_module()
|
|
212
|
+
def __enter__(self):
|
|
213
|
+
ctx = get_context()
|
|
214
|
+
self.previous_stage = ctx.current_stage
|
|
215
|
+
ctx.current_stage = self.stage_name
|
|
216
|
+
return self
|
|
192
217
|
|
|
218
|
+
def __exit__(self, *args):
|
|
219
|
+
ctx = get_context()
|
|
220
|
+
ctx.current_stage = self.previous_stage
|
|
193
221
|
|
|
194
|
-
|
|
195
|
-
"""
|
|
196
|
-
Watch all columns in a DataFrame.
|
|
222
|
+
return StageContext(name)
|
|
197
223
|
|
|
198
|
-
Args:
|
|
199
|
-
df: DataFrame whose columns to watch.
|
|
200
224
|
|
|
201
|
-
|
|
202
|
-
The tracepipe module for fluent chaining.
|
|
203
|
-
|
|
204
|
-
Examples:
|
|
205
|
-
tracepipe.watch_all(df)
|
|
225
|
+
def register(*dfs) -> types.ModuleType:
|
|
206
226
|
"""
|
|
207
|
-
|
|
208
|
-
ctx.watched_columns.update(df.columns.tolist())
|
|
209
|
-
return _get_module()
|
|
227
|
+
Register pre-existing DataFrames for tracking.
|
|
210
228
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
"""
|
|
214
|
-
Remove columns from watch list.
|
|
229
|
+
Use this when DataFrames were created before tp.enable() was called.
|
|
230
|
+
After registration, snapshots, ghost rows, and cell history will work.
|
|
215
231
|
|
|
216
232
|
Args:
|
|
217
|
-
*
|
|
218
|
-
|
|
219
|
-
Returns:
|
|
220
|
-
The tracepipe module for fluent chaining.
|
|
221
|
-
"""
|
|
222
|
-
ctx = get_context()
|
|
223
|
-
ctx.watched_columns.difference_update(columns)
|
|
224
|
-
return _get_module()
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
def clear_watch() -> types.ModuleType:
|
|
228
|
-
"""
|
|
229
|
-
Clear all watched columns.
|
|
233
|
+
*dfs: One or more DataFrames to register
|
|
230
234
|
|
|
231
235
|
Returns:
|
|
232
236
|
The tracepipe module for fluent chaining.
|
|
233
237
|
|
|
234
238
|
Examples:
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
ctx.watched_columns.clear()
|
|
239
|
-
return _get_module()
|
|
239
|
+
# DataFrames created before enable
|
|
240
|
+
df1 = pd.DataFrame({"a": [1, 2, 3]})
|
|
241
|
+
df2 = pd.DataFrame({"b": [4, 5, 6]})
|
|
240
242
|
|
|
243
|
+
tp.enable()
|
|
244
|
+
tp.register(df1, df2) # Now they're tracked
|
|
241
245
|
|
|
242
|
-
|
|
246
|
+
snap = tp.snapshot(df1) # Works!
|
|
243
247
|
"""
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
Use this for DataFrames created before enable() was called.
|
|
248
|
+
import pandas as pd
|
|
247
249
|
|
|
248
|
-
Returns:
|
|
249
|
-
The tracepipe module for fluent chaining.
|
|
250
|
-
"""
|
|
251
250
|
ctx = get_context()
|
|
252
|
-
if ctx.enabled:
|
|
253
|
-
ctx.row_manager.register(df)
|
|
254
|
-
return _get_module()
|
|
255
251
|
|
|
252
|
+
if not ctx.enabled:
|
|
253
|
+
import warnings
|
|
256
254
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
self.previous_stage = None
|
|
255
|
+
warnings.warn(
|
|
256
|
+
"TracePipe is not enabled. Call tp.enable() before tp.register().",
|
|
257
|
+
UserWarning,
|
|
258
|
+
stacklevel=2,
|
|
259
|
+
)
|
|
260
|
+
return _get_module()
|
|
264
261
|
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
ctx.current_stage = self.stage_name
|
|
269
|
-
return self
|
|
262
|
+
for df in dfs:
|
|
263
|
+
if not isinstance(df, pd.DataFrame):
|
|
264
|
+
raise TypeError(f"Expected DataFrame, got {type(df).__name__}")
|
|
270
265
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
ctx.
|
|
266
|
+
# Only register if not already registered
|
|
267
|
+
if ctx.row_manager.get_ids_array(df) is None:
|
|
268
|
+
ctx.row_manager.register(df)
|
|
274
269
|
|
|
275
|
-
return
|
|
270
|
+
return _get_module()
|
|
276
271
|
|
|
277
272
|
|
|
278
|
-
# ===
|
|
273
|
+
# === INTERNAL RESULT CLASSES (used by debug module) ===
|
|
279
274
|
|
|
280
275
|
|
|
281
276
|
class RowLineageResult:
|
|
@@ -284,54 +279,82 @@ class RowLineageResult:
|
|
|
284
279
|
def __init__(self, row_id: int, ctx: TracePipeContext):
|
|
285
280
|
self.row_id = row_id
|
|
286
281
|
self._ctx = ctx
|
|
287
|
-
self._history =
|
|
288
|
-
self._gaps =
|
|
282
|
+
self._history: list[dict] | None = None
|
|
283
|
+
self._gaps: LineageGaps | None = None
|
|
284
|
+
self._drop_event: dict | None = None
|
|
285
|
+
self._drop_event_checked: bool = False
|
|
286
|
+
|
|
287
|
+
def _ensure_drop_event(self) -> None:
|
|
288
|
+
if not self._drop_event_checked:
|
|
289
|
+
self._drop_event = self._ctx.store.get_drop_event(self.row_id)
|
|
290
|
+
self._drop_event_checked = True
|
|
291
|
+
|
|
292
|
+
def _ensure_history(self) -> None:
|
|
293
|
+
if self._history is None:
|
|
294
|
+
self._history = self._ctx.store.get_row_history(self.row_id)
|
|
295
|
+
|
|
296
|
+
def _ensure_gaps(self) -> None:
|
|
297
|
+
if self._gaps is None:
|
|
298
|
+
self._gaps = self._ctx.store.compute_gaps(self.row_id)
|
|
289
299
|
|
|
290
300
|
@property
|
|
291
301
|
def is_alive(self) -> bool:
|
|
292
|
-
|
|
293
|
-
return
|
|
302
|
+
self._ensure_drop_event()
|
|
303
|
+
return self._drop_event is None
|
|
294
304
|
|
|
295
305
|
@property
|
|
296
306
|
def dropped_at(self) -> str | None:
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
return h["operation"]
|
|
307
|
+
self._ensure_drop_event()
|
|
308
|
+
if self._drop_event is not None:
|
|
309
|
+
return self._drop_event.get("operation")
|
|
301
310
|
return None
|
|
302
311
|
|
|
312
|
+
@property
|
|
313
|
+
def dropped_step_id(self) -> int | None:
|
|
314
|
+
self._ensure_drop_event()
|
|
315
|
+
if self._drop_event is not None:
|
|
316
|
+
return self._drop_event.get("step_id")
|
|
317
|
+
return None
|
|
318
|
+
|
|
319
|
+
def merge_origin(self) -> dict | None:
|
|
320
|
+
return self._ctx.store.get_merge_origin(self.row_id)
|
|
321
|
+
|
|
303
322
|
def cell_history(self, column: str) -> list[dict]:
|
|
304
|
-
|
|
323
|
+
self._ensure_history()
|
|
305
324
|
return [h for h in self._history if h["col"] == column]
|
|
306
325
|
|
|
307
326
|
def history(self) -> list[dict]:
|
|
308
|
-
|
|
327
|
+
self._ensure_history()
|
|
309
328
|
return self._history
|
|
310
329
|
|
|
311
330
|
@property
|
|
312
331
|
def gaps(self) -> LineageGaps:
|
|
313
|
-
|
|
332
|
+
self._ensure_gaps()
|
|
314
333
|
return self._gaps
|
|
315
334
|
|
|
316
335
|
@property
|
|
317
336
|
def is_fully_tracked(self) -> bool:
|
|
318
|
-
|
|
337
|
+
self._ensure_gaps()
|
|
319
338
|
return self._gaps.is_fully_tracked
|
|
320
339
|
|
|
321
340
|
def to_dict(self) -> dict:
|
|
322
|
-
|
|
341
|
+
self._ensure_history()
|
|
342
|
+
self._ensure_gaps()
|
|
343
|
+
merge = self.merge_origin()
|
|
323
344
|
return {
|
|
324
345
|
"row_id": self.row_id,
|
|
325
346
|
"is_alive": self.is_alive,
|
|
326
347
|
"dropped_at": self.dropped_at,
|
|
348
|
+
"dropped_step_id": self.dropped_step_id,
|
|
327
349
|
"is_fully_tracked": self.is_fully_tracked,
|
|
328
350
|
"gaps_summary": self._gaps.summary(),
|
|
351
|
+
"merge_origin": merge,
|
|
329
352
|
"history": self._history,
|
|
330
353
|
}
|
|
331
354
|
|
|
332
355
|
def __repr__(self):
|
|
333
356
|
status = "alive" if self.is_alive else f"dropped at {self.dropped_at}"
|
|
334
|
-
return f"<RowLineage row_id={self.row_id} {status} events={len(self.
|
|
357
|
+
return f"<RowLineage row_id={self.row_id} {status} events={len(self.history())}>"
|
|
335
358
|
|
|
336
359
|
|
|
337
360
|
class GroupLineageResult:
|
|
@@ -344,45 +367,25 @@ class GroupLineageResult:
|
|
|
344
367
|
|
|
345
368
|
@property
|
|
346
369
|
def row_ids(self) -> list[int]:
|
|
347
|
-
"""Get list of row IDs in this group."""
|
|
348
370
|
return self._info["row_ids"] if self._info else []
|
|
349
371
|
|
|
350
372
|
@property
|
|
351
373
|
def row_count(self) -> int:
|
|
352
|
-
"""Get number of rows in this group."""
|
|
353
374
|
return self._info["row_count"] if self._info else 0
|
|
354
375
|
|
|
355
376
|
@property
|
|
356
377
|
def is_count_only(self) -> bool:
|
|
357
|
-
"""
|
|
358
|
-
True if group exceeded max_group_membership_size threshold.
|
|
359
|
-
|
|
360
|
-
When True, row_ids will be empty and only row_count is available.
|
|
361
|
-
"""
|
|
362
378
|
return self._info.get("is_count_only", False) if self._info else False
|
|
363
379
|
|
|
364
380
|
@property
|
|
365
381
|
def group_column(self) -> str | None:
|
|
366
|
-
"""Get the column used for grouping."""
|
|
367
382
|
return self._info["group_column"] if self._info else None
|
|
368
383
|
|
|
369
384
|
@property
|
|
370
385
|
def aggregation_functions(self) -> dict[str, str]:
|
|
371
|
-
"""Get the aggregation functions applied."""
|
|
372
386
|
return self._info["agg_functions"] if self._info else {}
|
|
373
387
|
|
|
374
|
-
def get_contributing_rows(self, limit: int = 100) -> list[RowLineageResult]:
|
|
375
|
-
"""
|
|
376
|
-
Get lineage for contributing rows.
|
|
377
|
-
|
|
378
|
-
Returns empty list if is_count_only is True.
|
|
379
|
-
"""
|
|
380
|
-
if self.is_count_only:
|
|
381
|
-
return []
|
|
382
|
-
return [explain(row_id) for row_id in self.row_ids[:limit]]
|
|
383
|
-
|
|
384
388
|
def to_dict(self) -> dict:
|
|
385
|
-
"""Export to dictionary."""
|
|
386
389
|
return {
|
|
387
390
|
"group_key": self.group_key,
|
|
388
391
|
"group_column": self.group_column,
|
|
@@ -395,169 +398,3 @@ class GroupLineageResult:
|
|
|
395
398
|
def __repr__(self):
|
|
396
399
|
suffix = " (count only)" if self.is_count_only else ""
|
|
397
400
|
return f"<GroupLineage key='{self.group_key}' rows={self.row_count}{suffix}>"
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
def explain(row_id: int) -> RowLineageResult:
|
|
401
|
-
"""Get lineage for a specific row."""
|
|
402
|
-
ctx = get_context()
|
|
403
|
-
return RowLineageResult(row_id, ctx)
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
def explain_many(row_ids: list[int]) -> list[RowLineageResult]:
|
|
407
|
-
"""
|
|
408
|
-
Get lineage for multiple rows.
|
|
409
|
-
|
|
410
|
-
Args:
|
|
411
|
-
row_ids: List of row IDs to explain.
|
|
412
|
-
|
|
413
|
-
Returns:
|
|
414
|
-
List of RowLineageResult objects.
|
|
415
|
-
|
|
416
|
-
Examples:
|
|
417
|
-
results = tracepipe.explain_many([0, 1, 2])
|
|
418
|
-
for row in results:
|
|
419
|
-
print(row.is_alive, row.dropped_at)
|
|
420
|
-
"""
|
|
421
|
-
ctx = get_context()
|
|
422
|
-
return [RowLineageResult(row_id, ctx) for row_id in row_ids]
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
def explain_group(group_key: str) -> GroupLineageResult:
|
|
426
|
-
"""Get lineage for an aggregation group."""
|
|
427
|
-
ctx = get_context()
|
|
428
|
-
return GroupLineageResult(group_key, ctx)
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
def dropped_rows(by_step: bool = False) -> list[int] | dict[str, int]:
|
|
432
|
-
"""
|
|
433
|
-
Get dropped row information.
|
|
434
|
-
|
|
435
|
-
Args:
|
|
436
|
-
by_step: If False (default), return list of dropped row IDs.
|
|
437
|
-
If True, return dict mapping operation names to drop counts.
|
|
438
|
-
|
|
439
|
-
Returns:
|
|
440
|
-
List of row IDs if by_step=False, or dict of {operation: count} if by_step=True.
|
|
441
|
-
|
|
442
|
-
Examples:
|
|
443
|
-
# Get all dropped row IDs
|
|
444
|
-
dropped = tracepipe.dropped_rows()
|
|
445
|
-
|
|
446
|
-
# Get counts by operation
|
|
447
|
-
by_op = tracepipe.dropped_rows(by_step=True)
|
|
448
|
-
# {'DataFrame.dropna': 5, 'DataFrame.query': 3}
|
|
449
|
-
"""
|
|
450
|
-
ctx = get_context()
|
|
451
|
-
if by_step:
|
|
452
|
-
return ctx.store.get_dropped_by_step()
|
|
453
|
-
return ctx.store.get_dropped_rows()
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
def alive_rows() -> list[int]:
|
|
457
|
-
"""
|
|
458
|
-
Get all row IDs that are still alive (not dropped).
|
|
459
|
-
|
|
460
|
-
Returns:
|
|
461
|
-
List of row IDs that have not been dropped.
|
|
462
|
-
|
|
463
|
-
Examples:
|
|
464
|
-
alive = tracepipe.alive_rows()
|
|
465
|
-
print(f"{len(alive)} rows survived the pipeline")
|
|
466
|
-
"""
|
|
467
|
-
ctx = get_context()
|
|
468
|
-
all_registered = set(ctx.row_manager.all_registered_ids())
|
|
469
|
-
dropped = set(ctx.store.get_dropped_rows())
|
|
470
|
-
return sorted(all_registered - dropped)
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
def mass_updates() -> list[dict]:
|
|
474
|
-
"""Get operations that exceeded cell diff threshold."""
|
|
475
|
-
ctx = get_context()
|
|
476
|
-
return [
|
|
477
|
-
{
|
|
478
|
-
"step_id": s.step_id,
|
|
479
|
-
"operation": s.operation,
|
|
480
|
-
"rows_affected": s.rows_affected,
|
|
481
|
-
"stage": s.stage,
|
|
482
|
-
}
|
|
483
|
-
for s in ctx.store.steps
|
|
484
|
-
if s.is_mass_update
|
|
485
|
-
]
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
def steps() -> list[dict]:
|
|
489
|
-
"""Get all tracked steps."""
|
|
490
|
-
ctx = get_context()
|
|
491
|
-
return [
|
|
492
|
-
{
|
|
493
|
-
"step_id": s.step_id,
|
|
494
|
-
"operation": s.operation,
|
|
495
|
-
"stage": s.stage,
|
|
496
|
-
"input_shape": s.input_shape,
|
|
497
|
-
"output_shape": s.output_shape,
|
|
498
|
-
"completeness": s.completeness.name,
|
|
499
|
-
"is_mass_update": s.is_mass_update,
|
|
500
|
-
"timestamp": s.timestamp,
|
|
501
|
-
"code_file": s.code_file,
|
|
502
|
-
"code_line": s.code_line,
|
|
503
|
-
}
|
|
504
|
-
for s in ctx.store.steps
|
|
505
|
-
]
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
def aggregation_groups() -> list[str]:
|
|
509
|
-
"""List all tracked aggregation groups."""
|
|
510
|
-
ctx = get_context()
|
|
511
|
-
groups = []
|
|
512
|
-
for mapping in ctx.store.aggregation_mappings:
|
|
513
|
-
groups.extend(mapping.membership.keys())
|
|
514
|
-
return groups
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
# === EXPORT ===
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
def export_json(filepath: str) -> None:
|
|
521
|
-
"""Export lineage to JSON file."""
|
|
522
|
-
ctx = get_context()
|
|
523
|
-
with open(filepath, "w") as f:
|
|
524
|
-
f.write(ctx.store.to_json())
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
def export_arrow(filepath: str) -> None:
|
|
528
|
-
"""
|
|
529
|
-
Export lineage to Parquet file.
|
|
530
|
-
|
|
531
|
-
Requires pyarrow to be installed.
|
|
532
|
-
|
|
533
|
-
Args:
|
|
534
|
-
filepath: Path to write the Parquet file.
|
|
535
|
-
|
|
536
|
-
Raises:
|
|
537
|
-
ImportError: If pyarrow is not installed.
|
|
538
|
-
"""
|
|
539
|
-
try:
|
|
540
|
-
import pyarrow.parquet as pq
|
|
541
|
-
except ImportError:
|
|
542
|
-
raise ImportError(
|
|
543
|
-
"pyarrow is required for Arrow/Parquet export. "
|
|
544
|
-
"Install it with: pip install tracepipe[arrow] or pip install pyarrow"
|
|
545
|
-
) from None
|
|
546
|
-
|
|
547
|
-
ctx = get_context()
|
|
548
|
-
table = ctx.store.to_arrow()
|
|
549
|
-
pq.write_table(table, filepath)
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
def stats() -> dict:
|
|
553
|
-
"""Get tracking statistics."""
|
|
554
|
-
ctx = get_context()
|
|
555
|
-
return {
|
|
556
|
-
"enabled": ctx.enabled,
|
|
557
|
-
"total_steps": len(ctx.store.steps),
|
|
558
|
-
"total_diffs": ctx.store.total_diff_count,
|
|
559
|
-
"in_memory_diffs": ctx.store.diff_count,
|
|
560
|
-
"spilled_files": len(ctx.store.spilled_files),
|
|
561
|
-
"watched_columns": list(ctx.watched_columns),
|
|
562
|
-
"aggregation_groups": len(ctx.store.aggregation_mappings),
|
|
563
|
-
}
|