tracepipe 0.2.0-py3-none-any.whl → 0.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +117 -78
- tracepipe/api.py +219 -332
- tracepipe/context.py +21 -1
- tracepipe/contracts.py +473 -0
- tracepipe/convenience.py +817 -0
- tracepipe/core.py +174 -17
- tracepipe/debug.py +325 -0
- tracepipe/instrumentation/apply_capture.py +453 -0
- tracepipe/instrumentation/filter_capture.py +468 -0
- tracepipe/instrumentation/indexer_capture.py +813 -0
- tracepipe/instrumentation/merge_capture.py +434 -0
- tracepipe/instrumentation/pandas_inst.py +66 -183
- tracepipe/instrumentation/series_capture.py +331 -0
- tracepipe/safety.py +3 -3
- tracepipe/snapshot.py +420 -0
- tracepipe/storage/base.py +7 -3
- tracepipe/storage/lineage_store.py +252 -47
- tracepipe/storage/row_identity.py +366 -104
- tracepipe/value_provenance.py +309 -0
- tracepipe/visualization/html_export.py +22 -7
- tracepipe-0.3.1.dist-info/METADATA +308 -0
- tracepipe-0.3.1.dist-info/RECORD +29 -0
- tracepipe-0.2.0.dist-info/METADATA +0 -508
- tracepipe-0.2.0.dist-info/RECORD +0 -19
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.1.dist-info}/WHEEL +0 -0
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.1.dist-info}/licenses/LICENSE +0 -0
tracepipe/api.py
CHANGED
@@ -1,18 +1,28 @@
 # tracepipe/api.py
 """
-
+Core API for TracePipe.
+
+This module provides the foundational enable/disable/reset functions
+and internal result classes. For user-facing functionality, see:
+- convenience.py: check(), trace(), why(), report()
+- debug.py: inspect(), export()
+- contracts.py: contract()
+- snapshot.py: snapshot(), diff()
+
+Modes:
+- CI: Fast stats and drop tracking. No merge provenance or ghost values.
+- DEBUG: Full provenance with merge origin tracking and ghost row values.
 """
 
 from __future__ import annotations
 
 import sys
 import types
+from collections.abc import Sequence
 from dataclasses import fields
 
-import pandas as pd
-
 from .context import TracePipeContext, get_context, reset_context, set_context
-from .core import LineageGaps, TracePipeConfig
+from .core import LineageGaps, TracePipeConfig, TracePipeMode
 from .instrumentation.pandas_inst import instrument_pandas, uninstrument_pandas
 from .storage.base import LineageBackend, RowIdentityStrategy
 
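The rewritten module docstring above summarizes the 0.3.1 reorganization: api.py keeps the enable/disable/reset core, and user-facing helpers move to the new submodules added in this release. A minimal orientation sketch in Python, assuming those names are re-exported at package level (the +117/-78 change to tracepipe/__init__.py suggests this, but the __init__.py hunks are not shown here):

    import tracepipe as tp

    tp.enable(mode="ci")     # fast: stats and drop tracking only
    tp.enable(mode="debug")  # full: merge provenance and ghost row values

    # User-facing entry points per the docstring (signatures not in this diff):
    #   tp.check(), tp.trace(), tp.why(), tp.report()  -> convenience.py
    #   tp.inspect(), tp.export()                      -> debug.py
    #   tp.contract()                                  -> contracts.py
    #   tp.snapshot(), tp.diff()                       -> snapshot.py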
@@ -24,48 +34,102 @@ def enable(
 
 def enable(
     config: TracePipeConfig | None = None,
+    mode: TracePipeMode | str | None = None,
+    *,
+    watch: Sequence[str] | None = None,
     auto_watch: bool = False,
     backend: LineageBackend | None = None,
     identity: RowIdentityStrategy | None = None,
+    merge_provenance: bool | None = None,
+    ghost_row_values: bool | None = None,
+    cell_history: bool | None = None,
+    sample_rate: float | None = None,
+    max_tracked_rows: int | None = None,
 ) -> types.ModuleType:
     """
     Enable TracePipe lineage tracking.
 
     Args:
-        config: Optional configuration
+        config: Optional configuration object
+        mode: Operating mode - "ci" (fast) or "debug" (full provenance)
+        watch: List of columns to watch for cell-level changes
         auto_watch: If True, automatically watch columns with nulls
-        backend: Optional custom storage backend
-        identity: Optional custom row identity strategy
+        backend: Optional custom storage backend
+        identity: Optional custom row identity strategy
+        merge_provenance: Override: capture merge parent RIDs (DEBUG default: True)
+        ghost_row_values: Override: capture last values of dropped rows
+        cell_history: Override: capture cell-level changes
+        sample_rate: Track only this fraction of rows (0.0-1.0)
+        max_tracked_rows: Maximum rows to track (for large datasets)
 
     Returns:
         The tracepipe module for fluent chaining.
 
     Examples:
-        #
-
-
-        # Fluent chaining
-        tracepipe.enable().watch("age", "salary")
+        # CI mode (fast, default)
+        tp.enable()
 
-        #
-
-        tracepipe.enable(backend=SQLiteLineageStore(config, "lineage.db"))
+        # Debug mode with watched columns
+        tp.enable(mode="debug", watch=["age", "salary"])
 
-        #
-
-        tracepipe.enable(identity=PolarsRowIdentity(config))
+        # Custom configuration
+        tp.enable(mode="ci", merge_provenance=True)
     """
+    ctx = get_context()
+
+    # If already enabled, reset accumulated state to prevent duplicate warnings/stats
+    # This handles the common case of re-running scripts in notebooks/IDEs
+    if ctx.enabled:
+        _reset_accumulated_state(ctx)
+
+    # Get or create config
+    # If config is provided explicitly, use it
+    # Otherwise, start with existing context config (if any) or create new default
+    if config is None:
+        config = ctx.config  # Use existing config as base
+
+    # Handle mode
+    if mode is not None:
+        if isinstance(mode, str):
+            mode = TracePipeMode(mode.lower())
+        config.mode = mode
+
+    # Apply feature overrides
+    if merge_provenance is not None:
+        config.merge_provenance = merge_provenance
+    if ghost_row_values is not None:
+        config.ghost_row_values = ghost_row_values
+    if cell_history is not None:
+        config.cell_history = cell_history
+
+    if auto_watch:
+        config.auto_watch = True
+
+    # Sampling config validation
+    if sample_rate is not None or max_tracked_rows is not None:
+        import warnings
+
+        warnings.warn(
+            "sample_rate and max_tracked_rows are not yet implemented. "
+            "These parameters will be ignored.",
+            UserWarning,
+            stacklevel=2,
+        )
+
     # Create context with custom backends if provided
     if backend is not None or identity is not None:
         ctx = TracePipeContext(config=config, backend=backend, identity=identity)
         set_context(ctx)
     else:
-        ctx =
-
-
+        ctx.config = config
+        # Also update config in row_manager and store (they may have their own references)
+        ctx.row_manager.config = config
+        ctx.store.config = config
 
-    if
-
+    # Add watched columns (reset first if re-enabling to avoid stale watches)
+    if watch:
+        ctx.watched_columns.clear()
+        ctx.watched_columns.update(watch)
 
     if not ctx.enabled:
         instrument_pandas()
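A usage sketch of the widened enable() signature above, drawn only from the new parameters, docstring, and body (tp is assumed to be the imported tracepipe package):

    import pandas as pd
    import tracepipe as tp

    # Mode plus keyword-only feature overrides
    tp.enable(mode="debug", watch=["age", "salary"], ghost_row_values=True)

    df = pd.DataFrame({"age": [30.0, None, 45.0], "salary": [50.0, 60.0, None]})
    df = df.dropna()

    # Re-running enable() in the same process resets accumulated state
    # instead of double-counting (the notebook re-run case handled above).
    tp.enable(mode="debug", watch=["age", "salary"])

    # Accepted but not yet implemented: emits a UserWarning and is ignored.
    tp.enable(sample_rate=0.5)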
@@ -74,14 +138,57 @@
     return _get_module()
 
 
+def _reset_accumulated_state(ctx: TracePipeContext) -> None:
+    """
+    Reset accumulated lineage state without disabling instrumentation.
+
+    Called when enable() is invoked on an already-enabled context to prevent
+    state accumulation across multiple script runs in the same Python process.
+    """
+    store = ctx.store
+
+    # Clear merge stats (prevents duplicate warnings)
+    if hasattr(store, "merge_stats"):
+        store.merge_stats.clear()
+
+    # Clear bulk drops
+    if hasattr(store, "bulk_drops"):
+        store.bulk_drops.clear()
+
+    # Clear steps
+    if hasattr(store, "_steps"):
+        store._steps.clear()
+
+    # Clear in-memory diffs
+    if hasattr(store, "_clear_in_memory"):
+        store._clear_in_memory()
+
+    # Reset step counter
+    if hasattr(store, "_step_counter"):
+        store._step_counter = 0
+
+    # Clear merge mappings
+    if hasattr(store, "merge_mappings"):
+        store.merge_mappings.clear()
+
+    # Clear aggregation mappings
+    if hasattr(store, "aggregation_mappings"):
+        store.aggregation_mappings.clear()
+
+    # Reset row identity manager
+    ctx.row_manager.clear()
+
+    # Clear watched columns (will be re-added if watch param provided)
+    ctx.watched_columns.clear()
+
+
 def disable() -> types.ModuleType:
     """
     Disable TracePipe and restore original pandas methods.
 
     Note:
         This stops tracking but preserves lineage data collected so far.
-
-        To clear all data, use reset() instead.
+        Use reset() to clear all data.
 
     Returns:
         The tracepipe module for fluent chaining.
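The new _reset_accumulated_state() helper above is what makes repeated enable() calls safe: stores, counters, mappings, and watches are cleared while pandas stays instrumented. Combined with the trimmed disable()/reset() docstrings, the lifecycle is roughly this sketch:

    import tracepipe as tp

    tp.enable()    # instruments pandas
    tp.enable()    # same process: accumulated state cleared, not duplicated
    tp.disable()   # stops tracking; lineage collected so far is kept
    tp.reset()     # clears ALL lineage state; re-enables only if tracking was on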
@@ -90,7 +197,6 @@ def disable() -> types.ModuleType:
 
     if ctx.enabled:
         uninstrument_pandas()
-        # Call cleanup if backend supports it
         if hasattr(ctx.store, "_cleanup_spillover"):
             ctx.store._cleanup_spillover()
         ctx.enabled = False
@@ -105,11 +211,6 @@ def reset() -> types.ModuleType:
     This clears ALL lineage data, steps, watched columns, and row registrations.
     If tracking was enabled, it will be re-enabled with a fresh context.
 
-    Use this when:
-    - Starting fresh in a notebook cell
-    - Running multiple independent analyses
-    - Testing
-
     Returns:
         The tracepipe module for fluent chaining.
     """
@@ -122,7 +223,6 @@ def reset() -> types.ModuleType:
     reset_context()
 
     if was_enabled:
-        # Re-enable with fresh context
         enable()
 
     return _get_module()
@@ -133,38 +233,17 @@ def configure(**kwargs) -> types.ModuleType:
     Update configuration.
 
     Args:
-        **kwargs: Configuration options to update.
-            - max_diffs_in_memory: Maximum diffs before spilling to disk
-            - max_diffs_per_step: Threshold for mass update detection
-            - max_group_membership_size: Threshold for count-only groups
-            - strict_mode: Raise exceptions on tracking errors
-            - auto_watch: Auto-watch columns with null values
-            - auto_watch_null_threshold: Null ratio threshold for auto-watch
-            - spillover_dir: Directory for spilled data
-            - use_hidden_column: Use hidden column for row tracking
-            - warn_on_duplicate_index: Warn on duplicate DataFrame index
-            - cleanup_spillover_on_disable: Clean up spilled files on disable
+        **kwargs: Configuration options to update.
 
     Returns:
         The tracepipe module for fluent chaining.
-
-    Raises:
-        ValueError: If an invalid configuration key is provided.
-
-    Examples:
-        tracepipe.configure(max_diffs_per_step=1000)
-        tracepipe.enable().configure(strict_mode=True).watch("amount")
     """
     ctx = get_context()
 
-    # Validate keys against dataclass fields
     valid_keys = {f.name for f in fields(TracePipeConfig)}
     invalid_keys = set(kwargs.keys()) - valid_keys
     if invalid_keys:
-        raise ValueError(
-            f"Invalid configuration key(s): {invalid_keys}. "
-            f"Valid keys are: {sorted(valid_keys)}"
-        )
+        raise ValueError(f"Invalid configuration key(s): {invalid_keys}")
 
     for key, value in kwargs.items():
         setattr(ctx.config, key, value)
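configure() still validates kwargs against the TracePipeConfig dataclass fields; only the error message lost its listing of valid keys. A sketch, assuming strict_mode remains a TracePipeConfig field as it was in the 0.2.0 docstring removed above:

    import tracepipe as tp

    tp.configure(strict_mode=True)  # any TracePipeConfig field name is accepted

    try:
        tp.configure(no_such_option=1)
    except ValueError as err:
        print(err)  # Invalid configuration key(s): {'no_such_option'}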
@@ -172,110 +251,76 @@ def configure(**kwargs) -> types.ModuleType:
     return _get_module()
 
 
-def
-    """
-    Add columns to watch for cell-level changes.
-
-    Args:
-        *columns: Column names to watch.
-
-    Returns:
-        The tracepipe module for fluent chaining.
+def stage(name: str):
+    """Context manager for naming pipeline stages."""
 
-
-
-
-
-    ctx = get_context()
-    ctx.watched_columns.update(columns)
-    return _get_module()
+    class StageContext:
+        def __init__(self, stage_name: str):
+            self.stage_name = stage_name
+            self.previous_stage = None
 
+        def __enter__(self):
+            ctx = get_context()
+            self.previous_stage = ctx.current_stage
+            ctx.current_stage = self.stage_name
+            return self
 
-def
-
-
+        def __exit__(self, *args):
+            ctx = get_context()
+            ctx.current_stage = self.previous_stage
 
-
-        df: DataFrame whose columns to watch.
+    return StageContext(name)
 
-    Returns:
-        The tracepipe module for fluent chaining.
 
-
-        tracepipe.watch_all(df)
+def register(*dfs) -> types.ModuleType:
     """
-
-    ctx.watched_columns.update(df.columns.tolist())
-    return _get_module()
+    Register pre-existing DataFrames for tracking.
 
-
-
-    """
-    Remove columns from watch list.
+    Use this when DataFrames were created before tp.enable() was called.
+    After registration, snapshots, ghost rows, and cell history will work.
 
     Args:
-        *
-
-    Returns:
-        The tracepipe module for fluent chaining.
-    """
-    ctx = get_context()
-    ctx.watched_columns.difference_update(columns)
-    return _get_module()
-
-
-def clear_watch() -> types.ModuleType:
-    """
-    Clear all watched columns.
+        *dfs: One or more DataFrames to register
 
     Returns:
         The tracepipe module for fluent chaining.
 
     Examples:
-
-
-
-    ctx.watched_columns.clear()
-    return _get_module()
+        # DataFrames created before enable
+        df1 = pd.DataFrame({"a": [1, 2, 3]})
+        df2 = pd.DataFrame({"b": [4, 5, 6]})
 
+        tp.enable()
+        tp.register(df1, df2)  # Now they're tracked
 
-
+        snap = tp.snapshot(df1)  # Works!
     """
-
-
-    Use this for DataFrames created before enable() was called.
+    import pandas as pd
 
-    Returns:
-        The tracepipe module for fluent chaining.
-    """
     ctx = get_context()
-    if ctx.enabled:
-        ctx.row_manager.register(df)
-    return _get_module()
-
 
-
-
+    if not ctx.enabled:
+        import warnings
 
-
-
-
-
+        warnings.warn(
+            "TracePipe is not enabled. Call tp.enable() before tp.register().",
+            UserWarning,
+            stacklevel=2,
+        )
+        return _get_module()
 
-
-
-
-        ctx.current_stage = self.stage_name
-        return self
+    for df in dfs:
+        if not isinstance(df, pd.DataFrame):
+            raise TypeError(f"Expected DataFrame, got {type(df).__name__}")
 
-
-
-    ctx.
+        # Only register if not already registered
+        if ctx.row_manager.get_ids_array(df) is None:
+            ctx.row_manager.register(df)
 
-    return
+    return _get_module()
 
 
-# ===
+# === INTERNAL RESULT CLASSES (used by debug module) ===
 
 
 class RowLineageResult:
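This hunk drops the old column-watch helpers (watch(), watch_all(), unwatch(), and clear_watch(), judging by the surviving docstring fragments) and rewrites stage() and register() in their place; watching is now configured through enable(watch=...). A combined sketch, assuming stage() and register() are re-exported at package level:

    import pandas as pd
    import tracepipe as tp

    df = pd.DataFrame({"a": [1, 2, 3]})  # created before enable()

    tp.enable(watch=["a"])               # replaces tracepipe.watch("a")
    tp.register(df)                      # late registration, per the docstring

    with tp.stage("cleaning"):           # steps in here carry stage="cleaning"
        df = df[df["a"] > 1]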
@@ -284,54 +329,82 @@ class RowLineageResult:
     def __init__(self, row_id: int, ctx: TracePipeContext):
         self.row_id = row_id
         self._ctx = ctx
-        self._history =
-        self._gaps =
+        self._history: list[dict] | None = None
+        self._gaps: LineageGaps | None = None
+        self._drop_event: dict | None = None
+        self._drop_event_checked: bool = False
+
+    def _ensure_drop_event(self) -> None:
+        if not self._drop_event_checked:
+            self._drop_event = self._ctx.store.get_drop_event(self.row_id)
+            self._drop_event_checked = True
+
+    def _ensure_history(self) -> None:
+        if self._history is None:
+            self._history = self._ctx.store.get_row_history(self.row_id)
+
+    def _ensure_gaps(self) -> None:
+        if self._gaps is None:
+            self._gaps = self._ctx.store.compute_gaps(self.row_id)
 
     @property
     def is_alive(self) -> bool:
-
-        return
+        self._ensure_drop_event()
+        return self._drop_event is None
 
     @property
     def dropped_at(self) -> str | None:
-
-
-
-                return h["operation"]
+        self._ensure_drop_event()
+        if self._drop_event is not None:
+            return self._drop_event.get("operation")
         return None
 
+    @property
+    def dropped_step_id(self) -> int | None:
+        self._ensure_drop_event()
+        if self._drop_event is not None:
+            return self._drop_event.get("step_id")
+        return None
+
+    def merge_origin(self) -> dict | None:
+        return self._ctx.store.get_merge_origin(self.row_id)
+
     def cell_history(self, column: str) -> list[dict]:
-
+        self._ensure_history()
         return [h for h in self._history if h["col"] == column]
 
     def history(self) -> list[dict]:
-
+        self._ensure_history()
         return self._history
 
     @property
     def gaps(self) -> LineageGaps:
-
+        self._ensure_gaps()
         return self._gaps
 
     @property
     def is_fully_tracked(self) -> bool:
-
+        self._ensure_gaps()
         return self._gaps.is_fully_tracked
 
     def to_dict(self) -> dict:
-
+        self._ensure_history()
+        self._ensure_gaps()
+        merge = self.merge_origin()
         return {
             "row_id": self.row_id,
             "is_alive": self.is_alive,
             "dropped_at": self.dropped_at,
+            "dropped_step_id": self.dropped_step_id,
             "is_fully_tracked": self.is_fully_tracked,
             "gaps_summary": self._gaps.summary(),
+            "merge_origin": merge,
             "history": self._history,
         }
 
     def __repr__(self):
         status = "alive" if self.is_alive else f"dropped at {self.dropped_at}"
-        return f"<RowLineage row_id={self.row_id} {status} events={len(self.
+        return f"<RowLineage row_id={self.row_id} {status} events={len(self.history())}>"
 
 
 class GroupLineageResult:
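RowLineageResult above switches from eager lookups to memoized _ensure_*() helpers, so each store query runs at most once and only when first needed. A stripped-down illustration of the idiom (generic code, not TracePipe's):

    class LazyAttr:
        # Memoize an expensive loader behind a property, mirroring
        # _ensure_history()/_ensure_gaps() in the hunk above.
        def __init__(self, loader):
            self._loader = loader
            self._value = None
            self._loaded = False

        @property
        def value(self):
            if not self._loaded:          # first access pays the query cost
                self._value = self._loader()
                self._loaded = True
            return self._value            # later accesses reuse the result

Note the separate _drop_event_checked flag in the real class: None is a meaningful answer there (the row is alive), so None cannot double as the "not loaded yet" sentinel.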
@@ -344,45 +417,25 @@ class GroupLineageResult:
 
     @property
     def row_ids(self) -> list[int]:
-        """Get list of row IDs in this group."""
         return self._info["row_ids"] if self._info else []
 
     @property
     def row_count(self) -> int:
-        """Get number of rows in this group."""
         return self._info["row_count"] if self._info else 0
 
     @property
     def is_count_only(self) -> bool:
-        """
-        True if group exceeded max_group_membership_size threshold.
-
-        When True, row_ids will be empty and only row_count is available.
-        """
         return self._info.get("is_count_only", False) if self._info else False
 
     @property
     def group_column(self) -> str | None:
-        """Get the column used for grouping."""
         return self._info["group_column"] if self._info else None
 
     @property
     def aggregation_functions(self) -> dict[str, str]:
-        """Get the aggregation functions applied."""
         return self._info["agg_functions"] if self._info else {}
 
-    def get_contributing_rows(self, limit: int = 100) -> list[RowLineageResult]:
-        """
-        Get lineage for contributing rows.
-
-        Returns empty list if is_count_only is True.
-        """
-        if self.is_count_only:
-            return []
-        return [explain(row_id) for row_id in self.row_ids[:limit]]
-
     def to_dict(self) -> dict:
-        """Export to dictionary."""
         return {
             "group_key": self.group_key,
             "group_column": self.group_column,
@@ -395,169 +448,3 @@ class GroupLineageResult:
     def __repr__(self):
         suffix = " (count only)" if self.is_count_only else ""
         return f"<GroupLineage key='{self.group_key}' rows={self.row_count}{suffix}>"
-
-
-def explain(row_id: int) -> RowLineageResult:
-    """Get lineage for a specific row."""
-    ctx = get_context()
-    return RowLineageResult(row_id, ctx)
-
-
-def explain_many(row_ids: list[int]) -> list[RowLineageResult]:
-    """
-    Get lineage for multiple rows.
-
-    Args:
-        row_ids: List of row IDs to explain.
-
-    Returns:
-        List of RowLineageResult objects.
-
-    Examples:
-        results = tracepipe.explain_many([0, 1, 2])
-        for row in results:
-            print(row.is_alive, row.dropped_at)
-    """
-    ctx = get_context()
-    return [RowLineageResult(row_id, ctx) for row_id in row_ids]
-
-
-def explain_group(group_key: str) -> GroupLineageResult:
-    """Get lineage for an aggregation group."""
-    ctx = get_context()
-    return GroupLineageResult(group_key, ctx)
-
-
-def dropped_rows(by_step: bool = False) -> list[int] | dict[str, int]:
-    """
-    Get dropped row information.
-
-    Args:
-        by_step: If False (default), return list of dropped row IDs.
-            If True, return dict mapping operation names to drop counts.
-
-    Returns:
-        List of row IDs if by_step=False, or dict of {operation: count} if by_step=True.
-
-    Examples:
-        # Get all dropped row IDs
-        dropped = tracepipe.dropped_rows()
-
-        # Get counts by operation
-        by_op = tracepipe.dropped_rows(by_step=True)
-        # {'DataFrame.dropna': 5, 'DataFrame.query': 3}
-    """
-    ctx = get_context()
-    if by_step:
-        return ctx.store.get_dropped_by_step()
-    return ctx.store.get_dropped_rows()
-
-
-def alive_rows() -> list[int]:
-    """
-    Get all row IDs that are still alive (not dropped).
-
-    Returns:
-        List of row IDs that have not been dropped.
-
-    Examples:
-        alive = tracepipe.alive_rows()
-        print(f"{len(alive)} rows survived the pipeline")
-    """
-    ctx = get_context()
-    all_registered = set(ctx.row_manager.all_registered_ids())
-    dropped = set(ctx.store.get_dropped_rows())
-    return sorted(all_registered - dropped)
-
-
-def mass_updates() -> list[dict]:
-    """Get operations that exceeded cell diff threshold."""
-    ctx = get_context()
-    return [
-        {
-            "step_id": s.step_id,
-            "operation": s.operation,
-            "rows_affected": s.rows_affected,
-            "stage": s.stage,
-        }
-        for s in ctx.store.steps
-        if s.is_mass_update
-    ]
-
-
-def steps() -> list[dict]:
-    """Get all tracked steps."""
-    ctx = get_context()
-    return [
-        {
-            "step_id": s.step_id,
-            "operation": s.operation,
-            "stage": s.stage,
-            "input_shape": s.input_shape,
-            "output_shape": s.output_shape,
-            "completeness": s.completeness.name,
-            "is_mass_update": s.is_mass_update,
-            "timestamp": s.timestamp,
-            "code_file": s.code_file,
-            "code_line": s.code_line,
-        }
-        for s in ctx.store.steps
-    ]
-
-
-def aggregation_groups() -> list[str]:
-    """List all tracked aggregation groups."""
-    ctx = get_context()
-    groups = []
-    for mapping in ctx.store.aggregation_mappings:
-        groups.extend(mapping.membership.keys())
-    return groups
-
-
-# === EXPORT ===
-
-
-def export_json(filepath: str) -> None:
-    """Export lineage to JSON file."""
-    ctx = get_context()
-    with open(filepath, "w") as f:
-        f.write(ctx.store.to_json())
-
-
-def export_arrow(filepath: str) -> None:
-    """
-    Export lineage to Parquet file.
-
-    Requires pyarrow to be installed.
-
-    Args:
-        filepath: Path to write the Parquet file.
-
-    Raises:
-        ImportError: If pyarrow is not installed.
-    """
-    try:
-        import pyarrow.parquet as pq
-    except ImportError:
-        raise ImportError(
-            "pyarrow is required for Arrow/Parquet export. "
-            "Install it with: pip install tracepipe[arrow] or pip install pyarrow"
-        ) from None
-
-    ctx = get_context()
-    table = ctx.store.to_arrow()
-    pq.write_table(table, filepath)
-
-
-def stats() -> dict:
-    """Get tracking statistics."""
-    ctx = get_context()
-    return {
-        "enabled": ctx.enabled,
-        "total_steps": len(ctx.store.steps),
-        "total_diffs": ctx.store.total_diff_count,
-        "in_memory_diffs": ctx.store.diff_count,
-        "spilled_files": len(ctx.store.spilled_files),
-        "watched_columns": list(ctx.watched_columns),
-        "aggregation_groups": len(ctx.store.aggregation_mappings),
-    }
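The final hunk removes the 0.2.0 module-level query and export helpers from api.py entirely. Per the new module docstring, their roles pass to the new submodules; the mapping below is inferred from names alone, since the replacement signatures are not part of this diff:

    # Removed from tracepipe/api.py in 0.3.1:
    #   explain(), explain_many(), explain_group(), dropped_rows(),
    #   alive_rows(), mass_updates(), steps(), aggregation_groups(),
    #   export_json(), export_arrow(), stats()
    #
    # Likely successors (per the docstring; unverified here):
    #   convenience.py: check(), trace(), why(), report()
    #   debug.py: inspect(), export()
    #   snapshot.py: snapshot(), diff()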