tracepipe 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +110 -0
- tracepipe/api.py +563 -0
- tracepipe/context.py +98 -0
- tracepipe/core.py +122 -0
- tracepipe/instrumentation/__init__.py +6 -0
- tracepipe/instrumentation/pandas_inst.py +1024 -0
- tracepipe/safety.py +178 -0
- tracepipe/storage/__init__.py +13 -0
- tracepipe/storage/base.py +174 -0
- tracepipe/storage/lineage_store.py +556 -0
- tracepipe/storage/row_identity.py +217 -0
- tracepipe/utils/__init__.py +6 -0
- tracepipe/utils/value_capture.py +137 -0
- tracepipe/visualization/__init__.py +6 -0
- tracepipe/visualization/html_export.py +1335 -0
- tracepipe-0.2.0.dist-info/METADATA +508 -0
- tracepipe-0.2.0.dist-info/RECORD +19 -0
- tracepipe-0.2.0.dist-info/WHEEL +4 -0
- tracepipe-0.2.0.dist-info/licenses/LICENSE +21 -0
tracepipe/safety.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# tracepipe/safety.py
|
|
2
|
+
"""
|
|
3
|
+
Safe instrumentation wrappers.
|
|
4
|
+
|
|
5
|
+
CRITICAL: Execute original FIRST, capture lineage SECOND, return ALWAYS.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import inspect
|
|
9
|
+
import traceback
|
|
10
|
+
import warnings
|
|
11
|
+
from functools import wraps
|
|
12
|
+
from typing import Callable
|
|
13
|
+
|
|
14
|
+
from .context import get_context
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TracePipeWarning(UserWarning):
    """Warning category used for instrumentation problems that are not fatal."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TracePipeError(Exception):
    """Exception type raised when strict mode is active."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_caller_info(skip_frames: int = 2) -> tuple:
    """
    Get caller's file and line number, skipping library frames.

    Starting at this function's own frame, ``skip_frames + 1`` frames are
    skipped unconditionally. Up to 20 further frames are then examined and
    the first one that is neither inside the tracepipe package directory nor
    matched by a library pattern (pandas, numpy, site-packages, frozen or
    ``<string>`` code) is reported.

    Args:
        skip_frames: Number of caller frames to skip in addition to this
            function's own frame before the scan starts.

    Returns:
        (filename, line_number) of the first matching frame, or
        (None, None) when the stack runs out, the depth limit is reached,
        or frame inspection raises.
    """
    import os

    # Get tracepipe package directory to skip it specifically. The trailing
    # separator keeps sibling directories that merely share the prefix
    # (e.g. "tracepipe_examples") from being treated as package code.
    tracepipe_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    tracepipe_pkg = os.path.join(tracepipe_dir, "tracepipe")
    tracepipe_prefix = tracepipe_pkg + os.sep

    # Paths/patterns to skip (library code)
    SKIP_PATTERNS = (
        "/pandas/",
        "/numpy/",
        "site-packages",
        "<frozen",
        "<string>",
    )

    # Bound before the try so the ``finally`` clause can always delete it.
    frame = None
    try:
        frame = inspect.currentframe()

        # Skip minimum frames first
        for _ in range(skip_frames + 1):
            if frame is None:
                return None, None
            frame = frame.f_back

        # Now walk up until we find user code
        max_depth = 20  # Safety limit
        for _ in range(max_depth):
            if frame is None:
                return None, None

            filename = frame.f_code.co_filename
            abs_filename = os.path.abspath(filename)

            # Check if this is tracepipe package code
            if abs_filename.startswith(tracepipe_prefix):
                frame = frame.f_back
                continue

            # Check if this is other library code. Separators are normalized
            # so the "/pandas/"-style patterns also match Windows paths.
            normalized = filename.replace("\\", "/")
            is_library = any(pattern in normalized for pattern in SKIP_PATTERNS)

            if not is_library:
                return filename, frame.f_lineno

            frame = frame.f_back

        return None, None
    except Exception:
        # Caller lookup is best effort; it must never break the traced call.
        return None, None
    finally:
        # Drop the frame reference to avoid keeping a reference cycle alive.
        del frame
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _make_wrapper(
    method_name: str, original_method: Callable, capture_func: Callable, mode: str = "standard"
) -> Callable:
    """
    Factory for pandas method wrappers with lineage capture.

    CRITICAL: Execute original FIRST, capture lineage SECOND, return ALWAYS.

    Mode behavior:
    - "standard": call the original, then ``capture_func(self, args, kwargs,
      result, ctx, method_name)``.
    - "filter": as "standard", but ``ctx._filter_op_depth`` is incremented
      for the duration of the original call when tracing is enabled.
    - "inplace": when the call passes ``inplace=True`` as a keyword, ``self``
      is copied before the call and capture receives ``(snapshot, args,
      kwargs, self, ctx, method_name)``. An ``inplace`` flag passed
      positionally is not detected.

    Args:
        method_name: Name for error messages
        original_method: The original pandas method
        capture_func: func(self, args, kwargs, result, ctx, method_name)
        mode: "standard", "filter", or "inplace"

    Returns:
        The wrapper function, carrying the metadata of ``original_method``.

    Raises:
        TracePipeError: From the wrapper, when lineage capture fails and
            ``ctx.config.strict_mode`` is set. Otherwise a TracePipeWarning
            is emitted. Exceptions from ``original_method`` propagate as-is.
    """

    @wraps(original_method)
    def wrapper(self, *args, **kwargs):
        ctx = get_context()

        # === PRE-EXECUTION SETUP ===
        before_snapshot = None
        # Stored as text, not as the exception object, so no traceback/frame
        # reference cycle is kept alive in this frame.
        snapshot_failure = None
        # Remember whether the depth counter was bumped so the decrement is
        # paired with it even if ctx.enabled changes during the original call.
        depth_incremented = False

        if mode == "filter" and ctx.enabled:
            ctx._filter_op_depth += 1
            depth_incremented = True
        elif mode == "inplace" and ctx.enabled and kwargs.get("inplace", False):
            try:
                before_snapshot = self.copy()
            except Exception as exc:
                # Never fail before the original runs; report it afterwards
                # through the normal capture-failure path.
                snapshot_failure = f"{type(exc).__name__}: {exc}"

        # === EXECUTE ORIGINAL (SACRED) ===
        try:
            result = original_method(self, *args, **kwargs)
        finally:
            if depth_incremented:
                ctx._filter_op_depth -= 1

        # === CAPTURE LINEAGE (SIDE EFFECT) ===
        if ctx.enabled:
            try:
                if mode == "inplace" and kwargs.get("inplace", False):
                    if before_snapshot is not None:
                        capture_func(before_snapshot, args, kwargs, self, ctx, method_name)
                    elif snapshot_failure is not None:
                        raise RuntimeError(
                            f"could not snapshot object before in-place call "
                            f"({snapshot_failure})"
                        )
                else:
                    # Covers every non-inplace call, including "inplace" mode
                    # invoked without inplace=True.
                    capture_func(self, args, kwargs, result, ctx, method_name)
            except Exception as e:
                if ctx.config.strict_mode:
                    raise TracePipeError(
                        f"Instrumentation failed for {method_name}: {e}\n"
                        f"{traceback.format_exc()}"
                    ) from e
                else:
                    warnings.warn(
                        f"TracePipe: {method_name} instrumentation failed: {e}. "
                        f"Lineage may be incomplete.",
                        TracePipeWarning,
                        stacklevel=2,  # attribute the warning to the wrapper's caller
                    )

        # === RETURN RESULT (ALWAYS) ===
        return result

    return wrapper
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def wrap_pandas_method(
    method_name: str, original_method: Callable, capture_func: Callable
) -> Callable:
    """Return a lineage-capturing wrapper for a pandas method ("standard" mode)."""
    return _make_wrapper(
        method_name=method_name,
        original_method=original_method,
        capture_func=capture_func,
        mode="standard",
    )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def wrap_pandas_filter_method(
    method_name: str, original_method: Callable, capture_func: Callable
) -> Callable:
    """Return a wrapper for a pandas filter method such as dropna or drop_duplicates."""
    return _make_wrapper(
        method_name=method_name,
        original_method=original_method,
        capture_func=capture_func,
        mode="filter",
    )
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def wrap_pandas_method_inplace(
    method_name: str, original_method: Callable, capture_func: Callable
) -> Callable:
    """Return a wrapper for a pandas method that accepts inplace=True."""
    return _make_wrapper(
        method_name=method_name,
        original_method=original_method,
        capture_func=capture_func,
        mode="inplace",
    )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# tracepipe/storage/__init__.py
|
|
2
|
+
"""Storage backends and row identity strategies."""
|
|
3
|
+
|
|
4
|
+
from .base import LineageBackend, RowIdentityStrategy
|
|
5
|
+
from .lineage_store import InMemoryLineageStore
|
|
6
|
+
from .row_identity import PandasRowIdentity
|
|
7
|
+
|
|
8
|
+
# Public names re-exported by the storage subpackage.
__all__ = ["LineageBackend", "RowIdentityStrategy", "InMemoryLineageStore", "PandasRowIdentity"]
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# tracepipe/storage/base.py
|
|
2
|
+
"""
|
|
3
|
+
Protocol definitions for TracePipe storage backends.
|
|
4
|
+
|
|
5
|
+
These protocols enable:
|
|
6
|
+
- Swappable storage backends (InMemory, SQLite, Delta Lake)
|
|
7
|
+
- Engine-specific row identity strategies (Pandas, Polars, Spark)
|
|
8
|
+
- Easy testing with mock implementations
|
|
9
|
+
|
|
10
|
+
To add a new backend, implement LineageBackend.
|
|
11
|
+
To support a new DataFrame engine, implement RowIdentityStrategy.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from typing import Any, Optional, Protocol, runtime_checkable
|
|
15
|
+
|
|
16
|
+
from ..core import ChangeType, CompletenessLevel, LineageGaps, TracePipeConfig
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@runtime_checkable
class LineageBackend(Protocol):
    """
    Structural interface that lineage storage backends implement.

    Implementations named by the package:
    - InMemoryLineageStore (default, v0.2.0)
    - SQLiteBackend (future)
    - DeltaLakeBackend (future)
    """

    # Configuration object the backend works with.
    config: TracePipeConfig

    def append_diff(
        self,
        step_id: int,
        row_id: int,
        col: str,
        old_val: Any,
        new_val: Any,
        change_type: ChangeType,
    ) -> None:
        """Record one cell-level diff (old value, new value, change type) for a row and column."""
        ...

    def append_diff_batch(
        self,
        step_id: int,
        diffs: list[tuple],
        check_threshold: bool = True,
    ) -> int:
        """
        Record several diffs for one step in a single call.

        Returns:
            The number of diffs that were appended.
        """
        # NOTE(review): check_threshold presumably toggles the mass-update
        # threshold check; confirm against the concrete backend.
        ...

    def append_step(
        self,
        operation: str,
        stage: Optional[str],
        code_file: Optional[str],
        code_line: Optional[int],
        params: dict[str, Any],
        input_shape: Optional[tuple],
        output_shape: Optional[tuple],
        completeness: CompletenessLevel = CompletenessLevel.FULL,
        is_mass_update: bool = False,
        rows_affected: int = 0,
    ) -> int:
        """
        Record the metadata describing one step.

        Returns:
            The step_id assigned to the recorded step.
        """
        ...

    def append_aggregation(
        self,
        step_id: int,
        group_column: str,
        membership: dict[str, list[int]],
        agg_functions: dict[str, str],
    ) -> None:
        """Store the group membership produced by an aggregation step."""
        ...

    def get_row_history(self, row_id: int) -> list[dict]:
        """Return every recorded event for the given row."""
        ...

    def get_dropped_rows(self, step_id: Optional[int] = None) -> list[int]:
        """Return the IDs of dropped rows; pass step_id to restrict to one step."""
        ...

    def get_dropped_by_step(self) -> dict[str, int]:
        """Return the number of dropped rows keyed by operation."""
        ...

    def get_group_members(self, group_key: str) -> Optional[dict]:
        """Return the rows that contributed to the given group."""
        ...

    def compute_gaps(self, row_id: int) -> LineageGaps:
        """Compute the lineage gaps for the given row."""
        ...

    def should_track_cell_diffs(self, affected_count: int) -> bool:
        """Return False when the update is a mass update that exceeds the threshold."""
        ...

    def to_json(self) -> str:
        """Serialize all stored data to a JSON string."""
        ...

    @property
    def steps(self) -> list:
        """The list of step metadata."""
        ...

    @property
    def total_diff_count(self) -> int:
        """Number of diffs in total, spilled ones included."""
        ...

    @property
    def diff_count(self) -> int:
        """Number of diffs currently held in memory."""
        ...
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@runtime_checkable
class RowIdentityStrategy(Protocol):
    """
    Structural interface for tracking the identity of rows.

    Implementations named by the package:
    - PandasRowIdentity (default, v0.2.0)
    - PolarsRowIdentity (future)
    - SparkRowIdentity (future)
    """

    # Configuration object the strategy works with.
    config: TracePipeConfig

    def register(
        self,
        df: Any,
        row_ids: Optional[Any] = None,
        warn_duplicate_index: bool = True,
    ) -> Any:
        """Register a DataFrame, assigning row IDs to it, and return those IDs."""
        # NOTE(review): presumably row_ids supplies pre-existing IDs and
        # warn_duplicate_index controls a duplicate-index warning; confirm
        # against the concrete strategy.
        ...

    def get_ids(self, df: Any) -> Optional[Any]:
        """Return the row IDs of a DataFrame, or None when it is not tracked."""
        ...

    def propagate(self, source_df: Any, result_df: Any) -> Optional[Any]:
        """Carry row IDs over from the source DataFrame to the result DataFrame."""
        ...

    def get_dropped_ids(self, source_df: Any, result_df: Any) -> set:
        """Return the row IDs present in the source but dropped from the result."""
        ...

    def strip_hidden_column(self, df: Any) -> Any:
        """Return the DataFrame with the hidden column removed, for export."""
        ...

    def cleanup(self) -> None:
        """Discard stale entries."""
        ...
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# === FACTORY FUNCTIONS ===
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def create_default_backend(config: TracePipeConfig) -> "LineageBackend":
    """Build and return an InMemoryLineageStore for the given config."""
    # Imported at call time rather than at module level
    # (presumably to avoid a circular import; confirm).
    from .lineage_store import InMemoryLineageStore as _DefaultBackend

    backend = _DefaultBackend(config)
    return backend
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def create_default_identity(config: TracePipeConfig) -> "RowIdentityStrategy":
    """Build and return a PandasRowIdentity strategy for the given config."""
    # Imported at call time rather than at module level
    # (presumably to avoid a circular import; confirm).
    from .row_identity import PandasRowIdentity as _DefaultIdentity

    identity = _DefaultIdentity(config)
    return identity
|