tracepipe 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tracepipe/safety.py ADDED
@@ -0,0 +1,178 @@
1
+ # tracepipe/safety.py
2
+ """
3
+ Safe instrumentation wrappers.
4
+
5
+ CRITICAL: Execute original FIRST, capture lineage SECOND, return ALWAYS.
6
+ """
7
+
8
+ import inspect
9
+ import traceback
10
+ import warnings
11
+ from functools import wraps
12
+ from typing import Callable
13
+
14
+ from .context import get_context
15
+
16
+
17
class TracePipeWarning(UserWarning):
    """Warning category for instrumentation problems that are not fatal."""
21
+
22
+
23
class TracePipeError(Exception):
    """Exception type raised when running in strict mode."""
27
+
28
+
29
def get_caller_info(skip_frames: int = 2) -> tuple:
    """
    Get the file and line number of the first user-code frame on the stack.

    Starting ``skip_frames + 1`` frames above this function's own frame, the
    call stack is walked upwards (at most 20 frames) and the first frame is
    returned that is neither inside the tracepipe package directory nor
    matched by one of the library path patterns (pandas, numpy,
    site-packages, frozen modules, ``<string>`` pseudo-files).

    Args:
        skip_frames: Minimum number of caller frames to skip before the
            search starts. This function's own frame is always skipped in
            addition, so ``skip_frames=0`` starts at the direct caller.

    Returns:
        ``(filename, line_number)`` of the first user-code frame, or
        ``(None, None)`` when the stack is exhausted, no user frame is found
        within the depth limit, or frame inspection raises.
    """
    import os

    # Directory of the package itself: <parent>/tracepipe/. The trailing
    # separator makes the prefix test below match only files *inside* that
    # directory, not siblings such as <parent>/tracepipe_examples/.
    package_parent = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    tracepipe_prefix = os.path.join(package_parent, "tracepipe", "")

    # Substrings that mark a frame as library code rather than user code.
    skip_patterns = (
        "/pandas/",
        "/numpy/",
        "site-packages",
        "<frozen",
        "<string>",
    )
    max_depth = 20  # safety limit on how far up the stack we search

    frame = None  # bound up front so the cleanup in ``finally`` cannot fail
    try:
        frame = inspect.currentframe()

        # Skip the mandatory frames first.
        for _ in range(skip_frames + 1):
            if frame is None:
                return None, None
            frame = frame.f_back

        # Then climb until a frame outside tracepipe/library code is found.
        for _ in range(max_depth):
            if frame is None:
                return None, None

            filename = frame.f_code.co_filename

            if os.path.abspath(filename).startswith(tracepipe_prefix):
                frame = frame.f_back
                continue

            # Normalise separators so the "/pandas/"-style patterns also
            # match on platforms whose path separator is not "/".
            normalized = filename.replace(os.sep, "/")
            if not any(pattern in normalized for pattern in skip_patterns):
                return filename, frame.f_lineno

            frame = frame.f_back

        return None, None
    except Exception:
        # Caller location is best-effort metadata; never let it raise.
        return None, None
    finally:
        # Drop the frame reference to avoid keeping a reference cycle alive.
        del frame
92
+
93
+
94
def _make_wrapper(
    method_name: str, original_method: Callable, capture_func: Callable, mode: str = "standard"
) -> Callable:
    """
    Factory for pandas method wrappers with lineage capture.

    CRITICAL: Execute original FIRST, capture lineage SECOND, return ALWAYS.

    The returned wrapper calls ``original_method`` and lets any exception it
    raises propagate untouched. Afterwards, if the context is enabled, it
    calls ``capture_func``; a failure there is turned into a
    ``TracePipeWarning`` (or a ``TracePipeError`` when
    ``ctx.config.strict_mode`` is set) and the original result is still
    returned in the non-strict case.

    Args:
        method_name: Name for error messages
        original_method: The original pandas method
        capture_func: func(self, args, kwargs, result, ctx, method_name)
        mode: "standard", "filter", or "inplace".
            "filter" increments ``ctx._filter_op_depth`` for the duration of
            the original call. "inplace" copies ``self`` before the call when
            ``inplace=True`` is passed as a keyword, and hands that copy to
            ``capture_func`` as the "before" object with ``self`` as result.

    Returns:
        The wrapper function, decorated with ``functools.wraps``.

    Raises:
        TracePipeError: From the wrapper, when capture fails in strict mode.
    """

    @wraps(original_method)
    def wrapper(self, *args, **kwargs):
        ctx = get_context()

        # === PRE-EXECUTION SETUP ===
        before_snapshot = None
        # Remember whether *we* bumped the depth counter, so the decrement
        # below stays balanced even if ctx.enabled changes during the call.
        depth_incremented = False

        if mode == "filter" and ctx.enabled:
            ctx._filter_op_depth += 1
            depth_incremented = True
        elif mode == "inplace" and ctx.enabled and kwargs.get("inplace", False):
            try:
                before_snapshot = self.copy()
            except Exception:
                # Best effort: without a snapshot the capture step is skipped.
                pass

        # === EXECUTE ORIGINAL (SACRED) ===
        try:
            result = original_method(self, *args, **kwargs)
        finally:
            if depth_incremented:
                ctx._filter_op_depth -= 1

        # === CAPTURE LINEAGE (SIDE EFFECT) ===
        if ctx.enabled:
            try:
                if mode == "inplace" and kwargs.get("inplace", False):
                    # self was mutated in place: compare the pre-call copy
                    # against self. No snapshot means nothing to compare.
                    if before_snapshot is not None:
                        capture_func(before_snapshot, args, kwargs, self, ctx, method_name)
                else:
                    # NOTE(review): the original had separate branches for
                    # "inplace mode with a non-None result" and "everything
                    # else" that made this identical call; merged here.
                    capture_func(self, args, kwargs, result, ctx, method_name)
            except Exception as e:
                if ctx.config.strict_mode:
                    raise TracePipeError(
                        f"Instrumentation failed for {method_name}: {e}\n"
                        f"{traceback.format_exc()}"
                    ) from e
                else:
                    warnings.warn(
                        f"TracePipe: {method_name} instrumentation failed: {e}. "
                        f"Lineage may be incomplete.",
                        TracePipeWarning,
                        # Attribute the warning to the code that called the
                        # wrapped method instead of to this module.
                        stacklevel=2,
                    )

        # === RETURN RESULT (ALWAYS) ===
        return result

    return wrapper
158
+
159
+
160
def wrap_pandas_method(
    method_name: str, original_method: Callable, capture_func: Callable
) -> Callable:
    """Return *original_method* wrapped for lineage capture in "standard" mode."""
    wrapped = _make_wrapper(method_name, original_method, capture_func, "standard")
    return wrapped
165
+
166
+
167
def wrap_pandas_filter_method(
    method_name: str, original_method: Callable, capture_func: Callable
) -> Callable:
    """Return a pandas filter method (dropna, drop_duplicates, etc.) wrapped in "filter" mode."""
    wrapped = _make_wrapper(method_name, original_method, capture_func, "filter")
    return wrapped
172
+
173
+
174
def wrap_pandas_method_inplace(
    method_name: str, original_method: Callable, capture_func: Callable
) -> Callable:
    """Return a pandas method that supports inplace=True wrapped in "inplace" mode."""
    wrapped = _make_wrapper(method_name, original_method, capture_func, "inplace")
    return wrapped
@@ -0,0 +1,13 @@
1
+ # tracepipe/storage/__init__.py
2
+ """Storage backends and row identity strategies."""
3
+
4
+ from .base import LineageBackend, RowIdentityStrategy
5
+ from .lineage_store import InMemoryLineageStore
6
+ from .row_identity import PandasRowIdentity
7
+
8
+ __all__ = [
9
+ "LineageBackend",
10
+ "RowIdentityStrategy",
11
+ "InMemoryLineageStore",
12
+ "PandasRowIdentity",
13
+ ]
@@ -0,0 +1,174 @@
1
+ # tracepipe/storage/base.py
2
+ """
3
+ Protocol definitions for TracePipe storage backends.
4
+
5
+ These protocols enable:
6
+ - Swappable storage backends (InMemory, SQLite, Delta Lake)
7
+ - Engine-specific row identity strategies (Pandas, Polars, Spark)
8
+ - Easy testing with mock implementations
9
+
10
+ To add a new backend, implement LineageBackend.
11
+ To support a new DataFrame engine, implement RowIdentityStrategy.
12
+ """
13
+
14
+ from typing import Any, Optional, Protocol, runtime_checkable
15
+
16
+ from ..core import ChangeType, CompletenessLevel, LineageGaps, TracePipeConfig
17
+
18
+
19
@runtime_checkable
class LineageBackend(Protocol):
    """
    Structural interface that a lineage storage backend has to satisfy.

    Implementations:
    - InMemoryLineageStore (default, v0.2.0)
    - SQLiteBackend (future)
    - DeltaLakeBackend (future)
    """

    # Configuration object the backend exposes.
    config: TracePipeConfig

    # --- write side -------------------------------------------------------

    def append_diff(
        self,
        step_id: int,
        row_id: int,
        col: str,
        old_val: Any,
        new_val: Any,
        change_type: ChangeType,
    ) -> None:
        """Record one cell-level diff."""
        ...

    def append_diff_batch(
        self,
        step_id: int,
        diffs: list[tuple],
        check_threshold: bool = True,
    ) -> int:
        """Record many diffs at once; the return value is how many were appended."""
        ...

    def append_step(
        self,
        operation: str,
        stage: Optional[str],
        code_file: Optional[str],
        code_line: Optional[int],
        params: dict[str, Any],
        input_shape: Optional[tuple],
        output_shape: Optional[tuple],
        completeness: CompletenessLevel = CompletenessLevel.FULL,
        is_mass_update: bool = False,
        rows_affected: int = 0,
    ) -> int:
        """Record the metadata of a step; the return value is its step_id."""
        ...

    def append_aggregation(
        self,
        step_id: int,
        group_column: str,
        membership: dict[str, list[int]],
        agg_functions: dict[str, str],
    ) -> None:
        """Record which rows belong to which aggregation group."""
        ...

    # --- read side --------------------------------------------------------

    def get_row_history(self, row_id: int) -> list[dict]:
        """Return every event recorded for the given row."""
        ...

    def get_dropped_rows(self, step_id: Optional[int] = None) -> list[int]:
        """Return dropped row IDs; restricted to one step when ``step_id`` is given."""
        ...

    def get_dropped_by_step(self) -> dict[str, int]:
        """Return the number of dropped rows per operation."""
        ...

    def get_group_members(self, group_key: str) -> Optional[dict]:
        """Return the rows that contributed to the given group."""
        ...

    def compute_gaps(self, row_id: int) -> LineageGaps:
        """Work out the lineage gaps for the given row."""
        ...

    def should_track_cell_diffs(self, affected_count: int) -> bool:
        """Return False when the update is a mass update exceeding the threshold."""
        ...

    def to_json(self) -> str:
        """Serialise all stored data to a JSON string."""
        ...

    # --- read-only views --------------------------------------------------

    @property
    def steps(self) -> list:
        """The list of step metadata."""
        ...

    @property
    def total_diff_count(self) -> int:
        """Number of diffs overall, spilled ones included."""
        ...

    @property
    def diff_count(self) -> int:
        """Number of diffs held in memory."""
        ...
118
+
119
+
120
@runtime_checkable
class RowIdentityStrategy(Protocol):
    """
    Structural interface for tracking the identity of rows.

    Implementations:
    - PandasRowIdentity (default, v0.2.0)
    - PolarsRowIdentity (future)
    - SparkRowIdentity (future)
    """

    # Configuration object the strategy exposes.
    config: TracePipeConfig

    def register(
        self,
        df: Any,
        row_ids: Optional[Any] = None,
        warn_duplicate_index: bool = True,
    ) -> Any:
        """Register a DataFrame, assigning row IDs or returning them."""
        ...

    def get_ids(self, df: Any) -> Optional[Any]:
        """Return the row IDs of a DataFrame; None when it is not tracked."""
        ...

    def propagate(self, source_df: Any, result_df: Any) -> Optional[Any]:
        """Carry row IDs over from the source DataFrame to the result DataFrame."""
        ...

    def get_dropped_ids(self, source_df: Any, result_df: Any) -> set:
        """Return the row IDs dropped between source and result."""
        ...

    def strip_hidden_column(self, df: Any) -> Any:
        """Remove the hidden column so the DataFrame can be exported."""
        ...

    def cleanup(self) -> None:
        """Discard stale entries."""
        ...
158
+
159
+
160
+ # === FACTORY FUNCTIONS ===
161
+
162
+
163
def create_default_backend(config: TracePipeConfig) -> "LineageBackend":
    """Build the default backend: an ``InMemoryLineageStore`` constructed from *config*."""
    # NOTE(review): imported at call time rather than at module level,
    # presumably to avoid a circular import with lineage_store; confirm.
    from .lineage_store import InMemoryLineageStore as default_backend_cls

    return default_backend_cls(config)
168
+
169
+
170
def create_default_identity(config: TracePipeConfig) -> "RowIdentityStrategy":
    """Build the default row identity strategy: a ``PandasRowIdentity`` constructed from *config*."""
    # NOTE(review): imported at call time rather than at module level,
    # presumably to avoid a circular import with row_identity; confirm.
    from .row_identity import PandasRowIdentity as default_identity_cls

    return default_identity_cls(config)