tracepipe 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +110 -0
- tracepipe/api.py +563 -0
- tracepipe/context.py +98 -0
- tracepipe/core.py +122 -0
- tracepipe/instrumentation/__init__.py +6 -0
- tracepipe/instrumentation/pandas_inst.py +1024 -0
- tracepipe/safety.py +178 -0
- tracepipe/storage/__init__.py +13 -0
- tracepipe/storage/base.py +174 -0
- tracepipe/storage/lineage_store.py +556 -0
- tracepipe/storage/row_identity.py +217 -0
- tracepipe/utils/__init__.py +6 -0
- tracepipe/utils/value_capture.py +137 -0
- tracepipe/visualization/__init__.py +6 -0
- tracepipe/visualization/html_export.py +1335 -0
- tracepipe-0.2.0.dist-info/METADATA +508 -0
- tracepipe-0.2.0.dist-info/RECORD +19 -0
- tracepipe-0.2.0.dist-info/WHEEL +4 -0
- tracepipe-0.2.0.dist-info/licenses/LICENSE +21 -0
tracepipe/context.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# tracepipe/context.py
|
|
2
|
+
"""
|
|
3
|
+
Thread-safe context for TracePipe state.
|
|
4
|
+
|
|
5
|
+
Each thread gets its own context via threading.local().
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import threading
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from .core import TracePipeConfig
|
|
12
|
+
from .storage.base import (
|
|
13
|
+
LineageBackend,
|
|
14
|
+
RowIdentityStrategy,
|
|
15
|
+
create_default_backend,
|
|
16
|
+
create_default_identity,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# Module-level threading.local() holder: attributes set on it are visible
# only to the thread that set them, so each thread keeps its own `context`.
_thread_local = threading.local()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TracePipeContext:
    """
    Per-thread context for TracePipe state.

    Thread Safety:
    - Each thread gets its own context via threading.local()
    - Shared state (if needed) must use locks
    - This design supports concurrent notebook cells but NOT
      parallel pandas operations on shared DataFrames

    Extensibility:
    - Pass custom `backend` for alternative storage (SQLite, Delta Lake)
    - Pass custom `identity` for alternative engines (Polars, Spark)

    Args:
        config: Configuration to use; a default ``TracePipeConfig()`` is
            created when omitted.
        backend: Lineage storage backend; built with
            ``create_default_backend(config)`` when omitted.
        identity: Row identity strategy; built with
            ``create_default_identity(config)`` when omitted.
    """

    def __init__(
        self,
        config: Optional[TracePipeConfig] = None,
        backend: Optional[LineageBackend] = None,
        identity: Optional[RowIdentityStrategy] = None,
    ):
        # Compare against None explicitly rather than relying on truthiness:
        # a caller-supplied object that happens to be falsy (for example an
        # empty backend that defines __len__) must not be silently replaced
        # by a default instance.
        self.config = config if config is not None else TracePipeConfig()
        self.enabled: bool = False

        # Use provided backends or create defaults
        self.store: LineageBackend = (
            backend if backend is not None else create_default_backend(self.config)
        )
        self.row_manager: RowIdentityStrategy = (
            identity if identity is not None else create_default_identity(self.config)
        )

        self.watched_columns: set[str] = set()
        self.current_stage: Optional[str] = None

        # Nested filter operation tracking (prevents double-counting drops)
        # When > 0, __getitem__[mask] skips capture (parent op will capture)
        self._filter_op_depth: int = 0

        # GroupBy state stack (supports nesting)
        self._groupby_stack: list[dict] = []

    def push_groupby(self, state: dict) -> None:
        """Push groupby state for nested operations."""
        self._groupby_stack.append(state)

    def pop_groupby(self) -> Optional[dict]:
        """Pop and return the most recent groupby state, or None if the stack is empty."""
        return self._groupby_stack.pop() if self._groupby_stack else None

    def peek_groupby(self) -> Optional[dict]:
        """Return the current groupby state without removing it, or None if the stack is empty."""
        return self._groupby_stack[-1] if self._groupby_stack else None

    def clear_groupby_for_source(self, source_id: int) -> None:
        """
        Clear any groupby state for a given source DataFrame.

        Called when a new groupby() is performed on the same DataFrame,
        which invalidates any previous groupby state for that source.

        Entries whose ``"source_id"`` key equals ``source_id`` are dropped;
        entries without that key are kept.
        """
        self._groupby_stack = [s for s in self._groupby_stack if s.get("source_id") != source_id]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def get_context() -> TracePipeContext:
    """Return the calling thread's TracePipe context, creating it on first use."""
    try:
        return _thread_local.context
    except AttributeError:
        # No context has been stored for this thread yet; fall through and
        # create one outside the handler.
        pass
    _thread_local.context = TracePipeContext()
    return _thread_local.context
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def set_context(ctx: TracePipeContext) -> None:
    """
    Install ``ctx`` as the TracePipe context of the calling thread.

    Any context previously stored for this thread is replaced. Used in testing.
    """
    _thread_local.context = ctx
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def reset_context() -> None:
    """Discard the calling thread's context, if one has been stored."""
    try:
        del _thread_local.context
    except AttributeError:
        # Nothing stored for this thread; resetting is a no-op.
        pass
|
tracepipe/core.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# tracepipe/core.py
|
|
2
|
+
"""
|
|
3
|
+
Core types, enums, and configuration for TracePipe.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from enum import IntEnum
|
|
9
|
+
from typing import Any, Optional
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ChangeType(IntEnum):
    """
    Kinds of change that TracePipe tracks.

    Members are ``IntEnum`` values, so they compare equal to their plain
    integer codes.
    """

    MODIFIED = 0  # integer code 0
    DROPPED = 1  # integer code 1
    ADDED = 2  # integer code 2
    REORDERED = 3  # integer code 3
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class CompletenessLevel(IntEnum):
    """
    How completely the internals of an operation are tracked.

    Levels:
        FULL: the operation is completely tracked (e.g., fillna, dropna).
        PARTIAL: the output is tracked but the internals are unknown
            (e.g., apply, pipe).
        UNKNOWN: lineage is reset (e.g., merge, concat).
    """

    FULL = 0  # completely tracked
    PARTIAL = 1  # output tracked, internals unknown
    UNKNOWN = 2  # lineage reset
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class TracePipeConfig:
    """
    Configuration with sensible defaults.

    Instantiate directly for the built-in defaults, or call
    ``TracePipeConfig.from_env()`` to override a subset of the fields from
    ``TRACEPIPE_*`` environment variables.
    """

    max_diffs_in_memory: int = 500_000
    max_diffs_per_step: int = 100_000
    max_group_membership_size: int = 100_000  # Store count-only above this threshold
    strict_mode: bool = False
    auto_watch: bool = False
    auto_watch_null_threshold: float = 0.01
    spillover_dir: str = ".tracepipe"
    use_hidden_column: bool = False
    warn_on_duplicate_index: bool = True
    cleanup_spillover_on_disable: bool = True

    @classmethod
    def from_env(cls) -> "TracePipeConfig":
        """
        Create config from environment variables.

        Recognised variables:
            TRACEPIPE_MAX_DIFFS: integer, sets ``max_diffs_in_memory``.
            TRACEPIPE_MAX_DIFFS_PER_STEP: integer, sets ``max_diffs_per_step``.
            TRACEPIPE_STRICT: ``"1"`` enables ``strict_mode``.
            TRACEPIPE_AUTO_WATCH: ``"1"`` enables ``auto_watch``.
            TRACEPIPE_HIDDEN_COL: ``"1"`` enables ``use_hidden_column``.

        Integer variables that are unset, empty or whitespace-only fall back
        to the field default. Flags are enabled only by the exact value
        ``"1"``. Fields without a variable keep their defaults.

        Raises:
            ValueError: If an integer variable holds a non-integer value; the
                message names the offending variable.
        """

        def _int_env(name: str, default: int) -> int:
            # Treat an exported-but-empty variable (``VAR=``) as unset
            # instead of failing inside int('').
            raw = os.environ.get(name, "").strip()
            if not raw:
                return default
            try:
                return int(raw)
            except ValueError:
                raise ValueError(
                    f"Environment variable {name} must be an integer, got {raw!r}"
                ) from None

        def _flag_env(name: str) -> bool:
            # Only the exact string "1" switches a flag on.
            return os.environ.get(name, "0") == "1"

        return cls(
            # Defaults are read from the class fields so they cannot drift
            # from the declarations above.
            max_diffs_in_memory=_int_env("TRACEPIPE_MAX_DIFFS", cls.max_diffs_in_memory),
            max_diffs_per_step=_int_env("TRACEPIPE_MAX_DIFFS_PER_STEP", cls.max_diffs_per_step),
            strict_mode=_flag_env("TRACEPIPE_STRICT"),
            auto_watch=_flag_env("TRACEPIPE_AUTO_WATCH"),
            use_hidden_column=_flag_env("TRACEPIPE_HIDDEN_COL"),
        )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class StepMetadata:
    """
    Metadata for a single pipeline step.

    The first nine fields are required (several may be None); the last
    three have defaults.
    """

    # Required fields
    step_id: int
    operation: str
    stage: Optional[str]
    timestamp: float
    code_file: Optional[str]
    code_line: Optional[int]
    params: dict[str, Any]
    input_shape: Optional[tuple]
    output_shape: Optional[tuple]

    # Defaulted fields
    is_mass_update: bool = False
    rows_affected: int = 0
    completeness: CompletenessLevel = CompletenessLevel.FULL
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
class AggregationMapping:
    """Record of which rows contributed to each group of an aggregation."""

    step_id: int
    group_column: str
    # Maps group key -> list of row ids: {group_key: [row_ids]}
    membership: dict[str, list[int]]
    agg_functions: dict[str, str]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass
class LineageGap:
    """A single gap in lineage tracking: the step, its operation and the reason."""

    step_id: int
    operation: str
    reason: str
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@dataclass
class LineageGaps:
    """Collection of lineage gaps for a row, with convenience summaries."""

    gaps: list[LineageGap] = field(default_factory=list)

    @property
    def has_gaps(self) -> bool:
        """True when at least one gap is recorded."""
        return len(self.gaps) != 0

    @property
    def is_fully_tracked(self) -> bool:
        """True when no gaps are recorded, i.e. lineage is complete."""
        return not self.has_gaps

    def summary(self) -> str:
        """Return a human-readable, one-line description of the gaps."""
        gap_count = len(self.gaps)
        if gap_count == 0:
            return "Fully tracked"
        if gap_count == 1:
            # With a single gap, name the operation that caused it.
            only_gap = self.gaps[0]
            return f"1 step has limited visibility: {only_gap.operation}"
        return f"{gap_count} steps have limited visibility"
|