tracepipe 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tracepipe/context.py ADDED
@@ -0,0 +1,98 @@
1
+ # tracepipe/context.py
2
+ """
3
+ Thread-safe context for TracePipe state.
4
+
5
+ Each thread gets its own context via threading.local().
6
+ """
7
+
8
+ import threading
9
+ from typing import Optional
10
+
11
+ from .core import TracePipeConfig
12
+ from .storage.base import (
13
+ LineageBackend,
14
+ RowIdentityStrategy,
15
+ create_default_backend,
16
+ create_default_identity,
17
+ )
18
+
19
+ # Thread-local storage: each thread keeps its own TracePipeContext in the `context` attribute
20
+ _thread_local = threading.local()
21
+
22
+
23
class TracePipeContext:
    """
    Per-thread context for TracePipe state.

    Thread Safety:
        - Each thread gets its own context via threading.local()
        - Shared state (if needed) must use locks
        - This design supports concurrent notebook cells but NOT
          parallel pandas operations on shared DataFrames

    Extensibility:
        - Pass custom `backend` for alternative storage (SQLite, Delta Lake)
        - Pass custom `identity` for alternative engines (Polars, Spark)
    """

    def __init__(
        self,
        config: Optional[TracePipeConfig] = None,
        backend: Optional[LineageBackend] = None,
        identity: Optional[RowIdentityStrategy] = None,
    ):
        """
        Build a context, creating defaults for anything not supplied.

        Args:
            config: Configuration to use. When None, a default
                TracePipeConfig() is created.
            backend: Lineage storage backend. When None, one is created via
                create_default_backend(config).
            identity: Row identity strategy. When None, one is created via
                create_default_identity(config).
        """
        # Compare against None explicitly instead of relying on truthiness:
        # a caller-supplied object that happens to be falsy (for example a
        # backend that defines __len__ and is currently empty) must not be
        # silently swapped for a default.
        self.config = config if config is not None else TracePipeConfig()
        self.enabled: bool = False

        # Use provided backends or create defaults
        self.store: LineageBackend = (
            backend if backend is not None else create_default_backend(self.config)
        )
        self.row_manager: RowIdentityStrategy = (
            identity if identity is not None else create_default_identity(self.config)
        )

        self.watched_columns: set[str] = set()
        self.current_stage: Optional[str] = None

        # Nested filter operation tracking (prevents double-counting drops)
        # When > 0, __getitem__[mask] skips capture (parent op will capture)
        self._filter_op_depth: int = 0

        # GroupBy state stack (supports nesting); the most recent state is last
        self._groupby_stack: list[dict] = []

    def push_groupby(self, state: dict) -> None:
        """Push groupby state for nested operations."""
        self._groupby_stack.append(state)

    def pop_groupby(self) -> Optional[dict]:
        """Pop and return the most recent groupby state, or None if the stack is empty."""
        return self._groupby_stack.pop() if self._groupby_stack else None

    def peek_groupby(self) -> Optional[dict]:
        """Return the most recent groupby state without removing it, or None if empty."""
        return self._groupby_stack[-1] if self._groupby_stack else None

    def clear_groupby_for_source(self, source_id: int) -> None:
        """
        Clear any groupby state for a given source DataFrame.

        Called when a new groupby() is performed on the same DataFrame,
        which invalidates any previous groupby state for that source.

        Every stacked state whose "source_id" entry equals `source_id` is
        removed; all other states are kept in their original order.
        """
        self._groupby_stack = [s for s in self._groupby_stack if s.get("source_id") != source_id]
81
+
82
+
83
def get_context() -> TracePipeContext:
    """Return the calling thread's TracePipe context, creating it on first use."""
    try:
        return _thread_local.context
    except AttributeError:
        # No context has been stored for this thread yet: create and keep one.
        fresh = TracePipeContext()
        _thread_local.context = fresh
        return fresh
88
+
89
+
90
def set_context(ctx: TracePipeContext) -> None:
    """Install `ctx` as the context for the current thread (used in testing)."""
    setattr(_thread_local, "context", ctx)
93
+
94
+
95
def reset_context() -> None:
    """Discard the current thread's context, if one has been stored."""
    try:
        del _thread_local.context
    except AttributeError:
        # Nothing stored for this thread, so there is nothing to reset.
        pass
tracepipe/core.py ADDED
@@ -0,0 +1,122 @@
1
+ # tracepipe/core.py
2
+ """
3
+ Core types, enums, and configuration for TracePipe.
4
+ """
5
+
6
+ import os
7
+ from dataclasses import dataclass, field
8
+ from enum import IntEnum
9
+ from typing import Any, Optional
10
+
11
+
12
class ChangeType(IntEnum):
    """Integer-valued codes for the kinds of change TracePipe tracks."""

    # Values are spelled out explicitly so each code stays fixed
    # regardless of declaration order.
    MODIFIED = 0
    DROPPED = 1
    ADDED = 2
    REORDERED = 3
19
+
20
+
21
class CompletenessLevel(IntEnum):
    """
    How completely the internals of an operation are tracked.

    Members:
        FULL: Completely tracked (e.g., fillna, dropna)
        PARTIAL: Output tracked, internals unknown (e.g., apply, pipe)
        UNKNOWN: Lineage reset (e.g., merge, concat)
    """

    FULL = 0
    PARTIAL = 1
    UNKNOWN = 2
33
+
34
+
35
@dataclass
class TracePipeConfig:
    """
    Configuration with sensible defaults.

    Instances can be built directly, or from environment variables with
    from_env().
    """

    max_diffs_in_memory: int = 500_000
    max_diffs_per_step: int = 100_000
    max_group_membership_size: int = 100_000  # Store count-only above this threshold
    strict_mode: bool = False
    auto_watch: bool = False
    auto_watch_null_threshold: float = 0.01
    spillover_dir: str = ".tracepipe"
    use_hidden_column: bool = False
    warn_on_duplicate_index: bool = True
    cleanup_spillover_on_disable: bool = True

    @staticmethod
    def _env_int(name: str, default: int) -> int:
        """Read environment variable `name` as an int, or return `default` if unset."""
        raw = os.environ.get(name)
        if raw is None:
            return default
        try:
            return int(raw)
        except ValueError as err:
            # Name the offending variable so a misconfiguration is easy to find.
            raise ValueError(f"{name} must be an integer, got {raw!r}") from err

    @staticmethod
    def _env_flag(name: str, default: bool) -> bool:
        """Read environment variable `name` as a boolean, or return `default` if unset."""
        raw = os.environ.get(name)
        if raw is None:
            return default
        # "1" keeps working as before; common spellings of "on" are accepted
        # too, ignoring case and surrounding whitespace. Anything else is off.
        return raw.strip().lower() in {"1", "true", "yes", "on"}

    @classmethod
    def from_env(cls) -> "TracePipeConfig":
        """
        Create config from environment variables.

        Variables read (each falls back to the field's default when unset):
            TRACEPIPE_MAX_DIFFS          -> max_diffs_in_memory (int)
            TRACEPIPE_MAX_DIFFS_PER_STEP -> max_diffs_per_step (int)
            TRACEPIPE_STRICT             -> strict_mode (flag)
            TRACEPIPE_AUTO_WATCH         -> auto_watch (flag)
            TRACEPIPE_HIDDEN_COL         -> use_hidden_column (flag)

        Raises:
            ValueError: If an integer variable is set to a non-integer value.
        """
        # Fallbacks come from the class-level field defaults so they cannot
        # drift apart from the dataclass definition above.
        return cls(
            max_diffs_in_memory=cls._env_int("TRACEPIPE_MAX_DIFFS", cls.max_diffs_in_memory),
            max_diffs_per_step=cls._env_int(
                "TRACEPIPE_MAX_DIFFS_PER_STEP", cls.max_diffs_per_step
            ),
            strict_mode=cls._env_flag("TRACEPIPE_STRICT", cls.strict_mode),
            auto_watch=cls._env_flag("TRACEPIPE_AUTO_WATCH", cls.auto_watch),
            use_hidden_column=cls._env_flag("TRACEPIPE_HIDDEN_COL", cls.use_hidden_column),
        )
60
+
61
+
62
@dataclass
class StepMetadata:
    """Record describing one pipeline step."""

    # Required fields (no defaults); positional order is part of the interface.
    step_id: int
    operation: str
    stage: Optional[str]
    timestamp: float
    code_file: Optional[str]
    code_line: Optional[int]
    params: dict[str, Any]
    input_shape: Optional[tuple]
    output_shape: Optional[tuple]

    # Optional fields with defaults.
    is_mass_update: bool = False
    rows_affected: int = 0
    completeness: CompletenessLevel = CompletenessLevel.FULL
78
+
79
+
80
@dataclass
class AggregationMapping:
    """Records which rows contributed to each group of an aggregation."""

    step_id: int
    group_column: str
    # Maps each group key to the row ids in that group: {group_key: [row_ids]}
    membership: dict[str, list[int]]
    agg_functions: dict[str, str]
88
+
89
+
90
@dataclass
class LineageGap:
    """A single gap in lineage tracking."""

    step_id: int
    operation: str
    reason: str
97
+
98
+
99
@dataclass
class LineageGaps:
    """The lineage gaps collected for a row."""

    gaps: list[LineageGap] = field(default_factory=list)

    @property
    def has_gaps(self) -> bool:
        """True when at least one gap is recorded."""
        return bool(self.gaps)

    @property
    def is_fully_tracked(self) -> bool:
        """True when no gap is recorded, i.e. lineage is complete."""
        return not self.has_gaps

    def summary(self) -> str:
        """Describe the recorded gaps in one human-readable line."""
        count = len(self.gaps)
        if count == 0:
            return "Fully tracked"
        if count == 1:
            # With exactly one gap, name the operation responsible.
            only_gap = self.gaps[0]
            return f"1 step has limited visibility: {only_gap.operation}"
        return f"{count} steps have limited visibility"
@@ -0,0 +1,6 @@
1
+ # tracepipe/instrumentation/__init__.py
2
+ """Instrumentation for various data processing libraries."""
3
+
4
+ from .pandas_inst import instrument_pandas, uninstrument_pandas
5
+
6
+ __all__ = ["instrument_pandas", "uninstrument_pandas"]