tracepipe 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,217 @@
1
+ # tracepipe/storage/row_identity.py
2
+ """
3
+ Row identity tracking for pandas DataFrames.
4
+
5
+ Uses: Registry + Hidden Column fallback.
6
+ """
7
+
8
+ import warnings
9
+ import weakref
10
+ from typing import Optional
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+
15
+ from ..core import TracePipeConfig
16
+
17
# Name of the sentinel column used to embed row IDs directly in a tracked
# DataFrame (the hidden-column fallback used when the in-memory registry
# no longer matches the frame).
_TRACEPIPE_ROW_ID_COL = "__tracepipe_row_id__"
18
+
19
+
20
+ class PandasRowIdentity:
21
+ """
22
+ Hybrid row identity tracking for pandas DataFrames.
23
+
24
+ Implements: RowIdentityStrategy protocol
25
+
26
+ Handles:
27
+ - Standard operations (filter, sort, copy)
28
+ - reset_index(drop=True)
29
+ - Duplicate indices (with warning)
30
+ - Chained operations
31
+
32
+ Future alternatives:
33
+ - PolarsRowIdentity: Uses Polars row numbers and lazy evaluation
34
+ - SparkRowIdentity: Uses monotonically_increasing_id() or RDD zipWithIndex
35
+ """
36
+
37
+ def __init__(self, config: TracePipeConfig):
38
+ self.config = config
39
+ self._registry: dict[int, pd.Series] = {}
40
+ self._df_refs: weakref.WeakValueDictionary = weakref.WeakValueDictionary()
41
+ self._next_row_id: int = 0
42
+
43
+ def register(
44
+ self,
45
+ df: pd.DataFrame,
46
+ row_ids: Optional[pd.Series] = None,
47
+ warn_duplicate_index: bool = True,
48
+ ) -> pd.Series:
49
+ """
50
+ Register a DataFrame and assign row IDs.
51
+
52
+ Args:
53
+ df: DataFrame to register
54
+ row_ids: Optional pre-assigned IDs (for propagation)
55
+ warn_duplicate_index: Warn if index has duplicates
56
+
57
+ Returns:
58
+ Series of row IDs aligned to df.index
59
+ """
60
+ # Check for duplicate index
61
+ if warn_duplicate_index and self.config.warn_on_duplicate_index:
62
+ if df.index.has_duplicates:
63
+ warnings.warn(
64
+ "TracePipe: DataFrame has duplicate index values. "
65
+ "Row identity may be ambiguous for duplicates.",
66
+ UserWarning,
67
+ )
68
+
69
+ if row_ids is None:
70
+ # Generate new sequential IDs
71
+ new_ids = list(range(self._next_row_id, self._next_row_id + len(df)))
72
+ self._next_row_id += len(df)
73
+ row_ids = pd.Series(new_ids, index=df.index, dtype="int64")
74
+ else:
75
+ # Ensure alignment
76
+ if not row_ids.index.equals(df.index):
77
+ row_ids = row_ids.copy()
78
+ row_ids.index = df.index
79
+
80
+ obj_id = id(df)
81
+ self._registry[obj_id] = row_ids
82
+ self._df_refs[obj_id] = df
83
+
84
+ # Optionally embed in DataFrame
85
+ if self.config.use_hidden_column:
86
+ df[_TRACEPIPE_ROW_ID_COL] = row_ids.values
87
+
88
+ return row_ids
89
+
90
+ def get_ids(self, df: pd.DataFrame) -> Optional[pd.Series]:
91
+ """Get row IDs for a DataFrame."""
92
+ # 1. Try registry (fast path)
93
+ obj_id = id(df)
94
+ if obj_id in self._registry:
95
+ stored = self._registry[obj_id]
96
+ # Verify alignment still valid
97
+ if len(stored) == len(df) and stored.index.equals(df.index):
98
+ return stored
99
+
100
+ # 2. Try hidden column (fallback)
101
+ if _TRACEPIPE_ROW_ID_COL in df.columns:
102
+ row_ids = df[_TRACEPIPE_ROW_ID_COL].copy()
103
+ row_ids.index = df.index
104
+ # Re-register for future lookups
105
+ self._registry[obj_id] = row_ids
106
+ self._df_refs[obj_id] = df
107
+ return row_ids
108
+
109
+ # 3. Not tracked
110
+ return None
111
+
112
+ def propagate(self, source_df: pd.DataFrame, result_df: pd.DataFrame) -> Optional[pd.Series]:
113
+ """
114
+ Propagate row IDs from source to result DataFrame.
115
+
116
+ Handles:
117
+ - Filtering (fewer rows)
118
+ - Reordering (same rows, different order)
119
+ - Mixed operations
120
+ """
121
+ source_ids = self.get_ids(source_df)
122
+ if source_ids is None:
123
+ return None
124
+
125
+ if result_df is source_df:
126
+ return source_ids
127
+
128
+ try:
129
+ if result_df.index.equals(source_df.index):
130
+ # Same index - direct copy
131
+ result_ids = source_ids.copy()
132
+ elif result_df.index.isin(source_df.index).all():
133
+ # Result index is subset/reorder of source
134
+ result_ids = source_ids.loc[result_df.index].copy()
135
+ else:
136
+ # Partial overlap or new indices
137
+ result_ids = source_ids.reindex(result_df.index)
138
+ # Rows not in source get new IDs
139
+ new_mask = result_ids.isna()
140
+ if new_mask.any():
141
+ new_count = new_mask.sum()
142
+ new_row_ids = list(range(self._next_row_id, self._next_row_id + new_count))
143
+ self._next_row_id += new_count
144
+ result_ids.loc[new_mask] = new_row_ids
145
+ result_ids = result_ids.astype("int64")
146
+ except Exception:
147
+ # Fallback: positional alignment
148
+ if len(result_df) <= len(source_df):
149
+ result_ids = pd.Series(
150
+ source_ids.values[: len(result_df)], index=result_df.index, dtype="int64"
151
+ )
152
+ else:
153
+ # Result is larger - assign new IDs to extras
154
+ base_ids = list(source_ids.values)
155
+ extra_count = len(result_df) - len(source_df)
156
+ extra_ids = list(range(self._next_row_id, self._next_row_id + extra_count))
157
+ self._next_row_id += extra_count
158
+ result_ids = pd.Series(base_ids + extra_ids, index=result_df.index, dtype="int64")
159
+
160
+ return self.register(result_df, result_ids, warn_duplicate_index=False)
161
+
162
+ def realign_for_reset_index(
163
+ self, original_df: pd.DataFrame, new_df: pd.DataFrame
164
+ ) -> Optional[pd.Series]:
165
+ """Handle reset_index(drop=True) which changes index."""
166
+ old_ids = self.get_ids(original_df)
167
+ if old_ids is None:
168
+ return None
169
+
170
+ # Same values, new index
171
+ new_ids = pd.Series(old_ids.values, index=new_df.index, dtype="int64")
172
+ return self.register(new_df, new_ids, warn_duplicate_index=False)
173
+
174
+ def get_dropped_ids(self, source_df: pd.DataFrame, result_df: pd.DataFrame) -> np.ndarray:
175
+ """
176
+ Get row IDs that were dropped between source and result.
177
+
178
+ Uses numpy's setdiff1d for vectorized performance (~50x faster
179
+ than Python set operations for large DataFrames).
180
+
181
+ Returns:
182
+ numpy array of dropped row IDs (empty array if none dropped)
183
+ """
184
+ source_ids = self.get_ids(source_df)
185
+ result_ids = self.get_ids(result_df)
186
+
187
+ if source_ids is None:
188
+ return np.array([], dtype="int64")
189
+ if result_ids is None:
190
+ return np.asarray(source_ids.values, dtype="int64")
191
+
192
+ # Vectorized set difference - O(n log n) in C instead of O(n) in Python
193
+ return np.setdiff1d(source_ids.values, result_ids.values)
194
+
195
+ def strip_hidden_column(self, df: pd.DataFrame) -> pd.DataFrame:
196
+ """Remove hidden column for export."""
197
+ if _TRACEPIPE_ROW_ID_COL in df.columns:
198
+ return df.drop(columns=[_TRACEPIPE_ROW_ID_COL])
199
+ return df
200
+
201
+ def cleanup(self) -> None:
202
+ """Remove stale entries."""
203
+ stale = [k for k in list(self._registry.keys()) if k not in self._df_refs]
204
+ for k in stale:
205
+ del self._registry[k]
206
+
207
+ def all_registered_ids(self) -> list[int]:
208
+ """
209
+ Get all row IDs that have ever been registered.
210
+
211
+ Returns:
212
+ List of all registered row IDs.
213
+ """
214
+ all_ids = set()
215
+ for row_ids in self._registry.values():
216
+ all_ids.update(row_ids.values.tolist())
217
+ return sorted(all_ids)
@@ -0,0 +1,6 @@
1
+ # tracepipe/utils/__init__.py
2
+ """Utility functions for TracePipe."""
3
+
4
+ from .value_capture import capture_typed_value, values_equal
5
+
6
+ __all__ = ["capture_typed_value", "values_equal"]
@@ -0,0 +1,137 @@
1
+ # tracepipe/utils/value_capture.py
2
+ """
3
+ Value capture and comparison utilities with complete NA handling.
4
+ """
5
+
6
import datetime
from typing import Any

import numpy as np
import pandas as pd
10
+
11
# Interned type strings (avoid allocating same strings repeatedly)
_TYPE_NULL = "null"
_TYPE_BOOL = "bool"
_TYPE_INT = "int"
_TYPE_FLOAT = "float"
_TYPE_STR = "str"
_TYPE_DATETIME = "datetime"
_TYPE_OTHER = "other"


def capture_typed_value(value: Any) -> tuple[Any, str]:
    """
    Convert value to (python_native, type_string) for storage.

    Handles:
    - None, np.nan, pd.NA, pd.NaT -> (None, "null")
    - numpy scalars (not JSON-serializable) -> Python natives
    - Standard Python types
    - Datetime types (pd.Timestamp, np.datetime64, datetime.datetime/date)

    Returns:
        Tuple of (native_value, type_string)
    """
    # Handle all NA types first (pd.isna handles None, np.nan, pd.NA, pd.NaT).
    try:
        if pd.isna(value):
            return None, _TYPE_NULL
    except (ValueError, TypeError):
        # pd.isna can fail on some types (e.g. lists); treat them as non-NA.
        pass

    # BUG FIX: datetime-likes must be classified BEFORE the numpy unwrap
    # below. np.datetime64.item() returns datetime.date/datetime.datetime,
    # which previously fell through to the "other" fallback instead of being
    # tagged as datetime.
    if isinstance(value, (pd.Timestamp, np.datetime64, datetime.datetime, datetime.date)):
        return str(value), _TYPE_DATETIME

    # numpy scalar -> Python native (CRITICAL for JSON serialization).
    if hasattr(value, "item"):
        try:
            value = value.item()
        except (ValueError, AttributeError):
            pass

    # Type mapping (order matters: bool is a subclass of int, test it first).
    if isinstance(value, bool):
        return value, _TYPE_BOOL
    if isinstance(value, (int, np.integer)):
        return int(value), _TYPE_INT
    if isinstance(value, (float, np.floating)):
        return float(value), _TYPE_FLOAT
    if isinstance(value, str):
        return value, _TYPE_STR
    # Fallback: stringify so the value can always be stored.
    return str(value), _TYPE_OTHER
63
+
64
+
65
def values_equal(a: Any, b: Any) -> bool:
    """
    Compare two values, treating any two NA values as equal.

    Falls back to comparing string representations when NA detection or
    truthiness fails (e.g. for lists and arrays).
    """
    try:
        a_missing = pd.isna(a)
        b_missing = pd.isna(b)
        if a_missing and b_missing:
            # Both NA -> considered equal.
            return True
        # Exactly one NA -> unequal; otherwise compare directly.
        return (a == b) if not (a_missing or b_missing) else False
    except (ValueError, TypeError):
        # pd.isna / bool() can fail on containers; use string comparison.
        return str(a) == str(b)
83
+
84
+
85
def find_changed_indices_vectorized(old_series: pd.Series, new_series: pd.Series) -> np.ndarray:
    """
    Vectorized change detection between two aligned Series.

    A position counts as changed when exactly one side is NA, or when both
    sides are present and unequal; two NAs are treated as equal.

    Args:
        old_series: Series of old values (must be aligned with new_series)
        new_series: Series of new values

    Returns:
        Boolean mask array where True indicates the value changed
    """
    old_vals = old_series.values
    new_vals = new_series.values
    size = len(old_vals)
    if size == 0:
        return np.array([], dtype=bool)

    # Vectorized NA detection on both sides.
    old_missing = pd.isna(old_vals)
    new_missing = pd.isna(new_vals)

    # Changed NA-ness: one side NA, the other not.
    na_flip = old_missing != new_missing

    # Only positions where both sides are present get a value comparison.
    comparable = ~old_missing & ~new_missing
    differs = np.zeros(size, dtype=bool)

    positions = np.flatnonzero(comparable)
    if positions.size:
        try:
            # Fast path: one vectorized comparison for homogeneous arrays.
            with np.errstate(invalid="ignore"):
                differs[positions] = old_vals[positions] != new_vals[positions]
        except (TypeError, ValueError):
            # Mixed-type fallback: compare one element at a time.
            for pos in positions:
                try:
                    differs[pos] = old_vals[pos] != new_vals[pos]
                except (TypeError, ValueError):
                    # Incomparable types count as changed.
                    differs[pos] = True

    return (comparable & differs) | na_flip
@@ -0,0 +1,6 @@
1
+ # tracepipe/visualization/__init__.py
2
+ """Visualization exports for TracePipe."""
3
+
4
+ from .html_export import save
5
+
6
+ __all__ = ["save"]