tracepipe 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,122 +1,252 @@
  # tracepipe/storage/row_identity.py
  """
- Row identity tracking for pandas DataFrames.
-
- Uses: Registry + Hidden Column fallback.
+ Row identity tracking with positional propagation.
+
+ Key invariants:
+ - Every registered DataFrame has RIDs aligned to its index
+ - Propagation is POSITIONAL, not index-label based
+ - Ghost values are captured in debug mode only
+ - NO DataFrame mutation by default
+
+ Identity Storage Options:
+ - REGISTRY (default): WeakKeyDictionary, no mutation
+ - If weakref fails, auto-degrades to ATTRS with a one-time debug log
+ - ATTRS: df.attrs token for long sessions
+ - COLUMN: hidden column (opt-in only)
  """

- import warnings
+ import logging
+ import uuid
  import weakref
+ from collections import OrderedDict
  from typing import Optional

  import numpy as np
  import pandas as pd

- from ..core import TracePipeConfig
+ from ..core import GhostRowInfo, IdentityStorage, TracePipeConfig
+
+ logger = logging.getLogger(__name__)

  _TRACEPIPE_ROW_ID_COL = "__tracepipe_row_id__"
+ _TRACEPIPE_TOKEN_ATTR = "_tracepipe_token"


  class PandasRowIdentity:
      """
-     Hybrid row identity tracking for pandas DataFrames.
+     Row identity tracking with positional propagation.

      Implements: RowIdentityStrategy protocol
-
-     Handles:
-     - Standard operations (filter, sort, copy)
-     - reset_index(drop=True)
-     - Duplicate indices (with warning)
-     - Chained operations
-
-     Future alternatives:
-     - PolarsRowIdentity: Uses Polars row numbers and lazy evaluation
-     - SparkRowIdentity: Uses monotonically_increasing_id() or RDD zipWithIndex
      """

+     # Cap token registry size to prevent unbounded growth
+     MAX_TOKEN_REGISTRY_SIZE: int = 50_000
+
      def __init__(self, config: TracePipeConfig):
          self.config = config
-         self._registry: dict[int, pd.Series] = {}
-         self._df_refs: weakref.WeakValueDictionary = weakref.WeakValueDictionary()
          self._next_row_id: int = 0

+         # Use WeakKeyDictionary for proper GC
+         # Maps DataFrame object -> rids_array
+         self._registry: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
+
+         # Fallback for non-weakref-able DataFrames
+         # Maps token -> rids_array (token stored in df.attrs)
+         # Uses OrderedDict for FIFO eviction
+         self._token_registry: OrderedDict[str, np.ndarray] = OrderedDict()
+
+         # Ghost row storage (debug mode only)
+         self._ghost_rows: dict[int, GhostRowInfo] = {}
+
+         # Track if we've warned about weakref fallback
+         self._weakref_fallback_warned: bool = False
+
+     def _generate_token(self) -> str:
+         """Generate unique token for DataFrame identification."""
+         return uuid.uuid4().hex
+
      def register(
          self,
          df: pd.DataFrame,
-         row_ids: Optional[pd.Series] = None,
+         row_ids: Optional[np.ndarray] = None,
          warn_duplicate_index: bool = True,
-     ) -> pd.Series:
+     ) -> np.ndarray:
          """
          Register a DataFrame and assign row IDs.

-         Args:
-             df: DataFrame to register
-             row_ids: Optional pre-assigned IDs (for propagation)
-             warn_duplicate_index: Warn if index has duplicates
-
          Returns:
-             Series of row IDs aligned to df.index
+             numpy array of row IDs (int64)
          """
-         # Check for duplicate index
          if warn_duplicate_index and self.config.warn_on_duplicate_index:
              if df.index.has_duplicates:
-                 warnings.warn(
-                     "TracePipe: DataFrame has duplicate index values. "
-                     "Row identity may be ambiguous for duplicates.",
-                     UserWarning,
-                 )
+                 logger.debug("DataFrame has duplicate index values. Row identity may be ambiguous.")

+         n = len(df)
          if row_ids is None:
-             # Generate new sequential IDs
-             new_ids = list(range(self._next_row_id, self._next_row_id + len(df)))
-             self._next_row_id += len(df)
-             row_ids = pd.Series(new_ids, index=df.index, dtype="int64")
+             row_ids = np.arange(self._next_row_id, self._next_row_id + n, dtype=np.int64)
+             self._next_row_id += n
          else:
-             # Ensure alignment
-             if not row_ids.index.equals(df.index):
-                 row_ids = row_ids.copy()
-                 row_ids.index = df.index
+             row_ids = np.asarray(row_ids, dtype=np.int64)
+
+         # Explicit storage selection based on config
+         if self.config.identity_storage == IdentityStorage.COLUMN:
+             # COLUMN mode: hidden column only
+             df[_TRACEPIPE_ROW_ID_COL] = row_ids
+
+         elif self.config.identity_storage == IdentityStorage.ATTRS:
+             # ATTRS mode: df.attrs token only
+             token = self._generate_token()
+             df.attrs[_TRACEPIPE_TOKEN_ATTR] = token
+             self._add_to_token_registry(token, row_ids)
+
+         else:  # REGISTRY mode (default)
+             # Try WeakKeyDictionary first
+             try:
+                 self._registry[df] = row_ids
+             except TypeError:
+                 # Auto-degrade to attrs silently (log at debug level)
+                 if not self._weakref_fallback_warned:
+                     logger.debug(
+                         "DataFrame not weakref-able; using df.attrs fallback. "
+                         "This is safe but uses slightly more memory."
+                     )
+                     self._weakref_fallback_warned = True
+
+                 token = self._generate_token()
+                 df.attrs[_TRACEPIPE_TOKEN_ATTR] = token
+                 self._add_to_token_registry(token, row_ids)

-         obj_id = id(df)
-         self._registry[obj_id] = row_ids
-         self._df_refs[obj_id] = df
+         return row_ids

-         # Optionally embed in DataFrame
-         if self.config.use_hidden_column:
-             df[_TRACEPIPE_ROW_ID_COL] = row_ids.values
+     def _add_to_token_registry(self, token: str, row_ids: np.ndarray) -> None:
+         """
+         Add token to registry with FIFO eviction if over cap.

-         return row_ids
+         Prevents unbounded growth of _token_registry in attrs fallback mode.
+         """
+         # Evict oldest tokens if over cap
+         while len(self._token_registry) >= self.MAX_TOKEN_REGISTRY_SIZE:
+             self._token_registry.popitem(last=False)  # Remove oldest (FIFO)
+
+         self._token_registry[token] = row_ids

      def get_ids(self, df: pd.DataFrame) -> Optional[pd.Series]:
-         """Get row IDs for a DataFrame."""
-         # 1. Try registry (fast path)
-         obj_id = id(df)
-         if obj_id in self._registry:
-             stored = self._registry[obj_id]
-             # Verify alignment still valid
-             if len(stored) == len(df) and stored.index.equals(df.index):
-                 return stored
-
-         # 2. Try hidden column (fallback)
+         """Get row IDs as Series (for index-based operations)."""
+         rids = self.get_ids_array(df)
+         if rids is not None:
+             return pd.Series(rids, index=df.index, dtype=np.int64)
+         return None
+
+     def get_ids_array(self, df: pd.DataFrame) -> Optional[np.ndarray]:
+         """Get row IDs as numpy array (for vectorized operations)."""
+         # 1. Try WeakKeyDictionary (fastest, for REGISTRY mode)
+         try:
+             if df in self._registry:
+                 rids = self._registry[df]
+                 if len(rids) == len(df):
+                     return rids
+         except TypeError:
+             pass
+
+         # 2. Try attrs token (for ATTRS mode or REGISTRY fallback)
+         token = df.attrs.get(_TRACEPIPE_TOKEN_ATTR)
+         if token and token in self._token_registry:
+             rids = self._token_registry[token]
+             if len(rids) == len(df):
+                 return rids
+
+         # 3. Try hidden column (for COLUMN mode)
+         # Access underlying data directly to avoid triggering instrumented __getitem__
          if _TRACEPIPE_ROW_ID_COL in df.columns:
-             row_ids = df[_TRACEPIPE_ROW_ID_COL].copy()
-             row_ids.index = df.index
-             # Re-register for future lookups
-             self._registry[obj_id] = row_ids
-             self._df_refs[obj_id] = df
-             return row_ids
-
-         # 3. Not tracked
+             col_idx = df.columns.get_loc(_TRACEPIPE_ROW_ID_COL)
+             # Use _get_column_array for direct column access (bypasses instrumentation)
+             return df._get_column_array(col_idx).astype(np.int64)
+
          return None

+     def set_result_rids(self, result_df: pd.DataFrame, rids: np.ndarray) -> None:
+         """
+         Set RIDs for a result DataFrame (internal controlled assignment).
+
+         Uses same storage logic as register().
+         """
+         rids = np.asarray(rids, dtype=np.int64)
+
+         if self.config.identity_storage == IdentityStorage.COLUMN:
+             result_df[_TRACEPIPE_ROW_ID_COL] = rids
+
+         elif self.config.identity_storage == IdentityStorage.ATTRS:
+             token = self._generate_token()
+             result_df.attrs[_TRACEPIPE_TOKEN_ATTR] = token
+             self._add_to_token_registry(token, rids)
+
+         else:  # REGISTRY mode
+             try:
+                 self._registry[result_df] = rids
+             except TypeError:
+                 # Fallback to attrs (FIFO eviction)
+                 token = self._generate_token()
+                 result_df.attrs[_TRACEPIPE_TOKEN_ATTR] = token
+                 self._add_to_token_registry(token, rids)
+
+     # ========== POSITIONAL PROPAGATION METHODS ==========
+
+     def propagate_by_mask(
+         self, source_df: pd.DataFrame, result_df: pd.DataFrame, kept_mask: np.ndarray
+     ) -> np.ndarray:
+         """
+         Propagate RIDs using boolean mask (for filter operations).
+
+         Args:
+             source_df: Original DataFrame
+             result_df: Filtered DataFrame
+             kept_mask: Boolean array where True = row kept
+
+         Returns:
+             Array of RIDs for result_df
+         """
+         source_rids = self.get_ids_array(source_df)
+         if source_rids is None:
+             return self.register(result_df)
+
+         result_rids = source_rids[kept_mask]
+         self.set_result_rids(result_df, result_rids)
+         return result_rids
+
+     def propagate_by_positions(
+         self, source_df: pd.DataFrame, result_df: pd.DataFrame, positions: np.ndarray
+     ) -> np.ndarray:
+         """
+         Propagate RIDs using position indices (for head/tail/sample).
+         """
+         source_rids = self.get_ids_array(source_df)
+         if source_rids is None:
+             return self.register(result_df)
+
+         result_rids = source_rids[positions]
+         self.set_result_rids(result_df, result_rids)
+         return result_rids
+
+     def propagate_by_permutation(
+         self, source_df: pd.DataFrame, result_df: pd.DataFrame, perm: np.ndarray
+     ) -> np.ndarray:
+         """
+         Propagate RIDs using permutation array (for sort_values).
+         """
+         source_rids = self.get_ids_array(source_df)
+         if source_rids is None:
+             return self.register(result_df)
+
+         result_rids = source_rids[perm]
+         self.set_result_rids(result_df, result_rids)
+         return result_rids
+
      def propagate(self, source_df: pd.DataFrame, result_df: pd.DataFrame) -> Optional[pd.Series]:
          """
          Propagate row IDs from source to result DataFrame.

-         Handles:
-         - Filtering (fewer rows)
-         - Reordering (same rows, different order)
-         - Mixed operations
+         Backwards-compatible method; uses index-based fallback.
+         For better accuracy, use propagate_by_mask/positions/permutation.
          """
          source_ids = self.get_ids(source_df)
          if source_ids is None:
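
The positional methods above are the heart of the 0.3.0 change: they consume the physical mask, positions, or permutation that produced the result, so duplicate or reordered index labels cannot misattribute identities. A minimal usage sketch of the new API; it assumes `TracePipeConfig` accepts an `identity_storage` keyword, which this diff does not show:

```python
import pandas as pd

from tracepipe.core import IdentityStorage, TracePipeConfig
from tracepipe.storage.row_identity import PandasRowIdentity

# Assumed constructor keywords; TracePipeConfig is defined outside this diff.
config = TracePipeConfig(identity_storage=IdentityStorage.REGISTRY)
identity = PandasRowIdentity(config)

df = pd.DataFrame({"price": [10, -3, 25]}, index=[7, 7, 9])  # duplicate labels
rids = identity.register(df)             # array([0, 1, 2])

mask = (df["price"] > 0).to_numpy()      # positional boolean mask
result = df[mask]
kept = identity.propagate_by_mask(df, result, mask)
# kept == array([0, 2]): RIDs follow positions, so the duplicate
# index labels (7, 7) cannot make identity ambiguous
```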
@@ -139,7 +269,9 @@ class PandasRowIdentity:
          new_mask = result_ids.isna()
          if new_mask.any():
              new_count = new_mask.sum()
-             new_row_ids = list(range(self._next_row_id, self._next_row_id + new_count))
+             new_row_ids = np.arange(
+                 self._next_row_id, self._next_row_id + new_count, dtype=np.int64
+             )
              self._next_row_id += new_count
              result_ids.loc[new_mask] = new_row_ids
              result_ids = result_ids.astype("int64")
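
The hunk above mints fresh sequential IDs for result rows that index alignment could not match to a source row. A standalone sketch of just that step, with concrete values:

```python
import numpy as np
import pandas as pd

next_row_id = 100                                    # counter, as in self._next_row_id
result_ids = pd.Series([0.0, np.nan, 2.0, np.nan])   # NaN = rows with no source match
new_mask = result_ids.isna()
new_count = int(new_mask.sum())
result_ids.loc[new_mask] = np.arange(next_row_id, next_row_id + new_count, dtype=np.int64)
result_ids = result_ids.astype("int64")
# result_ids.tolist() == [0, 100, 2, 101]
```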
@@ -147,7 +279,9 @@ class PandasRowIdentity:
          # Fallback: positional alignment
          if len(result_df) <= len(source_df):
              result_ids = pd.Series(
-                 source_ids.values[: len(result_df)], index=result_df.index, dtype="int64"
+                 source_ids.values[: len(result_df)],
+                 index=result_df.index,
+                 dtype="int64",
              )
          else:
              # Result is larger - assign new IDs to extras
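
This prefix-truncation fallback assumes the result is the first `len(result_df)` rows of the source, which guesses wrong for suffix-style operations such as `tail()`; that is exactly the gap the positional methods added earlier in this diff close. A small illustration with hypothetical RID values:

```python
import numpy as np

source_rids = np.arange(5, dtype=np.int64)  # RIDs [0, 1, 2, 3, 4]

# Prefix-truncation fallback (this hunk) for a 2-row result like tail(2):
guess = source_rids[:2]                     # array([0, 1]) -- misattributed

# Positional propagation with the physical positions tail(2) actually kept:
positions = np.array([3, 4])
exact = source_rids[positions]              # array([3, 4])
```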
@@ -157,40 +291,158 @@ class PandasRowIdentity:
              self._next_row_id += extra_count
              result_ids = pd.Series(base_ids + extra_ids, index=result_df.index, dtype="int64")

-         return self.register(result_df, result_ids, warn_duplicate_index=False)
+         # Register the result
+         rids_array = result_ids.values.astype(np.int64)
+         self.set_result_rids(result_df, rids_array)
+         return result_ids

-     def realign_for_reset_index(
-         self, original_df: pd.DataFrame, new_df: pd.DataFrame
-     ) -> Optional[pd.Series]:
-         """Handle reset_index(drop=True) which changes index."""
-         old_ids = self.get_ids(original_df)
-         if old_ids is None:
-             return None
+     # ========== DROP COMPUTATION ==========

-         # Same values, new index
-         new_ids = pd.Series(old_ids.values, index=new_df.index, dtype="int64")
-         return self.register(new_df, new_ids, warn_duplicate_index=False)
+     def compute_dropped_ids(self, source_rids: np.ndarray, result_rids: np.ndarray) -> np.ndarray:
+         """
+         Compute which RIDs were dropped.

-     def get_dropped_ids(self, source_df: pd.DataFrame, result_df: pd.DataFrame) -> np.ndarray:
+         Returns:
+             Array of dropped RIDs (sorted)
          """
-         Get row IDs that were dropped between source and result.
+         # assume_unique=False: the RID arrays are not guaranteed unique
+         return np.setdiff1d(source_rids, result_rids, assume_unique=False)

-         Uses numpy's setdiff1d for vectorized performance (~50x faster
-         than Python set operations for large DataFrames).
+     def compute_dropped_with_positions(
+         self, source_rids: np.ndarray, kept_mask: np.ndarray
+     ) -> tuple[np.ndarray, np.ndarray]:
+         """
+         Compute dropped IDs with original positions preserved.

          Returns:
-             numpy array of dropped row IDs (empty array if none dropped)
+             (dropped_rids, dropped_positions) - both in original order
          """
-         source_ids = self.get_ids(source_df)
-         result_ids = self.get_ids(result_df)
+         dropped_mask = ~kept_mask
+         dropped_positions = np.where(dropped_mask)[0]
+         dropped_rids = source_rids[dropped_mask]
+         return dropped_rids, dropped_positions
+
+     def get_dropped_ids(self, source_df: pd.DataFrame, result_df: pd.DataFrame) -> np.ndarray:
+         """
+         Get row IDs that were dropped between source and result.
+
+         Backwards-compatible method.
+         """
+         source_ids = self.get_ids_array(source_df)
+         result_ids = self.get_ids_array(result_df)

          if source_ids is None:
              return np.array([], dtype="int64")
          if result_ids is None:
-             return np.asarray(source_ids.values, dtype="int64")
+             return np.asarray(source_ids, dtype="int64")

-         # Vectorized set difference - O(n log n) in C instead of O(n) in Python
-         return np.setdiff1d(source_ids.values, result_ids.values)
+         return np.setdiff1d(source_ids, result_ids)
+
+     # ========== GHOST ROW TRACKING ==========
+
+     def capture_ghost_values(
+         self,
+         source_df: pd.DataFrame,
+         dropped_mask: np.ndarray,
+         dropped_by: str,
+         step_id: int,
+         watched_columns: set[str],
+     ) -> None:
+         """
+         Capture last-known values for dropped rows (debug mode only).
+
+         Uses vectorized extraction, not iloc per cell.
+         """
+         if not self.config.should_capture_ghost_values:
+             return
+
+         source_rids = self.get_ids_array(source_df)
+         if source_rids is None:
+             return
+
+         dropped_positions = np.where(dropped_mask)[0]
+         dropped_rids = source_rids[dropped_mask]
+
+         if len(dropped_rids) == 0:
+             return
+
+         # Limit to prevent memory explosion
+         max_ghosts = self.config.max_ghost_rows
+         if len(dropped_rids) > max_ghosts:
+             sample_idx = np.random.choice(len(dropped_rids), max_ghosts, replace=False)
+             sample_idx.sort()  # Keep relative order
+             dropped_rids = dropped_rids[sample_idx]
+             dropped_positions = dropped_positions[sample_idx]
+
+         # Determine columns to capture
+         cols_to_capture = list(watched_columns & set(source_df.columns))
+         if not cols_to_capture:
+             cols_to_capture = list(source_df.columns)[:5]
+
+         # Vectorized extraction (one slice per column)
+         values_matrix: dict[str, np.ndarray] = {}
+         for col in cols_to_capture:
+             try:
+                 values_matrix[col] = source_df[col].values[dropped_positions]
+             except Exception:
+                 pass
+
+         # Build ghost row info from pre-extracted values
+         for i, (rid, pos) in enumerate(zip(dropped_rids, dropped_positions)):
+             values = {col: vals[i] for col, vals in values_matrix.items()}
+
+             self._ghost_rows[int(rid)] = GhostRowInfo(
+                 row_id=int(rid),
+                 last_values=values,
+                 dropped_by=dropped_by,
+                 dropped_step=step_id,
+                 original_position=int(pos),
+             )
+
+     def get_ghost_rows(self, limit: int = 1000) -> pd.DataFrame:
+         """
+         Get dropped rows with last-known values.
+
+         Returns DataFrame with:
+         - __tp_row_id__: Original row ID
+         - __tp_dropped_by__: Operation that dropped the row
+         - __tp_dropped_step__: Step ID
+         - __tp_original_position__: Position in original DataFrame
+         - [original columns]: Last known values
+         """
+         if not self._ghost_rows:
+             return pd.DataFrame()
+
+         # Sort by original position for natural order
+         sorted_ghosts = sorted(self._ghost_rows.values(), key=lambda g: g.original_position)[:limit]
+
+         rows = []
+         for info in sorted_ghosts:
+             row = {
+                 "__tp_row_id__": info.row_id,
+                 "__tp_dropped_by__": info.dropped_by,
+                 "__tp_dropped_step__": info.dropped_step,
+                 "__tp_original_position__": info.original_position,
+                 **info.last_values,
+             }
+             rows.append(row)
+
+         return pd.DataFrame(rows)
+
+     # ========== CLEANUP ==========
+
+     def clear(self) -> None:
+         """Reset all state."""
+         self._registry.clear()
+         self._token_registry.clear()
+         self._ghost_rows.clear()
+         self._next_row_id = 0
+
+     def cleanup(self) -> None:
+         """Remove stale entries (backwards compatible)."""
+         # WeakKeyDictionary handles this automatically for registry.
+         # Token registry uses FIFO eviction in _add_to_token_registry.
+         pass

      def strip_hidden_column(self, df: pd.DataFrame) -> pd.DataFrame:
          """Remove hidden column for export."""
@@ -198,20 +450,30 @@ class PandasRowIdentity:
              return df.drop(columns=[_TRACEPIPE_ROW_ID_COL])
          return df

-     def cleanup(self) -> None:
-         """Remove stale entries."""
-         stale = [k for k in list(self._registry.keys()) if k not in self._df_refs]
-         for k in stale:
-             del self._registry[k]
-
      def all_registered_ids(self) -> list[int]:
          """
          Get all row IDs that have ever been registered.
-
-         Returns:
-             List of all registered row IDs.
          """
-         all_ids = set()
-         for row_ids in self._registry.values():
-             all_ids.update(row_ids.values.tolist())
+         all_ids: set[int] = set()
+
+         # From WeakKeyDictionary
+         for rids in self._registry.values():
+             all_ids.update(rids.tolist())
+
+         # From token registry
+         for rids in self._token_registry.values():
+             all_ids.update(rids.tolist())
+
          return sorted(all_ids)
+
+     def realign_for_reset_index(
+         self, original_df: pd.DataFrame, new_df: pd.DataFrame
+     ) -> Optional[pd.Series]:
+         """Handle reset_index(drop=True) which changes index."""
+         old_ids = self.get_ids_array(original_df)
+         if old_ids is None:
+             return None
+
+         # Same values, new index
+         self.set_result_rids(new_df, old_ids)
+         return pd.Series(old_ids, index=new_df.index, dtype="int64")
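
The opt-in COLUMN mode is the one storage path that mutates the tracked DataFrame, and `strip_hidden_column` is its counterpart for export. A sketch under the same assumed `TracePipeConfig` keywords as above:

```python
import pandas as pd

from tracepipe.core import IdentityStorage, TracePipeConfig
from tracepipe.storage.row_identity import PandasRowIdentity

config = TracePipeConfig(identity_storage=IdentityStorage.COLUMN)  # assumed keyword
identity = PandasRowIdentity(config)

df = pd.DataFrame({"x": [1, 2, 3]})
identity.register(df)
assert "__tracepipe_row_id__" in df.columns   # hidden column written in this mode

clean = identity.strip_hidden_column(df)      # drops the RID column on a copy
assert "__tracepipe_row_id__" not in clean.columns
```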