tracepipe 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,11 @@
1
1
  # tracepipe/instrumentation/pandas_inst.py
2
2
  """
3
3
  Pandas DataFrame instrumentation for row-level lineage tracking.
4
+
5
+ This module wires the filter and merge capture modules to pandas methods.
6
+ The actual capture logic is in:
7
+ - filter_capture.py: Mask-first filter capture with FULL/PARTIAL completeness
8
+ - merge_capture.py: Position column injection for merge provenance
4
9
  """
5
10
 
6
11
  import warnings
@@ -15,56 +20,26 @@ from ..core import ChangeType, CompletenessLevel
15
20
  from ..safety import (
16
21
  TracePipeWarning,
17
22
  get_caller_info,
18
- wrap_pandas_filter_method,
19
23
  wrap_pandas_method,
20
24
  wrap_pandas_method_inplace,
21
25
  )
22
26
  from ..utils.value_capture import find_changed_indices_vectorized
27
+ from .apply_capture import instrument_apply_pipe, uninstrument_apply_pipe
28
+
29
+ # Import new capture modules
30
+ from .filter_capture import wrap_filter_method, wrap_getitem_filter
31
+ from .indexer_capture import instrument_indexers, uninstrument_indexers
32
+ from .merge_capture import (
33
+ wrap_concat_with_lineage,
34
+ wrap_join_with_lineage,
35
+ wrap_merge_with_lineage,
36
+ )
37
+ from .series_capture import instrument_series, uninstrument_series
23
38
 
24
39
  # Store original methods for restore
25
40
  _originals: dict[str, Any] = {}
26
41
 
27
42
 
28
- # === FILTER CAPTURE ===
29
-
30
-
31
- def _capture_filter(
32
- self: pd.DataFrame, args, kwargs, result, ctx: TracePipeContext, method_name: str
33
- ):
34
- """Capture lineage for filter operations (dropna, query, head, etc.)."""
35
- if not isinstance(result, pd.DataFrame):
36
- return
37
-
38
- source_ids = ctx.row_manager.get_ids(self)
39
- if source_ids is None:
40
- # Auto-register if not tracked
41
- ctx.row_manager.register(self)
42
- source_ids = ctx.row_manager.get_ids(self)
43
- if source_ids is None:
44
- return
45
-
46
- # Propagate IDs to result
47
- ctx.row_manager.propagate(self, result)
48
-
49
- # Find dropped rows (returns numpy array for performance)
50
- dropped_ids = ctx.row_manager.get_dropped_ids(self, result)
51
-
52
- if len(dropped_ids) > 0:
53
- code_file, code_line = get_caller_info(skip_frames=2)
54
- step_id = ctx.store.append_step(
55
- operation=f"DataFrame.{method_name}",
56
- stage=ctx.current_stage,
57
- code_file=code_file,
58
- code_line=code_line,
59
- params=_safe_params(kwargs),
60
- input_shape=self.shape,
61
- output_shape=result.shape,
62
- )
63
-
64
- # Bulk record all drops at once (10-50x faster than loop)
65
- ctx.store.append_bulk_drops(step_id, dropped_ids)
66
-
67
-
68
43
  # === TRANSFORM CAPTURE ===
69
44
 
70
45
 
@@ -299,56 +274,6 @@ def _capture_agg(self, args, kwargs, result, ctx: TracePipeContext, method_name:
299
274
  ctx.row_manager.register(result)
300
275
 
301
276
 
302
- # === MERGE/CONCAT (UNKNOWN - OUT OF SCOPE) ===
303
-
304
-
305
- def _capture_merge(
306
- self: pd.DataFrame, args, kwargs, result, ctx: TracePipeContext, method_name: str
307
- ):
308
- """Mark merge as UNKNOWN completeness and reset lineage."""
309
- code_file, code_line = get_caller_info(skip_frames=2)
310
- ctx.store.append_step(
311
- operation=f"DataFrame.{method_name}",
312
- stage=ctx.current_stage,
313
- code_file=code_file,
314
- code_line=code_line,
315
- params={"how": kwargs.get("how", "inner")},
316
- input_shape=self.shape,
317
- output_shape=result.shape if hasattr(result, "shape") else None,
318
- completeness=CompletenessLevel.UNKNOWN,
319
- )
320
-
321
- warnings.warn(
322
- f"TracePipe: {method_name}() resets row lineage. "
323
- f"Rows in result cannot be traced back to source rows.",
324
- TracePipeWarning,
325
- )
326
-
327
- # Register result with NEW IDs
328
- if isinstance(result, pd.DataFrame):
329
- ctx.row_manager.register(result)
330
-
331
-
332
- def _capture_concat(args, kwargs, result, ctx: TracePipeContext):
333
- """Capture pd.concat (module-level function)."""
334
- code_file, code_line = get_caller_info(skip_frames=2)
335
- ctx.store.append_step(
336
- operation="pd.concat",
337
- stage=ctx.current_stage,
338
- code_file=code_file,
339
- code_line=code_line,
340
- params={"axis": kwargs.get("axis", 0)},
341
- input_shape=None,
342
- output_shape=result.shape if hasattr(result, "shape") else None,
343
- completeness=CompletenessLevel.UNKNOWN,
344
- )
345
-
346
- warnings.warn("TracePipe: pd.concat() resets row lineage.", TracePipeWarning)
347
-
348
- if isinstance(result, pd.DataFrame):
349
- ctx.row_manager.register(result)
350
-
351
-
352
277
  # === INDEX OPERATIONS ===
353
278
 
354
279
 
@@ -408,21 +333,21 @@ def _capture_sort_values(
408
333
  # Record reorder for each row
409
334
  result_ids = ctx.row_manager.get_ids(result)
410
335
  if result_ids is not None:
336
+ # Pre-compute position lookup (O(n) instead of O(n²))
337
+ source_idx_list = list(source_ids.index) if hasattr(source_ids, "index") else []
338
+ source_pos_map = {idx: pos for pos, idx in enumerate(source_idx_list)}
339
+
411
340
  for new_pos, (idx, row_id) in enumerate(result_ids.items()):
412
- # Find old position
413
- try:
414
- old_pos = list(source_ids.index).index(idx)
415
- if old_pos != new_pos:
416
- ctx.store.append_diff(
417
- step_id=step_id,
418
- row_id=int(row_id),
419
- col="__position__",
420
- old_val=old_pos,
421
- new_val=new_pos,
422
- change_type=ChangeType.REORDERED,
423
- )
424
- except (ValueError, KeyError):
425
- pass
341
+ old_pos = source_pos_map.get(idx)
342
+ if old_pos is not None and old_pos != new_pos:
343
+ ctx.store.append_diff(
344
+ step_id=step_id,
345
+ row_id=int(row_id),
346
+ col="__position__",
347
+ old_val=old_pos,
348
+ new_val=new_pos,
349
+ change_type=ChangeType.REORDERED,
350
+ )
426
351
 
427
352
 
428
353
  # === COPY CAPTURE ===
@@ -458,7 +383,11 @@ def _capture_drop(
458
383
 
459
384
  - Row drops (axis=0): Track as filter operation
460
385
  - Column drops (axis=1): Track as schema change (step metadata only)
386
+ - Handles inplace=True (result is self, passed by wrapper)
461
387
  """
388
+ # Handle inplace: result is passed as self by the inplace wrapper
389
+ if result is None:
390
+ return
462
391
  if not isinstance(result, pd.DataFrame):
463
392
  return
464
393
 
@@ -509,60 +438,6 @@ def _capture_drop(
509
438
  )
510
439
 
511
440
 
512
- # === __getitem__ DISPATCH ===
513
-
514
-
515
- def _capture_getitem(
516
- self: pd.DataFrame, args, kwargs, result, ctx: TracePipeContext, method_name: str
517
- ):
518
- """
519
- Dispatch __getitem__ based on key type.
520
-
521
- - df['col'] -> Series (ignore)
522
- - df[['a','b']] -> DataFrame column select (propagate)
523
- - df[mask] -> DataFrame row filter (track drops)
524
- - df[slice] -> DataFrame row slice (track drops)
525
- """
526
- if len(args) != 1:
527
- return
528
-
529
- key = args[0]
530
-
531
- # Series result - column access, not row filter
532
- if isinstance(result, pd.Series):
533
- return
534
-
535
- if not isinstance(result, pd.DataFrame):
536
- return
537
-
538
- # Boolean mask - row filter
539
- if isinstance(key, (pd.Series, np.ndarray)) and getattr(key, "dtype", None) is np.dtype("bool"):
540
- # Skip if we're inside a named filter op (e.g., drop_duplicates)
541
- # to avoid double-counting drops
542
- if ctx._filter_op_depth > 0:
543
- ctx.row_manager.propagate(self, result)
544
- return
545
- _capture_filter(self, args, kwargs, result, ctx, "__getitem__[mask]")
546
- return
547
-
548
- # List of columns - column selection
549
- if isinstance(key, list):
550
- ctx.row_manager.propagate(self, result)
551
- return
552
-
553
- # Slice - row selection
554
- if isinstance(key, slice):
555
- # Skip if we're inside a named filter op
556
- if ctx._filter_op_depth > 0:
557
- ctx.row_manager.propagate(self, result)
558
- return
559
- _capture_filter(self, args, kwargs, result, ctx, "__getitem__[slice]")
560
- return
561
-
562
- # Default: propagate
563
- ctx.row_manager.propagate(self, result)
564
-
565
-
566
441
  # === __setitem__ CAPTURE ===
567
442
 
568
443
 
@@ -751,7 +626,8 @@ def _wrap_dataframe_init(original):
751
626
  original(self, *args, **kwargs)
752
627
 
753
628
  ctx = get_context()
754
- if ctx.enabled:
629
+ # Skip registration when inside filter/export operation (prevents re-adding hidden column)
630
+ if ctx.enabled and ctx._filter_op_depth == 0:
755
631
  if ctx.row_manager.get_ids(self) is None:
756
632
  ctx.row_manager.register(self)
757
633
 
@@ -768,7 +644,12 @@ def _make_export_wrapper(original):
768
644
  def wrapper(self, *args, **kwargs):
769
645
  ctx = get_context()
770
646
  if ctx.enabled:
771
- clean_df = ctx.row_manager.strip_hidden_column(self)
647
+ # Increment filter depth to prevent tracking during column strip
648
+ ctx._filter_op_depth += 1
649
+ try:
650
+ clean_df = ctx.row_manager.strip_hidden_column(self)
651
+ finally:
652
+ ctx._filter_op_depth -= 1
772
653
  return original(clean_df, *args, **kwargs)
773
654
  return original(self, *args, **kwargs)
774
655
 
@@ -880,15 +761,14 @@ def instrument_pandas():
880
761
  # Already instrumented
881
762
  return
882
763
 
883
- # === DataFrame filter methods ===
884
- # Use wrap_pandas_filter_method to prevent double-counting when
885
- # methods like drop_duplicates internally call __getitem__
764
+ # === DataFrame filter methods (using new mask-first capture) ===
765
+ # wrap_filter_method provides FULL/PARTIAL completeness tracking
886
766
  filter_methods = ["dropna", "drop_duplicates", "query", "head", "tail", "sample"]
887
767
  for method_name in filter_methods:
888
768
  if hasattr(pd.DataFrame, method_name):
889
769
  original = getattr(pd.DataFrame, method_name)
890
770
  _originals[f"DataFrame.{method_name}"] = original
891
- wrapped = wrap_pandas_filter_method(method_name, original, _capture_filter)
771
+ wrapped = wrap_filter_method(method_name, original)
892
772
  setattr(pd.DataFrame, method_name, wrapped)
893
773
 
894
774
  # === DataFrame transform methods (with inplace support) ===
@@ -908,9 +788,9 @@ def instrument_pandas():
908
788
  _originals["DataFrame.copy"] = pd.DataFrame.copy
909
789
  pd.DataFrame.copy = wrap_pandas_method("copy", pd.DataFrame.copy, _capture_copy)
910
790
 
911
- # === drop (row/column removal) ===
791
+ # === drop (row/column removal, supports inplace=True) ===
912
792
  _originals["DataFrame.drop"] = pd.DataFrame.drop
913
- pd.DataFrame.drop = wrap_pandas_method("drop", pd.DataFrame.drop, _capture_drop)
793
+ pd.DataFrame.drop = wrap_pandas_method_inplace("drop", pd.DataFrame.drop, _capture_drop)
914
794
 
915
795
  # === apply/pipe ===
916
796
  _originals["DataFrame.apply"] = pd.DataFrame.apply
@@ -933,12 +813,14 @@ def instrument_pandas():
933
813
  wrapped = wrap_pandas_method(agg_method, original, _capture_agg)
934
814
  setattr(DataFrameGroupBy, agg_method, wrapped)
935
815
 
936
- # === merge ===
816
+ # === merge (using new position column injection capture) ===
817
+ # wrap_merge_with_lineage provides full provenance in DEBUG mode
937
818
  _originals["DataFrame.merge"] = pd.DataFrame.merge
938
- pd.DataFrame.merge = wrap_pandas_method("merge", pd.DataFrame.merge, _capture_merge)
819
+ pd.DataFrame.merge = wrap_merge_with_lineage(pd.DataFrame.merge)
939
820
 
821
+ # === join (using new join wrapper) ===
940
822
  _originals["DataFrame.join"] = pd.DataFrame.join
941
- pd.DataFrame.join = wrap_pandas_method("join", pd.DataFrame.join, _capture_merge)
823
+ pd.DataFrame.join = wrap_join_with_lineage(pd.DataFrame.join)
942
824
 
943
825
  # === Index operations ===
944
826
  _originals["DataFrame.reset_index"] = pd.DataFrame.reset_index
@@ -956,11 +838,9 @@ def instrument_pandas():
956
838
  "sort_values", pd.DataFrame.sort_values, _capture_sort_values
957
839
  )
958
840
 
959
- # === __getitem__ ===
841
+ # === __getitem__ (using new filter capture for boolean indexing) ===
960
842
  _originals["DataFrame.__getitem__"] = pd.DataFrame.__getitem__
961
- pd.DataFrame.__getitem__ = wrap_pandas_method(
962
- "__getitem__", pd.DataFrame.__getitem__, _capture_getitem
963
- )
843
+ pd.DataFrame.__getitem__ = wrap_getitem_filter(pd.DataFrame.__getitem__)
964
844
 
965
845
  # === __setitem__ (column assignment) ===
966
846
  _originals["DataFrame.__setitem__"] = pd.DataFrame.__setitem__
@@ -993,23 +873,26 @@ def instrument_pandas():
993
873
  _originals["DataFrame.to_parquet"] = pd.DataFrame.to_parquet
994
874
  pd.DataFrame.to_parquet = _make_export_wrapper(pd.DataFrame.to_parquet)
995
875
 
996
- # === pd.concat ===
876
+ # === pd.concat (using new concat wrapper) ===
997
877
  _originals["pd.concat"] = pd.concat
878
+ pd.concat = wrap_concat_with_lineage(_originals["pd.concat"])
998
879
 
999
- def wrapped_concat(*args, **kwargs):
1000
- result = _originals["pd.concat"](*args, **kwargs)
1001
- ctx = get_context()
1002
- if ctx.enabled:
1003
- _capture_concat(args, kwargs, result, ctx)
1004
- return result
1005
-
1006
- pd.concat = wrapped_concat
880
+ # === Phase 6: Extended operation support ===
881
+ # Note: These modules handle their own original storage
882
+ instrument_indexers()
883
+ instrument_series()
884
+ instrument_apply_pipe()
1007
885
 
1008
886
 
1009
887
  def uninstrument_pandas():
1010
888
  """Restore original pandas methods."""
1011
889
  global _originals
1012
890
 
891
+ # Uninstrument Phase 6 modules first (reverse order)
892
+ uninstrument_apply_pipe()
893
+ uninstrument_series()
894
+ uninstrument_indexers()
895
+
1013
896
  for key, original in _originals.items():
1014
897
  parts = key.split(".")
1015
898
  if parts[0] == "pd":
@@ -0,0 +1,331 @@
1
+ # tracepipe/instrumentation/series_capture.py
2
+ """
3
+ Series method instrumentation for TracePipe.
4
+
5
+ Challenge: Series operations are often chained and may not be assigned back.
6
+ Solution: Track when Series is extracted, wrap common methods, capture on assignment.
7
+
8
+ Operations tracked:
9
+ | Pattern | Tracking | Completeness |
10
+ |-----------------------------------|-----------------------------|--------------|
11
+ | df['col'].fillna(val) | Method call + assignment | FULL |
12
+ | df['col'].replace(...) | Method call + assignment | FULL |
13
+ | df['col'].str.upper() | Method call + assignment | FULL |
14
+ | df['col'].dt.year | Method call + assignment | FULL |
15
+ | df['col'].apply(func) | Before/after diff | PARTIAL |
16
+ | df['col'] = series | Assignment diff | FULL |
17
+
18
+ Key insight: We track at ASSIGNMENT time, not method call time.
19
+ This handles arbitrary chains: df['col'] = df['other'].str.strip().str.upper()
20
+ """
21
+
22
+ import warnings
23
+ import weakref
24
+ from functools import wraps
25
+
26
+ import pandas as pd
27
+
28
+ from ..context import get_context
29
+ from ..core import ChangeType, CompletenessLevel
30
+ from ..safety import TracePipeWarning, get_caller_info
31
+
32
+
33
class TrackedSeries(pd.Series):
    """
    ``pd.Series`` subclass that remembers which DataFrame column it came from.

    The tracking attributes are declared in ``_metadata`` so that pandas
    treats them as instance metadata rather than as index labels:

    - ``_tp_source_df_ref``: weak reference to the source DataFrame (or None)
    - ``_tp_source_col``: name of the source column
    - ``_tp_source_rids``: row IDs associated with the source DataFrame
    - ``_tp_last_op``: name of the last tracked operation, if any

    Memory safety: the source DataFrame is only ever held through a weak
    reference, so a TrackedSeries never keeps its source alive on its own.
    """

    _metadata = ["_tp_source_df_ref", "_tp_source_col", "_tp_source_rids", "_tp_last_op"]

    @property
    def _constructor(self):
        # Same-dimension results of pandas operations stay TrackedSeries.
        return TrackedSeries

    @property
    def _constructor_expanddim(self):
        # Expanding to two dimensions yields a plain DataFrame.
        return pd.DataFrame

    @property
    def _tp_source_df(self):
        """Return the source DataFrame, or None if unset or already collected."""
        source_ref = getattr(self, "_tp_source_df_ref", None)
        return None if source_ref is None else source_ref()

    @_tp_source_df.setter
    def _tp_source_df(self, df):
        """Remember ``df`` through a weak reference (``None`` clears it)."""
        self._tp_source_df_ref = None if df is None else weakref.ref(df)
72
+
73
+
74
def wrap_series_extraction():
    """
    Wrap DataFrame.__getitem__ to return TrackedSeries for single column access.

    This allows us to track the origin of Series that may be modified and
    assigned back. The method that was installed before wrapping is saved on
    ``pd.DataFrame._tp_original_getitem_series`` so it can be restored later.

    Calling this function again while the wrapper is installed is a no-op.
    """
    # Idempotence guard: wrapping a second time would stack another wrapper
    # and overwrite the saved original with our own wrapper, after which the
    # real pandas method could no longer be restored.
    if hasattr(pd.DataFrame, "_tp_original_getitem_series"):
        return

    original_getitem = pd.DataFrame.__getitem__

    @wraps(original_getitem)
    def tracked_getitem(self, key):
        result = original_getitem(self, key)

        ctx = get_context()
        if not ctx.enabled:
            return result

        # Skip internal tracking operations to avoid recursion
        if ctx._filter_op_depth > 0:
            return result

        # Skip internal tracepipe columns
        if isinstance(key, str) and key.startswith("__tracepipe"):
            return result

        # Only wrap single-column Series access
        if isinstance(key, str) and isinstance(result, pd.Series):
            rids = ctx.row_manager.get_ids_array(self)
            if rids is not None:
                # Convert to TrackedSeries and stamp it with its origin.
                tracked = TrackedSeries(result)
                tracked._tp_source_df = self
                tracked._tp_source_col = key
                # Copy so later changes to the manager's array don't leak in.
                tracked._tp_source_rids = rids.copy()
                return tracked

        return result

    pd.DataFrame.__getitem__ = tracked_getitem
    pd.DataFrame._tp_original_getitem_series = original_getitem
113
+
114
+
115
def wrap_series_assignment():
    """
    Wrap DataFrame.__setitem__ to capture diffs when assigning Series.

    Handles:
    - df['col'] = series (where series may have been modified)
    - df['col'] = scalar (broadcast assignment)
    - df['col'] = array (direct assignment)

    A diff is only captured when tracking is enabled, the key is a string
    naming a watched column that already exists, and the DataFrame has row
    IDs. The method that was installed before wrapping is saved on
    ``pd.DataFrame._tp_original_setitem_series`` so it can be restored later.

    Calling this function again while the wrapper is installed is a no-op.
    """
    # Idempotence guard: wrapping a second time would stack another wrapper
    # and overwrite the saved original with our own wrapper, after which the
    # real pandas method could no longer be restored.
    if hasattr(pd.DataFrame, "_tp_original_setitem_series"):
        return

    original_setitem = pd.DataFrame.__setitem__

    @wraps(original_setitem)
    def tracked_setitem(self, key, value):
        ctx = get_context()

        # Capture before state for watched columns
        before_values = None
        if (
            ctx.enabled
            and isinstance(key, str)
            and key in ctx.watched_columns
            and key in self.columns
        ):
            rids = ctx.row_manager.get_ids_array(self)
            if rids is not None:
                # Snapshot copies: the assignment below may mutate in place.
                before_values = {
                    "rids": rids.copy(),
                    "values": self[key].values.copy(),
                }

        # Always run original
        original_setitem(self, key, value)

        if not ctx.enabled:
            return

        if before_values is None:
            return

        try:
            _capture_series_assignment(self, key, value, before_values, ctx)
        except Exception as e:
            # Capture is best-effort: only strict mode lets failures propagate.
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: Series assignment capture failed: {e}", TracePipeWarning)

    pd.DataFrame.__setitem__ = tracked_setitem
    pd.DataFrame._tp_original_setitem_series = original_setitem
163
+
164
+
165
def _capture_series_assignment(df, key, value, before_values, ctx):
    """
    Record a step plus per-row diffs for a column assignment.

    Args:
        df: DataFrame after the assignment has been applied.
        key: Name of the assigned column.
        value: The object that was assigned (used only to classify the step).
        before_values: Dict with "rids" (row IDs) and "values" (the column's
            values) captured before the assignment.
        ctx: Active TracePipe context providing ``store`` and ``current_stage``.
    """
    from ..utils.value_capture import values_equal

    step_store = ctx.store
    row_ids = before_values["rids"]
    previous = before_values["values"]
    current = df[key].values

    # Classify the step from the kind of object that was assigned.
    if isinstance(value, TrackedSeries):
        # Can trace back to source
        level, op_name = CompletenessLevel.FULL, f"Series.{_infer_series_operation(value)}"
    elif hasattr(value, "apply") or callable(value):
        level, op_name = CompletenessLevel.PARTIAL, "Series.transform"
    else:
        level, op_name = CompletenessLevel.FULL, "DataFrame[]="

    code_file, code_line = get_caller_info(skip_frames=4)
    step_id = step_store.append_step(
        operation=op_name,
        stage=ctx.current_stage,
        code_file=code_file,
        code_line=code_line,
        params={"column": key},
        input_shape=df.shape,
        output_shape=df.shape,
        completeness=level,
    )

    # One diff per row whose value actually changed.
    for row_id, before, after in zip(row_ids, previous, current):
        if values_equal(before, after):
            continue
        step_store.append_diff(
            step_id=step_id,
            row_id=int(row_id),
            col=key,
            old_val=before,
            new_val=after,
            change_type=ChangeType.MODIFIED,
        )
209
+
210
+
211
def _infer_series_operation(series: TrackedSeries) -> str:
    """
    Name the operation that produced ``series``.

    Returns the series' ``_tp_last_op`` when it is set and not None,
    otherwise the generic name "transform".
    """
    last_op = getattr(series, "_tp_last_op", None)
    return "transform" if last_op is None else last_op
220
+
221
+
222
# ============ STRING ACCESSOR WRAPPERS ============

# Original (unwrapped) string-accessor methods, keyed by method name.
# Filled by _wrap_str_method() and drained by uninstrument_series() when the
# originals are put back.
_original_str_methods = {}

# Weak-keyed map intended to associate accessors with their series without
# modifying the accessor itself.
# NOTE(review): not referenced anywhere else in this module - confirm whether
# another module uses it before removing.
_str_accessor_series_map = weakref.WeakKeyDictionary()
229
+
230
+
231
def wrap_str_accessor():
    """
    Install tracking wrappers on the pandas string accessor.

    Individual ``StringMethods`` methods are wrapped (rather than
    ``__init__``) because pandas doesn't allow adding new attributes to
    accessor instances. Method names that the installed pandas version does
    not provide are skipped.
    """
    from pandas.core.strings.accessor import StringMethods

    # Common string methods whose Series results should stay TrackedSeries.
    tracked_methods = (
        "lower",
        "upper",
        "strip",
        "lstrip",
        "rstrip",
        "replace",
        "slice",
        "split",
        "contains",
        "startswith",
        "endswith",
        "len",
        "extract",
        "findall",
        "cat",
        "get",
        "pad",
        "center",
        "ljust",
        "rjust",
        "zfill",
        "wrap",
        "title",
        "capitalize",
        "swapcase",
        "normalize",
    )
    for name in tracked_methods:
        if not hasattr(StringMethods, name):
            continue
        _wrap_str_method(StringMethods, name)
272
+
273
+
274
def _wrap_str_method(cls, method_name):
    """
    Replace ``cls.<method_name>`` with a wrapper that keeps TrackedSeries.

    The unwrapped method is stored in ``_original_str_methods`` under
    ``method_name``; a name that is already stored is left untouched.
    """
    if method_name in _original_str_methods:
        return  # Already wrapped

    unwrapped = getattr(cls, method_name)
    _original_str_methods[method_name] = unwrapped

    @wraps(unwrapped)
    def wrapped(self, *args, **kwargs):
        outcome = unwrapped(self, *args, **kwargs)
        # The accessor keeps the series it was created from in ``_orig``.
        parent = getattr(self, "_orig", None)
        if not (isinstance(outcome, pd.Series) and isinstance(parent, TrackedSeries)):
            return outcome

        tracked = TrackedSeries(outcome)
        # Copy the weakref itself (not the DataFrame) so no strong reference
        # to the source is created.
        for attr in ("_tp_source_df_ref", "_tp_source_col", "_tp_source_rids"):
            setattr(tracked, attr, getattr(parent, attr, None))
        tracked._tp_last_op = f"str.{method_name}"
        return tracked

    setattr(cls, method_name, wrapped)
300
+
301
+
302
def instrument_series():
    """
    Install all Series instrumentation.

    Runs, in order: column-extraction wrapping, column-assignment wrapping
    and string-accessor wrapping.
    """
    # Note: DateTime accessor (.dt) wrapping is not implemented.
    # Most datetime operations don't require cell-level tracking.
    for install in (wrap_series_extraction, wrap_series_assignment, wrap_str_accessor):
        install()
309
+
310
+
311
def uninstrument_series():
    """
    Restore original Series behavior.

    Puts back the DataFrame ``__getitem__`` / ``__setitem__`` methods saved
    at instrumentation time (when present) and then restores every wrapped
    string-accessor method.
    """
    frame_cls = pd.DataFrame
    saved_slots = (
        ("__getitem__", "_tp_original_getitem_series"),
        ("__setitem__", "_tp_original_setitem_series"),
    )
    for dunder, saved_attr in saved_slots:
        if hasattr(frame_cls, saved_attr):
            setattr(frame_cls, dunder, getattr(frame_cls, saved_attr))
            delattr(frame_cls, saved_attr)

    # Restore str methods
    if not _original_str_methods:
        return
    try:
        from pandas.core.strings.accessor import StringMethods

        for name, unwrapped in _original_str_methods.items():
            setattr(StringMethods, name, unwrapped)
        _original_str_methods.clear()
    except ImportError:
        # Best effort: leave the saved methods in place if the accessor
        # module cannot be imported.
        pass