tracepipe 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +117 -78
- tracepipe/api.py +168 -331
- tracepipe/context.py +21 -1
- tracepipe/contracts.py +473 -0
- tracepipe/convenience.py +812 -0
- tracepipe/core.py +174 -17
- tracepipe/debug.py +325 -0
- tracepipe/instrumentation/apply_capture.py +453 -0
- tracepipe/instrumentation/filter_capture.py +468 -0
- tracepipe/instrumentation/indexer_capture.py +813 -0
- tracepipe/instrumentation/merge_capture.py +434 -0
- tracepipe/instrumentation/pandas_inst.py +66 -183
- tracepipe/instrumentation/series_capture.py +331 -0
- tracepipe/safety.py +3 -3
- tracepipe/snapshot.py +420 -0
- tracepipe/storage/base.py +7 -3
- tracepipe/storage/lineage_store.py +190 -47
- tracepipe/storage/row_identity.py +366 -104
- tracepipe/value_provenance.py +301 -0
- tracepipe/visualization/html_export.py +22 -7
- tracepipe-0.3.0.dist-info/METADATA +575 -0
- tracepipe-0.3.0.dist-info/RECORD +29 -0
- tracepipe-0.2.0.dist-info/METADATA +0 -508
- tracepipe-0.2.0.dist-info/RECORD +0 -19
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/WHEEL +0 -0
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
# tracepipe/instrumentation/pandas_inst.py
|
|
2
2
|
"""
|
|
3
3
|
Pandas DataFrame instrumentation for row-level lineage tracking.
|
|
4
|
+
|
|
5
|
+
This module wires the filter and merge capture modules to pandas methods.
|
|
6
|
+
The actual capture logic is in:
|
|
7
|
+
- filter_capture.py: Mask-first filter capture with FULL/PARTIAL completeness
|
|
8
|
+
- merge_capture.py: Position column injection for merge provenance
|
|
4
9
|
"""
|
|
5
10
|
|
|
6
11
|
import warnings
|
|
@@ -15,56 +20,26 @@ from ..core import ChangeType, CompletenessLevel
|
|
|
15
20
|
from ..safety import (
|
|
16
21
|
TracePipeWarning,
|
|
17
22
|
get_caller_info,
|
|
18
|
-
wrap_pandas_filter_method,
|
|
19
23
|
wrap_pandas_method,
|
|
20
24
|
wrap_pandas_method_inplace,
|
|
21
25
|
)
|
|
22
26
|
from ..utils.value_capture import find_changed_indices_vectorized
|
|
27
|
+
from .apply_capture import instrument_apply_pipe, uninstrument_apply_pipe
|
|
28
|
+
|
|
29
|
+
# Import new capture modules
|
|
30
|
+
from .filter_capture import wrap_filter_method, wrap_getitem_filter
|
|
31
|
+
from .indexer_capture import instrument_indexers, uninstrument_indexers
|
|
32
|
+
from .merge_capture import (
|
|
33
|
+
wrap_concat_with_lineage,
|
|
34
|
+
wrap_join_with_lineage,
|
|
35
|
+
wrap_merge_with_lineage,
|
|
36
|
+
)
|
|
37
|
+
from .series_capture import instrument_series, uninstrument_series
|
|
23
38
|
|
|
24
39
|
# Store original methods for restore
|
|
25
40
|
_originals: dict[str, Any] = {}
|
|
26
41
|
|
|
27
42
|
|
|
28
|
-
# === FILTER CAPTURE ===
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def _capture_filter(
|
|
32
|
-
self: pd.DataFrame, args, kwargs, result, ctx: TracePipeContext, method_name: str
|
|
33
|
-
):
|
|
34
|
-
"""Capture lineage for filter operations (dropna, query, head, etc.)."""
|
|
35
|
-
if not isinstance(result, pd.DataFrame):
|
|
36
|
-
return
|
|
37
|
-
|
|
38
|
-
source_ids = ctx.row_manager.get_ids(self)
|
|
39
|
-
if source_ids is None:
|
|
40
|
-
# Auto-register if not tracked
|
|
41
|
-
ctx.row_manager.register(self)
|
|
42
|
-
source_ids = ctx.row_manager.get_ids(self)
|
|
43
|
-
if source_ids is None:
|
|
44
|
-
return
|
|
45
|
-
|
|
46
|
-
# Propagate IDs to result
|
|
47
|
-
ctx.row_manager.propagate(self, result)
|
|
48
|
-
|
|
49
|
-
# Find dropped rows (returns numpy array for performance)
|
|
50
|
-
dropped_ids = ctx.row_manager.get_dropped_ids(self, result)
|
|
51
|
-
|
|
52
|
-
if len(dropped_ids) > 0:
|
|
53
|
-
code_file, code_line = get_caller_info(skip_frames=2)
|
|
54
|
-
step_id = ctx.store.append_step(
|
|
55
|
-
operation=f"DataFrame.{method_name}",
|
|
56
|
-
stage=ctx.current_stage,
|
|
57
|
-
code_file=code_file,
|
|
58
|
-
code_line=code_line,
|
|
59
|
-
params=_safe_params(kwargs),
|
|
60
|
-
input_shape=self.shape,
|
|
61
|
-
output_shape=result.shape,
|
|
62
|
-
)
|
|
63
|
-
|
|
64
|
-
# Bulk record all drops at once (10-50x faster than loop)
|
|
65
|
-
ctx.store.append_bulk_drops(step_id, dropped_ids)
|
|
66
|
-
|
|
67
|
-
|
|
68
43
|
# === TRANSFORM CAPTURE ===
|
|
69
44
|
|
|
70
45
|
|
|
@@ -299,56 +274,6 @@ def _capture_agg(self, args, kwargs, result, ctx: TracePipeContext, method_name:
|
|
|
299
274
|
ctx.row_manager.register(result)
|
|
300
275
|
|
|
301
276
|
|
|
302
|
-
# === MERGE/CONCAT (UNKNOWN - OUT OF SCOPE) ===
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
def _capture_merge(
|
|
306
|
-
self: pd.DataFrame, args, kwargs, result, ctx: TracePipeContext, method_name: str
|
|
307
|
-
):
|
|
308
|
-
"""Mark merge as UNKNOWN completeness and reset lineage."""
|
|
309
|
-
code_file, code_line = get_caller_info(skip_frames=2)
|
|
310
|
-
ctx.store.append_step(
|
|
311
|
-
operation=f"DataFrame.{method_name}",
|
|
312
|
-
stage=ctx.current_stage,
|
|
313
|
-
code_file=code_file,
|
|
314
|
-
code_line=code_line,
|
|
315
|
-
params={"how": kwargs.get("how", "inner")},
|
|
316
|
-
input_shape=self.shape,
|
|
317
|
-
output_shape=result.shape if hasattr(result, "shape") else None,
|
|
318
|
-
completeness=CompletenessLevel.UNKNOWN,
|
|
319
|
-
)
|
|
320
|
-
|
|
321
|
-
warnings.warn(
|
|
322
|
-
f"TracePipe: {method_name}() resets row lineage. "
|
|
323
|
-
f"Rows in result cannot be traced back to source rows.",
|
|
324
|
-
TracePipeWarning,
|
|
325
|
-
)
|
|
326
|
-
|
|
327
|
-
# Register result with NEW IDs
|
|
328
|
-
if isinstance(result, pd.DataFrame):
|
|
329
|
-
ctx.row_manager.register(result)
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
def _capture_concat(args, kwargs, result, ctx: TracePipeContext):
|
|
333
|
-
"""Capture pd.concat (module-level function)."""
|
|
334
|
-
code_file, code_line = get_caller_info(skip_frames=2)
|
|
335
|
-
ctx.store.append_step(
|
|
336
|
-
operation="pd.concat",
|
|
337
|
-
stage=ctx.current_stage,
|
|
338
|
-
code_file=code_file,
|
|
339
|
-
code_line=code_line,
|
|
340
|
-
params={"axis": kwargs.get("axis", 0)},
|
|
341
|
-
input_shape=None,
|
|
342
|
-
output_shape=result.shape if hasattr(result, "shape") else None,
|
|
343
|
-
completeness=CompletenessLevel.UNKNOWN,
|
|
344
|
-
)
|
|
345
|
-
|
|
346
|
-
warnings.warn("TracePipe: pd.concat() resets row lineage.", TracePipeWarning)
|
|
347
|
-
|
|
348
|
-
if isinstance(result, pd.DataFrame):
|
|
349
|
-
ctx.row_manager.register(result)
|
|
350
|
-
|
|
351
|
-
|
|
352
277
|
# === INDEX OPERATIONS ===
|
|
353
278
|
|
|
354
279
|
|
|
@@ -408,21 +333,21 @@ def _capture_sort_values(
|
|
|
408
333
|
# Record reorder for each row
|
|
409
334
|
result_ids = ctx.row_manager.get_ids(result)
|
|
410
335
|
if result_ids is not None:
|
|
336
|
+
# Pre-compute position lookup (O(n) instead of O(n²))
|
|
337
|
+
source_idx_list = list(source_ids.index) if hasattr(source_ids, "index") else []
|
|
338
|
+
source_pos_map = {idx: pos for pos, idx in enumerate(source_idx_list)}
|
|
339
|
+
|
|
411
340
|
for new_pos, (idx, row_id) in enumerate(result_ids.items()):
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
change_type=ChangeType.REORDERED,
|
|
423
|
-
)
|
|
424
|
-
except (ValueError, KeyError):
|
|
425
|
-
pass
|
|
341
|
+
old_pos = source_pos_map.get(idx)
|
|
342
|
+
if old_pos is not None and old_pos != new_pos:
|
|
343
|
+
ctx.store.append_diff(
|
|
344
|
+
step_id=step_id,
|
|
345
|
+
row_id=int(row_id),
|
|
346
|
+
col="__position__",
|
|
347
|
+
old_val=old_pos,
|
|
348
|
+
new_val=new_pos,
|
|
349
|
+
change_type=ChangeType.REORDERED,
|
|
350
|
+
)
|
|
426
351
|
|
|
427
352
|
|
|
428
353
|
# === COPY CAPTURE ===
|
|
@@ -458,7 +383,11 @@ def _capture_drop(
|
|
|
458
383
|
|
|
459
384
|
- Row drops (axis=0): Track as filter operation
|
|
460
385
|
- Column drops (axis=1): Track as schema change (step metadata only)
|
|
386
|
+
- Handles inplace=True (result is self, passed by wrapper)
|
|
461
387
|
"""
|
|
388
|
+
# Handle inplace: result is passed as self by the inplace wrapper
|
|
389
|
+
if result is None:
|
|
390
|
+
return
|
|
462
391
|
if not isinstance(result, pd.DataFrame):
|
|
463
392
|
return
|
|
464
393
|
|
|
@@ -509,60 +438,6 @@ def _capture_drop(
|
|
|
509
438
|
)
|
|
510
439
|
|
|
511
440
|
|
|
512
|
-
# === __getitem__ DISPATCH ===
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
def _capture_getitem(
|
|
516
|
-
self: pd.DataFrame, args, kwargs, result, ctx: TracePipeContext, method_name: str
|
|
517
|
-
):
|
|
518
|
-
"""
|
|
519
|
-
Dispatch __getitem__ based on key type.
|
|
520
|
-
|
|
521
|
-
- df['col'] -> Series (ignore)
|
|
522
|
-
- df[['a','b']] -> DataFrame column select (propagate)
|
|
523
|
-
- df[mask] -> DataFrame row filter (track drops)
|
|
524
|
-
- df[slice] -> DataFrame row slice (track drops)
|
|
525
|
-
"""
|
|
526
|
-
if len(args) != 1:
|
|
527
|
-
return
|
|
528
|
-
|
|
529
|
-
key = args[0]
|
|
530
|
-
|
|
531
|
-
# Series result - column access, not row filter
|
|
532
|
-
if isinstance(result, pd.Series):
|
|
533
|
-
return
|
|
534
|
-
|
|
535
|
-
if not isinstance(result, pd.DataFrame):
|
|
536
|
-
return
|
|
537
|
-
|
|
538
|
-
# Boolean mask - row filter
|
|
539
|
-
if isinstance(key, (pd.Series, np.ndarray)) and getattr(key, "dtype", None) is np.dtype("bool"):
|
|
540
|
-
# Skip if we're inside a named filter op (e.g., drop_duplicates)
|
|
541
|
-
# to avoid double-counting drops
|
|
542
|
-
if ctx._filter_op_depth > 0:
|
|
543
|
-
ctx.row_manager.propagate(self, result)
|
|
544
|
-
return
|
|
545
|
-
_capture_filter(self, args, kwargs, result, ctx, "__getitem__[mask]")
|
|
546
|
-
return
|
|
547
|
-
|
|
548
|
-
# List of columns - column selection
|
|
549
|
-
if isinstance(key, list):
|
|
550
|
-
ctx.row_manager.propagate(self, result)
|
|
551
|
-
return
|
|
552
|
-
|
|
553
|
-
# Slice - row selection
|
|
554
|
-
if isinstance(key, slice):
|
|
555
|
-
# Skip if we're inside a named filter op
|
|
556
|
-
if ctx._filter_op_depth > 0:
|
|
557
|
-
ctx.row_manager.propagate(self, result)
|
|
558
|
-
return
|
|
559
|
-
_capture_filter(self, args, kwargs, result, ctx, "__getitem__[slice]")
|
|
560
|
-
return
|
|
561
|
-
|
|
562
|
-
# Default: propagate
|
|
563
|
-
ctx.row_manager.propagate(self, result)
|
|
564
|
-
|
|
565
|
-
|
|
566
441
|
# === __setitem__ CAPTURE ===
|
|
567
442
|
|
|
568
443
|
|
|
@@ -751,7 +626,8 @@ def _wrap_dataframe_init(original):
|
|
|
751
626
|
original(self, *args, **kwargs)
|
|
752
627
|
|
|
753
628
|
ctx = get_context()
|
|
754
|
-
|
|
629
|
+
# Skip registration when inside filter/export operation (prevents re-adding hidden column)
|
|
630
|
+
if ctx.enabled and ctx._filter_op_depth == 0:
|
|
755
631
|
if ctx.row_manager.get_ids(self) is None:
|
|
756
632
|
ctx.row_manager.register(self)
|
|
757
633
|
|
|
@@ -768,7 +644,12 @@ def _make_export_wrapper(original):
|
|
|
768
644
|
def wrapper(self, *args, **kwargs):
|
|
769
645
|
ctx = get_context()
|
|
770
646
|
if ctx.enabled:
|
|
771
|
-
|
|
647
|
+
# Increment filter depth to prevent tracking during column strip
|
|
648
|
+
ctx._filter_op_depth += 1
|
|
649
|
+
try:
|
|
650
|
+
clean_df = ctx.row_manager.strip_hidden_column(self)
|
|
651
|
+
finally:
|
|
652
|
+
ctx._filter_op_depth -= 1
|
|
772
653
|
return original(clean_df, *args, **kwargs)
|
|
773
654
|
return original(self, *args, **kwargs)
|
|
774
655
|
|
|
@@ -880,15 +761,14 @@ def instrument_pandas():
|
|
|
880
761
|
# Already instrumented
|
|
881
762
|
return
|
|
882
763
|
|
|
883
|
-
# === DataFrame filter methods ===
|
|
884
|
-
#
|
|
885
|
-
# methods like drop_duplicates internally call __getitem__
|
|
764
|
+
# === DataFrame filter methods (using new mask-first capture) ===
|
|
765
|
+
# wrap_filter_method provides FULL/PARTIAL completeness tracking
|
|
886
766
|
filter_methods = ["dropna", "drop_duplicates", "query", "head", "tail", "sample"]
|
|
887
767
|
for method_name in filter_methods:
|
|
888
768
|
if hasattr(pd.DataFrame, method_name):
|
|
889
769
|
original = getattr(pd.DataFrame, method_name)
|
|
890
770
|
_originals[f"DataFrame.{method_name}"] = original
|
|
891
|
-
wrapped =
|
|
771
|
+
wrapped = wrap_filter_method(method_name, original)
|
|
892
772
|
setattr(pd.DataFrame, method_name, wrapped)
|
|
893
773
|
|
|
894
774
|
# === DataFrame transform methods (with inplace support) ===
|
|
@@ -908,9 +788,9 @@ def instrument_pandas():
|
|
|
908
788
|
_originals["DataFrame.copy"] = pd.DataFrame.copy
|
|
909
789
|
pd.DataFrame.copy = wrap_pandas_method("copy", pd.DataFrame.copy, _capture_copy)
|
|
910
790
|
|
|
911
|
-
# === drop (row/column removal) ===
|
|
791
|
+
# === drop (row/column removal, supports inplace=True) ===
|
|
912
792
|
_originals["DataFrame.drop"] = pd.DataFrame.drop
|
|
913
|
-
pd.DataFrame.drop =
|
|
793
|
+
pd.DataFrame.drop = wrap_pandas_method_inplace("drop", pd.DataFrame.drop, _capture_drop)
|
|
914
794
|
|
|
915
795
|
# === apply/pipe ===
|
|
916
796
|
_originals["DataFrame.apply"] = pd.DataFrame.apply
|
|
@@ -933,12 +813,14 @@ def instrument_pandas():
|
|
|
933
813
|
wrapped = wrap_pandas_method(agg_method, original, _capture_agg)
|
|
934
814
|
setattr(DataFrameGroupBy, agg_method, wrapped)
|
|
935
815
|
|
|
936
|
-
# === merge ===
|
|
816
|
+
# === merge (using new position column injection capture) ===
|
|
817
|
+
# wrap_merge_with_lineage provides full provenance in DEBUG mode
|
|
937
818
|
_originals["DataFrame.merge"] = pd.DataFrame.merge
|
|
938
|
-
pd.DataFrame.merge =
|
|
819
|
+
pd.DataFrame.merge = wrap_merge_with_lineage(pd.DataFrame.merge)
|
|
939
820
|
|
|
821
|
+
# === join (using new join wrapper) ===
|
|
940
822
|
_originals["DataFrame.join"] = pd.DataFrame.join
|
|
941
|
-
pd.DataFrame.join =
|
|
823
|
+
pd.DataFrame.join = wrap_join_with_lineage(pd.DataFrame.join)
|
|
942
824
|
|
|
943
825
|
# === Index operations ===
|
|
944
826
|
_originals["DataFrame.reset_index"] = pd.DataFrame.reset_index
|
|
@@ -956,11 +838,9 @@ def instrument_pandas():
|
|
|
956
838
|
"sort_values", pd.DataFrame.sort_values, _capture_sort_values
|
|
957
839
|
)
|
|
958
840
|
|
|
959
|
-
# === __getitem__ ===
|
|
841
|
+
# === __getitem__ (using new filter capture for boolean indexing) ===
|
|
960
842
|
_originals["DataFrame.__getitem__"] = pd.DataFrame.__getitem__
|
|
961
|
-
pd.DataFrame.__getitem__ =
|
|
962
|
-
"__getitem__", pd.DataFrame.__getitem__, _capture_getitem
|
|
963
|
-
)
|
|
843
|
+
pd.DataFrame.__getitem__ = wrap_getitem_filter(pd.DataFrame.__getitem__)
|
|
964
844
|
|
|
965
845
|
# === __setitem__ (column assignment) ===
|
|
966
846
|
_originals["DataFrame.__setitem__"] = pd.DataFrame.__setitem__
|
|
@@ -993,23 +873,26 @@ def instrument_pandas():
|
|
|
993
873
|
_originals["DataFrame.to_parquet"] = pd.DataFrame.to_parquet
|
|
994
874
|
pd.DataFrame.to_parquet = _make_export_wrapper(pd.DataFrame.to_parquet)
|
|
995
875
|
|
|
996
|
-
# === pd.concat ===
|
|
876
|
+
# === pd.concat (using new concat wrapper) ===
|
|
997
877
|
_originals["pd.concat"] = pd.concat
|
|
878
|
+
pd.concat = wrap_concat_with_lineage(_originals["pd.concat"])
|
|
998
879
|
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
return result
|
|
1005
|
-
|
|
1006
|
-
pd.concat = wrapped_concat
|
|
880
|
+
# === Phase 6: Extended operation support ===
|
|
881
|
+
# Note: These modules handle their own original storage
|
|
882
|
+
instrument_indexers()
|
|
883
|
+
instrument_series()
|
|
884
|
+
instrument_apply_pipe()
|
|
1007
885
|
|
|
1008
886
|
|
|
1009
887
|
def uninstrument_pandas():
|
|
1010
888
|
"""Restore original pandas methods."""
|
|
1011
889
|
global _originals
|
|
1012
890
|
|
|
891
|
+
# Uninstrument Phase 6 modules first (reverse order)
|
|
892
|
+
uninstrument_apply_pipe()
|
|
893
|
+
uninstrument_series()
|
|
894
|
+
uninstrument_indexers()
|
|
895
|
+
|
|
1013
896
|
for key, original in _originals.items():
|
|
1014
897
|
parts = key.split(".")
|
|
1015
898
|
if parts[0] == "pd":
|
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
# tracepipe/instrumentation/series_capture.py
|
|
2
|
+
"""
|
|
3
|
+
Series method instrumentation for TracePipe.
|
|
4
|
+
|
|
5
|
+
Challenge: Series operations are often chained and may not be assigned back.
|
|
6
|
+
Solution: Track when Series is extracted, wrap common methods, capture on assignment.
|
|
7
|
+
|
|
8
|
+
Operations tracked:
|
|
9
|
+
| Pattern | Tracking | Completeness |
|
|
10
|
+
|-----------------------------------|-----------------------------|--------------|
|
|
11
|
+
| df['col'].fillna(val) | Method call + assignment | FULL |
|
|
12
|
+
| df['col'].replace(...) | Method call + assignment | FULL |
|
|
13
|
+
| df['col'].str.upper() | Method call + assignment | FULL |
|
|
14
|
+
| df['col'].dt.year | Assignment diff (.dt not wrapped) | PARTIAL |
|
|
15
|
+
| df['col'].apply(func) | Before/after diff | PARTIAL |
|
|
16
|
+
| df['col'] = series | Assignment diff | FULL |
|
|
17
|
+
|
|
18
|
+
Key insight: We track at ASSIGNMENT time, not method call time.
|
|
19
|
+
This handles arbitrary chains: df['col'] = df['other'].str.strip().str.upper()
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import warnings
|
|
23
|
+
import weakref
|
|
24
|
+
from functools import wraps
|
|
25
|
+
|
|
26
|
+
import pandas as pd
|
|
27
|
+
|
|
28
|
+
from ..context import get_context
|
|
29
|
+
from ..core import ChangeType, CompletenessLevel
|
|
30
|
+
from ..safety import TracePipeWarning, get_caller_info
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class TrackedSeries(pd.Series):
    """
    Series subclass that remembers the DataFrame column it was extracted from.

    The extra attributes let the ``__setitem__`` wrapper recognise a Series
    that originated from a tracked DataFrame when it is assigned back.

    Attributes (all propagated by pandas through ``_metadata``):
        _tp_source_df_ref: ``weakref.ref`` to the source DataFrame, or None.
        _tp_source_col: name of the column the Series was read from.
        _tp_source_rids: copy of the source DataFrame's row-id array.
        _tp_last_op: label of the last wrapped operation (e.g. ``"str.upper"``).

    Memory safety:
        The source DataFrame is only held through a weak reference, so it can
        be garbage collected independently of any Series derived from it.
    """

    # Extend (never replace) the base list. pandas keeps the Series name in
    # ``pd.Series._metadata`` and ``__finalize__`` only propagates attributes
    # listed here; dropping the base entries makes copy()/fillna()/pickling
    # lose the Series name.
    _metadata = [
        *pd.Series._metadata,
        "_tp_source_df_ref",
        "_tp_source_col",
        "_tp_source_rids",
        "_tp_last_op",
    ]

    @property
    def _constructor(self):
        """Keep same-dimension results as TrackedSeries."""
        return TrackedSeries

    @property
    def _constructor_expanddim(self):
        """Expanding to two dimensions yields a plain DataFrame."""
        return pd.DataFrame

    @property
    def _tp_source_df(self):
        """Return the source DataFrame, or None if unset or already collected."""
        ref = getattr(self, "_tp_source_df_ref", None)
        return ref() if ref is not None else None

    @_tp_source_df.setter
    def _tp_source_df(self, df):
        """Store the source DataFrame as a weak reference (None clears it)."""
        self._tp_source_df_ref = weakref.ref(df) if df is not None else None

    def __getstate__(self):
        """
        Return the pickle state without the weak reference.

        ``weakref.ref`` objects cannot be pickled, and the referent would not
        be meaningful in another process anyway, so the link to the source
        DataFrame is dropped; the remaining tracking metadata is preserved.
        """
        state = super().__getstate__()
        state["_tp_source_df_ref"] = None
        return state
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def wrap_series_extraction():
    """
    Wrap DataFrame.__getitem__ to return TrackedSeries for single column access.

    The previous ``__getitem__`` is stashed on ``pd.DataFrame`` as
    ``_tp_original_getitem_series`` so ``uninstrument_series`` can restore it.

    Calling this function while the wrapper is already installed is a no-op.
    Without that guard a second call would record the wrapper itself as the
    "original", and the real pandas method could never be restored.
    """
    if hasattr(pd.DataFrame, "_tp_original_getitem_series"):
        return  # Already wrapped

    original_getitem = pd.DataFrame.__getitem__

    @wraps(original_getitem)
    def tracked_getitem(self, key):
        result = original_getitem(self, key)

        ctx = get_context()
        if not ctx.enabled:
            return result

        # Skip internal tracking operations to avoid recursion
        if ctx._filter_op_depth > 0:
            return result

        # Only string keys are considered; anything else passes through as-is.
        if not isinstance(key, str):
            return result

        # Skip internal tracepipe columns
        if key.startswith("__tracepipe"):
            return result

        # Only wrap single-column Series access
        if isinstance(result, pd.Series):
            rids = ctx.row_manager.get_ids_array(self)
            if rids is not None:
                tracked = TrackedSeries(result)
                tracked._tp_source_df = self
                tracked._tp_source_col = key
                # Copy so the Series keeps its own snapshot of the row ids.
                tracked._tp_source_rids = rids.copy()
                return tracked

        return result

    pd.DataFrame.__getitem__ = tracked_getitem
    pd.DataFrame._tp_original_getitem_series = original_getitem
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def wrap_series_assignment():
    """
    Wrap DataFrame.__setitem__ to capture diffs when assigning to a column.

    A before-snapshot is taken only when tracking is enabled, the key is a
    string, the column is in ``ctx.watched_columns`` and it already exists in
    the DataFrame. After the original assignment has run, the snapshot is
    handed to ``_capture_series_assignment``.

    Capture failures are re-raised when ``ctx.config.strict_mode`` is set and
    otherwise reported as a ``TracePipeWarning``; the assignment itself has
    already happened by then.

    The previous ``__setitem__`` is stashed on ``pd.DataFrame`` as
    ``_tp_original_setitem_series``. Calling this function while the wrapper
    is already installed is a no-op, so the stashed original is never
    overwritten with the wrapper itself.
    """
    if hasattr(pd.DataFrame, "_tp_original_setitem_series"):
        return  # Already wrapped

    original_setitem = pd.DataFrame.__setitem__

    @wraps(original_setitem)
    def tracked_setitem(self, key, value):
        ctx = get_context()

        # Capture before state for watched columns
        before_values = None
        if (
            ctx.enabled
            and isinstance(key, str)
            and key in ctx.watched_columns
            and key in self.columns
        ):
            rids = ctx.row_manager.get_ids_array(self)
            if rids is not None:
                before_values = {
                    "rids": rids.copy(),
                    "values": self[key].values.copy(),
                }

        # Always run original
        original_setitem(self, key, value)

        # Nothing to record when tracking is off or no snapshot was taken.
        if not ctx.enabled or before_values is None:
            return

        try:
            _capture_series_assignment(self, key, value, before_values, ctx)
        except Exception as e:
            if ctx.config.strict_mode:
                raise
            warnings.warn(f"TracePipe: Series assignment capture failed: {e}", TracePipeWarning)

    pd.DataFrame.__setitem__ = tracked_setitem
    pd.DataFrame._tp_original_setitem_series = original_setitem
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _capture_series_assignment(df, key, value, before_values, ctx):
    """
    Record one step plus per-row diffs for a column assignment.

    Args:
        df: DataFrame after the assignment has been applied.
        key: Name of the assigned column.
        value: Object that was assigned; only used to classify the step.
        before_values: Dict with ``"rids"`` (row ids) and ``"values"``
            (column values) snapshotted before the assignment.
        ctx: Active context providing ``store`` and ``current_stage``.

    A step is appended even when no value changed; a MODIFIED diff is appended
    for every row whose old and new values differ according to ``values_equal``.
    """
    from ..utils.value_capture import values_equal

    lineage_store = ctx.store
    row_ids = before_values["rids"]
    previous = before_values["values"]
    current = df[key].values

    # Classify the step from the kind of object that was assigned.
    if isinstance(value, TrackedSeries):
        # Can trace back to source
        level, op_name = CompletenessLevel.FULL, f"Series.{_infer_series_operation(value)}"
    elif hasattr(value, "apply") or callable(value):
        level, op_name = CompletenessLevel.PARTIAL, "Series.transform"
    else:
        level, op_name = CompletenessLevel.FULL, "DataFrame[]="

    # Must be called directly from this frame: skip_frames counts stack frames.
    code_file, code_line = get_caller_info(skip_frames=4)
    shape = df.shape
    step_id = lineage_store.append_step(
        operation=op_name,
        stage=ctx.current_stage,
        code_file=code_file,
        code_line=code_line,
        params={"column": key},
        input_shape=shape,
        output_shape=shape,
        completeness=level,
    )

    # Track diffs for changed values
    for rid, before, after in zip(row_ids, previous, current):
        if values_equal(before, after):
            continue
        lineage_store.append_diff(
            step_id=step_id,
            row_id=int(rid),
            col=key,
            old_val=before,
            new_val=after,
            change_type=ChangeType.MODIFIED,
        )
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _infer_series_operation(series: TrackedSeries) -> str:
    """
    Return the label of the operation that produced ``series``.

    Best effort: the label is read from ``series._tp_last_op``; when that
    attribute is missing or None the generic name ``"transform"`` is returned.
    """
    last_op = getattr(series, "_tp_last_op", None)
    return "transform" if last_op is None else last_op
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# ============ STRING ACCESSOR WRAPPERS ============

# Weak-keyed mapping intended for tracking series references without
# modifying the accessor object.
# NOTE(review): not referenced by the wrappers visible in this module - confirm
# whether it is still needed.
_str_accessor_series_map: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()

# Original string-accessor methods keyed by method name, kept at module level
# so they can be restored when instrumentation is removed.
_original_str_methods: dict = {}
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def wrap_str_accessor():
    """
    Wrap StringMethods to track operations.

    The individual methods are wrapped rather than ``__init__`` since pandas
    doesn't allow adding new attributes to accessor instances.

    ``StringMethods`` lives in the private ``pandas.core`` namespace, whose
    layout is not guaranteed to be stable. If the import fails, a
    ``TracePipeWarning`` is emitted and ``.str`` tracking is skipped instead
    of aborting the whole instrumentation; this mirrors the best-effort
    import handling in ``uninstrument_series``.
    """
    try:
        from pandas.core.strings.accessor import StringMethods
    except ImportError:
        warnings.warn(
            "TracePipe: pandas StringMethods accessor not found; "
            ".str operations will not be tracked.",
            TracePipeWarning,
        )
        return

    # Common string methods whose Series results should stay TrackedSeries.
    tracked_methods = (
        "lower",
        "upper",
        "strip",
        "lstrip",
        "rstrip",
        "replace",
        "slice",
        "split",
        "contains",
        "startswith",
        "endswith",
        "len",
        "extract",
        "findall",
        "cat",
        "get",
        "pad",
        "center",
        "ljust",
        "rjust",
        "zfill",
        "wrap",
        "title",
        "capitalize",
        "swapcase",
        "normalize",
    )
    for method_name in tracked_methods:
        # Some methods may not exist in every pandas version.
        if hasattr(StringMethods, method_name):
            _wrap_str_method(StringMethods, method_name)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _wrap_str_method(cls, method_name):
    """
    Replace ``cls.<method_name>`` with a wrapper that preserves TrackedSeries.

    The original callable is remembered in ``_original_str_methods`` (keyed by
    method name); a method that is already recorded there is left untouched.
    When the wrapped method returns a Series and the accessor's ``_orig`` is a
    TrackedSeries, the result is re-wrapped as a TrackedSeries carrying the
    source metadata and ``_tp_last_op`` set to ``"str.<method_name>"``.
    """
    if method_name in _original_str_methods:
        return  # Already wrapped

    original = getattr(cls, method_name)
    _original_str_methods[method_name] = original
    op_label = f"str.{method_name}"

    @wraps(original)
    def wrapped(self, *args, **kwargs):
        result = original(self, *args, **kwargs)
        # The accessor keeps the Series it was created from in ``_orig``.
        source = getattr(self, "_orig", None)
        if not (isinstance(result, pd.Series) and isinstance(source, TrackedSeries)):
            return result

        tracked = TrackedSeries(result)
        # The weakref itself is copied, so no strong reference to the source
        # DataFrame is created.
        for attr in ("_tp_source_df_ref", "_tp_source_col", "_tp_source_rids"):
            setattr(tracked, attr, getattr(source, attr, None))
        tracked._tp_last_op = op_label
        return tracked

    setattr(cls, method_name, wrapped)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def instrument_series():
    """Install the Series extraction, assignment and ``.str`` wrappers, in that order."""
    installers = (
        wrap_series_extraction,
        wrap_series_assignment,
        wrap_str_accessor,
    )
    for install in installers:
        install()
    # Note: DateTime accessor (.dt) wrapping is not implemented.
    # Most datetime operations don't require cell-level tracking.
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def uninstrument_series():
    """
    Restore original Series behavior.

    Puts back the ``__getitem__`` / ``__setitem__`` implementations stashed on
    ``pd.DataFrame`` (when present) and removes the stash attributes, then
    restores every string-accessor method recorded in ``_original_str_methods``
    and empties that record.
    """
    frame_cls = pd.DataFrame
    stashed = (
        ("__getitem__", "_tp_original_getitem_series"),
        ("__setitem__", "_tp_original_setitem_series"),
    )
    for dunder, backup_attr in stashed:
        if hasattr(frame_cls, backup_attr):
            setattr(frame_cls, dunder, getattr(frame_cls, backup_attr))
            delattr(frame_cls, backup_attr)

    # Restore str methods
    if not _original_str_methods:
        return
    try:
        from pandas.core.strings.accessor import StringMethods

        for method_name, original in _original_str_methods.items():
            setattr(StringMethods, method_name, original)
        _original_str_methods.clear()
    except ImportError:
        pass
|