tracepipe 0.3.5__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracepipe/__init__.py +1 -1
- tracepipe/convenience.py +130 -7
- tracepipe/core.py +79 -0
- tracepipe/instrumentation/filter_capture.py +103 -1
- tracepipe/instrumentation/merge_capture.py +169 -23
- tracepipe/storage/lineage_store.py +92 -7
- {tracepipe-0.3.5.dist-info → tracepipe-0.4.1.dist-info}/METADATA +6 -9
- {tracepipe-0.3.5.dist-info → tracepipe-0.4.1.dist-info}/RECORD +10 -10
- {tracepipe-0.3.5.dist-info → tracepipe-0.4.1.dist-info}/WHEEL +0 -0
- {tracepipe-0.3.5.dist-info → tracepipe-0.4.1.dist-info}/licenses/LICENSE +0 -0
tracepipe/__init__.py
CHANGED
tracepipe/convenience.py
CHANGED
@@ -54,6 +54,12 @@ class CheckResult:

     Separates FACTS (observed, high confidence) from HEURISTICS (inferred).
     .ok is True only if there are no FACT-level warnings.
+
+    Key properties for quick access:
+        .passed - Alias for .ok (common naming convention)
+        .retention - Row retention rate (0.0-1.0)
+        .n_dropped - Total rows dropped
+        .drops_by_op - Drops broken down by operation
     """

     ok: bool

@@ -61,6 +67,37 @@
     facts: dict[str, Any]
     suggestions: list[str]
     mode: str
+    # Internal: store drops_by_op so we don't need to recompute
+    _drops_by_op: dict[str, int] = field(default_factory=dict)
+
+    # === CONVENIENCE PROPERTIES ===
+
+    @property
+    def passed(self) -> bool:
+        """Alias for .ok (matches common naming convention)."""
+        return self.ok
+
+    @property
+    def retention(self) -> float | None:
+        """Row retention rate (0.0-1.0), or None if not computed."""
+        return self.facts.get("retention_rate")
+
+    @property
+    def n_dropped(self) -> int:
+        """Total number of rows dropped."""
+        return self.facts.get("rows_dropped", 0)
+
+    @property
+    def drops_by_op(self) -> dict[str, int]:
+        """Drops broken down by operation name."""
+        return self._drops_by_op
+
+    @property
+    def n_steps(self) -> int:
+        """Total pipeline steps recorded."""
+        return self.facts.get("total_steps", 0)
+
+    # === EXISTING PROPERTIES ===

     @property
     def has_warnings(self) -> bool:
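The new CheckResult properties are easiest to see in use. Below is a minimal usage sketch, assuming the package's tp.enable / tp.check entry points and an invented toy pipeline; only the property names come from the diff above.

import pandas as pd
import tracepipe as tp  # assumed import alias

tp.enable(mode="debug")

df = pd.DataFrame({"id": [1, 2, 2, 3], "value": [10.0, None, 5.0, 7.0]})
df = df.dropna(subset=["value"]).drop_duplicates(subset=["id"])

report = tp.check(df)              # hypothetical call returning a CheckResult
print(report.passed)               # alias for report.ok
print(report.retention)            # facts["retention_rate"], or None if not computed
print(report.n_dropped)            # facts["rows_dropped"], defaults to 0
print(report.drops_by_op)          # e.g. {"dropna": 1, "drop_duplicates": 1}
print(report.to_dict()["n_steps"]) # the same values are exported by to_dict()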
@@ -115,7 +152,12 @@ class CheckResult:
         """Export to dictionary."""
         return {
             "ok": self.ok,
+            "passed": self.passed,
             "mode": self.mode,
+            "retention": self.retention,
+            "n_dropped": self.n_dropped,
+            "n_steps": self.n_steps,
+            "drops_by_op": self.drops_by_op,
             "facts": self.facts,
             "suggestions": self.suggestions,
             "warnings": [

@@ -147,6 +189,10 @@ class TraceResult:

     Answers: "What happened to this row?"
     Events are in CHRONOLOGICAL order (oldest->newest).
+
+    Key attributes:
+        origin: Where this row came from (concat, merge, or original)
+        representative: If dropped by dedup, which row was kept instead
     """

     row_id: int
@@ -158,22 +204,70 @@
     # Mode enforcement
     supported: bool = True
     unsupported_reason: str | None = None
+    # v0.4+ provenance
+    concat_origin: dict[str, Any] | None = None
+    dedup_representative: dict[str, Any] | None = None

     @property
     def n_events(self) -> int:
         return len(self.events)

+    @property
+    def origin(self) -> dict[str, Any] | None:
+        """
+        Unified origin info: where did this row come from?
+
+        Returns dict with 'type' key:
+        - {"type": "concat", "source_df": 1, "step_id": 5}
+        - {"type": "merge", "left_parent": 10, "right_parent": 20, "step_id": 3}
+        - None if original row (not from concat/merge)
+        """
+        if self.concat_origin:
+            return {
+                "type": "concat",
+                "source_df": self.concat_origin.get("source_index"),
+                "step_id": self.concat_origin.get("step_id"),
+            }
+        if self.merge_origin:
+            return {
+                "type": "merge",
+                "left_parent": self.merge_origin.get("left_parent"),
+                "right_parent": self.merge_origin.get("right_parent"),
+                "step_id": self.merge_origin.get("step_id"),
+            }
+        return None
+
+    @property
+    def representative(self) -> dict[str, Any] | None:
+        """
+        If dropped by drop_duplicates, which row was kept instead?
+
+        Returns:
+            {"kept_rid": 42, "subset": ["key"], "keep": "first"} or None
+            kept_rid is None if keep=False (all duplicates dropped)
+        """
+        if not self.dedup_representative:
+            return None
+        return {
+            "kept_rid": self.dedup_representative.get("kept_rid"),
+            "subset": self.dedup_representative.get("subset_columns"),
+            "keep": self.dedup_representative.get("keep_strategy"),
+        }
+
     def to_dict(self) -> dict:
         """Export to dictionary."""
         return {
             "row_id": self.row_id,
             "is_alive": self.is_alive,
             "dropped_at": self.dropped_at,
-            "
+            "origin": self.origin,
+            "representative": self.representative,
             "n_events": self.n_events,
             "events": self.events,
             "ghost_values": self.ghost_values,
             "supported": self.supported,
+            # Keep legacy fields for backwards compatibility
+            "merge_origin": self.merge_origin,
         }

     def __repr__(self) -> str:
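A hedged sketch of consuming the new unified provenance fields follows. tp.trace and its signature are assumptions used for illustration; the shapes of .origin and .representative follow the property docstrings above.

import pandas as pd
import tracepipe as tp  # assumed import alias

tp.enable(mode="debug")
df = pd.concat([pd.DataFrame({"k": [1, 2]}), pd.DataFrame({"k": [3]})])

t = tp.trace(df, row_id=2)  # hypothetical row-trace call returning a TraceResult

origin = t.origin
if origin is None:
    print("original row (not produced by concat or merge)")
elif origin["type"] == "concat":
    print(f"from source DataFrame #{origin['source_df']} at step {origin['step_id']}")
elif origin["type"] == "merge":
    print(f"merge of left row {origin['left_parent']} and right row {origin['right_parent']}")

rep = t.representative
if rep is not None:
    # kept_rid is None when keep=False (every duplicate was dropped)
    print(f"dropped by drop_duplicates, kept row: {rep['kept_rid']}, key: {rep['subset']}")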
@@ -195,10 +289,28 @@
                 f" at step {self.dropped_at['step_id']}: {self.dropped_at['operation']}"
             )

-
-
-
-
+        # Display unified origin info
+        origin = self.origin
+        if origin:
+            if origin["type"] == "merge":
+                left = origin.get("left_parent", "?")
+                right = origin.get("right_parent", "?")
+                lines.append(f" Origin: merge of row {left} (left) + row {right} (right)")
+            elif origin["type"] == "concat":
+                src = origin.get("source_df", "?")
+                lines.append(f" Origin: concat from DataFrame #{src}")
+
+        # Display dedup representative if dropped by dedup
+        if self.representative:
+            kept = self.representative.get("kept_rid")
+            subset = self.representative.get("subset")
+            keep = self.representative.get("keep", "first")
+            if kept is not None:
+                subset_str = f" (key: {subset})" if subset else ""
+                lines.append(f" Replaced by: row {kept}{subset_str} [keep={keep}]")
+            else:
+                subset_str = f" on {subset}" if subset else ""
+                lines.append(f" Dropped: all duplicates removed{subset_str} [keep=False]")

         if len(self.events) == 0:
             lines.append("\n Events: 0 (no changes to watched columns)")

@@ -462,8 +574,8 @@ def check(
             )
         )

-
-    for op, count in
+    drops_by_op = ctx.store.get_dropped_by_step()
+    for op, count in drops_by_op.items():
         if count > 1000:
             suggestions.append(f"'{op}' dropped {count} rows - review if intentional")

@@ -475,6 +587,7 @@ def check(
         facts=facts,
         suggestions=suggestions,
         mode=ctx.config.mode.value,
+        _drops_by_op=drops_by_op,
     )

@@ -787,6 +900,14 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
    drop_event = store.get_drop_event(row_id)
    merge_origin = store.get_merge_origin(row_id)

+    # v0.4+ provenance: concat origin and dedup representative
+    concat_origin = None
+    dedup_representative = None
+    if hasattr(store, "get_concat_origin"):
+        concat_origin = store.get_concat_origin(row_id)
+    if hasattr(store, "get_duplicate_representative"):
+        dedup_representative = store.get_duplicate_representative(row_id)
+
    # Use lineage-aware history to include pre-merge parent events
    if hasattr(store, "get_row_history_with_lineage"):
        history = store.get_row_history_with_lineage(row_id)

@@ -823,6 +944,8 @@ def _build_trace_result(row_id: int, ctx, include_ghost: bool) -> TraceResult:
        merge_origin=merge_origin,
        events=history,
        ghost_values=ghost_values,
+        concat_origin=concat_origin,
+        dedup_representative=dedup_representative,
    )

tracepipe/core.py
CHANGED
@@ -277,3 +277,82 @@ class MergeStats:
     left_dup_rate: float  # -1 if not computed
     right_dup_rate: float  # -1 if not computed
     how: str
+
+
+@dataclass
+class ConcatMapping:
+    """
+    Mapping for pd.concat operations preserving row lineage.
+
+    For axis=0 concat, each result row comes from exactly one source DataFrame.
+    Arrays are stored in both positional order (for "explain row i") and
+    sorted order (for O(log n) RID lookup).
+
+    Invariants:
+    - out_rids and source_indices have same length
+    - out_rids_sorted and out_pos_sorted are always paired (both set or both None)
+    - out_rids_sorted is monotonically increasing
+    """
+
+    step_id: int
+
+    # Positional arrays (match result row order)
+    out_rids: Any  # numpy array, len = len(result)
+    source_indices: Any  # numpy array, which source DF (0, 1, 2...) each row came from
+
+    # Sorted arrays (for O(log n) lookup by RID)
+    out_rids_sorted: Any  # numpy array, SORTED
+    out_pos_sorted: Any  # numpy array, original positions aligned with out_rids_sorted
+
+    # Metadata
+    source_shapes: list[tuple] = field(default_factory=list)
+
+    def __post_init__(self):
+        """Validate invariants."""
+        import numpy as np
+
+        if self.out_rids_sorted is not None and self.out_pos_sorted is not None:
+            if len(self.out_rids_sorted) != len(self.out_pos_sorted):
+                raise ValueError("out_rids_sorted and out_pos_sorted must have same length")
+            # Verify monotonic (debug check)
+            if len(self.out_rids_sorted) > 1:
+                assert np.all(
+                    self.out_rids_sorted[:-1] <= self.out_rids_sorted[1:]
+                ), "out_rids_sorted must be monotonically increasing"
+
+
+@dataclass
+class DuplicateDropMapping:
+    """
+    Mapping for drop_duplicates provenance (debug mode only).
+
+    Tracks which rows were dropped and which "representative" row they lost to.
+    Arrays are sorted by dropped_rids for O(log n) lookup.
+
+    For keep='first': dropped rows map to first occurrence
+    For keep='last': dropped rows map to last occurrence
+    For keep=False: dropped rows have kept_rids=-1 (no representative)
+    """
+
+    step_id: int
+
+    # Sorted arrays for O(log n) lookup
+    dropped_rids: Any  # numpy array, SORTED dropped row IDs
+    kept_rids: Any  # numpy array, representative RID for each dropped row (-1 if none)
+
+    # Metadata
+    subset_columns: Optional[tuple[str, ...]] = None
+    keep_strategy: str = "first"
+
+    def __post_init__(self):
+        """Validate invariants."""
+        import numpy as np
+
+        if self.dropped_rids is not None and self.kept_rids is not None:
+            if len(self.dropped_rids) != len(self.kept_rids):
+                raise ValueError("dropped_rids and kept_rids must have same length")
+            # Verify sorted
+            if len(self.dropped_rids) > 1:
+                assert np.all(
+                    self.dropped_rids[:-1] <= self.dropped_rids[1:]
+                ), "dropped_rids must be sorted"
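A self-contained numpy sketch of the lookup scheme these dataclasses describe: positional arrays paired with an argsort-derived sorted view for O(log n) RID lookup. All values below are invented for illustration.

import numpy as np

# Result of concatenating a 3-row frame (source 0) and a 2-row frame (source 1).
out_rids = np.array([40, 41, 42, 10, 11], dtype=np.int64)   # RID of each result row
source_indices = np.array([0, 0, 0, 1, 1], dtype=np.int32)  # which input each row came from

# Sorted view for binary search; out_pos_sorted maps back to positional order.
sort_order = np.argsort(out_rids)
out_rids_sorted = out_rids[sort_order]
out_pos_sorted = sort_order

def source_of(rid: int):
    """Return (source_index, position) for a RID, or None if not in this concat."""
    i = np.searchsorted(out_rids_sorted, rid)
    if i < len(out_rids_sorted) and out_rids_sorted[i] == rid:
        pos = int(out_pos_sorted[i])
        return int(source_indices[pos]), pos
    return None

print(source_of(11))  # (1, 4): row 11 came from the second input, result position 4
print(source_of(99))  # None: RID not produced by this concat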
tracepipe/instrumentation/filter_capture.py
CHANGED

@@ -24,7 +24,7 @@ import numpy as np
 import pandas as pd

 from ..context import TracePipeContext, get_context
-from ..core import CompletenessLevel
+from ..core import CompletenessLevel, DuplicateDropMapping
 from ..safety import TracePipeWarning, get_caller_info

 # ============ MASK DERIVATION FUNCTIONS ============

@@ -97,6 +97,95 @@ def derive_drop_duplicates_mask(
     return kept_mask.values, completeness


+def derive_drop_duplicates_provenance(
+    df: pd.DataFrame,
+    source_rids: np.ndarray,
+    subset: Optional[list[str]],
+    keep: str,
+) -> Optional[DuplicateDropMapping]:
+    """
+    Derive dropped->kept mapping for drop_duplicates (debug mode only).
+
+    Uses hash_pandas_object for NaN-safe, fast key comparison.
+    Uses vectorized groupby min/max for representative selection.
+
+    Args:
+        df: Source DataFrame
+        source_rids: Row IDs for each row in df
+        subset: Columns to consider for duplicates (None = all)
+        keep: 'first', 'last', or False
+
+    Returns:
+        DuplicateDropMapping if any rows were dropped, else None.
+    """
+    n = len(df)
+    if n == 0:
+        return None
+
+    # Determine columns to hash
+    if subset is None:
+        hash_df = df
+        valid_cols = tuple(df.columns)
+    else:
+        valid_cols = tuple(c for c in subset if c in df.columns)
+        if not valid_cols:
+            return None
+        hash_df = df[list(valid_cols)]
+
+    # Use hash_pandas_object for fast, NaN-safe key hashing
+    try:
+        h = pd.util.hash_pandas_object(hash_df, index=False)
+        codes, _ = pd.factorize(h, sort=False)
+    except Exception:
+        # Fallback: can't hash, skip provenance
+        return None
+
+    # Compute kept mask using pandas (ground truth)
+    kept_mask = ~df.duplicated(subset=list(valid_cols) if valid_cols else None, keep=keep)
+    dropped_mask = ~kept_mask.values
+
+    if not dropped_mask.any():
+        return None  # No duplicates dropped
+
+    dropped_positions = np.where(dropped_mask)[0]
+    dropped_rids = source_rids[dropped_positions]
+
+    # Find representative positions using vectorized groupby min/max
+    positions = np.arange(n, dtype=np.int64)
+
+    if keep == "first":
+        # Representative = first occurrence of each group
+        rep_pos = pd.Series(positions).groupby(codes).min().to_numpy()
+    elif keep == "last":
+        # Representative = last occurrence of each group
+        rep_pos = pd.Series(positions).groupby(codes).max().to_numpy()
+    else:
+        # keep=False: no representative (all duplicates dropped)
+        rep_pos = None
+
+    # Build kept_rids array
+    if rep_pos is not None:
+        dropped_codes = codes[dropped_positions]
+        kept_positions = rep_pos[dropped_codes]
+        kept_rids = source_rids[kept_positions]
+    else:
+        # keep=False: no representative
+        kept_rids = np.full(len(dropped_rids), -1, dtype=np.int64)
+
+    # Sort by dropped_rids for O(log n) lookup
+    sort_order = np.argsort(dropped_rids)
+    dropped_rids_sorted = dropped_rids[sort_order].copy()
+    kept_rids_sorted = kept_rids[sort_order].copy()
+
+    return DuplicateDropMapping(
+        step_id=-1,  # Will be set by caller
+        dropped_rids=dropped_rids_sorted,
+        kept_rids=kept_rids_sorted,
+        subset_columns=valid_cols if valid_cols else None,
+        keep_strategy=str(keep),
+    )
+
+
 def derive_query_mask(
     df: pd.DataFrame, args: tuple, kwargs: dict
 ) -> tuple[Optional[np.ndarray], CompletenessLevel]:
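The representative-selection technique used above can be demonstrated standalone with pandas. The toy frame and RIDs below are invented; the steps mirror the hashing, factorize, and groupby-min logic for keep='first'.

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "b", "a", "a"], "x": [1, 2, 3, 4]})
source_rids = np.array([100, 101, 102, 103], dtype=np.int64)

h = pd.util.hash_pandas_object(df[["key"]], index=False)  # NaN-safe per-row hash
codes, _ = pd.factorize(h, sort=False)                    # group code per row

kept_mask = ~df.duplicated(subset=["key"], keep="first")  # pandas is the ground truth
dropped_positions = np.where(~kept_mask.to_numpy())[0]    # positions 2 and 3

positions = np.arange(len(df), dtype=np.int64)
rep_pos = pd.Series(positions).groupby(codes).min().to_numpy()  # first occurrence per group

kept_positions = rep_pos[codes[dropped_positions]]
mapping = {int(d): int(k) for d, k in zip(source_rids[dropped_positions],
                                          source_rids[kept_positions])}
print(mapping)  # {102: 100, 103: 100}: both dropped "a" rows map back to the kept row 100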
@@ -257,12 +346,19 @@ def _capture_filter_with_mask(
     kept_mask: Optional[np.ndarray] = None
     positions: Optional[np.ndarray] = None
     completeness = CompletenessLevel.FULL
+    dedup_mapping: Optional[DuplicateDropMapping] = None

     if method_name == "dropna":
         kept_mask, completeness = derive_dropna_mask(source_df, args, kwargs)

     elif method_name == "drop_duplicates":
         kept_mask, completeness = derive_drop_duplicates_mask(source_df, args, kwargs)
+        # Compute provenance mapping in debug mode
+        dedup_mapping = None
+        if ctx.config.should_capture_merge_provenance:
+            subset = kwargs.get("subset", None)
+            keep = kwargs.get("keep", "first")
+            dedup_mapping = derive_drop_duplicates_provenance(source_df, source_rids, subset, keep)

     elif method_name == "query":
         kept_mask, completeness = derive_query_mask(source_df, args, kwargs)

@@ -359,6 +455,12 @@ def _capture_filter_with_mask(
         watched_columns=ctx.watched_columns,
     )

+    # === RECORD DROP_DUPLICATES PROVENANCE (debug mode) ===
+    if method_name == "drop_duplicates" and dedup_mapping is not None:
+        # Update step_id in the mapping and store it
+        dedup_mapping.step_id = step_id
+        store.duplicate_drop_mappings.append(dedup_mapping)
+

 def _propagate_by_index_fallback(
     row_mgr, source_df: pd.DataFrame, result_df: pd.DataFrame
tracepipe/instrumentation/merge_capture.py
CHANGED

@@ -14,7 +14,7 @@ import numpy as np
 import pandas as pd

 from ..context import get_context
-from ..core import CompletenessLevel, MergeMapping, MergeStats
+from ..core import CompletenessLevel, ConcatMapping, MergeMapping, MergeStats
 from ..safety import TracePipeWarning, get_caller_info


@@ -382,53 +382,199 @@ def wrap_join_with_lineage(original_join):
 def wrap_concat_with_lineage(original_concat):
     """
     Wrap pd.concat with lineage capture.
+
+    For axis=0 (vertical concat):
+    - Preserves row IDs from source DataFrames (FULL provenance)
+    - Tracks which source DataFrame each row came from
+
+    For axis=1 (horizontal concat):
+    - Propagates RIDs if all inputs have identical RID arrays
+    - Otherwise marks as PARTIAL
     """

     @wraps(original_concat)
     def wrapper(objs, *args, **kwargs):
         ctx = get_context()

-        result = original_concat(objs, *args, **kwargs)
-
         if not ctx.enabled:
-            return
+            return original_concat(objs, *args, **kwargs)
+
+        axis = kwargs.get("axis", 0)
+
+        # === BEFORE: Capture source RIDs from all tracked DataFrames ===
+        source_data = []  # [(rids_copy, shape, original_index), ...]
+        try:
+            objs_list = list(objs) if hasattr(objs, "__iter__") else [objs]
+        except TypeError:
+            objs_list = [objs]
+
+        for i, obj in enumerate(objs_list):
+            if isinstance(obj, pd.DataFrame) and len(obj) > 0:
+                rids = ctx.row_manager.get_ids_array(obj)
+                if rids is None:
+                    rids = ctx.row_manager.register(obj)
+                # IMPORTANT: Make a copy to avoid mutation issues
+                source_data.append((rids.copy(), obj.shape, i))
+
+        # === RUN ORIGINAL ===
+        try:
+            result = original_concat(objs_list, *args, **kwargs)
+        except Exception:
+            raise  # Don't store mapping on failure

         if not isinstance(result, pd.DataFrame):
             return result

-
-
-
+        row_mgr = ctx.row_manager
+        store = ctx.store
+        code_file, code_line = get_caller_info(skip_frames=2)

-
-
+        # Compute input shapes for step metadata
+        input_shapes = [sd[1] for sd in source_data]

-
+        # === AXIS=0: Vertical concat with FULL provenance ===
+        if axis == 0 and source_data:
+            return _concat_axis0_with_provenance(
+                result, source_data, input_shapes, code_file, code_line, ctx
+            )

-
-
-
-
-
+        # === AXIS=1: Horizontal concat ===
+        elif axis == 1 and source_data:
+            return _concat_axis1_with_provenance(
+                result, source_data, input_shapes, code_file, code_line, ctx
+            )

+        # === FALLBACK: Unknown axis or no source data ===
+        else:
+            row_mgr.register(result)
             store.append_step(
                 operation="pd.concat",
                 stage=ctx.current_stage,
                 code_file=code_file,
                 code_line=code_line,
                 params={
-                    "axis":
-                    "n_inputs": len(
+                    "axis": axis,
+                    "n_inputs": len(source_data),
                 },
                 input_shape=tuple(input_shapes) if input_shapes else None,
                 output_shape=result.shape,
-                completeness=CompletenessLevel.PARTIAL,
+                completeness=CompletenessLevel.PARTIAL,
             )
-
-
-
-
+            return result
+
+    return wrapper
+
+
+def _concat_axis0_with_provenance(result, source_data, input_shapes, code_file, code_line, ctx):
+    """
+    Handle axis=0 concat with FULL row provenance.
+
+    Preserves source RIDs and tracks which source DF each row came from.
+    """
+    row_mgr = ctx.row_manager
+    store = ctx.store

+    # Build concatenated RID array and source index array
+    all_rids = np.concatenate([sd[0] for sd in source_data])
+    all_source_idx = np.concatenate(
+        [np.full(len(sd[0]), sd[2], dtype=np.int32) for sd in source_data]
+    )
+
+    # Validate: length must match result
+    if len(all_rids) != len(result):
+        # Mismatch - some objects contributed differently (empty DFs, Series, etc.)
+        # Degrade gracefully to PARTIAL
+        row_mgr.register(result)
+        store.append_step(
+            operation="pd.concat",
+            stage=ctx.current_stage,
+            code_file=code_file,
+            code_line=code_line,
+            params={
+                "axis": 0,
+                "n_inputs": len(source_data),
+                "_length_mismatch": True,
+            },
+            input_shape=tuple(input_shapes) if input_shapes else None,
+            output_shape=result.shape,
+            completeness=CompletenessLevel.PARTIAL,
+        )
         return result

-
+    # Propagate RIDs to result (preserving lineage!)
+    row_mgr.set_result_rids(result, all_rids.copy())
+
+    # Build sorted arrays for O(log n) lookup
+    sort_order = np.argsort(all_rids)
+    out_rids_sorted = all_rids[sort_order].copy()
+    out_pos_sorted = sort_order.copy()
+
+    # Record step with FULL completeness
+    step_id = store.append_step(
+        operation="pd.concat",
+        stage=ctx.current_stage,
+        code_file=code_file,
+        code_line=code_line,
+        params={
+            "axis": 0,
+            "n_inputs": len(source_data),
+        },
+        input_shape=tuple(input_shapes) if input_shapes else None,
+        output_shape=result.shape,
+        completeness=CompletenessLevel.FULL,
+    )
+
+    # Store mapping
+    mapping = ConcatMapping(
+        step_id=step_id,
+        out_rids=all_rids.copy(),
+        source_indices=all_source_idx.copy(),
+        out_rids_sorted=out_rids_sorted,
+        out_pos_sorted=out_pos_sorted,
+        source_shapes=list(input_shapes),
+    )
+    store.concat_mappings.append(mapping)
+
+    return result
+
+
+def _concat_axis1_with_provenance(result, source_data, input_shapes, code_file, code_line, ctx):
+    """
+    Handle axis=1 concat with best-effort provenance.
+
+    If all inputs have identical RID arrays, propagate them (FULL).
+    Otherwise, mark as PARTIAL and register new RIDs.
+    """
+    row_mgr = ctx.row_manager
+    store = ctx.store
+
+    # Check if all inputs have the same RIDs in same order
+    first_rids = source_data[0][0]
+    all_same = all(
+        len(sd[0]) == len(first_rids) and np.array_equal(sd[0], first_rids) for sd in source_data
+    )
+
+    if all_same and len(first_rids) == len(result):
+        # All inputs have identical RIDs - propagate them
+        row_mgr.set_result_rids(result, first_rids.copy())
+        completeness = CompletenessLevel.FULL
+    else:
+        # Misaligned or different RIDs - register new RIDs
+        row_mgr.register(result)
+        completeness = CompletenessLevel.PARTIAL
+
+    store.append_step(
+        operation="pd.concat",
+        stage=ctx.current_stage,
+        code_file=code_file,
+        code_line=code_line,
+        params={
+            "axis": 1,
+            "n_inputs": len(source_data),
+        },
+        input_shape=tuple(input_shapes) if input_shapes else None,
+        output_shape=result.shape,
+        completeness=completeness,
+    )
+
+    return result
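A hedged usage sketch of what the axis=0 wrapper enables: after a traced pd.concat, a result row can be attributed to the input frame it came from. tp.enable, tp.trace and the frames are assumptions for illustration, not taken from this diff.

import pandas as pd
import tracepipe as tp  # assumed import alias

tp.enable(mode="debug")

eu = pd.DataFrame({"id": [1, 2], "region": ["EU", "EU"]})
us = pd.DataFrame({"id": [3], "region": ["US"]})

combined = pd.concat([eu, us])    # wrapped: row IDs preserved, step recorded as FULL

t = tp.trace(combined, row_id=2)  # hypothetical row-trace call
print(t.origin)                   # e.g. {"type": "concat", "source_df": 1, "step_id": ...}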
tracepipe/storage/lineage_store.py
CHANGED

@@ -22,6 +22,8 @@ from ..core import (
     AggregationMapping,
     ChangeType,
     CompletenessLevel,
+    ConcatMapping,
+    DuplicateDropMapping,
     LineageGap,
     LineageGaps,
     MergeMapping,

@@ -100,6 +102,12 @@ class InMemoryLineageStore:
         self.merge_mappings: list[MergeMapping] = []
         self.merge_stats: list[tuple[int, MergeStats]] = []

+        # === CONCAT TRACKING ===
+        self.concat_mappings: list[ConcatMapping] = []
+
+        # === DUPLICATE DROP TRACKING (debug mode) ===
+        self.duplicate_drop_mappings: list[DuplicateDropMapping] = []
+
         # === AGGREGATION MAPPINGS ===
         self.aggregation_mappings: list[AggregationMapping] = []

@@ -361,6 +369,74 @@ class InMemoryLineageStore:
             return [(sid, s) for sid, s in self.merge_stats if sid == step_id]
         return list(self.merge_stats)

+    # === CONCAT LOOKUP (O(log n) via searchsorted) ===
+
+    def _binary_search_mapping(
+        self, sorted_rids: Optional[np.ndarray], target_rid: int
+    ) -> Optional[int]:
+        """
+        Return index in sorted array, or None if not found.
+
+        Robust to None/empty arrays and dtype mismatches.
+        """
+        if sorted_rids is None or len(sorted_rids) == 0:
+            return None
+
+        target = np.int64(target_rid)
+        i = np.searchsorted(sorted_rids, target)
+
+        if i < len(sorted_rids) and sorted_rids[i] == target:
+            return int(i)
+        return None
+
+    def get_concat_origin(self, row_id: int) -> Optional[dict]:
+        """
+        Get which source DataFrame a row came from in a concat.
+
+        Uses binary search (O(log n)) on sorted RIDs.
+
+        Returns:
+            {step_id, source_index, source_shape, position} if found, else None.
+        """
+        for mapping in self.concat_mappings:
+            idx = self._binary_search_mapping(mapping.out_rids_sorted, row_id)
+            if idx is not None:
+                pos = int(mapping.out_pos_sorted[idx])
+                source_idx = int(mapping.source_indices[pos])
+                return {
+                    "step_id": mapping.step_id,
+                    "source_index": source_idx,
+                    "source_shape": (
+                        mapping.source_shapes[source_idx]
+                        if source_idx < len(mapping.source_shapes)
+                        else None
+                    ),
+                    "position": pos,
+                }
+        return None
+
+    # === DUPLICATE DROP LOOKUP (O(log n) via searchsorted) ===
+
+    def get_duplicate_representative(self, row_id: int) -> Optional[dict]:
+        """
+        Get which row replaced this one in drop_duplicates.
+
+        Returns:
+            {step_id, kept_rid, subset_columns, keep_strategy} if found, else None.
+            kept_rid is -1 if keep=False (no representative).
+        """
+        for mapping in self.duplicate_drop_mappings:
+            idx = self._binary_search_mapping(mapping.dropped_rids, row_id)
+            if idx is not None:
+                kept = int(mapping.kept_rids[idx])
+                return {
+                    "step_id": mapping.step_id,
+                    "kept_rid": kept if kept >= 0 else None,
+                    "subset_columns": mapping.subset_columns,
+                    "keep_strategy": mapping.keep_strategy,
+                }
+        return None
+
     # === MEMORY MANAGEMENT ===

     def _check_memory_and_spill(self) -> None:
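A minimal standalone sketch of the lookup pattern these methods implement: scan the recorded mappings linearly, binary-search each one's sorted RID array, and map a -1 sentinel to None. The stand-in dataclass and data are invented.

import numpy as np
from dataclasses import dataclass

@dataclass
class _DedupMapping:           # stand-in for DuplicateDropMapping (sorted arrays only)
    step_id: int
    dropped_rids: np.ndarray   # sorted
    kept_rids: np.ndarray      # aligned with dropped_rids, -1 means "no representative"

def find_representative(mappings, row_id):
    for m in mappings:
        i = np.searchsorted(m.dropped_rids, row_id)
        if i < len(m.dropped_rids) and m.dropped_rids[i] == row_id:
            kept = int(m.kept_rids[i])
            return {"step_id": m.step_id, "kept_rid": kept if kept >= 0 else None}
    return None

maps = [_DedupMapping(7, np.array([102, 103]), np.array([100, -1]))]
print(find_representative(maps, 103))  # {'step_id': 7, 'kept_rid': None}
print(find_representative(maps, 50))   # None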
@@ -567,17 +643,17 @@ class InMemoryLineageStore:

     def get_row_history_with_lineage(self, row_id: int, max_depth: int = 10) -> list[dict]:
         """
-        Get row history including pre-merge parent history.
+        Get row history including pre-merge and pre-concat parent history.

-        Follows merge lineage recursively to build complete cell provenance.
-        This is essential for tracking changes that happened before merge operations.
+        Follows merge and concat lineage recursively to build complete cell provenance.
+        This is essential for tracking changes that happened before merge/concat operations.

         Deduplicates events by (col, old_val, new_val, operation) signature to prevent
         cross-pipeline contamination when multiple DataFrames share row IDs.

         Args:
             row_id: Row ID to trace
-            max_depth: Maximum
+            max_depth: Maximum lineage depth to follow (prevents infinite loops)

         Returns:
             List of UNIQUE events in chronological order, including parent row events.

@@ -592,12 +668,21 @@ class InMemoryLineageStore:
             events = []

             # Check if this row came from a merge
-
-            if
+            merge_origin = self.get_merge_origin(rid)
+            if merge_origin and merge_origin["left_parent"] is not None:
                 # Recursively get parent's history first (chronological order)
-                parent_events = _collect_history(
+                parent_events = _collect_history(merge_origin["left_parent"], depth + 1)
                 events.extend(parent_events)

+            # Check if this row came from a concat
+            # For concat, parent_rid == rid (identity mapping), so we don't recurse
+            # But we record the concat step for completeness
+            concat_origin = self.get_concat_origin(rid)
+            if concat_origin:
+                # Concat preserves RIDs, so the "parent" is the same RID
+                # The concat step itself is recorded in the step events
+                pass
+
             # Add this row's direct events
             events.extend(self.get_row_history(rid))

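An illustrative sketch (not the library code) of the recursion the docstring above describes: collect the merge parent's events first, cap the depth, then deduplicate by the (col, old_val, new_val, operation) signature.

def collect_history(rid, get_merge_parent, get_events, depth=0, max_depth=10):
    """Parent events first (chronological), then the row's own events."""
    if depth > max_depth:
        return []
    events = []
    parent = get_merge_parent(rid)  # e.g. the left-parent RID, or None
    if parent is not None:
        events.extend(collect_history(parent, get_merge_parent, get_events, depth + 1, max_depth))
    events.extend(get_events(rid))
    # Dedupe while preserving order, keyed on the event signature.
    seen, unique = set(), []
    for ev in events:
        sig = (ev["col"], ev["old_val"], ev["new_val"], ev["operation"])
        if sig not in seen:
            seen.add(sig)
            unique.append(ev)
    return unique

# Toy data: row 30 was produced by merging row 10; both touched the same column.
parents = {30: 10, 10: None}
events = {
    10: [{"col": "price", "old_val": None, "new_val": 9.5, "operation": "fillna"}],
    30: [{"col": "price", "old_val": 9.5, "new_val": 10.0, "operation": "loc_assign"}],
}
print(collect_history(30, parents.get, lambda r: events.get(r, [])))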
{tracepipe-0.3.5.dist-info → tracepipe-0.4.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tracepipe
-Version: 0.3.5
+Version: 0.4.1
 Summary: Row-level data lineage tracking for pandas pipelines
 Project-URL: Homepage, https://github.com/tracepipe/tracepipe
 Project-URL: Documentation, https://tracepipe.github.io/tracepipe/

@@ -278,7 +278,7 @@ tp.enable(mode="debug")  # Full lineage

 ## Known Limitations

-TracePipe tracks **cell mutations**
+TracePipe tracks **cell mutations**, **merge provenance**, **concat provenance**, and **duplicate drop decisions** reliably. A few patterns have limited tracking:

 | Pattern | Status | Notes |
 |---------|--------|-------|

@@ -286,13 +286,10 @@ TracePipe tracks **cell mutations** (fillna, replace, loc assignment) and **merg
 | `df = df.fillna({"col": 0})` | ✅ Tracked | DataFrame-level fillna |
 | `df.loc[mask, "col"] = val` | ✅ Tracked | Conditional assignment |
 | `df.merge(other, on="key")` | ✅ Tracked | Full provenance in debug mode |
-| `pd.concat([df1, df2])` |
-| `df.drop_duplicates(
-
-
-**Why?** TracePipe tracks value changes within rows, not row-selection operations. When `drop_duplicates` picks one row over another, that's a provenance decision (not a cell mutation) that isn't currently instrumented.
-
-**Planned for 0.4**: Full row-provenance tracking for concat, drop_duplicates, and sort operations.
+| `pd.concat([df1, df2])` | ✅ Tracked | Row IDs preserved with source DataFrame tracking (v0.4+) |
+| `df.drop_duplicates()` | ✅ Tracked | Dropped rows map to kept representative (debug mode, v0.4+) |
+| `pd.concat(axis=1)` | ⚠️ Partial | FULL only if all inputs have identical RIDs |
+| Complex `apply`/`pipe` | ⚠️ Partial | Output tracked, internals opaque |

 ---

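A hedged end-to-end sketch matching the updated table rows: concat followed by drop_duplicates under debug-mode tracing. tp.enable and tp.check are assumed entry points; the frames are invented.

import pandas as pd
import tracepipe as tp  # assumed import alias

tp.enable(mode="debug")

a = pd.DataFrame({"id": [1, 2], "x": [1.0, 2.0]})
b = pd.DataFrame({"id": [2, 3], "x": [2.5, 3.0]})

df = pd.concat([a, b])                  # tracked: RIDs preserved (v0.4+)
df = df.drop_duplicates(subset=["id"])  # tracked: dropped row keeps a representative

report = tp.check(df)                   # hypothetical call returning a CheckResult
print(report.drops_by_op)               # e.g. {"drop_duplicates": 1}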
{tracepipe-0.3.5.dist-info → tracepipe-0.4.1.dist-info}/RECORD
CHANGED

@@ -1,29 +1,29 @@
-tracepipe/__init__.py,sha256=
+tracepipe/__init__.py,sha256=VOQFGsfVlTngxxdDSgOOd7X2KJt1l4fjKDH4NeizYEg,3342
 tracepipe/api.py,sha256=WdcKvvzI3voDt6fxZWa8vjyZQU8lfRshx7T78oj7oFE,13351
 tracepipe/context.py,sha256=DvwAZGZbLDJ4xoqmS1VnZOBbOI8ZIIErsY9W6GEbFSM,4051
 tracepipe/contracts.py,sha256=m-rjPrgnCiAgKEkweOS7P95jrjDptt5UPdvUlqaV_rU,16226
-tracepipe/convenience.py,sha256=
-tracepipe/core.py,sha256=
+tracepipe/convenience.py,sha256=nJ7Fy8riQVLXHOn1IFWtSpnmhHlyPt1hhantkOLKJs0,33141
+tracepipe/core.py,sha256=O1QFJFTSszxKY1pVR1XLrdA0zez7t5KQLjyq6V36ARk,10883
 tracepipe/debug.py,sha256=6t2GKVZLwn7SJLhrStE9qsmTiVIHATTE3jJPQ2DYtnc,10140
 tracepipe/safety.py,sha256=UpzhQj31Dij-DjgT9mY_jPrUpVfcA51gDI2fUos4IUA,6694
 tracepipe/snapshot.py,sha256=OLREzE1_LkWITluG_Bqeb7Y4pAKb8Lb3zJEF3cxnloU,13967
 tracepipe/value_provenance.py,sha256=ogky6aOaZ-6K2uNBQxlXpmCeuvK434Hisj30zesRTd8,9330
 tracepipe/instrumentation/__init__.py,sha256=pd0n6Z9m_V3gcBv097cXWFOZEzAP9sAq1jjQnNRrDZ8,222
 tracepipe/instrumentation/apply_capture.py,sha256=cMThWzNXqWQENuMrCGTne1hO6fqaQFV7zJYNpsPTW4w,14463
-tracepipe/instrumentation/filter_capture.py,sha256=
+tracepipe/instrumentation/filter_capture.py,sha256=aN8-Ev6kbDR8f9A9JVy236VK0iqNxpMvki3pbtUkBYQ,19445
 tracepipe/instrumentation/indexer_capture.py,sha256=1ATCeJ-uNA1uGiSbgnUx0wdVsIlZGHeUBaFJPXgFQNg,28440
-tracepipe/instrumentation/merge_capture.py,sha256=
+tracepipe/instrumentation/merge_capture.py,sha256=zqa6SY5YLbr-N7PPTdE6TYKyJIZcPqT02d1Ifvi3Jdw,18359
 tracepipe/instrumentation/pandas_inst.py,sha256=h8RlfwYkYwuftCyBYIETdwHxVCzQM1SBBrbYP7SyjJ8,30047
 tracepipe/instrumentation/series_capture.py,sha256=i7FiA2ndEzS6duIj5y-a7SDfIMl2cCY_jGC1tmG7TGU,11271
 tracepipe/storage/__init__.py,sha256=pGFMfbIgIi2kofVPwYDqe2HTYMYJoabiGjTq77pYi-g,348
 tracepipe/storage/base.py,sha256=7DV_-rp37DjBMr9B1w85hLVYhC8OQShk2PcEhT-n4tE,4894
-tracepipe/storage/lineage_store.py,sha256=
+tracepipe/storage/lineage_store.py,sha256=1enRmDgnVjxW8Pu7WMHJ8WPnnbm-HsAm4e1dKsTvnIc,31943
 tracepipe/storage/row_identity.py,sha256=HBU0gTTJlFtFTcAdUCKuX-c9cHa0lo3CDIodDPDgOzA,17161
 tracepipe/utils/__init__.py,sha256=CI_GXViCjdMbu1j6HuzZhoQZEW0sIB6WAve6j5pfOC0,182
 tracepipe/utils/value_capture.py,sha256=wGgegQmJnVHxHbwHSH9di7JAOBChzD3ERJrabZNiayk,4092
 tracepipe/visualization/__init__.py,sha256=M3s44ZTUNEToyghjhQW0FgbmWHKPr4Xc-7iNF6DpI_E,132
 tracepipe/visualization/html_export.py,sha256=G0hfZTJctUCfpun17zXX1NIXhvJZbca6hKmP3rcIjbg,42282
-tracepipe-0.
-tracepipe-0.
-tracepipe-0.
-tracepipe-0.
+tracepipe-0.4.1.dist-info/METADATA,sha256=kF2jBdGhKt-9YGR5VdFyb85jZj3Tgc26FbL9JxRLkhc,10067
+tracepipe-0.4.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+tracepipe-0.4.1.dist-info/licenses/LICENSE,sha256=HMOAFHBClL79POwWL-2_aDcx42DJAq7Ce-nwJPvMB9U,1075
+tracepipe-0.4.1.dist-info/RECORD,,
File without changes
|
|
File without changes
|