tracepipe 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- tracepipe/__init__.py +117 -78
- tracepipe/api.py +168 -331
- tracepipe/context.py +21 -1
- tracepipe/contracts.py +473 -0
- tracepipe/convenience.py +812 -0
- tracepipe/core.py +174 -17
- tracepipe/debug.py +325 -0
- tracepipe/instrumentation/apply_capture.py +453 -0
- tracepipe/instrumentation/filter_capture.py +468 -0
- tracepipe/instrumentation/indexer_capture.py +813 -0
- tracepipe/instrumentation/merge_capture.py +434 -0
- tracepipe/instrumentation/pandas_inst.py +66 -183
- tracepipe/instrumentation/series_capture.py +331 -0
- tracepipe/safety.py +3 -3
- tracepipe/snapshot.py +420 -0
- tracepipe/storage/base.py +7 -3
- tracepipe/storage/lineage_store.py +190 -47
- tracepipe/storage/row_identity.py +366 -104
- tracepipe/value_provenance.py +301 -0
- tracepipe/visualization/html_export.py +22 -7
- tracepipe-0.3.0.dist-info/METADATA +575 -0
- tracepipe-0.3.0.dist-info/RECORD +29 -0
- tracepipe-0.2.0.dist-info/METADATA +0 -508
- tracepipe-0.2.0.dist-info/RECORD +0 -19
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/WHEEL +0 -0
- {tracepipe-0.2.0.dist-info → tracepipe-0.3.0.dist-info}/licenses/LICENSE +0 -0
tracepipe/__init__.py
CHANGED
|
@@ -5,106 +5,145 @@ TracePipe: Row-Level Data Lineage Tracking
|
|
|
5
5
|
Track every row, every change, every step in your pandas pipelines.
|
|
6
6
|
|
|
7
7
|
Quick Start:
|
|
8
|
-
import tracepipe
|
|
8
|
+
import tracepipe as tp
|
|
9
9
|
import pandas as pd
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
tracepipe.watch("age", "salary") # Watch specific columns
|
|
11
|
+
tp.enable(mode="debug", watch=["age", "salary"])
|
|
13
12
|
|
|
14
13
|
df = pd.DataFrame({"age": [25, None, 35], "salary": [50000, 60000, None]})
|
|
15
14
|
df = df.dropna()
|
|
16
15
|
df["salary"] = df["salary"] * 1.1
|
|
17
16
|
|
|
18
|
-
#
|
|
19
|
-
|
|
20
|
-
print(
|
|
17
|
+
# Health audit
|
|
18
|
+
result = tp.check(df)
|
|
19
|
+
print(result)
|
|
21
20
|
|
|
22
|
-
|
|
23
|
-
|
|
21
|
+
# Row journey
|
|
22
|
+
trace = tp.trace(df, row=0)
|
|
23
|
+
print(trace)
|
|
24
24
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
- Aggregation lineage: Trace back from grouped results to source rows
|
|
29
|
-
- Zero-copy design: Minimal overhead on your pipelines
|
|
30
|
-
- Safe instrumentation: Never crashes your code
|
|
25
|
+
# Cell provenance
|
|
26
|
+
why = tp.why(df, col="salary", row=0)
|
|
27
|
+
print(why)
|
|
31
28
|
|
|
32
|
-
|
|
29
|
+
# Generate report
|
|
30
|
+
tp.report(df, "audit.html")
|
|
31
|
+
|
|
32
|
+
Modes:
|
|
33
|
+
- CI mode (default): Step stats, retention rates, merge mismatch detection.
|
|
34
|
+
No per-row provenance. Fast for production.
|
|
35
|
+
- DEBUG mode: Full per-row provenance, cell history, ghost values.
|
|
36
|
+
|
|
37
|
+
API Summary:
|
|
38
|
+
Core (5 functions for 90% of use cases):
|
|
39
|
+
tp.enable() - Start tracking
|
|
40
|
+
tp.check() - Health audit → CheckResult
|
|
41
|
+
tp.trace() - Row journey → TraceResult
|
|
42
|
+
tp.why() - Cell provenance → WhyResult
|
|
43
|
+
tp.report() - HTML export
|
|
44
|
+
|
|
45
|
+
Power features (via namespaces):
|
|
46
|
+
tp.debug.inspect() - Raw lineage access
|
|
47
|
+
tp.contracts.contract() - Data quality contracts
|
|
48
|
+
tp.snapshot(), tp.diff() - Pipeline state comparison
|
|
49
|
+
|
|
50
|
+
All functions return structured result objects.
|
|
51
|
+
Use print(result) for pretty output, result.to_dict() for data.
|
|
33
52
|
"""
|
|
34
53
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
#
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
register,
|
|
57
|
-
reset,
|
|
58
|
-
stage,
|
|
59
|
-
stats,
|
|
60
|
-
steps,
|
|
61
|
-
unwatch,
|
|
62
|
-
# Column watching
|
|
63
|
-
watch,
|
|
64
|
-
watch_all,
|
|
54
|
+
# === CORE API (6 functions) ===
|
|
55
|
+
# === NAMESPACES ===
|
|
56
|
+
from . import contracts, debug
|
|
57
|
+
from .api import configure, disable, enable, register, reset, stage
|
|
58
|
+
|
|
59
|
+
# Re-export contract() at top level for convenience
|
|
60
|
+
from .contracts import contract
|
|
61
|
+
|
|
62
|
+
# === CONVENIENCE API (user-facing) ===
|
|
63
|
+
from .convenience import (
|
|
64
|
+
CheckFailed,
|
|
65
|
+
# Result types
|
|
66
|
+
CheckResult,
|
|
67
|
+
CheckWarning,
|
|
68
|
+
TraceResult,
|
|
69
|
+
WhyResult,
|
|
70
|
+
check,
|
|
71
|
+
find,
|
|
72
|
+
report,
|
|
73
|
+
trace,
|
|
74
|
+
why,
|
|
65
75
|
)
|
|
66
|
-
from .core import TracePipeConfig
|
|
67
76
|
|
|
68
|
-
#
|
|
69
|
-
from .
|
|
70
|
-
from .visualization.html_export import save
|
|
77
|
+
# === CONFIGURATION ===
|
|
78
|
+
from .core import TracePipeConfig, TracePipeMode
|
|
71
79
|
|
|
72
|
-
|
|
80
|
+
# === SNAPSHOTS (top-level for convenience) ===
|
|
81
|
+
from .snapshot import DiffResult, Snapshot, diff, snapshot
|
|
73
82
|
|
|
83
|
+
# === VERSION ===
|
|
84
|
+
__version__ = "0.3.0"
|
|
85
|
+
|
|
86
|
+
# === MINIMAL __all__ ===
|
|
74
87
|
__all__ = [
|
|
75
|
-
# Core
|
|
88
|
+
# Core control (6)
|
|
76
89
|
"enable",
|
|
77
90
|
"disable",
|
|
78
91
|
"reset",
|
|
79
|
-
"configure",
|
|
80
|
-
"watch",
|
|
81
|
-
"watch_all",
|
|
82
|
-
"unwatch",
|
|
83
|
-
"clear_watch",
|
|
84
92
|
"register",
|
|
85
93
|
"stage",
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
"
|
|
89
|
-
"
|
|
90
|
-
"
|
|
91
|
-
"
|
|
92
|
-
"
|
|
93
|
-
|
|
94
|
-
"
|
|
95
|
-
|
|
96
|
-
"
|
|
97
|
-
"
|
|
98
|
-
"
|
|
99
|
-
|
|
100
|
-
|
|
94
|
+
"configure",
|
|
95
|
+
# Convenience API (5)
|
|
96
|
+
"check",
|
|
97
|
+
"find",
|
|
98
|
+
"trace",
|
|
99
|
+
"why",
|
|
100
|
+
"report",
|
|
101
|
+
# Result types (5)
|
|
102
|
+
"CheckResult",
|
|
103
|
+
"CheckWarning",
|
|
104
|
+
"CheckFailed",
|
|
105
|
+
"TraceResult",
|
|
106
|
+
"WhyResult",
|
|
107
|
+
# Snapshots (4)
|
|
108
|
+
"snapshot",
|
|
109
|
+
"diff",
|
|
110
|
+
"Snapshot",
|
|
111
|
+
"DiffResult",
|
|
112
|
+
# Contracts (1)
|
|
113
|
+
"contract",
|
|
114
|
+
# Namespaces (2)
|
|
115
|
+
"debug",
|
|
116
|
+
"contracts",
|
|
117
|
+
# Config (2)
|
|
101
118
|
"TracePipeConfig",
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
"GroupLineageResult",
|
|
105
|
-
# Protocols (for custom backends)
|
|
106
|
-
"LineageBackend",
|
|
107
|
-
"RowIdentityStrategy",
|
|
108
|
-
# Version
|
|
119
|
+
"TracePipeMode",
|
|
120
|
+
# Version (1)
|
|
109
121
|
"__version__",
|
|
110
122
|
]
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def __dir__():
|
|
126
|
+
"""Control what shows up in IDE autocomplete - only essential functions."""
|
|
127
|
+
return [
|
|
128
|
+
# Primary API
|
|
129
|
+
"enable",
|
|
130
|
+
"disable",
|
|
131
|
+
"reset",
|
|
132
|
+
"register",
|
|
133
|
+
"configure",
|
|
134
|
+
"check",
|
|
135
|
+
"find",
|
|
136
|
+
"trace",
|
|
137
|
+
"why",
|
|
138
|
+
"report",
|
|
139
|
+
# Snapshots
|
|
140
|
+
"snapshot",
|
|
141
|
+
"diff",
|
|
142
|
+
# Contract
|
|
143
|
+
"contract",
|
|
144
|
+
# Namespaces
|
|
145
|
+
"debug",
|
|
146
|
+
"contracts",
|
|
147
|
+
# Config
|
|
148
|
+
"TracePipeConfig",
|
|
149
|
+
]
|