tracepipe-0.2.0-py3-none-any.whl → tracepipe-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,301 @@
+ # tracepipe/value_provenance.py
+ """
+ Cell-level value provenance tracking.
+
+ Provides detailed history of how specific cell values changed
+ throughout the pipeline, including null introduction tracking.
+
+ Usage:
+     # Get history of a specific cell
+     history = tp.explain_value(row_id=123, column="price", df=result)
+
+     # Analyze where nulls came from in a column
+     analysis = tp.null_analysis("email", df)
+ """
+
+ from dataclasses import dataclass
+ from typing import Any, Optional
+
+ import pandas as pd
+
+ from .context import get_context
+ from .core import ChangeType
+
+
+ @dataclass
+ class ValueEvent:
+     """Single change event for a cell."""
+
+     step_id: int
+     operation: str
+     old_value: Any
+     new_value: Any
+     change_type: str
+     timestamp: float
+     code_location: Optional[str]
+
+     def to_dict(self) -> dict:
+         """Export to dictionary."""
+         return {
+             "step_id": self.step_id,
+             "operation": self.operation,
+             "old_value": self.old_value,
+             "new_value": self.new_value,
+             "change_type": self.change_type,
+             "timestamp": self.timestamp,
+             "code_location": self.code_location,
+         }
+
+
+ @dataclass
+ class ValueHistory:
+     """Complete history of a cell's value."""
+
+     row_id: int
+     column: str
+     current_value: Any
+     events: list[ValueEvent]
+     became_null_at: Optional[int] = None  # step_id
+     became_null_by: Optional[str] = None  # operation
+
+     def __repr__(self) -> str:
+         lines = [f"Value History: row {self.row_id}, column '{self.column}'"]
+         lines.append(f" Current: {self.current_value}")
+         lines.append(f" Changes: {len(self.events)}")
+
+         if self.became_null_at:
+             lines.append(f" ! Became null at step {self.became_null_at} by {self.became_null_by}")
+
+         for event in self.events[-5:]:
+             lines.append(f" {event.operation}: {event.old_value} -> {event.new_value}")
+
+         if len(self.events) > 5:
+             lines.append(f" ... and {len(self.events) - 5} more events")
+
+         return "\n".join(lines)
+
+     @property
+     def was_modified(self) -> bool:
+         """True if value was ever modified."""
+         return len(self.events) > 0
+
+     @property
+     def is_null(self) -> bool:
+         """True if current value is null."""
+         return pd.isna(self.current_value)
+
+     def to_dict(self) -> dict:
+         """Export to dictionary."""
+         return {
+             "row_id": self.row_id,
+             "column": self.column,
+             "current_value": self.current_value,
+             "events": [e.to_dict() for e in self.events],
+             "became_null_at": self.became_null_at,
+             "became_null_by": self.became_null_by,
+         }
+
+
+ def explain_value(row_id: int, column: str, df: Optional[pd.DataFrame] = None) -> ValueHistory:
+     """
+     Get complete history of a specific cell's value.
+
+     Args:
+         row_id: Row ID to trace
+         column: Column name
+         df: Optional DataFrame for current value lookup
+
+     Returns:
+         ValueHistory with all changes to this cell
+     """
+     ctx = get_context()
+     store = ctx.store
+
+     # Get current value if df provided
+     current_value = None
+     if df is not None:
+         rids = ctx.row_manager.get_ids_array(df)
+         if rids is not None:
+             # Find position of this row_id
+             matches = (rids == row_id).nonzero()[0]
+             if len(matches) > 0 and column in df.columns:
+                 current_value = df.iloc[matches[0]][column]
+
+     # Collect all events for this cell
+     events = []
+     step_map = {s.step_id: s for s in store.steps}
+     became_null_at = None
+     became_null_by = None
+
+     for diff in store._iter_all_diffs():
+         if diff["row_id"] == row_id and diff["col"] == column:
+             step = step_map.get(diff["step_id"])
+
+             events.append(
+                 ValueEvent(
+                     step_id=diff["step_id"],
+                     operation=step.operation if step else "unknown",
+                     old_value=diff["old_val"],
+                     new_value=diff["new_val"],
+                     change_type=ChangeType(diff["change_type"]).name,
+                     timestamp=step.timestamp if step else 0,
+                     code_location=(
+                         f"{step.code_file}:{step.code_line}" if step and step.code_file else None
+                     ),
+                 )
+             )
+
+             # Track when value became null
+             if became_null_at is None and pd.isna(diff["new_val"]) and not pd.isna(diff["old_val"]):
+                 became_null_at = diff["step_id"]
+                 became_null_by = step.operation if step else "unknown"
+
+     events.sort(key=lambda e: e.step_id)
+
+     return ValueHistory(
+         row_id=row_id,
+         column=column,
+         current_value=current_value,
+         events=events,
+         became_null_at=became_null_at,
+         became_null_by=became_null_by,
+     )
+
+
+ @dataclass
+ class NullAnalysis:
+     """Analysis of how nulls appeared in a column."""
+
+     column: str
+     total_nulls: int
+     null_sources: dict[str, int]  # operation -> count
+     sample_row_ids: list[int]
+
+     def __repr__(self) -> str:
+         lines = [f"Null Analysis: '{self.column}'"]
+         lines.append(f" Total nulls: {self.total_nulls}")
+
+         if self.null_sources:
+             lines.append(" Sources:")
+             for op, count in sorted(self.null_sources.items(), key=lambda x: -x[1]):
+                 lines.append(f" {op}: {count}")
+         else:
+             lines.append(" No tracked null introductions")
+
+         if self.sample_row_ids:
+             lines.append(f" Sample row IDs: {self.sample_row_ids[:5]}")
+
+         return "\n".join(lines)
+
+     @property
+     def has_untracked_nulls(self) -> bool:
+         """True if some nulls were not tracked by TracePipe."""
+         tracked = sum(self.null_sources.values())
+         return tracked < self.total_nulls
+
+     def to_dict(self) -> dict:
+         """Export to dictionary."""
+         return {
+             "column": self.column,
+             "total_nulls": self.total_nulls,
+             "null_sources": self.null_sources,
+             "sample_row_ids": self.sample_row_ids,
+             "has_untracked_nulls": self.has_untracked_nulls,
+         }
+
+
+ def null_analysis(column: str, df: pd.DataFrame) -> NullAnalysis:
+     """
+     Analyze how nulls appeared in a column.
+
+     Returns breakdown of which operations introduced nulls.
+
+     Args:
+         column: Column name to analyze
+         df: Current DataFrame
+
+     Returns:
+         NullAnalysis with breakdown of null sources
+     """
+     ctx = get_context()
+     store = ctx.store
+
+     if column not in df.columns:
+         return NullAnalysis(column=column, total_nulls=0, null_sources={}, sample_row_ids=[])
+
+     rids = ctx.row_manager.get_ids_array(df)
+     if rids is None:
+         return NullAnalysis(
+             column=column,
+             total_nulls=int(df[column].isna().sum()),
+             null_sources={},
+             sample_row_ids=[],
+         )
+
+     # Find null rows
+     null_mask = df[column].isna()
+     null_rids = set(rids[null_mask].tolist())
+
+     # Track which operations introduced nulls
+     null_sources: dict[str, int] = {}
+     step_map = {s.step_id: s for s in store.steps}
+     sample_ids: list[int] = []
+
+     for diff in store._iter_all_diffs():
+         if diff["col"] == column and diff["row_id"] in null_rids:
+             if pd.isna(diff["new_val"]) and not pd.isna(diff["old_val"]):
+                 step = step_map.get(diff["step_id"])
+                 op = step.operation if step else "unknown"
+                 null_sources[op] = null_sources.get(op, 0) + 1
+                 if len(sample_ids) < 10:
+                     sample_ids.append(diff["row_id"])
+
+     return NullAnalysis(
+         column=column,
+         total_nulls=len(null_rids),
+         null_sources=null_sources,
+         sample_row_ids=sample_ids,
+     )
+
+
+ def column_changes_summary(column: str, df: pd.DataFrame) -> dict[str, Any]:
+     """
+     Get summary of all changes to a column.
+
+     Args:
+         column: Column name
+         df: Current DataFrame
+
+     Returns:
+         Dict with summary statistics
+     """
+     ctx = get_context()
+     store = ctx.store
+
+     rids = ctx.row_manager.get_ids_array(df)
+     if rids is None:
+         return {
+             "column": column,
+             "total_changes": 0,
+             "changes_by_operation": {},
+             "unique_rows_modified": 0,
+         }
+
+     rid_set = set(rids.tolist())
+     changes_by_op: dict[str, int] = {}
+     modified_rows: set = set()
+     step_map = {s.step_id: s for s in store.steps}
+
+     for diff in store._iter_all_diffs():
+         if diff["col"] == column and diff["row_id"] in rid_set:
+             step = step_map.get(diff["step_id"])
+             op = step.operation if step else "unknown"
+             changes_by_op[op] = changes_by_op.get(op, 0) + 1
+             modified_rows.add(diff["row_id"])
+
+     return {
+         "column": column,
+         "total_changes": sum(changes_by_op.values()),
+         "changes_by_operation": changes_by_op,
+         "unique_rows_modified": len(modified_rows),
+     }
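
Taken together, the new module adds three entry points: explain_value() for cell-level history, null_analysis() for per-column null attribution, and column_changes_summary() for aggregate change counts. The sketch below shows how they might be called; it assumes the helpers are re-exported at package level (as the module docstring's tp. examples suggest) and that result is a DataFrame produced by a pipeline that ran with TracePipe tracking active, which is not shown here.

    import tracepipe as tp

    # "result" stands in for the output of a TracePipe-tracked pipeline (hypothetical).
    history = tp.explain_value(row_id=123, column="price", df=result)
    print(history)  # __repr__ prints the last 5 change events
    if history.became_null_at is not None:
        print(f"price went null at step {history.became_null_at} via {history.became_null_by}")

    # Which operations turned non-null values into nulls, plus sample row ids.
    nulls = tp.null_analysis("email", result)
    if nulls.has_untracked_nulls:
        print("some nulls were introduced outside of tracked operations")

    # Per-operation change counts for a single column.
    summary = tp.column_changes_summary("price", result)
    print(summary["changes_by_operation"], summary["unique_rows_modified"])
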
@@ -156,12 +156,17 @@ def _get_groups_summary(ctx) -> list[dict]:
      groups = []
      for mapping in ctx.store.aggregation_mappings:
          for group_key, row_ids in mapping.membership.items():
-             is_count_only = isinstance(row_ids, int)
+             # Count-only groups are stored as [-count] (list with one negative element)
+             is_count_only = len(row_ids) == 1 and row_ids[0] < 0
+             if is_count_only:
+                 row_count = abs(row_ids[0])
+             else:
+                 row_count = len(row_ids)
              groups.append(
                  {
                      "key": str(group_key),
                      "column": mapping.group_column,
-                     "row_count": row_ids if is_count_only else len(row_ids),
+                     "row_count": row_count,
                      "is_count_only": is_count_only,
                      "row_ids": [] if is_count_only else row_ids[:100],  # First 100 only
                      "agg_functions": mapping.agg_functions,
@@ -1059,9 +1064,13 @@ document.addEventListener('DOMContentLoaded', () => {
  """


- def save(filepath: str) -> None:
+ def save(filepath: str, title: str = "TracePipe Dashboard") -> None:
      """
      Save interactive lineage report as HTML.
+
+     Args:
+         filepath: Path to save the HTML file
+         title: Title for the report (shown in browser tab and header)
      """
      ctx = get_context()

@@ -1073,7 +1082,9 @@ def save(filepath: str) -> None:
      row_index = _build_row_index(ctx)

      # Total registered rows (approximate)
-     total_registered = ctx.row_manager.next_row_id if hasattr(ctx.row_manager, "next_row_id") else 0
+     total_registered = (
+         ctx.row_manager._next_row_id if hasattr(ctx.row_manager, "_next_row_id") else 0
+     )

      # Identify Suggested Rows for UX
      suggested_rows = {"dropped": [], "modified": [], "survivors": []}
@@ -1181,13 +1192,16 @@ def save(filepath: str) -> None:
  </div>
  """

+     # Escape title for HTML
+     escaped_title = html.escape(title)
+
      html_content = f"""
  <!DOCTYPE html>
  <html lang="en">
  <head>
      <meta charset="utf-8">
      <meta name="viewport" content="width=device-width, initial-scale=1">
-     <title>TracePipe Dashboard</title>
+     <title>{escaped_title}</title>
      {CSS}
  </head>
  <body>
@@ -1217,7 +1231,7 @@ def save(filepath: str) -> None:
  <div class="main-content">
      <!-- Top Bar -->
      <div class="top-bar">
-         <div class="page-title">Data Lineage Report</div>
+         <div class="page-title">{escaped_title}</div>
          <div class="search-wrapper">
              <i class="search-icon-abs">🔍</i>
              <input type="text" id="globalSearch" class="search-input"
@@ -1236,7 +1250,8 @@ def save(filepath: str) -> None:
          </div>
          <div class="card">
              <h3>Retention</h3>
-             <div class="metric-value">{(final_rows / initial_rows * 100) if initial_rows else 0:.1f}%</div>
+             <div class="metric-value">{
+                 (final_rows / initial_rows * 100) if initial_rows else 0:.1f}%</div>
              <div class="metric-sub">{_format_number(final_rows)} of {
                  _format_number(initial_rows)
              } rows</div>
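
Beyond the new value_provenance module, the remaining hunks change two behaviors: _get_groups_summary() now decodes count-only aggregation groups from a one-element [-count] list instead of a bare int, and save() gains an optional, HTML-escaped title used for both the browser tab and the page header (it also switches to the row manager's private _next_row_id attribute for the registered-row total). A combined sketch follows, with made-up membership values, a hypothetical decode_membership() helper mirroring the new logic, and the assumption that save() is imported from the same reporting module as in 0.2.0 (its path is not shown in this diff).

    # Count-only groups: a one-element list holding the negated row count.
    full_group = [101, 102, 107]   # member row ids were retained
    count_only = [-3]              # only the fact that 3 rows existed was kept

    def decode_membership(row_ids: list[int]) -> tuple[bool, int]:
        is_count_only = len(row_ids) == 1 and row_ids[0] < 0
        return is_count_only, (abs(row_ids[0]) if is_count_only else len(row_ids))

    assert decode_membership(full_group) == (False, 3)
    assert decode_membership(count_only) == (True, 3)

    # save() keeps its old one-argument form; title defaults to "TracePipe Dashboard".
    save("lineage_report.html")
    save("lineage_report.html", title="Orders Pipeline - March Run")
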