tracepipe 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tracepipe/__init__.py ADDED
@@ -0,0 +1,110 @@
+ # tracepipe/__init__.py
+ """
+ TracePipe: Row-Level Data Lineage Tracking
+
+ Track every row, every change, every step in your pandas pipelines.
+
+ Quick Start:
+     import tracepipe
+     import pandas as pd
+
+     tracepipe.enable()
+     tracepipe.watch("age", "salary")  # Watch specific columns
+
+     df = pd.DataFrame({"age": [25, None, 35], "salary": [50000, 60000, None]})
+     df = df.dropna()
+     df["salary"] = df["salary"] * 1.1
+
+     # Query lineage
+     row = tracepipe.explain(0)  # What happened to row 0?
+     print(row.history())
+
+     dropped = tracepipe.dropped_rows()  # Which rows were dropped?
+     print(dropped)
+
+ Features:
+     - Row-level tracking: Know exactly which rows were dropped and why
+     - Cell-level diffs: See before/after values for watched columns
+     - Aggregation lineage: Trace back from grouped results to source rows
+     - Zero-copy design: Minimal overhead on your pipelines
+     - Safe instrumentation: Never crashes your code
+
+ See IMPLEMENTATION_PLAN_v5.md for full documentation.
+ """
+
+ from .api import (
+     # Core control
+     enable,
+     disable,
+     reset,
+     configure,
+     # Column watching
+     watch,
+     watch_all,
+     unwatch,
+     clear_watch,
+     # Convenience functions
+     register,
+     stage,
+     # Query API
+     explain,
+     explain_many,
+     explain_group,
+     dropped_rows,
+     alive_rows,
+     mass_updates,
+     steps,
+     aggregation_groups,
+     # Export
+     export_json,
+     export_arrow,
+     stats,
+     # Result classes
+     RowLineageResult,
+     GroupLineageResult,
+ )
+ from .core import TracePipeConfig
+
+ # Export protocols for custom backend implementers
+ from .storage.base import LineageBackend, RowIdentityStrategy
+ from .visualization.html_export import save
+
+ __version__ = "0.2.0"
+
+ __all__ = [
+     # Core API
+     "enable",
+     "disable",
+     "reset",
+     "configure",
+     "watch",
+     "watch_all",
+     "unwatch",
+     "clear_watch",
+     "register",
+     "stage",
+     # Query API
+     "explain",
+     "explain_many",
+     "explain_group",
+     "dropped_rows",
+     "alive_rows",
+     "mass_updates",
+     "steps",
+     "aggregation_groups",
+     # Export
+     "export_json",
+     "export_arrow",
+     "stats",
+     "save",
+     # Configuration
+     "TracePipeConfig",
+     # Result classes
+     "RowLineageResult",
+     "GroupLineageResult",
+     # Protocols (for custom backends)
+     "LineageBackend",
+     "RowIdentityStrategy",
+     # Version
+     "__version__",
+ ]
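
Taken together, the exports above support a short end-to-end run. The sketch below is illustrative only (the data and group keys are hypothetical, and it assumes the pandas instrumentation records dropna() and groupby() as steps, as the docstring's feature list describes):

    import pandas as pd
    import tracepipe

    tracepipe.enable().watch("amount")

    df = pd.DataFrame({"region": ["EU", "EU", "US"], "amount": [10.0, None, 30.0]})
    df = df.dropna()                                # one row dropped
    totals = df.groupby("region")["amount"].sum()   # aggregation lineage

    print(tracepipe.dropped_rows())                 # IDs removed by dropna()
    for key in tracepipe.aggregation_groups():      # trace grouped results back
        print(tracepipe.explain_group(key))
    tracepipe.disable()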
tracepipe/api.py ADDED
@@ -0,0 +1,563 @@
+ # tracepipe/api.py
+ """
+ Public API for TracePipe.
+ """
+
+ from __future__ import annotations
+
+ import sys
+ import types
+ from dataclasses import fields
+
+ import pandas as pd
+
+ from .context import TracePipeContext, get_context, reset_context, set_context
+ from .core import LineageGaps, TracePipeConfig
+ from .instrumentation.pandas_inst import instrument_pandas, uninstrument_pandas
+ from .storage.base import LineageBackend, RowIdentityStrategy
+
+
+ def _get_module() -> types.ModuleType:
+     """Get the tracepipe module for fluent chaining."""
+     return sys.modules["tracepipe"]
+
+
+ def enable(
+     config: TracePipeConfig | None = None,
+     auto_watch: bool = False,
+     backend: LineageBackend | None = None,
+     identity: RowIdentityStrategy | None = None,
+ ) -> types.ModuleType:
+     """
+     Enable TracePipe lineage tracking.
+
+     Args:
+         config: Optional configuration
+         auto_watch: If True, automatically watch columns with nulls
+         backend: Optional custom storage backend (default: InMemoryLineageStore)
+         identity: Optional custom row identity strategy (default: PandasRowIdentity)
+
+     Returns:
+         The tracepipe module for fluent chaining.
+
+     Examples:
+         # Basic usage (pandas + in-memory)
+         tracepipe.enable()
+
+         # Fluent chaining
+         tracepipe.enable().watch("age", "salary")
+
+         # With SQLite persistence (v2.1+)
+         from tracepipe.storage.sqlite_backend import SQLiteLineageStore
+         tracepipe.enable(backend=SQLiteLineageStore(config, "lineage.db"))
+
+         # With Polars support (v2.1+)
+         from tracepipe.storage.polars_identity import PolarsRowIdentity
+         tracepipe.enable(identity=PolarsRowIdentity(config))
+     """
+     # Create context with custom backends if provided
+     if backend is not None or identity is not None:
+         ctx = TracePipeContext(config=config, backend=backend, identity=identity)
+         set_context(ctx)
+     else:
+         ctx = get_context()
+         if config:
+             ctx.config = config
+
+     if auto_watch:
+         ctx.config.auto_watch = True
+
+     if not ctx.enabled:
+         instrument_pandas()
+         ctx.enabled = True
+
+     return _get_module()
+
+
+ def disable() -> types.ModuleType:
+     """
+     Disable TracePipe and restore original pandas methods.
+
+     Note:
+         This stops tracking but preserves lineage data collected so far.
+         You can still query explain(), dropped_rows(), etc. after disabling.
+         To clear all data, use reset() instead.
+
+     Returns:
+         The tracepipe module for fluent chaining.
+     """
+     ctx = get_context()
+
+     if ctx.enabled:
+         uninstrument_pandas()
+         # Call cleanup if backend supports it
+         if hasattr(ctx.store, "_cleanup_spillover"):
+             ctx.store._cleanup_spillover()
+         ctx.enabled = False
+
+     return _get_module()
+
+
+ def reset() -> types.ModuleType:
+     """
+     Reset all tracking state for the current thread.
+
+     This clears ALL lineage data, steps, watched columns, and row registrations.
+     If tracking was enabled, it will be re-enabled with a fresh context.
+
+     Use this when:
+         - Starting fresh in a notebook cell
+         - Running multiple independent analyses
+         - Testing
+
+     Returns:
+         The tracepipe module for fluent chaining.
+     """
+     ctx = get_context()
+     was_enabled = ctx.enabled
+
+     if was_enabled:
+         uninstrument_pandas()
+
+     reset_context()
+
+     if was_enabled:
+         # Re-enable with fresh context
+         enable()
+
+     return _get_module()
+
+
+ def configure(**kwargs) -> types.ModuleType:
+     """
+     Update configuration.
+
+     Args:
+         **kwargs: Configuration options to update. Valid keys are:
+             - max_diffs_in_memory: Maximum diffs before spilling to disk
+             - max_diffs_per_step: Threshold for mass update detection
+             - max_group_membership_size: Threshold for count-only groups
+             - strict_mode: Raise exceptions on tracking errors
+             - auto_watch: Auto-watch columns with null values
+             - auto_watch_null_threshold: Null ratio threshold for auto-watch
+             - spillover_dir: Directory for spilled data
+             - use_hidden_column: Use hidden column for row tracking
+             - warn_on_duplicate_index: Warn on duplicate DataFrame index
+             - cleanup_spillover_on_disable: Clean up spilled files on disable
+
+     Returns:
+         The tracepipe module for fluent chaining.
+
+     Raises:
+         ValueError: If an invalid configuration key is provided.
+
+     Examples:
+         tracepipe.configure(max_diffs_per_step=1000)
+         tracepipe.enable().configure(strict_mode=True).watch("amount")
+     """
+     ctx = get_context()
+
+     # Validate keys against dataclass fields
+     valid_keys = {f.name for f in fields(TracePipeConfig)}
+     invalid_keys = set(kwargs.keys()) - valid_keys
+     if invalid_keys:
+         raise ValueError(
+             f"Invalid configuration key(s): {invalid_keys}. "
+             f"Valid keys are: {sorted(valid_keys)}"
+         )
+
+     for key, value in kwargs.items():
+         setattr(ctx.config, key, value)
+
+     return _get_module()
+
+
+ def watch(*columns: str) -> types.ModuleType:
+     """
+     Add columns to watch for cell-level changes.
+
+     Args:
+         *columns: Column names to watch.
+
+     Returns:
+         The tracepipe module for fluent chaining.
+
+     Examples:
+         tracepipe.watch("age", "salary")
+         tracepipe.enable().watch("amount").watch("price")
+     """
+     ctx = get_context()
+     ctx.watched_columns.update(columns)
+     return _get_module()
+
+
+ def watch_all(df: pd.DataFrame) -> types.ModuleType:
+     """
+     Watch all columns in a DataFrame.
+
+     Args:
+         df: DataFrame whose columns to watch.
+
+     Returns:
+         The tracepipe module for fluent chaining.
+
+     Examples:
+         tracepipe.watch_all(df)
+     """
+     ctx = get_context()
+     ctx.watched_columns.update(df.columns.tolist())
+     return _get_module()
+
+
+ def unwatch(*columns: str) -> types.ModuleType:
+     """
+     Remove columns from the watch list.
+
+     Args:
+         *columns: Column names to stop watching.
+
+     Returns:
+         The tracepipe module for fluent chaining.
+     """
+     ctx = get_context()
+     ctx.watched_columns.difference_update(columns)
+     return _get_module()
+
+
+ def clear_watch() -> types.ModuleType:
+     """
+     Clear all watched columns.
+
+     Returns:
+         The tracepipe module for fluent chaining.
+
+     Examples:
+         tracepipe.clear_watch().watch("new_column")
+     """
+     ctx = get_context()
+     ctx.watched_columns.clear()
+     return _get_module()
+
+
+ def register(df: pd.DataFrame) -> types.ModuleType:
+     """
+     Manually register a DataFrame for tracking.
+
+     Use this for DataFrames created before enable() was called.
+
+     Returns:
+         The tracepipe module for fluent chaining.
+     """
+     ctx = get_context()
+     if ctx.enabled:
+         ctx.row_manager.register(df)
+     return _get_module()
+
+
+ def stage(name: str):
+     """Context manager for naming pipeline stages."""
+
+     class StageContext:
+         def __init__(self, stage_name: str):
+             self.stage_name = stage_name
+             self.previous_stage = None
+
+         def __enter__(self):
+             ctx = get_context()
+             self.previous_stage = ctx.current_stage
+             ctx.current_stage = self.stage_name
+             return self
+
+         def __exit__(self, *args):
+             ctx = get_context()
+             ctx.current_stage = self.previous_stage
+
+     return StageContext(name)
+
+
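+ # Illustrative usage of stage() (comment only, not part of the module's
+ # executable code): because __exit__ restores previous_stage, stages nest
+ # cleanly and steps recorded inside carry the innermost stage name.
+ #
+ #     with tracepipe.stage("clean"):
+ #         df = df.dropna()              # recorded with stage="clean"
+ #         with tracepipe.stage("impute"):
+ #             df = df.fillna(0)         # recorded with stage="impute"
+ #     # current_stage is restored to its prior value here
+
+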
+ # === QUERY API ===
+
+
+ class RowLineageResult:
+     """Query result for a single row's journey."""
+
+     def __init__(self, row_id: int, ctx: TracePipeContext):
+         self.row_id = row_id
+         self._ctx = ctx
+         self._history = ctx.store.get_row_history(row_id)
+         self._gaps = ctx.store.compute_gaps(row_id)
+
+     @property
+     def is_alive(self) -> bool:
+         """Return True if row was not dropped."""
+         return not any(h["change_type"] == "DROPPED" for h in self._history)
+
+     @property
+     def dropped_at(self) -> str | None:
+         """Return operation name where row was dropped, or None."""
+         for h in self._history:
+             if h["change_type"] == "DROPPED":
+                 return h["operation"]
+         return None
+
+     def cell_history(self, column: str) -> list[dict]:
+         """Get history for a specific column."""
+         return [h for h in self._history if h["col"] == column]
+
+     def history(self) -> list[dict]:
+         """Get full history."""
+         return self._history
+
+     @property
+     def gaps(self) -> LineageGaps:
+         """Get lineage gaps."""
+         return self._gaps
+
+     @property
+     def is_fully_tracked(self) -> bool:
+         """Return True if no gaps in lineage."""
+         return self._gaps.is_fully_tracked
+
+     def to_dict(self) -> dict:
+         """Export to dictionary."""
+         return {
+             "row_id": self.row_id,
+             "is_alive": self.is_alive,
+             "dropped_at": self.dropped_at,
+             "is_fully_tracked": self.is_fully_tracked,
+             "gaps_summary": self._gaps.summary(),
+             "history": self._history,
+         }
+
+     def __repr__(self):
+         status = "alive" if self.is_alive else f"dropped at {self.dropped_at}"
+         return f"<RowLineage row_id={self.row_id} {status} events={len(self._history)}>"
+
+
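+ # Illustrative query sketch (comment only): inspecting a single row after a
+ # pipeline has run. The event dicts use the keys visible above ("change_type",
+ # "operation", "col"); any further keys depend on the backend's
+ # get_row_history() implementation.
+ #
+ #     row = tracepipe.explain(7)
+ #     if not row.is_alive:
+ #         print(f"row 7 dropped by {row.dropped_at}")
+ #     for event in row.cell_history("salary"):
+ #         print(event.get("operation"), event["col"])
+
+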
+ class GroupLineageResult:
+     """Query result for an aggregation group."""
+
+     def __init__(self, group_key: str, ctx: TracePipeContext):
+         self.group_key = group_key
+         self._ctx = ctx
+         self._info = ctx.store.get_group_members(group_key)
+
+     @property
+     def row_ids(self) -> list[int]:
+         """Get list of row IDs in this group."""
+         return self._info["row_ids"] if self._info else []
+
+     @property
+     def row_count(self) -> int:
+         """Get number of rows in this group."""
+         return self._info["row_count"] if self._info else 0
+
+     @property
+     def is_count_only(self) -> bool:
+         """
+         True if group exceeded max_group_membership_size threshold.
+
+         When True, row_ids will be empty and only row_count is available.
+         """
+         return self._info.get("is_count_only", False) if self._info else False
+
+     @property
+     def group_column(self) -> str | None:
+         """Get the column used for grouping."""
+         return self._info["group_column"] if self._info else None
+
+     @property
+     def aggregation_functions(self) -> dict[str, str]:
+         """Get the aggregation functions applied."""
+         return self._info["agg_functions"] if self._info else {}
+
+     def get_contributing_rows(self, limit: int = 100) -> list[RowLineageResult]:
+         """
+         Get lineage for contributing rows.
+
+         Returns empty list if is_count_only is True.
+         """
+         if self.is_count_only:
+             return []
+         return [explain(row_id) for row_id in self.row_ids[:limit]]
+
+     def to_dict(self) -> dict:
+         """Export to dictionary."""
+         return {
+             "group_key": self.group_key,
+             "group_column": self.group_column,
+             "row_count": self.row_count,
+             "row_ids": self.row_ids,
+             "is_count_only": self.is_count_only,
+             "aggregation_functions": self.aggregation_functions,
+         }
+
+     def __repr__(self):
+         suffix = " (count only)" if self.is_count_only else ""
+         return f"<GroupLineage key='{self.group_key}' rows={self.row_count}{suffix}>"
+
+
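+ # Illustrative group-lineage sketch (comment only; the group key format is
+ # hypothetical): check is_count_only before relying on row_ids, since groups
+ # larger than max_group_membership_size store only a count.
+ #
+ #     group = tracepipe.explain_group("EU")
+ #     if group.is_count_only:
+ #         print(f"{group.row_count} rows (membership not stored)")
+ #     else:
+ #         for row in group.get_contributing_rows(limit=10):
+ #             print(row)
+
+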
+ def explain(row_id: int) -> RowLineageResult:
+     """Get lineage for a specific row."""
+     ctx = get_context()
+     return RowLineageResult(row_id, ctx)
+
+
+ def explain_many(row_ids: list[int]) -> list[RowLineageResult]:
+     """
+     Get lineage for multiple rows.
+
+     Args:
+         row_ids: List of row IDs to explain.
+
+     Returns:
+         List of RowLineageResult objects.
+
+     Examples:
+         results = tracepipe.explain_many([0, 1, 2])
+         for row in results:
+             print(row.is_alive, row.dropped_at)
+     """
+     ctx = get_context()
+     return [RowLineageResult(row_id, ctx) for row_id in row_ids]
+
+
+ def explain_group(group_key: str) -> GroupLineageResult:
+     """Get lineage for an aggregation group."""
+     ctx = get_context()
+     return GroupLineageResult(group_key, ctx)
+
+
+ def dropped_rows(by_step: bool = False) -> list[int] | dict[str, int]:
+     """
+     Get dropped row information.
+
+     Args:
+         by_step: If False (default), return list of dropped row IDs.
+             If True, return dict mapping operation names to drop counts.
+
+     Returns:
+         List of row IDs if by_step=False, or dict of {operation: count} if by_step=True.
+
+     Examples:
+         # Get all dropped row IDs
+         dropped = tracepipe.dropped_rows()
+
+         # Get counts by operation
+         by_op = tracepipe.dropped_rows(by_step=True)
+         # {'DataFrame.dropna': 5, 'DataFrame.query': 3}
+     """
+     ctx = get_context()
+     if by_step:
+         return ctx.store.get_dropped_by_step()
+     return ctx.store.get_dropped_rows()
+
+
+ def alive_rows() -> list[int]:
+     """
+     Get all row IDs that are still alive (not dropped).
+
+     Returns:
+         List of row IDs that have not been dropped.
+
+     Examples:
+         alive = tracepipe.alive_rows()
+         print(f"{len(alive)} rows survived the pipeline")
+     """
+     ctx = get_context()
+     all_registered = set(ctx.row_manager.all_registered_ids())
+     dropped = set(ctx.store.get_dropped_rows())
+     return sorted(all_registered - dropped)
+
+
+ def mass_updates() -> list[dict]:
+     """Get operations that exceeded the cell diff threshold."""
+     ctx = get_context()
+     return [
+         {
+             "step_id": s.step_id,
+             "operation": s.operation,
+             "rows_affected": s.rows_affected,
+             "stage": s.stage,
+         }
+         for s in ctx.store.steps
+         if s.is_mass_update
+     ]
+
+
+ def steps() -> list[dict]:
+     """Get all tracked steps."""
+     ctx = get_context()
+     return [
+         {
+             "step_id": s.step_id,
+             "operation": s.operation,
+             "stage": s.stage,
+             "input_shape": s.input_shape,
+             "output_shape": s.output_shape,
+             "completeness": s.completeness.name,
+             "is_mass_update": s.is_mass_update,
+             "timestamp": s.timestamp,
+             "code_file": s.code_file,
+             "code_line": s.code_line,
+         }
+         for s in ctx.store.steps
+     ]
+
+
+ def aggregation_groups() -> list[str]:
+     """List all tracked aggregation groups."""
+     ctx = get_context()
+     groups = []
+     for mapping in ctx.store.aggregation_mappings:
+         groups.extend(mapping.membership.keys())
+     return groups
+
+
+ # === EXPORT ===
+
+
+ def export_json(filepath: str) -> None:
+     """Export lineage to JSON file."""
+     ctx = get_context()
+     with open(filepath, "w") as f:
+         f.write(ctx.store.to_json())
+
+
+ def export_arrow(filepath: str) -> None:
+     """
+     Export lineage to Parquet file.
+
+     Requires pyarrow to be installed.
+
+     Args:
+         filepath: Path to write the Parquet file.
+
+     Raises:
+         ImportError: If pyarrow is not installed.
+     """
+     try:
+         import pyarrow.parquet as pq
+     except ImportError:
+         raise ImportError(
+             "pyarrow is required for Arrow/Parquet export. "
+             "Install it with: pip install tracepipe[arrow] or pip install pyarrow"
+         ) from None
+
+     ctx = get_context()
+     table = ctx.store.to_arrow()
+     pq.write_table(table, filepath)
+
+
+ def stats() -> dict:
+     """Get tracking statistics."""
+     ctx = get_context()
+     return {
+         "enabled": ctx.enabled,
+         "total_steps": len(ctx.store.steps),
+         "total_diffs": ctx.store.total_diff_count,
+         "in_memory_diffs": ctx.store.diff_count,
+         "spilled_files": len(ctx.store.spilled_files),
+         "watched_columns": list(ctx.watched_columns),
+         "aggregation_groups": len(ctx.store.aggregation_mappings),
+     }
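
A closing sketch of the reporting calls as they might appear at the end of a pipeline (file names are illustrative; export_arrow() raises ImportError unless pyarrow is installed):

    import tracepipe

    s = tracepipe.stats()
    print(f"{s['total_steps']} steps, {s['total_diffs']} cell diffs recorded")

    tracepipe.export_json("lineage.json")       # full lineage as JSON
    tracepipe.export_arrow("lineage.parquet")   # Parquet via pyarrow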