tracepipe 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tracepipe/core.py CHANGED
@@ -1,11 +1,19 @@
  # tracepipe/core.py
  """
  Core types, enums, and configuration for TracePipe.
+
+ Design Principles:
+ 1. Pandas Execution is Authoritative: TracePipe never re-implements operations
+ 2. Trust Over Features: Mark PARTIAL when uncertain; never lie about completeness
+ 3. Don't Touch User Data: No DataFrame mutation by default
+ 4. Modes for Adoption: CI mode (fast) vs Debug mode (deep)
+ 5. NumPy-First: Vectorized operations; no Python loops over millions of rows
  """

  import os
+ import time
  from dataclasses import dataclass, field
- from enum import IntEnum
+ from enum import Enum, IntEnum
  from typing import Any, Optional


@@ -22,9 +30,9 @@ class CompletenessLevel(IntEnum):
      """
      Indicates how completely an operation's internals are tracked.

-     FULL: Completely tracked (e.g., fillna, dropna)
-     PARTIAL: Output tracked, internals unknown (e.g., apply, pipe)
-     UNKNOWN: Lineage reset (e.g., merge, concat)
+     FULL: Complete lineage captured (dropna, drop_duplicates, boolean indexing)
+     PARTIAL: Output tracked, internals approximate (query with @var, merge in CI mode)
+     UNKNOWN: Operation not instrumented (future: uninstrumented custom ops)
      """

      FULL = 0
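
Since CompletenessLevel is an IntEnum with FULL = 0 ordered below PARTIAL and UNKNOWN, code consuming step events can flag anything weaker than FULL with a plain numeric comparison. A minimal sketch (the flag_gaps helper is hypothetical, written against the StepEvent records added later in this diff):

    from tracepipe.core import CompletenessLevel

    def flag_gaps(steps):
        # Hypothetical helper: surface steps whose lineage is not fully captured.
        # IntEnum ordering makes "weaker than FULL" a simple comparison.
        return [s for s in steps if s.completeness > CompletenessLevel.FULL]
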
@@ -32,49 +40,157 @@ class CompletenessLevel(IntEnum):
      UNKNOWN = 2


+ class TracePipeMode(Enum):
+     """TracePipe operating modes."""
+
+     CI = "ci"        # Fast: stats, drops, contracts
+     DEBUG = "debug"  # Deep: merge provenance, ghost values, cell history
+
+
+ class IdentityStorage(Enum):
+     """Row identity storage strategies."""
+
+     REGISTRY = "registry"  # Default: WeakKeyDictionary, no data mutation
+     COLUMN = "column"      # Opt-in: hidden column (for edge cases)
+     ATTRS = "attrs"        # Alternative: df.attrs token
+
+
  @dataclass
  class TracePipeConfig:
      """Configuration with sensible defaults."""

+     # Memory limits
      max_diffs_in_memory: int = 500_000
      max_diffs_per_step: int = 100_000
-     max_group_membership_size: int = 100_000  # Store count-only above this threshold
+     max_group_membership_size: int = 100_000
+
+     # Behavior options
      strict_mode: bool = False
      auto_watch: bool = False
      auto_watch_null_threshold: float = 0.01
      spillover_dir: str = ".tracepipe"
-     use_hidden_column: bool = False
      warn_on_duplicate_index: bool = True
      cleanup_spillover_on_disable: bool = True

+     # Mode system
+     mode: TracePipeMode = TracePipeMode.CI
+
+     # Identity storage (default to registry, not column)
+     identity_storage: IdentityStorage = IdentityStorage.REGISTRY
+
+     # Feature overrides (None = use mode default)
+     merge_provenance: Optional[bool] = None
+     ghost_row_values: Optional[bool] = None
+     cell_history: Optional[bool] = None
+
+     # Ghost row limits
+     max_ghost_rows: int = 10_000
+
+     @property
+     def should_capture_merge_provenance(self) -> bool:
+         if self.merge_provenance is not None:
+             return self.merge_provenance
+         return self.mode == TracePipeMode.DEBUG
+
+     @property
+     def should_capture_ghost_values(self) -> bool:
+         if self.ghost_row_values is not None:
+             return self.ghost_row_values
+         return self.mode == TracePipeMode.DEBUG
+
+     @property
+     def should_capture_cell_history(self) -> bool:
+         if self.cell_history is not None:
+             return self.cell_history
+         return self.mode == TracePipeMode.DEBUG
+
+     @property
+     def use_hidden_column(self) -> bool:
+         return self.identity_storage == IdentityStorage.COLUMN
+
+     @property
+     def use_attrs_token(self) -> bool:
+         return self.identity_storage == IdentityStorage.ATTRS
+
      @classmethod
      def from_env(cls) -> "TracePipeConfig":
          """Create config from environment variables."""
+         mode_str = os.environ.get("TRACEPIPE_MODE", "ci")
          return cls(
+             mode=TracePipeMode.DEBUG if mode_str == "debug" else TracePipeMode.CI,
              max_diffs_in_memory=int(os.environ.get("TRACEPIPE_MAX_DIFFS", 500_000)),
              max_diffs_per_step=int(os.environ.get("TRACEPIPE_MAX_DIFFS_PER_STEP", 100_000)),
              strict_mode=os.environ.get("TRACEPIPE_STRICT", "0") == "1",
              auto_watch=os.environ.get("TRACEPIPE_AUTO_WATCH", "0") == "1",
-             use_hidden_column=os.environ.get("TRACEPIPE_HIDDEN_COL", "0") == "1",
          )


  @dataclass
- class StepMetadata:
-     """Metadata for a single pipeline step."""
+ class StepEvent:
+     """
+     Stable schema for pipeline step events.
+
+     This schema is designed to be stable across versions.
+     New fields should be added as Optional with defaults.
+     """

      step_id: int
      operation: str
-     stage: Optional[str]
-     timestamp: float
-     code_file: Optional[str]
-     code_line: Optional[int]
-     params: dict[str, Any]
-     input_shape: Optional[tuple]
-     output_shape: Optional[tuple]
+     timestamp: float = field(default_factory=time.time)
+
+     # Context
+     stage: Optional[str] = None
+     code_file: Optional[str] = None
+     code_line: Optional[int] = None
+
+     # Shape tracking
+     input_shape: Optional[tuple[int, ...]] = None
+     output_shape: Optional[tuple[int, ...]] = None
+
+     # Parameters (operation-specific)
+     params: dict[str, Any] = field(default_factory=dict)
+
+     # Completeness
+     completeness: CompletenessLevel = CompletenessLevel.FULL
+
+     # Mass update tracking
      is_mass_update: bool = False
      rows_affected: int = 0
-     completeness: CompletenessLevel = CompletenessLevel.FULL
+
+     # Error tracking
+     error: Optional[str] = None
+     error_type: Optional[str] = None
+
+     @property
+     def code_location(self) -> Optional[str]:
+         """Human-readable code location."""
+         if self.code_file and self.code_line:
+             return f"{self.code_file}:{self.code_line}"
+         return None
+
+     def to_dict(self) -> dict[str, Any]:
+         """Serialize to dict (for JSON export)."""
+         return {
+             "step_id": self.step_id,
+             "operation": self.operation,
+             "timestamp": self.timestamp,
+             "stage": self.stage,
+             "code_location": self.code_location,
+             "code_file": self.code_file,
+             "code_line": self.code_line,
+             "input_shape": self.input_shape,
+             "output_shape": self.output_shape,
+             "params": self.params,
+             "completeness": self.completeness.name,
+             "is_mass_update": self.is_mass_update,
+             "rows_affected": self.rows_affected,
+             "error": self.error,
+             "error_type": self.error_type,
+         }
+
+
+ # Backwards compatibility alias
+ StepMetadata = StepEvent


  @dataclass
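
The Optional override fields make the mode system composable: a CI run can switch on a single DEBUG-only feature without paying for the rest. A minimal sketch of the resolution order, using only the classes defined above (and assuming tracepipe.core is importable as laid out in this diff):

    import os
    from tracepipe.core import TracePipeConfig, TracePipeMode

    # Mode default: merge provenance is captured only in DEBUG mode.
    assert not TracePipeConfig(mode=TracePipeMode.CI).should_capture_merge_provenance

    # An explicit override beats the mode default (None means "use mode default").
    cfg = TracePipeConfig(mode=TracePipeMode.CI, merge_provenance=True)
    assert cfg.should_capture_merge_provenance

    # from_env() reads TRACEPIPE_MODE; any value other than "debug" falls back to CI.
    os.environ["TRACEPIPE_MODE"] = "debug"
    assert TracePipeConfig.from_env().mode is TracePipeMode.DEBUG
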
@@ -120,3 +236,44 @@ class LineageGaps:
              return f"1 step has limited visibility: {self.gaps[0].operation}"
          else:
              return f"{len(self.gaps)} steps have limited visibility"
+
+
+ @dataclass
+ class GhostRowInfo:
+     """Information about a dropped row."""
+
+     row_id: int
+     last_values: dict[str, Any]
+     dropped_by: str
+     dropped_step: int
+     original_position: int
+
+
+ @dataclass
+ class MergeMapping:
+     """
+     Array-based merge mapping (memory efficient).
+
+     Arrays are stored SORTED by out_rids to enable O(log n) lookup
+     via binary search instead of O(n) linear scan.
+     """
+
+     step_id: int
+     out_rids: Any           # numpy array, SORTED for binary search
+     left_parent_rids: Any   # numpy array, -1 for no match, same order as out_rids
+     right_parent_rids: Any  # numpy array, -1 for no match, same order as out_rids
+
+
+ @dataclass
+ class MergeStats:
+     """Merge statistics."""
+
+     left_rows: int
+     right_rows: int
+     result_rows: int
+     expansion_ratio: float
+     left_match_rate: float   # -1 if not computed
+     right_match_rate: float  # -1 if not computed
+     left_dup_rate: float     # -1 if not computed
+     right_dup_rate: float    # -1 if not computed
+     how: str
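
The O(log n) claim in MergeMapping's docstring comes from keeping out_rids sorted: a parent lookup is an np.searchsorted plus an equality check against the two parallel arrays. The actual lookup code is not part of this diff, so the helper below is only an illustrative sketch of the intended access pattern:

    import numpy as np

    def parents_of(mapping, out_rid):
        # Hypothetical helper (not in this diff): binary-search the sorted
        # out_rids, then index the parallel parent arrays at the same slot.
        idx = np.searchsorted(mapping.out_rids, out_rid)
        if idx == len(mapping.out_rids) or mapping.out_rids[idx] != out_rid:
            return (-1, -1)  # out_rid was not produced by this merge
        return (int(mapping.left_parent_rids[idx]),
                int(mapping.right_parent_rids[idx]))
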
tracepipe/debug.py ADDED
@@ -0,0 +1,325 @@
+ # tracepipe/debug.py
+ """
+ Debug namespace for TracePipe power users.
+
+ This module provides low-level introspection and raw access to lineage data.
+ For most use cases, prefer the top-level convenience API (check, trace, why, report).
+
+ Usage:
+     import tracepipe as tp
+
+     # Access debug inspector
+     dbg = tp.debug.inspect()
+     dbg.steps            # All recorded steps
+     dbg.dropped_rows()   # All dropped row IDs
+     dbg.explain_row(42)  # Raw row lineage
+     dbg.export("json")   # Export lineage data
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Any
+
+ import pandas as pd
+
+ from .context import get_context
+
+ if TYPE_CHECKING:
+     from .api import GroupLineageResult, RowLineageResult
+     from .core import StepEvent
+
+
+ @dataclass
+ class DebugInspector:
+     """
+     Debug inspector providing raw access to TracePipe internals.
+
+     This is the primary entry point for power users who need
+     low-level access to lineage data.
+     """
+
+     @property
+     def enabled(self) -> bool:
+         """True if TracePipe is currently enabled."""
+         return get_context().enabled
+
+     @property
+     def mode(self) -> str:
+         """Current mode: 'ci' or 'debug'."""
+         return get_context().config.mode.value
+
+     @property
+     def steps(self) -> list[StepEvent]:
+         """All recorded pipeline steps."""
+         return get_context().store.steps
+
+     @property
+     def watched_columns(self) -> set:
+         """Currently watched columns."""
+         return get_context().watched_columns.copy()
+
+     def watch(self, *columns: str) -> DebugInspector:
+         """
+         Add columns to watch for cell-level tracking.
+
+         Args:
+             *columns: Column names to watch.
+
+         Returns:
+             Self for chaining.
+         """
+         get_context().watched_columns.update(columns)
+         return self
+
+     @property
+     def total_diffs(self) -> int:
+         """Total number of diffs (including spilled)."""
+         return get_context().store.total_diff_count
+
+     @property
+     def in_memory_diffs(self) -> int:
+         """Number of diffs currently in memory."""
+         return get_context().store.diff_count
+
+     def dropped_rows(self, step_id: int | None = None) -> list[int]:
+         """
+         Get all dropped row IDs.
+
+         Args:
+             step_id: If provided, only return drops from this step.
+
+         Returns:
+             List of dropped row IDs.
+         """
+         return get_context().store.get_dropped_rows(step_id)
+
+     def dropped_by_operation(self) -> dict:
+         """Get count of dropped rows per operation."""
+         return get_context().store.get_dropped_by_step()
+
+     def alive_rows(self) -> list[int]:
+         """Get all row IDs that are still alive (not dropped)."""
+         ctx = get_context()
+         all_registered = set(ctx.row_manager.all_registered_ids())
+         dropped = set(ctx.store.get_dropped_rows())
+         return sorted(all_registered - dropped)
+
+     def explain_row(self, row_id: int) -> RowLineageResult:
+         """
+         Get lineage for a specific row.
+
+         Returns a RowLineageResult object with:
+         - row_id: int
+         - is_alive: bool
+         - dropped_at: Optional[str]
+         - history(): List[dict]
+         - cell_history(col): List[dict]
+         - to_dict(): dict
+         """
+         from .api import RowLineageResult
+
+         return RowLineageResult(row_id, get_context())
+
+     def explain_group(self, group_key: str) -> GroupLineageResult:
+         """Get aggregation group membership."""
+         from .api import GroupLineageResult
+
+         return GroupLineageResult(group_key, get_context())
+
+     def aggregation_groups(self) -> list[str]:
+         """List all tracked aggregation groups."""
+         ctx = get_context()
+         groups = []
+         for mapping in ctx.store.aggregation_mappings:
+             groups.extend(mapping.membership.keys())
+         return groups
+
+     def merge_stats(self, step_id: int | None = None) -> list[dict]:
+         """Get merge operation statistics."""
+         ctx = get_context()
+         stats_list = ctx.store.get_merge_stats(step_id)
+         return [
+             {
+                 "step_id": sid,
+                 "left_rows": s.left_rows,
+                 "right_rows": s.right_rows,
+                 "result_rows": s.result_rows,
+                 "expansion_ratio": s.expansion_ratio,
+                 "left_match_rate": s.left_match_rate,
+                 "right_match_rate": s.right_match_rate,
+                 "how": s.how,
+             }
+             for sid, s in stats_list
+         ]
+
+     def mass_updates(self) -> list[dict]:
+         """Get operations that exceeded cell diff threshold."""
+         ctx = get_context()
+         return [
+             {
+                 "step_id": s.step_id,
+                 "operation": s.operation,
+                 "rows_affected": s.rows_affected,
+                 "stage": s.stage,
+             }
+             for s in ctx.store.steps
+             if s.is_mass_update
+         ]
+
+     def ghost_rows(self, limit: int = 1000) -> pd.DataFrame:
+         """
+         Get dropped rows with their last-known values (DEBUG mode only).
+
+         Returns DataFrame with columns:
+         - __tp_row_id__: Original row ID
+         - __tp_dropped_by__: Operation that dropped the row
+         - [watched columns]: Last known values
+         """
+         ctx = get_context()
+         return ctx.row_manager.get_ghost_rows(limit=limit)
+
+     def stats(self) -> dict:
+         """Get comprehensive tracking statistics."""
+         ctx = get_context()
+         return {
+             "enabled": ctx.enabled,
+             "mode": ctx.config.mode.value,
+             "total_steps": len(ctx.store.steps),
+             "total_diffs": ctx.store.total_diff_count,
+             "in_memory_diffs": ctx.store.diff_count,
+             "spilled_files": len(ctx.store.spilled_files),
+             "watched_columns": list(ctx.watched_columns),
+             "aggregation_groups": len(ctx.store.aggregation_mappings),
+             "merge_mappings": len(ctx.store.merge_mappings),
+             "features": {
+                 "merge_provenance": ctx.config.should_capture_merge_provenance,
+                 "ghost_row_values": ctx.config.should_capture_ghost_values,
+                 "cell_history": ctx.config.should_capture_cell_history,
+             },
+         }
+
+     def export(self, format: str = "json", path: str | None = None) -> str | None:
+         """
+         Export lineage data.
+
+         Args:
+             format: "json" or "arrow"
+             path: File path. If None, returns JSON string (json format only).
+
+         Returns:
+             JSON string if path is None and format is "json", else None.
+         """
+         ctx = get_context()
+
+         if format == "json":
+             json_str = ctx.store.to_json()
+             if path:
+                 with open(path, "w") as f:
+                     f.write(json_str)
+                 return None
+             return json_str
+         elif format == "arrow":
+             if path is None:
+                 raise ValueError("path is required for arrow export")
+             try:
+                 import pyarrow.parquet as pq
+             except ImportError:
+                 raise ImportError(
+                     "pyarrow is required for Arrow export. "
+                     "Install with: pip install tracepipe[arrow]"
+                 ) from None
+             table = ctx.store.to_arrow()
+             pq.write_table(table, path)
+             return None
+         else:
+             raise ValueError(f"Unknown format: {format}. Use 'json' or 'arrow'.")
+
+     def register(self, df: pd.DataFrame) -> None:
+         """Manually register a DataFrame for tracking."""
+         ctx = get_context()
+         if ctx.enabled:
+             ctx.row_manager.register(df)
+
+     def get_row_ids(self, df: pd.DataFrame) -> Any | None:
+         """Get row IDs array for a DataFrame."""
+         ctx = get_context()
+         return ctx.row_manager.get_ids_array(df)
+
+     def __repr__(self) -> str:
+         ctx = get_context()
+         if not ctx.enabled:
+             return "<DebugInspector enabled=False>"
+         return (
+             f"<DebugInspector mode={ctx.config.mode.value} "
+             f"steps={len(ctx.store.steps)} "
+             f"diffs={ctx.store.total_diff_count}>"
+         )
+
+
+ def inspect() -> DebugInspector:
+     """
+     Get a debug inspector for TracePipe internals.
+
+     Returns:
+         DebugInspector with access to steps, diffs, and raw lineage data.
+
+     Example:
+         dbg = tp.debug.inspect()
+         print(dbg.steps)
+         print(dbg.dropped_rows())
+         dbg.export("json", "lineage.json")
+     """
+     return DebugInspector()
+
+
+ # Convenience aliases for common debug operations
+ def export_json(path: str) -> None:
+     """Export lineage to JSON file."""
+     inspect().export("json", path)
+
+
+ def export_arrow(path: str) -> None:
+     """Export lineage to Parquet file."""
+     inspect().export("arrow", path)
+
+
+ def find(
+     df: pd.DataFrame,
+     *,
+     where: dict | None = None,
+     predicate=None,
+     limit: int = 50,
+ ) -> list[int]:
+     """
+     Find row IDs matching a selector.
+
+     This is a debug utility for discovering row IDs that can be used
+     with trace() and why(). Row IDs are internal identifiers and should
+     not be persisted across sessions.
+
+     Args:
+         df: DataFrame to search
+         where: Exact match selector, e.g. {"status": "failed"}
+         predicate: Vector predicate (df -> boolean Series)
+         limit: Maximum number of IDs to return (default 50)
+
+     Returns:
+         List of internal row IDs (for use with trace/why row= parameter)
+
+     Example:
+         rids = tp.debug.find(df, where={"status": "failed"})
+         for rid in rids[:3]:
+             print(tp.trace(df, row=rid))
+     """
+     # Import here to avoid circular imports
+     from .convenience import _resolve_predicate, _resolve_where
+
+     ctx = get_context()
+
+     if where:
+         return _resolve_where(df, where, ctx, limit=limit)
+     elif predicate:
+         return _resolve_predicate(df, predicate, ctx, limit=limit)
+     else:
+         raise ValueError("Must provide 'where' or 'predicate'")
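
After a tracked run, the pieces in this new module compose into a short triage loop. A sketch, assuming TracePipe has already been enabled and the pipeline has executed; only calls shown in the docstrings above are used, and df stands for whatever DataFrame the pipeline produced:

    import tracepipe as tp

    dbg = tp.debug.inspect()
    print(dbg.stats())                 # mode, step/diff counts, active features

    # Which operations dropped rows, and how many?
    print(dbg.dropped_by_operation())

    # Pick out suspicious rows and replay their lineage.
    rids = tp.debug.find(df, where={"status": "failed"})
    for rid in rids[:3]:
        print(tp.trace(df, row=rid))

    tp.debug.export_json("lineage.json")  # persist for later inspection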