tracepipe 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tracepipe/context.py CHANGED
@@ -3,12 +3,16 @@
3
3
  Thread-safe context for TracePipe state.
4
4
 
5
5
  Each thread gets its own context via threading.local().
6
+
7
+ Modes:
8
+ - CI: Fast stats and drop tracking for production/CI use.
9
+ - DEBUG: Full provenance with merge origin tracking and ghost row values.
6
10
  """
7
11
 
8
12
  import threading
9
13
  from typing import Optional
10
14
 
11
- from .core import TracePipeConfig
15
+ from .core import TracePipeConfig, TracePipeMode
12
16
  from .storage.base import (
13
17
  LineageBackend,
14
18
  RowIdentityStrategy,
@@ -33,6 +37,10 @@ class TracePipeContext:
33
37
  Extensibility:
34
38
  - Pass custom `backend` for alternative storage (SQLite, Delta Lake)
35
39
  - Pass custom `identity` for alternative engines (Polars, Spark)
40
+
41
+ Mode System:
42
+ - CI mode (default): Fast stats, drop tracking, contracts
43
+ - DEBUG mode: Full merge provenance, ghost values, cell history
36
44
  """
37
45
 
38
46
  def __init__(
@@ -58,6 +66,18 @@ class TracePipeContext:
58
66
  # GroupBy state stack (supports nesting)
59
67
  self._groupby_stack: list[dict] = []
60
68
 
69
+ # === MODE CONVENIENCE PROPERTIES ===
70
+
71
+ @property
72
+ def is_debug_mode(self) -> bool:
73
+ """True if running in DEBUG mode."""
74
+ return self.config.mode == TracePipeMode.DEBUG
75
+
76
+ @property
77
+ def is_ci_mode(self) -> bool:
78
+ """True if running in CI mode (default)."""
79
+ return self.config.mode == TracePipeMode.CI
80
+
61
81
  def push_groupby(self, state: dict) -> None:
62
82
  """Push groupby state for nested operations."""
63
83
  self._groupby_stack.append(state)
tracepipe/contracts.py ADDED
@@ -0,0 +1,473 @@
1
+ # tracepipe/contracts.py
2
+ """
3
+ Data quality contracts for TracePipe pipelines.
4
+
5
+ Contracts provide a fluent API for defining data quality expectations
6
+ that can be checked against pipeline output.
7
+
8
+ Usage:
9
+ result = (tp.contract()
10
+ .expect_merge_expansion(max_ratio=2.0)
11
+ .expect_retention(min_rate=0.9)
12
+ .expect_no_nulls("user_id", "email")
13
+ .expect_unique("transaction_id")
14
+ .check(df))
15
+
16
+ # Fail fast
17
+ result.raise_if_failed()
18
+ """
19
+
20
+ from dataclasses import dataclass
21
+ from typing import Any, Callable, Optional
22
+
23
+ import pandas as pd
24
+
25
+ from .context import get_context
26
+
27
+
28
@dataclass
class ExpectationResult:
    """Outcome of a single expectation check."""

    # Identifier of the expectation kind (e.g. "retention", "unique").
    name: str
    # Whether the expectation held.
    passed: bool
    # Observed value that was compared against the expectation.
    actual_value: Any
    # Human-readable description of the expected condition.
    expected: str
    # One-line summary suitable for reports.
    message: str
37
+
38
+
39
@dataclass
class ContractResult:
    """Aggregate result of running a contract's expectations."""

    # True only if every expectation passed.
    passed: bool
    # Per-expectation outcomes, in registration order.
    expectations: list["ExpectationResult"]

    def __repr__(self) -> str:
        header = f"Contract {'PASSED' if self.passed else 'FAILED'}"
        body = [
            f" {'[OK]' if exp.passed else '[FAIL]'} {exp.name}: {exp.message}"
            for exp in self.expectations
        ]
        return "\n".join([header, *body])

    def raise_if_failed(self) -> None:
        """Raise ContractViolation if any expectation failed."""
        if self.passed:
            return
        raise ContractViolation([e for e in self.expectations if not e.passed])

    @property
    def failures(self) -> list["ExpectationResult"]:
        """Get list of failed expectations."""
        return [e for e in self.expectations if not e.passed]

    def to_dict(self) -> dict:
        """Export to dictionary."""
        exported = [
            {
                "name": e.name,
                "passed": e.passed,
                "actual_value": e.actual_value,
                "expected": e.expected,
                "message": e.message,
            }
            for e in self.expectations
        ]
        return {"passed": self.passed, "expectations": exported}
80
+
81
+
82
class ContractViolation(Exception):
    """Raised when contract expectations fail."""

    def __init__(self, failures: list["ExpectationResult"]):
        # Keep the failed expectations accessible for programmatic handling.
        self.failures = failures
        summary = "; ".join(f"{f.name}: {f.message}" for f in failures)
        super().__init__(f"Contract violated: {summary}")
89
+
90
+
91
+ class ContractBuilder:
92
+ """
93
+ Fluent API for defining data quality contracts.
94
+
95
+ Usage:
96
+ result = (tp.contract()
97
+ .expect_merge_expansion(max_ratio=2.0)
98
+ .expect_retention(min_rate=0.9)
99
+ .expect_no_nulls("user_id", "email")
100
+ .expect_unique("transaction_id")
101
+ .check(df))
102
+ """
103
+
104
+ def __init__(self):
105
+ self._expectations: list[Callable[[pd.DataFrame], ExpectationResult]] = []
106
+
107
+ def expect_merge_expansion(self, max_ratio: float = 2.0) -> "ContractBuilder":
108
+ """Fail if any merge expanded rows beyond ratio."""
109
+
110
+ def check(df: pd.DataFrame) -> ExpectationResult:
111
+ ctx = get_context()
112
+ stats = ctx.store.get_merge_stats()
113
+
114
+ # Extract MergeStats objects from (step_id, stats) tuples
115
+ violations = [s for _, s in stats if s.expansion_ratio > max_ratio]
116
+
117
+ if violations:
118
+ worst = max(violations, key=lambda s: s.expansion_ratio)
119
+ return ExpectationResult(
120
+ name="merge_expansion",
121
+ passed=False,
122
+ actual_value=worst.expansion_ratio,
123
+ expected=f"<= {max_ratio}",
124
+ message=f"Merge expanded {worst.expansion_ratio:.1f}x (max: {max_ratio}x)",
125
+ )
126
+
127
+ max_actual = max((s.expansion_ratio for _, s in stats), default=0)
128
+ return ExpectationResult(
129
+ name="merge_expansion",
130
+ passed=True,
131
+ actual_value=max_actual,
132
+ expected=f"<= {max_ratio}",
133
+ message=f"All merges within {max_ratio}x expansion limit",
134
+ )
135
+
136
+ self._expectations.append(check)
137
+ return self
138
+
139
+ def expect_retention(self, min_rate: float = 0.8) -> "ContractBuilder":
140
+ """Fail if too many rows were dropped."""
141
+
142
+ def check(df: pd.DataFrame) -> ExpectationResult:
143
+ ctx = get_context()
144
+ dropped = len(ctx.store.get_dropped_rows())
145
+ current = len(df)
146
+
147
+ # Estimate original count: use max input rows seen across all steps
148
+ # This handles multi-table pipelines where merges can expand rows
149
+ max_input_rows = 0
150
+ for step in ctx.store.steps:
151
+ # input_shape can be a single shape tuple (rows, cols) or
152
+ # a tuple of shapes for merge operations
153
+ if step.input_shape:
154
+ shape = step.input_shape
155
+ if isinstance(shape[0], tuple):
156
+ # Multiple inputs (e.g., merge) - take max of all inputs
157
+ for s in shape:
158
+ if isinstance(s, tuple) and len(s) >= 1:
159
+ max_input_rows = max(max_input_rows, s[0])
160
+ elif isinstance(shape[0], int):
161
+ max_input_rows = max(max_input_rows, shape[0])
162
+
163
+ if step.output_shape and isinstance(step.output_shape[0], int):
164
+ max_input_rows = max(max_input_rows, step.output_shape[0])
165
+
166
+ # Fall back to current + dropped if no steps recorded
167
+ if max_input_rows == 0:
168
+ max_input_rows = current + dropped
169
+
170
+ # Retention = final rows / peak rows seen
171
+ # This gives a sensible answer for multi-table pipelines
172
+ retention = current / max_input_rows if max_input_rows > 0 else 1.0
173
+
174
+ if retention < min_rate:
175
+ return ExpectationResult(
176
+ name="retention",
177
+ passed=False,
178
+ actual_value=retention,
179
+ expected=f">= {min_rate}",
180
+ message=f"Retention {retention:.1%} below minimum {min_rate:.1%}",
181
+ )
182
+
183
+ return ExpectationResult(
184
+ name="retention",
185
+ passed=True,
186
+ actual_value=retention,
187
+ expected=f">= {min_rate}",
188
+ message=f"Retention {retention:.1%} meets minimum {min_rate:.1%}",
189
+ )
190
+
191
+ self._expectations.append(check)
192
+ return self
193
+
194
+ def expect_no_nulls(self, *columns: str) -> "ContractBuilder":
195
+ """Fail if specified columns contain nulls."""
196
+
197
+ def check(df: pd.DataFrame) -> ExpectationResult:
198
+ null_cols = []
199
+ for col in columns:
200
+ if col in df.columns and df[col].isna().any():
201
+ null_count = df[col].isna().sum()
202
+ null_cols.append(f"{col}({null_count})")
203
+
204
+ if null_cols:
205
+ return ExpectationResult(
206
+ name="no_null",
207
+ passed=False,
208
+ actual_value=null_cols,
209
+ expected="no nulls",
210
+ message=f"Nulls found in: {', '.join(null_cols)}",
211
+ )
212
+
213
+ return ExpectationResult(
214
+ name="no_null",
215
+ passed=True,
216
+ actual_value=[],
217
+ expected="no nulls",
218
+ message=f"No nulls in {', '.join(columns)}",
219
+ )
220
+
221
+ self._expectations.append(check)
222
+ return self
223
+
224
+ # Alias for backwards compatibility
225
+ expect_no_null_in = expect_no_nulls
226
+
227
+ def expect_unique(self, *columns: str) -> "ContractBuilder":
228
+ """Fail if columns have duplicate values."""
229
+
230
+ def check(df: pd.DataFrame) -> ExpectationResult:
231
+ cols = [c for c in columns if c in df.columns]
232
+ if not cols:
233
+ return ExpectationResult(
234
+ name="unique",
235
+ passed=True,
236
+ actual_value=0,
237
+ expected="unique",
238
+ message="Columns not present",
239
+ )
240
+
241
+ dup_count = df.duplicated(subset=cols).sum()
242
+
243
+ if dup_count > 0:
244
+ return ExpectationResult(
245
+ name="unique",
246
+ passed=False,
247
+ actual_value=dup_count,
248
+ expected="0 duplicates",
249
+ message=f"{dup_count} duplicate rows on {cols}",
250
+ )
251
+
252
+ return ExpectationResult(
253
+ name="unique",
254
+ passed=True,
255
+ actual_value=0,
256
+ expected="0 duplicates",
257
+ message=f"All rows unique on {cols}",
258
+ )
259
+
260
+ self._expectations.append(check)
261
+ return self
262
+
263
+ def expect_row_count(
264
+ self, min_rows: int = 0, max_rows: Optional[int] = None
265
+ ) -> "ContractBuilder":
266
+ """Fail if row count outside bounds."""
267
+
268
+ def check(df: pd.DataFrame) -> ExpectationResult:
269
+ n = len(df)
270
+
271
+ if n < min_rows:
272
+ return ExpectationResult(
273
+ name="row_count",
274
+ passed=False,
275
+ actual_value=n,
276
+ expected=f">= {min_rows}",
277
+ message=f"Only {n} rows, minimum is {min_rows}",
278
+ )
279
+
280
+ if max_rows is not None and n > max_rows:
281
+ return ExpectationResult(
282
+ name="row_count",
283
+ passed=False,
284
+ actual_value=n,
285
+ expected=f"<= {max_rows}",
286
+ message=f"{n} rows exceeds maximum {max_rows}",
287
+ )
288
+
289
+ max_str = str(max_rows) if max_rows is not None else "inf"
290
+ return ExpectationResult(
291
+ name="row_count",
292
+ passed=True,
293
+ actual_value=n,
294
+ expected=f"{min_rows}-{max_str}",
295
+ message=f"{n} rows within bounds",
296
+ )
297
+
298
+ self._expectations.append(check)
299
+ return self
300
+
301
+ def expect_columns_exist(self, *columns: str) -> "ContractBuilder":
302
+ """Fail if any specified columns are missing."""
303
+
304
+ def check(df: pd.DataFrame) -> ExpectationResult:
305
+ missing = [c for c in columns if c not in df.columns]
306
+
307
+ if missing:
308
+ return ExpectationResult(
309
+ name="columns_exist",
310
+ passed=False,
311
+ actual_value=missing,
312
+ expected="all present",
313
+ message=f"Missing columns: {', '.join(missing)}",
314
+ )
315
+
316
+ return ExpectationResult(
317
+ name="columns_exist",
318
+ passed=True,
319
+ actual_value=[],
320
+ expected="all present",
321
+ message=f"All {len(columns)} columns present",
322
+ )
323
+
324
+ self._expectations.append(check)
325
+ return self
326
+
327
+ def expect_no_duplicates(self) -> "ContractBuilder":
328
+ """Fail if DataFrame has duplicate rows."""
329
+
330
+ def check(df: pd.DataFrame) -> ExpectationResult:
331
+ dup_count = df.duplicated().sum()
332
+
333
+ if dup_count > 0:
334
+ return ExpectationResult(
335
+ name="no_duplicates",
336
+ passed=False,
337
+ actual_value=dup_count,
338
+ expected="0",
339
+ message=f"{dup_count} duplicate rows",
340
+ )
341
+
342
+ return ExpectationResult(
343
+ name="no_duplicates",
344
+ passed=True,
345
+ actual_value=0,
346
+ expected="0",
347
+ message="No duplicate rows",
348
+ )
349
+
350
+ self._expectations.append(check)
351
+ return self
352
+
353
+ def expect_dtype(self, column: str, expected_dtype: str) -> "ContractBuilder":
354
+ """Fail if column dtype doesn't match expected."""
355
+
356
+ def check(df: pd.DataFrame) -> ExpectationResult:
357
+ if column not in df.columns:
358
+ return ExpectationResult(
359
+ name="dtype",
360
+ passed=False,
361
+ actual_value="column missing",
362
+ expected=expected_dtype,
363
+ message=f"Column '{column}' not found",
364
+ )
365
+
366
+ actual = str(df[column].dtype)
367
+
368
+ # Allow partial matches (e.g., "int" matches "int64")
369
+ if expected_dtype in actual or actual in expected_dtype:
370
+ return ExpectationResult(
371
+ name="dtype",
372
+ passed=True,
373
+ actual_value=actual,
374
+ expected=expected_dtype,
375
+ message=f"Column '{column}' dtype is {actual}",
376
+ )
377
+
378
+ return ExpectationResult(
379
+ name="dtype",
380
+ passed=False,
381
+ actual_value=actual,
382
+ expected=expected_dtype,
383
+ message=f"Column '{column}' dtype is {actual}, expected {expected_dtype}",
384
+ )
385
+
386
+ self._expectations.append(check)
387
+ return self
388
+
389
+ def expect_values_in(self, column: str, allowed_values: list) -> "ContractBuilder":
390
+ """Fail if column contains values not in allowed set."""
391
+
392
+ def check(df: pd.DataFrame) -> ExpectationResult:
393
+ if column not in df.columns:
394
+ return ExpectationResult(
395
+ name="values_in",
396
+ passed=False,
397
+ actual_value="column missing",
398
+ expected=f"in {allowed_values}",
399
+ message=f"Column '{column}' not found",
400
+ )
401
+
402
+ # Get unique values not in allowed set
403
+ actual_values = df[column].dropna().unique()
404
+ invalid = [v for v in actual_values if v not in allowed_values]
405
+
406
+ if invalid:
407
+ # Show up to 5 invalid values
408
+ invalid_str = str(invalid[:5])
409
+ if len(invalid) > 5:
410
+ invalid_str += f"... ({len(invalid)} total)"
411
+ return ExpectationResult(
412
+ name="values_in",
413
+ passed=False,
414
+ actual_value=invalid_str,
415
+ expected=f"in {allowed_values}",
416
+ message=f"Column '{column}' has invalid values: {invalid_str}",
417
+ )
418
+
419
+ return ExpectationResult(
420
+ name="values_in",
421
+ passed=True,
422
+ actual_value=list(actual_values),
423
+ expected=f"in {allowed_values}",
424
+ message=f"All values in '{column}' are valid",
425
+ )
426
+
427
+ self._expectations.append(check)
428
+ return self
429
+
430
+ def expect(
431
+ self,
432
+ predicate: Callable[[pd.DataFrame], bool],
433
+ name: str,
434
+ message: str = "",
435
+ ) -> "ContractBuilder":
436
+ """Add custom expectation."""
437
+
438
+ def check(df: pd.DataFrame) -> ExpectationResult:
439
+ try:
440
+ passed = predicate(df)
441
+ except Exception as e:
442
+ message_with_error = f"{message}: {e}" if message else str(e)
443
+ return ExpectationResult(
444
+ name=name,
445
+ passed=False,
446
+ actual_value=str(e),
447
+ expected="pass",
448
+ message=message_with_error,
449
+ )
450
+
451
+ return ExpectationResult(
452
+ name=name,
453
+ passed=passed,
454
+ actual_value=passed,
455
+ expected="pass",
456
+ message=message if message else ("passed" if passed else "failed"),
457
+ )
458
+
459
+ self._expectations.append(check)
460
+ return self
461
+
462
+ def check(self, df: pd.DataFrame) -> ContractResult:
463
+ """Run all expectations and return result."""
464
+ results = [exp(df) for exp in self._expectations]
465
+ return ContractResult(
466
+ passed=all(r.passed for r in results),
467
+ expectations=results,
468
+ )
469
+
470
+
471
def contract() -> "ContractBuilder":
    """Create a new contract builder."""
    builder = ContractBuilder()
    return builder