odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/validation/quarantine.py
@@ -0,0 +1,605 @@
+ """
+ Optimized quarantine table support for routing failed validation rows.
+
+ Performance optimizations:
+ - Removed per-row test_results lists (O(N*tests) memory savings)
+ - Added sampling/limiting for large invalid sets
+ - Single pass for combined mask evaluation
+ - No unnecessary Python list conversions
+
+ This module provides functionality to:
+ 1. Split DataFrames into valid and invalid portions based on test results
+ 2. Add metadata columns to quarantined rows
+ 3. Write quarantined rows to a dedicated table (with optional sampling)
+ """
+
+ import logging
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from typing import Any, Dict, List
+
+ from odibi.config import (
+     ContractSeverity,
+     QuarantineColumnsConfig,
+     QuarantineConfig,
+     TestConfig,
+     TestType,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class QuarantineResult:
+     """Result of quarantine operation."""
+
+     valid_df: Any
+     invalid_df: Any
+     rows_quarantined: int
+     rows_valid: int
+     test_results: Dict[str, Dict[str, int]] = field(default_factory=dict)
+     failed_test_details: Dict[int, List[str]] = field(default_factory=dict)
+
+
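For orientation, this is the shape of the object the splitter returns. The construction below is illustrative only; in practice split_valid_invalid() builds it for you.

import pandas as pd
from odibi.validation.quarantine import QuarantineResult

# Illustrative only: split_valid_invalid() normally constructs this.
res = QuarantineResult(
    valid_df=pd.DataFrame({"id": [1, 2]}),
    invalid_df=pd.DataFrame({"id": [None]}),
    rows_quarantined=1,
    rows_valid=2,
    test_results={"id_not_null": {"pass_count": 2, "fail_count": 1}},
)
print(res.rows_valid, res.rows_quarantined)  # 2 1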
+ def _evaluate_test_mask(
+     df: Any,
+     test: TestConfig,
+     is_spark: bool,
+     is_polars: bool,
+ ) -> Any:
+     """
+     Evaluate a single test and return a boolean mask (True = passed).
+
+     Args:
+         df: DataFrame to evaluate
+         test: Test configuration
+         is_spark: Whether using Spark engine
+         is_polars: Whether using Polars engine
+
+     Returns:
+         Boolean mask where True means the row passed the test
+     """
+     if is_spark:
+         from pyspark.sql import functions as F
+
+         if test.type == TestType.NOT_NULL:
+             masks = []
+             for col in test.columns:
+                 if col in df.columns:
+                     masks.append(F.col(col).isNotNull())
+             if masks:
+                 combined = masks[0]
+                 for m in masks[1:]:
+                     combined = combined & m
+                 return combined
+             return F.lit(True)
+
+         elif test.type == TestType.UNIQUE:
+             return F.lit(True)
+
+         elif test.type == TestType.ACCEPTED_VALUES:
+             col = test.column
+             if col in df.columns:
+                 return F.col(col).isin(test.values)
+             return F.lit(True)
+
+         elif test.type == TestType.RANGE:
+             col = test.column
+             if col in df.columns:
+                 cond = F.lit(True)
+                 if test.min is not None:
+                     cond = cond & (F.col(col) >= test.min)
+                 if test.max is not None:
+                     cond = cond & (F.col(col) <= test.max)
+                 return cond
+             return F.lit(True)
+
+         elif test.type == TestType.REGEX_MATCH:
+             col = test.column
+             if col in df.columns:
+                 return F.col(col).rlike(test.pattern) | F.col(col).isNull()
+             return F.lit(True)
+
+         elif test.type == TestType.CUSTOM_SQL:
+             try:
+                 return F.expr(test.condition)
+             except Exception:
+                 return F.lit(True)
+
+         return F.lit(True)
+
+     elif is_polars:
+         import polars as pl
+
+         if test.type == TestType.NOT_NULL:
+             masks = []
+             for col in test.columns:
+                 if col in df.columns:
+                     masks.append(pl.col(col).is_not_null())
+             if masks:
+                 combined = masks[0]
+                 for m in masks[1:]:
+                     combined = combined & m
+                 return combined
+             return pl.lit(True)
+
+         elif test.type == TestType.ACCEPTED_VALUES:
+             col = test.column
+             if col in df.columns:
+                 return pl.col(col).is_in(test.values)
+             return pl.lit(True)
+
+         elif test.type == TestType.RANGE:
+             col = test.column
+             if col in df.columns:
+                 cond = pl.lit(True)
+                 if test.min is not None:
+                     cond = cond & (pl.col(col) >= test.min)
+                 if test.max is not None:
+                     cond = cond & (pl.col(col) <= test.max)
+                 return cond
+             return pl.lit(True)
+
+         elif test.type == TestType.REGEX_MATCH:
+             col = test.column
+             if col in df.columns:
+                 return pl.col(col).str.contains(test.pattern) | pl.col(col).is_null()
+             return pl.lit(True)
+
+         return pl.lit(True)
+
+     else:
+         import pandas as pd
+
+         if test.type == TestType.NOT_NULL:
+             masks = []
+             for col in test.columns:
+                 if col in df.columns:
+                     masks.append(df[col].notna())
+             if masks:
+                 combined = masks[0]
+                 for m in masks[1:]:
+                     combined = combined & m
+                 return combined
+             return pd.Series([True] * len(df), index=df.index)
+
+         elif test.type == TestType.UNIQUE:
+             return pd.Series([True] * len(df), index=df.index)
+
+         elif test.type == TestType.ACCEPTED_VALUES:
+             col = test.column
+             if col in df.columns:
+                 return df[col].isin(test.values)
+             return pd.Series([True] * len(df), index=df.index)
+
+         elif test.type == TestType.RANGE:
+             col = test.column
+             if col in df.columns:
+                 mask = pd.Series([True] * len(df), index=df.index)
+                 if test.min is not None:
+                     mask = mask & (df[col] >= test.min)
+                 if test.max is not None:
+                     mask = mask & (df[col] <= test.max)
+                 return mask
+             return pd.Series([True] * len(df), index=df.index)
+
+         elif test.type == TestType.REGEX_MATCH:
+             col = test.column
+             if col in df.columns:
+                 return df[col].isna() | df[col].astype(str).str.match(test.pattern, na=True)
+             return pd.Series([True] * len(df), index=df.index)
+
+         elif test.type == TestType.CUSTOM_SQL:
+             try:
+                 valid = df.query(test.condition)
+                 mask = df.index.isin(valid.index)
+                 return pd.Series(mask, index=df.index)
+             except Exception:
+                 return pd.Series([True] * len(df), index=df.index)
+
+         return pd.Series([True] * len(df), index=df.index)
+
+
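As a standalone illustration of the mask convention (True = the row passes), the pandas RANGE branch above reduces to plain boolean-series arithmetic. Note that a null value compares False against either bound, so nulls fail a RANGE test, whereas the REGEX_MATCH branches explicitly let nulls pass.

import pandas as pd

df = pd.DataFrame({"temp": [20.0, 150.0, None]})

# Mirror of the pandas RANGE branch: start all-True, AND in each configured bound.
mask = pd.Series([True] * len(df), index=df.index)
mask = mask & (df["temp"] >= 0)      # test.min
mask = mask & (df["temp"] <= 100)    # test.max
print(mask.tolist())  # [True, False, False] -- the NaN row fails both comparisons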
+ def split_valid_invalid(
+     df: Any,
+     tests: List[TestConfig],
+     engine: Any,
+ ) -> QuarantineResult:
+     """
+     Split DataFrame into valid and invalid portions based on quarantine tests.
+
+     Only tests with on_fail == QUARANTINE are evaluated for splitting.
+     A row is invalid if it fails ANY quarantine test.
+
+     Performance: Removed per-row test_results lists to save O(N*tests) memory.
+     Now stores only aggregate counts per test.
+
+     Args:
+         df: DataFrame to split
+         tests: List of test configurations
+         engine: Engine instance (Spark, Pandas, or Polars)
+
+     Returns:
+         QuarantineResult with valid_df, invalid_df, and test metadata
+     """
+     is_spark = False
+     is_polars = False
+
+     try:
+         import pyspark
+
+         if hasattr(engine, "spark") or isinstance(df, pyspark.sql.DataFrame):
+             is_spark = True
+     except ImportError:
+         pass
+
+     if not is_spark:
+         try:
+             import polars as pl
+
+             if isinstance(df, (pl.DataFrame, pl.LazyFrame)):
+                 is_polars = True
+         except ImportError:
+             pass
+
+     quarantine_tests = [t for t in tests if t.on_fail == ContractSeverity.QUARANTINE]
+
+     if not quarantine_tests:
+         if is_spark:
+             from pyspark.sql import functions as F
+
+             empty_df = df.filter(F.lit(False))
+         elif is_polars:
+             import polars as pl
+
+             empty_df = df.filter(pl.lit(False))
+         else:
+             empty_df = df.iloc[0:0].copy()
+
+         row_count = engine.count_rows(df) if hasattr(engine, "count_rows") else len(df)
+         return QuarantineResult(
+             valid_df=df,
+             invalid_df=empty_df,
+             rows_quarantined=0,
+             rows_valid=row_count,
+             test_results={},
+             failed_test_details={},
+         )
+
+     test_masks = {}
+     test_names = []
+
+     for idx, test in enumerate(quarantine_tests):
+         base_name = test.name or f"{test.type.value}"
+         test_name = base_name if base_name not in test_masks else f"{base_name}_{idx}"
+         test_names.append(test_name)
+         mask = _evaluate_test_mask(df, test, is_spark, is_polars)
+         test_masks[test_name] = mask
+
+     if is_spark:
+         from pyspark.sql import functions as F
+
+         combined_valid_mask = F.lit(True)
+         for mask in test_masks.values():
+             combined_valid_mask = combined_valid_mask & mask
+
+         df_cached = df.cache()
+
+         valid_df = df_cached.filter(combined_valid_mask)
+         invalid_df = df_cached.filter(~combined_valid_mask)
+
+         valid_df = valid_df.cache()
+         invalid_df = invalid_df.cache()
+
+         rows_valid = valid_df.count()
+         rows_quarantined = invalid_df.count()
+         total = rows_valid + rows_quarantined
+
+         test_results = {}
+         for name, mask in test_masks.items():
+             pass_count = df_cached.filter(mask).count()
+             fail_count = total - pass_count
+             test_results[name] = {"pass_count": pass_count, "fail_count": fail_count}
+
+         df_cached.unpersist()
+
+     elif is_polars:
+         import polars as pl
+
+         combined_valid_mask = pl.lit(True)
+         for mask in test_masks.values():
+             combined_valid_mask = combined_valid_mask & mask
+
+         valid_df = df.filter(combined_valid_mask)
+         invalid_df = df.filter(~combined_valid_mask)
+
+         rows_valid = len(valid_df)
+         rows_quarantined = len(invalid_df)
+
+         test_results = {}
+
+     else:
+         import pandas as pd
+
+         combined_valid_mask = pd.Series([True] * len(df), index=df.index)
+         for mask in test_masks.values():
+             combined_valid_mask = combined_valid_mask & mask
+
+         valid_df = df[combined_valid_mask].copy()
+         invalid_df = df[~combined_valid_mask].copy()
+
+         rows_valid = len(valid_df)
+         rows_quarantined = len(invalid_df)
+
+         test_results = {}
+         for name, mask in test_masks.items():
+             pass_count = int(mask.sum())
+             fail_count = len(df) - pass_count
+             test_results[name] = {"pass_count": pass_count, "fail_count": fail_count}
+
+     logger.info(f"Quarantine split: {rows_valid} valid, {rows_quarantined} invalid")
+
+     return QuarantineResult(
+         valid_df=valid_df,
+         invalid_df=invalid_df,
+         rows_quarantined=rows_quarantined,
+         rows_valid=rows_valid,
+         test_results=test_results,
+         failed_test_details={},
+     )
+
+
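The splitting rule — a row is quarantined as soon as it fails any quarantine test — is just an AND over the per-test pass masks. A standalone pandas sketch of that semantics:

import pandas as pd

df = pd.DataFrame({"id": [1, None, 3], "status": ["new", "new", "bogus"]})

pass_not_null = df["id"].notna()                      # not_null-style test
pass_accepted = df["status"].isin(["new", "closed"])  # accepted_values-style test

combined_valid = pass_not_null & pass_accepted        # must pass every quarantine test
valid_df, invalid_df = df[combined_valid], df[~combined_valid]
print(len(valid_df), len(invalid_df))  # 1 2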
+ def add_quarantine_metadata(
+     invalid_df: Any,
+     test_results: Dict[str, Any],
+     config: QuarantineColumnsConfig,
+     engine: Any,
+     node_name: str,
+     run_id: str,
+     tests: List[TestConfig],
+ ) -> Any:
+     """
+     Add metadata columns to quarantined rows.
+
+     Args:
+         invalid_df: DataFrame of invalid rows
+         test_results: Dict of test_name -> aggregate results (not per-row)
+         config: QuarantineColumnsConfig specifying which columns to add
+         engine: Engine instance
+         node_name: Name of the originating node
+         run_id: Current run ID
+         tests: List of test configurations (for building failure reasons)
+
+     Returns:
+         DataFrame with added metadata columns
+     """
+     is_spark = False
+     is_polars = False
+
+     try:
+         import pyspark
+
+         if hasattr(engine, "spark") or isinstance(invalid_df, pyspark.sql.DataFrame):
+             is_spark = True
+     except ImportError:
+         pass
+
+     if not is_spark:
+         try:
+             import polars as pl
+
+             if isinstance(invalid_df, (pl.DataFrame, pl.LazyFrame)):
+                 is_polars = True
+         except ImportError:
+             pass
+
+     rejected_at = datetime.now(timezone.utc).isoformat()
+
+     quarantine_tests = [t for t in tests if t.on_fail == ContractSeverity.QUARANTINE]
+     test_names = [t.name or f"{t.type.value}" for t in quarantine_tests]
+     failed_tests_str = ",".join(test_names)
+     rejection_reason = f"Failed tests: {failed_tests_str}"
+
+     if is_spark:
+         from pyspark.sql import functions as F
+
+         result_df = invalid_df
+
+         if config.rejection_reason:
+             result_df = result_df.withColumn("_rejection_reason", F.lit(rejection_reason))
+
+         if config.rejected_at:
+             result_df = result_df.withColumn("_rejected_at", F.lit(rejected_at))
+
+         if config.source_batch_id:
+             result_df = result_df.withColumn("_source_batch_id", F.lit(run_id))
+
+         if config.failed_tests:
+             result_df = result_df.withColumn("_failed_tests", F.lit(failed_tests_str))
+
+         if config.original_node:
+             result_df = result_df.withColumn("_original_node", F.lit(node_name))
+
+         return result_df
+
+     elif is_polars:
+         import polars as pl
+
+         result_df = invalid_df
+
+         if config.rejection_reason:
+             result_df = result_df.with_columns(pl.lit(rejection_reason).alias("_rejection_reason"))
+
+         if config.rejected_at:
+             result_df = result_df.with_columns(pl.lit(rejected_at).alias("_rejected_at"))
+
+         if config.source_batch_id:
+             result_df = result_df.with_columns(pl.lit(run_id).alias("_source_batch_id"))
+
+         if config.failed_tests:
+             result_df = result_df.with_columns(pl.lit(failed_tests_str).alias("_failed_tests"))
+
+         if config.original_node:
+             result_df = result_df.with_columns(pl.lit(node_name).alias("_original_node"))
+
+         return result_df
+
+     else:
+         result_df = invalid_df.copy()
+
+         if config.rejection_reason:
+             result_df["_rejection_reason"] = rejection_reason
+
+         if config.rejected_at:
+             result_df["_rejected_at"] = rejected_at
+
+         if config.source_batch_id:
+             result_df["_source_batch_id"] = run_id
+
+         if config.failed_tests:
+             result_df["_failed_tests"] = failed_tests_str
+
+         if config.original_node:
+             result_df["_original_node"] = node_name
+
+         return result_df
+
+
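On the pandas path the function only appends constant-valued audit columns, so its effect is equivalent to the standalone sketch below (column names as in the code above; the sample values are made up).

from datetime import datetime, timezone
import pandas as pd

invalid_df = pd.DataFrame({"id": [None, 7]})

flagged = invalid_df.copy()
flagged["_rejection_reason"] = "Failed tests: id_not_null"
flagged["_rejected_at"] = datetime.now(timezone.utc).isoformat()
flagged["_source_batch_id"] = "run-001"      # the run_id argument
flagged["_failed_tests"] = "id_not_null"
flagged["_original_node"] = "orders_bronze"  # the node_name argument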
+ def _apply_sampling(
+     invalid_df: Any,
+     config: QuarantineConfig,
+     is_spark: bool,
+     is_polars: bool,
+ ) -> Any:
+     """
+     Apply sampling/limiting to invalid DataFrame based on config.
+
+     Args:
+         invalid_df: DataFrame of invalid rows
+         config: QuarantineConfig with max_rows and sample_fraction
+         is_spark: Whether using Spark engine
+         is_polars: Whether using Polars engine
+
+     Returns:
+         Sampled/limited DataFrame
+     """
+     sample_fraction = getattr(config, "sample_fraction", None)
+     max_rows = getattr(config, "max_rows", None)
+
+     if sample_fraction is None and max_rows is None:
+         return invalid_df
+
+     if is_spark:
+         result = invalid_df
+         if sample_fraction is not None:
+             result = result.sample(fraction=sample_fraction)
+         if max_rows is not None:
+             result = result.limit(max_rows)
+         return result
+
+     elif is_polars:
+         result = invalid_df
+         if sample_fraction is not None:
+             n_samples = max(1, int(len(result) * sample_fraction))
+             result = result.sample(n=min(n_samples, len(result)))
+         if max_rows is not None:
+             result = result.head(max_rows)
+         return result
+
+     else:
+         result = invalid_df
+         if sample_fraction is not None:
+             result = result.sample(frac=sample_fraction)
+         if max_rows is not None:
+             result = result.head(max_rows)
+         return result
+
+
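When both knobs are set, the fraction sample is taken first and max_rows then caps the sampled frame; for pandas this is simply:

import pandas as pd

invalid_df = pd.DataFrame({"id": range(10_000)})

# Mirror of the pandas branch: sample a fraction, then cap the row count.
capped = invalid_df.sample(frac=0.1).head(500)
print(len(capped))  # 500 (10% of 10,000 is 1,000 rows, then head() keeps 500)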
+ def write_quarantine(
+     invalid_df: Any,
+     config: QuarantineConfig,
+     engine: Any,
+     connections: Dict[str, Any],
+ ) -> Dict[str, Any]:
+     """
+     Write quarantined rows to destination (always append mode).
+
+     Supports optional sampling/limiting via config.max_rows and config.sample_fraction.
+
+     Args:
+         invalid_df: DataFrame of invalid rows with metadata
+         config: QuarantineConfig specifying destination and sampling options
+         engine: Engine instance
+         connections: Dict of connection configurations
+
+     Returns:
+         Dict with write result metadata
+     """
+     is_spark = False
+     is_polars = False
+
+     try:
+         import pyspark
+
+         if hasattr(engine, "spark") or isinstance(invalid_df, pyspark.sql.DataFrame):
+             is_spark = True
+     except ImportError:
+         pass
+
+     if not is_spark:
+         try:
+             import polars as pl
+
+             if isinstance(invalid_df, (pl.DataFrame, pl.LazyFrame)):
+                 is_polars = True
+         except ImportError:
+             pass
+
+     invalid_df = _apply_sampling(invalid_df, config, is_spark, is_polars)
+
+     if is_spark:
+         row_count = invalid_df.count()
+     elif is_polars:
+         row_count = len(invalid_df)
+     else:
+         row_count = len(invalid_df)
+
+     if row_count == 0:
+         return {
+             "rows_quarantined": 0,
+             "quarantine_path": config.path or config.table,
+             "write_info": None,
+         }
+
+     connection = connections.get(config.connection)
+     if connection is None:
+         raise ValueError(
+             f"Quarantine connection '{config.connection}' not found. "
+             f"Available: {', '.join(connections.keys())}"
+         )
+
+     try:
+         write_result = engine.write(
+             invalid_df,
+             connection=connection,
+             format="delta" if config.table else "parquet",
+             path=config.path,
+             table=config.table,
+             mode="append",
+         )
+     except Exception as e:
+         logger.error(f"Failed to write quarantine data: {e}")
+         raise
+
+     logger.info(f"Wrote {row_count} rows to quarantine: {config.path or config.table}")
+
+     return {
+         "rows_quarantined": row_count,
+         "quarantine_path": config.path or config.table,
+         "write_info": write_result,
+     }
+
+
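The call contract can be exercised with stand-ins: a stub engine that just records the write call, and a SimpleNamespace in place of QuarantineConfig (the real constructor may differ). Note the format choice falls out of the config: "delta" when a table is configured, otherwise "parquet".

from types import SimpleNamespace
import pandas as pd
from odibi.validation.quarantine import write_quarantine

class StubEngine:
    """Stand-in engine: records the write call instead of persisting anything."""
    def write(self, df, connection, format, path, table, mode):
        return {"format": format, "mode": mode, "rows": len(df)}

invalid = pd.DataFrame({"id": [7, 9], "_rejection_reason": ["Failed tests: id_not_null"] * 2})
config = SimpleNamespace(connection="local", path="/tmp/quarantine",
                         table=None, max_rows=1000, sample_fraction=None)

result = write_quarantine(invalid, config, StubEngine(), connections={"local": object()})
print(result["rows_quarantined"], result["quarantine_path"])  # 2 /tmp/quarantine
print(result["write_info"]["format"])                         # parquet (no table configured)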
+ def has_quarantine_tests(tests: List[TestConfig]) -> bool:
+     """Check if any tests use quarantine severity."""
+     return any(t.on_fail == ContractSeverity.QUARANTINE for t in tests)
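Putting the pieces together for the pandas engine, a node-level quarantine pass looks roughly like the sketch below. SimpleNamespace objects stand in for TestConfig and QuarantineColumnsConfig so the sketch stays self-contained; the real constructors in odibi.config may take different arguments, but the attribute names match what the functions above read.

from types import SimpleNamespace
import pandas as pd
from odibi.config import ContractSeverity, TestType
from odibi.validation.quarantine import (
    add_quarantine_metadata,
    has_quarantine_tests,
    split_valid_invalid,
)

df = pd.DataFrame({"id": [1, 2, None], "qty": [5, -1, 3]})

# Hypothetical test definitions; field names follow the attributes used above.
tests = [
    SimpleNamespace(type=TestType.NOT_NULL, columns=["id"], name="id_not_null",
                    on_fail=ContractSeverity.QUARANTINE),
    SimpleNamespace(type=TestType.RANGE, column="qty", min=0, max=None, name="qty_range",
                    on_fail=ContractSeverity.QUARANTINE),
]

if has_quarantine_tests(tests):
    # None suffices here: the engine is only probed for Spark detection on this path.
    result = split_valid_invalid(df, tests, engine=None)
    flagged = add_quarantine_metadata(
        result.invalid_df,
        result.test_results,
        config=SimpleNamespace(rejection_reason=True, rejected_at=True,
                               source_batch_id=True, failed_tests=True, original_node=True),
        engine=None,
        node_name="orders_bronze",
        run_id="run-001",
        tests=tests,
    )
    print(result.rows_valid, result.rows_quarantined)           # 1 2
    print([c for c in flagged.columns if c.startswith("_")])    # the five audit columns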
odibi/writers/__init__.py
@@ -0,0 +1,15 @@
+ """Writers module for database-specific write operations."""
+
+ from odibi.writers.sql_server_writer import (
+     MergeResult,
+     OverwriteResult,
+     SqlServerMergeWriter,
+     ValidationResult,
+ )
+
+ __all__ = [
+     "MergeResult",
+     "OverwriteResult",
+     "SqlServerMergeWriter",
+     "ValidationResult",
+ ]
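These re-exports make the package root the public import path, e.g.:

# Import the SQL Server merge writer via the package rather than the implementation module.
from odibi.writers import MergeResult, SqlServerMergeWriter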