odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/config.py ADDED
@@ -0,0 +1,3541 @@
1
+ """Configuration models for ODIBI framework."""
2
+
3
+ from enum import Enum
4
+ from typing import Any, Dict, List, Literal, Optional, Union
5
+
6
+ try:
7
+ from typing import Annotated
8
+ except ImportError:
9
+ from typing_extensions import Annotated
10
+
11
+ from pydantic import BaseModel, Field, field_validator, model_validator
12
+
13
+
14
+ class EngineType(str, Enum):
15
+ """Supported execution engines."""
16
+
17
+ SPARK = "spark"
18
+ PANDAS = "pandas"
19
+ POLARS = "polars"
20
+
21
+
22
+ class ConnectionType(str, Enum):
23
+ """Supported connection types."""
24
+
25
+ LOCAL = "local"
26
+ AZURE_BLOB = "azure_blob"
27
+ DELTA = "delta"
28
+ SQL_SERVER = "sql_server"
29
+ HTTP = "http"
30
+
31
+
32
+ class WriteMode(str, Enum):
33
+ """Write modes for output operations."""
34
+
35
+ OVERWRITE = "overwrite"
36
+ APPEND = "append"
37
+ UPSERT = "upsert"
38
+ APPEND_ONCE = "append_once"
39
+ MERGE = "merge" # SQL Server MERGE (staging table + T-SQL MERGE)
40
+
41
+
42
+ class DeleteDetectionMode(str, Enum):
43
+ """
44
+ Delete detection strategies for Silver layer processing.
45
+
46
+ Values:
47
+ * `none` - No delete detection (default). Use for append-only facts.
48
+ * `snapshot_diff` - Compare Delta version N vs N-1 keys. Use for full snapshot sources only.
49
+ * `sql_compare` - LEFT ANTI JOIN Silver keys against live source. Recommended for HWM ingestion.
50
+ """
51
+
52
+ NONE = "none"
53
+ SNAPSHOT_DIFF = "snapshot_diff"
54
+ SQL_COMPARE = "sql_compare"
55
+
56
+
57
+ class LogLevel(str, Enum):
58
+ """Logging levels."""
59
+
60
+ DEBUG = "DEBUG"
61
+ INFO = "INFO"
62
+ WARNING = "WARNING"
63
+ ERROR = "ERROR"
64
+
65
+
66
+ class AlertType(str, Enum):
67
+ """Types of alerting channels."""
68
+
69
+ WEBHOOK = "webhook"
70
+ SLACK = "slack"
71
+ TEAMS = "teams" # Uses Power Automate workflow format (classic webhooks retired Dec 2025)
72
+ TEAMS_WORKFLOW = "teams_workflow" # Alias for teams (explicit Power Automate)
73
+
74
+
75
+ class AlertEvent(str, Enum):
76
+ """Events that trigger alerts."""
77
+
78
+ ON_START = "on_start"
79
+ ON_SUCCESS = "on_success"
80
+ ON_FAILURE = "on_failure"
81
+ ON_QUARANTINE = "on_quarantine"
82
+ ON_GATE_BLOCK = "on_gate_block"
83
+ ON_THRESHOLD_BREACH = "on_threshold_breach"
84
+
85
+
86
+ class AlertConfig(BaseModel):
87
+ """
88
+ Configuration for alerts with throttling support.
89
+
90
+ Supports Slack, Teams, and generic webhooks with event-specific payloads.
91
+
92
+ **Available Events:**
93
+ - `on_start` - Pipeline started
94
+ - `on_success` - Pipeline completed successfully
95
+ - `on_failure` - Pipeline failed
96
+ - `on_quarantine` - Rows were quarantined
97
+ - `on_gate_block` - Quality gate blocked the pipeline
98
+ - `on_threshold_breach` - A threshold was exceeded
99
+
100
+ Example:
101
+ ```yaml
102
+ alerts:
103
+ - type: slack
104
+ url: "${SLACK_WEBHOOK_URL}"
105
+ on_events:
106
+ - on_failure
107
+ - on_quarantine
108
+ - on_gate_block
109
+ metadata:
110
+ throttle_minutes: 15
111
+ max_per_hour: 10
112
+ channel: "#data-alerts"
113
+ ```
114
+ """
115
+
116
+ type: AlertType
117
+ url: str = Field(description="Webhook URL")
118
+ on_events: List[AlertEvent] = Field(
119
+ default=[AlertEvent.ON_FAILURE],
120
+ description="Events to trigger alert: on_start, on_success, on_failure, on_quarantine, on_gate_block, on_threshold_breach",
121
+ )
122
+ metadata: Dict[str, Any] = Field(
123
+ default_factory=dict,
124
+ description="Extra metadata: throttle_minutes, max_per_hour, channel, etc.",
125
+ )
126
+
127
+
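The Slack alert from the YAML example above can also be built directly against the model; a minimal sketch, assuming `odibi` 2.5.0 is importable (the webhook URL is a placeholder):

```python
from odibi.config import AlertConfig, AlertEvent

alert = AlertConfig(
    type="slack",
    url="https://hooks.slack.com/services/T000/B000/XXXX",  # placeholder URL
    on_events=["on_failure", "on_quarantine", "on_gate_block"],
    metadata={"throttle_minutes": 15, "max_per_hour": 10, "channel": "#data-alerts"},
)
# String values are coerced to the enums defined above
assert AlertEvent.ON_FAILURE in alert.on_events
```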
128
+ class ErrorStrategy(str, Enum):
129
+ """Strategy for handling node failures."""
130
+
131
+ FAIL_FAST = "fail_fast" # Stop pipeline immediately
132
+ FAIL_LATER = "fail_later" # Continue pipeline (dependents skipped) - DEFAULT
133
+ IGNORE = "ignore" # Treat as success (warning) - Dependents run
134
+
135
+
136
+ class ValidationMode(str, Enum):
137
+ """Validation execution mode."""
138
+
139
+ LAZY = "lazy"
140
+ EAGER = "eager"
141
+
142
+
143
+ class ThresholdBreachAction(str, Enum):
144
+ """Action to take when delete threshold is exceeded."""
145
+
146
+ WARN = "warn"
147
+ ERROR = "error"
148
+ SKIP = "skip"
149
+
150
+
151
+ class FirstRunBehavior(str, Enum):
152
+ """Behavior when no previous version exists for snapshot_diff."""
153
+
154
+ SKIP = "skip"
155
+ ERROR = "error"
156
+
157
+
158
+ # ============================================
159
+ # Delete Detection Configuration
160
+ # ============================================
161
+
162
+
163
+ class DeleteDetectionConfig(BaseModel):
164
+ """
165
+ Configuration for delete detection in Silver layer.
166
+
167
+ ### 🔍 "CDC Without CDC" Guide
168
+
169
+ **Business Problem:**
170
+ "Records are deleted in our Azure SQL source, but our Silver tables still show them."
171
+
172
+ **The Solution:**
173
+ Use delete detection to identify and flag records that no longer exist in the source.
174
+
175
+ **Recipe 1: SQL Compare (Recommended for HWM)**
176
+ ```yaml
177
+ transform:
178
+ steps:
179
+ - operation: detect_deletes
180
+ params:
181
+ mode: sql_compare
182
+ keys: [customer_id]
183
+ source_connection: azure_sql
184
+ source_table: dbo.Customers
185
+ ```
186
+
187
+ **Recipe 2: Snapshot Diff (For Full Snapshot Sources)**
188
+ Use ONLY with full snapshot ingestion, NOT with HWM incremental.
189
+ Requires `connection` and `path` to specify the target Delta table for comparison.
190
+ ```yaml
191
+ transform:
192
+ steps:
193
+ - operation: detect_deletes
194
+ params:
195
+ mode: snapshot_diff
196
+ keys: [customer_id]
197
+ connection: silver_conn # Required: connection to target Delta table
198
+ path: "silver/customers" # Required: path to target Delta table
199
+ ```
200
+
201
+ **Recipe 3: Conservative Threshold**
202
+ ```yaml
203
+ transform:
204
+ steps:
205
+ - operation: detect_deletes
206
+ params:
207
+ mode: sql_compare
208
+ keys: [customer_id]
209
+ source_connection: erp
210
+ source_table: dbo.Customers
211
+ max_delete_percent: 20.0
212
+ on_threshold_breach: error
213
+ ```
214
+
215
+ **Recipe 4: Hard Delete (Remove Rows)**
216
+ ```yaml
217
+ transform:
218
+ steps:
219
+ - operation: detect_deletes
220
+ params:
221
+ mode: sql_compare
222
+ keys: [customer_id]
223
+ source_connection: azure_sql
224
+ source_table: dbo.Customers
225
+ soft_delete_col: null # removes rows instead of flagging
226
+ ```
227
+ """
228
+
229
+ mode: DeleteDetectionMode = Field(
230
+ default=DeleteDetectionMode.NONE,
231
+ description="Delete detection strategy: none, snapshot_diff, sql_compare",
232
+ )
233
+
234
+ keys: List[str] = Field(
235
+ default_factory=list,
236
+ description="Business key columns for comparison",
237
+ )
238
+
239
+ connection: Optional[str] = Field(
240
+ default=None,
241
+ description="For snapshot_diff: connection name to target Delta table (required for snapshot_diff)",
242
+ )
243
+ path: Optional[str] = Field(
244
+ default=None,
245
+ description="For snapshot_diff: path to target Delta table (required for snapshot_diff)",
246
+ )
247
+
248
+ soft_delete_col: Optional[str] = Field(
249
+ default="_is_deleted",
250
+ description="Column to flag deletes (True = deleted). Set to null for hard-delete (removes rows).",
251
+ )
252
+
253
+ source_connection: Optional[str] = Field(
254
+ default=None,
255
+ description="For sql_compare: connection name to query live source",
256
+ )
257
+ source_table: Optional[str] = Field(
258
+ default=None,
259
+ description="For sql_compare: table to query for current keys",
260
+ )
261
+ source_query: Optional[str] = Field(
262
+ default=None,
263
+ description="For sql_compare: custom SQL query for keys (overrides source_table)",
264
+ )
265
+
266
+ snapshot_column: Optional[str] = Field(
267
+ default=None,
268
+ description="For snapshot_diff on non-Delta: column to identify snapshots. "
269
+ "If None, uses Delta time travel (default).",
270
+ )
271
+
272
+ on_first_run: FirstRunBehavior = Field(
273
+ default=FirstRunBehavior.SKIP,
274
+ description="Behavior when no previous version exists for snapshot_diff",
275
+ )
276
+
277
+ max_delete_percent: Optional[float] = Field(
278
+ default=50.0,
279
+ ge=0.0,
280
+ le=100.0,
281
+ description="Safety threshold: warn/error if more than X% of rows would be deleted",
282
+ )
283
+
284
+ on_threshold_breach: ThresholdBreachAction = Field(
285
+ default=ThresholdBreachAction.WARN,
286
+ description="Behavior when delete percentage exceeds max_delete_percent",
287
+ )
288
+
289
+ @model_validator(mode="after")
290
+ def validate_mode_requirements(self):
291
+ """Validate that required fields are present for each mode."""
292
+ if self.mode == DeleteDetectionMode.NONE:
293
+ return self
294
+
295
+ if not self.keys:
296
+ raise ValueError(
297
+ f"DeleteDetectionConfig validation failed: 'keys' is required when mode='{self.mode.value}'. "
298
+ f"Specify the business key column(s) to identify deleted records. "
299
+ f"Example: keys: ['customer_id'] or keys: ['order_id', 'line_num']"
300
+ )
301
+
302
+ # Note: snapshot_diff can use connection+path OR fallback to context inference
303
+ # Validation at runtime in the detect_deletes transformer will warn if neither is available
304
+
305
+ if self.mode == DeleteDetectionMode.SQL_COMPARE:
306
+ if not self.source_connection:
307
+ raise ValueError(
308
+ "DeleteDetectionConfig validation failed: 'source_connection' is required for mode='sql_compare'. "
309
+ "Specify the connection name that points to the live source database. "
310
+ "Example: source_connection: 'azure_sql'"
311
+ )
312
+ if not self.source_table and not self.source_query:
313
+ raise ValueError(
314
+ "DeleteDetectionConfig validation failed: Either 'source_table' or 'source_query' is required for mode='sql_compare'. "
315
+ "Specify the table/query to compare against for detecting deleted records. "
316
+ "Example: source_table: 'dbo.Customers' or source_query: 'SELECT customer_id FROM dbo.Customers WHERE active = 1'"
317
+ )
318
+
319
+ return self
320
+
321
+
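As a quick illustration of the validator above, the sql_compare recipe can be exercised directly against the model (a sketch, assuming `odibi` is importable):

```python
from pydantic import ValidationError
from odibi.config import DeleteDetectionConfig

# Mirrors Recipe 1: sql_compare against the live source
cfg = DeleteDetectionConfig(
    mode="sql_compare",
    keys=["customer_id"],
    source_connection="azure_sql",
    source_table="dbo.Customers",
)
assert cfg.soft_delete_col == "_is_deleted"  # default: soft-delete flag column
assert cfg.max_delete_percent == 50.0        # default safety threshold

# Omitting 'keys' fails fast with the descriptive error raised above
try:
    DeleteDetectionConfig(mode="sql_compare",
                          source_connection="azure_sql",
                          source_table="dbo.Customers")
except ValidationError as exc:
    print(exc)
```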
322
+ # ============================================
323
+ # Write Metadata Configuration
324
+ # ============================================
325
+
326
+
327
+ class WriteMetadataConfig(BaseModel):
328
+ """
329
+ Configuration for metadata columns added during Bronze writes.
330
+
331
+ ### 📋 Bronze Metadata Guide
332
+
333
+ **Business Problem:**
334
+ "We need lineage tracking and debugging info for our Bronze layer data."
335
+
336
+ **The Solution:**
337
+ Add metadata columns during ingestion for traceability.
338
+
339
+ **Recipe 1: Add All Metadata (Recommended)**
340
+ ```yaml
341
+ write:
342
+ connection: bronze
343
+ table: customers
344
+ mode: append
345
+ add_metadata: true # adds all applicable columns
346
+ ```
347
+
348
+ **Recipe 2: Selective Metadata**
349
+ ```yaml
350
+ write:
351
+ connection: bronze
352
+ table: customers
353
+ mode: append
354
+ add_metadata:
355
+ extracted_at: true
356
+ source_file: true
357
+ source_connection: false
358
+ source_table: false
359
+ ```
360
+
361
+ **Available Columns:**
362
+ - `_extracted_at`: Pipeline execution timestamp (all sources)
363
+ - `_source_file`: Source filename/path (file sources only)
364
+ - `_source_connection`: Connection name used (all sources)
365
+ - `_source_table`: Table or query name (SQL sources only)
366
+ """
367
+
368
+ extracted_at: bool = Field(
369
+ default=True,
370
+ description="Add _extracted_at column with pipeline execution timestamp",
371
+ )
372
+ source_file: bool = Field(
373
+ default=True,
374
+ description="Add _source_file column with source filename (file sources only)",
375
+ )
376
+ source_connection: bool = Field(
377
+ default=False,
378
+ description="Add _source_connection column with connection name",
379
+ )
380
+ source_table: bool = Field(
381
+ default=False,
382
+ description="Add _source_table column with table/query name (SQL sources only)",
383
+ )
384
+
385
+
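For reference, the selective-metadata recipe corresponds to toggling the boolean fields above (a sketch):

```python
from odibi.config import WriteMetadataConfig

meta = WriteMetadataConfig(source_connection=True)  # override one flag
assert meta.extracted_at and meta.source_file       # defaults remain enabled
assert not meta.source_table                        # still off by default
```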
386
+ # ============================================
387
+ # Connection Configurations
388
+ # ============================================
389
+
390
+
391
+ class BaseConnectionConfig(BaseModel):
392
+ """Base configuration for all connections."""
393
+
394
+ type: ConnectionType
395
+ validation_mode: ValidationMode = ValidationMode.LAZY
396
+
397
+
398
+ class LocalConnectionConfig(BaseConnectionConfig):
399
+ """
400
+ Local filesystem connection.
401
+
402
+ **When to Use:** Development, testing, small datasets, local processing.
403
+
404
+ **See Also:** [AzureBlobConnectionConfig](#azureblobconnectionconfig) for cloud alternatives.
405
+
406
+ Example:
407
+ ```yaml
408
+ local_data:
409
+ type: "local"
410
+ base_path: "./data"
411
+ ```
412
+ """
413
+
414
+ type: Literal[ConnectionType.LOCAL] = ConnectionType.LOCAL
415
+ base_path: str = Field(default="./data", description="Base directory path")
416
+
417
+
418
+ # --- Azure Blob Auth ---
419
+
420
+
421
+ class AzureBlobAuthMode(str, Enum):
422
+ ACCOUNT_KEY = "account_key"
423
+ SAS = "sas"
424
+ CONNECTION_STRING = "connection_string"
425
+ KEY_VAULT = "key_vault"
426
+ AAD_MSI = "aad_msi"
427
+
428
+
429
+ class AzureBlobKeyVaultAuth(BaseModel):
430
+ mode: Literal[AzureBlobAuthMode.KEY_VAULT] = AzureBlobAuthMode.KEY_VAULT
431
+ key_vault: str
432
+ secret: str
433
+
434
+
435
+ class AzureBlobAccountKeyAuth(BaseModel):
436
+ mode: Literal[AzureBlobAuthMode.ACCOUNT_KEY] = AzureBlobAuthMode.ACCOUNT_KEY
437
+ account_key: str
438
+
439
+
440
+ class AzureBlobSasAuth(BaseModel):
441
+ mode: Literal[AzureBlobAuthMode.SAS] = AzureBlobAuthMode.SAS
442
+ sas_token: str
443
+
444
+
445
+ class AzureBlobConnectionStringAuth(BaseModel):
446
+ mode: Literal[AzureBlobAuthMode.CONNECTION_STRING] = AzureBlobAuthMode.CONNECTION_STRING
447
+ connection_string: str
448
+
449
+
450
+ class AzureBlobMsiAuth(BaseModel):
451
+ mode: Literal[AzureBlobAuthMode.AAD_MSI] = AzureBlobAuthMode.AAD_MSI
452
+ client_id: Optional[str] = None
453
+
454
+
455
+ AzureBlobAuthConfig = Annotated[
456
+ Union[
457
+ AzureBlobKeyVaultAuth,
458
+ AzureBlobAccountKeyAuth,
459
+ AzureBlobSasAuth,
460
+ AzureBlobConnectionStringAuth,
461
+ AzureBlobMsiAuth,
462
+ ],
463
+ Field(discriminator="mode"),
464
+ ]
465
+
466
+
467
+ class AzureBlobConnectionConfig(BaseConnectionConfig):
468
+ """
469
+ Azure Blob Storage / ADLS Gen2 connection.
470
+
471
+ **When to Use:** Azure-based data lakes, landing zones, raw data storage.
472
+
473
+ **See Also:** [DeltaConnectionConfig](#deltaconnectionconfig) for Delta-specific options
474
+
475
+ Scenario 1: Prod with Key Vault-managed key
476
+ ```yaml
477
+ adls_bronze:
478
+ type: "azure_blob"
479
+ account_name: "myaccount"
480
+ container: "bronze"
481
+ auth:
482
+ mode: "key_vault"
483
+ key_vault: "kv-data"
484
+ secret: "adls-account-key"
485
+ ```
486
+
487
+ Scenario 2: Local dev with inline account key
488
+ ```yaml
489
+ adls_dev:
490
+ type: "azure_blob"
491
+ account_name: "devaccount"
492
+ container: "sandbox"
493
+ auth:
494
+ mode: "account_key"
495
+ account_key: "${ADLS_ACCOUNT_KEY}"
496
+ ```
497
+
498
+ Scenario 3: MSI (no secrets)
499
+ ```yaml
500
+ adls_msi:
501
+ type: "azure_blob"
502
+ account_name: "myaccount"
503
+ container: "bronze"
504
+ auth:
505
+ mode: "aad_msi"
506
+ # optional: client_id for user-assigned identity
507
+ client_id: "00000000-0000-0000-0000-000000000000"
508
+ ```
509
+ """
510
+
511
+ type: Literal[ConnectionType.AZURE_BLOB] = ConnectionType.AZURE_BLOB
512
+ account_name: str
513
+ container: str
514
+ auth: AzureBlobAuthConfig = Field(
515
+ default_factory=lambda: AzureBlobMsiAuth(mode=AzureBlobAuthMode.AAD_MSI)
516
+ )
517
+
518
+
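The `mode` field is the discriminator that routes the YAML `auth:` block to the right auth model; a minimal sketch using pydantic v2's `model_validate` (assuming `odibi` is importable):

```python
from odibi.config import AzureBlobConnectionConfig, AzureBlobKeyVaultAuth

conn = AzureBlobConnectionConfig.model_validate({
    "type": "azure_blob",
    "account_name": "myaccount",
    "container": "bronze",
    "auth": {"mode": "key_vault", "key_vault": "kv-data", "secret": "adls-account-key"},
})
assert isinstance(conn.auth, AzureBlobKeyVaultAuth)  # selected via the 'mode' discriminator
```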
519
+ class DeltaConnectionConfig(BaseConnectionConfig):
520
+ """
521
+ Delta Lake connection for ACID-compliant data lakes.
522
+
523
+ **When to Use:**
524
+ - Production data lakes on Azure/AWS/GCP
525
+ - Need time travel, ACID transactions, schema evolution
526
+ - Upsert/merge operations
527
+
528
+ **See Also:** [WriteConfig](#writeconfig) for Delta write options
529
+
530
+ Scenario 1: Delta via metastore
531
+ ```yaml
532
+ delta_silver:
533
+ type: "delta"
534
+ catalog: "spark_catalog"
535
+ schema: "silver_db"
536
+ ```
537
+
538
+ Scenario 2: Direct path + Node usage
539
+ ```yaml
540
+ delta_local:
541
+ type: "local"
542
+ base_path: "dbfs:/mnt/delta"
543
+
544
+ # In pipeline:
545
+ # read:
546
+ # connection: "delta_local"
547
+ # format: "delta"
548
+ # path: "bronze/orders"
549
+ ```
550
+ """
551
+
552
+ type: Literal[ConnectionType.DELTA] = ConnectionType.DELTA
553
+ catalog: str = Field(description="Spark catalog name (e.g. 'spark_catalog')")
554
+ schema_name: str = Field(alias="schema", description="Database/schema name")
555
+ table: Optional[str] = Field(
556
+ default=None,
557
+ description="Optional default table name for this connection (used by story/pipeline helpers)",
558
+ )
559
+
560
+
561
+ # --- SQL Server Auth ---
562
+
563
+
564
+ class SQLServerAuthMode(str, Enum):
565
+ AAD_MSI = "aad_msi"
566
+ AAD_PASSWORD = "aad_password"
567
+ SQL_LOGIN = "sql_login"
568
+ CONNECTION_STRING = "connection_string"
569
+
570
+
571
+ class SQLLoginAuth(BaseModel):
572
+ mode: Literal[SQLServerAuthMode.SQL_LOGIN] = SQLServerAuthMode.SQL_LOGIN
573
+ username: str
574
+ password: str
575
+
576
+
577
+ class SQLAadPasswordAuth(BaseModel):
578
+ mode: Literal[SQLServerAuthMode.AAD_PASSWORD] = SQLServerAuthMode.AAD_PASSWORD
579
+ tenant_id: str
580
+ client_id: str
581
+ client_secret: str
582
+
583
+
584
+ class SQLMsiAuth(BaseModel):
585
+ mode: Literal[SQLServerAuthMode.AAD_MSI] = SQLServerAuthMode.AAD_MSI
586
+ client_id: Optional[str] = None
587
+
588
+
589
+ class SQLConnectionStringAuth(BaseModel):
590
+ mode: Literal[SQLServerAuthMode.CONNECTION_STRING] = SQLServerAuthMode.CONNECTION_STRING
591
+ connection_string: str
592
+
593
+
594
+ SQLServerAuthConfig = Annotated[
595
+ Union[SQLLoginAuth, SQLAadPasswordAuth, SQLMsiAuth, SQLConnectionStringAuth],
596
+ Field(discriminator="mode"),
597
+ ]
598
+
599
+
600
+ class SQLServerConnectionConfig(BaseConnectionConfig):
601
+ """
602
+ SQL Server / Azure SQL Database connection.
603
+
604
+ **When to Use:** Reading from SQL Server sources, Azure SQL DB, Azure Synapse.
605
+
606
+ **See Also:** [ReadConfig](#readconfig) for query options
607
+
608
+ Scenario 1: Managed identity (AAD MSI)
609
+ ```yaml
610
+ sql_dw_msi:
611
+ type: "sql_server"
612
+ host: "server.database.windows.net"
613
+ database: "dw"
614
+ auth:
615
+ mode: "aad_msi"
616
+ ```
617
+
618
+ Scenario 2: SQL login
619
+ ```yaml
620
+ sql_dw_login:
621
+ type: "sql_server"
622
+ host: "server.database.windows.net"
623
+ database: "dw"
624
+ auth:
625
+ mode: "sql_login"
626
+ username: "dw_writer"
627
+ password: "${DW_PASSWORD}"
628
+ ```
629
+ """
630
+
631
+ type: Literal[ConnectionType.SQL_SERVER] = ConnectionType.SQL_SERVER
632
+ host: str
633
+ database: str
634
+ port: int = 1433
635
+ auth: SQLServerAuthConfig = Field(
636
+ default_factory=lambda: SQLMsiAuth(mode=SQLServerAuthMode.AAD_MSI)
637
+ )
638
+
639
+
640
+ # --- HTTP Auth ---
641
+
642
+
643
+ class HttpAuthMode(str, Enum):
644
+ NONE = "none"
645
+ BASIC = "basic"
646
+ BEARER = "bearer"
647
+ API_KEY = "api_key"
648
+
649
+
650
+ class HttpBasicAuth(BaseModel):
651
+ mode: Literal[HttpAuthMode.BASIC] = HttpAuthMode.BASIC
652
+ username: str
653
+ password: str
654
+
655
+
656
+ class HttpBearerAuth(BaseModel):
657
+ mode: Literal[HttpAuthMode.BEARER] = HttpAuthMode.BEARER
658
+ token: str
659
+
660
+
661
+ class HttpApiKeyAuth(BaseModel):
662
+ mode: Literal[HttpAuthMode.API_KEY] = HttpAuthMode.API_KEY
663
+ header_name: str = "Authorization"
664
+ value_template: str = "Bearer {token}"
665
+
666
+
667
+ class HttpNoAuth(BaseModel):
668
+ mode: Literal[HttpAuthMode.NONE] = HttpAuthMode.NONE
669
+
670
+
671
+ HttpAuthConfig = Annotated[
672
+ Union[HttpNoAuth, HttpBasicAuth, HttpBearerAuth, HttpApiKeyAuth],
673
+ Field(discriminator="mode"),
674
+ ]
675
+
676
+
677
+ class HttpConnectionConfig(BaseConnectionConfig):
678
+ """
679
+ HTTP connection.
680
+
681
+ Scenario: Bearer token via env var
682
+ ```yaml
683
+ api_source:
684
+ type: "http"
685
+ base_url: "https://api.example.com"
686
+ headers:
687
+ User-Agent: "odibi-pipeline"
688
+ auth:
689
+ mode: "bearer"
690
+ token: "${API_TOKEN}"
691
+ ```
692
+ """
693
+
694
+ type: Literal[ConnectionType.HTTP] = ConnectionType.HTTP
695
+ base_url: str
696
+ headers: Dict[str, str] = Field(default_factory=dict)
697
+ auth: HttpAuthConfig = Field(default_factory=lambda: HttpNoAuth(mode=HttpAuthMode.NONE))
698
+
699
+
700
+ class CustomConnectionConfig(BaseModel):
701
+ """
702
+ Configuration for custom/plugin connections.
703
+ Allows any fields.
704
+ """
705
+
706
+ type: str
707
+ validation_mode: ValidationMode = ValidationMode.LAZY
708
+ # Allow extra fields
709
+ model_config = {"extra": "allow"}
710
+
711
+
712
+ # Connection config discriminated union
713
+ ConnectionConfig = Union[
714
+ LocalConnectionConfig,
715
+ AzureBlobConnectionConfig,
716
+ DeltaConnectionConfig,
717
+ SQLServerConnectionConfig,
718
+ HttpConnectionConfig,
719
+ CustomConnectionConfig,
720
+ ]
721
+
722
+
723
+ # ============================================
724
+ # Node Configurations
725
+ # ============================================
726
+
727
+
728
+ class ReadFormat(str, Enum):
729
+ CSV = "csv"
730
+ PARQUET = "parquet"
731
+ DELTA = "delta"
732
+ JSON = "json"
733
+ SQL = "sql"
734
+
735
+
736
+ class TimeTravelConfig(BaseModel):
737
+ """
738
+ Configuration for time travel reading (Delta/Iceberg).
739
+
740
+ Example:
741
+ ```yaml
742
+ time_travel:
743
+ as_of_version: 10
744
+ # OR
745
+ as_of_timestamp: "2023-10-01T12:00:00Z"
746
+ ```
747
+ """
748
+
749
+ as_of_version: Optional[int] = Field(
750
+ default=None, description="Version number to time travel to"
751
+ )
752
+ as_of_timestamp: Optional[str] = Field(
753
+ default=None, description="Timestamp string to time travel to"
754
+ )
755
+
756
+ @model_validator(mode="after")
757
+ def check_one_method(self):
758
+ if self.as_of_version is not None and self.as_of_timestamp is not None:
759
+ raise ValueError(
760
+ f"TimeTravelConfig validation failed: Cannot specify both 'as_of_version' and 'as_of_timestamp'. "
761
+ f"Got as_of_version={self.as_of_version} and as_of_timestamp='{self.as_of_timestamp}'. "
762
+ f"Use only one: as_of_version for a specific Delta version number, or as_of_timestamp for a point in time."
763
+ )
764
+ return self
765
+
766
+
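The mutual-exclusion check above behaves as follows (sketch):

```python
from pydantic import ValidationError
from odibi.config import TimeTravelConfig

TimeTravelConfig(as_of_version=10)                        # OK
TimeTravelConfig(as_of_timestamp="2023-10-01T12:00:00Z")  # OK
try:
    TimeTravelConfig(as_of_version=10,
                     as_of_timestamp="2023-10-01T12:00:00Z")
except ValidationError:
    pass  # both together are rejected by check_one_method
```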
767
+ class IncrementalUnit(str, Enum):
768
+ """
769
+ Time units for incremental lookback.
770
+
771
+ Values:
772
+ * `hour`
773
+ * `day`
774
+ * `month`
775
+ * `year`
776
+ """
777
+
778
+ HOUR = "hour"
779
+ DAY = "day"
780
+ MONTH = "month"
781
+ YEAR = "year"
782
+
783
+
784
+ class IncrementalMode(str, Enum):
785
+ """Mode for incremental loading."""
786
+
787
+ ROLLING_WINDOW = "rolling_window" # Current default: WHERE col >= NOW() - lookback
788
+ STATEFUL = "stateful" # New: WHERE col > last_hwm
789
+
790
+
791
+ class IncrementalConfig(BaseModel):
792
+ """
793
+ Configuration for automatic incremental loading.
794
+
795
+ **When to Use:** Load only new/changed data instead of full table scans.
796
+
797
+ **See Also:** [ReadConfig](#readconfig)
798
+
799
+ **Modes:**
800
+ 1. **Rolling Window** (Default): Uses a time-based lookback from NOW().
801
+ Good for: Stateless loading where you just want "recent" data.
802
+ Args: `lookback`, `unit`
803
+
804
+ 2. **Stateful**: Tracks the High-Water Mark (HWM) of the key column.
805
+ Good for: Exact incremental ingestion (e.g. CDC-like).
806
+ Args: `state_key` (optional), `watermark_lag` (optional)
807
+
808
+ Generates SQL:
809
+ - Rolling: `WHERE column >= NOW() - lookback`
810
+ - Stateful: `WHERE column > :last_hwm`
811
+
812
+ Example (Rolling Window):
813
+ ```yaml
814
+ incremental:
815
+ mode: "rolling_window"
816
+ column: "updated_at"
817
+ lookback: 3
818
+ unit: "day"
819
+ ```
820
+
821
+ Example (Stateful HWM):
822
+ ```yaml
823
+ incremental:
824
+ mode: "stateful"
825
+ column: "id"
826
+ # Optional: track separate column for HWM state
827
+ state_key: "last_processed_id"
828
+ ```
829
+
830
+ Example (Stateful with Watermark Lag):
831
+ ```yaml
832
+ incremental:
833
+ mode: "stateful"
834
+ column: "updated_at"
835
+ # Handle late-arriving data: look back 2 hours from HWM
836
+ watermark_lag: "2h"
837
+ ```
838
+
839
+ Example (Oracle Date Format):
840
+ ```yaml
841
+ incremental:
842
+ mode: "rolling_window"
843
+ column: "EVENT_START"
844
+ lookback: 3
845
+ unit: "day"
846
+ # For string columns with Oracle format (DD-MON-YY)
847
+ date_format: "oracle"
848
+ ```
849
+
850
+ Supported date_format values:
851
+ - `oracle`: DD-MON-YY for Oracle databases (uses TO_TIMESTAMP)
852
+ - `oracle_sqlserver`: DD-MON-YY format stored in SQL Server (uses TRY_CONVERT)
853
+ - `sql_server`: Uses CONVERT with style 120
854
+ - `us`: MM/DD/YYYY format
855
+ - `eu`: DD/MM/YYYY format
856
+ - `iso`: YYYY-MM-DDTHH:MM:SS format
857
+ """
858
+
859
+ model_config = {"populate_by_name": True}
860
+
861
+ mode: IncrementalMode = Field(
862
+ default=IncrementalMode.ROLLING_WINDOW,
863
+ description="Incremental strategy: 'rolling_window' or 'stateful'",
864
+ )
865
+
866
+ # Columns
867
+ column: str = Field(
868
+ alias="key_column", description="Primary column to filter on (e.g., updated_at)"
869
+ )
870
+ fallback_column: Optional[str] = Field(
871
+ default=None,
872
+ description="Backup column if primary is NULL (e.g., created_at). Generates COALESCE(col, fallback) >= ...",
873
+ )
874
+
875
+ # Rolling Window Args
876
+ lookback: Optional[int] = Field(
877
+ default=None, description="Time units to look back (Rolling Window only)"
878
+ )
879
+ unit: Optional[IncrementalUnit] = Field(
880
+ default=None,
881
+ description="Time unit for lookback (Rolling Window only). Options: 'hour', 'day', 'month', 'year'",
882
+ )
883
+
884
+ # Stateful Args
885
+ state_key: Optional[str] = Field(
886
+ default=None,
887
+ description="Unique ID for state tracking. Defaults to node name if not provided.",
888
+ )
889
+ watermark_lag: Optional[str] = Field(
890
+ default=None,
891
+ description=(
892
+ "Safety buffer for late-arriving data in stateful mode. "
893
+ "Subtracts this duration from the stored HWM when filtering. "
894
+ "Format: '<number><unit>' where unit is 's', 'm', 'h', or 'd'. "
895
+ "Examples: '2h' (2 hours), '30m' (30 minutes), '1d' (1 day). "
896
+ "Use when source has replication lag or eventual consistency."
897
+ ),
898
+ )
899
+
900
+ # Date format for string columns
901
+ date_format: Optional[str] = Field(
902
+ default=None,
903
+ description=(
904
+ "Source date format when the column is stored as a string. "
905
+ "Options: 'oracle' (DD-MON-YY for Oracle DB), "
906
+ "'oracle_sqlserver' (DD-MON-YY format in SQL Server), "
907
+ "'sql_server' (uses CONVERT with style 120), "
908
+ "'us' (MM/DD/YYYY), 'eu' (DD/MM/YYYY), 'iso' (YYYY-MM-DDTHH:MM:SS). "
909
+ "When set, SQL pushdown will use appropriate CONVERT/TO_TIMESTAMP functions."
910
+ ),
911
+ )
912
+
913
+ @model_validator(mode="after")
914
+ def check_mode_args(self):
915
+ if self.mode == IncrementalMode.ROLLING_WINDOW:
916
+ # Apply defaults if missing (Backward Compatibility)
917
+ if self.lookback is None:
918
+ self.lookback = 1
919
+ if self.unit is None:
920
+ self.unit = IncrementalUnit.DAY
921
+ return self
922
+
923
+
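A short sketch of how the two modes resolve their defaults and aliases (assuming `odibi` is importable):

```python
from odibi.config import IncrementalConfig, IncrementalMode, IncrementalUnit

# Rolling window: lookback/unit default to 1 day when omitted (backward compatibility)
roll = IncrementalConfig(column="updated_at")
assert roll.mode is IncrementalMode.ROLLING_WINDOW
assert (roll.lookback, roll.unit) == (1, IncrementalUnit.DAY)

# Stateful HWM with a safety lag for late-arriving rows;
# 'key_column' is accepted as an alias for 'column'
hwm = IncrementalConfig(mode="stateful", key_column="updated_at", watermark_lag="2h")
assert hwm.column == "updated_at"
```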
924
+ class ReadConfig(BaseModel):
925
+ """
926
+ Configuration for reading data into a node.
927
+
928
+ **When to Use:** First node in a pipeline, or any node that reads from storage.
929
+
930
+ **Key Concepts:**
931
+ - `connection`: References a named connection from `connections:` section
932
+ - `format`: File format (csv, parquet, delta, json, sql)
933
+ - `incremental`: Enable incremental loading (only new data)
934
+
935
+ **See Also:**
936
+ - [Incremental Loading](../patterns/incremental_stateful.md) - HWM-based loading
937
+ - [IncrementalConfig](#incrementalconfig) - Incremental loading options
938
+
939
+ ### 📖 "Universal Reader" Guide
940
+
941
+ **Business Problem:**
942
+ "I need to read from files, databases, streams, and even travel back in time to see how data looked yesterday."
943
+
944
+ **Recipe 1: The Time Traveler (Delta/Iceberg)**
945
+ *Reproduce a bug by seeing the data exactly as it was.*
946
+ ```yaml
947
+ read:
948
+ connection: "silver_lake"
949
+ format: "delta"
950
+ table: "fact_sales"
951
+ time_travel:
952
+ as_of_timestamp: "2023-10-25T14:00:00Z"
953
+ ```
954
+
955
+ **Recipe 2: The Streamer**
956
+ *Process data in real-time.*
957
+ ```yaml
958
+ read:
959
+ connection: "event_hub"
960
+ format: "json"
961
+ streaming: true
962
+ ```
963
+
964
+ **Recipe 3: The SQL Query**
965
+ *Push down filtering to the source database.*
966
+ ```yaml
967
+ read:
968
+ connection: "enterprise_dw"
969
+ format: "sql"
970
+ # Use the query option to filter at source!
971
+ query: "SELECT * FROM huge_table WHERE date >= '2024-01-01'"
972
+ ```
973
+
974
+ **Recipe 4: Archive Bad Records (Spark)**
975
+ *Capture malformed records for later inspection.*
976
+ ```yaml
977
+ read:
978
+ connection: "landing"
979
+ format: "json"
980
+ path: "events/*.json"
981
+ archive_options:
982
+ badRecordsPath: "/mnt/quarantine/bad_records"
983
+ ```
984
+
985
+ **Recipe 5: Optimize JDBC Parallelism (Spark)**
986
+ *Control partition count for SQL sources to reduce task overhead.*
987
+ ```yaml
988
+ read:
989
+ connection: "enterprise_dw"
990
+ format: "sql"
991
+ table: "small_lookup_table"
992
+ options:
993
+ numPartitions: 1 # Single partition for small tables
994
+ ```
995
+
996
+ **Performance Tip:** For small tables (<100K rows), use `numPartitions: 1` to avoid
997
+ excessive Spark task scheduling overhead. For large tables, increase partitions
998
+ to enable parallel reads (requires partitionColumn, lowerBound, upperBound).
999
+ """
1000
+
1001
+ connection: str = Field(description="Connection name from project.yaml")
1002
+ format: Union[ReadFormat, str] = Field(description="Data format (csv, parquet, delta, etc.)")
1003
+ table: Optional[str] = Field(default=None, description="Table name for SQL/Delta")
1004
+ path: Optional[str] = Field(default=None, description="Path for file-based sources")
1005
+ streaming: bool = Field(default=False, description="Enable streaming read (Spark only)")
1006
+ schema_ddl: Optional[str] = Field(
1007
+ default=None,
1008
+ description=(
1009
+ "Schema for streaming reads from file sources (required for Avro, JSON, CSV). "
1010
+ "Use Spark DDL format: 'col1 STRING, col2 INT, col3 TIMESTAMP'. "
1011
+ "Not required for Delta (schema is inferred from table metadata)."
1012
+ ),
1013
+ )
1014
+ query: Optional[str] = Field(
1015
+ default=None,
1016
+ description="SQL query to filter at source (pushdown). Mutually exclusive with table/path if supported by connector.",
1017
+ )
1018
+ filter: Optional[str] = Field(
1019
+ default=None,
1020
+ description="SQL WHERE clause filter (pushed down to source for SQL formats). Example: \"DAY > '2022-12-31'\"",
1021
+ )
1022
+ incremental: Optional[IncrementalConfig] = Field(
1023
+ default=None,
1024
+ description="Automatic incremental loading strategy (CDC-like). If set, generates query based on target state (HWM).",
1025
+ )
1026
+ time_travel: Optional[TimeTravelConfig] = Field(
1027
+ default=None, description="Time travel options (Delta only)"
1028
+ )
1029
+ archive_options: Dict[str, Any] = Field(
1030
+ default_factory=dict,
1031
+ description="Options for archiving bad records (e.g. badRecordsPath for Spark)",
1032
+ )
1033
+ options: Dict[str, Any] = Field(default_factory=dict, description="Format-specific options")
1034
+
1035
+ @model_validator(mode="after")
1036
+ def move_query_to_options(self):
1037
+ """Move top-level query to options."""
1038
+ if self.query:
1039
+ if "query" in self.options and self.options["query"] != self.query:
1040
+ raise ValueError(
1041
+ f"ReadConfig validation failed: 'query' specified in both top-level and options with different values. "
1042
+ f"Top-level query: '{self.query[:50]}{'...' if len(self.query) > 50 else ''}'. "
1043
+ f"Options query: '{str(self.options['query'])[:50]}{'...' if len(str(self.options['query'])) > 50 else ''}'. "
1044
+ f"Remove one of them or ensure they are identical."
1045
+ )
1046
+ self.options["query"] = self.query
1047
+ return self
1048
+
1049
+ @model_validator(mode="after")
1050
+ def move_filter_to_options(self):
1051
+ """Move top-level filter to options for SQL pushdown."""
1052
+ if self.filter:
1053
+ if "filter" in self.options and self.options["filter"] != self.filter:
1054
+ raise ValueError(
1055
+ f"ReadConfig validation failed: 'filter' specified in both top-level and options with different values. "
1056
+ f"Top-level filter: '{self.filter[:50]}{'...' if len(self.filter) > 50 else ''}'. "
1057
+ f"Options filter: '{str(self.options['filter'])[:50]}{'...' if len(str(self.options['filter'])) > 50 else ''}'. "
1058
+ f"Remove one of them or ensure they are identical."
1059
+ )
1060
+ self.options["filter"] = self.filter
1061
+ return self
1062
+
1063
+ @model_validator(mode="after")
1064
+ def check_table_or_path(self):
1065
+ """Ensure either table or path is provided."""
1066
+ # 1. Can't set both path and table
1067
+ if self.table and self.path:
1068
+ raise ValueError(
1069
+ f"ReadConfig validation failed: 'table' and 'path' are mutually exclusive. "
1070
+ f"Got table='{self.table}' and path='{self.path}'. "
1071
+ f"Use 'table' for catalog/database tables or 'path' for file-based sources, but not both."
1072
+ )
1073
+
1074
+ # 2. Format-specific rules
1075
+ has_query = self.options and "query" in self.options
1076
+
1077
+ if self.format == ReadFormat.SQL:
1078
+ if not (self.table or self.query or has_query):
1079
+ raise ValueError(
1080
+ f"ReadConfig validation failed: For format='sql', either 'table' or 'query' is required. "
1081
+ f"Got table={self.table}, query={self.query}. "
1082
+ f"Example: table: 'dbo.Customers' or query: 'SELECT * FROM dbo.Customers WHERE active = 1'"
1083
+ )
1084
+ elif self.format in [ReadFormat.CSV, ReadFormat.PARQUET, ReadFormat.JSON]:
1085
+ if not self.path:
1086
+ # File formats usually need 'path', but reading from a table/catalog is
1088
+ # also possible for some formats (e.g. parquet), so it is not enforced here.
1088
+ pass
1089
+
1090
+ if not self.table and not self.path and not has_query:
1091
+ raise ValueError(
1092
+ "ReadConfig validation failed: No data source specified. "
1093
+ "Provide one of: 'table' (for database/catalog), 'path' (for files), "
1094
+ "or 'query' (for SQL). Example: table: 'schema.table_name'"
1095
+ )
1096
+
1097
+ return self
1098
+
1099
+
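The validators above normalize and cross-check the read definition; a minimal sketch of both behaviours:

```python
from pydantic import ValidationError
from odibi.config import ReadConfig

# Top-level 'query' is copied into options for the connector (Recipe 3)
read = ReadConfig(
    connection="enterprise_dw",
    format="sql",
    query="SELECT * FROM huge_table WHERE date >= '2024-01-01'",
)
assert read.options["query"].startswith("SELECT")

# 'table' and 'path' are mutually exclusive
try:
    ReadConfig(connection="silver_lake", format="delta",
               table="fact_sales", path="silver/fact_sales")
except ValidationError as exc:
    print(exc)
```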
1100
+ class TransformStep(BaseModel):
1101
+ """
1102
+ Single transformation step.
1103
+
1104
+ Supports four step types (exactly one required):
1105
+
1106
+ * `sql` - Inline SQL query string
1107
+ * `sql_file` - Path to external .sql file (relative to the YAML file defining the node)
1108
+ * `function` - Registered Python function name
1109
+ * `operation` - Built-in operation (e.g., drop_duplicates)
1110
+
1111
+ **sql_file Example:**
1112
+
1113
+ If your project structure is:
1114
+ ```
1115
+ project.yaml # imports pipelines/silver/silver.yaml
1116
+ pipelines/
1117
+ silver/
1118
+ silver.yaml # defines the node
1119
+ sql/
1120
+ transform.sql # your SQL file
1121
+ ```
1122
+
1123
+ In `silver.yaml`, use a path relative to `silver.yaml`:
1124
+ ```yaml
1125
+ transform:
1126
+ steps:
1127
+ - sql_file: sql/transform.sql # relative to silver.yaml
1128
+ ```
1129
+
1130
+ **Important:** The path is resolved relative to the YAML file where the node is defined,
1131
+ NOT the project.yaml that imports it. Do NOT use absolute paths like `/pipelines/silver/sql/...`.
1132
+ """
1133
+
1134
+ sql: Optional[str] = Field(
1135
+ default=None,
1136
+ description="Inline SQL query. Use `df` to reference the current DataFrame.",
1137
+ )
1138
+ sql_file: Optional[str] = Field(
1139
+ default=None,
1140
+ description=(
1141
+ "Path to external .sql file, relative to the YAML file defining the node. "
1142
+ "Example: 'sql/transform.sql' resolves relative to the node's source YAML."
1143
+ ),
1144
+ )
1145
+ function: Optional[str] = Field(
1146
+ default=None,
1147
+ description="Name of a registered Python function (@transform or @register).",
1148
+ )
1149
+ operation: Optional[str] = Field(
1150
+ default=None,
1151
+ description="Built-in operation name (e.g., drop_duplicates, fill_na).",
1152
+ )
1153
+ params: Dict[str, Any] = Field(
1154
+ default_factory=dict,
1155
+ description="Parameters to pass to function or operation.",
1156
+ )
1157
+
1158
+ @model_validator(mode="after")
1159
+ def check_step_type(self):
1160
+ """Ensure exactly one step type is provided."""
1161
+ specified = [
1162
+ name
1163
+ for name, val in [
1164
+ ("sql", self.sql),
1165
+ ("sql_file", self.sql_file),
1166
+ ("function", self.function),
1167
+ ("operation", self.operation),
1168
+ ]
1169
+ if val is not None
1170
+ ]
1171
+ if len(specified) != 1:
1172
+ if len(specified) == 0:
1173
+ raise ValueError(
1174
+ "TransformStep validation failed: No step type specified. "
1175
+ "Provide exactly one of: 'sql', 'sql_file', 'function', or 'operation'. "
1176
+ "Example: sql: 'SELECT * FROM df' or operation: 'drop_duplicates'"
1177
+ )
1178
+ else:
1179
+ raise ValueError(
1180
+ f"TransformStep validation failed: Multiple step types specified: "
1181
+ f"{specified}. Use exactly one of: 'sql', 'sql_file', 'function', or 'operation'."
1182
+ )
1183
+ return self
1184
+
1185
+
1186
+ class TransformConfig(BaseModel):
1187
+ """
1188
+ Configuration for transformation steps within a node.
1189
+
1190
+ **When to Use:** Custom business logic, data cleaning, SQL transformations.
1191
+
1192
+ **Key Concepts:**
1193
+ - `steps`: Ordered list of operations (SQL, functions, or both)
1194
+ - Each step receives the DataFrame from the previous step
1195
+ - Steps execute in order: step1 → step2 → step3
1196
+
1197
+ **See Also:** [Transformer Catalog](#nodeconfig)
1198
+
1199
+ **Transformer vs Transform:**
1200
+ - `transformer`: Single heavy operation (scd2, merge, deduplicate)
1201
+ - `transform.steps`: Chain of lighter operations
1202
+
1203
+ ### 🔧 "Transformation Pipeline" Guide
1204
+
1205
+ **Business Problem:**
1206
+ "I have complex logic that mixes SQL for speed and Python for complex calculations."
1207
+
1208
+ **The Solution:**
1209
+ Chain multiple steps together. Output of Step 1 becomes input of Step 2.
1210
+
1211
+ **Function Registry:**
1212
+ The `function` step type looks up functions registered with `@transform` (or `@register`).
1213
+ This allows you to use the *same* registered functions as both top-level Transformers and steps in a chain.
1214
+
1215
+ **Recipe: The Mix-and-Match**
1216
+ ```yaml
1217
+ transform:
1218
+ steps:
1219
+ # Step 1: SQL Filter (Fast)
1220
+ - sql: "SELECT * FROM df WHERE status = 'ACTIVE'"
1221
+
1222
+ # Step 2: Custom Python Function (Complex Logic)
1223
+ # Looks up 'calculate_lifetime_value' in the registry
1224
+ - function: "calculate_lifetime_value"
1225
+ params: { discount_rate: 0.05 }
1226
+
1227
+ # Step 3: Built-in Operation (Standard)
1228
+ - operation: "drop_duplicates"
1229
+ params: { subset: ["user_id"] }
1230
+ ```
1231
+ """
1232
+
1233
+ steps: List[Union[str, TransformStep]] = Field(
1234
+ description="List of transformation steps (SQL strings or TransformStep configs)"
1235
+ )
1236
+
1237
+
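The Mix-and-Match recipe translates to a steps list of plain SQL strings and `TransformStep` entries (sketch):

```python
from odibi.config import TransformConfig, TransformStep

transform = TransformConfig(steps=[
    "SELECT * FROM df WHERE status = 'ACTIVE'",         # bare string = inline SQL
    TransformStep(function="calculate_lifetime_value",
                  params={"discount_rate": 0.05}),      # registered Python function
    TransformStep(operation="drop_duplicates",
                  params={"subset": ["user_id"]}),      # built-in operation
])
assert len(transform.steps) == 3
```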
1238
+ class ValidationAction(str, Enum):
1239
+ FAIL = "fail"
1240
+ WARN = "warn"
1241
+
1242
+
1243
+ class OnFailAction(str, Enum):
1244
+ ALERT = "alert"
1245
+ IGNORE = "ignore"
1246
+
1247
+
1248
+ class TestType(str, Enum):
1249
+ __test__ = False # Prevent pytest collection
1250
+
1251
+ NOT_NULL = "not_null"
1252
+ UNIQUE = "unique"
1253
+ ACCEPTED_VALUES = "accepted_values"
1254
+ ROW_COUNT = "row_count"
1255
+ CUSTOM_SQL = "custom_sql"
1256
+ RANGE = "range"
1257
+ REGEX_MATCH = "regex_match"
1258
+ VOLUME_DROP = "volume_drop" # Phase 4.1: History-Aware
1259
+ SCHEMA = "schema"
1260
+ DISTRIBUTION = "distribution"
1261
+ FRESHNESS = "freshness"
1262
+
1263
+
1264
+ class ContractSeverity(str, Enum):
1265
+ WARN = "warn"
1266
+ FAIL = "fail"
1267
+ QUARANTINE = "quarantine"
1268
+
1269
+
1270
+ class BaseTestConfig(BaseModel):
1271
+ type: TestType
1272
+ name: Optional[str] = Field(default=None, description="Optional name for the check")
1273
+ on_fail: ContractSeverity = Field(
1274
+ default=ContractSeverity.FAIL, description="Action on failure"
1275
+ )
1276
+
1277
+
1278
+ class VolumeDropTest(BaseTestConfig):
1279
+ """
1280
+ Checks if row count dropped significantly compared to history.
1281
+
1282
+ **When to Use:** Detect source outages, partial loads, or data pipeline issues.
1283
+
1284
+ **See Also:** [Contracts Overview](#contracts-data-quality-gates), [RowCountTest](#rowcounttest)
1285
+
1286
+ Formula: `(current - avg) / avg < -threshold`
1287
+
1288
+ ```yaml
1289
+ contracts:
1290
+ - type: volume_drop
1291
+ threshold: 0.5 # Fail if > 50% drop from 7-day average
1292
+ lookback_days: 7
1293
+ ```
1294
+ """
1295
+
1296
+ type: Literal[TestType.VOLUME_DROP] = TestType.VOLUME_DROP
1297
+ threshold: float = Field(default=0.5, description="Max allowed drop (0.5 = 50% drop)")
1298
+ lookback_days: int = Field(default=7, description="Days of history to average")
1299
+
1300
+
1301
+ class NotNullTest(BaseTestConfig):
1302
+ """
1303
+ Ensures specified columns contain no NULL values.
1304
+
1305
+ **When to Use:** Primary keys, required fields, foreign keys that must resolve.
1306
+
1307
+ **See Also:** [Contracts Overview](#contracts-data-quality-gates)
1308
+
1309
+ ```yaml
1310
+ contracts:
1311
+ - type: not_null
1312
+ columns: [order_id, customer_id, created_at]
1313
+ ```
1314
+ """
1315
+
1316
+ type: Literal[TestType.NOT_NULL] = TestType.NOT_NULL
1317
+ columns: List[str] = Field(description="Columns that must not contain nulls")
1318
+
1319
+
1320
+ class UniqueTest(BaseTestConfig):
1321
+ """
1322
+ Ensures specified columns (or combination) contain unique values.
1323
+
1324
+ **When to Use:** Primary keys, natural keys, deduplication verification.
1325
+
1326
+ **See Also:** [Contracts Overview](#contracts-data-quality-gates)
1327
+
1328
+ ```yaml
1329
+ contracts:
1330
+ - type: unique
1331
+ columns: [order_id] # Single column
1332
+ # OR composite key:
1333
+ - type: unique
1334
+ columns: [customer_id, order_date] # Composite uniqueness
1335
+ ```
1336
+ """
1337
+
1338
+ type: Literal[TestType.UNIQUE] = TestType.UNIQUE
1339
+ columns: List[str] = Field(
1340
+ description="Columns that must be unique (composite key if multiple)"
1341
+ )
1342
+
1343
+
1344
+ class AcceptedValuesTest(BaseTestConfig):
1345
+ """
1346
+ Ensures a column only contains values from an allowed list.
1347
+
1348
+ **When to Use:** Enum-like fields, status columns, categorical data validation.
1349
+
1350
+ **See Also:** [Contracts Overview](#contracts-data-quality-gates)
1351
+
1352
+ ```yaml
1353
+ contracts:
1354
+ - type: accepted_values
1355
+ column: status
1356
+ values: [pending, approved, rejected]
1357
+ ```
1358
+ """
1359
+
1360
+ type: Literal[TestType.ACCEPTED_VALUES] = TestType.ACCEPTED_VALUES
1361
+ column: str = Field(description="Column to check")
1362
+ values: List[Any] = Field(description="Allowed values")
1363
+
1364
+
1365
+ class RowCountTest(BaseTestConfig):
1366
+ """
1367
+ Validates that row count falls within expected bounds.
1368
+
1369
+ **When to Use:** Ensure minimum data completeness, detect truncated loads, cap batch sizes.
1370
+
1371
+ **See Also:** [Contracts Overview](#contracts-data-quality-gates), [GateConfig](#gateconfig)
1372
+
1373
+ ```yaml
1374
+ contracts:
1375
+ - type: row_count
1376
+ min: 1000
1377
+ max: 100000
1378
+ ```
1379
+ """
1380
+
1381
+ type: Literal[TestType.ROW_COUNT] = TestType.ROW_COUNT
1382
+ min: Optional[int] = Field(default=None, description="Minimum row count")
1383
+ max: Optional[int] = Field(default=None, description="Maximum row count")
1384
+
1385
+
1386
+ class CustomSQLTest(BaseTestConfig):
1387
+ """
1388
+ Runs a custom SQL condition and fails if too many rows violate it.
1389
+
1390
+ ```yaml
1391
+ contracts:
1392
+ - type: custom_sql
1393
+ condition: "amount > 0"
1394
+ threshold: 0.01 # Allow up to 1% failures
1395
+ ```
1396
+ """
1397
+
1398
+ type: Literal[TestType.CUSTOM_SQL] = TestType.CUSTOM_SQL
1399
+ condition: str = Field(description="SQL condition that should be true for valid rows")
1400
+ threshold: float = Field(
1401
+ default=0.0, description="Failure rate threshold (0.0 = strictly no failures allowed)"
1402
+ )
1403
+
1404
+
1405
+ class RangeTest(BaseTestConfig):
1406
+ """
1407
+ Ensures column values fall within a specified range.
1408
+
1409
+ **When to Use:** Numeric bounds validation (ages, prices, quantities), date ranges.
1410
+
1411
+ **See Also:** [Contracts Overview](#contracts-data-quality-gates)
1412
+
1413
+ ```yaml
1414
+ contracts:
1415
+ - type: range
1416
+ column: age
1417
+ min: 0
1418
+ max: 150
1419
+ ```
1420
+ """
1421
+
1422
+ type: Literal[TestType.RANGE] = TestType.RANGE
1423
+ column: str = Field(description="Column to check")
1424
+ min: Optional[Union[int, float, str]] = Field(
1425
+ default=None, description="Minimum value (inclusive)"
1426
+ )
1427
+ max: Optional[Union[int, float, str]] = Field(
1428
+ default=None, description="Maximum value (inclusive)"
1429
+ )
1430
+
1431
+
1432
+ class RegexMatchTest(BaseTestConfig):
1433
+ """
1434
+ Ensures column values match a regex pattern.
1435
+
1436
+ **When to Use:** Format validation (emails, phone numbers, IDs, codes).
1437
+
1438
+ **See Also:** [Contracts Overview](#contracts-data-quality-gates)
1439
+
1440
+ ```yaml
1441
+ contracts:
1442
+ - type: regex_match
1443
+ column: email
1444
+ pattern: "^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$"
1445
+ ```
1446
+ """
1447
+
1448
+ type: Literal[TestType.REGEX_MATCH] = TestType.REGEX_MATCH
1449
+ column: str = Field(description="Column to check")
1450
+ pattern: str = Field(description="Regex pattern to match")
1451
+
1452
+
1453
+ class SchemaContract(BaseTestConfig):
1454
+ """
1455
+ Validates that the DataFrame schema matches expected columns.
1456
+
1457
+ **When to Use:** Enforce schema stability, detect upstream schema drift, ensure column presence.
1458
+
1459
+ **See Also:** [Contracts Overview](#contracts-data-quality-gates), [SchemaPolicyConfig](#schemapolicyconfig)
1460
+
1461
+ Uses the `columns` metadata from NodeConfig to verify schema.
1462
+
1463
+ ```yaml
1464
+ contracts:
1465
+ - type: schema
1466
+ strict: true # Fail if extra columns present
1467
+ ```
1468
+ """
1469
+
1470
+ type: Literal[TestType.SCHEMA] = TestType.SCHEMA
1471
+ strict: bool = Field(default=True, description="If true, fail on unexpected columns")
1472
+ on_fail: ContractSeverity = ContractSeverity.FAIL
1473
+
1474
+
1475
+ class DistributionContract(BaseTestConfig):
1476
+ """
1477
+ Checks if a column's statistical distribution is within expected bounds.
1478
+
1479
+ **When to Use:** Detect data drift, anomaly detection, statistical monitoring.
1480
+
1481
+ **See Also:** [Contracts Overview](#contracts-data-quality-gates)
1482
+
1483
+ ```yaml
1484
+ contracts:
1485
+ - type: distribution
1486
+ column: price
1487
+ metric: mean
1488
+ threshold: ">100" # Mean must be > 100
1489
+ on_fail: warn
1490
+ ```
1491
+ """
1492
+
1493
+ type: Literal[TestType.DISTRIBUTION] = TestType.DISTRIBUTION
1494
+ column: str = Field(description="Column to analyze")
1495
+ metric: Literal["mean", "min", "max", "null_percentage"] = Field(
1496
+ description="Statistical metric to check"
1497
+ )
1498
+ threshold: str = Field(description="Threshold expression (e.g., '>100', '<0.05')")
1499
+ on_fail: ContractSeverity = ContractSeverity.WARN
1500
+
1501
+
1502
+ class FreshnessContract(BaseTestConfig):
1503
+ """
1504
+ Validates that data is not stale by checking a timestamp column.
1505
+
1506
+ **When to Use:** Source systems that should update regularly, SLA monitoring.
1507
+
1508
+ **See Also:** [Contracts Overview](#contracts-data-quality-gates)
1509
+
1510
+ ```yaml
1511
+ contracts:
1512
+ - type: freshness
1513
+ column: updated_at
1514
+ max_age: "24h" # Fail if no data newer than 24 hours
1515
+ ```
1516
+ """
1517
+
1518
+ type: Literal[TestType.FRESHNESS] = TestType.FRESHNESS
1519
+ column: str = Field(default="updated_at", description="Timestamp column to check")
1520
+ max_age: str = Field(description="Maximum allowed age (e.g., '24h', '7d')")
1521
+ on_fail: ContractSeverity = ContractSeverity.FAIL
1522
+
1523
+
1524
+ TestConfig = Annotated[
1525
+ Union[
1526
+ NotNullTest,
1527
+ UniqueTest,
1528
+ AcceptedValuesTest,
1529
+ RowCountTest,
1530
+ CustomSQLTest,
1531
+ RangeTest,
1532
+ RegexMatchTest,
1533
+ VolumeDropTest,
1534
+ SchemaContract,
1535
+ DistributionContract,
1536
+ FreshnessContract,
1537
+ ],
1538
+ Field(discriminator="type"),
1539
+ ]
1540
+
1541
+
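Because `TestConfig` is a discriminated union on `type`, a YAML `contracts:` list deserializes into the matching test models; a sketch using pydantic v2's `TypeAdapter`:

```python
from typing import List
from pydantic import TypeAdapter
from odibi.config import TestConfig, NotNullTest, FreshnessContract

contracts = TypeAdapter(List[TestConfig]).validate_python([
    {"type": "not_null", "columns": ["order_id", "customer_id"]},
    {"type": "freshness", "column": "updated_at", "max_age": "24h"},
])
assert isinstance(contracts[0], NotNullTest)
assert isinstance(contracts[1], FreshnessContract)
```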
1542
+ # ============================================
1543
+ # Quarantine Configuration
1544
+ # ============================================
1545
+
1546
+
1547
+ class QuarantineColumnsConfig(BaseModel):
1548
+ """
1549
+ Columns added to quarantined rows for debugging and reprocessing.
1550
+
1551
+ Example:
1552
+ ```yaml
1553
+ quarantine:
1554
+ connection: silver
1555
+ path: customers_quarantine
1556
+ add_columns:
1557
+ _rejection_reason: true
1558
+ _rejected_at: true
1559
+ _source_batch_id: true
1560
+ _failed_tests: true
1561
+ _original_node: false
1562
+ ```
1563
+ """
1564
+
1565
+ rejection_reason: bool = Field(
1566
+ default=True,
1567
+ description="Add _rejection_reason column with test failure description",
1568
+ )
1569
+ rejected_at: bool = Field(
1570
+ default=True,
1571
+ description="Add _rejected_at column with UTC timestamp",
1572
+ )
1573
+ source_batch_id: bool = Field(
1574
+ default=True,
1575
+ description="Add _source_batch_id column with run ID for traceability",
1576
+ )
1577
+ failed_tests: bool = Field(
1578
+ default=True,
1579
+ description="Add _failed_tests column with comma-separated list of failed test names",
1580
+ )
1581
+ original_node: bool = Field(
1582
+ default=False,
1583
+ description="Add _original_node column with source node name",
1584
+ )
1585
+
1586
+
1587
+ class QuarantineConfig(BaseModel):
1588
+ """
1589
+ Configuration for quarantine table routing.
1590
+
1591
+ **When to Use:** Capture invalid records for review/reprocessing instead of failing the pipeline.
1592
+
1593
+ **See Also:** [Quarantine Guide](../features/quarantine.md), [ValidationConfig](#validationconfig)
1594
+
1595
+ Routes rows that fail validation tests to a quarantine table
1596
+ with rejection metadata for later analysis/reprocessing.
1597
+
1598
+ Example:
1599
+ ```yaml
1600
+ validation:
1601
+ tests:
1602
+ - type: not_null
1603
+ columns: [customer_id]
1604
+ on_fail: quarantine
1605
+ quarantine:
1606
+ connection: silver
1607
+ path: customers_quarantine
1608
+ add_columns:
1609
+ _rejection_reason: true
1610
+ _rejected_at: true
1611
+ max_rows: 10000
1612
+ sample_fraction: 0.1
1613
+ ```
1614
+ """
1615
+
1616
+ connection: str = Field(description="Connection for quarantine writes")
1617
+ path: Optional[str] = Field(default=None, description="Path for quarantine data")
1618
+ table: Optional[str] = Field(default=None, description="Table name for quarantine")
1619
+ add_columns: QuarantineColumnsConfig = Field(
1620
+ default_factory=QuarantineColumnsConfig,
1621
+ description="Metadata columns to add to quarantined rows",
1622
+ )
1623
+ retention_days: Optional[int] = Field(
1624
+ default=90,
1625
+ ge=1,
1626
+ description="Days to retain quarantined data (auto-cleanup)",
1627
+ )
1628
+ max_rows: Optional[int] = Field(
1629
+ default=None,
1630
+ ge=1,
1631
+ description="Maximum number of rows to quarantine per run. Limits storage for high-failure batches.",
1632
+ )
1633
+ sample_fraction: Optional[float] = Field(
1634
+ default=None,
1635
+ ge=0.0,
1636
+ le=1.0,
1637
+ description="Sample fraction of invalid rows to quarantine (0.0-1.0). Use for sampling large invalid sets.",
1638
+ )
1639
+
1640
+ @model_validator(mode="after")
1641
+ def validate_destination(self):
1642
+ """Ensure either path or table is specified."""
1643
+ if not self.path and not self.table:
1644
+ raise ValueError("QuarantineConfig requires either 'path' or 'table'")
1645
+ return self
1646
+
1647
+
1648
+ # ============================================
1649
+ # Quality Gate Configuration
1650
+ # ============================================
1651
+
1652
+
1653
+ class GateOnFail(str, Enum):
1654
+ """
1655
+ Action when quality gate fails.
1656
+
1657
+ Values:
1658
+ * `abort` - Stop pipeline, write nothing (default)
1659
+ * `warn_and_write` - Log warning, write all rows anyway
1660
+ * `write_valid_only` - Write only rows that passed validation
1661
+ """
1662
+
1663
+ ABORT = "abort"
1664
+ WARN_AND_WRITE = "warn_and_write"
1665
+ WRITE_VALID_ONLY = "write_valid_only"
1666
+
1667
+
1668
+ class GateThreshold(BaseModel):
1669
+ """
1670
+ Per-test threshold configuration for quality gates.
1671
+
1672
+ Allows setting different pass rate requirements for specific tests.
1673
+
1674
+ Example:
1675
+ ```yaml
1676
+ gate:
1677
+ thresholds:
1678
+ - test: not_null
1679
+ min_pass_rate: 0.99
1680
+ - test: unique
1681
+ min_pass_rate: 1.0
1682
+ ```
1683
+ """
1684
+
1685
+ test: str = Field(description="Test name or type to apply threshold to")
1686
+ min_pass_rate: float = Field(
1687
+ ge=0.0,
1688
+ le=1.0,
1689
+ description="Minimum pass rate required (0.0-1.0, e.g., 0.99 = 99%)",
1690
+ )
1691
+
1692
+
1693
+ class RowCountGate(BaseModel):
1694
+ """
1695
+ Row count anomaly detection for quality gates.
1696
+
1697
+ Validates that batch size falls within expected bounds and
1698
+ detects significant changes from previous runs.
1699
+
1700
+ Example:
1701
+ ```yaml
1702
+ gate:
1703
+ row_count:
1704
+ min: 100
1705
+ max: 1000000
1706
+ change_threshold: 0.5
1707
+ ```
1708
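+
+ Worked example for `change_threshold` (illustrative arithmetic only; the exact comparison is applied by the gate evaluator at run time, so treat the formula here as an assumption):
+
+ ```python
+ gate = RowCountGate(min=100, max=1_000_000, change_threshold=0.5)
+
+ previous_rows, current_rows = 1_000, 400
+ change = abs(current_rows - previous_rows) / previous_rows   # 0.6, i.e. a 60% swing
+ print(change > gate.change_threshold)                        # True: this batch would be flagged
+ ```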
+ """
1709
+
1710
+ min: Optional[int] = Field(default=None, ge=0, description="Minimum expected row count")
1711
+ max: Optional[int] = Field(default=None, ge=0, description="Maximum expected row count")
1712
+ change_threshold: Optional[float] = Field(
1713
+ default=None,
1714
+ ge=0.0,
1715
+ le=1.0,
1716
+ description="Max allowed change vs previous run (e.g., 0.5 = 50% change triggers failure)",
1717
+ )
1718
+
1719
+
1720
+ class GateConfig(BaseModel):
1721
+ """
1722
+ Quality gate configuration for batch-level validation.
1723
+
1724
+ **When to Use:** Pipeline-level pass/fail thresholds, row count limits, change detection.
1725
+
1726
+ **See Also:** Quality Gates, [ValidationConfig](#validationconfig)
1727
+
1728
+ Gates evaluate the entire batch before writing, ensuring
1729
+ data quality thresholds are met.
1730
+
1731
+ Example:
1732
+ ```yaml
1733
+ gate:
1734
+ require_pass_rate: 0.95
1735
+ on_fail: abort
1736
+ thresholds:
1737
+ - test: not_null
1738
+ min_pass_rate: 0.99
1739
+ row_count:
1740
+ min: 100
1741
+ change_threshold: 0.5
1742
+ ```
1743
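+
+ The block above parses into nested models. A minimal sketch (assumes Pydantic v2 `model_validate`):
+
+ ```python
+ gate = GateConfig.model_validate({
+     "require_pass_rate": 0.95,
+     "on_fail": "abort",
+     "thresholds": [{"test": "not_null", "min_pass_rate": 0.99}],
+     "row_count": {"min": 100, "change_threshold": 0.5},
+ })
+ assert gate.on_fail is GateOnFail.ABORT
+ assert gate.thresholds[0].min_pass_rate == 0.99
+ ```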
+ """
1744
+
1745
+ require_pass_rate: float = Field(
1746
+ default=0.95,
1747
+ ge=0.0,
1748
+ le=1.0,
1749
+ description="Minimum fraction of rows that must pass ALL tests (0.0-1.0, e.g., 0.95 = 95%)",
1750
+ )
1751
+ on_fail: GateOnFail = Field(
1752
+ default=GateOnFail.ABORT,
1753
+ description="Action when gate fails",
1754
+ )
1755
+ thresholds: List[GateThreshold] = Field(
1756
+ default_factory=list,
1757
+ description="Per-test thresholds (overrides global require_pass_rate)",
1758
+ )
1759
+ row_count: Optional[RowCountGate] = Field(
1760
+ default=None,
1761
+ description="Row count anomaly detection",
1762
+ )
1763
+
1764
+
1765
+ class ValidationConfig(BaseModel):
1766
+ """
1767
+ Configuration for data validation (post-transform checks).
1768
+
1769
+ **When to Use:** Output data quality checks that run after transformation but before writing.
1770
+
1771
+ **See Also:** Validation Guide, Quarantine Guide, Contracts Overview (pre-transform checks)
1772
+
1773
+ ### 🛡️ "The Indestructible Pipeline" Pattern
1774
+
1775
+ **Business Problem:**
1776
+ "Bad data polluted our Gold reports, causing executives to make wrong decisions. We need to stop it *before* it lands."
1777
+
1778
+ **The Solution:**
1779
+ A Quality Gate that runs *after* transformation but *before* writing.
1780
+
1781
+ **Recipe: The Quality Gate**
1782
+ ```yaml
1783
+ validation:
1784
+ mode: "fail" # fail (stop pipeline) or warn (log only)
1785
+ on_fail: "alert" # alert or ignore
1786
+
1787
+ tests:
1788
+ # 1. Completeness
1789
+ - type: "not_null"
1790
+ columns: ["transaction_id", "customer_id"]
1791
+
1792
+ # 2. Integrity
1793
+ - type: "unique"
1794
+ columns: ["transaction_id"]
1795
+
1796
+ - type: "accepted_values"
1797
+ column: "status"
1798
+ values: ["PENDING", "COMPLETED", "FAILED"]
1799
+
1800
+ # 3. Ranges & Patterns
1801
+ - type: "range"
1802
+ column: "age"
1803
+ min: 18
1804
+ max: 120
1805
+
1806
+ - type: "regex_match"
1807
+ column: "email"
1808
+ pattern: "^[\\w\\.-]+@[\\w\\.-]+\\.\\w+$"
1809
+
1810
+ # 4. Business Logic (SQL)
1811
+ - type: "custom_sql"
1812
+ name: "dates_ordered"
1813
+ condition: "created_at <= completed_at"
1814
+ threshold: 0.01 # Allow 1% failure
1815
+ ```
1816
+
1817
+ **Recipe: Quarantine + Gate**
1818
+ ```yaml
1819
+ validation:
1820
+ tests:
1821
+ - type: not_null
1822
+ columns: [customer_id]
1823
+ on_fail: quarantine
1824
+ quarantine:
1825
+ connection: silver
1826
+ path: customers_quarantine
1827
+ gate:
1828
+ require_pass_rate: 0.95
1829
+ on_fail: abort
1830
+ ```
1831
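+
+ Each entry under `tests` is dispatched to its model class via the `type` discriminator on `TestConfig`. A sketch of how the recipe above parses (illustrative; assumes Pydantic v2):
+
+ ```python
+ cfg = ValidationConfig.model_validate({
+     "tests": [
+         {"type": "not_null", "columns": ["customer_id"], "on_fail": "quarantine"},
+         {"type": "freshness", "column": "updated_at", "max_age": "24h"},
+     ],
+     "quarantine": {"connection": "silver", "path": "customers_quarantine"},
+     "gate": {"require_pass_rate": 0.95, "on_fail": "abort"},
+ })
+ print([type(t).__name__ for t in cfg.tests])   # ['NotNullTest', 'FreshnessContract']
+ ```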
+ """
1832
+
1833
+ mode: ValidationAction = Field(
1834
+ default=ValidationAction.FAIL,
1835
+ description="Execution mode: 'fail' (stop pipeline) or 'warn' (log only)",
1836
+ )
1837
+ on_fail: OnFailAction = Field(
1838
+ default=OnFailAction.ALERT,
1839
+ description="Action on failure: 'alert' (send notification) or 'ignore'",
1840
+ )
1841
+ tests: List[TestConfig] = Field(default_factory=list, description="List of validation tests")
1842
+ quarantine: Optional[QuarantineConfig] = Field(
1843
+ default=None,
1844
+ description="Quarantine configuration for failed rows",
1845
+ )
1846
+ gate: Optional[GateConfig] = Field(
1847
+ default=None,
1848
+ description="Quality gate configuration for batch-level validation",
1849
+ )
1850
+ fail_fast: bool = Field(
1851
+ default=False,
1852
+ description="Stop validation on first failure. Skips remaining tests for faster feedback.",
1853
+ )
1854
+ cache_df: bool = Field(
1855
+ default=False,
1856
+ description="Cache DataFrame before validation (Spark only). Improves performance with many tests.",
1857
+ )
1858
+
1859
+ @model_validator(mode="after")
1860
+ def validate_quarantine_config(self):
1861
+ """Warn if quarantine config exists but no tests use on_fail: quarantine."""
1862
+ import warnings
1863
+
1864
+ if self.quarantine and self.tests:
1865
+ has_quarantine_tests = any(t.on_fail == ContractSeverity.QUARANTINE for t in self.tests)
1866
+ if not has_quarantine_tests:
1867
+ warnings.warn(
1868
+ "Quarantine config is defined but no tests have 'on_fail: quarantine'. "
1869
+ "Quarantine will not be used. Add 'on_fail: quarantine' to tests that "
1870
+ "should route failed rows to quarantine.",
1871
+ UserWarning,
1872
+ stacklevel=2,
1873
+ )
1874
+ return self
1875
+
1876
+
1877
+ class AutoOptimizeConfig(BaseModel):
1878
+ """
1879
+ Configuration for Delta Lake automatic optimization.
1880
+
1881
+ Example:
1882
+ ```yaml
1883
+ auto_optimize:
1884
+ enabled: true
1885
+ vacuum_retention_hours: 168
1886
+ ```
1887
+ """
1888
+
1889
+ enabled: bool = Field(default=True, description="Enable auto optimization")
1890
+ vacuum_retention_hours: int = Field(
1891
+ default=168,
1892
+ description="Hours to retain history for VACUUM (default 7 days). Set to 0 to disable VACUUM.",
1893
+ )
1894
+
1895
+
1896
+ class SqlServerAuditColsConfig(BaseModel):
1897
+ """
1898
+ Audit column configuration for SQL Server merge operations.
1899
+
1900
+ These columns are automatically populated with GETUTCDATE() during merge:
1901
+ - `created_col`: Set on INSERT only
1902
+ - `updated_col`: Set on INSERT and UPDATE
1903
+
1904
+ Example:
1905
+ ```yaml
1906
+ audit_cols:
1907
+ created_col: created_ts
1908
+ updated_col: updated_ts
1909
+ ```
1910
+ """
1911
+
1912
+ created_col: Optional[str] = Field(
1913
+ default=None,
1914
+ description="Column name for creation timestamp (set on INSERT)",
1915
+ )
1916
+ updated_col: Optional[str] = Field(
1917
+ default=None,
1918
+ description="Column name for update timestamp (set on INSERT and UPDATE)",
1919
+ )
1920
+
1921
+
1922
+ class SqlServerMergeOptions(BaseModel):
1923
+ """
1924
+ Options for SQL Server MERGE operations (Phase 1).
1925
+
1926
+ Enables incremental sync from Spark to SQL Server using T-SQL MERGE.
1927
+ Data is written to a staging table, then merged into the target.
1928
+
1929
+ ### Basic Usage
1930
+ ```yaml
1931
+ write:
1932
+ connection: azure_sql
1933
+ format: sql_server
1934
+ table: sales.fact_orders
1935
+ mode: merge
1936
+ merge_keys: [DateId, store_id]
1937
+ merge_options:
1938
+ update_condition: "source._hash_diff != target._hash_diff"
1939
+ exclude_columns: [_hash_diff]
1940
+ audit_cols:
1941
+ created_col: created_ts
1942
+ updated_col: updated_ts
1943
+ ```
1944
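+
+ The `merge_options` block above corresponds to this model. Equivalent construction in Python (illustrative sketch):
+
+ ```python
+ opts = SqlServerMergeOptions.model_validate({
+     "update_condition": "source._hash_diff != target._hash_diff",
+     "exclude_columns": ["_hash_diff"],
+     "audit_cols": {"created_col": "created_ts", "updated_col": "updated_ts"},
+ })
+ assert opts.staging_schema == "staging"            # default staging schema
+ assert opts.audit_cols.updated_col == "updated_ts"
+ ```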
+
1945
+ ### Conditions
1946
+ - `update_condition`: Only update rows matching this condition (e.g., hash diff)
1947
+ - `delete_condition`: Delete rows matching this condition (soft delete pattern)
1948
+ - `insert_condition`: Only insert rows matching this condition
1949
+ """
1950
+
1951
+ update_condition: Optional[str] = Field(
1952
+ default=None,
1953
+ description=(
1954
+ "SQL condition for WHEN MATCHED UPDATE. "
1955
+ "Use 'source.' and 'target.' prefixes. "
1956
+ "Example: 'source._hash_diff != target._hash_diff'"
1957
+ ),
1958
+ )
1959
+ delete_condition: Optional[str] = Field(
1960
+ default=None,
1961
+ description=("SQL condition for WHEN MATCHED DELETE. Example: 'source._is_deleted = 1'"),
1962
+ )
1963
+ insert_condition: Optional[str] = Field(
1964
+ default=None,
1965
+ description=("SQL condition for WHEN NOT MATCHED INSERT. Example: 'source.is_valid = 1'"),
1966
+ )
1967
+ exclude_columns: List[str] = Field(
1968
+ default_factory=list,
1969
+ description="Columns to exclude from MERGE (not written to target table)",
1970
+ )
1971
+ staging_schema: str = Field(
1972
+ default="staging",
1973
+ description="Schema for staging table. Table name: {staging_schema}.{table}_staging",
1974
+ )
1975
+ audit_cols: Optional[SqlServerAuditColsConfig] = Field(
1976
+ default=None,
1977
+ description="Audit columns for created/updated timestamps",
1978
+ )
1979
+ validations: Optional["SqlServerMergeValidationConfig"] = Field(
1980
+ default=None,
1981
+ description="Validation checks before merge (null keys, duplicate keys)",
1982
+ )
1983
+ auto_create_schema: bool = Field(
1984
+ default=False,
1985
+ description="Auto-create schema if it doesn't exist (Phase 4). Runs CREATE SCHEMA IF NOT EXISTS.",
1986
+ )
1987
+ auto_create_table: bool = Field(
1988
+ default=False,
1989
+ description="Auto-create target table if it doesn't exist (Phase 4). Infers schema from DataFrame.",
1990
+ )
1991
+ schema_evolution: Optional["SqlServerSchemaEvolutionConfig"] = Field(
1992
+ default=None,
1993
+ description="Schema evolution configuration (Phase 4). Controls handling of schema differences.",
1994
+ )
1995
+ batch_size: Optional[int] = Field(
1996
+ default=None,
1997
+ description="Batch size for staging table writes (Phase 4). Chunks large DataFrames for memory efficiency.",
1998
+ )
1999
+ primary_key_on_merge_keys: bool = Field(
2000
+ default=False,
2001
+ description="Create a clustered primary key on merge_keys when auto-creating table. Enforces uniqueness.",
2002
+ )
2003
+ index_on_merge_keys: bool = Field(
2004
+ default=False,
2005
+ description="Create a nonclustered index on merge_keys. Use if primary key already exists elsewhere.",
2006
+ )
2007
+ incremental: bool = Field(
2008
+ default=False,
2009
+ description=(
2010
+ "Enable incremental merge optimization. When True, reads target table's keys and hashes "
2011
+ "to determine which rows changed, then only writes changed rows to staging. "
2012
+ "Significantly faster when few rows change between runs."
2013
+ ),
2014
+ )
2015
+ hash_column: Optional[str] = Field(
2016
+ default=None,
2017
+ description=(
2018
+ "Name of pre-computed hash column in DataFrame for change detection. "
2019
+ "Used when incremental=True. If not specified, will auto-detect '_hash_diff' column."
2020
+ ),
2021
+ )
2022
+ change_detection_columns: Optional[List[str]] = Field(
2023
+ default=None,
2024
+ description=(
2025
+ "Columns to use for computing change detection hash. Used when incremental=True "
2026
+ "and no hash_column is specified. If None, uses all non-key columns."
2027
+ ),
2028
+ )
2029
+
2030
+
2031
+ class SqlServerOverwriteStrategy(str, Enum):
2032
+ """Strategies for SQL Server overwrite operations."""
2033
+
2034
+ TRUNCATE_INSERT = "truncate_insert" # TRUNCATE then INSERT (fastest, needs permission)
2035
+ DROP_CREATE = "drop_create" # DROP TABLE, CREATE, INSERT (schema refresh)
2036
+ DELETE_INSERT = "delete_insert" # DELETE FROM then INSERT (no special permissions)
2037
+
2038
+
2039
+ class SqlServerSchemaEvolutionMode(str, Enum):
2040
+ """
2041
+ Schema evolution modes for SQL Server writes (Phase 4).
2042
+
2043
+ Controls how schema differences between DataFrame and target table are handled.
2044
+ """
2045
+
2046
+ STRICT = "strict" # Fail if schemas don't match (default, no auto DDL)
2047
+ EVOLVE = "evolve" # Add new columns via ALTER TABLE (additive only)
2048
+ IGNORE = "ignore" # Ignore schema differences, write matching columns only
2049
+
2050
+
2051
+ class SqlServerSchemaEvolutionConfig(BaseModel):
2052
+ """
2053
+ Schema evolution configuration for SQL Server operations (Phase 4).
2054
+
2055
+ Controls automatic schema changes when DataFrame schema differs from target table.
2056
+
2057
+ Example:
2058
+ ```yaml
2059
+ merge_options:
2060
+ schema_evolution:
2061
+ mode: evolve
2062
+ add_columns: true
2063
+ ```
2064
+ """
2065
+
2066
+ mode: SqlServerSchemaEvolutionMode = Field(
2067
+ default=SqlServerSchemaEvolutionMode.STRICT,
2068
+ description="Schema evolution mode: strict (fail), evolve (add columns), ignore (skip mismatched)",
2069
+ )
2070
+ add_columns: bool = Field(
2071
+ default=False,
2072
+ description="If mode='evolve', automatically add new columns via ALTER TABLE ADD COLUMN",
2073
+ )
2074
+
2075
+
2076
+ class SqlServerMergeValidationConfig(BaseModel):
2077
+ """
2078
+ Validation configuration for SQL Server merge/overwrite operations.
2079
+
2080
+ Validates source data before writing to SQL Server.
2081
+
2082
+ Example:
2083
+ ```yaml
2084
+ merge_options:
2085
+ validations:
2086
+ check_null_keys: true
2087
+ check_duplicate_keys: true
2088
+ fail_on_validation_error: true
2089
+ ```
2090
+ """
2091
+
2092
+ check_null_keys: bool = Field(
2093
+ default=True,
2094
+ description="Fail if merge_keys contain NULL values",
2095
+ )
2096
+ check_duplicate_keys: bool = Field(
2097
+ default=True,
2098
+ description="Fail if merge_keys have duplicate combinations",
2099
+ )
2100
+ fail_on_validation_error: bool = Field(
2101
+ default=True,
2102
+ description="If False, log warning instead of failing on validation errors",
2103
+ )
2104
+
2105
+
2106
+ class SqlServerOverwriteOptions(BaseModel):
2107
+ """
2108
+ Options for SQL Server overwrite operations (Phase 2).
2109
+
2110
+ Enhanced overwrite with multiple strategies for different use cases.
2111
+
2112
+ ### Strategies
2113
+ - `truncate_insert`: TRUNCATE TABLE then INSERT (fastest, requires TRUNCATE permission)
2114
+ - `drop_create`: DROP TABLE, CREATE TABLE, INSERT (refreshes schema)
2115
+ - `delete_insert`: DELETE FROM then INSERT (works with limited permissions)
2116
+
2117
+ ### Example
2118
+ ```yaml
2119
+ write:
2120
+ connection: azure_sql
2121
+ format: sql_server
2122
+ table: fact.combined_downtime
2123
+ mode: overwrite
2124
+ overwrite_options:
2125
+ strategy: truncate_insert
2126
+ audit_cols:
2127
+ created_col: created_ts
2128
+ updated_col: updated_ts
2129
+ ```
2130
+ """
2131
+
2132
+ strategy: SqlServerOverwriteStrategy = Field(
2133
+ default=SqlServerOverwriteStrategy.TRUNCATE_INSERT,
2134
+ description="Overwrite strategy: truncate_insert, drop_create, delete_insert",
2135
+ )
2136
+ audit_cols: Optional[SqlServerAuditColsConfig] = Field(
2137
+ default=None,
2138
+ description="Audit columns for created/updated timestamps",
2139
+ )
2140
+ validations: Optional[SqlServerMergeValidationConfig] = Field(
2141
+ default=None,
2142
+ description="Validation checks before overwrite",
2143
+ )
2144
+ auto_create_schema: bool = Field(
2145
+ default=False,
2146
+ description="Auto-create schema if it doesn't exist (Phase 4). Runs CREATE SCHEMA IF NOT EXISTS.",
2147
+ )
2148
+ auto_create_table: bool = Field(
2149
+ default=False,
2150
+ description="Auto-create target table if it doesn't exist (Phase 4). Infers schema from DataFrame.",
2151
+ )
2152
+ schema_evolution: Optional[SqlServerSchemaEvolutionConfig] = Field(
2153
+ default=None,
2154
+ description="Schema evolution configuration (Phase 4). Controls handling of schema differences.",
2155
+ )
2156
+ batch_size: Optional[int] = Field(
2157
+ default=None,
2158
+ description="Batch size for writes (Phase 4). Chunks large DataFrames for memory efficiency.",
2159
+ )
2160
+
2161
+
2162
+ class TriggerConfig(BaseModel):
2163
+ """
2164
+ Configuration for streaming trigger intervals.
2165
+
2166
+ Specify exactly one of the trigger options.
2167
+
2168
+ Example:
2169
+ ```yaml
2170
+ trigger:
2171
+ processing_time: "10 seconds"
2172
+ ```
2173
+
2174
+ Or for one-time processing:
2175
+ ```yaml
2176
+ trigger:
2177
+ once: true
2178
+ ```
2179
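+
+ The validator below rejects conflicting settings. A quick sketch (assumes Pydantic v2, where a `ValueError` raised in a validator surfaces as a `ValidationError`):
+
+ ```python
+ from pydantic import ValidationError
+
+ TriggerConfig(processing_time="10 seconds")   # micro-batch every 10 seconds
+ TriggerConfig(available_now=True)             # drain available data, then stop
+
+ try:
+     TriggerConfig(processing_time="10 seconds", once=True)   # two trigger types
+ except ValidationError as err:
+     print(err)   # "Multiple trigger types specified..."
+ ```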
+ """
2180
+
2181
+ processing_time: Optional[str] = Field(
2182
+ default=None,
2183
+ description="Trigger interval as duration string (e.g., '10 seconds', '1 minute')",
2184
+ )
2185
+ once: Optional[bool] = Field(
2186
+ default=None,
2187
+ description="Process all available data once and stop",
2188
+ )
2189
+ available_now: Optional[bool] = Field(
2190
+ default=None,
2191
+ description="Process all available data in multiple batches, then stop",
2192
+ )
2193
+ continuous: Optional[str] = Field(
2194
+ default=None,
2195
+ description="Continuous processing with checkpoint interval (e.g., '1 second')",
2196
+ )
2197
+
2198
+ @model_validator(mode="after")
2199
+ def check_exactly_one_trigger(self):
2200
+ """Ensure exactly one trigger type is specified."""
2201
+ specified = []
2202
+ if self.processing_time is not None:
2203
+ specified.append(f"processing_time='{self.processing_time}'")
2204
+ if self.once is True:
2205
+ specified.append("once=True")
2206
+ if self.available_now is True:
2207
+ specified.append("available_now=True")
2208
+ if self.continuous is not None:
2209
+ specified.append(f"continuous='{self.continuous}'")
2210
+
2211
+ if len(specified) > 1:
2212
+ raise ValueError(
2213
+ f"TriggerConfig validation failed: Multiple trigger types specified: {', '.join(specified)}. "
2214
+ f"Specify exactly one of: 'processing_time', 'once', 'available_now', or 'continuous'. "
2215
+ f"Example: processing_time: '10 seconds' for micro-batch, or once: true for single batch."
2216
+ )
2217
+ return self
2218
+
2219
+
2220
+ class StreamingWriteConfig(BaseModel):
2221
+ """
2222
+ Configuration for Spark Structured Streaming writes.
2223
+
2224
+ ### 🚀 "Real-Time Pipeline" Guide
2225
+
2226
+ **Business Problem:**
2227
+ "I need to process data continuously as it arrives from Kafka/Event Hubs
2228
+ and write it to Delta Lake in near real-time."
2229
+
2230
+ **The Solution:**
2231
+ Configure streaming write with checkpoint location for fault tolerance
2232
+ and trigger interval for processing frequency.
2233
+
2234
+ **Recipe: Streaming Ingestion**
2235
+ ```yaml
2236
+ write:
2237
+ connection: "silver_lake"
2238
+ format: "delta"
2239
+ table: "events_stream"
2240
+ streaming:
2241
+ output_mode: append
2242
+ checkpoint_location: "/checkpoints/events_stream"
2243
+ trigger:
2244
+ processing_time: "10 seconds"
2245
+ ```
2246
+
2247
+ **Recipe: One-Time Streaming (Batch-like)**
2248
+ ```yaml
2249
+ write:
2250
+ connection: "silver_lake"
2251
+ format: "delta"
2252
+ table: "events_batch"
2253
+ streaming:
2254
+ output_mode: append
2255
+ checkpoint_location: "/checkpoints/events_batch"
2256
+ trigger:
2257
+ available_now: true
2258
+ ```
2259
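+
+ The batch-like recipe, built directly (illustrative sketch; `query_name` is a hypothetical label, and pairing `await_termination=True` with `available_now`/`once` follows the field notes below):
+
+ ```python
+ stream_cfg = StreamingWriteConfig(
+     checkpoint_location="/checkpoints/events_batch",
+     trigger=TriggerConfig(available_now=True),
+     await_termination=True,            # block until the drain completes
+     query_name="events_batch_load",    # hypothetical, useful for monitoring
+ )
+ assert stream_cfg.output_mode == "append"   # default output mode
+ ```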
+ """
2260
+
2261
+ output_mode: Literal["append", "update", "complete"] = Field(
2262
+ default="append",
2263
+ description=(
2264
+ "Output mode for streaming writes. "
2265
+ "'append' - Only new rows. 'update' - Updated rows only. "
2266
+ "'complete' - Entire result table (requires aggregation)."
2267
+ ),
2268
+ )
2269
+ checkpoint_location: str = Field(
2270
+ description=(
2271
+ "Path for streaming checkpoints. Required for fault tolerance. "
2272
+ "Must be a reliable storage location (e.g., cloud storage, DBFS)."
2273
+ ),
2274
+ )
2275
+ trigger: Optional[TriggerConfig] = Field(
2276
+ default=None,
2277
+ description=(
2278
+ "Trigger configuration. If not specified, processes data as fast as possible. "
2279
+ "Use 'processing_time' for micro-batch intervals, 'once' for single batch, "
2280
+ "'available_now' for processing all available data then stopping."
2281
+ ),
2282
+ )
2283
+ query_name: Optional[str] = Field(
2284
+ default=None,
2285
+ description="Name for the streaming query (useful for monitoring and debugging)",
2286
+ )
2287
+ await_termination: Optional[bool] = Field(
2288
+ default=False,
2289
+ description=(
2290
+ "Wait for the streaming query to terminate. "
2291
+ "Set to True for batch-like streaming with 'once' or 'available_now' triggers."
2292
+ ),
2293
+ )
2294
+ timeout_seconds: Optional[int] = Field(
2295
+ default=None,
2296
+ description=(
2297
+ "Timeout in seconds when await_termination is True. If None, waits indefinitely."
2298
+ ),
2299
+ )
2300
+
2301
+
2302
+ class WriteConfig(BaseModel):
2303
+ """
2304
+ Configuration for writing data from a node.
2305
+
2306
+ **When to Use:** Any node that persists data to storage.
2307
+
2308
+ **Key Concepts:**
2309
+ - `mode`: How to handle existing data (overwrite, append, upsert)
2310
+ - `keys`: Required for upsert mode - columns that identify unique records
2311
+ - `partition_by`: Columns to partition output by (improves query performance)
2312
+
2313
+ **See Also:**
2314
+ - [Performance Tuning](../guides/performance_tuning.md) - Partitioning strategies
2315
+
2316
+ ### 🚀 "Big Data Performance" Guide
2317
+
2318
+ **Business Problem:**
2319
+ "My dashboards are slow because the query scans terabytes of data just to find one day's sales."
2320
+
2321
+ **The Solution:**
2322
+ Use **Partitioning** for coarse filtering (skipping huge chunks) and **Z-Ordering** for fine-grained skipping (colocating related data).
2323
+
2324
+ **Recipe: Lakehouse Optimized**
2325
+ ```yaml
2326
+ write:
2327
+ connection: "gold_lake"
2328
+ format: "delta"
2329
+ table: "fact_sales"
2330
+ mode: "append"
2331
+
2332
+ # 1. Partitioning: Physical folders.
2333
+ # Use for low-cardinality columns often used in WHERE clauses.
2334
+ # WARNING: Do NOT partition by high-cardinality cols like ID or Timestamp!
2335
+ partition_by: ["country_code", "txn_year_month"]
2336
+
2337
+ # 2. Z-Ordering: Data clustering.
2338
+ # Use for high-cardinality columns often used in JOINs or predicates.
2339
+ zorder_by: ["customer_id", "product_id"]
2340
+
2341
+ # 3. Table Properties: Engine tuning.
2342
+ table_properties:
2343
+ "delta.autoOptimize.optimizeWrite": "true"
2344
+ "delta.autoOptimize.autoCompact": "true"
2345
+ ```
2346
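+
+ Two validators guard this model: `table` and `path` are mutually exclusive (one of them is required), and `mode: merge` requires `merge_keys`. A sketch (assumes string values coerce to the `WriteMode` enum, as the YAML examples imply):
+
+ ```python
+ from pydantic import ValidationError
+
+ w = WriteConfig(
+     connection="azure_sql",
+     format="sql_server",
+     table="sales.fact_orders",
+     mode="merge",
+     merge_keys=["DateId", "store_id"],
+ )
+
+ try:
+     WriteConfig(connection="azure_sql", format="sql_server",
+                 table="sales.fact_orders", mode="merge")   # merge without merge_keys
+ except ValidationError as err:
+     print(err)
+ ```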
+ """
2347
+
2348
+ connection: str = Field(description="Connection name from project.yaml")
2349
+ format: Union[ReadFormat, str] = Field(description="Output format (csv, parquet, delta, etc.)")
2350
+ table: Optional[str] = Field(default=None, description="Table name for SQL/Delta")
2351
+ path: Optional[str] = Field(default=None, description="Path for file-based outputs")
2352
+ register_table: Optional[str] = Field(
2353
+ default=None, description="Register file output as external table (Spark/Delta only)"
2354
+ )
2355
+ mode: WriteMode = Field(
2356
+ default=WriteMode.OVERWRITE,
2357
+ description="Write mode. Options: 'overwrite', 'append', 'upsert', 'append_once', 'merge' (requires 'merge_keys')",
2358
+ )
2359
+ partition_by: List[str] = Field(
2360
+ default_factory=list,
2361
+ description="List of columns to physically partition the output by (folder structure). Use for low-cardinality columns (e.g. date, country).",
2362
+ )
2363
+ zorder_by: List[str] = Field(
2364
+ default_factory=list,
2365
+ description="List of columns to Z-Order by. Improves read performance for high-cardinality columns used in filters/joins (Delta only).",
2366
+ )
2367
+ table_properties: Dict[str, str] = Field(
2368
+ default_factory=dict,
2369
+ description=(
2370
+ "Delta table properties. Overrides global performance.delta_table_properties. "
2371
+ "Example: {'delta.columnMapping.mode': 'name'} to allow special characters in column names."
2372
+ ),
2373
+ )
2374
+ merge_schema: bool = Field(
2375
+ default=False, description="Allow schema evolution (mergeSchema option in Delta)"
2376
+ )
2377
+ first_run_query: Optional[str] = Field(
2378
+ default=None,
2379
+ description=(
2380
+ "SQL query for full-load on first run (High Water Mark pattern). "
2381
+ "If set, uses this query when target table doesn't exist, then switches to incremental. "
2382
+ "Only applies to SQL reads."
2383
+ ),
2384
+ )
2385
+ options: Dict[str, Any] = Field(default_factory=dict, description="Format-specific options")
2386
+ auto_optimize: Optional[Union[bool, AutoOptimizeConfig]] = Field(
2387
+ default=None,
2388
+ description="Auto-run OPTIMIZE and VACUUM after write (Delta only)",
2389
+ )
2390
+ add_metadata: Optional[Union[bool, WriteMetadataConfig]] = Field(
2391
+ default=None,
2392
+ description=(
2393
+ "Add metadata columns for Bronze layer lineage. "
2394
+ "Set to `true` to add all applicable columns, or provide a WriteMetadataConfig for selective columns. "
2395
+ "Columns: _extracted_at, _source_file (file sources), _source_connection, _source_table (SQL sources)."
2396
+ ),
2397
+ )
2398
+ skip_if_unchanged: bool = Field(
2399
+ default=False,
2400
+ description=(
2401
+ "Skip write if DataFrame content is identical to previous write. "
2402
+ "Computes SHA256 hash of entire DataFrame and compares to stored hash in Delta table metadata. "
2403
+ "Useful for snapshot tables without timestamps to avoid redundant appends. "
2404
+ "Only supported for Delta format."
2405
+ ),
2406
+ )
2407
+ skip_hash_columns: Optional[List[str]] = Field(
2408
+ default=None,
2409
+ description=(
2410
+ "Columns to include in hash computation for skip_if_unchanged. "
2411
+ "If None, all columns are used. Specify a subset to ignore volatile columns like timestamps."
2412
+ ),
2413
+ )
2414
+ skip_hash_sort_columns: Optional[List[str]] = Field(
2415
+ default=None,
2416
+ description=(
2417
+ "Columns to sort by before hashing for deterministic comparison. "
2418
+ "Required if row order may vary between runs. Typically your business key columns."
2419
+ ),
2420
+ )
2421
+ streaming: Optional[StreamingWriteConfig] = Field(
2422
+ default=None,
2423
+ description=(
2424
+ "Streaming write configuration for Spark Structured Streaming. "
2425
+ "When set, uses writeStream instead of batch write. "
2426
+ "Requires a streaming DataFrame from a streaming read source."
2427
+ ),
2428
+ )
2429
+ merge_keys: Optional[List[str]] = Field(
2430
+ default=None,
2431
+ description=(
2432
+ "Key columns for SQL Server MERGE operations. Required when mode='merge'. "
2433
+ "These columns form the ON clause of the MERGE statement."
2434
+ ),
2435
+ )
2436
+ merge_options: Optional[SqlServerMergeOptions] = Field(
2437
+ default=None,
2438
+ description="Options for SQL Server MERGE operations (conditions, staging, audit cols)",
2439
+ )
2440
+ overwrite_options: Optional[SqlServerOverwriteOptions] = Field(
2441
+ default=None,
2442
+ description="Options for SQL Server overwrite operations (strategy, audit cols)",
2443
+ )
2444
+
2445
+ @model_validator(mode="after")
2446
+ def check_table_or_path(self):
2447
+ """Ensure either table or path is provided."""
2448
+ if not self.table and not self.path:
2449
+ raise ValueError("Either 'table' or 'path' must be provided for write config")
2450
+ if self.table and self.path:
2451
+ raise ValueError("WriteConfig: 'table' and 'path' are mutually exclusive.")
2452
+ return self
2453
+
2454
+ @model_validator(mode="after")
2455
+ def check_merge_keys(self):
2456
+ """Ensure merge_keys is provided when mode is merge."""
2457
+ if self.mode == WriteMode.MERGE and not self.merge_keys:
2458
+ raise ValueError(
2459
+ "WriteConfig: 'merge_keys' is required when mode='merge'. "
2460
+ "Specify the key columns for the MERGE ON clause."
2461
+ )
2462
+ return self
2463
+
2464
+
2465
+ class ColumnMetadata(BaseModel):
2466
+ """Metadata for a column in the data dictionary."""
2467
+
2468
+ description: Optional[str] = Field(default=None, description="Column description")
2469
+ pii: bool = Field(default=False, description="Contains PII?")
2470
+ tags: List[str] = Field(
2471
+ default_factory=list, description="Tags (e.g. 'business_key', 'measure')"
2472
+ )
2473
+
2474
+
2475
+ class SchemaMode(str, Enum):
2476
+ ENFORCE = "enforce"
2477
+ EVOLVE = "evolve"
2478
+
2479
+
2480
+ class OnNewColumns(str, Enum):
2481
+ IGNORE = "ignore"
2482
+ FAIL = "fail"
2483
+ ADD_NULLABLE = "add_nullable"
2484
+
2485
+
2486
+ class OnMissingColumns(str, Enum):
2487
+ FAIL = "fail"
2488
+ FILL_NULL = "fill_null"
2489
+
2490
+
2491
+ class PrivacyMethod(str, Enum):
2492
+ """Supported privacy anonymization methods."""
2493
+
2494
+ HASH = "hash" # SHA256 hash
2495
+ MASK = "mask" # Mask all but last 4 chars
2496
+ REDACT = "redact" # Replace with [REDACTED]
2497
+
2498
+
2499
+ class PrivacyConfig(BaseModel):
2500
+ """
2501
+ Configuration for PII anonymization.
2502
+
2503
+ ### 🔐 Privacy & PII Protection
2504
+
2505
+ **How It Works:**
2506
+ 1. Mark columns as `pii: true` in the `columns` metadata
2507
+ 2. Configure a `privacy` block with the anonymization method
2508
+ 3. During node execution, all columns marked as PII (and inherited from dependencies) are anonymized
2509
+ 4. Upstream PII markings are inherited by downstream nodes
2510
+
2511
+ **Example:**
2512
+ ```yaml
2513
+ columns:
2514
+ customer_email:
2515
+ pii: true # Mark as PII
2516
+ customer_id:
2517
+ pii: false
2518
+
2519
+ privacy:
2520
+ method: hash # hash, mask, or redact
2521
+ salt: "secret_key" # Optional: makes hash unique/secure
2522
+ declassify: [] # Remove columns from PII protection
2523
+ ```
2524
+
2525
+ **Methods:**
2526
+ - `hash`: SHA256 hash (64-character hex digest). With a salt, resists precomputed rainbow-table lookups (see the sketch below).
2527
+ - `mask`: Keep only the last 4 characters and replace the rest with `*`. Example: `john@email.com` → `**********.com`
2528
+ - `redact`: Replace entire value with `[REDACTED]`
2529
+
2530
+ **Important:**
2531
+ - `pii: true` alone does NOTHING. You must set a `privacy.method` to actually mask data.
2532
+ - PII inheritance: If dependency outputs PII columns, this node inherits them unless declassified.
2533
+ - Salt is optional but recommended for hash to prevent attacks.
2534
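+
+ Conceptual sketch of the three methods in plain Python (illustration only; the engine-level implementation works on DataFrame columns and may differ in detail, e.g. in exactly how the salt is combined):
+
+ ```python
+ import hashlib
+
+ value, salt = "john@email.com", "company_secret_key_2025"
+
+ hashed = hashlib.sha256((value + salt).encode("utf-8")).hexdigest()  # 64-char hex digest
+ masked = "*" * (len(value) - 4) + value[-4:]                         # keep only the last 4 chars
+ redacted = "[REDACTED]"
+
+ print(hashed, masked, redacted, sep="\n")
+ ```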
+ """
2535
+
2536
+ method: PrivacyMethod = Field(
2537
+ ...,
2538
+ description="Anonymization method: 'hash' (SHA256), 'mask' (show last 4), or 'redact' ([REDACTED])",
2539
+ )
2540
+ salt: Optional[str] = Field(
2541
+ default=None,
2542
+ description="Salt for hashing (optional but recommended). Appended before hashing to create unique hashes. Example: 'company_secret_key_2025'",
2543
+ )
2544
+ declassify: List[str] = Field(
2545
+ default_factory=list,
2546
+ description="List of columns to remove from PII protection (stops inheritance from upstream). Example: ['customer_id']",
2547
+ )
2548
+
2549
+
2550
+ class SchemaPolicyConfig(BaseModel):
2551
+ """
2552
+ Configuration for Schema Management (Drift Handling).
2553
+
2554
+ Controls how the node handles differences between input data and target table schema.
2555
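+
+ The effective defaults depend on `mode` (see the validator below). Illustrative sketch:
+
+ ```python
+ evolve = SchemaPolicyConfig(mode="evolve")
+ assert evolve.on_new_columns is OnNewColumns.ADD_NULLABLE    # defaulted for evolve
+
+ enforce = SchemaPolicyConfig()                               # mode defaults to enforce
+ assert enforce.on_new_columns is OnNewColumns.IGNORE
+ assert enforce.on_missing_columns is OnMissingColumns.FILL_NULL
+ ```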
+ """
2556
+
2557
+ mode: SchemaMode = Field(
2558
+ default=SchemaMode.ENFORCE, description="Schema evolution mode: 'enforce' or 'evolve'"
2559
+ )
2560
+ on_new_columns: Optional[OnNewColumns] = Field(
2561
+ default=None,
2562
+ description="Action for new columns in input: 'ignore', 'fail', 'add_nullable'",
2563
+ )
2564
+ on_missing_columns: OnMissingColumns = Field(
2565
+ default=OnMissingColumns.FILL_NULL,
2566
+ description="Action for missing columns in input: 'fail', 'fill_null'",
2567
+ )
2568
+
2569
+ @model_validator(mode="after")
2570
+ def set_defaults(self):
2571
+ if self.mode == SchemaMode.EVOLVE:
2572
+ if self.on_new_columns is None:
2573
+ self.on_new_columns = OnNewColumns.ADD_NULLABLE
2574
+ else: # ENFORCE
2575
+ if self.on_new_columns is None:
2576
+ self.on_new_columns = OnNewColumns.IGNORE
2577
+ return self
2578
+
2579
+
2580
+ class NodeConfig(BaseModel):
2581
+ """
2582
+ Configuration for a single node.
2583
+
2584
+ ### 🧠 "The Smart Node" Pattern
2585
+
2586
+ **Business Problem:**
2587
+ "We need complex dependencies, caching for heavy computations, and the ability to run only specific parts of the pipeline."
2588
+
2589
+ **The Solution:**
2590
+ Nodes are the building blocks. They handle dependencies (`depends_on`), execution control (`tags`, `enabled`), and performance (`cache`).
2591
+
2592
+ ### 🕸️ DAG & Dependencies
2593
+ **The Glue of the Pipeline.**
2594
+ Nodes don't run in isolation. They form a Directed Acyclic Graph (DAG).
2595
+
2596
+ * **`depends_on`**: Critical! If Node B reads from Node A (in memory), you MUST list `["Node A"]`.
2597
+ * *Implicit Data Flow*: If a node has no `read` block, it automatically picks up the DataFrame from its first dependency.
2598
+
2599
+ ### 🧠 Smart Read & Incremental Loading
2600
+
2601
+ **Automated History Management.**
2602
+
2603
+ Odibi intelligently determines whether to perform a **Full Load** or an **Incremental Load** based on the state of the target.
2604
+
2605
+ **The "Smart Read" Logic:**
2606
+ 1. **First Run (Full Load):** If the target table (defined in `write`) does **not exist**:
2607
+ * Incremental filtering rules are **ignored**.
2608
+ * The entire source dataset is read.
2609
+ * Use `write.first_run_query` (optional) to override the read query for this initial bootstrap (e.g., to backfill only 1 year of history instead of all time).
2610
+
2611
+ 2. **Subsequent Runs (Incremental Load):** If the target table **exists**:
2612
+ * **Rolling Window:** Filters source data where `column >= NOW() - lookback`.
2613
+ * **Stateful:** Filters source data where `column > last_high_water_mark`.
2614
+
2615
+ This ensures you don't need separate "init" and "update" pipelines. One config handles both lifecycle states.
2616
+
2617
+ ### 🏷️ Orchestration Tags
2618
+ **Run What You Need.**
2619
+ Tags allow you to execute slices of your pipeline.
2620
+ * `odibi run --tag daily` -> Runs all nodes with "daily" tag.
2621
+ * `odibi run --tag critical` -> Runs high-priority nodes.
2622
+
2623
+ ### 🤖 Choosing Your Logic: Transformer vs. Transform
2624
+
2625
+ **1. The "Transformer" (Top-Level)**
2626
+ * **What it is:** A pre-packaged, heavy-duty operation that defines the *entire purpose* of the node.
2627
+ * **When to use:** When applying a standard Data Engineering pattern (e.g., SCD2, Merge, Deduplicate).
2628
+ * **Analogy:** "Run this App."
2629
+ * **Syntax:** `transformer: "scd2"` + `params: {...}`
2630
+
2631
+ **2. The "Transform Steps" (Process Chain)**
2632
+ * **What it is:** A sequence of smaller steps (SQL, functions, operations) executed in order.
2633
+ * **When to use:** For custom business logic, data cleaning, or feature engineering pipelines.
2634
+ * **Analogy:** "Run this Script."
2635
+ * **Syntax:** `transform: { steps: [...] }`
2636
+
2637
+ *Note: You can use both! The `transformer` runs first, then `transform` steps refine the result.*
2638
+
2639
+ ### 🔗 Chaining Operations
2640
+ **You can mix and match!**
2641
+ The execution order is always:
2642
+ 1. **Read** (or Dependency Injection)
2643
+ 2. **Transformer** (The "App" logic, e.g., Deduplicate)
2644
+ 3. **Transform Steps** (The "Script" logic, e.g., cleanup)
2645
+ 4. **Validation**
2646
+ 5. **Write**
2647
+
2648
+ *Constraint:* You must define **at least one** of `read`, `inputs`, `transformer`, `transform`, or `write`.
2649
+
2650
+ ### ⚡ Example: App vs. Script
2651
+
2652
+ **Scenario 1: The Full ETL Flow (Chained)**
2653
+ *Shows explicit Read, Transform Chain, and Write.*
2654
+
2655
+ ```yaml
2656
+ # 1. Ingest (The Dependency)
2657
+ - name: "load_raw_users"
2658
+ read: { connection: "s3_landing", format: "json", path: "users/*.json" }
2659
+ write: { connection: "bronze", format: "parquet", path: "users_raw" }
2660
+
2661
+ # 2. Process (The Consumer)
2662
+ - name: "clean_users"
2663
+ depends_on: ["load_raw_users"]
2664
+
2665
+ # "clean_text" is a registered function from the Transformer Catalog
2666
+ transform:
2667
+ steps:
2668
+ - sql: "SELECT * FROM df WHERE email IS NOT NULL"
2669
+ - function: "clean_text"
2670
+ params: { columns: ["email"], case: "lower" }
2671
+
2672
+ write: { connection: "silver", format: "delta", table: "dim_users" }
2673
+ ```
2674
+
2675
+ **Scenario 2: The "App" Node (Top-Level Transformer)**
2676
+ *Shows a node that applies a pattern (Deduplicate) to incoming data.*
2677
+
2678
+ ```yaml
2679
+ - name: "deduped_users"
2680
+ depends_on: ["clean_users"]
2681
+
2682
+ # The "App": Deduplication (From Transformer Catalog)
2683
+ transformer: "deduplicate"
2684
+ params:
2685
+ keys: ["user_id"]
2686
+ order_by: "updated_at DESC"
2687
+
2688
+ write: { connection: "gold", format: "delta", table: "users_unique" }
2689
+ ```
2690
+
2691
+ **Scenario 3: The Tagged Runner (Reporting)**
2692
+ *Shows how tags allow running specific slices (e.g., `odibi run --tag daily`).*
2693
+
2694
+ ```yaml
2695
+ - name: "daily_report"
2696
+ tags: ["daily", "reporting"]
2697
+ depends_on: ["deduped_users"]
2698
+
2699
+ # Ad-hoc aggregation script
2700
+ transform:
2701
+ steps:
2702
+ - sql: "SELECT date_trunc('day', updated_at) as day, count(*) as total FROM df GROUP BY 1"
2703
+
2704
+ write: { connection: "local_data", format: "csv", path: "reports/daily_stats.csv" }
2705
+ ```
2706
+
2707
+ **Scenario 4: The "Kitchen Sink" (All Operations)**
2708
+ *Shows Read -> Transformer -> Transform -> Write execution order.*
2709
+
2710
+ **Why this works:**
2711
+ 1. **Internal Chaining (`df`):** In every step (Transformer or SQL), `df` refers to the output of the *previous* step.
2712
+ 2. **External Access (`depends_on`):** If you added `depends_on: ["other_node"]`, you could also run `SELECT * FROM other_node` in your SQL steps!
2713
+
2714
+ ```yaml
2715
+ - name: "complex_flow"
2716
+ # 1. Read -> Creates initial 'df'
2717
+ read: { connection: "bronze", format: "parquet", path: "users" }
2718
+
2719
+ # 2. Transformer (The "App": Deduplicate first)
2720
+ # Takes 'df' (from Read), dedups it, returns new 'df'
2721
+ transformer: "deduplicate"
2722
+ params: { keys: ["user_id"], order_by: "updated_at DESC" }
2723
+
2724
+ # 3. Transform Steps (The "Script": Filter AFTER deduplication)
2725
+ # SQL sees the deduped data as 'df'
2726
+ transform:
2727
+ steps:
2728
+ - sql: "SELECT * FROM df WHERE status = 'active'"
2729
+
2730
+ # 4. Write -> Saves the final filtered 'df'
2731
+ write: { connection: "silver", format: "delta", table: "active_unique_users" }
2732
+ ```
2733
+
2734
+ ### 📚 Transformer Catalog
2735
+
2736
+ These are the built-in functions you can use in two ways:
2737
+
2738
+ 1. **As a Top-Level Transformer:** `transformer: "name"` (Defines the node's main logic)
2739
+ 2. **As a Step in a Chain:** `transform: { steps: [{ function: "name" }] }` (Part of a sequence)
2740
+
2741
+ *Note: `merge` and `scd2` are special "Heavy Lifters" and should generally be used as Top-Level Transformers.*
2742
+
2743
+ **Data Engineering Patterns**
2744
+ * `merge`: Upsert/Merge into target (Delta/SQL). *([Params](#mergeparams))*
2745
+ * `scd2`: Slowly Changing Dimensions Type 2. *([Params](#scd2params))*
2746
+ * `deduplicate`: Remove duplicates using window functions. *([Params](#deduplicateparams))*
2747
+
2748
+ **Relational Algebra**
2749
+ * `join`: Join two datasets. *([Params](#joinparams))*
2750
+ * `union`: Stack datasets vertically. *([Params](#unionparams))*
2751
+ * `pivot`: Rotate rows to columns. *([Params](#pivotparams))*
2752
+ * `unpivot`: Rotate columns to rows (melt). *([Params](#unpivotparams))*
2753
+ * `aggregate`: Group by and sum/count/avg. *([Params](#aggregateparams))*
2754
+
2755
+ **Data Quality & Cleaning**
2756
+ * `validate_and_flag`: Check rules and flag invalid rows. *([Params](#validateandflagparams))*
2757
+ * `clean_text`: Trim and normalize case. *([Params](#cleantextparams))*
2758
+ * `filter_rows`: SQL-based filtering. *([Params](#filterrowsparams))*
2759
+ * `fill_nulls`: Replace NULLs with defaults. *([Params](#fillnullsparams))*
2760
+
2761
+ **Feature Engineering**
2762
+ * `derive_columns`: Create new cols via SQL expressions. *([Params](#derivecolumnsparams))*
2763
+ * `case_when`: Conditional logic (if-else). *([Params](#casewhenparams))*
2764
+ * `generate_surrogate_key`: Create MD5 keys from columns. *([Params](#surrogatekeyparams))*
2765
+ * `date_diff`, `date_add`, `date_trunc`: Date arithmetic.
2766
+
2767
+ **Scenario 5: Pre/Post SQL Hooks**
2815
+ *Setup and cleanup with SQL statements.*
2816
+ ```yaml
2817
+ - name: "optimize_sales"
2818
+ depends_on: ["load_sales"]
2819
+ pre_sql:
2820
+ - "SET spark.sql.shuffle.partitions = 200"
2821
+ - "CREATE TEMP VIEW staging AS SELECT * FROM bronze.raw_sales"
2822
+ transform:
2823
+ steps:
2824
+ - sql: "SELECT * FROM staging WHERE amount > 0"
2825
+ post_sql:
2826
+ - "OPTIMIZE gold.fact_sales ZORDER BY (customer_id)"
2827
+ - "VACUUM gold.fact_sales RETAIN 168 HOURS"
2828
+ write:
2829
+ connection: "gold"
2830
+ format: "delta"
2831
+ table: "fact_sales"
2832
+ ```
2833
+
2834
+ **Scenario 6: Materialization Strategies**
2835
+ *Choose how output is persisted.*
2836
+ ```yaml
2837
+ # Option 1: View (no physical storage, logical model)
2838
+ - name: "vw_active_customers"
2839
+ materialized: "view" # Creates SQL view instead of table
2840
+ transform:
2841
+ steps:
2842
+ - sql: "SELECT * FROM customers WHERE status = 'active'"
2843
+ write:
2844
+ connection: "gold"
2845
+ table: "vw_active_customers"
2846
+
2847
+ # Option 2: Incremental (append to existing Delta table)
2848
+ - name: "fact_events"
2849
+ materialized: "incremental" # Uses APPEND mode
2850
+ read:
2851
+ connection: "bronze"
2852
+ table: "raw_events"
2853
+ incremental:
2854
+ mode: "stateful"
2855
+ column: "event_time"
2856
+ write:
2857
+ connection: "silver"
2858
+ format: "delta"
2859
+ table: "fact_events"
2860
+
2861
+ # Option 3: Table (default - full overwrite)
2862
+ - name: "dim_products"
2863
+ materialized: "table" # Default behavior
2864
+ # ...
2865
+ ```
2866
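+
+ The validators below enforce the constraint stated earlier: at least one of `read`, `inputs`, `transform`, `transformer`, or `write`; `read` and `inputs` are mutually exclusive; and a `transformer` needs `params`. Illustrative sketch:
+
+ ```python
+ from pydantic import ValidationError
+
+ node = NodeConfig(
+     name="deduped_users",
+     depends_on=["clean_users"],
+     transformer="deduplicate",
+     params={"keys": ["user_id"], "order_by": "updated_at DESC"},
+ )
+
+ try:
+     NodeConfig(name="empty_node")   # no read/inputs/transform/transformer/write
+ except ValidationError as err:
+     print(err)
+ ```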
+ """
2867
+
2868
+ name: str = Field(description="Unique node name")
2869
+ description: Optional[str] = Field(default=None, description="Human-readable description")
2870
+ runbook_url: Optional[str] = Field(
2871
+ default=None,
2872
+ description="URL to troubleshooting guide or runbook. Shown as 'Troubleshooting guide →' link on failures.",
2873
+ )
2874
+ enabled: bool = Field(default=True, description="If False, node is skipped during execution")
2875
+ tags: List[str] = Field(
2876
+ default_factory=list,
2877
+ description="Operational tags for selective execution (e.g., 'daily', 'critical'). Use with `odibi run --tag`.",
2878
+ )
2879
+ depends_on: List[str] = Field(
2880
+ default_factory=list,
2881
+ description="List of parent nodes that must complete before this node runs. The output of these nodes is available for reading.",
2882
+ )
2883
+
2884
+ columns: Dict[str, ColumnMetadata] = Field(
2885
+ default_factory=dict,
2886
+ description="Data Dictionary defining the output schema. Used for documentation, PII tagging, and validation.",
2887
+ )
2888
+
2889
+ # Operations (at least one required)
2890
+ read: Optional[ReadConfig] = Field(
2891
+ default=None,
2892
+ description="Input operation (Load). If missing, data is taken from the first dependency.",
2893
+ )
2894
+ inputs: Optional[Dict[str, Union[str, Dict[str, Any]]]] = Field(
2895
+ default=None,
2896
+ description=(
2897
+ "Multi-input support for cross-pipeline dependencies. "
2898
+ "Map input names to either: "
2899
+ "(a) $pipeline.node reference (e.g., '$read_bronze.shift_events') "
2900
+ "(b) Explicit read config dict. "
2901
+ "Cannot be used with 'read'. "
2902
+ "Example: inputs: {events: '$read_bronze.events', calendar: {connection: 'goat', path: 'cal'}}"
2903
+ ),
2904
+ )
2905
+ transform: Optional[TransformConfig] = Field(
2906
+ default=None,
2907
+ description="Chain of fine-grained transformation steps (SQL, functions). Runs after 'transformer' if both are present.",
2908
+ )
2909
+ write: Optional[WriteConfig] = Field(
2910
+ default=None, description="Output operation (Save to file/table)."
2911
+ )
2912
+ streaming: bool = Field(
2913
+ default=False, description="Enable streaming execution for this node (Spark only)"
2914
+ )
2915
+ transformer: Optional[str] = Field(
2916
+ default=None,
2917
+ description="Name of the 'App' logic to run (e.g., 'deduplicate', 'scd2'). See Transformer Catalog for options.",
2918
+ )
2919
+ params: Dict[str, Any] = Field(default_factory=dict, description="Parameters for transformer")
2920
+
2921
+ # Optional features
2922
+ pre_sql: List[str] = Field(
2923
+ default_factory=list,
2924
+ description=(
2925
+ "List of SQL statements to execute before node runs. "
2926
+ "Use for setup: temp tables, variable initialization, grants. "
2927
+ "Example: ['SET spark.sql.shuffle.partitions=200', "
2928
+ "'CREATE TEMP VIEW src AS SELECT * FROM raw']"
2929
+ ),
2930
+ )
2931
+ post_sql: List[str] = Field(
2932
+ default_factory=list,
2933
+ description=(
2934
+ "List of SQL statements to execute after node completes. "
2935
+ "Use for cleanup, optimization, or audit logging. "
2936
+ "Example: ['OPTIMIZE gold.fact_sales', 'VACUUM gold.fact_sales RETAIN 168 HOURS']"
2937
+ ),
2938
+ )
2939
+ materialized: Optional[Literal["table", "view", "incremental"]] = Field(
2940
+ default=None,
2941
+ description=(
2942
+ "Materialization strategy. Options: "
2943
+ "'table' (default physical write), "
2944
+ "'view' (creates SQL view instead of table), "
2945
+ "'incremental' (uses append mode for Delta tables). "
2946
+ "Views are useful for Gold layer logical models."
2947
+ ),
2948
+ )
2949
+
2950
+ cache: bool = Field(default=False, description="Cache result for reuse")
2951
+ log_level: Optional[LogLevel] = Field(
2952
+ default=None, description="Override log level for this node"
2953
+ )
2954
+ on_error: ErrorStrategy = Field(
2955
+ default=ErrorStrategy.FAIL_LATER, description="Failure handling strategy"
2956
+ )
2957
+ validation: Optional[ValidationConfig] = None
2958
+ contracts: List[TestConfig] = Field(
2959
+ default_factory=list,
2960
+ description="Pre-condition contracts (Circuit Breakers). Runs on input data before transformation.",
2961
+ )
2962
+ schema_policy: Optional[SchemaPolicyConfig] = Field(
2963
+ default=None, description="Schema drift handling policy"
2964
+ )
2965
+ privacy: Optional[PrivacyConfig] = Field(
2966
+ default=None, description="Privacy Suite: PII anonymization settings"
2967
+ )
2968
+ sensitive: Union[bool, List[str]] = Field(
2969
+ default=False, description="If true or list of columns, masks sample data in stories"
2970
+ )
2971
+
2972
+ # Internal: tracks which YAML file this node was defined in (for sql_file resolution)
2973
+ source_yaml: Optional[str] = Field(
2974
+ default=None,
2975
+ alias="_source_yaml",
2976
+ description="Internal: source YAML file path for sql_file resolution",
2977
+ )
2978
+
2979
+ model_config = {"populate_by_name": True}
2980
+
2981
+ @model_validator(mode="after")
2982
+ def check_at_least_one_operation(self):
2983
+ """Ensure at least one operation is defined."""
2984
+ if not any([self.read, self.inputs, self.transform, self.write, self.transformer]):
2985
+ raise ValueError(
2986
+ f"Node '{self.name}' must have at least one of: read, inputs, transform, write, transformer"
2987
+ )
2988
+ return self
2989
+
2990
+ @model_validator(mode="after")
2991
+ def check_read_inputs_exclusive(self):
2992
+ """Ensure read and inputs are mutually exclusive."""
2993
+ if self.read and self.inputs:
2994
+ raise ValueError(
2995
+ f"Node '{self.name}': Cannot have both 'read' and 'inputs'. "
2996
+ "Use 'read' for single-source nodes or 'inputs' for multi-source cross-pipeline dependencies."
2997
+ )
2998
+ return self
2999
+
3000
+ @model_validator(mode="after")
3001
+ def check_transformer_params(self):
3002
+ if self.transformer and not self.params:
3003
+ raise ValueError(
3004
+ f"Node '{self.name}': 'transformer' is set but 'params' is empty. "
3005
+ "Either remove transformer or provide matching params."
3006
+ )
3007
+ return self
3008
+
3009
+
3010
+ # ============================================
3011
+ # Pipeline Configuration
3012
+ # ============================================
3013
+
3014
+
3015
+ class PipelineConfig(BaseModel):
3016
+ """
3017
+ Configuration for a pipeline.
3018
+
3019
+ Example:
3020
+ ```yaml
3021
+ pipelines:
3022
+ - pipeline: "user_onboarding"
3023
+ description: "Ingest and process new users"
3024
+ layer: "silver"
3025
+ nodes:
3026
+ - name: "node1"
3027
+ ...
3028
+ ```
3029
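+
+ Same-pipeline `$pipeline.node` references in `inputs` are promoted to `depends_on` automatically (see the validator below). Illustrative sketch; the `read` block mirrors the scenario examples in `NodeConfig`:
+
+ ```python
+ pipe = PipelineConfig.model_validate({
+     "pipeline": "silver",
+     "nodes": [
+         {"name": "events",
+          "read": {"connection": "bronze", "format": "parquet", "path": "events"}},
+         {"name": "enriched",
+          "inputs": {"events": "$silver.events"},
+          "transformer": "deduplicate",
+          "params": {"keys": ["event_id"]}},
+     ],
+ })
+ assert "events" in pipe.nodes[1].depends_on   # added by the validator
+ ```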
+ """
3030
+
3031
+ pipeline: str = Field(description="Pipeline name")
3032
+ description: Optional[str] = Field(default=None, description="Pipeline description")
3033
+ layer: Optional[str] = Field(default=None, description="Logical layer (bronze/silver/gold)")
3034
+ nodes: List[NodeConfig] = Field(description="List of nodes in this pipeline")
3035
+
3036
+ @field_validator("nodes")
3037
+ @classmethod
3038
+ def check_unique_node_names(cls, nodes: List[NodeConfig]) -> List[NodeConfig]:
3039
+ """Ensure all node names are unique within the pipeline."""
3040
+ names = [node.name for node in nodes]
3041
+ if len(names) != len(set(names)):
3042
+ duplicates = [name for name in names if names.count(name) > 1]
3043
+ raise ValueError(f"Duplicate node names found: {set(duplicates)}")
3044
+ return nodes
3045
+
3046
+ @model_validator(mode="after")
3047
+ def auto_populate_depends_on_from_inputs(self):
3048
+ """
3049
+ Auto-populate depends_on for same-pipeline references in inputs.
3050
+
3051
+ If a node has inputs like $silver.other_node and this is the silver pipeline,
3052
+ automatically add 'other_node' to depends_on for correct execution order.
3053
+ """
3054
+ node_names = {node.name for node in self.nodes}
3055
+
3056
+ for node in self.nodes:
3057
+ if not node.inputs:
3058
+ continue
3059
+
3060
+ for input_name, ref in node.inputs.items():
3061
+ if not isinstance(ref, str) or not ref.startswith("$"):
3062
+ continue
3063
+
3064
+ # Parse $pipeline.node reference
3065
+ parts = ref[1:].split(".", 1)
3066
+ if len(parts) != 2:
3067
+ continue
3068
+
3069
+ ref_pipeline, ref_node = parts
3070
+
3071
+ # Check if reference is to same pipeline
3072
+ if ref_pipeline == self.pipeline and ref_node in node_names:
3073
+ # Add to depends_on if not already there
3074
+ if ref_node not in node.depends_on:
3075
+ node.depends_on.append(ref_node)
3076
+
3077
+ return self
3078
+
3079
+
3080
+ # ============================================
3081
+ # Project Configuration
3082
+ # ============================================
3083
+
3084
+
3085
+ class BackoffStrategy(str, Enum):
3086
+ EXPONENTIAL = "exponential"
3087
+ LINEAR = "linear"
3088
+ CONSTANT = "constant"
3089
+
3090
+
3091
+ class RetryConfig(BaseModel):
3092
+ """
3093
+ Retry configuration.
3094
+
3095
+ Example:
3096
+ ```yaml
3097
+ retry:
3098
+ enabled: true
3099
+ max_attempts: 3
3100
+ backoff: "exponential"
3101
+ ```
3102
+ """
3103
+
3104
+ enabled: bool = True
3105
+ max_attempts: int = Field(default=3, ge=1, le=10)
3106
+ backoff: BackoffStrategy = Field(default=BackoffStrategy.EXPONENTIAL)
3107
+
3108
+
3109
+ class LoggingConfig(BaseModel):
3110
+ """
3111
+ Logging configuration.
3112
+
3113
+ Example:
3114
+ ```yaml
3115
+ logging:
3116
+ level: "INFO"
3117
+ structured: true
3118
+ ```
3119
+ """
3120
+
3121
+ level: LogLevel = LogLevel.INFO
3122
+ structured: bool = Field(default=False, description="Output JSON logs")
3123
+ metadata: Dict[str, Any] = Field(default_factory=dict, description="Extra metadata in logs")
3124
+
3125
+
3126
+ class PerformanceConfig(BaseModel):
3127
+ """
3128
+ Performance tuning configuration.
3129
+
3130
+ Example:
3131
+ ```yaml
3132
+ performance:
3133
+ use_arrow: true
3134
+ spark_config:
3135
+ "spark.sql.shuffle.partitions": "200"
3136
+ "spark.sql.adaptive.enabled": "true"
3137
+ "spark.databricks.delta.optimizeWrite.enabled": "true"
3138
+ delta_table_properties:
3139
+ "delta.columnMapping.mode": "name"
3140
+ ```
3141
+
3142
+ **Spark Config Notes:**
3143
+ - Configs are applied via `spark.conf.set()` at runtime
3144
+ - For existing sessions (e.g., Databricks), only runtime-settable configs will take effect
3145
+ - Session-level configs (e.g., `spark.executor.memory`) require session restart
3146
+ - Common runtime-safe configs: shuffle partitions, adaptive query execution, Delta optimizations
3147
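+
+ To make the notes above concrete, here is how such settings are typically pushed onto an existing session (generic Spark pattern for illustration, not a literal excerpt of odibi's engine code):
+
+ ```python
+ from pyspark.sql import SparkSession
+
+ perf = PerformanceConfig(
+     spark_config={
+         "spark.sql.shuffle.partitions": "200",
+         "spark.sql.adaptive.enabled": "true",
+     }
+ )
+
+ spark = SparkSession.builder.getOrCreate()   # existing session, e.g. on Databricks
+ for key, value in perf.spark_config.items():
+     spark.conf.set(key, value)               # only runtime-settable configs take effect
+ ```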
+ """
3148
+
3149
+ use_arrow: bool = Field(
3150
+ default=True,
3151
+ description="Use Apache Arrow-backed DataFrames (Pandas only). Reduces memory and speeds up I/O.",
3152
+ )
3153
+ spark_config: Dict[str, str] = Field(
3154
+ default_factory=dict,
3155
+ description=(
3156
+ "Spark configuration settings applied at runtime via spark.conf.set(). "
3157
+ "Example: {'spark.sql.shuffle.partitions': '200', 'spark.sql.adaptive.enabled': 'true'}. "
3158
+ "Note: Some configs require session restart and cannot be set at runtime."
3159
+ ),
3160
+ )
3161
+ delta_table_properties: Dict[str, str] = Field(
3162
+ default_factory=dict,
3163
+ description=(
3164
+ "Default table properties applied to all Delta writes. "
3165
+ "Example: {'delta.columnMapping.mode': 'name'} to allow special characters in column names."
3166
+ ),
3167
+ )
3168
+ skip_null_profiling: bool = Field(
3169
+ default=False,
3170
+ description=(
3171
+ "Skip null profiling in metadata collection phase. "
3172
+ "Reduces execution time for large DataFrames by avoiding an additional Spark job."
3173
+ ),
3174
+ )
3175
+ skip_catalog_writes: bool = Field(
3176
+ default=False,
3177
+ description=(
3178
+ "Skip catalog metadata writes (register_asset, track_schema, log_pattern, record_lineage) "
3179
+ "after each node write. Significantly improves performance for high-throughput pipelines "
3180
+ "like Bronze layer ingestion. Set to true when catalog tracking is not needed."
3181
+ ),
3182
+ )
3183
+ skip_run_logging: bool = Field(
3184
+ default=False,
3185
+ description=(
3186
+ "Skip batch catalog writes at pipeline end (log_runs_batch, register_outputs_batch). "
3187
+ "Saves 10-20s per pipeline run. Enable when you don't need run history in the catalog. "
3188
+ "Stories are still generated and contain full execution details."
3189
+ ),
3190
+ )
3191
+
3192
+
3193
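The notes above state that `spark_config` entries are applied with `spark.conf.set()` at runtime. A minimal sketch of that application step, assuming an existing SparkSession; `spark.conf.set()` is the standard PySpark API, but the wrapper function and its error handling here are illustrative, not odibi's code:

```python
# Sketch: push the spark_config mapping onto a live session.
def apply_spark_config(spark, spark_config: dict) -> None:
    for key, value in spark_config.items():
        try:
            spark.conf.set(key, value)  # runtime-settable configs apply immediately
        except Exception:
            # Session-level settings (e.g. spark.executor.memory) cannot be
            # changed on an existing session; skip them rather than fail the run.
            pass
```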
+ class StoryConfig(BaseModel):
+     """
+     Story generation configuration.
+
+     Stories are ODIBI's core value: execution reports with lineage.
+     They must use a connection for consistent, traceable output.
+
+     Example:
+     ```yaml
+     story:
+       connection: "local_data"
+       path: "stories/"
+       retention_days: 30
+       failure_sample_size: 100
+       max_failure_samples: 500
+       max_sampled_validations: 5
+     ```
+
+     **Failure Sample Settings:**
+     - `failure_sample_size`: Number of failed rows to capture per validation (default: 100)
+     - `max_failure_samples`: Total failed rows across all validations (default: 500)
+     - `max_sampled_validations`: After this many validations, show only counts (default: 5)
+     """
+
+     connection: str = Field(
+         description="Connection name for story output (uses connection's path resolution)"
+     )
+     path: str = Field(description="Path for stories (relative to connection base_path)")
+     max_sample_rows: int = Field(default=10, ge=0, le=100)
+     auto_generate: bool = True
+     retention_days: Optional[int] = Field(default=30, ge=1, description="Days to keep stories")
+     retention_count: Optional[int] = Field(
+         default=100, ge=1, description="Max number of stories to keep"
+     )
+
+     # Failure sample settings (troubleshooting)
+     failure_sample_size: int = Field(
+         default=100,
+         ge=0,
+         le=1000,
+         description="Number of failed rows to capture per validation rule",
+     )
+     max_failure_samples: int = Field(
+         default=500,
+         ge=0,
+         le=5000,
+         description="Maximum total failed rows across all validations",
+     )
+     max_sampled_validations: int = Field(
+         default=5,
+         ge=1,
+         le=20,
+         description="After this many validations, show only counts (no samples)",
+     )
+
+     # Performance settings
+     async_generation: bool = Field(
+         default=False,
+         description=(
+             "Generate stories asynchronously (fire-and-forget). "
+             "Pipeline returns immediately while story writes in background. "
+             "Saves roughly 5-10s per pipeline in multi-pipeline runs."
+         ),
+     )
+
+     # Lineage settings
+     generate_lineage: bool = Field(
+         default=True,
+         description=(
+             "Generate combined lineage graph from all stories. "
+             "Creates a unified view of data flow across pipelines."
+         ),
+     )
+
+     @model_validator(mode="after")
+     def check_retention_policy(self):
+         if self.retention_days is None and self.retention_count is None:
+             raise ValueError(
+                 "StoryConfig validation failed: No retention policy specified. "
+                 "Provide at least one of: 'retention_days' (e.g., 30) or 'retention_count' (e.g., 100). "
+                 "This controls how long/many story files are kept before cleanup."
+             )
+         return self
+
+
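For reference, the docstring example shows neither retention by count nor the performance flags. A variant that satisfies the retention validator through `retention_count` alone might look like this (values illustrative):

```yaml
story:
  connection: "local_data"
  path: "stories/"
  retention_days: null       # disabled; retention_count alone satisfies the validator
  retention_count: 100       # keep the 100 most recent stories
  async_generation: true     # write stories in the background
  generate_lineage: true     # build the combined lineage graph
```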
+ class SyncFromConfig(BaseModel):
+     """
+     Configuration for syncing system data from a source location.
+
+     Used to pull system data (runs, state) from another backend into the target.
+
+     Example:
+     ```yaml
+     sync_from:
+       connection: local_parquet
+       path: .odibi/system/
+     ```
+     """
+
+     connection: str = Field(description="Connection name for the source system data")
+     path: Optional[str] = Field(
+         default=None,
+         description="Path to source system data (for file-based sources)",
+     )
+     schema_name: Optional[str] = Field(
+         default=None,
+         description="Schema name for SQL Server source (if applicable)",
+     )
+
+
+ class SystemConfig(BaseModel):
+     """
+     Configuration for the Odibi System Catalog (The Brain).
+
+     Stores metadata, state, and pattern configurations.
+
+     Example:
+     ```yaml
+     system:
+       connection: adls_bronze
+       path: _odibi_system
+       environment: dev  # Tags all system records with environment
+     ```
+
+     With SQL Server (Phase 2):
+     ```yaml
+     system:
+       connection: sql_server
+       schema_name: odibi_system
+       environment: prod
+     ```
+
+     With sync from local (Phase 4):
+     ```yaml
+     system:
+       connection: sql_server
+       schema_name: odibi_system
+       environment: prod
+       sync_from:
+         connection: local_parquet
+         path: .odibi/system/
+     ```
+     """
+
+     connection: str = Field(description="Connection to store system tables (e.g., 'adls_bronze')")
+     path: str = Field(default="_odibi_system", description="Path relative to connection root")
+     environment: Optional[str] = Field(
+         default=None,
+         description=(
+             "Environment tag (e.g., 'dev', 'qat', 'prod'). "
+             "Written to all system table records for cross-environment querying."
+         ),
+     )
+     schema_name: Optional[str] = Field(
+         default=None,
+         description="Schema name for SQL Server system tables (e.g., 'odibi_system'). Used when connection is SQL Server.",
+     )
+     sync_from: Optional[SyncFromConfig] = Field(
+         default=None,
+         description=(
+             "Source to sync system data from. Enables pushing local development "
+             "data to centralized SQL Server system tables."
+         ),
+     )
+
+
+ class LineageConfig(BaseModel):
+     """
+     Configuration for OpenLineage integration.
+
+     Example:
+     ```yaml
+     lineage:
+       url: "http://localhost:5000"
+       namespace: "my_project"
+     ```
+     """
+
+     url: Optional[str] = Field(default=None, description="OpenLineage API URL")
+     namespace: str = Field(default="odibi", description="Namespace for jobs")
+     api_key: Optional[str] = Field(default=None, description="API Key")
+
+
+ class ProjectConfig(BaseModel):
+     """
+     Complete project configuration from YAML.
+
+     ### 🏢 "Enterprise Setup" Guide
+
+     **Business Problem:**
+     "We need a robust production environment with alerts, retries, and proper logging."
+
+     **Recipe: Production Ready**
+     ```yaml
+     project: "Customer360"
+     engine: "spark"
+
+     # 1. Resilience
+     retry:
+       enabled: true
+       max_attempts: 3
+       backoff: "exponential"
+
+     # 2. Observability
+     logging:
+       level: "INFO"
+       structured: true  # JSON logs for Splunk/Datadog
+
+     # 3. Alerting
+     alerts:
+       - type: "slack"
+         url: "${SLACK_WEBHOOK_URL}"
+         on_events: ["on_failure"]
+
+     # ... connections and pipelines ...
+     ```
+     """
+
+     # === MANDATORY ===
+     project: str = Field(description="Project name")
+     engine: EngineType = Field(default=EngineType.PANDAS, description="Execution engine")
+     connections: Dict[str, ConnectionConfig] = Field(
+         description="Named connections (at least one required)"
+     )
+     pipelines: List[PipelineConfig] = Field(
+         description="Pipeline definitions (at least one required)"
+     )
+     story: StoryConfig = Field(description="Story generation configuration (mandatory)")
+     system: SystemConfig = Field(description="System Catalog configuration (mandatory)")
+
+     # === OPTIONAL (with sensible defaults) ===
+     lineage: Optional["LineageConfig"] = Field(
+         default=None, description="OpenLineage configuration"
+     )
+     description: Optional[str] = Field(default=None, description="Project description")
+     version: str = Field(default="1.0.0", description="Project version")
+     owner: Optional[str] = Field(default=None, description="Project owner/contact")
+     vars: Dict[str, Any] = Field(
+         default_factory=dict, description="Global variables for substitution (e.g. ${vars.env})"
+     )
+
+     # Global settings (optional with defaults in Pydantic)
+     retry: RetryConfig = Field(default_factory=RetryConfig)
+     logging: LoggingConfig = Field(default_factory=LoggingConfig)
+     alerts: List[AlertConfig] = Field(default_factory=list, description="Alert configurations")
+     performance: PerformanceConfig = Field(
+         default_factory=PerformanceConfig, description="Performance tuning"
+     )
+
+     # === PHASE 3 ===
+     environments: Optional[Dict[str, Dict[str, Any]]] = Field(
+         default=None,
+         description="Structure: same as ProjectConfig but with only overridden fields. Not yet validated strictly.",
+     )
+
+     # === SEMANTIC LAYER ===
+     semantic: Optional[Dict[str, Any]] = Field(
+         default=None,
+         description=(
+             "Semantic layer configuration. Can be inline or reference external file. "
+             "Contains metrics, dimensions, and materializations for self-service analytics. "
+             "Example: semantic: { config: 'semantic_config.yaml' } or inline definitions."
+         ),
+     )
+
+     @model_validator(mode="after")
+     def validate_story_connection_exists(self):
+         """Ensure story.connection is defined in connections."""
+         if self.story.connection not in self.connections:
+             available = ", ".join(sorted(self.connections.keys())) or "(none defined)"
+             raise ValueError(
+                 f"ProjectConfig validation failed: Story connection '{self.story.connection}' not found in connections. "
+                 f"Available connections: [{available}]. "
+                 f"Add '{self.story.connection}' to your connections section or update story.connection to use an existing one."
+             )
+         return self
+
+     @model_validator(mode="after")
+     def ensure_system_config(self):
+         """
+         Validate system config connection exists.
+         """
+         if self.system is None:
+             raise ValueError(
+                 "ProjectConfig validation failed: 'system' configuration is mandatory. "
+                 "Add a system section with connection and path for the Odibi System Catalog. "
+                 "Example: system: { connection: 'adls_bronze', path: '_odibi_system' }"
+             )
+
+         # Ensure the system connection exists
+         if self.system.connection not in self.connections:
+             available = ", ".join(sorted(self.connections.keys())) or "(none defined)"
+             raise ValueError(
+                 f"ProjectConfig validation failed: System connection '{self.system.connection}' not found in connections. "
+                 f"Available connections: [{available}]. "
+                 f"Add '{self.system.connection}' to your connections section or update system.connection to use an existing one."
+             )
+
+         return self
+
+     @model_validator(mode="after")
+     def validate_environments_structure(self):
+         """Validate environments block contains only overrideable fields."""
+         if not self.environments:
+             return self
+
+         # Fields that can be overridden per environment
+         overrideable_fields = {
+             "connections",
+             "system",
+             "performance",
+             "logging",
+             "retry",
+             "alerts",
+             "story",
+             "lineage",
+         }
+
+         for env_name, env_overrides in self.environments.items():
+             if not isinstance(env_overrides, dict):
+                 raise ValueError(
+                     f"Environment '{env_name}' must be a dictionary of overrides, "
+                     f"got {type(env_overrides).__name__}"
+                 )
+             invalid_keys = set(env_overrides.keys()) - overrideable_fields
+             if invalid_keys:
+                 raise ValueError(
+                     f"Environment '{env_name}' contains non-overrideable fields: "
+                     f"{sorted(invalid_keys)}. "
+                     f"Only these fields can be overridden: {sorted(overrideable_fields)}"
+                 )
+
+         return self
+
+
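Pulling the mandatory fields and the two connection-existence validators together, a minimal project YAML has roughly this shape. The connection and pipeline bodies are governed by ConnectionConfig and PipelineConfig (defined earlier in this file) and are only sketched here, not reproduced:

```yaml
project: "my_project"          # illustrative name
engine: "pandas"               # the default engine

connections:
  local_data: {}               # body per ConnectionConfig (sketched, not valid as-is)

pipelines:
  - pipeline: "bronze"         # body per PipelineConfig (sketched)
    nodes: []

story:
  connection: "local_data"     # must exist under `connections` (validated above)
  path: "stories/"

system:
  connection: "local_data"     # must exist under `connections` (validated above)
  path: "_odibi_system"
```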
+ def load_config_from_file(path: str) -> ProjectConfig:
+     """
+     Load and validate configuration from file.
+
+     Args:
+         path: Path to YAML file
+
+     Returns:
+         ProjectConfig
+     """
+     from odibi.utils import load_yaml_with_env
+
+     config_dict = load_yaml_with_env(path)
+     return ProjectConfig(**config_dict)
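A typical call site, for reference (the filename is illustrative). Judging by its name, `load_yaml_with_env` resolves environment-variable placeholders such as `${SLACK_WEBHOOK_URL}` before the resulting dict is validated by Pydantic:

```python
from odibi.config import load_config_from_file  # module path per this file (odibi/config.py)

config = load_config_from_file("odibi_project.yaml")  # illustrative filename
print(config.project, config.engine, len(config.pipelines))
```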