fabricks 3.0.18__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. fabricks/api/context.py +15 -3
  2. fabricks/api/notebooks/schedule.py +2 -3
  3. fabricks/api/parsers.py +2 -1
  4. fabricks/api/utils.py +3 -1
  5. fabricks/cdc/__init__.py +1 -2
  6. fabricks/cdc/base/__init__.py +1 -2
  7. fabricks/cdc/base/_types.py +5 -3
  8. fabricks/cdc/base/configurator.py +5 -0
  9. fabricks/cdc/base/generator.py +7 -3
  10. fabricks/cdc/base/merger.py +2 -0
  11. fabricks/cdc/base/processor.py +15 -0
  12. fabricks/cdc/templates/README.md +490 -0
  13. fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
  14. fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
  15. fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
  16. fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
  17. fabricks/cdc/templates/queries/context.sql.jinja +104 -96
  18. fabricks/cdc/templates/query.sql.jinja +1 -1
  19. fabricks/context/__init__.py +13 -1
  20. fabricks/context/config.py +13 -122
  21. fabricks/context/log.py +92 -1
  22. fabricks/context/runtime.py +35 -69
  23. fabricks/context/spark_session.py +8 -7
  24. fabricks/context/utils.py +26 -39
  25. fabricks/core/__init__.py +2 -2
  26. fabricks/core/dags/base.py +5 -5
  27. fabricks/core/dags/processor.py +2 -3
  28. fabricks/core/extenders.py +1 -1
  29. fabricks/core/job_schema.py +26 -16
  30. fabricks/core/jobs/__init__.py +1 -7
  31. fabricks/core/jobs/base/README.md +1545 -0
  32. fabricks/core/jobs/base/__init__.py +1 -8
  33. fabricks/core/jobs/base/checker.py +7 -7
  34. fabricks/core/jobs/base/configurator.py +142 -63
  35. fabricks/core/jobs/base/generator.py +38 -34
  36. fabricks/core/jobs/base/invoker.py +48 -63
  37. fabricks/core/jobs/base/processor.py +13 -28
  38. fabricks/core/jobs/bronze.py +88 -38
  39. fabricks/core/jobs/get_job.py +3 -6
  40. fabricks/core/jobs/get_job_conf.py +19 -68
  41. fabricks/core/jobs/get_jobs.py +10 -11
  42. fabricks/core/jobs/get_schedules.py +3 -17
  43. fabricks/core/jobs/gold.py +96 -43
  44. fabricks/core/jobs/silver.py +42 -22
  45. fabricks/core/masks.py +11 -8
  46. fabricks/core/parsers/__init__.py +0 -2
  47. fabricks/core/parsers/base.py +10 -10
  48. fabricks/core/parsers/decorator.py +1 -1
  49. fabricks/core/parsers/get_parser.py +4 -5
  50. fabricks/core/schedules/process.py +1 -4
  51. fabricks/core/steps/base.py +27 -17
  52. fabricks/core/steps/get_step.py +2 -4
  53. fabricks/core/steps/get_step_conf.py +3 -7
  54. fabricks/core/udfs.py +9 -8
  55. fabricks/core/views.py +2 -2
  56. fabricks/deploy/__init__.py +27 -16
  57. fabricks/deploy/masks.py +1 -1
  58. fabricks/deploy/notebooks.py +19 -16
  59. fabricks/deploy/schedules.py +1 -1
  60. fabricks/deploy/tables.py +66 -49
  61. fabricks/deploy/udfs.py +2 -2
  62. fabricks/deploy/views.py +15 -16
  63. fabricks/metastore/database.py +3 -3
  64. fabricks/metastore/table.py +103 -68
  65. fabricks/models/__init__.py +125 -0
  66. fabricks/models/common.py +79 -0
  67. fabricks/models/config.py +225 -0
  68. fabricks/models/dependency.py +50 -0
  69. fabricks/models/job.py +157 -0
  70. fabricks/models/path.py +17 -0
  71. fabricks/models/runtime.py +182 -0
  72. fabricks/models/schedule.py +21 -0
  73. fabricks/models/step.py +103 -0
  74. fabricks/models/table.py +77 -0
  75. fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
  76. fabricks/utils/helpers.py +6 -5
  77. fabricks/utils/log.py +25 -6
  78. fabricks/utils/path.py +269 -102
  79. fabricks/utils/pip.py +7 -7
  80. fabricks/utils/read/read.py +23 -22
  81. fabricks/utils/read/read_yaml.py +2 -2
  82. fabricks/utils/write/delta.py +4 -4
  83. fabricks/utils/write/stream.py +2 -2
  84. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/METADATA +9 -4
  85. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/RECORD +86 -83
  86. fabricks/context/_types.py +0 -137
  87. fabricks/context/helpers.py +0 -63
  88. fabricks/core/jobs/base/_types.py +0 -284
  89. fabricks/core/parsers/_types.py +0 -6
  90. fabricks/utils/fdict.py +0 -240
  91. fabricks/utils/pydantic.py +0 -94
  92. fabricks/utils/schema/__init__.py +0 -7
  93. fabricks/utils/schema/get_json_schema_for_type.py +0 -161
  94. fabricks/utils/schema/get_schema_for_type.py +0 -99
  95. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/WHEEL +0 -0
fabricks/core/jobs/base/README.md (new file)
@@ -0,0 +1,1545 @@
# Fabricks Job Options Documentation

> **Comprehensive guide to configuring Bronze, Silver, and Gold tier jobs in Fabricks**

This document provides detailed documentation for all job configuration options in Fabricks. Jobs are organized into three tiers: Bronze, Silver, and Gold, each with its own specific options and behaviors.

---

## 📑 Table of Contents

- [📋 Overview](#-overview)
- [🏗️ Job Tiers](#️-job-tiers)
- [⚙️ Common Options](#️-common-options)
- [🥉 Bronze Options](#-bronze-options)
- [🥈 Silver Options](#-silver-options)
- [🥇 Gold Options](#-gold-options)
- [📊 Table Options](#-table-options)
- [✅ Check Options](#-check-options)
- [⚡ Spark Options](#-spark-options)
- [🔗 Invoker Options](#-invoker-options)
- [🔌 Extender Options](#-extender-options)
- [📝 Complete Configuration Examples](#-complete-configuration-examples)
- [💡 Key Concepts](#-key-concepts)
- [🎯 Best Practices](#-best-practices)

---

## 📋 Overview

Fabricks uses a **tiered data processing architecture** with three layers:

| Tier | Purpose | Description |
|------|---------|-------------|
| **🥉 Bronze** | Raw Data Ingestion | Ingest raw data from external sources with minimal transformation |
| **🥈 Silver** | Data Cleaning & Validation | Clean, validate, deduplicate, and apply quality checks |
| **🥇 Gold** | Business Analytics | Create business-ready aggregated and transformed data |

Each tier has specific configuration options that control how data is processed, stored, and managed.

---

## 🏗️ Job Tiers

### Available Tiers

```python
from typing import Literal

TBronze = Literal["bronze"]  # Raw data ingestion layer
TSilver = Literal["silver"]  # Cleaned and validated data layer
TGold = Literal["gold"]      # Business-ready analytics layer
```

---

## ⚙️ Common Options

> These options are available across **all tiers** (Bronze, Silver, and Gold).

<table>
<tr>
<td width="200"><strong>Option</strong></td>
<td><strong>Description</strong></td>
</tr>

<tr>
<td valign="top">

### `type`

**Type:** `Optional[Literal["manual", "default"]]`
**Default:** `"default"`

</td>
<td>

Specifies the job execution type:
- `"default"` - Standard automated job processing
- `"manual"` - Requires manual intervention; the job is skipped during automatic execution

</td>
</tr>

<tr>
<td valign="top">

### `parents`

**Type:** `Optional[List[str]]`
**Default:** `None`

</td>
<td>

List of parent job names that this job depends on. The job will only execute after all parent jobs have completed successfully.

**Example:**
```python
parents=["bronze__customers", "bronze__orders"]
```

</td>
</tr>

<tr>
<td valign="top">

### `filter_where`

**Type:** `Optional[str]`
**Default:** `None`

</td>
<td>

SQL WHERE clause to filter data during processing. Applied to the source data before any transformations.

**Example:**
```python
filter_where="status = 'active' AND created_date >= '2024-01-01'"
```

</td>
</tr>

<tr>
<td valign="top">

### `optimize`

**Type:** `Optional[bool]`
**Default:** `False`

</td>
<td>

When `True`, runs the OPTIMIZE command on the target table after data loading to improve query performance by compacting small files.

</td>
</tr>

<tr>
<td valign="top">

### `compute_statistics`

**Type:** `Optional[bool]`
**Default:** `False`

</td>
<td>

When `True`, computes table statistics after data loading to help the query optimizer make better decisions.

</td>
</tr>

<tr>
<td valign="top">

### `vacuum`

**Type:** `Optional[bool]`
**Default:** `False`

</td>
<td>

When `True`, runs the VACUUM command to remove old data files that are no longer referenced by the table (typically files older than the retention period).

</td>
</tr>

<tr>
<td valign="top">

### `no_drop`

**Type:** `Optional[bool]`
**Default:** `False`

</td>
<td>

When `True`, prevents the table from being dropped during job execution, even if the job configuration would normally trigger a drop operation.

</td>
</tr>

<tr>
<td valign="top">

### `timeout`

**Type:** `Optional[int]`
**Default:** `None`

</td>
<td>

Maximum execution time in seconds for the job. If the job exceeds this time, it will be terminated.

</td>
</tr>

</table>

---

## 🥉 Bronze Options

> **Bronze tier**: Raw data ingestion from external sources

Bronze tier is responsible for ingesting raw data from external sources with minimal transformation. It focuses on capturing data exactly as it arrives.

### 🔧 Bronze-Specific Options

<details open>
<summary><h4>📌 <code>mode</code> (Required)</h4></summary>

**Type:** `Literal["memory", "append", "register"]`

Defines how data is loaded into the bronze table:

| Mode | Description | Use Case |
|------|-------------|----------|
| `"memory"` | Load data into memory only, don't persist | Temporary or test data |
| `"append"` | Append new data without checking duplicates | Raw data ingestion |
| `"register"` | Register external table without moving data | External data sources |

</details>

<details open>
<summary><h4>📌 <code>uri</code> (Required)</h4></summary>

**Type:** `str`

URI or path to the source data. Supports multiple formats:

- **File path**: `"/mnt/data/customers/*.json"`
- **URL**: `"https://api.example.com/data"`
- **Cloud storage**: `"s3://bucket/path/to/data"`
- **Database connection string**

**Example:**
```python
uri="/mnt/raw/customers/2024/*.parquet"
```

</details>

<details open>
<summary><h4>📌 <code>parser</code> (Required)</h4></summary>

**Type:** `str`

Name of the parser to use for reading and transforming the source data. Parsers define how to interpret the source format.

**Example:**
```python
parser="json_customer_parser"
```

</details>

<details open>
<summary><h4>📌 <code>source</code> (Required)</h4></summary>

**Type:** `str`

Name or identifier of the source system providing the data.

**Example:**
```python
source="salesforce_api"
```

</details>

<details open>
<summary><h4>📌 <code>keys</code></h4></summary>

**Type:** `Optional[List[str]]` | **Default:** `None`

List of column names that uniquely identify a record. Used for deduplication and change tracking.

**Example:**
```python
keys=["customer_id"]
```

</details>

<details open>
<summary><h4>📌 <code>encrypted_columns</code></h4></summary>

**Type:** `Optional[List[str]]` | **Default:** `None`

List of column names containing encrypted data that should be decrypted during ingestion.

**Example:**
```python
encrypted_columns=["ssn", "credit_card"]
```

</details>

<details open>
<summary><h4>📌 <code>calculated_columns</code></h4></summary>

**Type:** `Optional[dict[str, str]]` | **Default:** `None`

Dictionary mapping new column names to SQL expressions for calculating derived values.

**Example:**
```python
calculated_columns={
    "full_name": "concat(first_name, ' ', last_name)",
    "age": "year(current_date()) - year(birth_date)"
}
```

</details>

<details open>
<summary><h4>📌 <code>operation</code></h4></summary>

**Type:** `Optional[Literal["upsert", "reload", "delete"]]` | **Default:** `None`

Specifies the operation type for CDC (Change Data Capture):

| Operation | Description |
|-----------|-------------|
| `"upsert"` | Insert new records or update existing ones |
| `"reload"` | Full reload of all data (marks timestamp for downstream) |
| `"delete"` | Mark records as deleted |

</details>

### 📦 Complete Bronze Configuration Example

<details>
<summary>Click to expand full Bronze configuration example</summary>

```python
BronzeOptions(
    # Required fields
    mode="append",
    uri="/mnt/data/customers/*.json",
    parser="customer_json_parser",
    source="crm_system",

    # Identification
    keys=["customer_id"],

    # Common options
    type="default",
    parents=["bronze__raw_sources"],
    filter_where="status IS NOT NULL",
    optimize=True,
    compute_statistics=True,
    vacuum=False,
    no_drop=False,
    timeout=3600,

    # Bronze-specific
    encrypted_columns=["ssn"],
    calculated_columns={"full_name": "concat(first_name, ' ', last_name)"},
    operation="upsert"
)
```

</details>

---

## 🥈 Silver Options

> **Silver tier**: Cleaned and validated data with quality checks and change tracking

Silver tier processes bronze data with quality checks, deduplication, and change data capture strategies.

### 🔧 Silver-Specific Options

<details open>
<summary><h4>📌 <code>mode</code> (Required)</h4></summary>

**Type:** `Literal["memory", "append", "latest", "update", "combine"]`

Defines how data is processed and loaded:

| Mode | Description | Use Case |
|------|-------------|----------|
| `"memory"` | Process in memory only without persisting | Testing transformations |
| `"append"` | Append all new data without checking existing | Accumulate all data |
| `"latest"` | Process only the most recent data | Latest snapshot processing |
| `"update"` | Incremental updates, only new/changed records | Efficient incremental loads |
| `"combine"` | Combine multiple sources into single table | Multi-source consolidation |

</details>

<details open>
<summary><h4>📌 <code>change_data_capture</code> (Required)</h4></summary>

**Type:** `Literal["nocdc", "scd1", "scd2"]`

Change Data Capture strategy for tracking changes:

| Strategy | Description | History Tracking |
|----------|-------------|------------------|
| `"nocdc"` | No change tracking, simple updates/inserts | ❌ None |
| `"scd1"` | Slowly Changing Dimension Type 1 - Overwrite | ⚠️ Current state only |
| `"scd2"` | Slowly Changing Dimension Type 2 - Versioning | ✅ Full history |

</details>

<details open>
<summary><h4>📌 <code>deduplicate</code></h4></summary>

**Type:** `Optional[bool]` | **Default:** `False`

When `True`, removes duplicate records based on keys and hash values. Keeps the most recent record for each key.

</details>

<details open>
<summary><h4>📌 <code>stream</code></h4></summary>

**Type:** `Optional[bool]` | **Default:** `False`

When `True`, processes data using Spark Structured Streaming for real-time data processing.

</details>

<details open>
<summary><h4>📌 <code>order_duplicate_by</code></h4></summary>

**Type:** `Optional[dict[str, str]]` | **Default:** `None`

Dictionary specifying columns and sort order for determining which duplicate record to keep. A conceptual sketch of the resulting deduplication follows below.

**Example:**
```python
order_duplicate_by={
    "updated_at": "desc",
    "priority": "desc"
}
```

</details>
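
Conceptually, `deduplicate=True` combined with `keys` and `order_duplicate_by` behaves like a window-based ranking over the key columns. The sketch below is illustrative only (plain PySpark, not the actual Fabricks implementation):

```python
# Conceptual equivalent of deduplicate=True with keys=["customer_id"]
# and order_duplicate_by={"updated_at": "desc"}.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, "old", "2024-01-01"), (1, "new", "2024-02-01")],
    ["customer_id", "status", "updated_at"],
)

w = Window.partitionBy("customer_id").orderBy(F.col("updated_at").desc())
deduped = (
    df.withColumn("__rn", F.row_number().over(w))
    .where("__rn = 1")  # keep only the highest-ranked record per key
    .drop("__rn")
)
```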

### 📦 Complete Silver Configuration Example

<details>
<summary>Click to expand full Silver configuration example</summary>

```python
SilverOptions(
    # Required fields
    mode="update",
    change_data_capture="scd2",

    # Common options
    type="default",
    parents=["bronze__customers"],
    filter_where="quality_score > 0.8",
    optimize=True,
    compute_statistics=True,
    vacuum=True,
    no_drop=False,
    timeout=7200,

    # Silver-specific
    deduplicate=True,
    stream=False,
    order_duplicate_by={"updated_at": "desc"}
)
```

</details>

---

## 🥇 Gold Options

> **Gold tier**: Business-ready analytics with aggregations and ML enrichment

Gold tier creates business-ready data with aggregations, transformations, and analytics-optimized structures.

### 🔧 Gold-Specific Options

#### mode (Required)
**Type:** `Literal["memory", "append", "complete", "update", "invoke"]`

Defines how data is processed and loaded:

- **`"memory"`**: Process in memory only
- **`"append"`**: Append new calculated results
- **`"complete"`**: Complete rebuild of the entire table
- **`"update"`**: Incremental updates to existing data
- **`"invoke"`**: Execute external notebook or process

#### change_data_capture (Required)
**Type:** `Literal["nocdc", "scd1", "scd2"]`

Same as the Silver tier: defines the CDC strategy.

#### update_where
**Type:** `Optional[str]`
**Default:** `None`

SQL WHERE clause to filter which records should be updated. Only applicable in `"update"` mode.

**Example:**
```python
update_where="last_modified >= current_date() - INTERVAL 7 DAYS"
```

#### deduplicate
**Type:** `Optional[bool]`
**Default:** `False`

Remove duplicates based on keys and hash values.

#### rectify_as_upserts
**Type:** `Optional[bool]`
**Default:** `False`

When `True`, converts reload operations into individual upsert and delete operations. This ensures historical consistency by generating synthetic delete records for items that disappear between reloads.

**Use Case:** Handle scenarios where records are deleted between full reloads without explicit delete operations.
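
For illustration, consider two successive full reloads (hypothetical data, not Fabricks API):

```python
# Keys present in two successive full reloads of the same source.
reload_1 = {"A", "B", "C"}
reload_2 = {"A", "C"}  # "B" disappeared between reloads

# rectify_as_upserts=True conceptually turns the second reload into:
upserts = reload_2             # every key still present -> upsert
deletes = reload_1 - reload_2  # {"B"} -> synthetic delete record
```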

#### correct_valid_from
**Type:** `Optional[bool]`
**Default:** `False`

When `True` and using SCD2, sets the `__valid_from` timestamp of the earliest record to `'1900-01-01'` instead of the actual first timestamp.

**Use Case:** Standardize historical record start dates for reporting purposes.

#### persist_last_timestamp
**Type:** `Optional[bool]`
**Default:** `False`

When `True`, persists the maximum timestamp from the processed data to be used as a watermark for the next incremental run.

#### persist_last_updated_timestamp
**Type:** `Optional[bool]`
**Default:** `False`

When `True`, persists the maximum `__last_updated` timestamp for incremental processing tracking.
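
Both persist options enable the classic high-water-mark pattern. The sketch below shows the idea in plain PySpark; the in-memory dictionary is a stand-in, since Fabricks persists the watermark internally between runs:

```python
# Conceptual watermark pattern behind persist_last_timestamp /
# persist_last_updated_timestamp (illustrative, not the Fabricks internals).
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
source_df = spark.createDataFrame(
    [(1, "2024-01-05"), (2, "2024-02-01")], ["id", "__last_updated"]
)

watermarks = {"gold__customer_360": "2024-01-31"}  # stand-in for the persisted value

# only process rows newer than the last persisted watermark
increment = source_df.where(
    F.col("__last_updated") > F.lit(watermarks["gold__customer_360"])
)

# after processing, persist the new high-water mark for the next run
watermarks["gold__customer_360"] = increment.agg(F.max("__last_updated")).first()[0]
```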

#### table
**Type:** `Optional[str]`
**Default:** `None`

Override the default target table name with a custom table name.

**Example:**
```python
table="custom_schema.custom_table_name"
```

#### notebook
**Type:** `Optional[bool]`
**Default:** `False`

When `True`, generates a notebook for this job that can be executed independently.

#### requirements
**Type:** `Optional[bool]`
**Default:** `False`

When `True`, generates a requirements file for dependencies needed by this job.

#### metadata
**Type:** `Optional[bool]`
**Default:** `False`

When `True`, adds or updates metadata tracking columns (`__metadata.inserted`, `__metadata.updated`).

#### last_updated
**Type:** `Optional[bool]`
**Default:** `False`

When `True`, adds or updates the `__last_updated` timestamp column.

### Complete Gold Configuration Example

```python
GoldOptions(
    type="default",
    mode="update",
    change_data_capture="scd2",
    update_where="updated_at > (SELECT max(last_run) FROM control_table)",
    parents=["silver__customers", "silver__orders"],
    optimize=True,
    compute_statistics=True,
    vacuum=True,
    no_drop=False,
    deduplicate=True,
    rectify_as_upserts=True,
    correct_valid_from=True,
    persist_last_timestamp=True,
    persist_last_updated_timestamp=True,
    table="analytics.customer_360",
    notebook=False,
    requirements=False,
    timeout=10800,
    metadata=True,
    last_updated=True
)
```

---

## 📊 Table Options

Table options control the physical table structure and optimization features.

### identity
**Type:** `Optional[bool]`
**Default:** `None`

When `True`, adds an auto-incrementing identity column to the table.

### liquid_clustering
**Type:** `Optional[bool]`
**Default:** `None`

When `True`, enables Databricks liquid clustering for improved query performance with automatic optimization.

### partition_by
**Type:** `Optional[List[str]]`
**Default:** `None`

List of columns to partition the table by. Partitioning physically organizes data for faster queries on partition columns.

**Example:**
```python
partition_by=["year", "month"]
```

**Use Case:** Date-based partitioning for time-series data.

### zorder_by
**Type:** `Optional[List[str]]`
**Default:** `None`

List of columns to Z-order (multi-dimensional clustering). Improves query performance for columns frequently used in filters.

**Example:**
```python
zorder_by=["customer_id", "product_id"]
```

### cluster_by
**Type:** `Optional[List[str]]`
**Default:** `None`

List of columns to cluster by. Alternative to liquid clustering for organizing data.

**Example:**
```python
cluster_by=["region", "category"]
```

### powerbi
**Type:** `Optional[bool]`
**Default:** `None`

When `True`, optimizes table settings for Power BI connectivity and performance.

### maximum_compatibility
**Type:** `Optional[bool]`
**Default:** `None`

When `True`, creates tables with maximum compatibility settings for older Spark/Delta versions.

### bloomfilter_by
**Type:** `Optional[List[str]]`
**Default:** `None`

List of columns to create bloom filters on for faster equality lookups.

**Example:**
```python
bloomfilter_by=["email", "phone_number"]
```

### constraints
**Type:** `Optional[dict[str, str]]`
**Default:** `None`

Dictionary mapping constraint names to SQL constraint expressions.

**Example:**
```python
constraints={
    "valid_email": "email LIKE '%@%.%'",
    "positive_amount": "amount > 0"
}
```

### properties
**Type:** `Optional[dict[str, str]]`
**Default:** `None`

Dictionary of custom table properties as key-value pairs.

**Example:**
```python
properties={
    "owner": "data_team",
    "data_classification": "confidential"
}
```

### comment
**Type:** `Optional[str]`
**Default:** `None`

Description or comment for the table.

**Example:**
```python
comment="Customer master data with full history tracking"
```

### calculated_columns
**Type:** `Optional[dict[str, str]]`
**Default:** `None`

Dictionary mapping column names to SQL expressions for computed/generated columns.

**Example:**
```python
calculated_columns={
    "full_address": "concat(street, ', ', city, ', ', state)",
    "is_premium": "CASE WHEN tier = 'premium' THEN true ELSE false END"
}
```

### masks
**Type:** `Optional[dict[str, str]]`
**Default:** `None`

Dictionary mapping column names to masking expressions for data privacy/security.

**Example:**
```python
masks={
    "ssn": "concat('***-**-', right(ssn, 4))",
    "credit_card": "concat('****-****-****-', right(credit_card, 4))"
}
```

### comments
**Type:** `Optional[dict[str, str]]`
**Default:** `None`

Dictionary mapping column names to their descriptions.

**Example:**
```python
comments={
    "customer_id": "Unique identifier for each customer",
    "lifetime_value": "Total revenue generated by customer"
}
```

### retention_days
**Type:** `Optional[int]`
**Default:** `None`

Number of days to retain old versions of data before they can be vacuumed.

**Example:**
```python
retention_days=90
```

### primary_key
**Type:** `Optional[dict[str, PrimaryKey]]`
**Default:** `None`

Dictionary defining primary key constraint with name and configuration.

**Example:**
```python
primary_key={
    "pk_customers": {
        "keys": ["customer_id"],
        "options": {"constraint": "not enforced"}
    }
}
```

### foreign_keys
**Type:** `Optional[dict[str, ForeignKey]]`
**Default:** `None`

Dictionary defining foreign key constraints with names and configurations.

**Example:**
```python
foreign_keys={
    "fk_orders_customer": {
        "keys": ["customer_id"],
        "reference": "customers",
        "options": {
            "foreign_key": "on delete no action",
            "constraint": "not enforced"
        }
    }
}
```

### Complete Table Options Example

```python
TableOptions(
    identity=False,
    liquid_clustering=True,
    partition_by=["year", "month"],
    zorder_by=["customer_id"],
    cluster_by=None,
    powerbi=True,
    maximum_compatibility=False,
    bloomfilter_by=["email"],
    constraints={
        "valid_email": "email LIKE '%@%.%'",
        "positive_balance": "balance >= 0"
    },
    properties={
        "owner": "analytics_team",
        "pii": "true"
    },
    comment="Customer dimension table with full SCD2 history",
    calculated_columns={
        "age": "year(current_date()) - year(birth_date)"
    },
    masks={
        "ssn": "concat('***-**-', right(ssn, 4))"
    },
    comments={
        "customer_id": "Primary key - unique customer identifier",
        "balance": "Current account balance"
    },
    retention_days=90,
    primary_key={
        "pk_customer": {
            "keys": ["customer_id"],
            "options": {"constraint": "not enforced"}
        }
    },
    foreign_keys={
        "fk_country": {
            "keys": ["country_code"],
            "reference": "dim_countries",
            "options": {
                "foreign_key": "on delete no action",
                "constraint": "not enforced"
            }
        }
    }
)
```

---

## ✅ Check Options

Data quality and validation checks that run before or after job execution.

### skip
**Type:** `Optional[bool]`
**Default:** `False`

When `True`, skips all data quality checks for this job.

### pre_run
**Type:** `Optional[bool]`
**Default:** `False`

When `True`, runs data quality checks before job execution. Job fails if checks don't pass.

### post_run
**Type:** `Optional[bool]`
**Default:** `False`

When `True`, runs data quality checks after job execution. Job fails if checks don't pass.

### min_rows
**Type:** `Optional[int]`
**Default:** `None`

Minimum number of rows expected in the result. Check fails if row count is below this threshold.

**Example:**
```python
min_rows=1000
```

### max_rows
**Type:** `Optional[int]`
**Default:** `None`

Maximum number of rows expected in the result. Check fails if row count exceeds this threshold.

**Example:**
```python
max_rows=1000000
```

### count_must_equal
**Type:** `Optional[str]`
**Default:** `None`

SQL expression or table name to compare row counts. Check fails if counts don't match.

**Example:**
```python
count_must_equal="bronze__source_table"
```

### Complete Check Options Example

```python
CheckOptions(
    skip=False,
    pre_run=True,
    post_run=True,
    min_rows=100,
    max_rows=10000000,
    count_must_equal="bronze__raw_customers"
)
```

---

## ⚡ Spark Options

Spark configuration for the job execution.

### sql
**Type:** `Optional[dict[str, str]]`
**Default:** `None`

Dictionary of Spark SQL configuration parameters.

**Example:**
```python
sql={
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true"
}
```

### conf
**Type:** `Optional[dict[str, str]]`
**Default:** `None`

Dictionary of general Spark configuration parameters.

**Example:**
```python
conf={
    "spark.executor.memory": "8g",
    "spark.executor.cores": "4",
    "spark.dynamicAllocation.enabled": "true"
}
```

### Complete Spark Options Example

```python
SparkOptions(
    sql={
        "spark.sql.adaptive.enabled": "true",
        "spark.sql.shuffle.partitions": "200",
        "spark.sql.autoBroadcastJoinThreshold": "10485760"
    },
    conf={
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4",
        "spark.executor.instances": "10"
    }
)
```

---

## 🔗 Invoker Options

Options for invoking external notebooks at different stages of job execution.

### Structure

```python
from typing import Optional, TypedDict

class _InvokeOptions(TypedDict):
    notebook: str                        # Path to notebook to invoke
    timeout: int                         # Timeout in seconds
    arguments: Optional[dict[str, str]]  # Arguments to pass to notebook
```

### pre_run
**Type:** `Optional[List[_InvokeOptions]]`
**Default:** `None`

List of notebooks to execute before the main job runs.

### run
**Type:** `Optional[List[_InvokeOptions]]`
**Default:** `None`

List of notebooks to execute as the main job (replaces default job logic).

### post_run
**Type:** `Optional[List[_InvokeOptions]]`
**Default:** `None`

List of notebooks to execute after the main job completes.

### Complete Invoker Options Example

```python
InvokerOptions(
    pre_run=[
        {
            "notebook": "/Notebooks/setup/validate_sources",
            "timeout": 300,
            "arguments": {"check_level": "strict"}
        }
    ],
    run=None,
    post_run=[
        {
            "notebook": "/Notebooks/post/send_notification",
            "timeout": 60,
            "arguments": {
                "recipients": "team@example.com",
                "status": "success"
            }
        },
        {
            "notebook": "/Notebooks/post/update_dashboard",
            "timeout": 120,
            "arguments": {"dashboard_id": "main"}
        }
    ]
)
```

---

## 🔌 Extender Options

Options for extending job functionality with custom logic.

### extender (Required)
**Type:** `str`

Name or path of the extender class to use.

**Example:**
```python
extender="CustomDataProcessor"
```
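
As a purely hypothetical sketch of what such an extender might look like (the actual interface is defined in `fabricks.core.extenders`; the class name, method name, and signature here are illustrative assumptions):

```python
from typing import Optional
from pyspark.sql import DataFrame

class CustomDataProcessor:
    """Hypothetical extender: receives the job's DataFrame plus the
    configured arguments and returns a transformed DataFrame."""

    def __init__(self, arguments: Optional[dict[str, str]] = None):
        self.arguments = arguments or {}

    def extend(self, df: DataFrame) -> DataFrame:
        if self.arguments.get("transformation") == "advanced":
            df = df.dropDuplicates()  # placeholder transformation
        return df
```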

### arguments
**Type:** `Optional[dict[str, str]]`
**Default:** `None`

Dictionary of arguments to pass to the extender.

**Example:**
```python
arguments={
    "transformation": "advanced",
    "output_format": "parquet"
}
```

### Complete Extender Options Example

```python
ExtenderOptions(
    extender="ml.FeatureEngineeringExtender",
    arguments={
        "feature_set": "customer_features_v2",
        "normalize": "true",
        "handle_missing": "impute"
    }
)
```

Multiple extenders can be chained:

```python
extender_options=[
    {
        "extender": "DataValidationExtender",
        "arguments": {"strict": "true"}
    },
    {
        "extender": "FeatureEngineeringExtender",
        "arguments": {"feature_set": "v2"}
    },
    {
        "extender": "MLScoringExtender",
        "arguments": {"model": "customer_churn_v3"}
    }
]
```

---

## 📝 Complete Configuration Examples

### Bronze Job: Ingest Customer Data

```python
JobConfBronze(
    job_id="bronze__crm_customers",
    topic="customers",
    item="raw_data",
    step="bronze",
    options=BronzeOptions(
        type="default",
        mode="append",
        uri="s3://data-lake/raw/crm/customers/*.json",
        parser="crm_customer_parser",
        source="salesforce_crm",
        keys=["customer_id"],
        parents=None,
        filter_where="status != 'test'",
        optimize=True,
        compute_statistics=True,
        vacuum=False,
        no_drop=False,
        encrypted_columns=["ssn", "credit_card"],
        calculated_columns={
            "full_name": "concat(first_name, ' ', last_name)",
            "account_age_days": "datediff(current_date(), created_date)"
        },
        operation="upsert",
        timeout=3600
    ),
    table_options=TableOptions(
        partition_by=["ingestion_date"],
        comment="Raw customer data from CRM system",
        retention_days=30
    ),
    check_options=CheckOptions(
        skip=False,
        post_run=True,
        min_rows=1
    ),
    spark_options=SparkOptions(
        conf={
            "spark.executor.memory": "4g",
            "spark.executor.cores": "2"
        }
    ),
    tags=["pii", "critical", "daily"],
    comment="Daily customer data ingestion from Salesforce CRM"
)
```

### Silver Job: Clean and Deduplicate Customers

```python
JobConfSilver(
    job_id="silver__customers",
    topic="customers",
    item="cleaned",
    step="silver",
    options=SilverOptions(
        type="default",
        mode="update",
        change_data_capture="scd2",
        parents=["bronze__crm_customers"],
        filter_where="email IS NOT NULL AND email LIKE '%@%.%'",
        optimize=True,
        compute_statistics=True,
        vacuum=True,
        no_drop=False,
        deduplicate=True,
        stream=False,
        order_duplicate_by={"updated_at": "desc", "source_priority": "asc"},
        timeout=7200
    ),
    table_options=TableOptions(
        liquid_clustering=True,
        cluster_by=["customer_id", "country_code"],
        bloomfilter_by=["email", "phone"],
        constraints={
            "valid_email": "email LIKE '%@%.%'",
            "valid_country": "country_code IN ('US', 'CA', 'UK', 'DE')"
        },
        comments={
            "customer_id": "Unique customer identifier",
            "email": "Primary contact email",
            "phone": "Primary phone number"
        },
        comment="Cleaned customer master data with SCD2 history",
        retention_days=90
    ),
    check_options=CheckOptions(
        pre_run=True,
        post_run=True,
        min_rows=1000,
        count_must_equal="bronze__crm_customers"
    ),
    spark_options=SparkOptions(
        sql={
            "spark.sql.adaptive.enabled": "true",
            "spark.sql.adaptive.coalescePartitions.enabled": "true"
        }
    ),
    invoker_options=InvokerOptions(
        post_run=[
            {
                "notebook": "/Quality/DataQualityReport",
                "timeout": 300,
                "arguments": {"table": "silver__customers"}
            }
        ]
    ),
    tags=["master_data", "scd2", "pii"],
    comment="Silver layer customer data with deduplication and validation"
)
```

### Gold Job: Customer 360 View

```python
JobConfGold(
    job_id="gold__customer_360",
    topic="analytics",
    item="customer_360",
    step="gold",
    options=GoldOptions(
        type="default",
        mode="update",
        change_data_capture="scd1",
        update_where="__last_updated >= current_date() - INTERVAL 7 DAYS",
        parents=["silver__customers", "silver__orders", "silver__interactions"],
        optimize=True,
        compute_statistics=True,
        vacuum=True,
        no_drop=False,
        deduplicate=True,
        rectify_as_upserts=True,
        correct_valid_from=False,
        persist_last_timestamp=True,
        persist_last_updated_timestamp=True,
        table="analytics.customer_360_view",
        notebook=False,
        requirements=False,
        timeout=10800,
        metadata=True,
        last_updated=True
    ),
    table_options=TableOptions(
        liquid_clustering=True,
        cluster_by=["customer_segment", "region"],
        zorder_by=["customer_id"],
        powerbi=True,
        bloomfilter_by=["customer_id"],
        calculated_columns={
            "customer_lifetime_value": "total_revenue - total_costs",
            "churn_risk_category": """
                CASE
                    WHEN churn_score > 0.8 THEN 'High'
                    WHEN churn_score > 0.5 THEN 'Medium'
                    ELSE 'Low'
                END
            """
        },
        properties={
            "owner": "analytics_team",
            "refresh_frequency": "daily",
            "data_classification": "internal"
        },
        comment="Complete 360-degree view of customer data for analytics and reporting",
        retention_days=180
    ),
    check_options=CheckOptions(
        post_run=True,
        min_rows=100,
        max_rows=100000000
    ),
    spark_options=SparkOptions(
        sql={
            "spark.sql.adaptive.enabled": "true",
            "spark.sql.shuffle.partitions": "400"
        },
        conf={
            "spark.executor.memory": "16g",
            "spark.executor.cores": "8",
            "spark.dynamicAllocation.enabled": "true",
            "spark.dynamicAllocation.maxExecutors": "50"
        }
    ),
    invoker_options=InvokerOptions(
        post_run=[
            {
                "notebook": "/Analytics/RefreshDashboards",
                "timeout": 600,
                "arguments": {
                    "dashboard_list": "customer_360,executive_summary"
                }
            },
            {
                "notebook": "/Notifications/SendSuccessEmail",
                "timeout": 60,
                "arguments": {
                    "recipients": "analytics-team@company.com",
                    "job": "customer_360"
                }
            }
        ]
    ),
    extender_options=[
        {
            "extender": "MLScoringExtender",
            "arguments": {
                "model": "customer_churn_model_v2",
                "score_column": "churn_score"
            }
        },
        {
            "extender": "SegmentationExtender",
            "arguments": {
                "algorithm": "kmeans",
                "n_segments": "5"
            }
        }
    ],
    tags=["analytics", "customer", "ml_enriched", "daily"],
    comment="Gold layer customer 360 view with ML scoring and segmentation"
)
```

---

## 💡 Key Concepts

### 🔄 Change Data Capture (CDC) Strategies

Understanding CDC strategies is crucial for choosing the right data tracking approach:

<table>
<tr>
<th width="150">Strategy</th>
<th>Description</th>
<th>History</th>
<th>Use Case</th>
</tr>

<tr>
<td><strong>nocdc</strong></td>
<td>No change tracking. Simple insert/update/delete operations without history.</td>
<td align="center">❌</td>
<td>Lookup tables, configuration data, or data where history isn't needed</td>
</tr>

<tr>
<td><strong>scd1</strong><br/><em>(Type 1)</em></td>
<td>Overwrites existing records with new values. Maintains only current state with flags for deleted records.</td>
<td align="center">⚠️ Current</td>
<td>Master data where only current values matter (current address, current status)</td>
</tr>

<tr>
<td><strong>scd2</strong><br/><em>(Type 2)</em></td>
<td>Maintains complete history with versioning. Each change creates a new version with validity dates.</td>
<td align="center">✅ Full</td>
<td>Data requiring full audit trail (customer history, product changes over time)</td>
</tr>

</table>
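
For instance, under `scd2` a customer who moves city ends up with two versions (illustrative rows; `__valid_from` matches the option documented above, while `__valid_to` is an assumed companion column):

```python
# Illustrative SCD2 history for one key after a single change.
history = [
    {"customer_id": 42, "city": "Berlin",
     "__valid_from": "2023-01-01", "__valid_to": "2024-03-15"},
    {"customer_id": 42, "city": "Hamburg",  # current version
     "__valid_from": "2024-03-15", "__valid_to": None},
]
# Under scd1, only the Hamburg row would remain; under nocdc, the row
# would simply be updated in place with no versioning columns at all.
```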

### 🎭 Processing Modes by Tier

<table>
<tr>
<th width="120">Tier</th>
<th width="120">Mode</th>
<th>Description</th>
</tr>

<tr>
<td rowspan="3"><strong>🥉 Bronze</strong></td>
<td><code>memory</code></td>
<td>Testing and temporary data</td>
</tr>
<tr>
<td><code>append</code></td>
<td>Raw data ingestion without deduplication</td>
</tr>
<tr>
<td><code>register</code></td>
<td>External table registration</td>
</tr>

<tr>
<td rowspan="5"><strong>🥈 Silver</strong></td>
<td><code>memory</code></td>
<td>Testing transformations</td>
</tr>
<tr>
<td><code>append</code></td>
<td>Accumulate all data</td>
</tr>
<tr>
<td><code>latest</code></td>
<td>Process only latest snapshot</td>
</tr>
<tr>
<td><code>update</code></td>
<td>Incremental processing</td>
</tr>
<tr>
<td><code>combine</code></td>
<td>Merge multiple sources</td>
</tr>

<tr>
<td rowspan="5"><strong>🥇 Gold</strong></td>
<td><code>memory</code></td>
<td>Testing analytics</td>
</tr>
<tr>
<td><code>append</code></td>
<td>Accumulate metrics/aggregations</td>
</tr>
<tr>
<td><code>complete</code></td>
<td>Full rebuild</td>
</tr>
<tr>
<td><code>update</code></td>
<td>Incremental updates</td>
</tr>
<tr>
<td><code>invoke</code></td>
<td>Custom notebook execution</td>
</tr>

</table>

### 🔗 Parent Dependencies

Jobs can depend on other jobs via the `parents` option. The dependency system ensures:

- ✅ **Correct execution order** - Parents complete before children run
- ✅ **No circular dependencies** - System validates dependency graph
- ✅ **Parallel execution** - Independent jobs run concurrently
- ✅ **Automatic retry** - Failed dependencies trigger retries

**Example:**
```python
# This job waits for both bronze jobs to complete
parents=["bronze__customers", "bronze__orders"]
```

### ✅ Data Quality Checks

Quality checks can run at different stages:

| Stage | Description | Failure Behavior |
|-------|-------------|------------------|
| **pre_run** | Validate input before processing | Job fails before execution |
| **post_run** | Validate output after processing | Job fails after execution |

**Check Types:**
- ✅ **Row counts** - Validate min/max row thresholds
- ✅ **Data equality** - Compare counts with other tables
- ✅ **Custom assertions** - SQL constraints and validations

---

## 🎯 Best Practices

### 🥉 Bronze Layer

| ✅ Best Practice | 💡 Recommendation | 📝 Rationale |
|-----------------|-------------------|--------------|
| **Mode Selection** | Use `mode="append"` | Capture all raw data without loss |
| **CDC Operation** | Set `operation="upsert"` or `"reload"` | Enable change tracking for downstream processing |
| **Performance** | Enable `optimize=True` for large datasets | Compact small files, improve query speed |
| **Security** | Use `encrypted_columns` for sensitive fields | Protect PII during ingestion |
| **Retention** | Keep short (`retention_days=30`) | Bronze is transient; Silver is source of truth |

### 🥈 Silver Layer

| ✅ Best Practice | 💡 Recommendation | 📝 Rationale |
|-----------------|-------------------|--------------|
| **CDC Strategy** | Always use `scd1` or `scd2` | Systematic change tracking required |
| **Data Quality** | Enable `deduplicate=True` | Remove duplicates for clean data |
| **Processing Mode** | Use `mode="update"` | Efficient incremental processing |
| **Validation** | Configure `check_options` | Enforce data quality standards |
| **Retention** | Use longer period (`retention_days=90`) | Silver is the authoritative source |

### 🥇 Gold Layer

| ✅ Best Practice | 💡 Recommendation | 📝 Rationale |
|-----------------|-------------------|--------------|
| **CDC Strategy** | Choose based on business needs | Balance history requirements vs performance |
| **Rectification** | Enable `rectify_as_upserts=True` | Ensure historical data consistency |
| **Watermarking** | Use `persist_last_timestamp=True` | Enable efficient incremental loads |
| **Audit Tracking** | Enable `metadata=True` + `last_updated=True` | Support troubleshooting and audits |
| **BI Optimization** | Set `powerbi=True` for analytics workloads | Optimize for business intelligence tools |
| **Enrichment** | Use extenders for ML scoring | Add predictive intelligence |

### ⚡ Performance Optimization

| 🚀 Technique | 💡 Implementation | 🎯 Use Case |
|-------------|-------------------|-------------|
| **Liquid Clustering** | `liquid_clustering=True` | Modern Delta tables with auto-optimization |
| **Partitioning** | `partition_by=["year", "month"]` | Time-series data and date-based queries |
| **Z-Ordering** | `zorder_by=["customer_id"]` | High-cardinality columns in WHERE clauses |
| **Bloom Filters** | `bloomfilter_by=["email"]` | Fast equality lookups on strings |
| **Spark Tuning** | Configure `spark_options` | Match cluster resources to workload |

> **💡 Pro Tip:** Liquid clustering is self-optimizing and recommended for new tables over manual partitioning.
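
For example, the same table could be laid out either way (a sketch using the `TableOptions` fields from this guide):

```python
# Manual layout: explicit partitions plus Z-ordering.
legacy_layout = TableOptions(
    partition_by=["year", "month"],
    zorder_by=["customer_id"],
)

# Self-optimizing layout: liquid clustering, preferred for new tables.
modern_layout = TableOptions(
    liquid_clustering=True,
    cluster_by=["customer_id"],
)
```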
1528
+
1529
+ ### 🔒 Data Governance
1530
+
1531
+ | 📋 Area | 💡 Recommendation | ✅ Benefit |
1532
+ |---------|-------------------|------------|
1533
+ | **Documentation** | Add detailed `comments` and `comment` | Enable knowledge sharing and understanding |
1534
+ | **Validation** | Define `constraints` for business rules | Enforce data quality at write-time |
1535
+ | **Metadata** | Set `properties` for classification | Improve discoverability and compliance |
1536
+ | **Privacy** | Use `masks` for PII fields | Protect sensitive data, meet regulations |
1537
+ | **Organization** | Apply meaningful `tags` | Enable filtering, cost tracking, ownership |
1538
+
1539
+ ---
1540
+
## Related Documentation

- [CDC Templates Documentation](../../../cdc/templates/README.md)
- [Parser Options](../../parsers/README.md)
- [Metastore Table Documentation](../../../metastore/README.md)