fabricks 3.0.19__py3-none-any.whl → 4.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/context.py +15 -3
- fabricks/api/notebooks/schedule.py +2 -3
- fabricks/api/parsers.py +2 -1
- fabricks/api/utils.py +3 -1
- fabricks/cdc/__init__.py +1 -2
- fabricks/cdc/base/__init__.py +1 -2
- fabricks/cdc/base/_types.py +5 -3
- fabricks/cdc/base/configurator.py +5 -0
- fabricks/cdc/base/generator.py +7 -3
- fabricks/cdc/base/merger.py +2 -0
- fabricks/cdc/base/processor.py +15 -0
- fabricks/cdc/templates/README.md +490 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
- fabricks/cdc/templates/queries/context.sql.jinja +104 -96
- fabricks/cdc/templates/query.sql.jinja +1 -1
- fabricks/context/__init__.py +13 -1
- fabricks/context/config.py +13 -122
- fabricks/context/log.py +92 -1
- fabricks/context/runtime.py +35 -69
- fabricks/context/spark_session.py +4 -4
- fabricks/context/utils.py +26 -39
- fabricks/core/__init__.py +2 -2
- fabricks/core/dags/base.py +5 -5
- fabricks/core/dags/processor.py +2 -3
- fabricks/core/extenders.py +1 -1
- fabricks/core/job_schema.py +26 -16
- fabricks/core/jobs/__init__.py +1 -7
- fabricks/core/jobs/base/README.md +1545 -0
- fabricks/core/jobs/base/__init__.py +1 -8
- fabricks/core/jobs/base/checker.py +7 -7
- fabricks/core/jobs/base/configurator.py +142 -63
- fabricks/core/jobs/base/generator.py +38 -34
- fabricks/core/jobs/base/invoker.py +48 -63
- fabricks/core/jobs/base/processor.py +13 -28
- fabricks/core/jobs/bronze.py +88 -38
- fabricks/core/jobs/get_job.py +3 -6
- fabricks/core/jobs/get_job_conf.py +19 -68
- fabricks/core/jobs/get_jobs.py +10 -11
- fabricks/core/jobs/get_schedules.py +3 -17
- fabricks/core/jobs/gold.py +89 -47
- fabricks/core/jobs/silver.py +42 -22
- fabricks/core/masks.py +11 -8
- fabricks/core/parsers/__init__.py +0 -2
- fabricks/core/parsers/base.py +10 -10
- fabricks/core/parsers/decorator.py +1 -1
- fabricks/core/parsers/get_parser.py +4 -5
- fabricks/core/schedules/process.py +1 -4
- fabricks/core/steps/base.py +27 -17
- fabricks/core/steps/get_step.py +2 -4
- fabricks/core/steps/get_step_conf.py +3 -7
- fabricks/core/udfs.py +7 -7
- fabricks/core/views.py +2 -2
- fabricks/deploy/__init__.py +27 -16
- fabricks/deploy/masks.py +1 -1
- fabricks/deploy/notebooks.py +19 -16
- fabricks/deploy/schedules.py +1 -1
- fabricks/deploy/tables.py +66 -49
- fabricks/deploy/udfs.py +2 -2
- fabricks/deploy/views.py +15 -16
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/table.py +103 -68
- fabricks/models/__init__.py +125 -0
- fabricks/models/common.py +79 -0
- fabricks/models/config.py +225 -0
- fabricks/models/dependency.py +50 -0
- fabricks/models/job.py +157 -0
- fabricks/models/path.py +17 -0
- fabricks/models/runtime.py +182 -0
- fabricks/models/schedule.py +21 -0
- fabricks/models/step.py +103 -0
- fabricks/models/table.py +77 -0
- fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
- fabricks/utils/helpers.py +6 -5
- fabricks/utils/log.py +25 -6
- fabricks/utils/path.py +265 -108
- fabricks/utils/pip.py +7 -7
- fabricks/utils/read/read.py +23 -22
- fabricks/utils/read/read_yaml.py +2 -2
- fabricks/utils/write/delta.py +4 -4
- fabricks/utils/write/stream.py +2 -2
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/METADATA +9 -4
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/RECORD +86 -83
- fabricks/context/_types.py +0 -139
- fabricks/context/helpers.py +0 -63
- fabricks/core/jobs/base/_types.py +0 -284
- fabricks/core/parsers/_types.py +0 -6
- fabricks/utils/fdict.py +0 -240
- fabricks/utils/pydantic.py +0 -94
- fabricks/utils/schema/__init__.py +0 -7
- fabricks/utils/schema/get_json_schema_for_type.py +0 -161
- fabricks/utils/schema/get_schema_for_type.py +0 -99
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,1545 @@
|
|
|
1
|
+
# Fabricks Job Options Documentation
|
|
2
|
+
|
|
3
|
+
> **Comprehensive guide to configuring Bronze, Silver, and Gold tier jobs in Fabricks**
|
|
4
|
+
|
|
5
|
+
This document provides detailed documentation for all job configuration options in Fabricks. Jobs are organized into three tiers: Bronze, Silver, and Gold, each with their own specific options and behaviors.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## 📑 Table of Contents
|
|
10
|
+
|
|
11
|
+
- [📋 Overview](#-overview)
|
|
12
|
+
- [🏗️ Job Tiers](#️-job-tiers)
|
|
13
|
+
- [⚙️ Common Options](#️-common-options)
|
|
14
|
+
- [🥉 Bronze Options](#-bronze-options)
|
|
15
|
+
- [🥈 Silver Options](#-silver-options)
|
|
16
|
+
- [🥇 Gold Options](#-gold-options)
|
|
17
|
+
- [📊 Table Options](#-table-options)
|
|
18
|
+
- [✅ Check Options](#-check-options)
|
|
19
|
+
- [⚡ Spark Options](#-spark-options)
|
|
20
|
+
- [🔗 Invoker Options](#-invoker-options)
|
|
21
|
+
- [🔌 Extender Options](#-extender-options)
|
|
22
|
+
- [📝 Complete Configuration Examples](#-complete-configuration-examples)
|
|
23
|
+
- [💡 Key Concepts](#-key-concepts)
|
|
24
|
+
- [🎯 Best Practices](#-best-practices)
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## 📋 Overview
|
|
29
|
+
|
|
30
|
+
Fabricks uses a **tiered data processing architecture** with three layers:
|
|
31
|
+
|
|
32
|
+
| Tier | Purpose | Description |
|
|
33
|
+
|------|---------|-------------|
|
|
34
|
+
| **🥉 Bronze** | Raw Data Ingestion | Ingest raw data from external sources with minimal transformation |
|
|
35
|
+
| **🥈 Silver** | Data Cleaning & Validation | Clean, validate, deduplicate, and apply quality checks |
|
|
36
|
+
| **🥇 Gold** | Business Analytics | Create business-ready aggregated and transformed data |
|
|
37
|
+
|
|
38
|
+
Each tier has specific configuration options that control how data is processed, stored, and managed.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## 🏗️ Job Tiers
|
|
43
|
+
|
|
44
|
+
### Available Tiers
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
TBronze = Literal["bronze"] # Raw data ingestion layer
|
|
48
|
+
TSilver = Literal["silver"] # Cleaned and validated data layer
|
|
49
|
+
TGold = Literal["gold"] # Business-ready analytics layer
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## ⚙️ Common Options
|
|
55
|
+
|
|
56
|
+
> These options are available across **all tiers** (Bronze, Silver, and Gold).
|
|
57
|
+
|
|
58
|
+
<table>
|
|
59
|
+
<tr>
|
|
60
|
+
<td width="200"><strong>Option</strong></td>
|
|
61
|
+
<td><strong>Description</strong></td>
|
|
62
|
+
</tr>
|
|
63
|
+
|
|
64
|
+
<tr>
|
|
65
|
+
<td valign="top">
|
|
66
|
+
|
|
67
|
+
### `type`
|
|
68
|
+
|
|
69
|
+
**Type:** `Optional[Literal["manual", "default"]]`
|
|
70
|
+
**Default:** `"default"`
|
|
71
|
+
|
|
72
|
+
</td>
|
|
73
|
+
<td>
|
|
74
|
+
|
|
75
|
+
Specifies the job execution type:
|
|
76
|
+
- `"default"` - Standard automated job processing
|
|
77
|
+
- `"manual"` - Manual intervention required, job skips automatic execution
|
|
78
|
+
|
|
79
|
+
</td>
|
|
80
|
+
</tr>
|
|
81
|
+
|
|
82
|
+
<tr>
|
|
83
|
+
<td valign="top">
|
|
84
|
+
|
|
85
|
+
### `parents`
|
|
86
|
+
|
|
87
|
+
**Type:** `Optional[List[str]]`
|
|
88
|
+
**Default:** `None`
|
|
89
|
+
|
|
90
|
+
</td>
|
|
91
|
+
<td>
|
|
92
|
+
|
|
93
|
+
List of parent job names that this job depends on. The job will only execute after all parent jobs have completed successfully.
|
|
94
|
+
|
|
95
|
+
**Example:**
|
|
96
|
+
```python
|
|
97
|
+
parents=["bronze__customers", "bronze__orders"]
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
</td>
|
|
101
|
+
</tr>
|
|
102
|
+
|
|
103
|
+
<tr>
|
|
104
|
+
<td valign="top">
|
|
105
|
+
|
|
106
|
+
### `filter_where`
|
|
107
|
+
|
|
108
|
+
**Type:** `Optional[str]`
|
|
109
|
+
**Default:** `None`
|
|
110
|
+
|
|
111
|
+
</td>
|
|
112
|
+
<td>
|
|
113
|
+
|
|
114
|
+
SQL WHERE clause to filter data during processing. Applied to the source data before any transformations.
|
|
115
|
+
|
|
116
|
+
**Example:**
|
|
117
|
+
```python
|
|
118
|
+
filter_where="status = 'active' AND created_date >= '2024-01-01'"
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
</td>
|
|
122
|
+
</tr>
|
|
123
|
+
|
|
124
|
+
<tr>
|
|
125
|
+
<td valign="top">
|
|
126
|
+
|
|
127
|
+
### `optimize`
|
|
128
|
+
|
|
129
|
+
**Type:** `Optional[bool]`
|
|
130
|
+
**Default:** `False`
|
|
131
|
+
|
|
132
|
+
</td>
|
|
133
|
+
<td>
|
|
134
|
+
|
|
135
|
+
When `True`, runs the OPTIMIZE command on the target table after data loading to improve query performance by compacting small files.
|
|
136
|
+
|
|
137
|
+
</td>
|
|
138
|
+
</tr>
|
|
139
|
+
|
|
140
|
+
<tr>
|
|
141
|
+
<td valign="top">
|
|
142
|
+
|
|
143
|
+
### `compute_statistics`
|
|
144
|
+
|
|
145
|
+
**Type:** `Optional[bool]`
|
|
146
|
+
**Default:** `False`
|
|
147
|
+
|
|
148
|
+
</td>
|
|
149
|
+
<td>
|
|
150
|
+
|
|
151
|
+
When `True`, computes table statistics after data loading to help the query optimizer make better decisions.
|
|
152
|
+
|
|
153
|
+
</td>
|
|
154
|
+
</tr>
|
|
155
|
+
|
|
156
|
+
<tr>
|
|
157
|
+
<td valign="top">
|
|
158
|
+
|
|
159
|
+
### `vacuum`
|
|
160
|
+
|
|
161
|
+
**Type:** `Optional[bool]`
|
|
162
|
+
**Default:** `False`
|
|
163
|
+
|
|
164
|
+
</td>
|
|
165
|
+
<td>
|
|
166
|
+
|
|
167
|
+
When `True`, runs the VACUUM command to remove old data files that are no longer referenced by the table (typically files older than the retention period).
|
|
168
|
+
|
|
169
|
+
</td>
|
|
170
|
+
</tr>
|
|
171
|
+
|
|
172
|
+
<tr>
|
|
173
|
+
<td valign="top">
|
|
174
|
+
|
|
175
|
+
### `no_drop`
|
|
176
|
+
|
|
177
|
+
**Type:** `Optional[bool]`
|
|
178
|
+
**Default:** `False`
|
|
179
|
+
|
|
180
|
+
</td>
|
|
181
|
+
<td>
|
|
182
|
+
|
|
183
|
+
When `True`, prevents the table from being dropped during job execution, even if the job configuration would normally trigger a drop operation.
|
|
184
|
+
|
|
185
|
+
</td>
|
|
186
|
+
</tr>
|
|
187
|
+
|
|
188
|
+
<tr>
|
|
189
|
+
<td valign="top">
|
|
190
|
+
|
|
191
|
+
### `timeout`
|
|
192
|
+
|
|
193
|
+
**Type:** `Optional[int]`
|
|
194
|
+
**Default:** `None`
|
|
195
|
+
|
|
196
|
+
</td>
|
|
197
|
+
<td>
|
|
198
|
+
|
|
199
|
+
Maximum execution time in seconds for the job. If the job exceeds this time, it will be terminated.
|
|
200
|
+
|
|
201
|
+
</td>
|
|
202
|
+
</tr>
|
|
203
|
+
|
|
204
|
+
</table>
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## 🥉 Bronze Options
|
|
209
|
+
|
|
210
|
+
> **Bronze tier**: Raw data ingestion from external sources
|
|
211
|
+
|
|
212
|
+
Bronze tier is responsible for ingesting raw data from external sources with minimal transformation. It focuses on capturing data exactly as it arrives.
|
|
213
|
+
|
|
214
|
+
### 🔧 Bronze-Specific Options
|
|
215
|
+
|
|
216
|
+
<details open>
|
|
217
|
+
<summary><h4>📌 <code>mode</code> (Required)</h4></summary>
|
|
218
|
+
|
|
219
|
+
**Type:** `Literal["memory", "append", "register"]`
|
|
220
|
+
|
|
221
|
+
Defines how data is loaded into the bronze table:
|
|
222
|
+
|
|
223
|
+
| Mode | Description | Use Case |
|
|
224
|
+
|------|-------------|----------|
|
|
225
|
+
| `"memory"` | Load data into memory only, don't persist | Temporary or test data |
|
|
226
|
+
| `"append"` | Append new data without checking duplicates | Raw data ingestion |
|
|
227
|
+
| `"register"` | Register external table without moving data | External data sources |
|
|
228
|
+
|
|
229
|
+
</details>
|
|
230
|
+
|
|
231
|
+
<details open>
|
|
232
|
+
<summary><h4>📌 <code>uri</code> (Required)</h4></summary>
|
|
233
|
+
|
|
234
|
+
**Type:** `str`
|
|
235
|
+
|
|
236
|
+
URI or path to the source data. Supports multiple formats:
|
|
237
|
+
|
|
238
|
+
- **File path**: `"/mnt/data/customers/*.json"`
|
|
239
|
+
- **URL**: `"https://api.example.com/data"`
|
|
240
|
+
- **Cloud storage**: `"s3://bucket/path/to/data"`
|
|
241
|
+
- **Database connection string**
|
|
242
|
+
|
|
243
|
+
**Example:**
|
|
244
|
+
```python
|
|
245
|
+
uri="/mnt/raw/customers/2024/*.parquet"
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
</details>
|
|
249
|
+
|
|
250
|
+
<details open>
|
|
251
|
+
<summary><h4>📌 <code>parser</code> (Required)</h4></summary>
|
|
252
|
+
|
|
253
|
+
**Type:** `str`
|
|
254
|
+
|
|
255
|
+
Name of the parser to use for reading and transforming the source data. Parsers define how to interpret the source format.
|
|
256
|
+
|
|
257
|
+
**Example:**
|
|
258
|
+
```python
|
|
259
|
+
parser="json_customer_parser"
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
</details>
|
|
263
|
+
|
|
264
|
+
<details open>
|
|
265
|
+
<summary><h4>📌 <code>source</code> (Required)</h4></summary>
|
|
266
|
+
|
|
267
|
+
**Type:** `str`
|
|
268
|
+
|
|
269
|
+
Name or identifier of the source system providing the data.
|
|
270
|
+
|
|
271
|
+
**Example:**
|
|
272
|
+
```python
|
|
273
|
+
source="salesforce_api"
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
</details>
|
|
277
|
+
|
|
278
|
+
<details open>
|
|
279
|
+
<summary><h4>📌 <code>keys</code></h4></summary>
|
|
280
|
+
|
|
281
|
+
**Type:** `Optional[List[str]]` | **Default:** `None`
|
|
282
|
+
|
|
283
|
+
List of column names that uniquely identify a record. Used for deduplication and change tracking.
|
|
284
|
+
|
|
285
|
+
**Example:**
|
|
286
|
+
```python
|
|
287
|
+
keys=["customer_id"]
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
</details>
|
|
291
|
+
|
|
292
|
+
<details open>
|
|
293
|
+
<summary><h4>📌 <code>encrypted_columns</code></h4></summary>
|
|
294
|
+
|
|
295
|
+
**Type:** `Optional[List[str]]` | **Default:** `None`
|
|
296
|
+
|
|
297
|
+
List of column names containing encrypted data that should be decrypted during ingestion.
|
|
298
|
+
|
|
299
|
+
**Example:**
|
|
300
|
+
```python
|
|
301
|
+
encrypted_columns=["ssn", "credit_card"]
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
</details>
|
|
305
|
+
|
|
306
|
+
<details open>
|
|
307
|
+
<summary><h4>📌 <code>calculated_columns</code></h4></summary>
|
|
308
|
+
|
|
309
|
+
**Type:** `Optional[dict[str, str]]` | **Default:** `None`
|
|
310
|
+
|
|
311
|
+
Dictionary mapping new column names to SQL expressions for calculating derived values.
|
|
312
|
+
|
|
313
|
+
**Example:**
|
|
314
|
+
```python
|
|
315
|
+
calculated_columns={
|
|
316
|
+
"full_name": "concat(first_name, ' ', last_name)",
|
|
317
|
+
"age": "year(current_date()) - year(birth_date)"
|
|
318
|
+
}
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
</details>
|
|
322
|
+
|
|
323
|
+
<details open>
|
|
324
|
+
<summary><h4>📌 <code>operation</code></h4></summary>
|
|
325
|
+
|
|
326
|
+
**Type:** `Optional[Literal["upsert", "reload", "delete"]]` | **Default:** `None`
|
|
327
|
+
|
|
328
|
+
Specifies the operation type for CDC (Change Data Capture):
|
|
329
|
+
|
|
330
|
+
| Operation | Description |
|
|
331
|
+
|-----------|-------------|
|
|
332
|
+
| `"upsert"` | Insert new records or update existing ones |
|
|
333
|
+
| `"reload"` | Full reload of all data (marks timestamp for downstream) |
|
|
334
|
+
| `"delete"` | Mark records as deleted |
|
|
335
|
+
|
|
336
|
+
</details>
|
|
337
|
+
|
|
338
|
+
### 📦 Complete Bronze Configuration Example
|
|
339
|
+
|
|
340
|
+
<details>
|
|
341
|
+
<summary>Click to expand full Bronze configuration example</summary>
|
|
342
|
+
|
|
343
|
+
```python
|
|
344
|
+
BronzeOptions(
|
|
345
|
+
# Required fields
|
|
346
|
+
mode="append",
|
|
347
|
+
uri="/mnt/data/customers/*.json",
|
|
348
|
+
parser="customer_json_parser",
|
|
349
|
+
source="crm_system",
|
|
350
|
+
|
|
351
|
+
# Identification
|
|
352
|
+
keys=["customer_id"],
|
|
353
|
+
|
|
354
|
+
# Common options
|
|
355
|
+
type="default",
|
|
356
|
+
parents=["bronze__raw_sources"],
|
|
357
|
+
filter_where="status IS NOT NULL",
|
|
358
|
+
optimize=True,
|
|
359
|
+
compute_statistics=True,
|
|
360
|
+
vacuum=False,
|
|
361
|
+
no_drop=False,
|
|
362
|
+
timeout=3600,
|
|
363
|
+
|
|
364
|
+
# Bronze-specific
|
|
365
|
+
encrypted_columns=["ssn"],
|
|
366
|
+
calculated_columns={"full_name": "concat(first_name, ' ', last_name)"},
|
|
367
|
+
operation="upsert"
|
|
368
|
+
)
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
</details>
|
|
372
|
+
|
|
373
|
+
---
|
|
374
|
+
|
|
375
|
+
## 🥈 Silver Options
|
|
376
|
+
|
|
377
|
+
> **Silver tier**: Cleaned and validated data with quality checks and change tracking
|
|
378
|
+
|
|
379
|
+
Silver tier processes bronze data with quality checks, deduplication, and change data capture strategies.
|
|
380
|
+
|
|
381
|
+
### 🔧 Silver-Specific Options
|
|
382
|
+
|
|
383
|
+
<details open>
|
|
384
|
+
<summary><h4>📌 <code>mode</code> (Required)</h4></summary>
|
|
385
|
+
|
|
386
|
+
**Type:** `Literal["memory", "append", "latest", "update", "combine"]`
|
|
387
|
+
|
|
388
|
+
Defines how data is processed and loaded:
|
|
389
|
+
|
|
390
|
+
| Mode | Description | Use Case |
|
|
391
|
+
|------|-------------|----------|
|
|
392
|
+
| `"memory"` | Process in memory only without persisting | Testing transformations |
|
|
393
|
+
| `"append"` | Append all new data without checking existing | Accumulate all data |
|
|
394
|
+
| `"latest"` | Process only the most recent data | Latest snapshot processing |
|
|
395
|
+
| `"update"` | Incremental updates, only new/changed records | Efficient incremental loads |
|
|
396
|
+
| `"combine"` | Combine multiple sources into single table | Multi-source consolidation |
|
|
397
|
+
|
|
398
|
+
</details>
|
|
399
|
+
|
|
400
|
+
<details open>
|
|
401
|
+
<summary><h4>📌 <code>change_data_capture</code> (Required)</h4></summary>
|
|
402
|
+
|
|
403
|
+
**Type:** `Literal["nocdc", "scd1", "scd2"]`
|
|
404
|
+
|
|
405
|
+
Change Data Capture strategy for tracking changes:
|
|
406
|
+
|
|
407
|
+
| Strategy | Description | History Tracking |
|
|
408
|
+
|----------|-------------|------------------|
|
|
409
|
+
| `"nocdc"` | No change tracking, simple updates/inserts | ❌ None |
|
|
410
|
+
| `"scd1"` | Slowly Changing Dimension Type 1 - Overwrite | ⚠️ Current state only |
|
|
411
|
+
| `"scd2"` | Slowly Changing Dimension Type 2 - Versioning | ✅ Full history |
|
|
412
|
+
|
|
413
|
+
</details>
|
|
414
|
+
|
|
415
|
+
<details open>
|
|
416
|
+
<summary><h4>📌 <code>deduplicate</code></h4></summary>
|
|
417
|
+
|
|
418
|
+
**Type:** `Optional[bool]` | **Default:** `False`
|
|
419
|
+
|
|
420
|
+
When `True`, removes duplicate records based on keys and hash values. Keeps the most recent record for each key.
|
|
421
|
+
|
|
422
|
+
</details>
|
|
423
|
+
|
|
424
|
+
<details open>
|
|
425
|
+
<summary><h4>📌 <code>stream</code></h4></summary>
|
|
426
|
+
|
|
427
|
+
**Type:** `Optional[bool]` | **Default:** `False`
|
|
428
|
+
|
|
429
|
+
When `True`, processes data using Spark Structured Streaming for real-time data processing.
|
|
430
|
+
|
|
431
|
+
</details>
|
|
432
|
+
|
|
433
|
+
<details open>
|
|
434
|
+
<summary><h4>📌 <code>order_duplicate_by</code></h4></summary>
|
|
435
|
+
|
|
436
|
+
**Type:** `Optional[dict[str, str]]` | **Default:** `None`
|
|
437
|
+
|
|
438
|
+
Dictionary specifying columns and sort order for determining which duplicate record to keep.
|
|
439
|
+
|
|
440
|
+
**Example:**
|
|
441
|
+
```python
|
|
442
|
+
order_duplicate_by={
|
|
443
|
+
"updated_at": "desc",
|
|
444
|
+
"priority": "desc"
|
|
445
|
+
}
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
</details>
|
|
449
|
+
|
|
450
|
+
### 📦 Complete Silver Configuration Example
|
|
451
|
+
|
|
452
|
+
<details>
|
|
453
|
+
<summary>Click to expand full Silver configuration example</summary>
|
|
454
|
+
|
|
455
|
+
```python
|
|
456
|
+
SilverOptions(
|
|
457
|
+
# Required fields
|
|
458
|
+
mode="update",
|
|
459
|
+
change_data_capture="scd2",
|
|
460
|
+
|
|
461
|
+
# Common options
|
|
462
|
+
type="default",
|
|
463
|
+
parents=["bronze__customers"],
|
|
464
|
+
filter_where="quality_score > 0.8",
|
|
465
|
+
optimize=True,
|
|
466
|
+
compute_statistics=True,
|
|
467
|
+
vacuum=True,
|
|
468
|
+
no_drop=False,
|
|
469
|
+
timeout=7200,
|
|
470
|
+
|
|
471
|
+
# Silver-specific
|
|
472
|
+
deduplicate=True,
|
|
473
|
+
stream=False,
|
|
474
|
+
order_duplicate_by={"updated_at": "desc"}
|
|
475
|
+
)
|
|
476
|
+
```
|
|
477
|
+
|
|
478
|
+
</details>
|
|
479
|
+
|
|
480
|
+
---
|
|
481
|
+
|
|
482
|
+
## 🥇 Gold Options
|
|
483
|
+
|
|
484
|
+
> **Gold tier**: Business-ready analytics with aggregations and ML enrichment
|
|
485
|
+
|
|
486
|
+
Gold tier creates business-ready data with aggregations, transformations, and analytics-optimized structures.
|
|
487
|
+
|
|
488
|
+
### 🔧 Gold-Specific Options
|
|
489
|
+
|
|
490
|
+
#### mode (Required)
|
|
491
|
+
**Type:** `Literal["memory", "append", "complete", "update", "invoke"]`
|
|
492
|
+
|
|
493
|
+
Defines how data is processed and loaded:
|
|
494
|
+
|
|
495
|
+
- **`"memory"`**: Process in memory only
|
|
496
|
+
- **`"append"`**: Append new calculated results
|
|
497
|
+
- **`"complete"`**: Complete rebuild of the entire table
|
|
498
|
+
- **`"update"`**: Incremental updates to existing data
|
|
499
|
+
- **`"invoke"`**: Execute external notebook or process
|
|
500
|
+
|
|
501
|
+
#### change_data_capture (Required)
|
|
502
|
+
**Type:** `Literal["nocdc", "scd1", "scd2"]`
|
|
503
|
+
|
|
504
|
+
Same as Silver tier - defines the CDC strategy.
|
|
505
|
+
|
|
506
|
+
#### update_where
|
|
507
|
+
**Type:** `Optional[str]`
|
|
508
|
+
**Default:** `None`
|
|
509
|
+
|
|
510
|
+
SQL WHERE clause to filter which records should be updated. Only applicable in `"update"` mode.
|
|
511
|
+
|
|
512
|
+
**Example:**
|
|
513
|
+
```python
|
|
514
|
+
update_where="last_modified >= current_date() - INTERVAL 7 DAYS"
|
|
515
|
+
```
|
|
516
|
+
|
|
517
|
+
#### deduplicate
|
|
518
|
+
**Type:** `Optional[bool]`
|
|
519
|
+
**Default:** `False`
|
|
520
|
+
|
|
521
|
+
When `True`, removes duplicate records based on keys and hash values, keeping the most recent record for each key.
|
|
522
|
+
|
|
523
|
+
#### rectify_as_upserts
|
|
524
|
+
**Type:** `Optional[bool]`
|
|
525
|
+
**Default:** `False`
|
|
526
|
+
|
|
527
|
+
When `True`, converts reload operations into individual upsert and delete operations. This ensures historical consistency by generating synthetic delete records for items that disappear between reloads.
|
|
528
|
+
|
|
529
|
+
**Use Case:** Handle scenarios where records are deleted between full reloads without explicit delete operations.
|
|
530
|
+
|
|
531
|
+
#### correct_valid_from
|
|
532
|
+
**Type:** `Optional[bool]`
|
|
533
|
+
**Default:** `False`
|
|
534
|
+
|
|
535
|
+
When `True` and using SCD2, sets the `__valid_from` timestamp of the earliest record to `'1900-01-01'` instead of the actual first timestamp.
|
|
536
|
+
|
|
537
|
+
**Use Case:** Standardize historical record start dates for reporting purposes.
|
|
538
|
+
|
|
539
|
+
#### persist_last_timestamp
|
|
540
|
+
**Type:** `Optional[bool]`
|
|
541
|
+
**Default:** `False`
|
|
542
|
+
|
|
543
|
+
When `True`, persists the maximum timestamp from the processed data to be used as a watermark for the next incremental run.
|
|
544
|
+
|
|
545
|
+
#### persist_last_updated_timestamp
|
|
546
|
+
**Type:** `Optional[bool]`
|
|
547
|
+
**Default:** `False`
|
|
548
|
+
|
|
549
|
+
When `True`, persists the maximum `__last_updated` timestamp for incremental processing tracking.
|
|
550
|
+
|
|
551
|
+
#### table
|
|
552
|
+
**Type:** `Optional[str]`
|
|
553
|
+
**Default:** `None`
|
|
554
|
+
|
|
555
|
+
Overrides the default target table name with a custom name.
|
|
556
|
+
|
|
557
|
+
**Example:**
|
|
558
|
+
```python
|
|
559
|
+
table="custom_schema.custom_table_name"
|
|
560
|
+
```
|
|
561
|
+
|
|
562
|
+
#### notebook
|
|
563
|
+
**Type:** `Optional[bool]`
|
|
564
|
+
**Default:** `False`
|
|
565
|
+
|
|
566
|
+
When `True`, generates a notebook for this job that can be executed independently.
|
|
567
|
+
|
|
568
|
+
#### requirements
|
|
569
|
+
**Type:** `Optional[bool]`
|
|
570
|
+
**Default:** `False`
|
|
571
|
+
|
|
572
|
+
When `True`, generates a requirements file for dependencies needed by this job.
|
|
573
|
+
|
|
574
|
+
#### metadata
|
|
575
|
+
**Type:** `Optional[bool]`
|
|
576
|
+
**Default:** `False`
|
|
577
|
+
|
|
578
|
+
When `True`, adds or updates metadata tracking columns (`__metadata.inserted`, `__metadata.updated`).
|
|
579
|
+
|
|
580
|
+
#### last_updated
|
|
581
|
+
**Type:** `Optional[bool]`
|
|
582
|
+
**Default:** `False`
|
|
583
|
+
|
|
584
|
+
When `True`, adds or updates the `__last_updated` timestamp column.
|
|
585
|
+
|
|
586
|
+
### Complete Gold Configuration
|
|
587
|
+
|
|
588
|
+
```python
|
|
589
|
+
GoldOptions(
|
|
590
|
+
type="default",
|
|
591
|
+
mode="update",
|
|
592
|
+
change_data_capture="scd2",
|
|
593
|
+
update_where="updated_at > (SELECT max(last_run) FROM control_table)",
|
|
594
|
+
parents=["silver__customers", "silver__orders"],
|
|
595
|
+
optimize=True,
|
|
596
|
+
compute_statistics=True,
|
|
597
|
+
vacuum=True,
|
|
598
|
+
no_drop=False,
|
|
599
|
+
deduplicate=True,
|
|
600
|
+
rectify_as_upserts=True,
|
|
601
|
+
correct_valid_from=True,
|
|
602
|
+
persist_last_timestamp=True,
|
|
603
|
+
persist_last_updated_timestamp=True,
|
|
604
|
+
table="analytics.customer_360",
|
|
605
|
+
notebook=False,
|
|
606
|
+
requirements=False,
|
|
607
|
+
timeout=10800,
|
|
608
|
+
metadata=True,
|
|
609
|
+
last_updated=True
|
|
610
|
+
)
|
|
611
|
+
```
|
|
612
|
+
|
|
613
|
+
---
|
|
614
|
+
|
|
615
|
+
## Table Options
|
|
616
|
+
|
|
617
|
+
Table options control the physical table structure and optimization features.
|
|
618
|
+
|
|
619
|
+
### identity
|
|
620
|
+
**Type:** `Optional[bool]`
|
|
621
|
+
**Default:** `None`
|
|
622
|
+
|
|
623
|
+
When `True`, adds an auto-incrementing identity column to the table.
|
|
624
|
+
|
|
625
|
+
### liquid_clustering
|
|
626
|
+
**Type:** `Optional[bool]`
|
|
627
|
+
**Default:** `None`
|
|
628
|
+
|
|
629
|
+
When `True`, enables Databricks liquid clustering for improved query performance with automatic optimization.
|
|
630
|
+
|
|
631
|
+
### partition_by
|
|
632
|
+
**Type:** `Optional[List[str]]`
|
|
633
|
+
**Default:** `None`
|
|
634
|
+
|
|
635
|
+
List of columns to partition the table by. Partitioning physically organizes data for faster queries on partition columns.
|
|
636
|
+
|
|
637
|
+
**Example:**
|
|
638
|
+
```python
|
|
639
|
+
partition_by=["year", "month"]
|
|
640
|
+
```
|
|
641
|
+
|
|
642
|
+
**Use Case:** Date-based partitioning for time-series data.
|
|
643
|
+
|
|
644
|
+
### zorder_by
|
|
645
|
+
**Type:** `Optional[List[str]]`
|
|
646
|
+
**Default:** `None`
|
|
647
|
+
|
|
648
|
+
List of columns to Z-order (multi-dimensional clustering). Improves query performance for columns frequently used in filters.
|
|
649
|
+
|
|
650
|
+
**Example:**
|
|
651
|
+
```python
|
|
652
|
+
zorder_by=["customer_id", "product_id"]
|
|
653
|
+
```
|
|
654
|
+
|
|
655
|
+
### cluster_by
|
|
656
|
+
**Type:** `Optional[List[str]]`
|
|
657
|
+
**Default:** `None`
|
|
658
|
+
|
|
659
|
+
List of columns to cluster by. Alternative to liquid clustering for organizing data.
|
|
660
|
+
|
|
661
|
+
**Example:**
|
|
662
|
+
```python
|
|
663
|
+
cluster_by=["region", "category"]
|
|
664
|
+
```
|
|
665
|
+
|
|
666
|
+
### powerbi
|
|
667
|
+
**Type:** `Optional[bool]`
|
|
668
|
+
**Default:** `None`
|
|
669
|
+
|
|
670
|
+
When `True`, optimizes table settings for Power BI connectivity and performance.
|
|
671
|
+
|
|
672
|
+
### maximum_compatibility
|
|
673
|
+
**Type:** `Optional[bool]`
|
|
674
|
+
**Default:** `None`
|
|
675
|
+
|
|
676
|
+
When `True`, creates tables with maximum compatibility settings for older Spark/Delta versions.
|
|
677
|
+
|
|
678
|
+
### bloomfilter_by
|
|
679
|
+
**Type:** `Optional[List[str]]`
|
|
680
|
+
**Default:** `None`
|
|
681
|
+
|
|
682
|
+
List of columns to create bloom filters on for faster equality lookups.
|
|
683
|
+
|
|
684
|
+
**Example:**
|
|
685
|
+
```python
|
|
686
|
+
bloomfilter_by=["email", "phone_number"]
|
|
687
|
+
```
|
|
688
|
+
|
|
689
|
+
### constraints
|
|
690
|
+
**Type:** `Optional[dict[str, str]]`
|
|
691
|
+
**Default:** `None`
|
|
692
|
+
|
|
693
|
+
Dictionary mapping constraint names to SQL constraint expressions.
|
|
694
|
+
|
|
695
|
+
**Example:**
|
|
696
|
+
```python
|
|
697
|
+
constraints={
|
|
698
|
+
"valid_email": "email LIKE '%@%.%'",
|
|
699
|
+
"positive_amount": "amount > 0"
|
|
700
|
+
}
|
|
701
|
+
```
|
|
702
|
+
|
|
703
|
+
### properties
|
|
704
|
+
**Type:** `Optional[dict[str, str]]`
|
|
705
|
+
**Default:** `None`
|
|
706
|
+
|
|
707
|
+
Dictionary of custom table properties as key-value pairs.
|
|
708
|
+
|
|
709
|
+
**Example:**
|
|
710
|
+
```python
|
|
711
|
+
properties={
|
|
712
|
+
"owner": "data_team",
|
|
713
|
+
"data_classification": "confidential"
|
|
714
|
+
}
|
|
715
|
+
```
|
|
716
|
+
|
|
717
|
+
### comment
|
|
718
|
+
**Type:** `Optional[str]`
|
|
719
|
+
**Default:** `None`
|
|
720
|
+
|
|
721
|
+
Description or comment for the table.
|
|
722
|
+
|
|
723
|
+
**Example:**
|
|
724
|
+
```python
|
|
725
|
+
comment="Customer master data with full history tracking"
|
|
726
|
+
```
|
|
727
|
+
|
|
728
|
+
### calculated_columns
|
|
729
|
+
**Type:** `Optional[dict[str, str]]`
|
|
730
|
+
**Default:** `None`
|
|
731
|
+
|
|
732
|
+
Dictionary mapping column names to SQL expressions for computed/generated columns.
|
|
733
|
+
|
|
734
|
+
**Example:**
|
|
735
|
+
```python
|
|
736
|
+
calculated_columns={
|
|
737
|
+
"full_address": "concat(street, ', ', city, ', ', state)",
|
|
738
|
+
"is_premium": "CASE WHEN tier = 'premium' THEN true ELSE false END"
|
|
739
|
+
}
|
|
740
|
+
```
|
|
741
|
+
|
|
742
|
+
### masks
|
|
743
|
+
**Type:** `Optional[dict[str, str]]`
|
|
744
|
+
**Default:** `None`
|
|
745
|
+
|
|
746
|
+
Dictionary mapping column names to masking expressions for data privacy/security.
|
|
747
|
+
|
|
748
|
+
**Example:**
|
|
749
|
+
```python
|
|
750
|
+
masks={
|
|
751
|
+
"ssn": "concat('***-**-', right(ssn, 4))",
|
|
752
|
+
"credit_card": "concat('****-****-****-', right(credit_card, 4))"
|
|
753
|
+
}
|
|
754
|
+
```
|
|
755
|
+
|
|
756
|
+
### comments
|
|
757
|
+
**Type:** `Optional[dict[str, str]]`
|
|
758
|
+
**Default:** `None`
|
|
759
|
+
|
|
760
|
+
Dictionary mapping column names to their descriptions.
|
|
761
|
+
|
|
762
|
+
**Example:**
|
|
763
|
+
```python
|
|
764
|
+
comments={
|
|
765
|
+
"customer_id": "Unique identifier for each customer",
|
|
766
|
+
"lifetime_value": "Total revenue generated by customer"
|
|
767
|
+
}
|
|
768
|
+
```
|
|
769
|
+
|
|
770
|
+
### retention_days
|
|
771
|
+
**Type:** `Optional[int]`
|
|
772
|
+
**Default:** `None`
|
|
773
|
+
|
|
774
|
+
Number of days to retain old versions of data before they can be vacuumed.
|
|
775
|
+
|
|
776
|
+
**Example:**
|
|
777
|
+
```python
|
|
778
|
+
retention_days=90
|
|
779
|
+
```
|
|
780
|
+
|
|
781
|
+
### primary_key
|
|
782
|
+
**Type:** `Optional[dict[str, PrimaryKey]]`
|
|
783
|
+
**Default:** `None`
|
|
784
|
+
|
|
785
|
+
Dictionary defining primary key constraint with name and configuration.
|
|
786
|
+
|
|
787
|
+
**Example:**
|
|
788
|
+
```python
|
|
789
|
+
primary_key={
|
|
790
|
+
"pk_customers": {
|
|
791
|
+
"keys": ["customer_id"],
|
|
792
|
+
"options": {"constraint": "not enforced"}
|
|
793
|
+
}
|
|
794
|
+
}
|
|
795
|
+
```
|
|
796
|
+
|
|
797
|
+
### foreign_keys
|
|
798
|
+
**Type:** `Optional[dict[str, ForeignKey]]`
|
|
799
|
+
**Default:** `None`
|
|
800
|
+
|
|
801
|
+
Dictionary defining foreign key constraints with names and configurations.
|
|
802
|
+
|
|
803
|
+
**Example:**
|
|
804
|
+
```python
|
|
805
|
+
foreign_keys={
|
|
806
|
+
"fk_orders_customer": {
|
|
807
|
+
"keys": ["customer_id"],
|
|
808
|
+
"reference": "customers",
|
|
809
|
+
"options": {
|
|
810
|
+
"foreign_key": "on delete no action",
|
|
811
|
+
"constraint": "not enforced"
|
|
812
|
+
}
|
|
813
|
+
}
|
|
814
|
+
}
|
|
815
|
+
```
|
|
816
|
+
|
|
817
|
+
### Complete Table Options Example
|
|
818
|
+
|
|
819
|
+
```python
|
|
820
|
+
TableOptions(
|
|
821
|
+
identity=False,
|
|
822
|
+
liquid_clustering=True,
|
|
823
|
+
partition_by=["year", "month"],
|
|
824
|
+
zorder_by=["customer_id"],
|
|
825
|
+
cluster_by=None,
|
|
826
|
+
powerbi=True,
|
|
827
|
+
maximum_compatibility=False,
|
|
828
|
+
bloomfilter_by=["email"],
|
|
829
|
+
constraints={
|
|
830
|
+
"valid_email": "email LIKE '%@%.%'",
|
|
831
|
+
"positive_balance": "balance >= 0"
|
|
832
|
+
},
|
|
833
|
+
properties={
|
|
834
|
+
"owner": "analytics_team",
|
|
835
|
+
"pii": "true"
|
|
836
|
+
},
|
|
837
|
+
comment="Customer dimension table with full SCD2 history",
|
|
838
|
+
calculated_columns={
|
|
839
|
+
"age": "year(current_date()) - year(birth_date)"
|
|
840
|
+
},
|
|
841
|
+
masks={
|
|
842
|
+
"ssn": "concat('***-**-', right(ssn, 4))"
|
|
843
|
+
},
|
|
844
|
+
comments={
|
|
845
|
+
"customer_id": "Primary key - unique customer identifier",
|
|
846
|
+
"balance": "Current account balance"
|
|
847
|
+
},
|
|
848
|
+
retention_days=90,
|
|
849
|
+
primary_key={
|
|
850
|
+
"pk_customer": {
|
|
851
|
+
"keys": ["customer_id"],
|
|
852
|
+
"options": {"constraint": "not enforced"}
|
|
853
|
+
}
|
|
854
|
+
},
|
|
855
|
+
foreign_keys={
|
|
856
|
+
"fk_country": {
|
|
857
|
+
"keys": ["country_code"],
|
|
858
|
+
"reference": "dim_countries",
|
|
859
|
+
"options": {
|
|
860
|
+
"foreign_key": "on delete no action",
|
|
861
|
+
"constraint": "not enforced"
|
|
862
|
+
}
|
|
863
|
+
}
|
|
864
|
+
}
|
|
865
|
+
)
|
|
866
|
+
```

> **Note:** This example sets every field only to illustrate the available options. On Delta tables, liquid clustering is generally mutually exclusive with `partition_by` and `zorder_by` (pair `liquid_clustering=True` with `cluster_by` instead, as in the Silver and Gold examples below) — confirm against your runtime before combining them.
|
|
867
|
+
|
|
868
|
+
---
|
|
869
|
+
|
|
870
|
+
## Check Options
|
|
871
|
+
|
|
872
|
+
Data quality and validation checks that run before or after job execution.
|
|
873
|
+
|
|
874
|
+
### skip
|
|
875
|
+
**Type:** `Optional[bool]`
|
|
876
|
+
**Default:** `False`
|
|
877
|
+
|
|
878
|
+
When `True`, skips all data quality checks for this job.
|
|
879
|
+
|
|
880
|
+
### pre_run
|
|
881
|
+
**Type:** `Optional[bool]`
|
|
882
|
+
**Default:** `False`
|
|
883
|
+
|
|
884
|
+
When `True`, runs data quality checks before job execution. Job fails if checks don't pass.
|
|
885
|
+
|
|
886
|
+
### post_run
|
|
887
|
+
**Type:** `Optional[bool]`
|
|
888
|
+
**Default:** `False`
|
|
889
|
+
|
|
890
|
+
When `True`, runs data quality checks after job execution. Job fails if checks don't pass.
|
|
891
|
+
|
|
892
|
+
### min_rows
|
|
893
|
+
**Type:** `Optional[int]`
|
|
894
|
+
**Default:** `None`
|
|
895
|
+
|
|
896
|
+
Minimum number of rows expected in the result. Check fails if row count is below this threshold.
|
|
897
|
+
|
|
898
|
+
**Example:**
|
|
899
|
+
```python
|
|
900
|
+
min_rows=1000
|
|
901
|
+
```
|
|
902
|
+
|
|
903
|
+
### max_rows
|
|
904
|
+
**Type:** `Optional[int]`
|
|
905
|
+
**Default:** `None`
|
|
906
|
+
|
|
907
|
+
Maximum number of rows expected in the result. Check fails if row count exceeds this threshold.
|
|
908
|
+
|
|
909
|
+
**Example:**
|
|
910
|
+
```python
|
|
911
|
+
max_rows=1000000
|
|
912
|
+
```
|
|
913
|
+
|
|
914
|
+
### count_must_equal
|
|
915
|
+
**Type:** `Optional[str]`
|
|
916
|
+
**Default:** `None`
|
|
917
|
+
|
|
918
|
+
SQL expression or table name to compare row counts. Check fails if counts don't match.
|
|
919
|
+
|
|
920
|
+
**Example:**
|
|
921
|
+
```python
|
|
922
|
+
count_must_equal="bronze__source_table"
|
|
923
|
+
```
|
|
924
|
+
|
|
925
|
+
### Complete Check Options Example
|
|
926
|
+
|
|
927
|
+
```python
|
|
928
|
+
CheckOptions(
|
|
929
|
+
skip=False,
|
|
930
|
+
pre_run=True,
|
|
931
|
+
post_run=True,
|
|
932
|
+
min_rows=100,
|
|
933
|
+
max_rows=10000000,
|
|
934
|
+
count_must_equal="bronze__raw_customers"
|
|
935
|
+
)
|
|
936
|
+
```
|
|
937
|
+
|
|
938
|
+
---
|
|
939
|
+
|
|
940
|
+
## Spark Options
|
|
941
|
+
|
|
942
|
+
Spark configuration for the job execution.
|
|
943
|
+
|
|
944
|
+
### sql
|
|
945
|
+
**Type:** `Optional[dict[str, str]]`
|
|
946
|
+
**Default:** `None`
|
|
947
|
+
|
|
948
|
+
Dictionary of Spark SQL configuration parameters.
|
|
949
|
+
|
|
950
|
+
**Example:**
|
|
951
|
+
```python
|
|
952
|
+
sql={
|
|
953
|
+
"spark.sql.adaptive.enabled": "true",
|
|
954
|
+
"spark.sql.adaptive.coalescePartitions.enabled": "true"
|
|
955
|
+
}
|
|
956
|
+
```
|
|
957
|
+
|
|
958
|
+
### conf
|
|
959
|
+
**Type:** `Optional[dict[str, str]]`
|
|
960
|
+
**Default:** `None`
|
|
961
|
+
|
|
962
|
+
Dictionary of general Spark configuration parameters.
|
|
963
|
+
|
|
964
|
+
**Example:**
|
|
965
|
+
```python
|
|
966
|
+
conf={
|
|
967
|
+
"spark.executor.memory": "8g",
|
|
968
|
+
"spark.executor.cores": "4",
|
|
969
|
+
"spark.dynamicAllocation.enabled": "true"
|
|
970
|
+
}
|
|
971
|
+
```
|
|
972
|
+
|
|
973
|
+
### Complete Spark Options Example
|
|
974
|
+
|
|
975
|
+
```python
|
|
976
|
+
SparkOptions(
|
|
977
|
+
sql={
|
|
978
|
+
"spark.sql.adaptive.enabled": "true",
|
|
979
|
+
"spark.sql.shuffle.partitions": "200",
|
|
980
|
+
"spark.sql.autoBroadcastJoinThreshold": "10485760"
|
|
981
|
+
},
|
|
982
|
+
conf={
|
|
983
|
+
"spark.executor.memory": "8g",
|
|
984
|
+
"spark.executor.cores": "4",
|
|
985
|
+
"spark.executor.instances": "10"
|
|
986
|
+
}
|
|
987
|
+
)
|
|
988
|
+
```
|
|
989
|
+
|
|
990
|
+
---
|
|
991
|
+
|
|
992
|
+
## Invoker Options
|
|
993
|
+
|
|
994
|
+
Options for invoking external notebooks at different stages of job execution.
|
|
995
|
+
|
|
996
|
+
### Structure
|
|
997
|
+
|
|
998
|
+
```python
|
|
999
|
+
class _InvokeOptions(TypedDict):
|
|
1000
|
+
notebook: str # Path to notebook to invoke
|
|
1001
|
+
timeout: int # Timeout in seconds
|
|
1002
|
+
arguments: Optional[dict[str, str]] # Arguments to pass to notebook
|
|
1003
|
+
```
|
|
1004
|
+
|
|
1005
|
+
### pre_run
|
|
1006
|
+
**Type:** `Optional[List[_InvokeOptions]]`
|
|
1007
|
+
**Default:** `None`
|
|
1008
|
+
|
|
1009
|
+
List of notebooks to execute before the main job runs.
|
|
1010
|
+
|
|
1011
|
+
### run
|
|
1012
|
+
**Type:** `Optional[List[_InvokeOptions]]`
|
|
1013
|
+
**Default:** `None`
|
|
1014
|
+
|
|
1015
|
+
List of notebooks to execute as the main job (replaces default job logic).
|
|
1016
|
+
|
|
1017
|
+
### post_run
|
|
1018
|
+
**Type:** `Optional[List[_InvokeOptions]]`
|
|
1019
|
+
**Default:** `None`
|
|
1020
|
+
|
|
1021
|
+
List of notebooks to execute after the main job completes.
|
|
1022
|
+
|
|
1023
|
+
### Complete Invoker Options Example
|
|
1024
|
+
|
|
1025
|
+
```python
|
|
1026
|
+
InvokerOptions(
|
|
1027
|
+
pre_run=[
|
|
1028
|
+
{
|
|
1029
|
+
"notebook": "/Notebooks/setup/validate_sources",
|
|
1030
|
+
"timeout": 300,
|
|
1031
|
+
"arguments": {"check_level": "strict"}
|
|
1032
|
+
}
|
|
1033
|
+
],
|
|
1034
|
+
run=None,
|
|
1035
|
+
post_run=[
|
|
1036
|
+
{
|
|
1037
|
+
"notebook": "/Notebooks/post/send_notification",
|
|
1038
|
+
"timeout": 60,
|
|
1039
|
+
"arguments": {
|
|
1040
|
+
"recipients": "team@example.com",
|
|
1041
|
+
"status": "success"
|
|
1042
|
+
}
|
|
1043
|
+
},
|
|
1044
|
+
{
|
|
1045
|
+
"notebook": "/Notebooks/post/update_dashboard",
|
|
1046
|
+
"timeout": 120,
|
|
1047
|
+
"arguments": {"dashboard_id": "main"}
|
|
1048
|
+
}
|
|
1049
|
+
]
|
|
1050
|
+
)
|
|
1051
|
+
```
|
|
1052
|
+
|
|
1053
|
+
---
|
|
1054
|
+
|
|
1055
|
+
## Extender Options
|
|
1056
|
+
|
|
1057
|
+
Options for extending job functionality with custom logic.
|
|
1058
|
+
|
|
1059
|
+
### extender (Required)
|
|
1060
|
+
**Type:** `str`
|
|
1061
|
+
|
|
1062
|
+
Name or path of the extender class to use.
|
|
1063
|
+
|
|
1064
|
+
**Example:**
|
|
1065
|
+
```python
|
|
1066
|
+
extender="CustomDataProcessor"
|
|
1067
|
+
```
|
|
1068
|
+
|
|
1069
|
+
### arguments
|
|
1070
|
+
**Type:** `Optional[dict[str, str]]`
|
|
1071
|
+
**Default:** `None`
|
|
1072
|
+
|
|
1073
|
+
Dictionary of arguments to pass to the extender.
|
|
1074
|
+
|
|
1075
|
+
**Example:**
|
|
1076
|
+
```python
|
|
1077
|
+
arguments={
|
|
1078
|
+
"transformation": "advanced",
|
|
1079
|
+
"output_format": "parquet"
|
|
1080
|
+
}
|
|
1081
|
+
```
|
|
1082
|
+
|
|
1083
|
+
### Complete Extender Options Example
|
|
1084
|
+
|
|
1085
|
+
```python
|
|
1086
|
+
ExtenderOptions(
|
|
1087
|
+
extender="ml.FeatureEngineeringExtender",
|
|
1088
|
+
arguments={
|
|
1089
|
+
"feature_set": "customer_features_v2",
|
|
1090
|
+
"normalize": "true",
|
|
1091
|
+
"handle_missing": "impute"
|
|
1092
|
+
}
|
|
1093
|
+
)
|
|
1094
|
+
```
|
|
1095
|
+
|
|
1096
|
+
Multiple extenders can be chained:
|
|
1097
|
+
|
|
1098
|
+
```python
|
|
1099
|
+
extender_options=[
|
|
1100
|
+
{
|
|
1101
|
+
"extender": "DataValidationExtender",
|
|
1102
|
+
"arguments": {"strict": "true"}
|
|
1103
|
+
},
|
|
1104
|
+
{
|
|
1105
|
+
"extender": "FeatureEngineeringExtender",
|
|
1106
|
+
"arguments": {"feature_set": "v2"}
|
|
1107
|
+
},
|
|
1108
|
+
{
|
|
1109
|
+
"extender": "MLScoringExtender",
|
|
1110
|
+
"arguments": {"model": "customer_churn_v3"}
|
|
1111
|
+
}
|
|
1112
|
+
]
|
|
1113
|
+
```
|
|
1114
|
+
|
|
1115
|
+
---
|
|
1116
|
+
|
|
1117
|
+
## Complete Configuration Examples
|
|
1118
|
+
|
|
1119
|
+
### Bronze Job: Ingest Customer Data
|
|
1120
|
+
|
|
1121
|
+
```python
|
|
1122
|
+
JobConfBronze(
|
|
1123
|
+
job_id="bronze__crm_customers",
|
|
1124
|
+
topic="customers",
|
|
1125
|
+
item="raw_data",
|
|
1126
|
+
step="bronze",
|
|
1127
|
+
options=BronzeOptions(
|
|
1128
|
+
type="default",
|
|
1129
|
+
mode="append",
|
|
1130
|
+
uri="s3://data-lake/raw/crm/customers/*.json",
|
|
1131
|
+
parser="crm_customer_parser",
|
|
1132
|
+
source="salesforce_crm",
|
|
1133
|
+
keys=["customer_id"],
|
|
1134
|
+
parents=None,
|
|
1135
|
+
filter_where="status != 'test'",
|
|
1136
|
+
optimize=True,
|
|
1137
|
+
compute_statistics=True,
|
|
1138
|
+
vacuum=False,
|
|
1139
|
+
no_drop=False,
|
|
1140
|
+
encrypted_columns=["ssn", "credit_card"],
|
|
1141
|
+
calculated_columns={
|
|
1142
|
+
"full_name": "concat(first_name, ' ', last_name)",
|
|
1143
|
+
"account_age_days": "datediff(current_date(), created_date)"
|
|
1144
|
+
},
|
|
1145
|
+
operation="upsert",
|
|
1146
|
+
timeout=3600
|
|
1147
|
+
),
|
|
1148
|
+
table_options=TableOptions(
|
|
1149
|
+
partition_by=["ingestion_date"],
|
|
1150
|
+
comment="Raw customer data from CRM system",
|
|
1151
|
+
retention_days=30
|
|
1152
|
+
),
|
|
1153
|
+
check_options=CheckOptions(
|
|
1154
|
+
skip=False,
|
|
1155
|
+
post_run=True,
|
|
1156
|
+
min_rows=1
|
|
1157
|
+
),
|
|
1158
|
+
spark_options=SparkOptions(
|
|
1159
|
+
conf={
|
|
1160
|
+
"spark.executor.memory": "4g",
|
|
1161
|
+
"spark.executor.cores": "2"
|
|
1162
|
+
}
|
|
1163
|
+
),
|
|
1164
|
+
tags=["pii", "critical", "daily"],
|
|
1165
|
+
comment="Daily customer data ingestion from Salesforce CRM"
|
|
1166
|
+
)
|
|
1167
|
+
```
|
|
1168
|
+
|
|
1169
|
+
### Silver Job: Clean and Deduplicate Customers
|
|
1170
|
+
|
|
1171
|
+
```python
|
|
1172
|
+
JobConfSilver(
|
|
1173
|
+
job_id="silver__customers",
|
|
1174
|
+
topic="customers",
|
|
1175
|
+
item="cleaned",
|
|
1176
|
+
step="silver",
|
|
1177
|
+
options=SilverOptions(
|
|
1178
|
+
type="default",
|
|
1179
|
+
mode="update",
|
|
1180
|
+
change_data_capture="scd2",
|
|
1181
|
+
parents=["bronze__crm_customers"],
|
|
1182
|
+
filter_where="email IS NOT NULL AND email LIKE '%@%.%'",
|
|
1183
|
+
optimize=True,
|
|
1184
|
+
compute_statistics=True,
|
|
1185
|
+
vacuum=True,
|
|
1186
|
+
no_drop=False,
|
|
1187
|
+
deduplicate=True,
|
|
1188
|
+
stream=False,
|
|
1189
|
+
order_duplicate_by={"updated_at": "desc", "source_priority": "asc"},
|
|
1190
|
+
timeout=7200
|
|
1191
|
+
),
|
|
1192
|
+
table_options=TableOptions(
|
|
1193
|
+
liquid_clustering=True,
|
|
1194
|
+
cluster_by=["customer_id", "country_code"],
|
|
1195
|
+
bloomfilter_by=["email", "phone"],
|
|
1196
|
+
constraints={
|
|
1197
|
+
"valid_email": "email LIKE '%@%.%'",
|
|
1198
|
+
"valid_country": "country_code IN ('US', 'CA', 'UK', 'DE')"
|
|
1199
|
+
},
|
|
1200
|
+
comments={
|
|
1201
|
+
"customer_id": "Unique customer identifier",
|
|
1202
|
+
"email": "Primary contact email",
|
|
1203
|
+
"phone": "Primary phone number"
|
|
1204
|
+
},
|
|
1205
|
+
comment="Cleaned customer master data with SCD2 history",
|
|
1206
|
+
retention_days=90
|
|
1207
|
+
),
|
|
1208
|
+
check_options=CheckOptions(
|
|
1209
|
+
pre_run=True,
|
|
1210
|
+
post_run=True,
|
|
1211
|
+
min_rows=1000,
|
|
1212
|
+
count_must_equal="bronze__crm_customers"
|
|
1213
|
+
),
|
|
1214
|
+
spark_options=SparkOptions(
|
|
1215
|
+
sql={
|
|
1216
|
+
"spark.sql.adaptive.enabled": "true",
|
|
1217
|
+
"spark.sql.adaptive.coalescePartitions.enabled": "true"
|
|
1218
|
+
}
|
|
1219
|
+
),
|
|
1220
|
+
invoker_options=InvokerOptions(
|
|
1221
|
+
post_run=[
|
|
1222
|
+
{
|
|
1223
|
+
"notebook": "/Quality/DataQualityReport",
|
|
1224
|
+
"timeout": 300,
|
|
1225
|
+
"arguments": {"table": "silver__customers"}
|
|
1226
|
+
}
|
|
1227
|
+
]
|
|
1228
|
+
),
|
|
1229
|
+
tags=["master_data", "scd2", "pii"],
|
|
1230
|
+
comment="Silver layer customer data with deduplication and validation"
|
|
1231
|
+
)
|
|
1232
|
+
```
|
|
1233
|
+
|
|
1234
|
+
### Gold Job: Customer 360 View
|
|
1235
|
+
|
|
1236
|
+
```python
|
|
1237
|
+
JobConfGold(
|
|
1238
|
+
job_id="gold__customer_360",
|
|
1239
|
+
topic="analytics",
|
|
1240
|
+
item="customer_360",
|
|
1241
|
+
step="gold",
|
|
1242
|
+
options=GoldOptions(
|
|
1243
|
+
type="default",
|
|
1244
|
+
mode="update",
|
|
1245
|
+
change_data_capture="scd1",
|
|
1246
|
+
update_where="__last_updated >= current_date() - INTERVAL 7 DAYS",
|
|
1247
|
+
parents=["silver__customers", "silver__orders", "silver__interactions"],
|
|
1248
|
+
optimize=True,
|
|
1249
|
+
compute_statistics=True,
|
|
1250
|
+
vacuum=True,
|
|
1251
|
+
no_drop=False,
|
|
1252
|
+
deduplicate=True,
|
|
1253
|
+
rectify_as_upserts=True,
|
|
1254
|
+
correct_valid_from=False,
|
|
1255
|
+
persist_last_timestamp=True,
|
|
1256
|
+
persist_last_updated_timestamp=True,
|
|
1257
|
+
table="analytics.customer_360_view",
|
|
1258
|
+
notebook=False,
|
|
1259
|
+
requirements=False,
|
|
1260
|
+
timeout=10800,
|
|
1261
|
+
metadata=True,
|
|
1262
|
+
last_updated=True
|
|
1263
|
+
),
|
|
1264
|
+
table_options=TableOptions(
|
|
1265
|
+
liquid_clustering=True,
|
|
1266
|
+
cluster_by=["customer_segment", "region"],
|
|
1267
|
+
zorder_by=["customer_id"],
|
|
1268
|
+
powerbi=True,
|
|
1269
|
+
bloomfilter_by=["customer_id"],
|
|
1270
|
+
calculated_columns={
|
|
1271
|
+
"customer_lifetime_value": "total_revenue - total_costs",
|
|
1272
|
+
"churn_risk_category": """
|
|
1273
|
+
CASE
|
|
1274
|
+
WHEN churn_score > 0.8 THEN 'High'
|
|
1275
|
+
WHEN churn_score > 0.5 THEN 'Medium'
|
|
1276
|
+
ELSE 'Low'
|
|
1277
|
+
END
|
|
1278
|
+
"""
|
|
1279
|
+
},
|
|
1280
|
+
properties={
|
|
1281
|
+
"owner": "analytics_team",
|
|
1282
|
+
"refresh_frequency": "daily",
|
|
1283
|
+
"data_classification": "internal"
|
|
1284
|
+
},
|
|
1285
|
+
comment="Complete 360-degree view of customer data for analytics and reporting",
|
|
1286
|
+
retention_days=180
|
|
1287
|
+
),
|
|
1288
|
+
check_options=CheckOptions(
|
|
1289
|
+
post_run=True,
|
|
1290
|
+
min_rows=100,
|
|
1291
|
+
max_rows=100000000
|
|
1292
|
+
),
|
|
1293
|
+
spark_options=SparkOptions(
|
|
1294
|
+
sql={
|
|
1295
|
+
"spark.sql.adaptive.enabled": "true",
|
|
1296
|
+
"spark.sql.shuffle.partitions": "400"
|
|
1297
|
+
},
|
|
1298
|
+
conf={
|
|
1299
|
+
"spark.executor.memory": "16g",
|
|
1300
|
+
"spark.executor.cores": "8",
|
|
1301
|
+
"spark.dynamicAllocation.enabled": "true",
|
|
1302
|
+
"spark.dynamicAllocation.maxExecutors": "50"
|
|
1303
|
+
}
|
|
1304
|
+
),
|
|
1305
|
+
invoker_options=InvokerOptions(
|
|
1306
|
+
post_run=[
|
|
1307
|
+
{
|
|
1308
|
+
"notebook": "/Analytics/RefreshDashboards",
|
|
1309
|
+
"timeout": 600,
|
|
1310
|
+
"arguments": {
|
|
1311
|
+
"dashboard_list": "customer_360,executive_summary"
|
|
1312
|
+
}
|
|
1313
|
+
},
|
|
1314
|
+
{
|
|
1315
|
+
"notebook": "/Notifications/SendSuccessEmail",
|
|
1316
|
+
"timeout": 60,
|
|
1317
|
+
"arguments": {
|
|
1318
|
+
"recipients": "analytics-team@company.com",
|
|
1319
|
+
"job": "customer_360"
|
|
1320
|
+
}
|
|
1321
|
+
}
|
|
1322
|
+
]
|
|
1323
|
+
),
|
|
1324
|
+
extender_options=[
|
|
1325
|
+
{
|
|
1326
|
+
"extender": "MLScoringExtender",
|
|
1327
|
+
"arguments": {
|
|
1328
|
+
"model": "customer_churn_model_v2",
|
|
1329
|
+
"score_column": "churn_score"
|
|
1330
|
+
}
|
|
1331
|
+
},
|
|
1332
|
+
{
|
|
1333
|
+
"extender": "SegmentationExtender",
|
|
1334
|
+
"arguments": {
|
|
1335
|
+
"algorithm": "kmeans",
|
|
1336
|
+
"n_segments": "5"
|
|
1337
|
+
}
|
|
1338
|
+
}
|
|
1339
|
+
],
|
|
1340
|
+
tags=["analytics", "customer", "ml_enriched", "daily"],
|
|
1341
|
+
comment="Gold layer customer 360 view with ML scoring and segmentation"
|
|
1342
|
+
)
|
|
1343
|
+
```
|
|
1344
|
+
|
|
1345
|
+
---
|
|
1346
|
+
|
|
1347
|
+
## 💡 Key Concepts
|
|
1348
|
+
|
|
1349
|
+
### 🔄 Change Data Capture (CDC) Strategies
|
|
1350
|
+
|
|
1351
|
+
Understanding CDC strategies is crucial for choosing the right data tracking approach:
|
|
1352
|
+
|
|
1353
|
+
<table>
|
|
1354
|
+
<tr>
|
|
1355
|
+
<th width="150">Strategy</th>
|
|
1356
|
+
<th>Description</th>
|
|
1357
|
+
<th>History</th>
|
|
1358
|
+
<th>Use Case</th>
|
|
1359
|
+
</tr>
|
|
1360
|
+
|
|
1361
|
+
<tr>
|
|
1362
|
+
<td><strong>nocdc</strong></td>
|
|
1363
|
+
<td>No change tracking. Simple insert/update/delete operations without history.</td>
|
|
1364
|
+
<td align="center">❌</td>
|
|
1365
|
+
<td>Lookup tables, configuration data, or data where history isn't needed</td>
|
|
1366
|
+
</tr>
|
|
1367
|
+
|
|
1368
|
+
<tr>
|
|
1369
|
+
<td><strong>scd1</strong><br/><em>(Type 1)</em></td>
|
|
1370
|
+
<td>Overwrites existing records with new values. Maintains only current state with flags for deleted records.</td>
|
|
1371
|
+
<td align="center">⚠️ Current</td>
|
|
1372
|
+
<td>Master data where only current values matter (current address, current status)</td>
|
|
1373
|
+
</tr>
|
|
1374
|
+
|
|
1375
|
+
<tr>
|
|
1376
|
+
<td><strong>scd2</strong><br/><em>(Type 2)</em></td>
|
|
1377
|
+
<td>Maintains complete history with versioning. Each change creates a new version with validity dates.</td>
|
|
1378
|
+
<td align="center">✅ Full</td>
|
|
1379
|
+
<td>Data requiring full audit trail (customer history, product changes over time)</td>
|
|
1380
|
+
</tr>
|
|
1381
|
+
|
|
1382
|
+
</table>
|
|
1383
|
+
|
|
1384
|
+
### 🎭 Processing Modes by Tier
|
|
1385
|
+
|
|
1386
|
+
<table>
|
|
1387
|
+
<tr>
|
|
1388
|
+
<th width="120">Tier</th>
|
|
1389
|
+
<th width="120">Mode</th>
|
|
1390
|
+
<th>Description</th>
|
|
1391
|
+
</tr>
|
|
1392
|
+
|
|
1393
|
+
<tr>
|
|
1394
|
+
<td rowspan="3"><strong>🥉 Bronze</strong></td>
|
|
1395
|
+
<td><code>memory</code></td>
|
|
1396
|
+
<td>Testing and temporary data</td>
|
|
1397
|
+
</tr>
|
|
1398
|
+
<tr>
|
|
1399
|
+
<td><code>append</code></td>
|
|
1400
|
+
<td>Raw data ingestion without deduplication</td>
|
|
1401
|
+
</tr>
|
|
1402
|
+
<tr>
|
|
1403
|
+
<td><code>register</code></td>
|
|
1404
|
+
<td>External table registration</td>
|
|
1405
|
+
</tr>
|
|
1406
|
+
|
|
1407
|
+
<tr>
|
|
1408
|
+
<td rowspan="5"><strong>🥈 Silver</strong></td>
|
|
1409
|
+
<td><code>memory</code></td>
|
|
1410
|
+
<td>Testing transformations</td>
|
|
1411
|
+
</tr>
|
|
1412
|
+
<tr>
|
|
1413
|
+
<td><code>append</code></td>
|
|
1414
|
+
<td>Accumulate all data</td>
|
|
1415
|
+
</tr>
|
|
1416
|
+
<tr>
|
|
1417
|
+
<td><code>latest</code></td>
|
|
1418
|
+
<td>Process only latest snapshot</td>
|
|
1419
|
+
</tr>
|
|
1420
|
+
<tr>
|
|
1421
|
+
<td><code>update</code></td>
|
|
1422
|
+
<td>Incremental processing</td>
|
|
1423
|
+
</tr>
|
|
1424
|
+
<tr>
|
|
1425
|
+
<td><code>combine</code></td>
|
|
1426
|
+
<td>Merge multiple sources</td>
|
|
1427
|
+
</tr>
|
|
1428
|
+
|
|
1429
|
+
<tr>
|
|
1430
|
+
<td rowspan="5"><strong>🥇 Gold</strong></td>
|
|
1431
|
+
<td><code>memory</code></td>
|
|
1432
|
+
<td>Testing analytics</td>
|
|
1433
|
+
</tr>
|
|
1434
|
+
<tr>
|
|
1435
|
+
<td><code>append</code></td>
|
|
1436
|
+
<td>Accumulate metrics/aggregations</td>
|
|
1437
|
+
</tr>
|
|
1438
|
+
<tr>
|
|
1439
|
+
<td><code>complete</code></td>
|
|
1440
|
+
<td>Full rebuild</td>
|
|
1441
|
+
</tr>
|
|
1442
|
+
<tr>
|
|
1443
|
+
<td><code>update</code></td>
|
|
1444
|
+
<td>Incremental updates</td>
|
|
1445
|
+
</tr>
|
|
1446
|
+
<tr>
|
|
1447
|
+
<td><code>invoke</code></td>
|
|
1448
|
+
<td>Custom notebook execution</td>
|
|
1449
|
+
</tr>
|
|
1450
|
+
|
|
1451
|
+
</table>
|
|
1452
|
+
|
|
1453
|
+
### 🔗 Parent Dependencies
|
|
1454
|
+
|
|
1455
|
+
Jobs can depend on other jobs via the `parents` option. The dependency system ensures:
|
|
1456
|
+
|
|
1457
|
+
✅ **Correct execution order** - Parents complete before children run
|
|
1458
|
+
✅ **No circular dependencies** - System validates dependency graph
|
|
1459
|
+
✅ **Parallel execution** - Independent jobs run concurrently
|
|
1460
|
+
✅ **Automatic retry** - Failed dependencies trigger retries
|
|
1461
|
+
|
|
1462
|
+
**Example:**
|
|
1463
|
+
```python
|
|
1464
|
+
# This job waits for both bronze jobs to complete
|
|
1465
|
+
parents=["bronze__customers", "bronze__orders"]
|
|
1466
|
+
```
|
|
1467
|
+
|
|
1468
|
+
### ✅ Data Quality Checks
|
|
1469
|
+
|
|
1470
|
+
Quality checks can run at different stages:
|
|
1471
|
+
|
|
1472
|
+
| Stage | Description | Failure Behavior |
|
|
1473
|
+
|-------|-------------|------------------|
|
|
1474
|
+
| **pre_run** | Validate input before processing | Job fails before execution |
|
|
1475
|
+
| **post_run** | Validate output after processing | Job fails after execution |
|
|
1476
|
+
|
|
1477
|
+
**Check Types:**
|
|
1478
|
+
- ✅ **Row counts** - Validate min/max row thresholds
|
|
1479
|
+
- ✅ **Data equality** - Compare counts with other tables
|
|
1480
|
+
- ✅ **Custom assertions** - SQL constraints and validations
|
|
1481
|
+
|
|
1482
|
+
---
|
|
1483
|
+
|
|
1484
|
+
## 🎯 Best Practices
|
|
1485
|
+
|
|
1486
|
+
### 🥉 Bronze Layer
|
|
1487
|
+
|
|
1488
|
+
| ✅ Best Practice | 💡 Recommendation | 📝 Rationale |
|
|
1489
|
+
|-----------------|-------------------|--------------|
|
|
1490
|
+
| **Mode Selection** | Use `mode="append"` | Capture all raw data without loss |
|
|
1491
|
+
| **CDC Operation** | Set `operation="upsert"` or `"reload"` | Enable change tracking for downstream processing |
|
|
1492
|
+
| **Performance** | Enable `optimize=True` for large datasets | Compact small files, improve query speed |
|
|
1493
|
+
| **Security** | Use `encrypted_columns` for sensitive fields | Protect PII during ingestion |
|
|
1494
|
+
| **Retention** | Keep short (`retention_days=30`) | Bronze is transient; Silver is source of truth |
|
|
1495
|
+
|
|
1496
|
+
### 🥈 Silver Layer
|
|
1497
|
+
|
|
1498
|
+
| ✅ Best Practice | 💡 Recommendation | 📝 Rationale |
|
|
1499
|
+
|-----------------|-------------------|--------------|
|
|
1500
|
+
| **CDC Strategy** | Always use `scd1` or `scd2` | Systematic change tracking required |
|
|
1501
|
+
| **Data Quality** | Enable `deduplicate=True` | Remove duplicates for clean data |
|
|
1502
|
+
| **Processing Mode** | Use `mode="update"` | Efficient incremental processing |
|
|
1503
|
+
| **Validation** | Configure `check_options` | Enforce data quality standards |
|
|
1504
|
+
| **Retention** | Use longer period (`retention_days=90`) | Silver is the authoritative source |
|
|
1505
|
+
|
|
1506
|
+
### 🥇 Gold Layer
|
|
1507
|
+
|
|
1508
|
+
| ✅ Best Practice | 💡 Recommendation | 📝 Rationale |
|
|
1509
|
+
|-----------------|-------------------|--------------|
|
|
1510
|
+
| **CDC Strategy** | Choose based on business needs | Balance history requirements vs performance |
|
|
1511
|
+
| **Rectification** | Enable `rectify_as_upserts=True` | Ensure historical data consistency |
|
|
1512
|
+
| **Watermarking** | Use `persist_last_timestamp=True` | Enable efficient incremental loads |
|
|
1513
|
+
| **Audit Tracking** | Enable `metadata=True` + `last_updated=True` | Support troubleshooting and audits |
|
|
1514
|
+
| **BI Optimization** | Set `powerbi=True` for analytics workloads | Optimize for business intelligence tools |
|
|
1515
|
+
| **Enrichment** | Use extenders for ML scoring | Add predictive intelligence |
|
|
1516
|
+
|
|
1517
|
+
### ⚡ Performance Optimization
|
|
1518
|
+
|
|
1519
|
+
| 🚀 Technique | 💡 Implementation | 🎯 Use Case |
|
|
1520
|
+
|-------------|-------------------|-------------|
|
|
1521
|
+
| **Liquid Clustering** | `liquid_clustering=True` | Modern Delta tables with auto-optimization |
|
|
1522
|
+
| **Partitioning** | `partition_by=["year", "month"]` | Time-series data and date-based queries |
|
|
1523
|
+
| **Z-Ordering** | `zorder_by=["customer_id"]` | High-cardinality columns in WHERE clauses |
|
|
1524
|
+
| **Bloom Filters** | `bloomfilter_by=["email"]` | Fast equality lookups on strings |
|
|
1525
|
+
| **Spark Tuning** | Configure `spark_options` | Match cluster resources to workload |
|
|
1526
|
+
|
|
1527
|
+
> **💡 Pro Tip:** Liquid clustering is self-optimizing and recommended for new tables over manual partitioning.
|
|
1528
|
+
|
|
1529
|
+
### 🔒 Data Governance
|
|
1530
|
+
|
|
1531
|
+
| 📋 Area | 💡 Recommendation | ✅ Benefit |
|
|
1532
|
+
|---------|-------------------|------------|
|
|
1533
|
+
| **Documentation** | Add detailed `comments` and `comment` | Enable knowledge sharing and understanding |
|
|
1534
|
+
| **Validation** | Define `constraints` for business rules | Enforce data quality at write-time |
|
|
1535
|
+
| **Metadata** | Set `properties` for classification | Improve discoverability and compliance |
|
|
1536
|
+
| **Privacy** | Use `masks` for PII fields | Protect sensitive data, meet regulations |
|
|
1537
|
+
| **Organization** | Apply meaningful `tags` | Enable filtering, cost tracking, ownership |
|
|
1538
|
+
|
|
1539
|
+
---
|
|
1540
|
+
|
|
1541
|
+
## Related Documentation
|
|
1542
|
+
|
|
1543
|
+
- [CDC Templates Documentation](../cdc/templates/README.md)
|
|
1544
|
+
- [Parser Options](../parsers/README.md)
|
|
1545
|
+
- [Metastore Table Documentation](../../metastore/README.md)
|