fabricks 3.0.19__py3-none-any.whl → 4.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/context.py +15 -3
- fabricks/api/notebooks/schedule.py +2 -3
- fabricks/api/parsers.py +2 -1
- fabricks/api/utils.py +3 -1
- fabricks/cdc/__init__.py +1 -2
- fabricks/cdc/base/__init__.py +1 -2
- fabricks/cdc/base/_types.py +5 -3
- fabricks/cdc/base/configurator.py +5 -0
- fabricks/cdc/base/generator.py +7 -3
- fabricks/cdc/base/merger.py +2 -0
- fabricks/cdc/base/processor.py +15 -0
- fabricks/cdc/templates/README.md +490 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
- fabricks/cdc/templates/queries/context.sql.jinja +104 -96
- fabricks/cdc/templates/query.sql.jinja +1 -1
- fabricks/context/__init__.py +13 -1
- fabricks/context/config.py +13 -122
- fabricks/context/log.py +92 -1
- fabricks/context/runtime.py +35 -69
- fabricks/context/spark_session.py +4 -4
- fabricks/context/utils.py +26 -39
- fabricks/core/__init__.py +2 -2
- fabricks/core/dags/base.py +5 -5
- fabricks/core/dags/processor.py +2 -3
- fabricks/core/extenders.py +1 -1
- fabricks/core/job_schema.py +26 -16
- fabricks/core/jobs/__init__.py +1 -7
- fabricks/core/jobs/base/README.md +1545 -0
- fabricks/core/jobs/base/__init__.py +1 -8
- fabricks/core/jobs/base/checker.py +7 -7
- fabricks/core/jobs/base/configurator.py +142 -63
- fabricks/core/jobs/base/generator.py +38 -34
- fabricks/core/jobs/base/invoker.py +48 -63
- fabricks/core/jobs/base/processor.py +13 -28
- fabricks/core/jobs/bronze.py +88 -38
- fabricks/core/jobs/get_job.py +3 -6
- fabricks/core/jobs/get_job_conf.py +19 -68
- fabricks/core/jobs/get_jobs.py +10 -11
- fabricks/core/jobs/get_schedules.py +3 -17
- fabricks/core/jobs/gold.py +89 -47
- fabricks/core/jobs/silver.py +42 -22
- fabricks/core/masks.py +11 -8
- fabricks/core/parsers/__init__.py +0 -2
- fabricks/core/parsers/base.py +10 -10
- fabricks/core/parsers/decorator.py +1 -1
- fabricks/core/parsers/get_parser.py +4 -5
- fabricks/core/schedules/process.py +1 -4
- fabricks/core/steps/base.py +27 -17
- fabricks/core/steps/get_step.py +2 -4
- fabricks/core/steps/get_step_conf.py +3 -7
- fabricks/core/udfs.py +7 -7
- fabricks/core/views.py +2 -2
- fabricks/deploy/__init__.py +27 -16
- fabricks/deploy/masks.py +1 -1
- fabricks/deploy/notebooks.py +19 -16
- fabricks/deploy/schedules.py +1 -1
- fabricks/deploy/tables.py +66 -49
- fabricks/deploy/udfs.py +2 -2
- fabricks/deploy/views.py +15 -16
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/table.py +103 -68
- fabricks/models/__init__.py +125 -0
- fabricks/models/common.py +79 -0
- fabricks/models/config.py +225 -0
- fabricks/models/dependency.py +50 -0
- fabricks/models/job.py +157 -0
- fabricks/models/path.py +17 -0
- fabricks/models/runtime.py +182 -0
- fabricks/models/schedule.py +21 -0
- fabricks/models/step.py +103 -0
- fabricks/models/table.py +77 -0
- fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
- fabricks/utils/helpers.py +6 -5
- fabricks/utils/log.py +25 -6
- fabricks/utils/path.py +265 -108
- fabricks/utils/pip.py +7 -7
- fabricks/utils/read/read.py +23 -22
- fabricks/utils/read/read_yaml.py +2 -2
- fabricks/utils/write/delta.py +4 -4
- fabricks/utils/write/stream.py +2 -2
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/METADATA +9 -4
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/RECORD +86 -83
- fabricks/context/_types.py +0 -139
- fabricks/context/helpers.py +0 -63
- fabricks/core/jobs/base/_types.py +0 -284
- fabricks/core/parsers/_types.py +0 -6
- fabricks/utils/fdict.py +0 -240
- fabricks/utils/pydantic.py +0 -94
- fabricks/utils/schema/__init__.py +0 -7
- fabricks/utils/schema/get_json_schema_for_type.py +0 -161
- fabricks/utils/schema/get_schema_for_type.py +0 -99
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,490 @@
|
|
|
1
|
+
# CDC Templates Documentation
|
|
2
|
+
|
|
3
|
+
This directory contains Jinja2 SQL templates used by the Fabricks CDC (Change Data Capture) system to generate queries for handling data changes across different CDC strategies (NoCDC, SCD1, SCD2).
|
|
4
|
+
|
|
5
|
+
## Directory Structure
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
templates/
|
|
9
|
+
├── ctes/ # Common Table Expression templates
|
|
10
|
+
├── filters/ # Filter logic for slicing and updating data
|
|
11
|
+
├── macros/ # Reusable Jinja2 macros
|
|
12
|
+
├── merges/ # MERGE statement templates for each CDC type
|
|
13
|
+
├── queries/ # Query assembly templates for each CDC type
|
|
14
|
+
├── filter.sql.jinja # Main filter orchestration
|
|
15
|
+
├── merge.sql.jinja # Main merge orchestration
|
|
16
|
+
└── query.sql.jinja # Main query orchestration
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Main Templates
|
|
20
|
+
|
|
21
|
+
### query.sql.jinja
|
|
22
|
+
Main template that orchestrates the complete query generation process. It assembles various CTEs and query components based on the CDC type and configuration.
|
|
23
|
+
|
|
24
|
+
**Included Components:**
|
|
25
|
+
- Context information (via `queries/context.sql.jinja`)
|
|
26
|
+
- Base CTE (via `ctes/base.sql.jinja`)
|
|
27
|
+
- Optional slice CTE (via `ctes/slice.sql.jinja`)
|
|
28
|
+
- Optional deduplication CTEs (key and hash)
|
|
29
|
+
- Optional current state CTE (for update mode)
|
|
30
|
+
- Optional rectification CTE
|
|
31
|
+
- CDC-specific query logic (NoCDC, SCD1, or SCD2)
|
|
32
|
+
- Final output CTE
|
|
33
|
+
|
|
34
|
+
**Parameters:**
|
|
35
|
+
- `slice`: Type of slice filter ("update" or "latest")
|
|
36
|
+
- `deduplicate_key`: Enable key-based deduplication
|
|
37
|
+
- `deduplicate_hash`: Enable hash-based deduplication
|
|
38
|
+
- `mode`: Operation mode ("update" or "complete")
|
|
39
|
+
- `has_rows`: Whether target table has existing rows
|
|
40
|
+
- `rectify`: Enable data rectification
|
|
41
|
+
- `cdc`: CDC type ("nocdc", "scd1", or "scd2")
|
|
42
|
+
|
|
43
|
+
### merge.sql.jinja
|
|
44
|
+
Main template for generating MERGE statements to apply changes to target tables.
|
|
45
|
+
|
|
46
|
+
**Included Components:**
|
|
47
|
+
- `merges/scd1.sql.jinja` - For SCD Type 1 merges
|
|
48
|
+
- `merges/scd2.sql.jinja` - For SCD Type 2 merges
|
|
49
|
+
- `merges/nocdc.sql.jinja` - For tables without CDC
|
|
50
|
+
|
|
51
|
+
**Parameters:**
|
|
52
|
+
- `cdc`: CDC type determining which merge template to use
|
|
53
|
+
|
|
54
|
+
### filter.sql.jinja
|
|
55
|
+
Main template for generating filter queries to determine which data slices to process.
|
|
56
|
+
|
|
57
|
+
**Included Components:**
|
|
58
|
+
- Base CTE (via `ctes/base.sql.jinja`)
|
|
59
|
+
- Update filter (via `filters/update.sql.jinja`)
|
|
60
|
+
- Latest filter (via `filters/latest.sql.jinja`)
|
|
61
|
+
- Final aggregation (via `filters/final.sql.jinja`)
|
|
62
|
+
|
|
63
|
+
**Parameters:**
|
|
64
|
+
- `slice`: Type of slice ("update" or "latest")
|
|
65
|
+
|
|
66
|
+
## CTEs (Common Table Expressions)
|
|
67
|
+
|
|
68
|
+
### ctes/base.sql.jinja
|
|
69
|
+
Creates the base CTE that reads from the source and prepares the data with necessary transformations.
|
|
70
|
+
|
|
71
|
+
**Features:**
|
|
72
|
+
- Handles multiple source formats (query, table, global_temp_view, dataframe)
|
|
73
|
+
- Applies column casting
|
|
74
|
+
- Adds calculated columns
|
|
75
|
+
- Adds system columns (`__timestamp`, `__operation`, `__last_updated`, `__source`, `__hash`, `__key`, `__metadata`)
|
|
76
|
+
- Supports column overwriting
|
|
77
|
+
- Applies WHERE filter
|
|
78
|
+
|
|
79
|
+
**Parameters:**
|
|
80
|
+
- `format`: Source format type
|
|
81
|
+
- `src`: Source reference
|
|
82
|
+
- `cast`: Dictionary of columns to cast with target types
|
|
83
|
+
- `overwrite`: List of columns to overwrite
|
|
84
|
+
- `add_calculated_columns`: List of calculated column expressions
|
|
85
|
+
- `add_timestamp`, `add_operation`, `add_last_updated`, `add_source`, `add_hash`, `add_key`, `add_metadata`: Flags to add system columns
|
|
86
|
+
- `hashes`: List of columns to include in hash calculation
|
|
87
|
+
- `keys`: List of columns to include in key calculation
|
|
88
|
+
- `filter_where`: WHERE clause filter
|
|
89
|
+
|
|
90
|
+
### ctes/slice.sql.jinja
|
|
91
|
+
Filters data based on timestamp and source slices.
|
|
92
|
+
|
|
93
|
+
**Parameters:**
|
|
94
|
+
- `parent_slice`: Parent CTE name to slice from
|
|
95
|
+
- `slices`: Slice condition expression
|
|
96
|
+
- `has_source`: Whether source tracking is enabled
|
|
97
|
+
|
|
98
|
+
### ctes/deduplicate_key.sql.jinja
|
|
99
|
+
Removes duplicate records based on key and timestamp, keeping the most relevant record based on priority.
|
|
100
|
+
|
|
101
|
+
**Features:**
|
|
102
|
+
- Prioritizes delete operations over upserts
|
|
103
|
+
- Supports custom ordering for tie-breaking
|
|
104
|
+
- Advanced mode adds explicit row numbering; simple mode uses QUALIFY
|
|
105
|
+
|
|
106
|
+
**Parameters:**
|
|
107
|
+
- `parent_deduplicate_key`: Parent CTE to deduplicate
|
|
108
|
+
- `has_source`: Whether source tracking is enabled
|
|
109
|
+
- `advanced_deduplication`: Use advanced deduplication logic
|
|
110
|
+
- `has_order_by`: Whether custom ordering is specified
|
|
111
|
+
- `order_duplicate_by`: List of columns for ordering duplicates
|
|
112
|
+
|
|
113
|
+
### ctes/deduplicate_hash.sql.jinja
|
|
114
|
+
Removes consecutive duplicate records based on hash value changes.
|
|
115
|
+
|
|
116
|
+
**Features:**
|
|
117
|
+
- Detects when hash or operation changes from previous record
|
|
118
|
+
- Preserves only records where values differ from previous
|
|
119
|
+
- Advanced mode uses explicit LAG; simple mode uses QUALIFY
|
|
120
|
+
|
|
121
|
+
**Parameters:**
|
|
122
|
+
- `parent_deduplicate_hash`: Parent CTE to deduplicate
|
|
123
|
+
- `has_source`: Whether source tracking is enabled
|
|
124
|
+
- `advanced_deduplication`: Use advanced deduplication logic
|
|
125
|
+
|
|
126
|
+
### ctes/current.sql.jinja
|
|
127
|
+
Retrieves the current state from the target table for update operations.
|
|
128
|
+
|
|
129
|
+
**Features:**
|
|
130
|
+
- Handles different timestamp columns per CDC type (SCD2 uses __valid_from)
|
|
131
|
+
- Refreshes `__timestamp`, `__last_updated`, `__hash`, and `__key` as needed
|
|
132
|
+
- Filters by __is_current flag for SCD1/SCD2
|
|
133
|
+
- Applies source and update filters
|
|
134
|
+
|
|
135
|
+
**Parameters:**
|
|
136
|
+
- `intermediates`: List of intermediate columns to select
|
|
137
|
+
- `tgt`: Target table name
|
|
138
|
+
- `cdc`: CDC type
|
|
139
|
+
- `add_timestamp`, `add_last_updated`, `add_hash`, `add_key`: Flags for column refresh
|
|
140
|
+
- `has_no_data`: Whether the current rows are emitted with `__operation = 'delete'` instead of `'current'`
|
|
141
|
+
- `soft_delete`: Whether soft delete is enabled
|
|
142
|
+
- `sources`: Source filter condition
|
|
143
|
+
- `update_where`: Additional WHERE clause
|
|
144
|
+
|
|
145
|
+
### ctes/rectify.sql.jinja
|
|
146
|
+
Corrects historical data inconsistencies, particularly handling deleted records that reappear in subsequent reloads.
|
|
147
|
+
|
|
148
|
+
**Features:**
|
|
149
|
+
- Detects records deleted before reloads but present in later reloads
|
|
150
|
+
- Generates synthetic delete operations to maintain consistency
|
|
151
|
+
- Handles cross-reload data validation
|
|
152
|
+
- Filters out redundant current operations in update mode
|
|
153
|
+
|
|
154
|
+
**Logic Flow:**
|
|
155
|
+
1. Combines base records with current state (update mode)
|
|
156
|
+
2. Identifies next operation for each record
|
|
157
|
+
3. Tracks reload timestamps
|
|
158
|
+
4. Determines if records are deleted before next reload or missing in next reload
|
|
159
|
+
5. Generates appropriate delete operations
|
|
160
|
+
|
|
161
|
+
**Parameters:**
|
|
162
|
+
- `mode`: Operation mode ("update" or "complete")
|
|
163
|
+
- `parent_rectify`: Parent CTE to rectify
|
|
164
|
+
- `intermediates`: List of intermediate columns
|
|
165
|
+
- `has_rows`: Whether target has existing rows
|
|
166
|
+
- `has_source`: Whether source tracking is enabled
|
|
167
|
+
|
|
168
|
+
## Filters
|
|
169
|
+
|
|
170
|
+
### filters/update.sql.jinja
|
|
171
|
+
Generates filter conditions to select only new or updated records since the last load.
|
|
172
|
+
|
|
173
|
+
**Features:**
|
|
174
|
+
- Determines maximum timestamp from target table
|
|
175
|
+
- Generates slice conditions for records newer than max timestamp
|
|
176
|
+
- Handles different timestamp columns per CDC type
|
|
177
|
+
- Supports multi-source filtering
|
|
178
|
+
|
|
179
|
+
**Parameters:**
|
|
180
|
+
- `parent_slice`: Parent CTE name
|
|
181
|
+
- `tgt`: Target table name
|
|
182
|
+
- `cdc`: CDC type
|
|
183
|
+
- `has_source`: Whether source tracking is enabled
|
|
184
|
+
|
|
185
|
+
### filters/latest.sql.jinja
|
|
186
|
+
Generates filter conditions to select only the most recent timestamp per source.
|
|
187
|
+
|
|
188
|
+
**Features:**
|
|
189
|
+
- Finds maximum timestamp per source
|
|
190
|
+
- Creates slice conditions for latest data only
|
|
191
|
+
|
|
192
|
+
**Parameters:**
|
|
193
|
+
- `parent_slice`: Parent CTE name
|
|
194
|
+
- `has_source`: Whether source tracking is enabled
|
|
195
|
+
|
|
196
|
+
### filters/final.sql.jinja
|
|
197
|
+
Aggregates slice and source filter conditions using OR logic.
|
|
198
|
+
|
|
199
|
+
**Parameters:**
|
|
200
|
+
- `has_source`: Whether source tracking is enabled
|
|
201
|
+
|
|
202
|
+
## Macros
|
|
203
|
+
|
|
204
|
+
### macros/hash.sql.jinja
|
|
205
|
+
Defines macros for generating hash and key values.
|
|
206
|
+
|
|
207
|
+
**Macros:**
|
|
208
|
+
- `add_hash(fields)`: Creates an MD5 hash from the specified fields, treating `__operation` specially (deletes get a different hash)
|
|
209
|
+
- `add_key(fields)`: Creates MD5 hash for key columns
|
|
210
|
+
|
|
211
|
+
**Features:**
|
|
212
|
+
- Uses array concatenation with '*' delimiter and '-1' null replacement
|
|
213
|
+
- Casts all fields to string
|
|
214
|
+
- Special handling for __operation field in hashes
|
|
215
|
+
|
|
216
|
+
### macros/backtick.sql.jinja
|
|
217
|
+
Simple macro to wrap field names in backticks for proper SQL escaping.
|
|
218
|
+
|
|
219
|
+
## Merge Templates
|
|
220
|
+
|
|
221
|
+
### merges/nocdc.sql.jinja
|
|
222
|
+
Generates MERGE statement for tables without CDC tracking.
|
|
223
|
+
|
|
224
|
+
**Features:**
|
|
225
|
+
- Matches on key columns
|
|
226
|
+
- Supports upsert and delete operations
|
|
227
|
+
- No historical tracking
|
|
228
|
+
|
|
229
|
+
**Parameters:**
|
|
230
|
+
- `format`: Source format ("dataframe" or "view")
|
|
231
|
+
- `tgt`: Target table name
|
|
232
|
+
- `src`: Source reference
|
|
233
|
+
- `has_key`: Whether to use __key column for matching
|
|
234
|
+
- `keys`: List of key columns for matching
|
|
235
|
+
- `has_source`: Whether source tracking is enabled
|
|
236
|
+
- `update_where`: Additional WHERE clause
|
|
237
|
+
- `columns`: List of all columns to merge
|
|
238
|
+
|
|
239
|
+
### merges/scd1.sql.jinja
|
|
240
|
+
Generates MERGE statement for SCD Type 1 (overwrite) tracking.
|
|
241
|
+
|
|
242
|
+
**Features:**
|
|
243
|
+
- Updates records in place
|
|
244
|
+
- Maintains __is_current and __is_deleted flags
|
|
245
|
+
- Supports soft delete option
|
|
246
|
+
- Updates metadata timestamps
|
|
247
|
+
|
|
248
|
+
**Parameters:**
|
|
249
|
+
- `format`: Source format
|
|
250
|
+
- `tgt`, `src`: Target and source references
|
|
251
|
+
- `has_key`, `keys`, `has_source`: Matching configuration
|
|
252
|
+
- `fields`: Data fields to update
|
|
253
|
+
- `has_timestamp`, `has_last_updated`, `has_metadata`, `has_hash`, `has_rescued_data`: System column flags
|
|
254
|
+
- `soft_delete`: Enable soft delete instead of physical delete
|
|
255
|
+
- `columns`: All columns for insert
|
|
256
|
+
|
|
257
|
+
### merges/scd2.sql.jinja
|
|
258
|
+
Generates MERGE statement for SCD Type 2 (versioned history) tracking.
|
|
259
|
+
|
|
260
|
+
**Features:**
|
|
261
|
+
- Closes current records by setting __valid_to
|
|
262
|
+
- Inserts new versions with __valid_from
|
|
263
|
+
- Maintains __is_current and __is_deleted flags
|
|
264
|
+
- Updates only matched records that are flagged as current (`__is_current`)
|
|
265
|
+
|
|
266
|
+
**Operations:**
|
|
267
|
+
- `update`: Close current version and insert new version
|
|
268
|
+
- `delete`: Close current version and mark as deleted
|
|
269
|
+
- `insert`: Insert new version
|
|
270
|
+
|
|
271
|
+
**Parameters:**
|
|
272
|
+
- `format`: Source format
|
|
273
|
+
- `tgt`, `src`: Target and source references
|
|
274
|
+
- `has_key`, `keys`, `has_source`: Matching configuration
|
|
275
|
+
- `soft_delete`: Enable soft delete marking
|
|
276
|
+
- `has_metadata`, `has_last_updated`: System column flags
|
|
277
|
+
- `columns`: All columns for insert
|
|
278
|
+
|
|
279
|
+
## Query Templates
|
|
280
|
+
|
|
281
|
+
### queries/context.sql.jinja
|
|
282
|
+
Generates a SQL comment block documenting the query configuration and parameters.
|
|
283
|
+
|
|
284
|
+
**Sections:**
|
|
285
|
+
- ⚙️ BASE: CDC type and mode
|
|
286
|
+
- 🎯 SOURCE & TARGET: Format and references
|
|
287
|
+
- 📊 CTE's: Which CTEs are included
|
|
288
|
+
- 🔪 FILTERING: Filter conditions
|
|
289
|
+
- 🗑️ DELETES: Delete handling options
|
|
290
|
+
- ✅ DATA VALIDATION: Data state flags
|
|
291
|
+
- 🏷️ HAS FIELDS: Which system fields are present
|
|
292
|
+
- ➕ ADD COLUMNS: Which columns to add
|
|
293
|
+
- 🔄 EXTRA COLUMN OPERATIONS: Column transformations
|
|
294
|
+
- 👨👩👧 PARENTS: Parent CTE references
|
|
295
|
+
- 📦 LAYOUT: Column lists
|
|
296
|
+
|
|
297
|
+
### queries/final.sql.jinja
|
|
298
|
+
Final SELECT that outputs the result, excluding specified columns.
|
|
299
|
+
|
|
300
|
+
**Parameters:**
|
|
301
|
+
- `all_except`: List of columns to exclude from output
|
|
302
|
+
|
|
303
|
+
### queries/scd1.sql.jinja
|
|
304
|
+
Implements SCD Type 1 logic that maintains only current state.
|
|
305
|
+
|
|
306
|
+
**Features:**
|
|
307
|
+
- Takes latest record per key
|
|
308
|
+
- Marks deleted records
|
|
309
|
+
- Handles first delete when no upserts exist (update mode)
|
|
310
|
+
- Filters out fake updates (records matching current hash)
|
|
311
|
+
- Generates merge conditions for upsert/delete operations
|
|
312
|
+
|
|
313
|
+
**Parameters:**
|
|
314
|
+
- `parent_cdc`: Parent CTE name
|
|
315
|
+
- `mode`: "complete" or "update"
|
|
316
|
+
- `has_source`, `has_rows`: Configuration flags
|
|
317
|
+
- `soft_delete`: Enable soft delete
|
|
318
|
+
- `rectify`: Whether rectification was applied
|
|
319
|
+
- `outputs`: Output columns
|
|
320
|
+
|
|
321
|
+
### queries/scd2.sql.jinja
|
|
322
|
+
Implements SCD Type 2 logic that maintains full version history.
|
|
323
|
+
|
|
324
|
+
**Features:**
|
|
325
|
+
- Creates __valid_from and __valid_to temporal columns
|
|
326
|
+
- Assigns validity periods based on next timestamp
|
|
327
|
+
- Marks current records (__is_current)
|
|
328
|
+
- Identifies deleted records
|
|
329
|
+
- Generates merge conditions (insert/update/delete)
|
|
330
|
+
- Filters out fake updates
|
|
331
|
+
- Optional __valid_from correction to use 1900-01-01 for earliest records
|
|
332
|
+
|
|
333
|
+
**Parameters:**
|
|
334
|
+
- `parent_cdc`: Parent CTE name
|
|
335
|
+
- `mode`: "complete" or "update"
|
|
336
|
+
- `has_source`, `has_rows`: Configuration flags
|
|
337
|
+
- `correct_valid_from`: Correct the earliest `__valid_from` to 1900-01-01
|
|
338
|
+
- `rectify`: Whether rectification was applied
|
|
339
|
+
- `outputs`: Output columns
|
|
340
|
+
|
|
341
|
+
### queries/nocdc/complete.sql.jinja
|
|
342
|
+
Generates complete load query for NoCDC mode.
|
|
343
|
+
|
|
344
|
+
**Features:**
|
|
345
|
+
- Selects all output columns
|
|
346
|
+
- Filters out 'current' operations if filter enabled
|
|
347
|
+
|
|
348
|
+
**Parameters:**
|
|
349
|
+
- `parent_cdc`: Parent CTE name
|
|
350
|
+
- `filter`: Enable operation filtering
|
|
351
|
+
- `outputs`: Output columns
|
|
352
|
+
|
|
353
|
+
### queries/nocdc/update.sql.jinja
|
|
354
|
+
Generates incremental update query for NoCDC mode.
|
|
355
|
+
|
|
356
|
+
**Features:**
|
|
357
|
+
- Identifies records to upsert (not matching current hash)
|
|
358
|
+
- Optionally deletes missing records (`delete_missing`)
|
|
359
|
+
- Filters out 'current' operations if filter enabled
|
|
360
|
+
|
|
361
|
+
**Parameters:**
|
|
362
|
+
- `parent_cdc`: Parent CTE name
|
|
363
|
+
- `has_rows`: Whether target has existing rows
|
|
364
|
+
- `delete_missing`: Enable delete for missing records
|
|
365
|
+
- `has_source`: Whether source tracking is enabled
|
|
366
|
+
- `filter`: Enable operation filtering
|
|
367
|
+
- `outputs`: Output columns
|
|
368
|
+
|
|
369
|
+
## Usage Example
|
|
370
|
+
|
|
371
|
+
The templates are typically invoked through the CDC classes (NoCDC, SCD1, SCD2), which populate the template variables and render the appropriate templates based on the operation:
|
|
372
|
+
|
|
373
|
+
```python
|
|
374
|
+
from fabricks.cdc import SCD2
|
|
375
|
+
|
|
376
|
+
cdc = SCD2(
|
|
377
|
+
src="source_table",
|
|
378
|
+
tgt="target_table",
|
|
379
|
+
keys=["id"],
|
|
380
|
+
mode="update"
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
# Generates query using query.sql.jinja and dependencies
|
|
384
|
+
query = cdc.render_query()
|
|
385
|
+
|
|
386
|
+
# Generates merge using merge.sql.jinja and dependencies
|
|
387
|
+
merge = cdc.render_merge()
|
|
388
|
+
```
|
|
389
|
+
|
|
390
|
+
## Template Parameters Reference
|
|
391
|
+
|
|
392
|
+
### Common Parameters
|
|
393
|
+
|
|
394
|
+
- `cdc`: CDC type ("nocdc", "scd1", "scd2")
|
|
395
|
+
- `mode`: Operation mode ("complete", "update")
|
|
396
|
+
- `format`: Source format ("query", "table", "global_temp_view", "dataframe")
|
|
397
|
+
- `src`: Source table/query reference
|
|
398
|
+
- `tgt`: Target table name
|
|
399
|
+
- `keys`: List of key columns for matching records
|
|
400
|
+
- `hashes`: List of columns to include in hash calculation
|
|
401
|
+
|
|
402
|
+
### System Column Flags
|
|
403
|
+
|
|
404
|
+
- `has_timestamp`, `add_timestamp`: Timestamp tracking
|
|
405
|
+
- `has_last_updated`, `add_last_updated`: Last updated timestamp
|
|
406
|
+
- `has_operation`, `add_operation`: Operation type (upsert/delete/reload)
|
|
407
|
+
- `has_source`, `add_source`: Source system tracking
|
|
408
|
+
- `has_hash`, `add_hash`: Row hash for change detection
|
|
409
|
+
- `has_key`, `add_key`: Composite key hash
|
|
410
|
+
- `has_metadata`, `add_metadata`: Metadata struct (inserted/updated times)
|
|
411
|
+
- `has_identity`: Identity column present
|
|
412
|
+
- `has_rescued_data`: Rescued data column present
|
|
413
|
+
|
|
414
|
+
### Processing Options
|
|
415
|
+
|
|
416
|
+
- `deduplicate_key`: Enable key-based deduplication
|
|
417
|
+
- `deduplicate_hash`: Enable hash-based deduplication
|
|
418
|
+
- `advanced_deduplication`: Use explicit window functions
|
|
419
|
+
- `rectify`: Enable data rectification
|
|
420
|
+
- `soft_delete`: Enable soft delete (mark as deleted vs physical delete)
|
|
421
|
+
- `delete_missing`: Delete records not in source
|
|
422
|
+
- `slice`: Slice type ("update", "latest")
|
|
423
|
+
- `filter`: Enable operation filtering
|
|
424
|
+
|
|
425
|
+
### Data State Flags
|
|
426
|
+
|
|
427
|
+
- `has_rows`: Target table has existing rows
|
|
428
|
+
- `has_no_data`: Whether the load is treated as an empty/delete operation
|
|
429
|
+
- `has_order_by`: Custom ordering specified
|
|
430
|
+
|
|
431
|
+
### Column Lists
|
|
432
|
+
|
|
433
|
+
- `columns`: All columns in target table
|
|
434
|
+
- `inputs`: Input columns from source
|
|
435
|
+
- `intermediates`: Intermediate processing columns
|
|
436
|
+
- `outputs`: Final output columns
|
|
437
|
+
- `fields`: Data fields (non-system columns)
|
|
438
|
+
- `order_duplicate_by`: Columns for ordering duplicates
|
|
439
|
+
- `all_except`: Columns to exclude from output
|
|
440
|
+
- `all_overwrite`: Columns to overwrite
|
|
441
|
+
- `overwrite`: Columns to overwrite (subset)
|
|
442
|
+
- `cast`: Dictionary of column type casts
|
|
443
|
+
- `add_calculated_columns`: Calculated column expressions
|
|
444
|
+
|
|
445
|
+
### Filter Conditions
|
|
446
|
+
|
|
447
|
+
- `filter_where`: WHERE clause for base data
|
|
448
|
+
- `update_where`: WHERE clause for update operations
|
|
449
|
+
- `slices`: Slice condition expression
|
|
450
|
+
- `sources`: Source filter condition
|
|
451
|
+
|
|
452
|
+
### Parent CTE References
|
|
453
|
+
|
|
454
|
+
- `parent_slice`: Parent CTE for slicing
|
|
455
|
+
- `parent_rectify`: Parent CTE for rectification
|
|
456
|
+
- `parent_deduplicate_key`: Parent CTE for key deduplication
|
|
457
|
+
- `parent_deduplicate_hash`: Parent CTE for hash deduplication
|
|
458
|
+
- `parent_cdc`: Parent CTE for CDC logic
|
|
459
|
+
- `parent_final`: Parent CTE for final output
|
|
460
|
+
|
|
461
|
+
### SCD2-Specific
|
|
462
|
+
|
|
463
|
+
- `correct_valid_from`: Correct earliest __valid_from to 1900-01-01
|
|
464
|
+
|
|
465
|
+
## Template Rendering Flow
|
|
466
|
+
|
|
467
|
+
### Query Rendering (query.sql.jinja)
|
|
468
|
+
|
|
469
|
+
1. **Context** - Document configuration in SQL comment
|
|
470
|
+
2. **Base CTE** - Load and transform source data
|
|
471
|
+
3. **Slice CTE** (optional) - Filter to specific time slices
|
|
472
|
+
4. **Deduplicate Key CTE** (optional) - Remove key duplicates
|
|
473
|
+
5. **Current CTE** (optional, update mode) - Load existing target state
|
|
474
|
+
6. **Rectify CTE** (optional) - Fix historical inconsistencies
|
|
475
|
+
7. **Deduplicate Hash CTE** (optional) - Remove hash duplicates
|
|
476
|
+
8. **CDC Logic CTE** - Apply NoCDC/SCD1/SCD2 logic
|
|
477
|
+
9. **Final CTE** - Select output columns
|
|
478
|
+
|
|
479
|
+
### Merge Rendering (merge.sql.jinja)
|
|
480
|
+
|
|
481
|
+
1. Select appropriate merge template based on CDC type
|
|
482
|
+
2. Generate MERGE statement with ON clause
|
|
483
|
+
3. Define WHEN MATCHED and WHEN NOT MATCHED clauses
|
|
484
|
+
4. Specify UPDATE, DELETE, and INSERT operations
|
|
485
|
+
|
|
486
|
+
### Filter Rendering (filter.sql.jinja)
|
|
487
|
+
|
|
488
|
+
1. **Base CTE** - Load source metadata
|
|
489
|
+
2. **Update/Latest Filter CTE** - Determine slice conditions
|
|
490
|
+
3. **Final CTE** - Aggregate filter expressions
|
|
@@ -16,6 +16,7 @@ with
|
|
|
16
16
|
{% if add_calculated_columns %} {% for c in add_calculated_columns %} {{ c }}, {% endfor %} {% endif %}
|
|
17
17
|
{% if add_timestamp %} cast(current_date() as timestamp) as __timestamp, {% endif %}
|
|
18
18
|
{% if add_operation %} cast('{{ add_operation }}' as string) as __operation, {% endif %}
|
|
19
|
+
{% if add_last_updated %} cast(current_timestamp() as timestamp) as __last_updated, {% endif %}
|
|
19
20
|
{% if add_source %} cast('{{ add_source }}' as string) as __source, {% endif %}
|
|
20
21
|
{% if add_hash %} {{ h.add_hash(fields=hashes) }} as __hash, {% endif %}
|
|
21
22
|
{% if add_key %} {{ h.add_hash(fields=keys) }} as __key, {% endif %}
|
|
@@ -9,6 +9,10 @@ __current as (
|
|
|
9
9
|
{% elif cdc == "scd1" %} __timestamp,
|
|
10
10
|
{% elif cdc == "scd2" %} __valid_from as __timestamp,
|
|
11
11
|
{% endif %}
|
|
12
|
+
{% elif i == "__last_updated" %}
|
|
13
|
+
{% if add_last_updated %} cast(current_timestamp() as timestamp) as __last_updated,
|
|
14
|
+
{% else %} __last_updated,
|
|
15
|
+
{% endif %}
|
|
12
16
|
{% elif i == "__operation" %}
|
|
13
17
|
{% if has_no_data %} 'delete' as __operation, {% else %} 'current' as __operation, {% endif %}
|
|
14
18
|
{% elif i == "__hash" %}
|
|
@@ -26,6 +26,9 @@
|
|
|
26
26
|
{% if has_timestamp %}
|
|
27
27
|
__timestamp = s.__timestamp,
|
|
28
28
|
{% endif %}
|
|
29
|
+
{% if has_last_updated %}
|
|
30
|
+
__last_updated = s.__last_updated,
|
|
31
|
+
{% endif %}
|
|
29
32
|
{% if has_metadata %}
|
|
30
33
|
__metadata.updated = cast(current_timestamp() as timestamp),
|
|
31
34
|
{% endif %}
|
|
@@ -50,6 +53,9 @@
|
|
|
50
53
|
{% if has_timestamp %}
|
|
51
54
|
__timestamp = s.__timestamp,
|
|
52
55
|
{% endif %}
|
|
56
|
+
{% if has_last_updated %}
|
|
57
|
+
__last_updated = s.__last_updated,
|
|
58
|
+
{% endif %}
|
|
53
59
|
{% if has_metadata %}
|
|
54
60
|
__metadata.updated = cast(current_timestamp() as timestamp),
|
|
55
61
|
{% endif %}
|
|
@@ -27,6 +27,9 @@
|
|
|
27
27
|
{% endif %}
|
|
28
28
|
{% if has_metadata %}
|
|
29
29
|
__metadata.updated = cast(current_timestamp() as timestamp),
|
|
30
|
+
{% endif %}
|
|
31
|
+
{% if has_last_updated %}
|
|
32
|
+
__last_updated = s.__last_updated,
|
|
30
33
|
{% endif %}
|
|
31
34
|
when matched
|
|
32
35
|
and __merge_condition == 'delete' then
|
|
@@ -39,6 +42,9 @@
|
|
|
39
42
|
{% endif %}
|
|
40
43
|
{% if has_metadata %}
|
|
41
44
|
__metadata.updated = cast(current_timestamp() as timestamp),
|
|
45
|
+
{% endif %}
|
|
46
|
+
{% if has_last_updated %}
|
|
47
|
+
__last_updated = s.__last_updated,
|
|
42
48
|
{% endif %}
|
|
43
49
|
when not matched
|
|
44
50
|
and __merge_condition == 'insert' then
|