gcp-pipeline-ref-transform 1.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gcp_pipeline_ref_transform-1.0.7/PKG-INFO +206 -0
- gcp_pipeline_ref_transform-1.0.7/README.md +183 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/dbt_project.yml +57 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/macros/data_quality_check.sql +17 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/macros/incremental_strategy.sql +120 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/models/fdp/_fdp_generic_models.yml +93 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/models/fdp/event_transaction_excess.sql +64 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/models/fdp/portfolio_account_excess.sql +52 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/models/fdp/portfolio_account_facility.sql +51 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/models/fdp/unit_tests.yml +11 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/models/staging/generic/_generic_sources.yml +75 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/models/staging/generic/stg_generic_accounts.sql +69 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/models/staging/generic/stg_generic_applications.sql +47 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/models/staging/generic/stg_generic_customers.sql +59 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/models/staging/generic/stg_generic_decision.sql +59 -0
- gcp_pipeline_ref_transform-1.0.7/dbt/packages.yml +3 -0
- gcp_pipeline_ref_transform-1.0.7/gcp_pipeline_ref_transform.egg-info/PKG-INFO +206 -0
- gcp_pipeline_ref_transform-1.0.7/gcp_pipeline_ref_transform.egg-info/SOURCES.txt +21 -0
- gcp_pipeline_ref_transform-1.0.7/gcp_pipeline_ref_transform.egg-info/dependency_links.txt +1 -0
- gcp_pipeline_ref_transform-1.0.7/gcp_pipeline_ref_transform.egg-info/requires.txt +5 -0
- gcp_pipeline_ref_transform-1.0.7/gcp_pipeline_ref_transform.egg-info/top_level.txt +1 -0
- gcp_pipeline_ref_transform-1.0.7/pyproject.toml +48 -0
- gcp_pipeline_ref_transform-1.0.7/setup.cfg +4 -0
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gcp-pipeline-ref-transform
|
|
3
|
+
Version: 1.0.7
|
|
4
|
+
Summary: GCP Pipeline Reference: Generic FDP Transformation — dbt models for ODP to FDP (JOIN + MAP patterns)
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/enrichmeai/gcp-pipeline-reference
|
|
7
|
+
Project-URL: Documentation, https://github.com/enrichmeai/gcp-pipeline-reference/tree/main/deployments/bigquery-to-mapped-product
|
|
8
|
+
Project-URL: Repository, https://github.com/enrichmeai/gcp-pipeline-reference
|
|
9
|
+
Keywords: gcp,bigquery,dbt,transformation,reference,example
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
Requires-Dist: dbt-bigquery>=1.5.0
|
|
20
|
+
Requires-Dist: gcp-pipeline-framework>=1.0.6
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
23
|
+
|
|
24
|
+
# Generic Transformation
|
|
25
|
+
|
|
26
|
+
**Unit 2 of Generic 3-Unit Deployment**
|
|
27
|
+
|
|
28
|
+
FDP Transformation - dbt models for ODP → FDP transformation.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Flow Diagram
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
Generic TRANSFORMATION FLOW
|
|
36
|
+
──────────────────────
|
|
37
|
+
|
|
38
|
+
BigQuery ODP dbt BigQuery FDP
|
|
39
|
+
──────────── ─── ────────────
|
|
40
|
+
|
|
41
|
+
odp_generic.customers ─────┐
|
|
42
|
+
│ ┌─────────────────┐
|
|
43
|
+
odp_generic.accounts ─────┼───►│ JOIN Logic │────────► fdp_generic.event_transaction_excess
|
|
44
|
+
│ └─────────────────┘
|
|
45
|
+
│
|
|
46
|
+
odp_generic.decision ─────┼───────────────────────────────► fdp_generic.portfolio_account_excess
|
|
47
|
+
│
|
|
48
|
+
└───────────────────────
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Pattern
|
|
54
|
+
|
|
55
|
+
**MULTI-TARGET**:
|
|
56
|
+
1. **JOIN**: 2 ODP sources (customers, accounts) → 1 FDP target (`event_transaction_excess`)
|
|
57
|
+
2. **MAP**: 1 ODP source (decision) → 1 FDP target (`portfolio_account_excess`)
|
|
58
|
+
|
|
59
|
+
| Step | Description |
|
|
60
|
+
|------|-------------|
|
|
61
|
+
| 1 | Staging models clean and type-cast raw ODP data |
|
|
62
|
+
| 2 | `add_audit_columns` macro injects `run_id` and `source_file` |
|
|
63
|
+
| 3 | `mask_pii` macro applies environment-aware masking to sensitive fields |
|
|
64
|
+
| 4 | `event_transaction_excess` performs `INNER JOIN` between Customers and Accounts |
|
|
65
|
+
| 5 | `portfolio_account_excess` maps Decision ODP 1:1 to FDP |
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Data Mapping
|
|
70
|
+
|
|
71
|
+
| Source Table | Key Fields |
|
|
72
|
+
|--------------|------------|
|
|
73
|
+
| `odp_generic.customers` | customer_id, ssn, first_name, last_name, dob, status |
|
|
74
|
+
| `odp_generic.accounts` | account_id, customer_id, account_type, balance, open_date |
|
|
75
|
+
| `odp_generic.decision` | decision_id, customer_id, application_id, decision_code, score, decision_date |
|
|
76
|
+
|
|
77
|
+
| Target Table | Description |
|
|
78
|
+
|--------------|-------------|
|
|
79
|
+
| `fdp_generic.event_transaction_excess` | Joined customer-account view |
|
|
80
|
+
| `fdp_generic.portfolio_account_excess` | Decision-based portfolio view |
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Components
|
|
85
|
+
|
|
86
|
+
| Directory | Purpose |
|
|
87
|
+
|-----------|---------|
|
|
88
|
+
| `dbt/models/staging/generic/` | Staging models (clean raw data) |
|
|
89
|
+
| `dbt/models/fdp/` | FDP models (JOIN and MAP logic) |
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Library-Driven Ease of Use
|
|
94
|
+
|
|
95
|
+
The Generic transformation unit uses the `gcp-pipeline-transform` library to ensure data privacy and lineage with zero local macro development:
|
|
96
|
+
|
|
97
|
+
1. **Zero-Bleed PII Masking**: Uses `{{ mask_pii(column, 'SSN') }}`. The library automatically applies the correct mask (Full in Prod, Partial in Staging) based on the environment.
|
|
98
|
+
2. **Automated Lineage**: Uses `{{ add_audit_columns() }}` to inject `run_id` and `source_file` variables, maintaining the E2E lineage established in the ingestion layer.
|
|
99
|
+
3. **Metadata Enrichment**: Replaces hardcoded business logic with generic library macros that interpret rules from the `EntitySchema`.
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## How to Replicate this JOIN Transformation (2-to-1)
|
|
104
|
+
|
|
105
|
+
To create a new transformation unit that joins multiple entities, follow the [Creating New Deployment Guide](../../docs/CREATING_NEW_DEPLOYMENT_GUIDE.md).
|
|
106
|
+
|
|
107
|
+
Key steps for this JOIN pattern:
|
|
108
|
+
1. **Register Library**: Point your `dbt_project.yml` to the `gcp-pipeline-transform` macro paths.
|
|
109
|
+
2. **Staging Models**: Create views for your ODP tables. Use `add_audit_columns` for consistency.
|
|
110
|
+
3. **FDP Models**: Implement your `JOIN` logic (this unit uses an `INNER JOIN`). Apply `mask_pii` to all sensitive fields.
|
|
111
|
+
4. **Governance**: Run `validate_no_pii_in_export` in your CI/CD to prevent leakage.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## Infrastructure & Configurations
|
|
116
|
+
|
|
117
|
+
### Google Cloud Resources
|
|
118
|
+
This deployment requires the following GCP infrastructure, provisioned via Terraform:
|
|
119
|
+
- **Data Warehouse**: BigQuery datasets `odp_generic` (source) and `fdp_generic` (target).
|
|
120
|
+
- **Processing**: dbt (running on Cloud Composer or as a standalone process) for executing transformations.
|
|
121
|
+
|
|
122
|
+
For detailed infrastructure definitions, see [infrastructure/terraform/systems/generic/transformation/](../../infrastructure/terraform/systems/generic/transformation/).
|
|
123
|
+
|
|
124
|
+
### dbt Configuration (`dbt_project.yml`)
|
|
125
|
+
The transformation behavior is controlled by variables and configurations in `dbt_project.yml`:
|
|
126
|
+
|
|
127
|
+
| Variable | Description | Default / Source |
|
|
128
|
+
|----------|-------------|------------------|
|
|
129
|
+
| `gcp_project_id` | Target GCP Project | `GCP_PROJECT_ID` env var |
|
|
130
|
+
| `source_dataset` | Source ODP dataset | `odp_generic` |
|
|
131
|
+
| `staging_dataset` | Intermediate staging dataset | `stg_generic` |
|
|
132
|
+
| `fdp_dataset` | Target FDP dataset | `fdp_generic` |
|
|
133
|
+
| `marts_dataset` | Marts dataset | `marts_generic` |
|
|
134
|
+
| `analytics_dataset` | Analytics dataset | `analytics_generic` |
|
|
135
|
+
| `extract_date` | Date of data extract | `null` (optional filter) |
|
|
136
|
+
| `all_entities` | List of entities to process | `['customers', 'accounts', 'decision', 'applications']` |
|
|
137
|
+
| `masking_level` | PII masking strategy (`FULL`, `PARTIAL`, `NONE`) | `AUTO` |
|
|
138
|
+
|
|
139
|
+
### Technology Stack & Documentation
|
|
140
|
+
- [Google BigQuery](https://cloud.google.com/bigquery/docs) - Serverless data warehouse
|
|
141
|
+
- [dbt (data build tool)](https://docs.getdbt.com/docs/introduction) - Transformation workflow
|
|
142
|
+
- [dbt-bigquery Adapter](https://docs.getdbt.com/reference/warehouse-setups/bigquery-setup) - dbt to BigQuery connector
|
|
143
|
+
- [Data Modeling in dbt](https://docs.getdbt.com/docs/build/models) - Best practices for models
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Dependencies
|
|
148
|
+
|
|
149
|
+
| Library | Purpose |
|
|
150
|
+
|---------|---------|
|
|
151
|
+
| `dbt-bigquery` | dbt adapter for BigQuery |
|
|
152
|
+
| `gcp-pipeline-transform` | Shared macros (audit columns, PII masking) |
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Execution & Testing
|
|
157
|
+
|
|
158
|
+
### 1. Local Development Setup
|
|
159
|
+
Initialize the virtual environment:
|
|
160
|
+
```bash
|
|
161
|
+
./scripts/setup_deployment_venv.sh bigquery-to-mapped-product
|
|
162
|
+
source deployments/bigquery-to-mapped-product/venv/bin/activate
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### 2. Local dbt Execution
|
|
166
|
+
Run dbt models locally against the development BigQuery dataset:
|
|
167
|
+
```bash
|
|
168
|
+
cd dbt
|
|
169
|
+
dbt run --profiles-dir . --target dev
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### 3. Data Quality Validation
|
|
173
|
+
Run dbt tests to verify transformation logic and PII masking:
|
|
174
|
+
```bash
|
|
175
|
+
dbt test --profiles-dir . --target dev
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### 4. Governance Verification
|
|
179
|
+
Use the library macro to ensure no unmasked PII exists in your models before deployment:
|
|
180
|
+
```sql
|
|
181
|
+
{{ validate_no_pii_in_export('fdp_generic.event_transaction_excess') }}
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### 5. Cloud Execution
|
|
185
|
+
In production, this unit is triggered by the `generic_odp_load_dag` once ingestion is successful. The transformation is executed via a `BashOperator` running `dbt run`.
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## SQL Example
|
|
190
|
+
|
|
191
|
+
```sql
|
|
192
|
+
-- fdp_generic.event_transaction_excess
|
|
193
|
+
SELECT
|
|
194
|
+
c.customer_id,
|
|
195
|
+
c.first_name,
|
|
196
|
+
c.last_name,
|
|
197
|
+
a.account_id,
|
|
198
|
+
a.current_balance,
|
|
199
|
+
-- Audit columns
|
|
200
|
+
c._run_id,
|
|
201
|
+
CURRENT_TIMESTAMP() as _transformed_at
|
|
202
|
+
FROM {{ ref('stg_generic_customers') }} c
|
|
203
|
+
JOIN {{ ref('stg_generic_accounts') }} a
|
|
204
|
+
ON c.customer_id = a.customer_id
|
|
205
|
+
```
|
|
206
|
+
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
# Generic Transformation
|
|
2
|
+
|
|
3
|
+
**Unit 2 of Generic 3-Unit Deployment**
|
|
4
|
+
|
|
5
|
+
FDP Transformation - dbt models for ODP → FDP transformation.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Flow Diagram
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
Generic TRANSFORMATION FLOW
|
|
13
|
+
──────────────────────
|
|
14
|
+
|
|
15
|
+
BigQuery ODP dbt BigQuery FDP
|
|
16
|
+
──────────── ─── ────────────
|
|
17
|
+
|
|
18
|
+
odp_generic.customers ─────┐
|
|
19
|
+
│ ┌─────────────────┐
|
|
20
|
+
odp_generic.accounts ─────┼───►│ JOIN Logic │────────► fdp_generic.event_transaction_excess
|
|
21
|
+
│ └─────────────────┘
|
|
22
|
+
│
|
|
23
|
+
odp_generic.decision ─────┼───────────────────────────────► fdp_generic.portfolio_account_excess
|
|
24
|
+
│
|
|
25
|
+
└───────────────────────
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Pattern
|
|
31
|
+
|
|
32
|
+
**MULTI-TARGET**:
|
|
33
|
+
1. **JOIN**: 2 ODP sources (customers, accounts) → 1 FDP target (`event_transaction_excess`)
|
|
34
|
+
2. **MAP**: 1 ODP source (decision) → 1 FDP target (`portfolio_account_excess`)
|
|
35
|
+
|
|
36
|
+
| Step | Description |
|
|
37
|
+
|------|-------------|
|
|
38
|
+
| 1 | Staging models clean and type-cast raw ODP data |
|
|
39
|
+
| 2 | `add_audit_columns` macro injects `run_id` and `source_file` |
|
|
40
|
+
| 3 | `mask_pii` macro applies environment-aware masking to sensitive fields |
|
|
41
|
+
| 4 | `event_transaction_excess` performs `INNER JOIN` between Customers and Accounts |
|
|
42
|
+
| 5 | `portfolio_account_excess` maps Decision ODP 1:1 to FDP |
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Data Mapping
|
|
47
|
+
|
|
48
|
+
| Source Table | Key Fields |
|
|
49
|
+
|--------------|------------|
|
|
50
|
+
| `odp_generic.customers` | customer_id, ssn, first_name, last_name, dob, status |
|
|
51
|
+
| `odp_generic.accounts` | account_id, customer_id, account_type, balance, open_date |
|
|
52
|
+
| `odp_generic.decision` | decision_id, customer_id, application_id, decision_code, score, decision_date |
|
|
53
|
+
|
|
54
|
+
| Target Table | Description |
|
|
55
|
+
|--------------|-------------|
|
|
56
|
+
| `fdp_generic.event_transaction_excess` | Joined customer-account view |
|
|
57
|
+
| `fdp_generic.portfolio_account_excess` | Decision-based portfolio view |
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Components
|
|
62
|
+
|
|
63
|
+
| Directory | Purpose |
|
|
64
|
+
|-----------|---------|
|
|
65
|
+
| `dbt/models/staging/generic/` | Staging models (clean raw data) |
|
|
66
|
+
| `dbt/models/fdp/` | FDP models (JOIN and MAP logic) |
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## Library-Driven Ease of Use
|
|
71
|
+
|
|
72
|
+
The Generic transformation unit uses the `gcp-pipeline-transform` library to ensure data privacy and lineage with zero local macro development:
|
|
73
|
+
|
|
74
|
+
1. **Zero-Bleed PII Masking**: Uses `{{ mask_pii(column, 'SSN') }}`. The library automatically applies the correct mask (Full in Prod, Partial in Staging) based on the environment.
|
|
75
|
+
2. **Automated Lineage**: Uses `{{ add_audit_columns() }}` to inject `run_id` and `source_file` variables, maintaining the E2E lineage established in the ingestion layer.
|
|
76
|
+
3. **Metadata Enrichment**: Replaces hardcoded business logic with generic library macros that interpret rules from the `EntitySchema`.
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## How to Replicate this JOIN Transformation (2-to-1)
|
|
81
|
+
|
|
82
|
+
To create a new transformation unit that joins multiple entities, follow the [Creating New Deployment Guide](../../docs/CREATING_NEW_DEPLOYMENT_GUIDE.md).
|
|
83
|
+
|
|
84
|
+
Key steps for this JOIN pattern:
|
|
85
|
+
1. **Register Library**: Point your `dbt_project.yml` to the `gcp-pipeline-transform` macro paths.
|
|
86
|
+
2. **Staging Models**: Create views for your ODP tables. Use `add_audit_columns` for consistency.
|
|
87
|
+
3. **FDP Models**: Implement your `JOIN` logic (this unit uses an `INNER JOIN`). Apply `mask_pii` to all sensitive fields.
|
|
88
|
+
4. **Governance**: Run `validate_no_pii_in_export` in your CI/CD to prevent leakage.
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Infrastructure & Configurations
|
|
93
|
+
|
|
94
|
+
### Google Cloud Resources
|
|
95
|
+
This deployment requires the following GCP infrastructure, provisioned via Terraform:
|
|
96
|
+
- **Data Warehouse**: BigQuery datasets `odp_generic` (source) and `fdp_generic` (target).
|
|
97
|
+
- **Processing**: dbt (running on Cloud Composer or as a standalone process) for executing transformations.
|
|
98
|
+
|
|
99
|
+
For detailed infrastructure definitions, see [infrastructure/terraform/systems/generic/transformation/](../../infrastructure/terraform/systems/generic/transformation/).
|
|
100
|
+
|
|
101
|
+
### dbt Configuration (`dbt_project.yml`)
|
|
102
|
+
The transformation behavior is controlled by variables and configurations in `dbt_project.yml`:
|
|
103
|
+
|
|
104
|
+
| Variable | Description | Default / Source |
|
|
105
|
+
|----------|-------------|------------------|
|
|
106
|
+
| `gcp_project_id` | Target GCP Project | `GCP_PROJECT_ID` env var |
|
|
107
|
+
| `source_dataset` | Source ODP dataset | `odp_generic` |
|
|
108
|
+
| `staging_dataset` | Intermediate staging dataset | `stg_generic` |
|
|
109
|
+
| `fdp_dataset` | Target FDP dataset | `fdp_generic` |
|
|
110
|
+
| `marts_dataset` | Marts dataset | `marts_generic` |
|
|
111
|
+
| `analytics_dataset` | Analytics dataset | `analytics_generic` |
|
|
112
|
+
| `extract_date` | Date of data extract | `null` (optional filter) |
|
|
113
|
+
| `all_entities` | List of entities to process | `['customers', 'accounts', 'decision', 'applications']` |
|
|
114
|
+
| `masking_level` | PII masking strategy (`FULL`, `PARTIAL`, `NONE`) | `AUTO` |
|
|
115
|
+
|
|
116
|
+
### Technology Stack & Documentation
|
|
117
|
+
- [Google BigQuery](https://cloud.google.com/bigquery/docs) - Serverless data warehouse
|
|
118
|
+
- [dbt (data build tool)](https://docs.getdbt.com/docs/introduction) - Transformation workflow
|
|
119
|
+
- [dbt-bigquery Adapter](https://docs.getdbt.com/reference/warehouse-setups/bigquery-setup) - dbt to BigQuery connector
|
|
120
|
+
- [Data Modeling in dbt](https://docs.getdbt.com/docs/build/models) - Best practices for models
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Dependencies
|
|
125
|
+
|
|
126
|
+
| Library | Purpose |
|
|
127
|
+
|---------|---------|
|
|
128
|
+
| `dbt-bigquery` | dbt adapter for BigQuery |
|
|
129
|
+
| `gcp-pipeline-transform` | Shared macros (audit columns, PII masking) |
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## Execution & Testing
|
|
134
|
+
|
|
135
|
+
### 1. Local Development Setup
|
|
136
|
+
Initialize the virtual environment:
|
|
137
|
+
```bash
|
|
138
|
+
./scripts/setup_deployment_venv.sh bigquery-to-mapped-product
|
|
139
|
+
source deployments/bigquery-to-mapped-product/venv/bin/activate
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### 2. Local dbt Execution
|
|
143
|
+
Run dbt models locally against the development BigQuery dataset:
|
|
144
|
+
```bash
|
|
145
|
+
cd dbt
|
|
146
|
+
dbt run --profiles-dir . --target dev
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### 3. Data Quality Validation
|
|
150
|
+
Run dbt tests to verify transformation logic and PII masking:
|
|
151
|
+
```bash
|
|
152
|
+
dbt test --profiles-dir . --target dev
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### 4. Governance Verification
|
|
156
|
+
Use the library macro to ensure no unmasked PII exists in your models before deployment:
|
|
157
|
+
```sql
|
|
158
|
+
{{ validate_no_pii_in_export('fdp_generic.event_transaction_excess') }}
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### 5. Cloud Execution
|
|
162
|
+
In production, this unit is triggered by the `generic_odp_load_dag` once ingestion is successful. The transformation is executed via a `BashOperator` running `dbt run`.
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## SQL Example
|
|
167
|
+
|
|
168
|
+
```sql
|
|
169
|
+
-- fdp_generic.event_transaction_excess
|
|
170
|
+
SELECT
|
|
171
|
+
c.customer_id,
|
|
172
|
+
c.first_name,
|
|
173
|
+
c.last_name,
|
|
174
|
+
a.account_id,
|
|
175
|
+
a.current_balance,
|
|
176
|
+
-- Audit columns
|
|
177
|
+
c._run_id,
|
|
178
|
+
CURRENT_TIMESTAMP() as _transformed_at
|
|
179
|
+
FROM {{ ref('stg_generic_customers') }} c
|
|
180
|
+
JOIN {{ ref('stg_generic_accounts') }} a
|
|
181
|
+
ON c.customer_id = a.customer_id
|
|
182
|
+
```
|
|
183
|
+
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
name: 'bigquery_mapped_product'
|
|
2
|
+
version: '1.0.0'
|
|
3
|
+
config-version: 2
|
|
4
|
+
|
|
5
|
+
profile: 'mapped_product_profile'
|
|
6
|
+
|
|
7
|
+
model-paths: ["models"]
|
|
8
|
+
analysis-paths: ["analyses"]
|
|
9
|
+
test-paths: ["tests"]
|
|
10
|
+
seed-paths: ["seeds"]
|
|
11
|
+
macro-paths: ["macros", "../libs/gcp_pipeline_transform/dbt_shared/macros"]
|
|
12
|
+
snapshot-paths: ["snapshots"]
|
|
13
|
+
|
|
14
|
+
target-path: "target"
|
|
15
|
+
clean-targets:
|
|
16
|
+
- "target"
|
|
17
|
+
- "dbt_packages"
|
|
18
|
+
|
|
19
|
+
# Documentation
|
|
20
|
+
docs-paths: ["docs"]
|
|
21
|
+
|
|
22
|
+
models:
|
|
23
|
+
bigquery_mapped_product:
|
|
24
|
+
staging:
|
|
25
|
+
+materialized: view
|
|
26
|
+
+schema: staging
|
|
27
|
+
+tags: ["staging"]
|
|
28
|
+
fdp:
|
|
29
|
+
+materialized: incremental
|
|
30
|
+
+schema: fdp
|
|
31
|
+
+tags: ["fdp"]
|
|
32
|
+
marts:
|
|
33
|
+
+materialized: table
|
|
34
|
+
+schema: marts
|
|
35
|
+
+tags: ["marts"]
|
|
36
|
+
analytics:
|
|
37
|
+
+materialized: table
|
|
38
|
+
+schema: analytics
|
|
39
|
+
+tags: ["analytics"]
|
|
40
|
+
|
|
41
|
+
vars:
|
|
42
|
+
# GCP project
|
|
43
|
+
gcp_project_id: "{{ env_var('GCP_PROJECT_ID') }}"
|
|
44
|
+
|
|
45
|
+
# Consolidated datasets
|
|
46
|
+
source_dataset: "odp_generic"
|
|
47
|
+
staging_dataset: "stg_generic"
|
|
48
|
+
fdp_dataset: "fdp_generic"
|
|
49
|
+
marts_dataset: "marts_generic"
|
|
50
|
+
analytics_dataset: "analytics_generic"
|
|
51
|
+
|
|
52
|
+
# Date parameters
|
|
53
|
+
extract_date: null
|
|
54
|
+
current_date: "CURRENT_DATE()"
|
|
55
|
+
|
|
56
|
+
# Consolidated entities
|
|
57
|
+
all_entities: ['customers', 'accounts', 'decision', 'applications']
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
-- Generic dbt Macro: Data Quality Checks
|
|
2
|
+
-- Validates data quality metrics (key completeness and key uniqueness)
|
|
3
|
+
-- Thresholds configurable via dbt variables
|
|
4
|
+
|
|
5
|
+
{% macro check_data_quality(table_name, id_column='id', timestamp_column='created_at') %}
    {#-
      Emit a one-row quality summary for `table_name`. The row is returned only
      when both thresholds are met; an empty result means the check did not pass.

      Args:
        table_name:       table/relation to profile; also echoed as a string literal.
        id_column:        key column used for completeness and uniqueness (default `id`).
        timestamp_column: column whose max() is reported as `last_updated`
                          (default `created_at`).

      Vars:
        quality_completeness_threshold: minimum % of rows with a non-null key (default 95).
        quality_uniqueness_threshold:   minimum % of distinct keys per row (default 100).

      SAFE_DIVIDE returns NULL instead of raising a division-by-zero error when
      the table is empty; the NULL then fails the HAVING comparison, so an empty
      table yields no row rather than a query error.
    -#}
    select
        '{{ table_name }}' as table_name,
        count(*) as row_count,
        safe_divide(count({{ id_column }}), count(*)) * 100 as completeness_pct,
        safe_divide(count(distinct {{ id_column }}), count(*)) * 100 as uniqueness_pct,
        max({{ timestamp_column }}) as last_updated
    from {{ table_name }}
    having
        safe_divide(count({{ id_column }}), count(*)) * 100 >= {{ var('quality_completeness_threshold', 95) }}
        and safe_divide(count(distinct {{ id_column }}), count(*)) * 100 >= {{ var('quality_uniqueness_threshold', 100) }}
{% endmacro %}
|
|
17
|
+
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
-- Generic Blueprint - dbt Macro: Incremental Strategy
|
|
2
|
+
--
|
|
3
|
+
-- Provides different strategies for incremental loads:
|
|
4
|
+
-- - MERGE: Update existing, insert new (recommended)
|
|
5
|
+
-- - APPEND_ONLY: Always append (simple but slower)
|
|
6
|
+
-- - DELETE_INSERT: Delete+insert partition (good for daily batches)
|
|
7
|
+
--
|
|
8
|
+
-- Usage in dbt model:
|
|
9
|
+
-- {{ config(
|
|
10
|
+
-- materialized = 'incremental',
|
|
11
|
+
-- on_schema_change = 'fail',
|
|
12
|
+
-- incremental_strategy = 'merge',
|
|
13
|
+
-- unique_key = 'id'
|
|
14
|
+
-- ) }}
|
|
15
|
+
--
|
|
16
|
+
|
|
17
|
+
{% macro build_merge_statement(target_table, source_table, unique_key, update_columns) %}
    {#-
      Render a BigQuery MERGE that upserts `source_table` into `target_table`.

      Args:
        target_table:   relation being merged into (aliased T).
        source_table:   relation supplying new/changed rows (aliased S).
        unique_key:     single column name used for the T/S match condition.
        update_columns: list of column names to overwrite on a match. When empty,
                        the WHEN MATCHED clause is omitted, because a bare
                        `UPDATE SET` with no assignments is a syntax error.

      Unmatched rows are inserted with `INSERT ROW`, which copies every source
      column and therefore requires S and T to have the same columns.
      BigQuery does not accept `INSERT (*) VALUES (*)`.
    -#}
    MERGE INTO {{ target_table }} T
    USING {{ source_table }} S
    ON T.{{ unique_key }} = S.{{ unique_key }}
    {% if update_columns %}

    -- Update existing records
    WHEN MATCHED THEN
        UPDATE SET
        {% for column in update_columns %}
            {{ column }} = S.{{ column }}{{ "," if not loop.last }}
        {% endfor %}
    {% endif %}

    -- Insert new records
    WHEN NOT MATCHED THEN
        INSERT ROW
{% endmacro %}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
{% macro build_append_strategy(source_relation=none) %}
    {#-
      Render the SELECT used by the append-only strategy: every row of
      `source_relation` that belongs to the current run.

      Simplest strategy; suitable when data is immutable, speed is not critical,
      and simplicity is preferred.

      Args:
        source_relation: relation to read from, e.g. ref('stg_generic_customers').
                         Required in practice. The previous body interpolated
                         `{{ source }}`, which is dbt's source() function object
                         and not a table, so it could never render valid SQL.

      Vars:
        run_id: identifier of the current run (no default; dbt raises if unset).
    -#}
    {% if source_relation is none %}
        {% do exceptions.raise_compiler_error(
            "build_append_strategy requires source_relation, e.g. build_append_strategy(ref('my_staging_model'))"
        ) %}
    {% endif %}
    SELECT *
    FROM {{ source_relation }}
    WHERE run_id = '{{ var("run_id") }}'
{% endmacro %}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
{% macro build_delete_insert_strategy(unique_key, date_column, source_relation=none, run_date=none) %}
    {#-
      Render a delete-then-insert pair for one daily batch: remove the rows of
      `{{ this }}` for the batch date, then insert the current run's rows.

      Suitable for daily batches where the current day's data may be updated
      and historical data is immutable.

      Args:
        unique_key:      accepted for interface compatibility; not used in the SQL.
        date_column:     column of `{{ this }}` whose DATE() identifies the batch.
        source_relation: relation to read new rows from. Required in practice;
                         the previous body interpolated `{{ source }}` (dbt's
                         source() function object), which is not a table.
        run_date:        batch date as 'YYYY-MM-DD'. Falls back to var("run_date").
                         The previous body referenced an undefined `run_date`
                         name, which rendered as an empty string.

      Vars:
        run_id:   identifier of the current run (no default; dbt raises if unset).
        run_date: optional fallback for the `run_date` argument.
    -#}
    {% set batch_date = run_date if run_date is not none else var("run_date", none) %}
    {% if source_relation is none %}
        {% do exceptions.raise_compiler_error(
            "build_delete_insert_strategy requires source_relation, e.g. ref('my_staging_model')"
        ) %}
    {% endif %}
    {% if batch_date is none %}
        {% do exceptions.raise_compiler_error(
            "build_delete_insert_strategy requires run_date (argument or var 'run_date')"
        ) %}
    {% endif %}

    DELETE FROM {{ this }}
    WHERE DATE({{ date_column }}) = '{{ batch_date }}';

    INSERT INTO {{ this }}
    SELECT * FROM {{ source_relation }}
    WHERE run_id = '{{ var("run_id") }}'
{% endmacro %}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
-- Strategy dispatch for incremental loads (merge / delete_insert / append)
{#
  Chooses the statement to emit from the `incremental_strategy` var.

  The var now defaults to "append" so the final branch really is the default;
  without a default, dbt raises on an unset var and that branch could only be
  reached by setting an unrecognised value explicitly.

  NOTE(review): this block sits outside any {% macro %} in a macros file;
  presumably dbt does not evaluate it there. Confirm whether it should be
  wrapped in a macro or moved into a model.
  NOTE(review): `source_data` is not defined anywhere in the visible source;
  verify where it is expected to come from.
  NOTE(review): the full-refresh branch only truncates; no reload statement
  follows it here.
#}
{% if execute %}
    {% set strategy = var("incremental_strategy", "append") %}
    {% if flags.FULL_REFRESH %}
        -- Full refresh: truncate and reload
        TRUNCATE TABLE {{ this }};
    {% elif strategy == "merge" %}
        -- Use merge for incremental loads
        {{ build_merge_statement(this, source_data, var("unique_key"), var("update_columns")) }}
    {% elif strategy == "delete_insert" %}
        -- Use delete+insert for daily batches
        {{ build_delete_insert_strategy(var("unique_key"), var("date_column")) }}
    {% else %}
        -- Default: append only
        INSERT INTO {{ this }}
        {{ build_append_strategy() }}
    {% endif %}
{% endif %}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
-- Macro to handle late-arriving data
|
|
87
|
+
{% macro handle_late_arriving_data(table, days_lookback, unique_key='id', update_columns=none) %}
    {#-
      Build and run a two-statement BigQuery script:
        1. stage rows of `table` from the last `days_lookback` days that were
           not written by the current run into the temp table `late_arrivals`;
        2. MERGE `late_arrivals` back into `table` on `unique_key`.

      Args:
        table:          relation to scan and merge into.
        days_lookback:  number of days, interpolated into DATE_SUB(... INTERVAL n DAY).
        unique_key:     match column for the MERGE (default `id`, as before).
        update_columns: optional list of columns to overwrite on a match.
                        BigQuery has no whole-row `UPDATE SET T.* = S.*`, so
                        when this is omitted the WHEN MATCHED clause is skipped.

      Vars:
        run_id: identifier of the current run (no default; dbt raises if unset).

      `INSERT ROW` replaces the previous `INSERT *`, which is not BigQuery syntax;
      it requires the staged rows and `table` to have the same columns.

      NOTE(review): the staged rows are selected from `table` itself, so every
      one of them already matches on `unique_key`. Confirm whether the source
      was meant to be a different (staging/ODP) relation.
    -#}
    {% set sql %}
        CREATE OR REPLACE TEMP TABLE late_arrivals AS
        SELECT *
        FROM {{ table }}
        WHERE DATE(processed_timestamp) >= DATE_SUB(CURRENT_DATE(), INTERVAL {{ days_lookback }} DAY)
        AND run_id != '{{ var("run_id") }}';

        MERGE INTO {{ table }} T
        USING late_arrivals S
        ON T.{{ unique_key }} = S.{{ unique_key }}
        {% if update_columns %}
        WHEN MATCHED THEN
            UPDATE SET
            {% for column in update_columns %}
                {{ column }} = S.{{ column }}{{ "," if not loop.last }}
            {% endfor %}
        {% endif %}
        WHEN NOT MATCHED THEN
            INSERT ROW;
    {% endset %}

    {% do run_query(sql) %}
{% endmacro %}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
-- Macro for SCD Type 2 (Slowly Changing Dimension)
|
|
109
|
+
{% macro scd2_logic(business_key, change_columns, source_relation=none) %}
    {#-
      Render a SELECT that decorates rows with SCD Type 2 bookkeeping columns:
      `valid_from` (now), `valid_to` (NULL), `is_current` (TRUE) and a `version`
      number per `business_key`.

      Args:
        business_key:    column (or expression) used in PARTITION BY for versioning.
        change_columns:  list of tracked column names; they are selected and also
                         used, in order, as the ORDER BY of the version numbering.
        source_relation: relation to read from. Required in practice; the previous
                         body interpolated `{{ source }}` (dbt's source() function
                         object), which is not a table.

      `valid_to` is cast to TIMESTAMP so it has the same type as `valid_from`
      rather than the type BigQuery assigns to an untyped NULL literal.

      NOTE(review): `business_key` is not selected unless it is also listed in
      `change_columns`, and an empty `change_columns` renders an empty ORDER BY.
      Confirm the intended contract with callers.
    -#}
    {% if source_relation is none %}
        {% do exceptions.raise_compiler_error(
            "scd2_logic requires source_relation, e.g. ref('my_staging_model')"
        ) %}
    {% endif %}
    SELECT
        {% for col in change_columns %}
        {{ col }},
        {% endfor %}
        CURRENT_TIMESTAMP() as valid_from,
        CAST(NULL AS TIMESTAMP) as valid_to,
        TRUE as is_current,
        ROW_NUMBER() OVER (PARTITION BY {{ business_key }} ORDER BY {{ change_columns | join(", ") }}) as version
    FROM {{ source_relation }}
{% endmacro %}
|
|
120
|
+
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
version: 2
|
|
2
|
+
|
|
3
|
+
models:
|
|
4
|
+
- name: event_transaction_excess
|
|
5
|
+
description: |
|
|
6
|
+
Generic FDP: Event Transaction Excess
|
|
7
|
+
|
|
8
|
+
JOIN of 2 ODP sources (customers, accounts) into a single
|
|
9
|
+
table for transaction-level analysis.
|
|
10
|
+
|
|
11
|
+
config:
|
|
12
|
+
tags: ["fdp", "generic", "join"]
|
|
13
|
+
materialized: table
|
|
14
|
+
partition_by:
|
|
15
|
+
field: _extract_date
|
|
16
|
+
data_type: date
|
|
17
|
+
cluster_by: [customer_id, account_id]
|
|
18
|
+
|
|
19
|
+
columns:
|
|
20
|
+
- name: event_key
|
|
21
|
+
tests: [not_null, unique]
|
|
22
|
+
- name: customer_id
|
|
23
|
+
tests: [not_null]
|
|
24
|
+
- name: account_id
|
|
25
|
+
tests: [not_null]
|
|
26
|
+
- name: ssn_masked
|
|
27
|
+
- name: first_name
|
|
28
|
+
- name: last_name
|
|
29
|
+
- name: current_balance
|
|
30
|
+
- name: account_type_desc
|
|
31
|
+
- name: _run_id
|
|
32
|
+
tests: [not_null]
|
|
33
|
+
- name: _extract_date
|
|
34
|
+
tests: [not_null]
|
|
35
|
+
- name: _transformed_ts
|
|
36
|
+
|
|
37
|
+
- name: portfolio_account_excess
|
|
38
|
+
description: |
|
|
39
|
+
Generic FDP: Portfolio Account Excess
|
|
40
|
+
|
|
41
|
+
MAP of 1 ODP source (decision) into FDP target for portfolio analysis.
|
|
42
|
+
|
|
43
|
+
config:
|
|
44
|
+
tags: ["fdp", "generic", "map"]
|
|
45
|
+
materialized: table
|
|
46
|
+
partition_by:
|
|
47
|
+
field: _extract_date
|
|
48
|
+
data_type: date
|
|
49
|
+
cluster_by: [customer_id, _run_id]
|
|
50
|
+
|
|
51
|
+
columns:
|
|
52
|
+
- name: portfolio_key
|
|
53
|
+
tests: [not_null, unique]
|
|
54
|
+
- name: decision_id
|
|
55
|
+
tests: [not_null]
|
|
56
|
+
- name: customer_id
|
|
57
|
+
tests: [not_null]
|
|
58
|
+
- name: decision_outcome
|
|
59
|
+
- name: score
|
|
60
|
+
- name: _run_id
|
|
61
|
+
tests: [not_null]
|
|
62
|
+
- name: _extract_date
|
|
63
|
+
tests: [not_null]
|
|
64
|
+
- name: _transformed_ts
|
|
65
|
+
|
|
66
|
+
- name: portfolio_account_facility
|
|
67
|
+
description: |
|
|
68
|
+
Generic FDP: Portfolio Account Facility
|
|
69
|
+
|
|
70
|
+
MAP of 1 ODP source (applications) into FDP target for facility analysis.
|
|
71
|
+
|
|
72
|
+
config:
|
|
73
|
+
tags: ["fdp", "generic", "map"]
|
|
74
|
+
materialized: table
|
|
75
|
+
partition_by:
|
|
76
|
+
field: _extract_date
|
|
77
|
+
data_type: date
|
|
78
|
+
cluster_by: [application_id, customer_id]
|
|
79
|
+
|
|
80
|
+
columns:
|
|
81
|
+
- name: facility_key
|
|
82
|
+
tests: [not_null, unique]
|
|
83
|
+
- name: application_id
|
|
84
|
+
tests: [not_null]
|
|
85
|
+
- name: customer_id
|
|
86
|
+
- name: loan_amount
|
|
87
|
+
- name: application_date
|
|
88
|
+
- name: application_status
|
|
89
|
+
- name: _run_id
|
|
90
|
+
tests: [not_null]
|
|
91
|
+
- name: _extract_date
|
|
92
|
+
tests: [not_null]
|
|
93
|
+
- name: _transformed_at
|