fabricks 2024.7.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{% import 'query/hash.sql.jinja' as h -%}
|
|
2
|
+
|
|
3
|
+
__current as (
|
|
4
|
+
select
|
|
5
|
+
{% for field in fields %} {{ field }}, {% endfor %}
|
|
6
|
+
'current' as __operation,
|
|
7
|
+
'current' as __original_operation,
|
|
8
|
+
{% if has_timestamp %}
|
|
9
|
+
{% if cdc == "nocdc" %} __timestamp as __timestamp, {% endif %}
|
|
10
|
+
{% if cdc == "scd1" %} __timestamp as __timestamp, {% endif %}
|
|
11
|
+
{% if cdc == "scd2" %} __valid_from as __timestamp, {% endif %}
|
|
12
|
+
{% else %} cast('0001-01-01' as timestamp) as __timestamp,
|
|
13
|
+
{% endif %}
|
|
14
|
+
__timestamp as __original_timestamp,
|
|
15
|
+
{% if has_hash %} __hash,
|
|
16
|
+
{% else %} {{ h.hash(fields=hashes) }} as __hash,
|
|
17
|
+
{% endif %}
|
|
18
|
+
{% if has_identity %} __identity, {% endif %}
|
|
19
|
+
{% if has_key %} __key,
|
|
20
|
+
{% else %} {{ h.hash(fields=keys) }} as __key,
|
|
21
|
+
{% endif %}
|
|
22
|
+
{% if has_source %} __source, {% endif %}
|
|
23
|
+
{% if has_metadata %} __metadata, {% endif %}
|
|
24
|
+
{% if has_rescued_data %} __rescued_data, {% endif %}
|
|
25
|
+
from {{ tgt }} t
|
|
26
|
+
where
|
|
27
|
+
true
|
|
28
|
+
{% if cdc == "scd2" %} and __is_current {% endif %}
|
|
29
|
+
{% if cdc == "scd1" %} {% if soft_delete %} and __is_current {% endif %} {% endif %}
|
|
30
|
+
{% if has_source %} and exists (select 1 from __base s where s.__source == t.__source) {% endif %}
|
|
31
|
+
{% if update_where %} and {{ update_where }} {% endif %}
|
|
32
|
+
),
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
__deduplicate_hash as (
|
|
2
|
+
select
|
|
3
|
+
*,
|
|
4
|
+
lag(__hash) over (
|
|
5
|
+
partition by {% if has_source %} __source, {% endif %} __key order by __timestamp asc
|
|
6
|
+
) as __deduplicate_hash_previous__hash,
|
|
7
|
+
lag(__operation) over (
|
|
8
|
+
partition by {% if has_source %} __source, {% endif %} __key order by __timestamp asc
|
|
9
|
+
) as __deduplicate_hash_previous_operation
|
|
10
|
+
from {{ parent_deduplicate_hash }}
|
|
11
|
+
where true
|
|
12
|
+
),
|
|
13
|
+
__deduplicated_hash as (
|
|
14
|
+
select *
|
|
15
|
+
from __deduplicate_hash
|
|
16
|
+
where
|
|
17
|
+
true
|
|
18
|
+
and not (
|
|
19
|
+
__hash <=> __deduplicate_hash_previous__hash and __operation <=> __deduplicate_hash_previous_operation
|
|
20
|
+
)
|
|
21
|
+
),
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
__deduplicate_key as (
|
|
2
|
+
select
|
|
3
|
+
*,
|
|
4
|
+
row_number() over (
|
|
5
|
+
partition by {% if has_source %} __source, {% endif %} __key, __timestamp
|
|
6
|
+
order by
|
|
7
|
+
/* prioritize delete over upsert */
|
|
8
|
+
__operation asc,
|
|
9
|
+
{% if has_order_by %} {% for o in order_duplicate_by %} {{ o }}, {% endfor %} {% endif %}
|
|
10
|
+
) as __deduplicate_key_rn
|
|
11
|
+
from {{ parent_deduplicate_key }}
|
|
12
|
+
where true
|
|
13
|
+
),
|
|
14
|
+
__deduplicated_key as (select *, from __deduplicate_key where __deduplicate_key_rn == 1),
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
{% if filter == "latest" %}
|
|
2
|
+
__filter_latest_timestamp as (
|
|
3
|
+
select {% if has_source %} t.__source, {% endif %} max(t.__timestamp) as __max_timestamp
|
|
4
|
+
from {{ parent_filter }} t
|
|
5
|
+
{% if has_source %} group by t.__source {% endif %}
|
|
6
|
+
),
|
|
7
|
+
__filtered as (
|
|
8
|
+
select
|
|
9
|
+
{% for field in fields %} {{ field }}, {% endfor %}
|
|
10
|
+
s.__operation,
|
|
11
|
+
s.__timestamp,
|
|
12
|
+
s.__hash,
|
|
13
|
+
s.__key,
|
|
14
|
+
{% if has_source %} s.__source, {% endif %}
|
|
15
|
+
{% if has_metadata %} s.__metadata, {% endif %}
|
|
16
|
+
{% if has_rescued_data %} __rescued_data, {% endif %}
|
|
17
|
+
from {{ parent_filter }} s
|
|
18
|
+
where
|
|
19
|
+
true
|
|
20
|
+
and exists (
|
|
21
|
+
select 1
|
|
22
|
+
from __filter_latest_timestamp m
|
|
23
|
+
where
|
|
24
|
+
true and s.__timestamp = m.__max_timestamp
|
|
25
|
+
{% if has_source %} and s.__source == m.__source {% endif %}
|
|
26
|
+
)
|
|
27
|
+
),
|
|
28
|
+
{% else %}
|
|
29
|
+
__filter_updated_timestamp as (
|
|
30
|
+
select
|
|
31
|
+
{% if has_source %} t.__source,
|
|
32
|
+
{% endif %}
|
|
33
|
+
{% if cdc == "nocdc" %} coalesce(max(t.__timestamp), cast('0001-01-01' as timestamp)) as __max_timestamp
|
|
34
|
+
{% endif %}
|
|
35
|
+
{% if cdc == "scd1" %} coalesce(max(t.__timestamp), cast('0001-01-01' as timestamp)) as __max_timestamp
|
|
36
|
+
{% endif %}
|
|
37
|
+
{% if cdc == "scd2" %} coalesce(max(t.__valid_from), cast('0001-01-01' as timestamp)) as __max_timestamp
|
|
38
|
+
{% endif %}
|
|
39
|
+
from {{ tgt }} t
|
|
40
|
+
where true
|
|
41
|
+
{% if has_source %}
|
|
42
|
+
and exists (select 1 from {{ parent_filter }} s where s.__source == t.__source) group by t.__source
|
|
43
|
+
{% endif %}
|
|
44
|
+
),
|
|
45
|
+
__filtered as (
|
|
46
|
+
select
|
|
47
|
+
{% for field in fields %} {{ field }},
|
|
48
|
+
{% endfor %}
|
|
49
|
+
s.__operation,
|
|
50
|
+
s.__timestamp,
|
|
51
|
+
s.__hash,
|
|
52
|
+
s.__key,
|
|
53
|
+
{% if has_source %} s.__source,
|
|
54
|
+
{% endif %}
|
|
55
|
+
{% if has_metadata %} s.__metadata,
|
|
56
|
+
{% endif %}
|
|
57
|
+
{% if has_rescued_data %} __rescued_data,
|
|
58
|
+
{% endif %}
|
|
59
|
+
from {{ parent_filter }} s
|
|
60
|
+
where
|
|
61
|
+
true
|
|
62
|
+
and exists (
|
|
63
|
+
select 1
|
|
64
|
+
from __filter_updated_timestamp m
|
|
65
|
+
where
|
|
66
|
+
true and s.__timestamp > m.__max_timestamp
|
|
67
|
+
{% if has_source %} and s.__source == m.__source
|
|
68
|
+
{% endif %}
|
|
69
|
+
)
|
|
70
|
+
),
|
|
71
|
+
{% endif %}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
select * {% if all_except %} except ({% for e in all_except %}{{ e }}, {% endfor %}), {% endif %} from __final
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{% macro hash(fields) -%} md5(array_join(array({% for f in fields %}{{ f }}, {% endfor %}), '*', '-1')) {%- endmacro %}
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
{% import 'query/hash.sql.jinja' as h -%}
|
|
2
|
+
|
|
3
|
+
{% if mode == "update" %}
|
|
4
|
+
__rectified_base as (
|
|
5
|
+
select
|
|
6
|
+
{% for field in fields %} {{ field }}, {% endfor %}
|
|
7
|
+
s.__operation,
|
|
8
|
+
s.__operation as __original_operation,
|
|
9
|
+
s.__timestamp,
|
|
10
|
+
s.__timestamp as __original_timestamp,
|
|
11
|
+
s.__hash,
|
|
12
|
+
s.__key,
|
|
13
|
+
{% if has_identity %} s.__identity, {% endif %}
|
|
14
|
+
{% if has_source %} s.__source, {% endif %}
|
|
15
|
+
{% if has_metadata %} s.__metadata, {% endif %}
|
|
16
|
+
{% if has_rescued_data %} s.__rescued_data, {% endif %}
|
|
17
|
+
from {{ parent_rectify }} s
|
|
18
|
+
{% if has_rows %}
|
|
19
|
+
union all
|
|
20
|
+
select *
|
|
21
|
+
from __current
|
|
22
|
+
{% endif %}
|
|
23
|
+
),
|
|
24
|
+
{% endif %}
|
|
25
|
+
__rectified_next_operation as (
|
|
26
|
+
select
|
|
27
|
+
*,
|
|
28
|
+
lead(__operation) over (
|
|
29
|
+
partition by {% if has_source %} __source, {% endif %} __key order by __timestamp asc
|
|
30
|
+
) as __rectified_next_operation
|
|
31
|
+
{% if mode == "update" %} from __rectified_base p
|
|
32
|
+
{% else %} from {{ parent_rectify }}
|
|
33
|
+
{% endif %}
|
|
34
|
+
),
|
|
35
|
+
__rectified_timestamps as (
|
|
36
|
+
select
|
|
37
|
+
{% if has_source %} __source, {% endif %}
|
|
38
|
+
__timestamp,
|
|
39
|
+
lead(__timestamp) over (
|
|
40
|
+
{% if has_source %}partition by __source {% endif %} order by __timestamp asc
|
|
41
|
+
) as __rectified_next_timestamp,
|
|
42
|
+
lead(if(max(__operation) == 'reload', __timestamp, null)) ignore nulls over (
|
|
43
|
+
{% if has_source %}partition by __source {% endif %} order by __timestamp asc
|
|
44
|
+
) as __rectified_next_timestamp_reload,
|
|
45
|
+
from __rectified_next_operation
|
|
46
|
+
group by {% if has_source %} __source, {% endif %} __timestamp
|
|
47
|
+
),
|
|
48
|
+
__rectified_is_deleted_next as (
|
|
49
|
+
select
|
|
50
|
+
cur.*,
|
|
51
|
+
t.* except ({% if has_source %}t.__source, {% endif %} t.__timestamp),
|
|
52
|
+
-- there is more reload
|
|
53
|
+
__rectified_next_timestamp_reload is not null as __rectify_more_reload,
|
|
54
|
+
-- the next operation is before the next reload
|
|
55
|
+
if(
|
|
56
|
+
__rectify_more_reload, t.__rectified_next_timestamp < t.__rectified_next_timestamp_reload, true
|
|
57
|
+
) as __rectify_key_next_operation_before_next_reload,
|
|
58
|
+
-- the record is deleted
|
|
59
|
+
cur.__operation == 'delete' as __rectified_key_is_deleted,
|
|
60
|
+
-- the record is not found in next reload
|
|
61
|
+
__rectify_more_reload and nxt.__timestamp is null as __rectified_key_not_found_in_next_reload,
|
|
62
|
+
-- there is no more operation
|
|
63
|
+
t.__rectified_next_timestamp is null as __rectified_no_more_operation,
|
|
64
|
+
-- the record is deleted before next reload
|
|
65
|
+
__rectify_key_next_operation_before_next_reload
|
|
66
|
+
and cur.__rectified_next_operation <=> 'delete' as __rectified_key_is_deleted_next,
|
|
67
|
+
-- the record is updated before next reload
|
|
68
|
+
__rectify_key_next_operation_before_next_reload
|
|
69
|
+
and cur.__rectified_next_operation <=> 'upsert' as __rectified_key_is_updated_next,
|
|
70
|
+
case
|
|
71
|
+
when __rectified_key_is_deleted
|
|
72
|
+
then false
|
|
73
|
+
when __rectified_key_is_updated_next
|
|
74
|
+
then false
|
|
75
|
+
when __rectified_key_is_deleted_next
|
|
76
|
+
then false
|
|
77
|
+
when __rectified_no_more_operation
|
|
78
|
+
then false
|
|
79
|
+
when __rectified_key_not_found_in_next_reload
|
|
80
|
+
then true
|
|
81
|
+
else false
|
|
82
|
+
end as __rectified_is_deleted_next
|
|
83
|
+
from __rectified_next_operation cur
|
|
84
|
+
left join
|
|
85
|
+
__rectified_timestamps t on cur.__timestamp = t.__timestamp
|
|
86
|
+
{% if has_source %} and cur.__source == t.__source {% endif %}
|
|
87
|
+
left join
|
|
88
|
+
__rectified_next_operation nxt
|
|
89
|
+
on t.__rectified_next_timestamp_reload = nxt.__timestamp
|
|
90
|
+
and cur.__key == nxt.__key
|
|
91
|
+
{% if has_source %} and cur.__source == nxt.__source {% endif %}
|
|
92
|
+
),
|
|
93
|
+
{% if mode == "complete" %} __rectified as ( {% else %} __rectified_operation as (
|
|
94
|
+
{% endif %}
|
|
95
|
+
select
|
|
96
|
+
c.* except (c.__operation, c.__timestamp),
|
|
97
|
+
if(c.__operation == 'delete', 'delete', d2.__rectified_operation) as __operation,
|
|
98
|
+
case
|
|
99
|
+
when c.__operation == 'delete'
|
|
100
|
+
then c.__timestamp
|
|
101
|
+
when d2.__rectified_operation == 'upsert'
|
|
102
|
+
then c.__timestamp
|
|
103
|
+
else c.__rectified_next_timestamp_reload
|
|
104
|
+
end as __timestamp
|
|
105
|
+
from __rectified_is_deleted_next c
|
|
106
|
+
cross join
|
|
107
|
+
(
|
|
108
|
+
select false as __is_deleted, 'upsert' as __rectified_operation
|
|
109
|
+
union all
|
|
110
|
+
select true as __is_deleted, 'delete' as __rectified_operation
|
|
111
|
+
union all
|
|
112
|
+
select true as __is_deleted, 'upsert' as __rectified_operation
|
|
113
|
+
) d2
|
|
114
|
+
on c.__rectified_is_deleted_next = d2.__is_deleted
|
|
115
|
+
),
|
|
116
|
+
{% if mode == "update" %}
|
|
117
|
+
__rectified as (
|
|
118
|
+
select * from __rectified_operation where not (__original_operation == 'current' and __operation == 'upsert')
|
|
119
|
+
),
|
|
120
|
+
{% endif %}
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
__scd1_base as (
|
|
2
|
+
select
|
|
3
|
+
*,
|
|
4
|
+
{% if not rectify %} __operation as __original_operation, {% endif %}
|
|
5
|
+
lead(__operation) over (
|
|
6
|
+
partition by {% if has_source %} __source, {% endif %} __key order by __timestamp asc
|
|
7
|
+
) as __scd1_next_operation
|
|
8
|
+
from {{ parent_cdc }}
|
|
9
|
+
),
|
|
10
|
+
__scd1_last_key as (
|
|
11
|
+
-- take last update as it is the latest picture
|
|
12
|
+
select
|
|
13
|
+
*,
|
|
14
|
+
row_number() over (
|
|
15
|
+
partition by {% if has_source %} __source, {% endif %} __key order by __timestamp desc
|
|
16
|
+
) as __scd1_rn
|
|
17
|
+
from __scd1_base
|
|
18
|
+
where true and __operation == 'upsert'
|
|
19
|
+
{% if mode == "update" %}
|
|
20
|
+
{% if has_rows %}
|
|
21
|
+
-- take first delete ONLY if no upsert is present
|
|
22
|
+
union all
|
|
23
|
+
select
|
|
24
|
+
*,
|
|
25
|
+
row_number() over (
|
|
26
|
+
partition by {% if has_source %} __source, {% endif %} __key order by __timestamp asc
|
|
27
|
+
) as __scd1_rn
|
|
28
|
+
from __scd1_base b
|
|
29
|
+
where
|
|
30
|
+
true
|
|
31
|
+
and __operation == 'delete'
|
|
32
|
+
and not exists (
|
|
33
|
+
select 1
|
|
34
|
+
from __scd1_base b2
|
|
35
|
+
where
|
|
36
|
+
true and b.__key == b2.__key
|
|
37
|
+
{% if has_source %} and b.__source == b2.__source {% endif %} and b2.__operation == 'upsert'
|
|
38
|
+
)
|
|
39
|
+
{% endif %}
|
|
40
|
+
{% endif %}
|
|
41
|
+
),
|
|
42
|
+
__scd1 as (
|
|
43
|
+
select
|
|
44
|
+
*,
|
|
45
|
+
__scd1_next_operation <=> 'delete'
|
|
46
|
+
or __operation == 'delete' as __is_deleted,
|
|
47
|
+
not (__scd1_next_operation <=> 'delete' or __operation == 'delete') as __is_current
|
|
48
|
+
from __scd1_last_key
|
|
49
|
+
where true and __scd1_rn == 1
|
|
50
|
+
),
|
|
51
|
+
{% if mode == "complete" %}
|
|
52
|
+
__final as (
|
|
53
|
+
select
|
|
54
|
+
{% for field in fields %} s.{{ field }}, {% endfor %},
|
|
55
|
+
{% if has_identity %} __identity, {% endif %}
|
|
56
|
+
s.__key,
|
|
57
|
+
s.__timestamp,
|
|
58
|
+
{% if soft_delete %} s.__is_current, s.__is_deleted, {% endif %}
|
|
59
|
+
s.__hash,
|
|
60
|
+
{% if has_source %} s.__source, {% endif %}
|
|
61
|
+
{% if has_metadata %} s.__metadata, {% endif %}
|
|
62
|
+
{% if has_rescued_data %} s.__rescued_data, {% endif %}
|
|
63
|
+
from __scd1 s
|
|
64
|
+
where true {% if not soft_delete %} and s.__is_current {% endif %}
|
|
65
|
+
)
|
|
66
|
+
{% else %}
|
|
67
|
+
__merge_condition as (
|
|
68
|
+
select s.*, s.__key as __merge_key, o.__merge_condition
|
|
69
|
+
from __scd1 s
|
|
70
|
+
left join
|
|
71
|
+
(
|
|
72
|
+
select 'upsert' as __operation, 'upsert' as __merge_condition
|
|
73
|
+
{% if has_rows %}
|
|
74
|
+
union all
|
|
75
|
+
select 'delete' as __operation, 'delete' as __merge_condition
|
|
76
|
+
{% endif %}
|
|
77
|
+
) o
|
|
78
|
+
on s.__operation = o.__operation
|
|
79
|
+
),
|
|
80
|
+
{% if has_rows %}
|
|
81
|
+
__scd1_no_fake_update as (
|
|
82
|
+
select *
|
|
83
|
+
from __merge_condition m left
|
|
84
|
+
anti join
|
|
85
|
+
__current c on m.__key == c.__key and m.__hash = c.__hash
|
|
86
|
+
{% if has_source %} and m.__source = c.__source {% endif %} and m.__operation == 'upsert'
|
|
87
|
+
),
|
|
88
|
+
{% endif %}
|
|
89
|
+
__final as (
|
|
90
|
+
select
|
|
91
|
+
__merge_key,
|
|
92
|
+
__merge_condition,
|
|
93
|
+
{% for field in fields %} {{ field }},
|
|
94
|
+
{% endfor %},
|
|
95
|
+
{% if has_identity %} __identity,
|
|
96
|
+
{% endif %}
|
|
97
|
+
__key,
|
|
98
|
+
__timestamp,
|
|
99
|
+
{% if soft_delete %} __is_current, __is_deleted,
|
|
100
|
+
{% endif %}
|
|
101
|
+
__hash,
|
|
102
|
+
{% if has_source %} __source,
|
|
103
|
+
{% endif %}
|
|
104
|
+
{% if has_metadata %} __metadata,
|
|
105
|
+
{% endif %}
|
|
106
|
+
{% if has_rescued_data %} __rescued_data,
|
|
107
|
+
{% endif %}
|
|
108
|
+
{% if has_rows %} from __scd1_no_fake_update m
|
|
109
|
+
{% else %} from __merge_condition m
|
|
110
|
+
{% endif %}
|
|
111
|
+
)
|
|
112
|
+
{% endif %}
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
__scd2_base as (
|
|
2
|
+
select
|
|
3
|
+
*,
|
|
4
|
+
{% if not rectify %} __operation as __original_operation, {% endif %}
|
|
5
|
+
lead(__operation) over (
|
|
6
|
+
partition by {% if has_source %} __source, {% endif %} __key order by __timestamp
|
|
7
|
+
) as __scd2_next_operation,
|
|
8
|
+
lead(__timestamp) over (
|
|
9
|
+
partition by {% if has_source %} __source, {% endif %} __key order by __timestamp
|
|
10
|
+
) as __scd2_next_timestamp
|
|
11
|
+
from {{ parent_cdc }}
|
|
12
|
+
),
|
|
13
|
+
__scd2 as (
|
|
14
|
+
select
|
|
15
|
+
*,
|
|
16
|
+
__timestamp as __valid_from,
|
|
17
|
+
coalesce(__scd2_next_timestamp - interval 1 second, cast('9999-12-31' as timestamp)) as __valid_to,
|
|
18
|
+
__operation <> 'delete' and __valid_to <=> '9999-12-31' as __is_current,
|
|
19
|
+
__operation == 'delete' or __scd2_next_operation <=> 'delete' as __is_deleted,
|
|
20
|
+
{% if mode == "update" %}
|
|
21
|
+
row_number() over (
|
|
22
|
+
partition by __key{% if has_source %}, __source{% endif %} order by __timestamp asc
|
|
23
|
+
) as __scd2_rn
|
|
24
|
+
{% endif %}
|
|
25
|
+
from __scd2_base
|
|
26
|
+
),
|
|
27
|
+
{% if mode == "complete" %}
|
|
28
|
+
__complete as (select s.* from __scd2 s where true and not __operation <=> 'delete'),
|
|
29
|
+
__final as (
|
|
30
|
+
select
|
|
31
|
+
{% for field in fields %} {{ field }}, {% endfor %},
|
|
32
|
+
{% if has_identity %} __identity, {% endif %}
|
|
33
|
+
__key,
|
|
34
|
+
if(
|
|
35
|
+
__valid_from == min(__valid_from) over (partition by null),
|
|
36
|
+
cast('1900-01-01' as timestamp),
|
|
37
|
+
__valid_from
|
|
38
|
+
) as __valid_from,
|
|
39
|
+
__valid_to,
|
|
40
|
+
__is_current,
|
|
41
|
+
{% if soft_delete %} __is_deleted, {% endif %}
|
|
42
|
+
__hash,
|
|
43
|
+
{% if has_source %} __source, {% endif %}
|
|
44
|
+
{% if has_metadata %} __metadata, {% endif %}
|
|
45
|
+
{% if has_rescued_data %} __rescued_data, {% endif %}
|
|
46
|
+
from __complete
|
|
47
|
+
)
|
|
48
|
+
{% else %}
|
|
49
|
+
{% if has_rows %}
|
|
50
|
+
__scd2_no_fake_update as (
|
|
51
|
+
select
|
|
52
|
+
* except (__scd2_rn),
|
|
53
|
+
/* renumber after the anti join; partition like the original __scd2_rn (key and, when present, source) */
row_number() over (partition by `__key`{% if has_source %}, `__source`{% endif %} order by `__timestamp` asc) as `__scd2_rn`
|
|
54
|
+
from __scd2 s left
|
|
55
|
+
anti join
|
|
56
|
+
__current c on s.__key == c.__key and s.__hash == c.__hash
|
|
57
|
+
{% if has_source %} and s.__source == c.__source {% endif %}
|
|
58
|
+
and s.__operation == 'upsert'
|
|
59
|
+
and s.__scd2_rn == 1
|
|
60
|
+
),
|
|
61
|
+
{% endif %}
|
|
62
|
+
__merge_condition as (
|
|
63
|
+
select s.*, if(__merge_condition == 'insert', null, __key) as __merge_key, o.__merge_condition
|
|
64
|
+
{% if has_rows %} from __scd2_no_fake_update s
|
|
65
|
+
{% else %} from __scd2 s
|
|
66
|
+
{% endif %}
|
|
67
|
+
inner join
|
|
68
|
+
(
|
|
69
|
+
select 'upsert' as __operation, 'insert' as __merge_condition
|
|
70
|
+
{% if has_rows %}
|
|
71
|
+
union all
|
|
72
|
+
select 'upsert' as __operation, 'update' as __merge_condition
|
|
73
|
+
union all
|
|
74
|
+
select 'delete' as __operation, 'delete' as __merge_condition
|
|
75
|
+
{% endif %}
|
|
76
|
+
) o
|
|
77
|
+
on s.__operation = o.__operation
|
|
78
|
+
-- only the first record can be an update or a delete
|
|
79
|
+
where (s.__scd2_rn == 1 and o.__merge_condition in ('update', 'delete')) or o.__merge_condition == 'insert'
|
|
80
|
+
),
|
|
81
|
+
__final as (
|
|
82
|
+
select
|
|
83
|
+
__merge_key,
|
|
84
|
+
__merge_condition,
|
|
85
|
+
{% for field in fields %} {{ field }},
|
|
86
|
+
{% endfor %},
|
|
87
|
+
{% if has_identity %} __identity,
|
|
88
|
+
{% endif %}
|
|
89
|
+
__key,
|
|
90
|
+
{% if fix_valid_from %}
|
|
91
|
+
{% if not has_rows %}
|
|
92
|
+
if(
|
|
93
|
+
__valid_from == min(__valid_from) over (partition by null),
|
|
94
|
+
cast('1900-01-01' as timestamp),
|
|
95
|
+
__valid_from
|
|
96
|
+
) as __valid_from,
|
|
97
|
+
{% else %} __valid_from,
|
|
98
|
+
{% endif %}
|
|
99
|
+
{% else %} __valid_from,
|
|
100
|
+
{% endif %}
|
|
101
|
+
__valid_to,
|
|
102
|
+
__is_current and __merge_condition == 'insert' as __is_current,
|
|
103
|
+
{% if soft_delete %} __is_deleted,
|
|
104
|
+
{% endif %}
|
|
105
|
+
__hash,
|
|
106
|
+
{% if has_source %} __source,
|
|
107
|
+
{% endif %}
|
|
108
|
+
{% if has_metadata %} __metadata,
|
|
109
|
+
{% endif %}
|
|
110
|
+
{% if has_rescued_data %} __rescued_data,
|
|
111
|
+
{% endif %}
|
|
112
|
+
from __merge_condition m
|
|
113
|
+
)
|
|
114
|
+
{% endif %}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
{% include 'query/context.sql.jinja' %}
|
|
2
|
+
{% include 'query/base.sql.jinja' %}
|
|
3
|
+
{% if filter %} {% include 'query/filter.sql.jinja' %} {% endif %}
|
|
4
|
+
{% if deduplicate_key %} {% include 'query/deduplicate_key.sql.jinja' %} {% endif %}
|
|
5
|
+
{% if mode == "update" %} {% if has_rows %} {% include 'query/current.sql.jinja' %} {% endif %} {% endif %}
|
|
6
|
+
{% if rectify %} {% include 'query/rectify.sql.jinja' %} {% endif %}
|
|
7
|
+
{% if deduplicate_hash %} {% include 'query/deduplicate_hash.sql.jinja' %} {% endif %}
|
|
8
|
+
{% if cdc == "nocdc" %} {% include 'query/nocdc.sql.jinja' %} {% endif %}
|
|
9
|
+
{% if cdc == "scd1" %} {% include 'query/scd1.sql.jinja' %} {% endif %}
|
|
10
|
+
{% if cdc == "scd2" %} {% include 'query/scd2.sql.jinja' %} {% endif %}
|
|
11
|
+
{% include 'query/final.sql.jinja' %}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from fabricks.context.runtime import (
|
|
2
|
+
BRONZE,
|
|
3
|
+
CONF_RUNTIME,
|
|
4
|
+
FABRICKS_STORAGE,
|
|
5
|
+
GOLD,
|
|
6
|
+
IS_DEBUG,
|
|
7
|
+
IS_LIVE,
|
|
8
|
+
IS_TEST,
|
|
9
|
+
PATH_EXTENDERS,
|
|
10
|
+
PATH_LIBRARIES,
|
|
11
|
+
PATH_PARSERS,
|
|
12
|
+
PATH_REQUIREMENTS,
|
|
13
|
+
PATH_RUNTIME,
|
|
14
|
+
PATH_SCHEDULES,
|
|
15
|
+
PATH_UDFS,
|
|
16
|
+
PATH_VIEWS,
|
|
17
|
+
PATHS_RUNTIME,
|
|
18
|
+
PATHS_STORAGE,
|
|
19
|
+
SECRET_SCOPE,
|
|
20
|
+
SILVER,
|
|
21
|
+
STEPS,
|
|
22
|
+
VARIABLES,
|
|
23
|
+
VERSION,
|
|
24
|
+
)
|
|
25
|
+
from fabricks.context.spark import build_spark_session
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"BRONZE",
|
|
29
|
+
"build_spark_session",
|
|
30
|
+
"CONF_RUNTIME",
|
|
31
|
+
"FABRICKS_STORAGE",
|
|
32
|
+
"GOLD",
|
|
33
|
+
"IS_DEBUG",
|
|
34
|
+
"IS_LIVE",
|
|
35
|
+
"IS_TEST",
|
|
36
|
+
"PATH_EXTENDERS",
|
|
37
|
+
"PATH_LIBRARIES",
|
|
38
|
+
"PATH_PARSERS",
|
|
39
|
+
"PATH_REQUIREMENTS",
|
|
40
|
+
"PATH_RUNTIME",
|
|
41
|
+
"PATH_SCHEDULES",
|
|
42
|
+
"PATH_UDFS",
|
|
43
|
+
"PATH_VIEWS",
|
|
44
|
+
"PATHS_RUNTIME",
|
|
45
|
+
"PATHS_STORAGE",
|
|
46
|
+
"SECRET_SCOPE",
|
|
47
|
+
"SILVER",
|
|
48
|
+
"STEPS",
|
|
49
|
+
"VARIABLES",
|
|
50
|
+
"VERSION",
|
|
51
|
+
]
|
fabricks/context/log.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from functools import wraps
|
|
3
|
+
from typing import Callable, cast
|
|
4
|
+
|
|
5
|
+
from fabricks.context.runtime import FABRICKS_STORAGE, SECRET_SCOPE
|
|
6
|
+
from fabricks.utils.azure_table import AzureTable
|
|
7
|
+
from fabricks.utils.log import get_logger
|
|
8
|
+
from fabricks.utils.secret import AccessKey, get_secret_from_secret_scope
|
|
9
|
+
|
|
10
|
+
# Runs at import time. Presumably this is the storage account *name*, since it
# is interpolated into the secret name below -- TODO confirm.
storage_account = FABRICKS_STORAGE.get_storage_account()
|
|
11
|
+
# Fetch the secret named "<storage_account>-access-key" from SECRET_SCOPE.
secret = get_secret_from_secret_scope(SECRET_SCOPE, f"{storage_account}-access-key")
|
|
12
|
+
# typing.cast is a no-op at runtime: nothing verifies that `secret` really is
# an AccessKey before `.key` is read.
access_key = cast(AccessKey, secret).key
|
|
13
|
+
|
|
14
|
+
# AzureTable named "logs", built from the storage account and access key above.
# NOTE(review): whether the constructor connects or creates the table is not
# visible from this module.
table = AzureTable("logs", storage_account=storage_account, access_key=access_key)
|
|
15
|
+
# get_logger is given the name "logs", logging.DEBUG and the table; it returns
# a pair, unpacked here. TableLogger is the object whose .flush() the `flush`
# decorator in this module calls.
Logger, TableLogger = get_logger("logs", logging.DEBUG, table=table)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def flush(func: Callable):
    """Decorate *func* so the table logger is flushed after every call.

    The returned wrapper forwards all positional and keyword arguments to
    *func* and hands back its result. ``TableLogger.flush()`` runs on the way
    out whether *func* returned normally or raised; an exception raised by
    *func* still propagates to the caller.
    """

    @wraps(func)
    def flushed(*call_args, **call_kwargs):
        try:
            outcome = func(*call_args, **call_kwargs)
        finally:
            # Executed on success and on failure alike.
            TableLogger.flush()
        return outcome

    return flushed
|