sandwich 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandwich/dialects/base.py +18 -48
- sandwich/dialects/ddl_mssql.py +2 -27
- sandwich/dialects/ddl_postgres.py +0 -18
- sandwich/dialects/mssql.py +65 -55
- sandwich/dialects/postgres.py +3 -4
- sandwich/dialects/utils.py +1 -3
- sandwich/dwh/__init__.py +82 -0
- sandwich/modeling/__init__.py +26 -9
- sandwich/modeling/strategies/base.py +94 -0
- sandwich/{strategies → modeling/strategies}/factory.py +3 -2
- sandwich/modeling/strategies/link2fact.py +225 -0
- sandwich/{strategies → modeling/strategies}/scd2dim.py +27 -45
- {sandwich-0.2.2.dist-info → sandwich-0.3.0.dist-info}/METADATA +17 -15
- sandwich-0.3.0.dist-info/RECORD +23 -0
- sandwich/dv2_helper.py +0 -98
- sandwich/strategies/base.py +0 -44
- sandwich/strategies/link2fact.py +0 -91
- sandwich-0.2.2.dist-info/RECORD +0 -23
- /sandwich/{strategies → modeling/strategies}/__init__.py +0 -0
- {sandwich-0.2.2.dist-info → sandwich-0.3.0.dist-info}/WHEEL +0 -0
- {sandwich-0.2.2.dist-info → sandwich-0.3.0.dist-info}/entry_points.txt +0 -0
sandwich/dialects/base.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from typing import Tuple
|
|
3
3
|
|
|
4
|
-
from sqlalchemy import Table
|
|
4
|
+
from sqlalchemy import Table, TextClause
|
|
5
5
|
|
|
6
6
|
class DialectHandler(ABC):
|
|
7
7
|
@abstractmethod
|
|
@@ -64,7 +64,16 @@ class DialectHandler(ABC):
|
|
|
64
64
|
pass
|
|
65
65
|
|
|
66
66
|
@abstractmethod
|
|
67
|
-
def
|
|
67
|
+
def make_link_proc(
|
|
68
|
+
self,
|
|
69
|
+
link_table: Table,
|
|
70
|
+
hk_keys: list,
|
|
71
|
+
header: str
|
|
72
|
+
) -> Tuple[str, str, str]:
|
|
73
|
+
pass
|
|
74
|
+
|
|
75
|
+
@abstractmethod
|
|
76
|
+
def make_scd2_sat_proc(
|
|
68
77
|
self,
|
|
69
78
|
sat_table: Table,
|
|
70
79
|
hk_name: str,
|
|
@@ -74,25 +83,14 @@ class DialectHandler(ABC):
|
|
|
74
83
|
stg_schema: str,
|
|
75
84
|
header: str
|
|
76
85
|
) -> Tuple[str, str, str]:
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
Args:
|
|
80
|
-
sat_table: SQLAlchemy Table object for satellite
|
|
81
|
-
hk_name: Hash key column name
|
|
82
|
-
hashdiff_col: Hash diff column name
|
|
83
|
-
is_available_col: Is available column name
|
|
84
|
-
loaddate_col: Load date column name
|
|
85
|
-
columns_list: Comma-separated list of columns
|
|
86
|
-
stg_schema: Staging schema name ('stg' or 'proxy')
|
|
87
|
-
header: Auto-generated header comment
|
|
86
|
+
pass
|
|
88
87
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
"""
|
|
88
|
+
@abstractmethod
|
|
89
|
+
def make_scd0_sat_proc(self, sat_table: Table, header: str) -> Tuple[str, str, str]:
|
|
92
90
|
pass
|
|
93
91
|
|
|
94
92
|
@abstractmethod
|
|
95
|
-
def
|
|
93
|
+
def make_scd2_dim_proc(
|
|
96
94
|
self,
|
|
97
95
|
dim_table: Table,
|
|
98
96
|
bk_keys: list,
|
|
@@ -115,10 +113,7 @@ class DialectHandler(ABC):
|
|
|
115
113
|
def make_job_proc(
|
|
116
114
|
self,
|
|
117
115
|
entity_name: str,
|
|
118
|
-
|
|
119
|
-
sat_proc_name: str,
|
|
120
|
-
dim_proc_name: str,
|
|
121
|
-
stg_proc_name: str | None,
|
|
116
|
+
proc_names: list[str],
|
|
122
117
|
header: str
|
|
123
118
|
) -> Tuple[str, str, str]:
|
|
124
119
|
"""Generate main job orchestration procedure.
|
|
@@ -137,30 +132,5 @@ class DialectHandler(ABC):
|
|
|
137
132
|
pass
|
|
138
133
|
|
|
139
134
|
@abstractmethod
|
|
140
|
-
def make_drop_proc(
|
|
141
|
-
|
|
142
|
-
entity_name: str,
|
|
143
|
-
stg_schema: str,
|
|
144
|
-
job_proc_name: str,
|
|
145
|
-
stg_proc_name: str | None,
|
|
146
|
-
hub_proc_name: str,
|
|
147
|
-
sat_proc_name: str,
|
|
148
|
-
dim_proc_name: str,
|
|
149
|
-
header: str
|
|
150
|
-
) -> Tuple[str, str, str]:
|
|
151
|
-
"""Generate cleanup/drop procedure for all entity objects.
|
|
152
|
-
|
|
153
|
-
Args:
|
|
154
|
-
entity_name: Entity name
|
|
155
|
-
stg_schema: Staging schema name ('stg' or 'proxy')
|
|
156
|
-
job_proc_name: Name of job orchestration procedure
|
|
157
|
-
stg_proc_name: Name of staging materialization procedure (optional)
|
|
158
|
-
hub_proc_name: Name of hub population procedure
|
|
159
|
-
sat_proc_name: Name of satellite population procedure
|
|
160
|
-
dim_proc_name: Name of dimension recalculation procedure
|
|
161
|
-
header: Auto-generated header comment
|
|
162
|
-
|
|
163
|
-
Returns:
|
|
164
|
-
Tuple of (procedure_code, procedure_name)
|
|
165
|
-
"""
|
|
166
|
-
pass
|
|
135
|
+
def make_drop_proc(self, entity_name, table_schemas: list[str], procedures: list[str], header: str) \
|
|
136
|
+
-> Tuple[str, str, str]: ...
|
sandwich/dialects/ddl_mssql.py
CHANGED
|
@@ -15,33 +15,6 @@ begin
|
|
|
15
15
|
end
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
|
-
# language=sql
|
|
19
|
-
create_proc_register_entity = """
|
|
20
|
-
create or alter proc core.register_entity (
|
|
21
|
-
@entity_name varchar(100),
|
|
22
|
-
@template varchar(50)
|
|
23
|
-
) as
|
|
24
|
-
begin
|
|
25
|
-
set nocount on;
|
|
26
|
-
|
|
27
|
-
if exists (
|
|
28
|
-
select *
|
|
29
|
-
from core.[entities]
|
|
30
|
-
where [entity_name] = @entity_name
|
|
31
|
-
)
|
|
32
|
-
begin
|
|
33
|
-
update core.[entities]
|
|
34
|
-
set [updated] = sysdatetime(), [is_deleted] = 0
|
|
35
|
-
where [entity_name] = @entity_name
|
|
36
|
-
end
|
|
37
|
-
else begin
|
|
38
|
-
insert into core.[entities]
|
|
39
|
-
([entity_name], [template])
|
|
40
|
-
values (@entity_name, @template)
|
|
41
|
-
end
|
|
42
|
-
end
|
|
43
|
-
"""
|
|
44
|
-
|
|
45
18
|
# language=sql
|
|
46
19
|
create_table_ExecutionLog = """
|
|
47
20
|
if object_id('core.ExecutionLog') is null
|
|
@@ -101,6 +74,8 @@ if schema_id('hub') is null
|
|
|
101
74
|
exec ('create schema hub')
|
|
102
75
|
if schema_id('sat') is null
|
|
103
76
|
exec ('create schema sat')
|
|
77
|
+
if schema_id('link') is null
|
|
78
|
+
exec ('create schema link')
|
|
104
79
|
if schema_id('dim') is null
|
|
105
80
|
exec ('create schema dim')
|
|
106
81
|
if schema_id('fact') is null
|
|
@@ -17,24 +17,6 @@ CREATE TABLE IF NOT EXISTS core.entities (
|
|
|
17
17
|
);
|
|
18
18
|
"""
|
|
19
19
|
|
|
20
|
-
# language=sql
|
|
21
|
-
create_proc_register_entity = """
|
|
22
|
-
CREATE OR REPLACE PROCEDURE core.register_entity(
|
|
23
|
-
p_entity_name VARCHAR(100),
|
|
24
|
-
p_gen_path VARCHAR(50)
|
|
25
|
-
)
|
|
26
|
-
LANGUAGE plpgsql
|
|
27
|
-
AS $$
|
|
28
|
-
BEGIN
|
|
29
|
-
INSERT INTO core.entities (entity_name, template)
|
|
30
|
-
VALUES (p_entity_name, p_gen_path)
|
|
31
|
-
ON CONFLICT (entity_name) DO UPDATE
|
|
32
|
-
SET updated = NOW(),
|
|
33
|
-
is_deleted = FALSE;
|
|
34
|
-
END;
|
|
35
|
-
$$;
|
|
36
|
-
"""
|
|
37
|
-
|
|
38
20
|
# language=sql
|
|
39
21
|
create_func_StringToHash= """
|
|
40
22
|
create or replace function core.string_to_hash1(str_value text)
|
sandwich/dialects/mssql.py
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
"""MSSQL dialect handler for SQL code generation."""
|
|
2
2
|
from typing import Tuple
|
|
3
3
|
|
|
4
|
-
from sqlalchemy import dialects, Table
|
|
4
|
+
from sqlalchemy import dialects, Table, text
|
|
5
|
+
|
|
6
|
+
#from sandwich import SANDWICH_VERSION
|
|
7
|
+
#from sandwich.modeling import modeling_metadata
|
|
8
|
+
#from sandwich.modeling.strategies.base import ValidationResult
|
|
5
9
|
|
|
6
10
|
from .base import DialectHandler
|
|
7
11
|
from .utils import get_columns_list
|
|
@@ -103,7 +107,47 @@ end
|
|
|
103
107
|
proc_code = self.apply_proc_template(proc_name, proc_body, header)
|
|
104
108
|
return proc_code, proc_name, f"exec {proc_name}"
|
|
105
109
|
|
|
106
|
-
def
|
|
110
|
+
def make_link_proc(self, link_table: Table, hk_keys: list, header: str) -> Tuple[str, str, str]:
    """Generate the MSSQL procedure that loads new rows into a link table.

    The generated statement inserts distinct rows from ``stg.[<link name>]``
    that have no counterpart in the link table. Rows are matched on every
    entry of ``hk_keys`` except the one named ``hk_<link name>``.

    Args:
        link_table: Target link table; its ``schema`` and ``name`` are used.
        hk_keys: Sequence whose items expose the column name at index 0.
        header: Header text handed to ``apply_proc_template``.

    Returns:
        Tuple of (procedure code, procedure name, ``exec`` call statement).
    """
    schema, name = link_table.schema, link_table.name
    proc_name = self.get_proc_name_format("elt", f"Populate_{schema}", name)

    # The link's own hash key is left out of the match predicate.
    own_hash_key = f"hk_{name}"
    match_terms = []
    for key in hk_keys:
        key_name = key[0]
        if key_name != own_hash_key:
            match_terms.append(f"link.[{key_name}] = stg.[{key_name}]")
    match_predicate = "\n\t\tand ".join(match_terms)

    target_columns = get_columns_list(link_table)
    staged_columns = get_columns_list(link_table, alias="stg")

    # NOTE(review): leading whitespace inside the SQL literal is not visible in
    # the diff rendering this was taken from - confirm against the wheel.
    # language=sql
    proc_body = f"""
insert into [{schema}].[{name}]
({target_columns})
select distinct {staged_columns}
from stg.[{name}] as stg
where not exists (
select *
from [{schema}].[{name}] as link
where {match_predicate}
);
"""
    proc_code = self.apply_proc_template(proc_name, proc_body, header)
    call_stmt = f"exec {proc_name}"
    return proc_code, proc_name, call_stmt
|
|
129
|
+
|
|
130
|
+
def make_scd0_sat_proc(self, sat_table: Table, header: str) -> Tuple[str, str, str]:
    """Generate the MSSQL procedure that loads an insert-only satellite.

    The generated statement copies rows from ``stg.[<satellite name>]`` into
    the satellite, skipping every row whose ``hk_<satellite name>`` value is
    already present there.

    Args:
        sat_table: Target satellite table; its ``schema`` and ``name`` are used.
        header: Header text handed to ``apply_proc_template``.

    Returns:
        Tuple of (procedure code, procedure name, ``exec`` call statement).
    """
    proc_name = self.get_proc_name_format("elt", f"Populate_{sat_table.schema}", sat_table.name)
    columns_list = get_columns_list(sat_table)
    hk_name = f"hk_{sat_table.name}"

    # The existence probe reads the same schema-qualified table the insert
    # writes to (previously a hard-coded `sat.` schema), so both stay in step
    # if the satellite is ever placed in another schema.
    # NOTE(review): leading whitespace inside the SQL literal is not visible in
    # the diff rendering this was taken from - confirm against the wheel.
    # language=sql
    proc_body = f"""
insert into [{sat_table.schema}].[{sat_table.name}]
({columns_list})
select {get_columns_list(sat_table, alias="stg")}
from stg.[{sat_table.name}] stg
where not exists (
select *
from [{sat_table.schema}].[{sat_table.name}] sat
where stg.[{hk_name}] = sat.[{hk_name}]
)
"""
    proc_code = self.apply_proc_template(proc_name, proc_body, header)
    return proc_code, proc_name, f"exec {proc_name}"
|
|
149
|
+
|
|
150
|
+
def make_scd2_sat_proc(
|
|
107
151
|
self,
|
|
108
152
|
sat_table: Table,
|
|
109
153
|
hk_name: str,
|
|
@@ -175,12 +219,7 @@ end
|
|
|
175
219
|
proc_code = self.apply_proc_template(proc_name, proc_body, header)
|
|
176
220
|
return proc_code, proc_name, f"exec {proc_name}"
|
|
177
221
|
|
|
178
|
-
def
|
|
179
|
-
self,
|
|
180
|
-
dim_table: Table,
|
|
181
|
-
bk_keys: list,
|
|
182
|
-
header: str
|
|
183
|
-
) -> Tuple[str, str, str]:
|
|
222
|
+
def make_scd2_dim_proc(self, dim_table: Table, bk_keys: list, header: str) -> Tuple[str, str, str]:
|
|
184
223
|
proc_name = self.get_proc_name_format("elt", f"Recalculate_{dim_table.schema}", dim_table.name)
|
|
185
224
|
columns_list = get_columns_list(dim_table)
|
|
186
225
|
pk_keys = lambda: ", ".join([f"sat.[{bk[0]}]" for bk in bk_keys])
|
|
@@ -210,62 +249,33 @@ end
|
|
|
210
249
|
proc_code = self.apply_proc_template(proc_name, proc_body, header)
|
|
211
250
|
return proc_code, proc_name, f"exec {proc_name}"
|
|
212
251
|
|
|
213
|
-
def make_job_proc(
|
|
214
|
-
self,
|
|
215
|
-
entity_name: str,
|
|
216
|
-
hub_proc_name: str,
|
|
217
|
-
sat_proc_name: str,
|
|
218
|
-
dim_proc_name: str,
|
|
219
|
-
stg_proc_name: str | None,
|
|
220
|
-
header: str
|
|
221
|
-
) -> Tuple[str, str, str]:
|
|
222
|
-
"""Generate MSSQL job orchestration procedure."""
|
|
252
|
+
def make_job_proc(self, entity_name: str, proc_names: list[str], header: str) -> Tuple[str, str, str]:
|
|
223
253
|
proc_name = f"[job].[Run_all_related_to_{entity_name}]"
|
|
254
|
+
proc_body = "\n\t"
|
|
255
|
+
for proc in proc_names:
|
|
256
|
+
if proc is None: continue
|
|
257
|
+
proc_body += f"exec {proc} @executionID;\n\t"
|
|
224
258
|
|
|
225
|
-
stg_call = f" exec {stg_proc_name} @executionID;\n" if stg_proc_name else ""
|
|
226
|
-
|
|
227
|
-
# language=sql
|
|
228
|
-
proc_body = f"""
|
|
229
|
-
{stg_call} exec {hub_proc_name} @executionID;
|
|
230
|
-
exec {sat_proc_name} @executionID;
|
|
231
|
-
exec {dim_proc_name} @executionID;
|
|
232
|
-
"""
|
|
233
259
|
proc_code = self.apply_proc_template(proc_name, proc_body, header)
|
|
234
260
|
return proc_code, proc_name, f"exec {proc_name}"
|
|
235
261
|
|
|
236
|
-
def make_drop_proc(
|
|
237
|
-
self,
|
|
238
|
-
entity_name: str,
|
|
239
|
-
stg_schema: str,
|
|
240
|
-
job_proc_name: str,
|
|
241
|
-
stg_proc_name: str | None,
|
|
242
|
-
hub_proc_name: str,
|
|
243
|
-
sat_proc_name: str,
|
|
244
|
-
dim_proc_name: str,
|
|
245
|
-
header: str
|
|
246
|
-
) -> Tuple[str, str, str]:
|
|
247
|
-
"""Generate MSSQL cleanup/drop procedure."""
|
|
262
|
+
def make_drop_proc(self, entity_name, table_schemas: list[str], procedures: list[str], header: str) -> Tuple[str, str, str]:
|
|
248
263
|
proc_name = f"[meta].[Drop_all_related_to_{entity_name}]"
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
proc_body
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
drop table if exists [sat].[{entity_name}];
|
|
260
|
-
drop procedure if exists {sat_proc_name};
|
|
261
|
-
drop table if exists [hub].[{entity_name}];
|
|
262
|
-
drop procedure if exists {hub_proc_name};
|
|
263
|
-
drop procedure if exists {job_proc_name};
|
|
264
|
-
|
|
265
|
-
update core.[entities]
|
|
264
|
+
proc_body = "\n\t"
|
|
265
|
+
for proc in procedures:
|
|
266
|
+
if proc is None: continue
|
|
267
|
+
proc_body += f"drop procedure if exists {proc};\n\t"
|
|
268
|
+
proc_body += "\n\t"
|
|
269
|
+
for schema in table_schemas:
|
|
270
|
+
proc_body += f"drop table if exists [{schema}].[{entity_name}];\n\t"
|
|
271
|
+
proc_body += "\n\t"
|
|
272
|
+
proc_body += \
|
|
273
|
+
f"""update core.[entities]
|
|
266
274
|
set [deleted] = sysdatetime()
|
|
267
275
|
, [is_deleted] = 1
|
|
268
276
|
where [entity_name] = '{entity_name}'
|
|
269
277
|
"""
|
|
278
|
+
|
|
270
279
|
proc_code = self.apply_proc_template(proc_name, proc_body, header)
|
|
271
280
|
return proc_code, proc_name, f"exec {proc_name}"
|
|
281
|
+
|
sandwich/dialects/postgres.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
"""Postgres dialect handler for SQL code generation."""
|
|
2
1
|
from typing import Tuple
|
|
3
2
|
|
|
4
|
-
from sqlalchemy import dialects, Table
|
|
3
|
+
from sqlalchemy import dialects, Table, text
|
|
5
4
|
|
|
6
5
|
from src.sandwich.dialects.base import DialectHandler
|
|
7
6
|
|
|
@@ -49,7 +48,7 @@ class PostgresDialectHandler(DialectHandler):
|
|
|
49
48
|
# TODO: Implement using INSERT...ON CONFLICT or NOT EXISTS pattern
|
|
50
49
|
raise NotImplementedError("Postgres hub procedure not yet implemented")
|
|
51
50
|
|
|
52
|
-
def
|
|
51
|
+
def make_sdc2_sat_proc(
|
|
53
52
|
self,
|
|
54
53
|
sat_table: Table,
|
|
55
54
|
hk_name: str,
|
|
@@ -65,7 +64,7 @@ class PostgresDialectHandler(DialectHandler):
|
|
|
65
64
|
# Use BOOLEAN type instead of BIT
|
|
66
65
|
raise NotImplementedError("Postgres satellite procedure not yet implemented")
|
|
67
66
|
|
|
68
|
-
def
|
|
67
|
+
def make_scd2_dim_proc(
|
|
69
68
|
self,
|
|
70
69
|
dim_table: Table,
|
|
71
70
|
bk_keys: list,
|
sandwich/dialects/utils.py
CHANGED
|
@@ -89,7 +89,6 @@ def initialize_database(conn: Engine | Connection, dialect: str = "mssql",
|
|
|
89
89
|
if drop_entities_table:
|
|
90
90
|
init_scripts["drop_entities_table"] = "drop table if exists [core].[entities];"
|
|
91
91
|
init_scripts["create_entities_table"] = ddl_mssql.create_entities_table
|
|
92
|
-
init_scripts["create_proc_register_entity"] = header + ddl_mssql.create_proc_register_entity
|
|
93
92
|
init_scripts["create_func_StringToHash1"] = header + ddl_mssql.create_func_StringToHash
|
|
94
93
|
for i in range(2, str_to_hash_count):
|
|
95
94
|
init_scripts[f"create_func_StringToHash{i}"] = header + get_string_to_hash_ddl_mssql(i)
|
|
@@ -102,7 +101,6 @@ def initialize_database(conn: Engine | Connection, dialect: str = "mssql",
|
|
|
102
101
|
if drop_entities_table:
|
|
103
102
|
init_scripts["drop_entities_table"] = "drop table if exists core.entities"
|
|
104
103
|
init_scripts["create_entities_table"] = ddl_postgres.create_entities_table
|
|
105
|
-
init_scripts["create_proc_register_entity"] = ddl_postgres.create_proc_register_entity
|
|
106
104
|
init_scripts["create_func_StringToHash1"] = ddl_postgres.create_func_StringToHash
|
|
107
105
|
for i in range(2, str_to_hash_count):
|
|
108
106
|
init_scripts[f"create_func_StringToHash{i}"] = get_string_to_hash_ddl_postgres(i)
|
|
@@ -146,4 +144,4 @@ def parse_auto_generated_header(full_proc_text: str) -> dict[str, Any]:
|
|
|
146
144
|
started = True
|
|
147
145
|
continue
|
|
148
146
|
result["rows_in_header"] = rows_in_header - 1 if rows_in_header > 0 else 0
|
|
149
|
-
return result
|
|
147
|
+
return result
|
sandwich/dwh/__init__.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import Connection, Engine, MetaData, select, Table, text, RowMapping, Sequence
|
|
4
|
+
|
|
5
|
+
from sandwich.dialects import DialectHandlerFactory
|
|
6
|
+
from sandwich.modeling import get_stg_info, infer_template, Dv2SystemInfo, Dv2Entity, StgInfo
|
|
7
|
+
from sandwich.modeling.strategies import SchemaGenerator, StrategyFactory
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _register_entity(entity_name: str, template: str, conn: Engine | Connection,
                     verbose: bool = False) -> datetime:
    """Insert the entity into ``core.entities`` or refresh its existing row.

    Looks up the ``created`` value stored for ``entity_name``. When the lookup
    yields ``None`` a new row is inserted with the current local time as
    ``created``; otherwise the existing row is refreshed via ``_update_entity``.

    Args:
        entity_name: Name to look up and register.
        template: Template name stored on a newly inserted row.
        conn: Object used both for table reflection and statement execution.
        verbose: Print a confirmation line when True.

    Returns:
        The ``created`` timestamp, either freshly generated or previously stored.
    """
    registry = Table("entities", MetaData(), schema="core", autoload_with=conn)
    lookup = select(registry.c.created).where(entity_name == registry.c.entity_name)
    created_on = conn.execute(lookup).scalar_one_or_none()

    if created_on is not None:
        # Already known: only refresh the bookkeeping columns.
        _update_entity(entity_name, conn, registry, verbose=verbose)
        return created_on

    created_on = datetime.now()
    insert_stmt = registry.insert().values(
        entity_name=entity_name, template=template, created=created_on
    )
    conn.execute(insert_stmt)
    if verbose:
        print(f"[ok] Registered `{entity_name}` for `{template}`")
    return created_on
|
|
24
|
+
|
|
25
|
+
def _update_entity(entity_name: str, conn: Engine | Connection, sys_entities: Table, verbose: bool = False) -> None:
    """Stamp the entity's registry row as updated and not deleted.

    Args:
        entity_name: Value matched against the ``entity_name`` column.
        conn: Object the UPDATE statement is executed on.
        sys_entities: Registry table the UPDATE is built from.
        verbose: Print a confirmation line when True.
    """
    refresh_stmt = (
        sys_entities.update()
        .where(entity_name == sys_entities.c.entity_name)
        .values(updated=datetime.now(), is_deleted=False)
    )
    conn.execute(refresh_stmt)
    if verbose:
        print(f"[ok] Updated `{entity_name}`")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def generate_schema(schema_generator: SchemaGenerator, registered_on: datetime, conn: Engine | Connection,
                    verbose: bool = False) -> None:
    """Create the generator's tables, then execute its procedure scripts.

    Args:
        schema_generator: Supplies ``make_tables`` and ``make_procedures``.
        registered_on: Passed through to ``make_procedures``.
        conn: Object used for table creation and script execution.
        verbose: Print one confirmation line per table and procedure when True.
    """
    tables = schema_generator.make_tables()
    for table in tables.values():
        if table is None:
            continue
        # checkfirst=True lets an already existing table pass without error.
        table.create(conn, checkfirst=True)
        if verbose:
            print(f"[ok] Created table [{table.schema}].[{table.name}]")

    procedures = schema_generator.make_procedures(tables, registered_on)
    for proc_code, proc_name, _ in procedures.values():
        conn.execute(text(proc_code))
        if verbose:
            print(f"[ok] Created or altered {proc_name}")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _generate_schema_for_entity(stg_info: StgInfo, conn: Engine | Connection, dialect: str,
                                registered_on: datetime, template: str | None, verbose: bool = False) -> None:
    """Validate one staging object and generate its schema objects.

    Args:
        stg_info: Staging description to validate.
        conn: Object used to read system info and to run the generated DDL.
        dialect: Dialect name handed to ``DialectHandlerFactory.create_handler``.
        registered_on: Registration timestamp passed on to ``generate_schema``.
        template: Template name handed to ``StrategyFactory.create_validator``.
        verbose: Forwarded to ``generate_schema``.
    """
    # The validator is created before any database access, as in the original order.
    validator = StrategyFactory.create_validator(template)
    checked = validator.validate_staging(stg_info, get_system_info(conn))

    handler = DialectHandlerFactory.create_handler(dialect)
    generator = StrategyFactory.create_generator(handler, checked)
    generate_schema(generator, registered_on, conn, verbose=verbose)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def register_and_create_entity(entity_name: str, conn: Engine | Connection, dialect: str, template: str | None = None,
                               schema: str = "stg", verbose: bool = False) -> None:
    """Register an entity and generate its tables and procedures.

    Args:
        entity_name: Name of the staging object and of the entity to register.
        conn: Object used for reflection and statement execution.
        dialect: Dialect name used to pick the SQL generator.
        template: Template name; inferred from the staging object when None.
        schema: Schema the staging object is read from.
        verbose: Print progress lines for registration and schema generation.
    """
    stg_info = get_stg_info(entity_name, schema, conn)
    if template is None:
        template = infer_template(stg_info)
    # Forward `verbose` so the registration step reports too; it was dropped
    # before, which silenced the "[ok] Registered/Updated" lines.
    registered_on = _register_entity(entity_name, template, conn, verbose=verbose)
    _generate_schema_for_entity(stg_info, conn, dialect, registered_on, template, verbose=verbose)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def update_registered_entities(conn: Engine | Connection, dialect: str, schema: str = "stg",
                               verbose: bool = False) -> None:
    """Regenerate schema objects for every entity returned by ``get_system_info``.

    For each listed entity the staging object is re-read, the registry row is
    refreshed, and the tables and procedures are generated again using the
    entity's stored template and creation timestamp.

    Args:
        conn: Object used for reflection and statement execution.
        dialect: Dialect name used to pick the SQL generator.
        schema: Schema the staging objects are read from.
        verbose: Print progress lines when True.
    """
    sys_info = get_system_info(conn)
    registry = sys_info.sys_entities
    for entity in sys_info.entities_list:
        name = entity.entity_name
        stg_info = get_stg_info(name, schema, conn)
        _update_entity(name, conn, registry, verbose=verbose)
        _generate_schema_for_entity(
            stg_info, conn, dialect, entity.created_on, entity.template, verbose=verbose
        )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_system_info(conn: Engine | Connection):
    """Load the non-deleted rows of ``core.entities`` into a ``Dv2SystemInfo``.

    Args:
        conn: Object used to reflect the table and run the SELECT.

    Returns:
        ``Dv2SystemInfo`` holding one ``Dv2Entity`` per selected row plus the
        reflected registry table.
    """
    sys_entities = Table("entities", MetaData(), schema="core", autoload_with=conn)
    active_only = sys_entities.select().where(~sys_entities.c.is_deleted)
    rows = conn.execute(active_only).mappings().all()

    entities = []
    for row in rows:
        entities.append(Dv2Entity(row["entity_name"], row["template"], row["created"]))
    return Dv2SystemInfo(entities, sys_entities)
|
sandwich/modeling/__init__.py
CHANGED
|
@@ -1,7 +1,15 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
|
-
from
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import Any, Tuple
|
|
4
|
+
|
|
5
|
+
from sqlalchemy import Table, Engine, Connection, MetaData
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
class Dv2Entity:
    """Immutable record describing one registered entity."""

    # Name the entity is registered under.
    entity_name: str
    # Name of the template associated with the entity.
    template: str
    # Timestamp recorded for the entity's creation.
    created_on: datetime
|
|
3
12
|
|
|
4
|
-
from sqlalchemy import Table
|
|
5
13
|
|
|
6
14
|
@dataclass(frozen=True)
|
|
7
15
|
class StgInfo:
|
|
@@ -11,6 +19,12 @@ class StgInfo:
|
|
|
11
19
|
bk_keys: dict[str, Any]
|
|
12
20
|
sys_columns: dict[str, Any]
|
|
13
21
|
bus_columns: dict[str, Any]
|
|
22
|
+
degenerate_field: Tuple[str, Any] | None = None
|
|
23
|
+
|
|
24
|
+
@dataclass(frozen=True)
class Dv2SystemInfo:
    """Immutable bundle of registered entities and the table they come from."""

    # Entities known to the system.
    entities_list: list[Dv2Entity]
    # Table object for the entity registry.
    sys_entities: Table
|
|
14
28
|
|
|
15
29
|
class Dv2ModelingMetadata:
|
|
16
30
|
HEADER_TEMPLATE = """/*
|
|
@@ -58,24 +72,26 @@ class Dv2ModelingMetadata:
|
|
|
58
72
|
#self.column_types = self._dialects_config[self.dialect]
|
|
59
73
|
self.required_columns: list[str] = [self.loaddate, self.recordsource]
|
|
60
74
|
|
|
61
|
-
|
|
62
75
|
modeling_metadata = Dv2ModelingMetadata()
|
|
63
76
|
|
|
64
|
-
def get_stg_info(
|
|
77
|
+
def get_stg_info(entity_name: str, schema: str, conn: Engine | Connection) -> StgInfo:
|
|
78
|
+
stg = Table(entity_name, MetaData(), schema=schema, autoload_with=conn)
|
|
79
|
+
|
|
65
80
|
hk_keys: dict[str, Any] = {}
|
|
66
81
|
bk_keys: dict[str, Any] = {}
|
|
67
82
|
sys_columns: dict[str, Any] = {}
|
|
68
83
|
bus_columns: dict[str, Any] = {}
|
|
84
|
+
degenerate_field: Tuple[str, Any] | None = None
|
|
69
85
|
|
|
70
86
|
for col in stg.columns.values():
|
|
71
|
-
if col.name.startswith("hk_"):
|
|
87
|
+
if col.name.startswith("hk_"): # hash key
|
|
72
88
|
hk_keys[col.name] = col.type
|
|
73
|
-
elif col.name.startswith("bk_"):
|
|
89
|
+
elif col.name.startswith("bk_"): # business key
|
|
74
90
|
bk_keys[col.name] = col.type
|
|
75
|
-
elif col.name.startswith("
|
|
91
|
+
elif col.name.startswith("dg_"): # degenerate field (transactional links only)
|
|
92
|
+
degenerate_field = (col.name, col.type)
|
|
93
|
+
elif col.name.startswith("sg_"): # surrogate key
|
|
76
94
|
raise Exception(f"sg column '{col.name}' is not implemented yet")
|
|
77
|
-
elif col.name.startswith("ts_"):
|
|
78
|
-
raise Exception(f"ts column '{col.name}' is not implemented yet")
|
|
79
95
|
elif col.name in modeling_metadata.names:
|
|
80
96
|
# type_name = metadata.column_types[col.name]
|
|
81
97
|
# if not str(col.type).startswith(type_name):
|
|
@@ -91,6 +107,7 @@ def get_stg_info(stg: Table) -> StgInfo:
|
|
|
91
107
|
bk_keys=bk_keys,
|
|
92
108
|
sys_columns=sys_columns,
|
|
93
109
|
bus_columns=bus_columns,
|
|
110
|
+
degenerate_field=degenerate_field,
|
|
94
111
|
)
|
|
95
112
|
|
|
96
113
|
def infer_template(stg_info: StgInfo):
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Any, Callable, Tuple
|
|
5
|
+
|
|
6
|
+
from sqlalchemy import Table
|
|
7
|
+
|
|
8
|
+
from sandwich.modeling import Dv2SystemInfo, modeling_metadata, StgInfo
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
|
|
11
|
+
class ValidationResult:
|
|
12
|
+
stg_schema: str
|
|
13
|
+
entity_name: str
|
|
14
|
+
bk_keys: list[Tuple[str, Any]]
|
|
15
|
+
hk_keys: list[Tuple[str, Any]]
|
|
16
|
+
business_column_types: dict[str, Any]
|
|
17
|
+
system_column_types: dict[str, Any]
|
|
18
|
+
template: str
|
|
19
|
+
degenerate_field: Tuple[str, Any] | None = None
|
|
20
|
+
|
|
21
|
+
class Validator(ABC):
    """Interface for objects that validate a staging description."""

    @abstractmethod
    def validate_staging(self, stg_info: StgInfo, sys_info: Dv2SystemInfo, verbose: bool = False) -> ValidationResult:
        """Validate ``stg_info`` against ``sys_info`` and return the result."""
|
|
25
|
+
|
|
26
|
+
class BaseValidator(Validator):
    """Template-agnostic staging validator.

    Runs the checks that apply to every template and then an optional extra
    check stored in ``_on_validate_staging``.
    """

    def __init__(self, template: str):
        """Remember the template name; no extra check is installed by default."""
        self._on_validate_staging: Callable[[StgInfo, Dv2SystemInfo], None] | None = None
        self.template = template

    def validate_staging(self, stg_info: StgInfo, sys_info: Dv2SystemInfo, verbose: bool = False) -> ValidationResult:
        """Validate a staging table or view for this validator's template.

        Args:
            stg_info: Staging description to validate.
            sys_info: System information passed to the extra check, if any.
            verbose: Not supported yet; must be False.

        Returns:
            ``ValidationResult`` assembled from ``stg_info`` and ``self.template``.

        Raises:
            NotImplementedError: If ``verbose`` is True.
            Exception: If a required system column is missing, or whatever the
                extra check raises.
        """
        if verbose:
            # NotImplementedError is still an Exception, so existing handlers keep working.
            raise NotImplementedError("verbose is not implemented yet")

        system_column_names = stg_info.sys_columns.keys()

        # universal check - all dv2 raw objects should be auditable
        for required_col in modeling_metadata.required_columns:
            if required_col not in system_column_names:
                raise Exception(f"{required_col} column is required")

        if self._on_validate_staging is not None:
            self._on_validate_staging(stg_info, sys_info)

        # todo: ValidationResult is not required whatsoever
        return ValidationResult(
            stg_schema=stg_info.stg_schema,
            entity_name=stg_info.stg_name,
            bk_keys=list(stg_info.bk_keys.items()),
            hk_keys=list(stg_info.hk_keys.items()),
            degenerate_field=stg_info.degenerate_field,
            business_column_types=stg_info.bus_columns,
            system_column_types=stg_info.sys_columns,
            template=self.template,
        )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class SchemaGenerator(ABC):
    """Interface for objects that produce an entity's tables and procedures."""

    @abstractmethod
    def make_tables(self) -> dict[str, Table]:
        """Return the tables to create, keyed by a table-type label."""

    @abstractmethod
    def make_procedures(
            self,
            tables: dict[str, Table],
            entity_registration_date: datetime | None = None
    ) -> dict[str, Tuple[str, str, str]]:
        """Return the procedures to create, keyed by a procedure-type label.

        Args:
            tables: Tables as returned by ``make_tables``.
            entity_registration_date: When the entity was registered. The
                default is ``None`` rather than ``datetime.now()``, because a
                call in the signature is evaluated once at import time and
                would freeze that moment for every later call.
                NOTE(review): implementations are expected to substitute the
                current time for ``None`` - confirm in the concrete generators.

        Returns:
            Mapping of label to a 3-tuple whose first two items are the
            procedure code and the procedure name.
        """
|
|
74
|
+
|
|
75
|
+
# class BaseSchemaGenerator(SchemaGenerator):
|
|
76
|
+
# def __init__(self, dialect_handler: DialectHandler, validation_result: ValidationResult):
|
|
77
|
+
# self.dialect_handler = dialect_handler
|
|
78
|
+
# self._validation_result = validation_result
|
|
79
|
+
# self._on_make_proc: Callable[[Table, datetime], Tuple[str, str, str]] | None = None
|
|
80
|
+
#
|
|
81
|
+
# def make_proc(self, tbl: Table, entity_registration_date: datetime) -> Tuple[str, str, str]:
|
|
82
|
+
# header = modeling_metadata.HEADER_TEMPLATE.format(
|
|
83
|
+
# created_on=entity_registration_date,
|
|
84
|
+
# updated_on=datetime.now(),
|
|
85
|
+
# version=SANDWICH_VERSION,
|
|
86
|
+
# entity_name=self._validation_result.entity_name
|
|
87
|
+
# )
|
|
88
|
+
#
|
|
89
|
+
# if self._validation_result.stg_schema == "proxy":
|
|
90
|
+
# stg_proc_code, stg_proc_name, stg_call_stmt = self.dialect_handler.make_stg_materialization_proc(
|
|
91
|
+
# entity_name=self._validation_result.entity_name,
|
|
92
|
+
# header=header
|
|
93
|
+
# )
|
|
94
|
+
# procedures["stg"] = (stg_proc_code, stg_proc_name, stg_call_stmt)
|
|
@@ -22,10 +22,11 @@ class StrategyFactory:
|
|
|
22
22
|
raise ValueError(f"Unknown template '{template}'. Available templates: {available}")
|
|
23
23
|
|
|
24
24
|
validator_class, _ = cls._strategies[template]
|
|
25
|
-
return validator_class()
|
|
25
|
+
return validator_class(template)
|
|
26
26
|
|
|
27
27
|
@classmethod
|
|
28
|
-
def create_generator(cls,
|
|
28
|
+
def create_generator(cls, dialect_handler: DialectHandler, validation_result: ValidationResult) -> SchemaGenerator:
|
|
29
|
+
template = validation_result.template
|
|
29
30
|
if template not in cls._strategies:
|
|
30
31
|
available = ", ".join(cls._strategies.keys())
|
|
31
32
|
raise ValueError(f"Unknown template '{template}'. Available templates: {available}")
|