sandwich 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandwich/__init__.py +3 -0
- sandwich/dialects/__init__.py +12 -0
- sandwich/dialects/base.py +136 -0
- sandwich/dialects/ddl_mssql.py +123 -0
- sandwich/dialects/ddl_postgres.py +114 -0
- sandwich/dialects/factory.py +27 -0
- sandwich/dialects/mssql.py +281 -0
- sandwich/dialects/postgres.py +107 -0
- sandwich/dialects/utils.py +147 -0
- sandwich/dwh/__init__.py +82 -0
- sandwich/errors.py +25 -0
- sandwich/main.py +0 -0
- sandwich/modeling/__init__.py +120 -0
- sandwich/modeling/strategies/__init__.py +15 -0
- sandwich/modeling/strategies/base.py +94 -0
- sandwich/modeling/strategies/factory.py +39 -0
- sandwich/modeling/strategies/link2fact.py +225 -0
- sandwich/modeling/strategies/scd2dim.py +228 -0
- {sandwich-0.2.1.dist-info → sandwich-0.3.0.dist-info}/METADATA +170 -155
- sandwich-0.3.0.dist-info/RECORD +23 -0
- sandwich-0.3.0.dist-info/WHEEL +4 -0
- sandwich-0.3.0.dist-info/entry_points.txt +3 -0
- sandwich-0.2.1.dist-info/RECORD +0 -5
- sandwich-0.2.1.dist-info/WHEEL +0 -4
- sandwich-0.2.1.dist-info/licenses/LICENSE +0 -9
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from typing import Tuple
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import dialects, Table, text
|
|
4
|
+
|
|
5
|
+
from src.sandwich.dialects.base import DialectHandler
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PostgresDialectHandler(DialectHandler):
    """Dialect handler for PostgreSQL.

    NOTE: This is a stub implementation. Only the boolean type and the
    procedure naming format are available; every procedure generator below
    still raises ``NotImplementedError`` and has to be implemented based on
    PostgreSQL syntax and conventions.
    """

    def get_boolean_type(self):
        """Return the SQLAlchemy PostgreSQL ``BOOLEAN`` type."""
        # Import the submodule explicitly: `sqlalchemy.dialects.postgresql`
        # is only reachable as an attribute of `sqlalchemy.dialects` once
        # something has imported it, so attribute access alone can fail.
        from sqlalchemy.dialects.postgresql import BOOLEAN

        return BOOLEAN

    def get_proc_name_format(self, schema: str, operation: str, entity_name: str) -> str:
        """Return ``<schema>.<operation in lower case>_<entity_name>``."""
        # Postgres uses lowercase with underscores by convention.
        return f"{schema}.{operation.lower()}_{entity_name}"

    def apply_proc_template(self, proc_name: str, sql_body: str, header: str) -> str:
        """Wrap SQL body in a Postgres procedure template with error handling.

        Raises:
            NotImplementedError: always, the template is not written yet.
        """
        # TODO: Implement Postgres procedure template
        #   - CREATE OR REPLACE PROCEDURE/FUNCTION with PL/pgSQL
        #   - error handling through EXCEPTION blocks
        #   - logging integration
        raise NotImplementedError("Postgres procedure template not yet implemented")

    def make_stg_materialization_proc(
        self,
        entity_name: str,
        header: str
    ) -> Tuple[str, str]:
        """Generate the Postgres staging materialization procedure.

        Raises:
            NotImplementedError: always, not implemented yet.
        """
        # TODO: Implement using CREATE OR REPLACE and DROP/CREATE TABLE pattern
        raise NotImplementedError("Postgres staging materialization not yet implemented")

    def make_hub_proc(
        self,
        hub_table: Table,
        bk_keys: list,
        header: str
    ) -> Tuple[str, str]:
        """Generate the Postgres hub population procedure.

        Raises:
            NotImplementedError: always, not implemented yet.
        """
        # TODO: Implement using INSERT...ON CONFLICT or NOT EXISTS pattern
        raise NotImplementedError("Postgres hub procedure not yet implemented")

    def make_sdc2_sat_proc(
        self,
        sat_table: Table,
        hk_name: str,
        hashdiff_col: str,
        is_available_col: str,
        loaddate_col: str,
        stg_schema: str,
        header: str
    ) -> Tuple[str, str]:
        """Generate the Postgres satellite population procedure.

        Raises:
            NotImplementedError: always, not implemented yet.
        """
        # TODO: Implement using CTE and window functions (similar to MSSQL but with Postgres syntax)
        #   - CURRENT_TIMESTAMP instead of SYSDATETIME()
        #   - BOOLEAN type instead of BIT
        raise NotImplementedError("Postgres satellite procedure not yet implemented")

    def make_scd2_dim_proc(
        self,
        dim_table: Table,
        bk_keys: list,
        header: str
    ) -> Tuple[str, str]:
        """Generate the Postgres dimension SCD2 recalculation procedure.

        Raises:
            NotImplementedError: always, not implemented yet.
        """
        # TODO: Implement using TRUNCATE and INSERT with window functions
        #   - LAG/LEAD for SCD2 date calculations
        #   - INTERVAL for date arithmetic instead of DATEADD
        raise NotImplementedError("Postgres dimension procedure not yet implemented")

    def make_job_proc(
        self,
        entity_name: str,
        hub_proc_name: str,
        sat_proc_name: str,
        dim_proc_name: str,
        stg_proc_name: str | None,
        header: str
    ) -> Tuple[str, str]:
        """Generate the Postgres job orchestration procedure.

        Raises:
            NotImplementedError: always, not implemented yet.
        """
        # TODO: Implement using CALL statements for other procedures
        #   - pass execution_id through procedure parameters
        raise NotImplementedError("Postgres job procedure not yet implemented")

    def make_drop_proc(
        self,
        entity_name: str,
        stg_schema: str,
        job_proc_name: str,
        stg_proc_name: str | None,
        hub_proc_name: str,
        sat_proc_name: str,
        dim_proc_name: str,
        header: str
    ) -> Tuple[str, str]:
        """Generate the Postgres cleanup/drop procedure.

        Raises:
            NotImplementedError: always, not implemented yet.
        """
        # TODO: Implement using DROP IF EXISTS for tables and procedures
        #   - update core.entities with deletion timestamp
        raise NotImplementedError("Postgres drop procedure not yet implemented")
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from sqlalchemy import Engine, Connection, Table, text, TextClause
|
|
5
|
+
|
|
6
|
+
from src.sandwich import SANDWICH_VERSION
|
|
7
|
+
from .. import errors as err
|
|
8
|
+
from src.sandwich.modeling import modeling_metadata
|
|
9
|
+
|
|
10
|
+
from . import ddl_mssql, ddl_postgres
|
|
11
|
+
|
|
12
|
+
def get_columns_list(table: Table, sep: str = ", ", alias: str | None = None) -> str:
    """Return the table's column names as a bracket-quoted, joined list.

    Args:
        table: Object whose ``columns.values()`` yields items with a ``name``.
        sep: Separator placed between the quoted column names.
        alias: Optional table alias; when truthy every column is prefixed
            with ``"<alias>."``.

    Returns:
        A string such as ``"[a], [b]"`` or ``"t.[a], t.[b]"``; empty when the
        table has no columns.
    """
    prefix = f"{alias}." if alias else ""
    return sep.join(f"{prefix}[{column.name}]" for column in table.columns.values())
|
|
15
|
+
|
|
16
|
+
def get_string_to_hash_ddl_mssql(columns_count: int) -> str:
    """Build the T-SQL DDL for ``[core].[StringToHash<columns_count>]``.

    The generated function takes ``columns_count`` ``nvarchar(1000)``
    parameters, replaces NULLs with empty strings, trims each value, joins
    them with ``;``, upper-cases the result and returns its SHA-1 digest as
    an upper-case ``char(40)`` hex string.

    Raises:
        ValueError: if ``columns_count`` is outside the 2..100 range.
    """
    if not 2 <= columns_count <= 100:
        raise ValueError("columns_count must be between 2 and 100")

    positions = range(1, columns_count + 1)
    params_list_str = ",\n\t".join(f"@StrValue{pos} nvarchar(1000)" for pos in positions)
    concat_list_str = ", ';',\n\t\t\t".join(
        f"rtrim(ltrim(isnull(@StrValue{pos}, '')))" for pos in positions
    )

    # language=sql
    return f"""
create or alter function [core].[StringToHash{columns_count}]
(
{params_list_str}
) returns char(40) as
begin
declare @result char(40);
set @result = upper(convert(char(40), hashbytes('sha1',
upper(concat(
{concat_list_str}
))
), 2));
return @result;
end"""
|
|
40
|
+
|
|
41
|
+
def get_string_to_hash_ddl_postgres(columns_count: int) -> str:
    """Build the PL/pgSQL DDL for ``core.string_to_hash<columns_count>``.

    The generated function takes ``columns_count`` ``text`` parameters,
    coalesces NULLs to empty strings, trims and upper-cases each value, joins
    them with ``;`` and returns the upper-case hex SHA-1 digest as
    ``char(40)``.

    Raises:
        ValueError: if ``columns_count`` is outside the 2..100 range.
    """
    if not 2 <= columns_count <= 100:
        raise ValueError("columns_count must be between 2 and 100")

    positions = range(1, columns_count + 1)
    params_list_str = ",\n\t".join(f"p_str_value{pos} text" for pos in positions)
    concat_list_str = ", ';',\n\t\t\t".join(
        f"upper(trim(coalesce(p_str_value{pos}, '')))" for pos in positions
    )

    # language=sql
    return f"""
create or replace function core.string_to_hash{columns_count}(
{params_list_str}
) returns char(40)
language plpgsql
as $$
declare
result char(40);
begin
result :=
upper(
encode(
digest(
concat(
{concat_list_str}
),
'sha1'
),
'hex'
)
);
return cast(result as char(40));
end;
$$;"""
|
|
75
|
+
|
|
76
|
+
def initialize_database(conn: Engine | Connection, dialect: str = "mssql",
                        str_to_hash_count: int = 66,
                        verbose: bool = False,
                        drop_entities_table: bool = False) -> None:
    """Run the bootstrap DDL scripts for the chosen dialect.

    The scripts are collected in a dict and executed in insertion order:
    (extensions for postgres,) schemas, optional drop of ``core.entities``,
    the entities table, the string-to-hash functions, the execution/error log
    tables and the logging procedure.

    Args:
        conn: An ``Engine`` or an open ``Connection``. For an ``Engine`` the
            scripts run inside ``Engine.begin()``, i.e. in one transaction
            that is committed on success. For a ``Connection`` the scripts
            are executed on it as-is and no commit is issued here.
        dialect: ``"mssql"`` or ``"postgres"``.
        str_to_hash_count: Exclusive upper bound of the generated hash
            function variants; variants ``2 .. str_to_hash_count - 1`` are
            generated in addition to the single-value function.
        verbose: Print the name of every script before executing it.
        drop_entities_table: Drop ``core.entities`` before re-creating it.

    Raises:
        Dv2NotYetImplementedForDialectError: for any other ``dialect``.
    """
    # One timestamp for both fields so the header is internally consistent.
    now = datetime.now()
    header = modeling_metadata.HEADER_TEMPLATE.format(
        created_on=now,
        updated_on=now,
        version=SANDWICH_VERSION,
        entity_name="SYSTEM")

    init_scripts: dict[str, str] = {}
    if dialect == "mssql":
        init_scripts["create_schemas"] = ddl_mssql.create_schemas
        if drop_entities_table:
            init_scripts["drop_entities_table"] = "drop table if exists [core].[entities];"
        init_scripts["create_entities_table"] = ddl_mssql.create_entities_table
        init_scripts["create_func_StringToHash1"] = header + ddl_mssql.create_func_StringToHash
        for i in range(2, str_to_hash_count):
            init_scripts[f"create_func_StringToHash{i}"] = header + get_string_to_hash_ddl_mssql(i)
        init_scripts["create_table_ExecutionLog"] = ddl_mssql.create_table_ExecutionLog
        init_scripts["create_table_ErrorLog"] = ddl_mssql.create_table_ErrorLog
        init_scripts["create_proc_LogExecution"] = header + ddl_mssql.create_proc_LogExecution
    elif dialect == "postgres":
        init_scripts["create_extensions"] = ddl_postgres.create_extensions
        init_scripts["create_schemas"] = ddl_postgres.create_schemas
        if drop_entities_table:
            init_scripts["drop_entities_table"] = "drop table if exists core.entities"
        init_scripts["create_entities_table"] = ddl_postgres.create_entities_table
        init_scripts["create_func_StringToHash1"] = ddl_postgres.create_func_StringToHash
        for i in range(2, str_to_hash_count):
            init_scripts[f"create_func_StringToHash{i}"] = get_string_to_hash_ddl_postgres(i)
        init_scripts["create_table_ExecutionLog"] = ddl_postgres.create_table_ExecutionLog
        init_scripts["create_table_ErrorLog"] = ddl_postgres.create_table_ErrorLog
        init_scripts["create_proc_LogExecution"] = ddl_postgres.create_proc_LogExecution
    else:
        raise err.Dv2NotYetImplementedForDialectError(dialect)

    def run_scripts(connection: Connection) -> None:
        for name, script in init_scripts.items():
            if verbose:
                print(f"[ok] Executing script: {name}")
            connection.execute(text(script))

    if isinstance(conn, Engine):
        # SQLAlchemy 2.x engines have no `.execute()`; borrow a connection
        # inside a transaction instead of failing with AttributeError.
        with conn.begin() as connection:
            run_scripts(connection)
    else:
        run_scripts(conn)
|
|
117
|
+
|
|
118
|
+
def get_proc_definition_dml_mssql(proc_param_name: str) -> TextClause:
    """Build the query that fetches a stored procedure's source on MSSQL.

    The statement selects ``sys.sql_modules.definition`` for the procedure
    whose ``[schema].[name]`` equals the bind parameter named
    ``proc_param_name``.

    Args:
        proc_param_name: Name of the bind parameter to embed in the query;
            it is interpolated into the SQL text as ``:<name>``.
    """
    query = f"""
SELECT sm.definition
FROM sys.sql_modules sm
JOIN sys.objects o ON sm.object_id = o.object_id
JOIN sys.schemas s ON o.schema_id = s.schema_id
WHERE o.type = 'P'
AND '['+s.name+'].['+o.name+']' = :{proc_param_name}
"""
    return text(query)
|
|
127
|
+
|
|
128
|
+
def parse_auto_generated_header(full_proc_text: str) -> dict[str, Any]:
    """Extract metadata from the first ``/* ... */`` header of a procedure.

    Scanning starts at the first line that is exactly ``/*`` (ignoring
    surrounding whitespace) and stops at the first line that is exactly
    ``*/``.

    Args:
        full_proc_text: Full source text of the procedure.

    Returns:
        A dict with:
          * ``created_on`` / ``updated_on`` - the text after the first ``:``
            on the "Created on" / "Updated on" line, stripped (only present
            when such a line with a colon was found);
          * ``rows_in_header`` - number of lines counted after the opening
            ``/*`` (the closing ``*/`` included when present) minus one, or
            0 when no header was opened.
    """
    tracked_fields = (("Created on", "created_on"), ("Updated on", "updated_on"))
    result: dict[str, Any] = {}
    rows_in_header = 0
    inside_header = False

    for ln in full_proc_text.splitlines():
        stripped = ln.strip()
        if not inside_header:
            inside_header = stripped == "/*"
            continue

        rows_in_header += 1
        if stripped == "*/":
            break
        for prefix, key in tracked_fields:
            if stripped.startswith(prefix):
                # A malformed line without ':' is ignored instead of
                # raising IndexError.
                _, colon, value = ln.partition(":")
                if colon:
                    result[key] = value.strip()
                break

    result["rows_in_header"] = rows_in_header - 1 if rows_in_header > 0 else 0
    return result
|
sandwich/dwh/__init__.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import Connection, Engine, MetaData, select, Table, text, RowMapping, Sequence
|
|
4
|
+
|
|
5
|
+
from sandwich.dialects import DialectHandlerFactory
|
|
6
|
+
from sandwich.modeling import get_stg_info, infer_template, Dv2SystemInfo, Dv2Entity, StgInfo
|
|
7
|
+
from sandwich.modeling.strategies import SchemaGenerator, StrategyFactory
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _register_entity(entity_name: str, template: str, conn: Engine | Connection,
                     verbose: bool = False) -> datetime:
    """Insert the entity into ``core.entities`` or refresh an existing row.

    Returns:
        The ``created`` value of the entity: the stored one when the entity
        already exists (the row is then updated via ``_update_entity``),
        otherwise the current time used for the newly inserted row.
    """
    entities = Table("entities", MetaData(), schema="core", autoload_with=conn)
    lookup = select(entities.c.created).where(entity_name == entities.c.entity_name)
    created_result = conn.execute(lookup).scalar_one_or_none()

    if created_result is not None:
        _update_entity(entity_name, conn, entities, verbose=verbose)
        return created_result

    created_result = datetime.now()
    new_row = entities.insert().values(
        entity_name=entity_name, template=template, created=created_result)
    conn.execute(new_row)
    if verbose:
        print(f"[ok] Registered `{entity_name}` for `{template}`")
    return created_result
|
|
24
|
+
|
|
25
|
+
def _update_entity(entity_name: str, conn: Engine | Connection, sys_entities: Table, verbose: bool = False) -> None:
    """Stamp the entity row with the current time and clear ``is_deleted``."""
    touch = (
        sys_entities.update()
        .where(entity_name == sys_entities.c.entity_name)
        .values(updated=datetime.now(), is_deleted=False)
    )
    conn.execute(touch)
    if verbose:
        print(f"[ok] Updated `{entity_name}`")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def generate_schema(schema_generator: SchemaGenerator, registered_on: datetime, conn: Engine | Connection,
                    verbose: bool = False) -> None:
    """Create the generator's tables, then execute its procedure scripts.

    Tables are created with ``checkfirst=True``; ``None`` entries in the
    table mapping are skipped. Procedures are built from the same table
    mapping and ``registered_on`` and executed one by one.
    """
    tables = schema_generator.make_tables()
    for table in tables.values():
        if table is None:
            continue
        table.create(conn, checkfirst=True)
        if verbose:
            print(f"[ok] Created table [{table.schema}].[{table.name}]")

    procedures = schema_generator.make_procedures(tables, registered_on)
    for proc_code, proc_name, _ in procedures.values():
        conn.execute(text(proc_code))
        if verbose:
            print(f"[ok] Created or altered {proc_name}")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _generate_schema_for_entity(stg_info: StgInfo, conn: Engine | Connection, dialect: str,
                                registered_on: datetime, template: str | None, verbose: bool = False) -> None:
    """Validate the staging object and generate its tables and procedures.

    Steps: create the validator for ``template``, validate ``stg_info``
    against the current system info, create the dialect handler and the
    matching schema generator, then delegate to ``generate_schema``.
    """
    validator = StrategyFactory.create_validator(template)
    validation_result = validator.validate_staging(stg_info, get_system_info(conn))
    handler = DialectHandlerFactory.create_handler(dialect)
    generator = StrategyFactory.create_generator(handler, validation_result)
    generate_schema(generator, registered_on, conn, verbose=verbose)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def register_and_create_entity(entity_name: str, conn: Engine | Connection, dialect: str, template: str | None = None,
                               schema: str = "stg", verbose: bool = False) -> None:
    """Register a staging entity and generate its schema objects.

    Args:
        entity_name: Name of the staging table or view.
        conn: Engine or connection used for reflection and execution.
        dialect: Dialect name passed to the dialect handler factory.
        template: Modeling template; inferred from the staging columns via
            ``infer_template`` when ``None``.
        schema: Schema that holds the staging object.
        verbose: Print progress messages for both the registration and the
            schema generation step.
    """
    stg_info = get_stg_info(entity_name, schema, conn)
    if template is None:
        template = infer_template(stg_info)
    # `verbose` is forwarded here as well; previously the registration step
    # stayed silent even when the caller asked for verbose output.
    registered_on = _register_entity(entity_name, template, conn, verbose=verbose)
    _generate_schema_for_entity(stg_info, conn, dialect, registered_on, template, verbose=verbose)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def update_registered_entities(conn: Engine | Connection, dialect: str, schema: str = "stg",
                               verbose: bool = False) -> None:
    """Regenerate schema objects for every entity returned by ``get_system_info``.

    For each entity the staging info is re-read, the registry row is
    refreshed and the schema is generated again with the entity's stored
    creation date and template.
    """
    sys_info = get_system_info(conn)
    registry = sys_info.sys_entities
    for entity in sys_info.entities_list:
        name = entity.entity_name
        stg_info = get_stg_info(name, schema, conn)
        _update_entity(name, conn, registry, verbose=verbose)
        _generate_schema_for_entity(
            stg_info, conn, dialect, entity.created_on, entity.template, verbose=verbose)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_system_info(conn: Engine | Connection):
    """Load the non-deleted rows of ``core.entities`` into a ``Dv2SystemInfo``.

    Returns:
        ``Dv2SystemInfo`` holding one ``Dv2Entity`` per row (entity name,
        template, created) and the reflected ``entities`` table.
    """
    sys_entities = Table("entities", MetaData(), schema="core", autoload_with=conn)
    active_query = sys_entities.select().where(~sys_entities.c.is_deleted)
    rows = conn.execute(active_query).mappings().all()
    entities = [Dv2Entity(row["entity_name"], row["template"], row["created"]) for row in rows]
    return Dv2SystemInfo(entities, sys_entities)
|
sandwich/errors.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
class _Dv2TemplatedError(Exception):
|
|
4
|
+
"""Override `_template` with a string using `{value}` placeholder and optionally `{field}` placeholder.
|
|
5
|
+
Example: `_template = "User with {field}={value} not found"`
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
_template: str
|
|
9
|
+
|
|
10
|
+
def __init__(self, value: Any, field: str | None = None):
|
|
11
|
+
template = getattr(self, "_template", None)
|
|
12
|
+
if not template:
|
|
13
|
+
raise NotImplementedError("_template is not implemented")
|
|
14
|
+
|
|
15
|
+
if field:
|
|
16
|
+
message = template.format(field=field, value=value)
|
|
17
|
+
else:
|
|
18
|
+
message = template.format(value=value)
|
|
19
|
+
|
|
20
|
+
super().__init__(message)
|
|
21
|
+
|
|
22
|
+
class Dv2NotYetImplementedForDialectError(_Dv2TemplatedError):
    """Raised when a feature is requested for a dialect that is not supported yet."""

    _template = "Not yet implemented for '{value}' dialect"

    def __init__(self, value: Any):
        # Narrow the signature: this error has no `field` placeholder.
        super().__init__(value)
|
sandwich/main.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import Any, Tuple
|
|
4
|
+
|
|
5
|
+
from sqlalchemy import Table, Engine, Connection, MetaData
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
class Dv2Entity:
    """Immutable record describing one registered entity."""

    entity_name: str      # name of the entity
    template: str         # modeling template name
    created_on: datetime  # creation timestamp of the entity
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
|
|
15
|
+
class StgInfo:
|
|
16
|
+
stg_name: str
|
|
17
|
+
stg_schema: str
|
|
18
|
+
hk_keys: dict[str, Any]
|
|
19
|
+
bk_keys: dict[str, Any]
|
|
20
|
+
sys_columns: dict[str, Any]
|
|
21
|
+
bus_columns: dict[str, Any]
|
|
22
|
+
degenerate_field: Tuple[str, Any] | None = None
|
|
23
|
+
|
|
24
|
+
@dataclass(frozen=True)
class Dv2SystemInfo:
    """Immutable pair of the known entities and the table they were read from."""

    entities_list: list[Dv2Entity]  # entity records
    sys_entities: Table             # the entities table object
|
|
28
|
+
|
|
29
|
+
class Dv2ModelingMetadata:
    """Shared naming metadata: the generated-code header and system column names."""

    # Comment header for generated code; formatted with `created_on`,
    # `updated_on` (datetimes), `version` and `entity_name`.
    HEADER_TEMPLATE = """/*
=====================================================================
AUTO-GENERATED CODE — DO NOT EDIT MANUALLY
=====================================================================

This stored procedure was automatically generated by:
sandwich (https://pypi.org/project/sandwich/)

Manual changes are discouraged.
This file may be regenerated at any time, and all manual edits
will be overwritten.

Created on : {created_on:%Y-%m-%d %H:%M:%S}
Updated on : {updated_on:%Y-%m-%d %H:%M:%S}
Generator : sandwich v{version}
Entity name : {entity_name}

=====================================================================
*/
"""

    # Names of the system columns.
    loaddate = "LoadDate"
    recordsource = "RecordSource"
    hashdiff = "HashDiff"
    is_available = "IsAvailable"
    names: list[str] = [loaddate, recordsource, hashdiff, is_available]

    # _dialects_config: dict[str, dict[str, str]] = {
    #     "mssql": {
    #         loaddate: "DATETIME2",
    #         recordsource: "VARCHAR(200)",
    #         hashdiff: "CHAR(40)",
    #         is_available: "BIT",
    #     },
    #     "postgres": {
    #         loaddate: "TIMESTAMP",
    #         recordsource: "TEXT",
    #         hashdiff: "CHAR(40)",
    #         is_available: "BOOLEAN",
    #     },
    # }

    def __init__(self):
        # self.dialect = dialect
        # self.column_types = self._dialects_config[self.dialect]

        # System columns that must be present.
        self.required_columns: list[str] = [self.loaddate, self.recordsource]


# Module-level shared instance.
modeling_metadata = Dv2ModelingMetadata()
|
|
76
|
+
|
|
77
|
+
def get_stg_info(entity_name: str, schema: str, conn: Engine | Connection) -> StgInfo:
    """Reflect a staging table/view and group its columns by naming convention.

    Column roles, checked in this order:
      * ``hk_`` prefix - hash key
      * ``bk_`` prefix - business key
      * ``dg_`` prefix - degenerate field (transactional links only); when
        several such columns exist the last one is kept
      * ``sg_`` prefix - surrogate key, not supported yet
      * name listed in ``modeling_metadata.names`` - system column
      * anything else - business column

    Raises:
        Exception: if a column with the ``sg_`` prefix is found.
    """
    stg = Table(entity_name, MetaData(), schema=schema, autoload_with=conn)

    hk_keys: dict[str, Any] = {}
    bk_keys: dict[str, Any] = {}
    sys_columns: dict[str, Any] = {}
    bus_columns: dict[str, Any] = {}
    degenerate_field: Tuple[str, Any] | None = None

    for col in stg.columns.values():
        name = col.name
        if name.startswith("hk_"):  # hash key
            hk_keys[name] = col.type
        elif name.startswith("bk_"):  # business key
            bk_keys[name] = col.type
        elif name.startswith("dg_"):  # degenerate field (transactional links only)
            degenerate_field = (name, col.type)
        elif name.startswith("sg_"):  # surrogate key
            raise Exception(f"sg column '{name}' is not implemented yet")
        elif name in modeling_metadata.names:
            # type_name = metadata.column_types[col.name]
            # if not str(col.type).startswith(type_name):
            #     raise Exception(f"{col.name} column must be `{type_name}` type, but it is `{str(col.type)}`")
            sys_columns[name] = col.type
        else:
            bus_columns[name] = col.type

    return StgInfo(
        stg_name=stg.name,
        stg_schema=stg.schema,
        hk_keys=hk_keys,
        bk_keys=bk_keys,
        sys_columns=sys_columns,
        bus_columns=bus_columns,
        degenerate_field=degenerate_field,
    )
|
|
112
|
+
|
|
113
|
+
def infer_template(stg_info: StgInfo):
    """Pick the template name from the number of hash key columns.

    Returns:
        ``"scd2dim"`` for exactly one hash key, ``"link2fact"`` for more.

    Raises:
        Exception: if the staging object has no hash key column.
    """
    hash_key_total = len(stg_info.hk_keys)
    if hash_key_total == 0:
        raise Exception("hk column is required for `scd2dim` validation")
    return "link2fact" if hash_key_total > 1 else "scd2dim"
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from .base import SchemaGenerator, Validator
|
|
2
|
+
from .factory import StrategyFactory
|
|
3
|
+
from .link2fact import Link2FactValidator, Link2FactSchemaGenerator
|
|
4
|
+
from .scd2dim import Scd2DimValidator, Scd2DimSchemaGenerator
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"Validator",
|
|
8
|
+
"SchemaGenerator",
|
|
9
|
+
"StrategyFactory",
|
|
10
|
+
"Scd2DimValidator",
|
|
11
|
+
"Scd2DimSchemaGenerator",
|
|
12
|
+
"Link2FactValidator",
|
|
13
|
+
"Link2FactSchemaGenerator",
|
|
14
|
+
]
|
|
15
|
+
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Any, Callable, Tuple
|
|
5
|
+
|
|
6
|
+
from sqlalchemy import Table
|
|
7
|
+
|
|
8
|
+
from sandwich.modeling import Dv2SystemInfo, modeling_metadata, StgInfo
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
|
|
11
|
+
class ValidationResult:
|
|
12
|
+
stg_schema: str
|
|
13
|
+
entity_name: str
|
|
14
|
+
bk_keys: list[Tuple[str, Any]]
|
|
15
|
+
hk_keys: list[Tuple[str, Any]]
|
|
16
|
+
business_column_types: dict[str, Any]
|
|
17
|
+
system_column_types: dict[str, Any]
|
|
18
|
+
template: str
|
|
19
|
+
degenerate_field: Tuple[str, Any] | None = None
|
|
20
|
+
|
|
21
|
+
class Validator(ABC):
    """Interface of a staging validator."""

    @abstractmethod
    def validate_staging(self, stg_info: StgInfo, sys_info: Dv2SystemInfo, verbose: bool = False) -> ValidationResult:
        """Validate the staging description and return a ``ValidationResult``."""
|
|
25
|
+
|
|
26
|
+
class BaseValidator(Validator):
    """Validator with the checks shared by all templates.

    The ``_on_validate_staging`` attribute starts as ``None``; when it is set
    to a callable, that callable runs after the shared checks.
    """

    def __init__(self, template: str):
        self._on_validate_staging: Callable[[StgInfo, Dv2SystemInfo], None] | None = None
        self.template = template

    def validate_staging(self, stg_info: StgInfo, sys_info: Dv2SystemInfo, verbose: bool = False) -> ValidationResult:
        """Validate a staging table or view for this validator's template.

        Raises:
            Exception: if ``verbose`` is requested (not implemented yet), if
                a required system column is missing, or whatever the
                ``_on_validate_staging`` hook raises.
        """
        if verbose:
            raise Exception("verbose is not implemented yet")

        present_columns = stg_info.sys_columns.keys()

        # universal check - all dv2 raw objects should be auditable
        first_missing = next(
            (col for col in modeling_metadata.required_columns if col not in present_columns),
            None,
        )
        if first_missing is not None:
            raise Exception(f"{first_missing} column is required")

        hook = self._on_validate_staging
        if hook is not None:
            hook(stg_info, sys_info)

        # todo: ValidationResult is not required whatsoever
        return ValidationResult(
            stg_schema=stg_info.stg_schema,
            entity_name=stg_info.stg_name,
            bk_keys=list(stg_info.bk_keys.items()),
            hk_keys=list(stg_info.hk_keys.items()),
            degenerate_field=stg_info.degenerate_field,
            business_column_types=stg_info.bus_columns,
            system_column_types=stg_info.sys_columns,
            template=self.template,
        )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class SchemaGenerator(ABC):
    """Interface of a schema generator: builds tables and procedure scripts."""

    @abstractmethod
    def make_tables(self) -> dict[str, Table]:
        """Return the tables to create, keyed by table type."""

    @abstractmethod
    def make_procedures(
        self,
        tables: dict[str, Table],
        entity_registration_date: datetime | None = None
    ) -> dict[str, Tuple[str, str, str]]:
        """Return the procedure scripts, keyed by procedure type.

        Args:
            tables: The mapping returned by ``make_tables``.
            entity_registration_date: Registration timestamp of the entity.
                The default is ``None`` rather than ``datetime.now()``
                because a call in a default is evaluated once, at import
                time; implementations are expected to substitute the current
                time for ``None`` at call time.

        Returns:
            A mapping to 3-tuples of strings; the caller in this package
            unpacks each as (procedure code, procedure name, third value)
            and uses only the first two.
        """
|
|
74
|
+
|
|
75
|
+
# class BaseSchemaGenerator(SchemaGenerator):
|
|
76
|
+
# def __init__(self, dialect_handler: DialectHandler, validation_result: ValidationResult):
|
|
77
|
+
# self.dialect_handler = dialect_handler
|
|
78
|
+
# self._validation_result = validation_result
|
|
79
|
+
# self._on_make_proc: Callable[[Table, datetime], Tuple[str, str, str]] | None = None
|
|
80
|
+
#
|
|
81
|
+
# def make_proc(self, tbl: Table, entity_registration_date: datetime) -> Tuple[str, str, str]:
|
|
82
|
+
# header = modeling_metadata.HEADER_TEMPLATE.format(
|
|
83
|
+
# created_on=entity_registration_date,
|
|
84
|
+
# updated_on=datetime.now(),
|
|
85
|
+
# version=SANDWICH_VERSION,
|
|
86
|
+
# entity_name=self._validation_result.entity_name
|
|
87
|
+
# )
|
|
88
|
+
#
|
|
89
|
+
# if self._validation_result.stg_schema == "proxy":
|
|
90
|
+
# stg_proc_code, stg_proc_name, stg_call_stmt = self.dialect_handler.make_stg_materialization_proc(
|
|
91
|
+
# entity_name=self._validation_result.entity_name,
|
|
92
|
+
# header=header
|
|
93
|
+
# )
|
|
94
|
+
# procedures["stg"] = (stg_proc_code, stg_proc_name, stg_call_stmt)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from src.sandwich.dialects import DialectHandler
|
|
2
|
+
|
|
3
|
+
from .base import Validator, SchemaGenerator, ValidationResult
|
|
4
|
+
from .link2fact import Link2FactValidator, Link2FactSchemaGenerator
|
|
5
|
+
from .scd2dim import Scd2DimValidator, Scd2DimSchemaGenerator
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class StrategyFactory:
    """Registry mapping a template name to its (validator, generator) classes."""

    _strategies = {
        "scd2dim": (Scd2DimValidator, Scd2DimSchemaGenerator),
        "link2fact": (Link2FactValidator, Link2FactSchemaGenerator),
    }

    @classmethod
    def register_strategy(cls, template_name: str, validator_class, generator_class):
        """Add or replace the strategy pair for ``template_name``."""
        cls._strategies[template_name] = (validator_class, generator_class)

    @classmethod
    def _strategy_for(cls, template):
        """Return the registered pair for ``template`` or raise ``ValueError``."""
        if template not in cls._strategies:
            available = ", ".join(cls._strategies.keys())
            raise ValueError(f"Unknown template '{template}'. Available templates: {available}")
        return cls._strategies[template]

    @classmethod
    def create_validator(cls, template: str) -> Validator:
        """Instantiate the validator registered for ``template``.

        Raises:
            ValueError: if the template is not registered.
        """
        validator_class, _ = cls._strategy_for(template)
        return validator_class(template)

    @classmethod
    def create_generator(cls, dialect_handler: DialectHandler, validation_result: ValidationResult) -> SchemaGenerator:
        """Instantiate the generator registered for ``validation_result.template``.

        Raises:
            ValueError: if the template is not registered.
        """
        _, generator_class = cls._strategy_for(validation_result.template)
        return generator_class(dialect_handler, validation_result)

    @classmethod
    def get_available_templates(cls) -> list[str]:
        """Return the registered template names."""
        return list(cls._strategies.keys())
|