kelpmesh-core 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kelpmesh/__init__.py +6 -0
- kelpmesh/__main__.py +3 -0
- kelpmesh/adapters/__init__.py +51 -0
- kelpmesh/adapters/athena.py +347 -0
- kelpmesh/adapters/base.py +139 -0
- kelpmesh/adapters/bigquery.py +211 -0
- kelpmesh/adapters/clickhouse.py +277 -0
- kelpmesh/adapters/databricks.py +222 -0
- kelpmesh/adapters/duckdb.py +328 -0
- kelpmesh/adapters/fabric.py +236 -0
- kelpmesh/adapters/hive.py +366 -0
- kelpmesh/adapters/mysql.py +221 -0
- kelpmesh/adapters/postgres.py +214 -0
- kelpmesh/adapters/redshift.py +221 -0
- kelpmesh/adapters/snowflake.py +229 -0
- kelpmesh/adapters/spark.py +375 -0
- kelpmesh/adapters/sqlserver.py +285 -0
- kelpmesh/adapters/trino.py +251 -0
- kelpmesh/cli/__init__.py +0 -0
- kelpmesh/cli/build.py +132 -0
- kelpmesh/cli/ci.py +589 -0
- kelpmesh/cli/clean.py +30 -0
- kelpmesh/cli/compare.py +62 -0
- kelpmesh/cli/compile.py +135 -0
- kelpmesh/cli/create_test.py +190 -0
- kelpmesh/cli/debug.py +237 -0
- kelpmesh/cli/deps.py +93 -0
- kelpmesh/cli/diff.py +55 -0
- kelpmesh/cli/docs.py +71 -0
- kelpmesh/cli/export.py +78 -0
- kelpmesh/cli/exposures.py +39 -0
- kelpmesh/cli/format.py +206 -0
- kelpmesh/cli/freshness.py +156 -0
- kelpmesh/cli/generate.py +151 -0
- kelpmesh/cli/history.py +79 -0
- kelpmesh/cli/import_dbt.py +621 -0
- kelpmesh/cli/init.py +104 -0
- kelpmesh/cli/lint.py +619 -0
- kelpmesh/cli/ls.py +45 -0
- kelpmesh/cli/main.py +172 -0
- kelpmesh/cli/mesh.py +269 -0
- kelpmesh/cli/metrics.py +89 -0
- kelpmesh/cli/orchestrate.py +51 -0
- kelpmesh/cli/package_cli.py +66 -0
- kelpmesh/cli/plan.py +202 -0
- kelpmesh/cli/pre_commit.py +54 -0
- kelpmesh/cli/preview.py +50 -0
- kelpmesh/cli/rollback.py +105 -0
- kelpmesh/cli/run.py +171 -0
- kelpmesh/cli/scan.py +147 -0
- kelpmesh/cli/schedule.py +176 -0
- kelpmesh/cli/schema.py +68 -0
- kelpmesh/cli/security.py +318 -0
- kelpmesh/cli/seed.py +244 -0
- kelpmesh/cli/serve.py +34 -0
- kelpmesh/cli/sources.py +136 -0
- kelpmesh/cli/studio.py +72 -0
- kelpmesh/cli/test.py +220 -0
- kelpmesh/core/__init__.py +0 -0
- kelpmesh/core/audits.py +151 -0
- kelpmesh/core/ci.py +79 -0
- kelpmesh/core/config.py +102 -0
- kelpmesh/core/contracts.py +130 -0
- kelpmesh/core/crypto.py +72 -0
- kelpmesh/core/errors.py +69 -0
- kelpmesh/core/executor.py +641 -0
- kelpmesh/core/graph.py +95 -0
- kelpmesh/core/macros.py +639 -0
- kelpmesh/core/model.py +55 -0
- kelpmesh/core/packages.py +184 -0
- kelpmesh/core/project.py +187 -0
- kelpmesh/core/python_runner.py +190 -0
- kelpmesh/core/scheduler.py +296 -0
- kelpmesh/core/schema_yaml.py +96 -0
- kelpmesh/core/substitutions.py +150 -0
- kelpmesh/core/versioning.py +106 -0
- kelpmesh/diff/__init__.py +0 -0
- kelpmesh/diff/comparison.py +62 -0
- kelpmesh/diff/engine.py +55 -0
- kelpmesh/docs/__init__.py +0 -0
- kelpmesh/docs/generator.py +394 -0
- kelpmesh/integrations/__init__.py +1 -0
- kelpmesh/integrations/bitbucket.py +97 -0
- kelpmesh/integrations/github.py +128 -0
- kelpmesh/integrations/gitlab.py +93 -0
- kelpmesh/mesh/__init__.py +21 -0
- kelpmesh/mesh/access.py +110 -0
- kelpmesh/mesh/config.py +91 -0
- kelpmesh/mesh/contracts.py +200 -0
- kelpmesh/mesh/health.py +178 -0
- kelpmesh/mesh/resolver.py +143 -0
- kelpmesh/observability/__init__.py +0 -0
- kelpmesh/observability/alerts.py +106 -0
- kelpmesh/observability/anomaly.py +67 -0
- kelpmesh/observability/history.py +120 -0
- kelpmesh/orchestrate/engine.py +89 -0
- kelpmesh/parser/__init__.py +0 -0
- kelpmesh/parser/lineage.py +58 -0
- kelpmesh/parser/python.py +28 -0
- kelpmesh/parser/sql.py +96 -0
- kelpmesh/schema/__init__.py +0 -0
- kelpmesh/schema/drift.py +98 -0
- kelpmesh/security/__init__.py +1 -0
- kelpmesh/security/audit.py +94 -0
- kelpmesh/security/classifier.py +111 -0
- kelpmesh/security/erasure.py +135 -0
- kelpmesh/security/masking.py +81 -0
- kelpmesh/security/rls.py +93 -0
- kelpmesh/semantic/__init__.py +263 -0
- kelpmesh/semantic/exporters/__init__.py +26 -0
- kelpmesh/semantic/exporters/base.py +62 -0
- kelpmesh/semantic/exporters/looker.py +118 -0
- kelpmesh/semantic/exporters/manifest.py +73 -0
- kelpmesh/semantic/exporters/powerbi.py +148 -0
- kelpmesh/semantic/exporters/qlik.py +139 -0
- kelpmesh/semantic/exporters/tableau.py +109 -0
- kelpmesh/semantic/serve.py +128 -0
- kelpmesh/state/__init__.py +0 -0
- kelpmesh/state/engine.py +392 -0
- kelpmesh/studio/__init__.py +1 -0
- kelpmesh/studio/app.py +378 -0
- kelpmesh/testing/__init__.py +0 -0
- kelpmesh/testing/fixtures.py +247 -0
- kelpmesh/testing/runner.py +82 -0
- kelpmesh/testing/schema_tests.py +96 -0
- kelpmesh_core-1.0.0.dist-info/METADATA +402 -0
- kelpmesh_core-1.0.0.dist-info/RECORD +130 -0
- kelpmesh_core-1.0.0.dist-info/WHEEL +4 -0
- kelpmesh_core-1.0.0.dist-info/entry_points.txt +2 -0
- kelpmesh_core-1.0.0.dist-info/licenses/LICENSE +201 -0
kelpmesh/__init__.py
ADDED
kelpmesh/__main__.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from kelpmesh.adapters.base import WarehouseAdapter
|
|
2
|
+
from kelpmesh.core.config import WarehouseConfig
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def get_adapter(config: WarehouseConfig, project_path: str | None = None) -> WarehouseAdapter:
    """Instantiate the warehouse adapter selected by ``config.type``.

    Each backend module is imported only inside the branch that needs it, so
    a driver is loaded only for the warehouse actually in use.

    Args:
        config: Warehouse configuration; its ``type`` attribute picks the
            backend.
        project_path: Forwarded to the DuckDB adapter only; every other
            adapter is built from ``config`` alone.

    Returns:
        A new adapter instance. ``"duckdb"`` and any type not listed below
        both produce a ``DuckDBAdapter``.
    """
    kind = config.type

    if kind == "snowflake":
        from kelpmesh.adapters.snowflake import SnowflakeAdapter

        return SnowflakeAdapter(config)

    if kind == "bigquery":
        from kelpmesh.adapters.bigquery import BigQueryAdapter

        return BigQueryAdapter(config)

    if kind == "postgres":
        from kelpmesh.adapters.postgres import PostgresAdapter

        return PostgresAdapter(config)

    if kind == "redshift":
        from kelpmesh.adapters.redshift import RedshiftAdapter

        return RedshiftAdapter(config)

    if kind == "databricks":
        from kelpmesh.adapters.databricks import DatabricksAdapter

        return DatabricksAdapter(config)

    if kind == "fabric":
        from kelpmesh.adapters.fabric import FabricAdapter

        return FabricAdapter(config)

    if kind in ("mysql", "mariadb"):
        from kelpmesh.adapters.mysql import MySQLAdapter

        return MySQLAdapter(config)

    if kind in ("trino", "presto"):
        from kelpmesh.adapters.trino import TrinoAdapter

        return TrinoAdapter(config)

    if kind == "clickhouse":
        from kelpmesh.adapters.clickhouse import ClickHouseAdapter

        return ClickHouseAdapter(config)

    if kind == "spark":
        from kelpmesh.adapters.spark import SparkAdapter

        return SparkAdapter(config)

    if kind == "athena":
        from kelpmesh.adapters.athena import AthenaAdapter

        return AthenaAdapter(config)

    if kind == "hive":
        from kelpmesh.adapters.hive import HiveAdapter

        return HiveAdapter(config)

    if kind in ("sqlserver", "mssql", "synapse", "azuresynapse"):
        from kelpmesh.adapters.sqlserver import SQLServerAdapter

        return SQLServerAdapter(config)

    # "duckdb" and every unrecognised type land here.
    from kelpmesh.adapters.duckdb import DuckDBAdapter

    return DuckDBAdapter(config, project_path=project_path)
|
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
"""Amazon Athena adapter for KelpMesh.
|
|
2
|
+
|
|
3
|
+
Install the driver:
|
|
4
|
+
pip install kelpmesh[athena]
|
|
5
|
+
|
|
6
|
+
kelpmesh.yml:
|
|
7
|
+
warehouse:
|
|
8
|
+
type: athena
|
|
9
|
+
host: us-east-1 # AWS region
|
|
10
|
+
database: my_glue_database
|
|
11
|
+
path: "s3://my-bucket/athena-results/" # S3 staging dir for query results
|
|
12
|
+
user: "{{ env_var('AWS_ACCESS_KEY_ID') }}"
|
|
13
|
+
password: "{{ env_var('AWS_SECRET_ACCESS_KEY') }}"
|
|
14
|
+
|
|
15
|
+
Notes:
|
|
16
|
+
- `host` maps to the AWS region (e.g. us-east-1).
|
|
17
|
+
- `path` is the S3 staging directory used by Athena for result output.
|
|
18
|
+
- `user` / `password` are the AWS access key id / secret access key.
|
|
19
|
+
Leave both unset to use the default credential chain (IAM role, env
|
|
20
|
+
vars, ~/.aws/credentials, etc.).
|
|
21
|
+
- Athena does not support INSERT INTO on CTAS tables; incremental runs
|
|
22
|
+
use a CTAS-then-rename workaround.
|
|
23
|
+
- External table creation requires `s3_location` to be passed as a
|
|
24
|
+
keyword argument to execute_model via the `extra` dict (not yet
|
|
25
|
+
exposed in the standard interface — extend as needed).
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
from kelpmesh.adapters.base import WarehouseAdapter, sanitize_name
|
|
31
|
+
from kelpmesh.core.config import WarehouseConfig
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class AthenaAdapter(WarehouseAdapter):
    """Warehouse adapter that runs SQL on Amazon Athena through PyAthena.

    Config mapping (see ``connect``): ``host`` is used as the AWS region,
    ``path`` as the S3 staging directory, ``user`` / ``password`` as the
    access key pair and ``database`` as the default schema.

    Identifier quoting depends on the statement family. Athena's
    documentation prescribes backticks for DDL (DROP TABLE / ALTER TABLE
    here) and double quotes for SELECT, CTAS and view statements, so the
    adapter renders table names with ``_ddl_name`` or ``_dml_name``
    accordingly.
    """

    # Bookkeeping columns appended to every snapshot table, in table order.
    _SNAPSHOT_AUDIT_COLS = (
        "_scd_id",
        "_valid_from",
        "_valid_to",
        "_is_current",
        "_dbt_updated_at",
    )

    def __init__(self, config: WarehouseConfig) -> None:
        """Store the config; the connection is opened lazily on first use."""
        self.config = config
        self.conn = None

    # ------------------------------------------------------------------
    # Naming / quoting helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _literal(value: str) -> str:
        """Render *value* as a single-quoted SQL string literal.

        Embedded single quotes are doubled so the value cannot terminate the
        literal early.
        """
        return "'" + str(value).replace("'", "''") + "'"

    def _ddl_name(self, table_name: str) -> str:
        """Return the backtick-quoted name used in DROP TABLE / ALTER TABLE."""
        db = self.config.database or ""
        return f"`{db}`.`{table_name}`" if db else f"`{table_name}`"

    def _dml_name(self, table_name: str) -> str:
        """Return the double-quoted name used in CTAS, view and SELECT statements."""
        db = self.config.database or ""
        return f'"{db}"."{table_name}"' if db else f'"{table_name}"'

    # ------------------------------------------------------------------
    # Connection helpers
    # ------------------------------------------------------------------

    def connect(self) -> None:
        """Open a PyAthena connection and store it on ``self.conn``.

        Credentials and schema are passed only when configured, so leaving
        them unset defers to whatever defaults PyAthena applies.

        Raises:
            ImportError: If PyAthena is not installed.
        """
        try:
            import pyathena
        except ImportError as err:
            raise ImportError(
                "PyAthena not installed. Run: pip install kelpmesh[athena]"
            ) from err

        kwargs: dict = {
            "s3_staging_dir": self.config.path or "",
            "region_name": self.config.host or "us-east-1",
        }
        if self.config.user:
            kwargs["aws_access_key_id"] = self.config.user
        if self.config.password:
            kwargs["aws_secret_access_key"] = self.config.password
        if self.config.database:
            kwargs["schema_name"] = self.config.database

        self.conn = pyathena.connect(**kwargs)

    def disconnect(self) -> None:
        """Close the stored connection, if any, and forget it."""
        if self.conn:
            self.conn.close()
            self.conn = None

    def _ensure_conn(self, conn=None):
        """Return *conn* if given, else the stored connection, connecting on demand."""
        c = conn or self.conn
        if not c:
            self.connect()
            return self.conn
        return c

    def _drop_quietly(self, statement: str, conn) -> None:
        """Run a cleanup DROP statement, deliberately ignoring any failure.

        Used only for leftovers of earlier runs and for error-path cleanup,
        where a failed drop must not mask the real outcome.
        """
        try:
            self.execute(statement, conn=conn)
        except Exception:
            pass

    # ------------------------------------------------------------------
    # execute
    # ------------------------------------------------------------------

    def execute(self, sql: str, conn=None) -> list[dict]:
        """Run *sql* and return its rows as dicts keyed by column name.

        Statements that produce no result set (``cursor.description`` is
        empty) return ``[]``. The cursor is always closed.
        """
        c = self._ensure_conn(conn)
        cursor = c.cursor()
        try:
            cursor.execute(sql)
            if cursor.description:
                cols = [d[0] for d in cursor.description]
                return [dict(zip(cols, row)) for row in cursor.fetchall()]
            return []
        finally:
            cursor.close()

    # ------------------------------------------------------------------
    # execute_model
    # ------------------------------------------------------------------

    def execute_model(
        self,
        sql: str,
        table_name: str,
        materialized: str = "view",
        conn=None,
        unique_key: str | None = None,
        incremental_strategy: str = "append",
    ) -> None:
        """Materialize a model as a table, view, incremental table or nothing.

        Args:
            sql: SELECT statement that defines the model.
            table_name: Unqualified target name; ``config.database`` is
                prepended when set.
            materialized: ``"incremental"``, ``"table"``, ``"ephemeral"``
                (nothing is created) or anything else for a view.
            conn: Optional connection to use instead of the adapter's own.
            unique_key: Accepted for interface compatibility; unused here.
            incremental_strategy: Accepted for interface compatibility;
                unused here.
        """
        target_ddl = self._ddl_name(table_name)
        target = self._dml_name(table_name)
        c = self._ensure_conn(conn)

        if materialized == "incremental":
            if self.table_exists(table_name, conn=c):
                # Build the replacement next to the target, then swap it in.
                # NOTE(review): the replacement holds only the rows produced
                # by `sql`; existing rows survive only if `sql` selects them.
                # Confirm this is the intended incremental semantics.
                tmp = f"_km_inc_{table_name}"
                self._drop_quietly(
                    f"DROP TABLE IF EXISTS {self._ddl_name(tmp)}", c
                )
                self.execute(
                    f"CREATE TABLE {self._dml_name(tmp)} AS {sql}", conn=c
                )
                self.execute(f"DROP TABLE IF EXISTS {target_ddl}", conn=c)
                # NOTE(review): confirm ALTER TABLE ... RENAME TO is supported
                # for the table type CTAS produces in the target workgroup.
                self.execute(
                    f"ALTER TABLE {self._ddl_name(tmp)} RENAME TO `{table_name}`",
                    conn=c,
                )
            else:
                self.execute(f"CREATE TABLE {target} AS {sql}", conn=c)
            return

        self.drop_table(table_name, materialized, conn=c)
        if materialized == "table":
            self.execute(f"CREATE TABLE {target} AS {sql}", conn=c)
        elif materialized == "ephemeral":
            pass
        else:
            self.execute(f"CREATE OR REPLACE VIEW {target} AS {sql}", conn=c)

    # ------------------------------------------------------------------
    # table_exists
    # ------------------------------------------------------------------

    def table_exists(self, table_name: str, conn=None) -> bool:
        """Return True when ``information_schema.tables`` lists *table_name*.

        The lookup is restricted to ``config.database`` when one is set. Any
        error while querying is treated as "does not exist".
        """
        db = self.config.database or ""
        c = self._ensure_conn(conn)
        where = f"table_name = {self._literal(table_name)}"
        if db:
            where = f"table_schema = {self._literal(db)} AND {where}"
        try:
            rows = self.execute(
                f"SELECT COUNT(*) AS cnt FROM information_schema.tables WHERE {where}",
                conn=c,
            )
            return (rows[0].get("cnt") or 0) > 0 if rows else False
        except Exception:
            # Best effort by design: callers only need a yes/no answer.
            return False

    # ------------------------------------------------------------------
    # table_schema
    # ------------------------------------------------------------------

    def table_schema(self, table_name: str, conn=None) -> list[dict]:
        """Return the columns of *table_name* in ordinal order.

        Each row carries ``column_name``, ``data_type`` and ``is_nullable``
        as read from ``information_schema.columns``.
        """
        db = self.config.database or ""
        c = self._ensure_conn(conn)
        where = f"table_name = {self._literal(table_name)}"
        if db:
            where = f"table_schema = {self._literal(db)} AND {where}"
        rows = self.execute(
            "SELECT column_name, data_type, is_nullable "
            "FROM information_schema.columns "
            f"WHERE {where} "
            "ORDER BY ordinal_position",
            conn=c,
        )
        return rows or []

    # ------------------------------------------------------------------
    # drop_table
    # ------------------------------------------------------------------

    def drop_table(self, table_name: str, materialized: str = "view", conn=None) -> None:
        """Drop the view (``materialized == "view"``) or table named *table_name*."""
        c = self._ensure_conn(conn)
        if materialized == "view":
            self.execute(
                f"DROP VIEW IF EXISTS {self._dml_name(table_name)}", conn=c
            )
        else:
            self.execute(
                f"DROP TABLE IF EXISTS {self._ddl_name(table_name)}", conn=c
            )

    # ------------------------------------------------------------------
    # execute_snapshot (full-rebuild CTAS pattern)
    # ------------------------------------------------------------------

    def execute_snapshot(
        self,
        sql: str,
        table_name: str,
        unique_key: str,
        strategy: str = "timestamp",
        updated_at: str = "updated_at",
        conn=None,
    ) -> None:
        """SCD Type 2 snapshot implemented as a full rebuild.

        First run: create the snapshot from *sql* plus the audit columns.
        Later runs:

        1. Stage the incoming data as a view.
        2. Select every existing row, expiring current rows whose key
           appears in the stage with a detected change.
        3. Add a fresh current row for each changed or brand-new key.
        4. Write the union to a new table and swap it in for the old one.

        The rebuilt table keeps exactly the source columns of the existing
        snapshot, so the incoming data must still provide all of them.

        Args:
            sql: SELECT statement producing the incoming data.
            table_name: Unqualified snapshot table name.
            unique_key: Column identifying a logical record.
            strategy: ``"timestamp"`` compares *updated_at* with the stored
                ``_dbt_updated_at``; any other value compares every
                non-key source column.
            updated_at: Timestamp column used by the timestamp strategy.
            conn: Optional connection to use instead of the adapter's own.

        Raises:
            RuntimeError: If the existing snapshot's columns cannot be read.
        """
        target = self._dml_name(table_name)
        target_ddl = self._ddl_name(table_name)
        stage_view = self._dml_name(f"_km_snap_{table_name}")
        rebuilt_name = f"_km_snap_new_{table_name}"
        rebuilt = self._dml_name(rebuilt_name)
        rebuilt_ddl = self._ddl_name(rebuilt_name)
        c = self._ensure_conn(conn)

        by_timestamp = strategy == "timestamp"
        # Cast so every audit timestamp is a plain TIMESTAMP; otherwise the
        # CASE / UNION branches below would mix it with CAST(NULL AS TIMESTAMP).
        now = "CAST(current_timestamp AS TIMESTAMP)"
        original_dropped = False

        try:
            if not self.table_exists(table_name, conn=c):
                updated_expr = (
                    f'CAST("{updated_at}" AS TIMESTAMP)' if by_timestamp else now
                )
                self.execute(f"""
                    CREATE TABLE {target} AS
                    SELECT *,
                        to_hex(md5(to_utf8(CAST("{unique_key}" AS VARCHAR)))) AS _scd_id,
                        {now} AS _valid_from,
                        CAST(NULL AS TIMESTAMP) AS _valid_to,
                        TRUE AS _is_current,
                        {updated_expr} AS _dbt_updated_at
                    FROM ({sql}) _src
                """, conn=c)
                return

            # Both UNION ALL branches must list the same columns in the same
            # order, so spell them out instead of relying on `*`.
            source_cols = [
                row["column_name"]
                for row in self.table_schema(table_name, conn=c)
                if row["column_name"] not in self._SNAPSHOT_AUDIT_COLS
            ]
            if not source_cols:
                raise RuntimeError(
                    f"Could not read the columns of snapshot table {table_name!r}"
                )

            # Stage incoming data as a view.
            self._drop_quietly(f"DROP VIEW IF EXISTS {stage_view}", c)
            self.execute(f"CREATE VIEW {stage_view} AS {sql}", conn=c)

            if by_timestamp:
                changed_cond = (
                    f'CAST(n."{updated_at}" AS TIMESTAMP) > s._dbt_updated_at'
                )
                updated_insert = f'CAST(n."{updated_at}" AS TIMESTAMP)'
            else:
                changed_cond = " OR ".join(
                    f'n."{col}" IS DISTINCT FROM s."{col}"'
                    for col in source_cols
                    if col != unique_key
                ) or "FALSE"
                updated_insert = now

            kept_cols = ", ".join(f's."{col}"' for col in source_cols)
            incoming_cols = ", ".join(f'n."{col}"' for col in source_cols)
            expire_when = f'n."{unique_key}" IS NOT NULL AND ({changed_cond})'
            current_match = (
                f's."{unique_key}" = n."{unique_key}" AND s._is_current = TRUE'
            )

            self._drop_quietly(f"DROP TABLE IF EXISTS {rebuilt_ddl}", c)
            self.execute(f"""
                CREATE TABLE {rebuilt} AS
                -- Existing rows: expire changed ones, keep unchanged
                SELECT {kept_cols},
                    s._scd_id,
                    s._valid_from,
                    CASE WHEN {expire_when} THEN {now} ELSE s._valid_to END AS _valid_to,
                    CASE WHEN {expire_when} THEN FALSE ELSE s._is_current END AS _is_current,
                    s._dbt_updated_at
                FROM {target} s
                LEFT JOIN {stage_view} n ON n."{unique_key}" = s."{unique_key}"
                    AND s._is_current = TRUE

                UNION ALL

                -- New rows for changed or brand-new keys
                SELECT {incoming_cols},
                    to_hex(md5(to_utf8(CAST(n."{unique_key}" AS VARCHAR)))) AS _scd_id,
                    {now} AS _valid_from,
                    CAST(NULL AS TIMESTAMP) AS _valid_to,
                    TRUE AS _is_current,
                    {updated_insert} AS _dbt_updated_at
                FROM {stage_view} n
                WHERE NOT EXISTS (
                    SELECT 1 FROM {target} s
                    WHERE {current_match}
                )
                OR EXISTS (
                    SELECT 1 FROM {target} s
                    WHERE {current_match}
                        AND ({changed_cond})
                )
            """, conn=c)

            # Swap: drop original, rename new.
            self.execute(f"DROP TABLE IF EXISTS {target_ddl}", conn=c)
            original_dropped = True
            # NOTE(review): confirm ALTER TABLE ... RENAME TO is supported for
            # the table type CTAS produces in the target workgroup.
            self.execute(
                f"ALTER TABLE {rebuilt_ddl} RENAME TO `{table_name}`", conn=c
            )
            self.execute(f"DROP VIEW IF EXISTS {stage_view}", conn=c)

        except Exception:
            self._drop_quietly(f"DROP VIEW IF EXISTS {stage_view}", c)
            if not original_dropped:
                # Once the original is gone the rebuilt table is the only
                # copy of the history, so it is kept for manual recovery.
                self._drop_quietly(f"DROP TABLE IF EXISTS {rebuilt_ddl}", c)
            raise
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
# \Z anchors at the true end of the string. A trailing "$" would also match
# just before a final newline, letting "name\n" pass validation.
_IDENTIFIER_RE = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*\Z")


def sanitize_name(name: str) -> str:
    """Validate and quote a SQL identifier to prevent injection.

    Args:
        name: Candidate identifier. It must start with an ASCII letter or
            underscore and contain only ASCII letters, digits and
            underscores.

    Returns:
        The identifier wrapped in double quotes.

    Raises:
        ValueError: If *name* contains any other character, including a
            trailing newline, or is empty.
    """
    if not _IDENTIFIER_RE.fullmatch(name):
        raise ValueError(f"Invalid SQL identifier: {name!r}")
    return f'"{name}"'
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class WarehouseAdapter(ABC):
|
|
16
|
+
@abstractmethod
|
|
17
|
+
def connect(self) -> None:
|
|
18
|
+
...
|
|
19
|
+
|
|
20
|
+
@abstractmethod
|
|
21
|
+
def disconnect(self) -> None:
|
|
22
|
+
...
|
|
23
|
+
|
|
24
|
+
@abstractmethod
|
|
25
|
+
def execute(self, sql: str, conn=None) -> Any:
|
|
26
|
+
...
|
|
27
|
+
|
|
28
|
+
@abstractmethod
|
|
29
|
+
def execute_model(
|
|
30
|
+
self, sql: str, table_name: str, materialized: str = "view",
|
|
31
|
+
conn=None, unique_key: str | None = None,
|
|
32
|
+
incremental_strategy: str = "append",
|
|
33
|
+
) -> None:
|
|
34
|
+
...
|
|
35
|
+
|
|
36
|
+
@abstractmethod
|
|
37
|
+
def table_exists(self, table_name: str, conn=None) -> bool:
|
|
38
|
+
...
|
|
39
|
+
|
|
40
|
+
@abstractmethod
|
|
41
|
+
def table_schema(self, table_name: str, conn=None) -> list[dict]:
|
|
42
|
+
...
|
|
43
|
+
|
|
44
|
+
@abstractmethod
|
|
45
|
+
def drop_table(self, table_name: str, materialized: str = "view", conn=None) -> None:
|
|
46
|
+
...
|
|
47
|
+
|
|
48
|
+
def acquire_conn(self) -> Any:
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
def release_conn(self, conn: Any) -> None:
|
|
52
|
+
pass
|
|
53
|
+
|
|
54
|
+
def preview(self, sql: str, limit: int = 100, conn=None) -> list[dict]:
|
|
55
|
+
wrapped = f"SELECT * FROM ({sql}) AS _km_preview LIMIT {limit}"
|
|
56
|
+
return self.execute(wrapped, conn=conn)
|
|
57
|
+
|
|
58
|
+
def fetch_row_count(self, table_name: str, conn=None) -> int:
|
|
59
|
+
result = self.execute(f"SELECT COUNT(*) AS cnt FROM {table_name}", conn=conn)
|
|
60
|
+
if result and len(result) > 0:
|
|
61
|
+
return result[0]["cnt"]
|
|
62
|
+
return 0
|
|
63
|
+
|
|
64
|
+
def load_csv(self, path: str, table_name: str, delimiter: str = ",") -> None:
|
|
65
|
+
"""Load a CSV/TSV file into a table. Override for warehouse-native ingest."""
|
|
66
|
+
import pandas as pd
|
|
67
|
+
df = pd.read_csv(path, sep=delimiter)
|
|
68
|
+
self._write_df(df, table_name)
|
|
69
|
+
|
|
70
|
+
def execute_snapshot(
|
|
71
|
+
self,
|
|
72
|
+
sql: str,
|
|
73
|
+
table_name: str,
|
|
74
|
+
unique_key: str,
|
|
75
|
+
strategy: str = "timestamp",
|
|
76
|
+
updated_at: str = "updated_at",
|
|
77
|
+
conn=None,
|
|
78
|
+
) -> None:
|
|
79
|
+
"""SCD Type 2 snapshot. Override per adapter."""
|
|
80
|
+
raise NotImplementedError(
|
|
81
|
+
f"Snapshots are not yet implemented for {self.__class__.__name__}. "
|
|
82
|
+
f"Supported: DuckDB, Postgres, Snowflake, BigQuery, Databricks, Fabric, Redshift."
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
def execute_materialized_view(
|
|
86
|
+
self,
|
|
87
|
+
sql: str,
|
|
88
|
+
table_name: str,
|
|
89
|
+
conn=None,
|
|
90
|
+
) -> None:
|
|
91
|
+
"""Create or refresh a materialized view. Falls back to table if unsupported."""
|
|
92
|
+
# Default: fall back to regular table (DuckDB, MySQL, Hive don't support MV natively)
|
|
93
|
+
self.drop_table(table_name, materialized="table", conn=conn)
|
|
94
|
+
self.execute_model(sql, table_name, materialized="table", conn=conn)
|
|
95
|
+
|
|
96
|
+
def _write_df(self, df, table_name: str) -> None:
|
|
97
|
+
"""Write a pandas DataFrame to the warehouse as a table."""
|
|
98
|
+
import pandas as pd
|
|
99
|
+
from io import StringIO
|
|
100
|
+
buf = StringIO()
|
|
101
|
+
df.to_csv(buf, index=False)
|
|
102
|
+
buf.seek(0)
|
|
103
|
+
lines = []
|
|
104
|
+
for _ in range(20):
|
|
105
|
+
line = buf.readline()
|
|
106
|
+
if not line:
|
|
107
|
+
break
|
|
108
|
+
lines.append(line.strip())
|
|
109
|
+
header = lines[0].split(",") if lines else []
|
|
110
|
+
sample = []
|
|
111
|
+
for line in lines[1:]:
|
|
112
|
+
vals = line.split(",")
|
|
113
|
+
if len(vals) == len(header):
|
|
114
|
+
sample.append(vals)
|
|
115
|
+
import re
|
|
116
|
+
clean = re.sub(r"[^a-zA-Z0-9_]", "_", table_name)
|
|
117
|
+
col_defs = ", ".join(
|
|
118
|
+
f'"{c}" VARCHAR' for c in header
|
|
119
|
+
)
|
|
120
|
+
stmt = f"CREATE TABLE IF NOT EXISTS \"{clean}\" ({col_defs})"
|
|
121
|
+
self.execute(stmt)
|
|
122
|
+
placeholders = ", ".join("?" for _ in header)
|
|
123
|
+
insert = f'INSERT INTO "{clean}" VALUES ({placeholders})'
|
|
124
|
+
for row in sample:
|
|
125
|
+
self.execute(insert, list(row))
|
|
126
|
+
# Rest via chunked INSERT from buffered CSV
|
|
127
|
+
buf.seek(0)
|
|
128
|
+
next(buf) # skip header
|
|
129
|
+
chunk = []
|
|
130
|
+
for line in buf:
|
|
131
|
+
vals = line.strip().split(",")
|
|
132
|
+
if len(vals) == len(header):
|
|
133
|
+
chunk.append(vals)
|
|
134
|
+
if len(chunk) >= 500:
|
|
135
|
+
for row in chunk:
|
|
136
|
+
self.execute(insert, row)
|
|
137
|
+
chunk = []
|
|
138
|
+
for row in chunk:
|
|
139
|
+
self.execute(insert, row)
|