squirrels 0.1.0__py3-none-any.whl → 0.6.0.post0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- dateutils/__init__.py +6 -0
- dateutils/_enums.py +25 -0
- squirrels/dateutils.py → dateutils/_implementation.py +409 -380
- dateutils/types.py +6 -0
- squirrels/__init__.py +21 -18
- squirrels/_api_routes/__init__.py +5 -0
- squirrels/_api_routes/auth.py +337 -0
- squirrels/_api_routes/base.py +196 -0
- squirrels/_api_routes/dashboards.py +156 -0
- squirrels/_api_routes/data_management.py +148 -0
- squirrels/_api_routes/datasets.py +220 -0
- squirrels/_api_routes/project.py +289 -0
- squirrels/_api_server.py +552 -134
- squirrels/_arguments/__init__.py +0 -0
- squirrels/_arguments/init_time_args.py +83 -0
- squirrels/_arguments/run_time_args.py +111 -0
- squirrels/_auth.py +777 -0
- squirrels/_command_line.py +239 -107
- squirrels/_compile_prompts.py +147 -0
- squirrels/_connection_set.py +94 -0
- squirrels/_constants.py +141 -64
- squirrels/_dashboards.py +179 -0
- squirrels/_data_sources.py +570 -0
- squirrels/_dataset_types.py +91 -0
- squirrels/_env_vars.py +209 -0
- squirrels/_exceptions.py +29 -0
- squirrels/_http_error_responses.py +52 -0
- squirrels/_initializer.py +319 -110
- squirrels/_logging.py +121 -0
- squirrels/_manifest.py +357 -187
- squirrels/_mcp_server.py +578 -0
- squirrels/_model_builder.py +69 -0
- squirrels/_model_configs.py +74 -0
- squirrels/_model_queries.py +52 -0
- squirrels/_models.py +1201 -0
- squirrels/_package_data/base_project/.env +7 -0
- squirrels/_package_data/base_project/.env.example +44 -0
- squirrels/_package_data/base_project/connections.yml +16 -0
- squirrels/_package_data/base_project/dashboards/dashboard_example.py +40 -0
- squirrels/_package_data/base_project/dashboards/dashboard_example.yml +22 -0
- squirrels/_package_data/base_project/docker/.dockerignore +16 -0
- squirrels/_package_data/base_project/docker/Dockerfile +16 -0
- squirrels/_package_data/base_project/docker/compose.yml +7 -0
- squirrels/_package_data/base_project/duckdb_init.sql +10 -0
- squirrels/_package_data/base_project/gitignore +13 -0
- squirrels/_package_data/base_project/macros/macros_example.sql +17 -0
- squirrels/_package_data/base_project/models/builds/build_example.py +26 -0
- squirrels/_package_data/base_project/models/builds/build_example.sql +16 -0
- squirrels/_package_data/base_project/models/builds/build_example.yml +57 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.sql +17 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.yml +32 -0
- squirrels/_package_data/base_project/models/federates/federate_example.py +51 -0
- squirrels/_package_data/base_project/models/federates/federate_example.sql +21 -0
- squirrels/_package_data/base_project/models/federates/federate_example.yml +65 -0
- squirrels/_package_data/base_project/models/sources.yml +38 -0
- squirrels/_package_data/base_project/parameters.yml +142 -0
- squirrels/_package_data/base_project/pyconfigs/connections.py +19 -0
- squirrels/_package_data/base_project/pyconfigs/context.py +96 -0
- squirrels/_package_data/base_project/pyconfigs/parameters.py +141 -0
- squirrels/_package_data/base_project/pyconfigs/user.py +56 -0
- squirrels/_package_data/base_project/resources/expenses.db +0 -0
- squirrels/_package_data/base_project/resources/public/.gitkeep +0 -0
- squirrels/_package_data/base_project/resources/weather.db +0 -0
- squirrels/_package_data/base_project/seeds/seed_categories.csv +6 -0
- squirrels/_package_data/base_project/seeds/seed_categories.yml +15 -0
- squirrels/_package_data/base_project/seeds/seed_subcategories.csv +15 -0
- squirrels/_package_data/base_project/seeds/seed_subcategories.yml +21 -0
- squirrels/_package_data/base_project/squirrels.yml.j2 +61 -0
- squirrels/_package_data/base_project/tmp/.gitignore +2 -0
- squirrels/_package_data/templates/login_successful.html +53 -0
- squirrels/_package_data/templates/squirrels_studio.html +22 -0
- squirrels/_package_loader.py +29 -0
- squirrels/_parameter_configs.py +592 -0
- squirrels/_parameter_options.py +348 -0
- squirrels/_parameter_sets.py +207 -0
- squirrels/_parameters.py +1703 -0
- squirrels/_project.py +796 -0
- squirrels/_py_module.py +122 -0
- squirrels/_request_context.py +33 -0
- squirrels/_schemas/__init__.py +0 -0
- squirrels/_schemas/auth_models.py +83 -0
- squirrels/_schemas/query_param_models.py +70 -0
- squirrels/_schemas/request_models.py +26 -0
- squirrels/_schemas/response_models.py +286 -0
- squirrels/_seeds.py +97 -0
- squirrels/_sources.py +112 -0
- squirrels/_utils.py +540 -149
- squirrels/_version.py +1 -3
- squirrels/arguments.py +7 -0
- squirrels/auth.py +4 -0
- squirrels/connections.py +3 -0
- squirrels/dashboards.py +3 -0
- squirrels/data_sources.py +14 -282
- squirrels/parameter_options.py +13 -189
- squirrels/parameters.py +14 -801
- squirrels/types.py +18 -0
- squirrels-0.6.0.post0.dist-info/METADATA +148 -0
- squirrels-0.6.0.post0.dist-info/RECORD +101 -0
- {squirrels-0.1.0.dist-info → squirrels-0.6.0.post0.dist-info}/WHEEL +1 -2
- {squirrels-0.1.0.dist-info → squirrels-0.6.0.post0.dist-info}/entry_points.txt +1 -0
- squirrels-0.6.0.post0.dist-info/licenses/LICENSE +201 -0
- squirrels/_credentials_manager.py +0 -87
- squirrels/_module_loader.py +0 -37
- squirrels/_parameter_set.py +0 -151
- squirrels/_renderer.py +0 -286
- squirrels/_timed_imports.py +0 -37
- squirrels/connection_set.py +0 -126
- squirrels/package_data/base_project/.gitignore +0 -4
- squirrels/package_data/base_project/connections.py +0 -21
- squirrels/package_data/base_project/database/sample_database.db +0 -0
- squirrels/package_data/base_project/database/seattle_weather.db +0 -0
- squirrels/package_data/base_project/datasets/sample_dataset/context.py +0 -8
- squirrels/package_data/base_project/datasets/sample_dataset/database_view1.py +0 -23
- squirrels/package_data/base_project/datasets/sample_dataset/database_view1.sql.j2 +0 -7
- squirrels/package_data/base_project/datasets/sample_dataset/final_view.py +0 -10
- squirrels/package_data/base_project/datasets/sample_dataset/final_view.sql.j2 +0 -2
- squirrels/package_data/base_project/datasets/sample_dataset/parameters.py +0 -30
- squirrels/package_data/base_project/datasets/sample_dataset/selections.cfg +0 -6
- squirrels/package_data/base_project/squirrels.yaml +0 -26
- squirrels/package_data/static/favicon.ico +0 -0
- squirrels/package_data/static/script.js +0 -234
- squirrels/package_data/static/style.css +0 -110
- squirrels/package_data/templates/index.html +0 -32
- squirrels-0.1.0.dist-info/LICENSE +0 -22
- squirrels-0.1.0.dist-info/METADATA +0 -67
- squirrels-0.1.0.dist-info/RECORD +0 -40
- squirrels-0.1.0.dist-info/top_level.txt +0 -1
squirrels/_models.py
ADDED
@@ -0,0 +1,1201 @@
from __future__ import annotations
from typing import Callable, Any
from dataclasses import dataclass, field, KW_ONLY
from abc import ABCMeta, abstractmethod
from enum import Enum
from pathlib import Path
import asyncio, os, re, time, duckdb, sqlglot
import polars as pl, pandas as pd

from . import _constants as c, _utils as u, _py_module as pm, _model_queries as mq, _model_configs as mc, _sources as src
from ._schemas import response_models as rm
from ._exceptions import FileExecutionError, InvalidInputError
from ._arguments.run_time_args import ContextArgs, ModelArgs, BuildModelArgs
from ._auth import AbstractUser
from ._connection_set import ConnectionsArgs, ConnectionSet, ConnectionProperties
from ._manifest import DatasetConfig, ConnectionTypeEnum
from ._parameter_sets import ParameterConfigsSet, ParametersArgs, ParameterSet
from ._env_vars import SquirrelsEnvVars

ContextFunc = Callable[[dict[str, Any], ContextArgs], None]


class ModelType(Enum):
    SEED = "seed"
    SOURCE = "source"
    BUILD = "build"
    DBVIEW = "dbview"
    FEDERATE = "federate"


@dataclass
class DataModel(metaclass=ABCMeta):
    name: str
    model_config: mc.ModelConfig
    is_target: bool = field(default=False, init=False)

    result: pl.LazyFrame | None = field(default=None, init=False, repr=False)
    needs_python_df: bool = field(default=False, init=False)

    wait_count: int = field(default=0, init=False, repr=False)
    confirmed_no_cycles: bool = field(default=False, init=False)
    upstreams: dict[str, DataModel] = field(default_factory=dict, init=False, repr=False)
    downstreams: dict[str, DataModel] = field(default_factory=dict, init=False, repr=False)

    _: KW_ONLY
    logger: u.Logger = field(default_factory=lambda: u.Logger(""))
    conn_set: ConnectionSet = field(default_factory=ConnectionSet)

    @property
    @abstractmethod
    def model_type(self) -> ModelType:
        pass

    @property
    def is_queryable(self) -> bool:
        return True

    def compile(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
    ) -> None:
        pass

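    # Depth-first walk over upstream dependencies: returns the set of
    # "terminal" models (those with no upstreams), raising if this model
    # appears twice on the same path (a dependency cycle). The
    # confirmed_no_cycles flag memoizes subgraphs that were already
    # validated, so shared upstreams are only visited once.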
    def get_terminal_nodes(self, dependency_path: set[str]) -> set[str]:
        if self.confirmed_no_cycles:
            return set()

        if self.name in dependency_path:
            raise u.ConfigurationError('Cycle found in model dependency graph')

        terminal_nodes = set()
        if len(self.upstreams) == 0:
            terminal_nodes.add(self.name)
        else:
            new_path = set(dependency_path)
            new_path.add(self.name)
            for dep_model in self.upstreams.values():
                terminal_nodes.update(dep_model.get_terminal_nodes(new_path))

        self.confirmed_no_cycles = True
        return terminal_nodes

    def _load_duckdb_view_to_python_df(self, conn: duckdb.DuckDBPyConnection, *, use_datalake: bool = False) -> pl.LazyFrame:
        table_name = ("vdl." if use_datalake else "") + self.name
        try:
            return conn.sql(f"FROM {table_name}").pl().lazy()
        except duckdb.CatalogException as e:
            raise u.ConfigurationError(f'Failed to load duckdb table or view "{self.name}" to python dataframe') from e

    def _run_sql_query_on_connection(self, connection_name: str, query: str, placeholders: dict = {}) -> pl.DataFrame:
        self.logger.debug(f"Running SQL query on connection '{connection_name}':\n{query}")
        return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)

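    # Countdown-trigger scheduling: compile() sets wait_count to the number
    # of upstream dependencies, and every upstream that finishes calls
    # _trigger() on its downstream models. A model only runs once its count
    # reaches zero, so run_model() cascades through the DAG with all ready
    # models running concurrently.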
    async def _trigger(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        self.wait_count -= 1
        if self.wait_count == 0:
            await self.run_model(conn, placeholders)

    async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        coroutines = []
        for model in self.downstreams.values():
            coroutines.append(model._trigger(conn, placeholders))
        await u.asyncio_gather(coroutines)

    def retrieve_dependent_query_models(self, dependent_model_names: set[str]) -> None:
        pass

    def _register_all_upstream_python_df_helper(self, conn: duckdb.DuckDBPyConnection, tables_set: set[str]) -> None:
        if self.result is not None and self.name not in tables_set:
            conn.register(self.name, self.result)
        for dep_model in self.upstreams.values():
            dep_model._register_all_upstream_python_df_helper(conn, tables_set)

    def register_all_upstream_python_df(self, conn: duckdb.DuckDBPyConnection) -> None:
        show_tables_query = "SHOW TABLES"
        tables_df = conn.sql(show_tables_query).pl()
        tables_set = set(tables_df["name"])
        self._register_all_upstream_python_df_helper(conn, tables_set)

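    # Memoized through the max_path_len_to_target attribute on first call.
    # A return value of None indicates that no path from this model reaches
    # the target model of the DAG.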
    def get_max_path_length_to_target(self) -> int | None:
        if not hasattr(self, "max_path_len_to_target"):
            path_lengths = []
            for child_model in self.downstreams.values():
                assert isinstance(child_model_path_length := child_model.get_max_path_length_to_target(), int)
                path_lengths.append(child_model_path_length + 1)
            if len(path_lengths) > 0:
                self.max_path_len_to_target = max(path_lengths)
            else:
                self.max_path_len_to_target = 0 if self.is_target else None
        return self.max_path_len_to_target

    async def _trigger_build(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        pass

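    # Relies on duckdb's replacement scan: the CREATE statement below refers
    # to the local Python variable "query_result" by name, and duckdb
    # resolves it to the polars/pandas dataframe in scope at execution time.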
    def _create_table_from_df(self, conn: duckdb.DuckDBPyConnection, query_result: pl.LazyFrame | pd.DataFrame):
        local_conn = conn.cursor()
        try:
            assert query_result is not None
            local_conn.execute(f"CREATE OR REPLACE TABLE {self.name} AS FROM query_result")
        finally:
            local_conn.close()

    def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
        pass


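# Static models (seeds, sources, and build models) participate in two
# separate dependency graphs: the query-time graph inherited from DataModel,
# and a build-time graph tracked by the *_for_build fields below, which is
# used when materializing models into the Virtual Data Lake (VDL).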
@dataclass
class StaticModel(DataModel):
    needs_python_df_for_build: bool = field(default=False, init=False)
    wait_count_for_build: int = field(default=0, init=False, repr=False)
    upstreams_for_build: dict[str, StaticModel] = field(default_factory=dict, init=False, repr=False)
    downstreams_for_build: dict[str, StaticModel] = field(default_factory=dict, init=False, repr=False)

    def get_terminal_nodes_for_build(self, dependency_path: set[str]) -> set[str]:
        if self.confirmed_no_cycles:
            return set()

        if self.name in dependency_path:
            raise u.ConfigurationError('Cycle found in model dependency graph')

        terminal_nodes = set()
        if len(self.upstreams_for_build) == 0:
            terminal_nodes.add(self.name)
        else:
            new_path = set(dependency_path)
            new_path.add(self.name)
            for dep_model in self.upstreams_for_build.values():
                terminal_nodes.update(dep_model.get_terminal_nodes_for_build(new_path))

        self.confirmed_no_cycles = True
        return terminal_nodes

    def _get_result(self, conn: duckdb.DuckDBPyConnection) -> pl.LazyFrame:
        local_conn = conn.cursor()
        try:
            return self._load_duckdb_view_to_python_df(local_conn, use_datalake=True)
        except Exception as e:
            raise InvalidInputError(409, 'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
        finally:
            local_conn.close()

    async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        if (self.needs_python_df or self.is_target) and self.result is None:
            start = time.time()

            self.result = await asyncio.to_thread(self._get_result, conn)

            self.logger.log_activity_time(
                f"loading {self.model_type.value} model '{self.name}' into memory", start,
                additional_data={
                    "activity": "loading static data model into memory",
                    "model_name": self.name,
                    "model_type": self.model_type.value
                }
            )

        await super().run_model(conn, placeholders)

    def compile_for_build(
        self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
    ) -> None:
        pass

    async def _trigger_build(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        self.wait_count_for_build -= 1
        if self.wait_count_for_build == 0:
            await self.build_model(conn, full_refresh)

    async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        if self.needs_python_df and self.result is None:
            local_conn = conn.cursor()
            try:
                self.result = await asyncio.to_thread(self._load_duckdb_view_to_python_df, local_conn)
            finally:
                local_conn.close()

        coroutines = []
        for model in self.downstreams_for_build.values():
            coroutines.append(model._trigger_build(conn, full_refresh))
        await u.asyncio_gather(coroutines)


@dataclass
class Seed(StaticModel):
    model_config: mc.SeedConfig
    result: pl.LazyFrame

    @property
    def model_type(self) -> ModelType:
        return ModelType.SEED

    async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        start = time.time()

        print(f"[{u.get_current_time()}] 🔨 BUILDING: seed model '{self.name}'")
        # await asyncio.to_thread(self._create_table_from_df, conn, self.result)
        self._create_table_from_df(conn, self.result)  # without threading

        print(f"[{u.get_current_time()}] ✅ FINISHED: seed model '{self.name}'")
        self.logger.log_activity_time(
            f"building seed model '{self.name}' into VDL", start,
            additional_data={
                "activity": "building data model into VDL",
                "model_name": self.name,
                "model_type": self.model_type.value
            }
        )

        await super().build_model(conn, full_refresh)


@dataclass
class SourceModel(StaticModel):
    model_config: src.Source

    @property
    def model_type(self) -> ModelType:
        return ModelType.SOURCE

    @property
    def connection_props(self) -> ConnectionProperties:
        conn_name = self.model_config.get_connection()
        conn_props = self.conn_set.get_connection(conn_name)
        if isinstance(conn_props, ConnectionProperties):
            return conn_props
        raise u.ConfigurationError(f'Unable to use connection "{conn_name}" for source "{self.name}". Connection "{conn_name}" must be a ConnectionProperties object')

    @property
    def is_queryable(self) -> bool:
        connection_props = self.connection_props
        return self.model_config.load_to_vdl or connection_props.type == ConnectionTypeEnum.DUCKDB

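    # Materializes the source table into the VDL. Without update hints the
    # table is recreated from scratch; with an increasing_column it is loaded
    # incrementally: possibly-stale trailing rows are deleted first, then new
    # rows are upserted via MERGE INTO, matching on the primary key when one
    # is defined.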
    def _build_source_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        local_conn = conn.cursor()

        local_conn.begin()
        try:
            source = self.model_config
            conn_name = source.get_connection()

            connection_props = self.connection_props
            dialect = connection_props.dialect
            attach_uri = connection_props.attach_uri_for_duckdb
            if attach_uri is None:
                raise u.ConfigurationError(f'Loading to duckdb is not supported for source "{self.name}" since its connection "{conn_name}" uses an unsupported dialect')

            result = u.run_duckdb_stmt(self.logger, local_conn, f"FROM (SHOW DATABASES) WHERE database_name = 'db_{conn_name}'").fetchone()
            if result is None:
                return  # skip this source if connection is not attached

            table_name = source.get_table()
            new_table_name = self.name

            if len(source.columns) == 0:
                stmt = f"CREATE OR REPLACE TABLE {new_table_name} AS FROM db_{conn_name}.{table_name}"
                u.run_duckdb_stmt(self.logger, local_conn, stmt)
                local_conn.commit()
                return

            increasing_column = source.update_hints.increasing_column
            recreate_table = full_refresh or increasing_column is None
            if recreate_table:
                u.run_duckdb_stmt(self.logger, local_conn, f"DROP TABLE IF EXISTS {new_table_name}")

            create_table_cols_clause = source.get_cols_for_create_table_stmt()
            stmt = f"CREATE TABLE IF NOT EXISTS {new_table_name} ({create_table_cols_clause})"
            u.run_duckdb_stmt(self.logger, local_conn, stmt)

            if not recreate_table:
                if source.update_hints.selective_overwrite_value is not None:
                    stmt = f"DELETE FROM {new_table_name} WHERE {increasing_column} >= $value"
                    u.run_duckdb_stmt(self.logger, local_conn, stmt, params={"value": source.update_hints.selective_overwrite_value})
                elif not source.update_hints.strictly_increasing:
                    stmt = f"DELETE FROM {new_table_name} WHERE {increasing_column} = ({source.get_max_incr_col_query(new_table_name)})"
                    u.run_duckdb_stmt(self.logger, local_conn, stmt)

            max_val_of_incr_col = None
            if increasing_column is not None:
                max_val_of_incr_col_tuple = u.run_duckdb_stmt(self.logger, local_conn, source.get_max_incr_col_query(new_table_name)).fetchone()
                max_val_of_incr_col = max_val_of_incr_col_tuple[0] if isinstance(max_val_of_incr_col_tuple, tuple) else None
                if max_val_of_incr_col is None:
                    recreate_table = True

            query = source.get_query_for_upsert(dialect, conn_name, table_name, max_val_of_incr_col, full_refresh=recreate_table)

            primary_keys = ", ".join(source.primary_key) if source.primary_key else ""
            match_condition = f"USING ({primary_keys})" if primary_keys else "ON false"
            stmt = (
                f"MERGE INTO {new_table_name} "
                f"USING ({query}) AS src "
                f"{match_condition} "
                f"WHEN MATCHED THEN UPDATE "
                f"WHEN NOT MATCHED THEN INSERT BY NAME"
            )
            u.run_duckdb_stmt(self.logger, local_conn, stmt)

            local_conn.commit()

        finally:
            local_conn.close()

    async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        if self.model_config.load_to_vdl:
            start = time.time()
            print(f"[{u.get_current_time()}] 🔨 BUILDING: source model '{self.name}'")

            # await asyncio.to_thread(self._build_source_model, conn, full_refresh)
            self._build_source_model(conn, full_refresh)  # without threading

            print(f"[{u.get_current_time()}] ✅ FINISHED: source model '{self.name}'")
            self.logger.log_activity_time(
                f"building source model '{self.name}' into VDL", start,
                additional_data={
                    "activity": "building data model into VDL",
                    "model_name": self.name,
                    "model_type": self.model_type.value
                }
            )

        await super().build_model(conn, full_refresh)


@dataclass
class QueryModel(DataModel):
    model_config: mc.QueryModelConfig
    query_file: mq.QueryFile
    compiled_query: mq.Query | None = field(default=None, init=False)
    _: KW_ONLY
    j2_env: u.j2.Environment = field(default_factory=lambda: u.j2.Environment(loader=u.j2.FileSystemLoader(".")))

    def _add_upstream(self, other: DataModel) -> None:
        self.upstreams[other.name] = other
        other.downstreams[self.name] = self

        if isinstance(self.query_file, mq.PyQueryFile):
            other.needs_python_df = True

    def _ref_for_sql(self, dependent_model_name: str, models_dict: dict[str, DataModel]) -> str:
        if dependent_model_name not in models_dict:
            raise u.ConfigurationError(f'Model "{self.name}" references unknown model "{dependent_model_name}"')

        dep_model = models_dict[dependent_model_name]
        if isinstance(dep_model, SourceModel) and not dep_model.model_config.load_to_vdl:
            # Allow when caller is Build or Federate AND the source connection is duckdb; else error
            conn_name = dep_model.model_config.get_connection()
            conn_props = self.conn_set.get_connection(conn_name)
            is_duckdb_conn = isinstance(conn_props, ConnectionProperties) and conn_props.type == ConnectionTypeEnum.DUCKDB
            if not is_duckdb_conn:
                raise u.ConfigurationError(
                    f'Model "{self.name}" cannot reference source model "{dependent_model_name}". '
                    'To be referenced by a build or federate model, the source must have load_to_vdl=True or a duckdb connection type.'
                )

        self.model_config.depends_on.add(dependent_model_name)
        return dependent_model_name

    def _ref_for_python(self, dependent_model_name: str) -> pl.LazyFrame:
        if dependent_model_name not in self.upstreams:
            raise u.ConfigurationError(f'Model "{self.name}" must include model "{dependent_model_name}" as a dependency to use')
        df = self.upstreams[dependent_model_name].result
        assert df is not None
        return df

    def _get_compile_sql_model_args_from_ctx_args(
        self, ctx: dict[str, Any], ctx_args: ContextArgs
    ) -> dict[str, Any]:
        is_placeholder = lambda placeholder: placeholder in ctx_args._placeholders
        kwargs = {
            "proj_vars": ctx_args.proj_vars, "env_vars": ctx_args.env_vars, "user": ctx_args.user, "prms": ctx_args.prms,
            "configurables": ctx_args.configurables, "ctx": ctx, "is_placeholder": is_placeholder, "set_placeholder": ctx_args.set_placeholder,
            "param_exists": ctx_args.param_exists
        }
        return kwargs

    def _get_compiled_sql_query_str(self, raw_query: str, kwargs: dict[str, Any]) -> str:
        try:
            template = self.j2_env.from_string(raw_query)
            query = template.render(kwargs)
        except Exception as e:
            raise FileExecutionError(f'Failed to compile sql model "{self.name}"', e) from e
        return query

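    # Two-phase resolution of pass-through column metadata: first validate
    # each pass-through column and register its upstream model as a
    # dependency, then recurse into all upstream models so that metadata
    # (type, condition, description, category) is resolved bottom-up before
    # being copied onto this model's columns.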
    def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
        if getattr(self, "processed_pass_through_columns", False):
            return

        for col in self.model_config.columns:
            if col.pass_through:
                # Validate pass-through column has exactly one dependency
                if len(col.depends_on) != 1:
                    raise u.ConfigurationError(
                        f'Column "{self.name}.{col.name}" has pass_through=true, which must have exactly one depends_on value'
                    )

                # Get the upstream column reference
                upstream_col_ref = next(iter(col.depends_on))
                table_name, col_name = upstream_col_ref.split('.')
                self.model_config.depends_on.add(table_name)

                # Get the upstream model
                if table_name not in models_dict:
                    raise u.ConfigurationError(
                        f'Column "{self.name}.{col.name}" depends on unknown model "{table_name}"'
                    )

        # Do not rely on self.upstreams here, as it may not be fully populated for metadata passthrough purposes
        for dep_model_name in self.model_config.depends_on:
            dep_model = models_dict[dep_model_name]
            dep_model.process_pass_through_columns(models_dict)

        for col in self.model_config.columns:
            if col.pass_through:
                upstream_col_ref = next(iter(col.depends_on))
                table_name, col_name = upstream_col_ref.split('.')
                upstream_model = models_dict[table_name]

                # Find the upstream column config
                upstream_col = next(
                    (c for c in upstream_model.model_config.columns if c.name == col_name),
                    None
                )
                if upstream_col is None:
                    raise u.ConfigurationError(
                        f'Column "{self.name}.{col.name}" depends on unknown column "{upstream_col_ref}"'
                    )

                # Copy metadata from upstream column
                col.type = upstream_col.type if col.type == "" else col.type
                col.condition = upstream_col.condition if col.condition == [] else col.condition
                col.description = upstream_col.description if col.description == "" else col.description
                col.category = upstream_col.category if col.category == mc.ColumnCategory.MISC else col.category

        self.processed_pass_through_columns = True

    def retrieve_dependent_query_models(self, dependent_model_names: set[str]) -> None:
        if self.name not in dependent_model_names:
            dependent_model_names.add(self.name)
            for dep_model in self.upstreams.values():
                dep_model.retrieve_dependent_query_models(dependent_model_names)

    def _log_sql_to_run(self, sql: str, placeholders: dict[str, Any]) -> None:
        log_msg = f"SQL to run for model '{self.name}':\n{sql}"
        log_msg += f"\n\n(with placeholders: {placeholders})"
        self.logger.debug(log_msg)


@dataclass
class DbviewModel(QueryModel):
    model_config: mc.DbviewModelConfig
    query_file: mq.SqlQueryFile
    compiled_query: mq.SqlModelQuery | None = field(default=None, init=False)
    sources: dict[str, src.Source] = field(default_factory=dict, init=False)

    @property
    def model_type(self) -> ModelType:
        return ModelType.DBVIEW

    def _get_compile_sql_model_args(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
    ) -> dict[str, Any]:
        kwargs = self._get_compile_sql_model_args_from_ctx_args(ctx, ctx_args)

        def source(source_name: str) -> str:
            if source_name not in models_dict or not isinstance(source_model := models_dict[source_name], SourceModel):
                raise u.ConfigurationError(f'Dbview "{self.name}" references unknown source "{source_name}"')
            if source_model.model_config.get_connection() != self.model_config.get_connection():
                raise u.ConfigurationError(f'Dbview "{self.name}" references source "{source_name}" with different connection')

            # Check if the source model has load_to_vdl=False but this dbview has translate_to_duckdb=True
            if not source_model.model_config.load_to_vdl and self.model_config.translate_to_duckdb:
                raise u.ConfigurationError(
                    f'Dbview "{self.name}" with translate_to_duckdb=True cannot reference source "{source_name}" '
                    f'which has load_to_vdl=False'
                )

            self.model_config.depends_on.add(source_name)
            self.sources[source_name] = source_model.model_config
            return "{{ source(\"" + source_name + "\") }}"

        kwargs["source"] = source
        kwargs["ref"] = source
        return kwargs

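    # Uses sqlglot to transpile the compiled query from the connection's SQL
    # dialect into duckdb syntax, so a dbview written against an external
    # database can instead run on the tables loaded into the VDL.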
    def _get_duckdb_query(self, read_dialect: str, query: str) -> str:
        kwargs = {
            "source": lambda source_name: "vdl." + source_name
        }
        compiled_query = self._get_compiled_sql_query_str(query, kwargs)
        duckdb_query = sqlglot.transpile(compiled_query, read=read_dialect, write="duckdb", pretty=True)[0]
        return "-- translated to duckdb\n" + duckdb_query

    def _compile_sql_model(self, kwargs: dict[str, Any]) -> mq.SqlModelQuery:
        compiled_query_str = self._get_compiled_sql_query_str(self.query_file.raw_query, kwargs)

        connection_name = self.model_config.get_connection()
        connection_props = self.conn_set.get_connection(connection_name)

        if self.model_config.translate_to_duckdb and isinstance(connection_props, ConnectionProperties):
            # Forbid translate_to_duckdb when dbview connection is duckdb
            if connection_props.type == ConnectionTypeEnum.DUCKDB:
                raise u.ConfigurationError(
                    f'Dbview "{self.name}" has translate_to_duckdb=True but its connection is duckdb. Use a federate model instead.'
                )
            macros = {
                "source": lambda source_name: "vdl." + source_name
            }
            compiled_query2 = self._get_compiled_sql_query_str(compiled_query_str, macros)
            compiled_query_str = self._get_duckdb_query(connection_props.dialect, compiled_query2)
            is_duckdb = True
        else:
            macros = {
                "source": lambda source_name: self.sources[source_name].get_table()
            }
            compiled_query_str = self._get_compiled_sql_query_str(compiled_query_str, macros)
            is_duckdb = False

        compiled_query = mq.SqlModelQuery(compiled_query_str, is_duckdb)
        return compiled_query

    def compile(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
    ) -> None:
        if self.compiled_query is not None:
            return
        else:
            self.compiled_query = mq.WorkInProgress()  # type: ignore

        start = time.time()

        kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
        self.compiled_query = self._compile_sql_model(kwargs)

        self.logger.log_activity_time(
            f"compiling dbview model '{self.name}'", start,
            additional_data={
                "activity": "compiling data model",
                "model_name": self.name,
                "model_type": self.model_type.value
            }
        )

    async def _run_sql_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        assert self.compiled_query is not None
        is_duckdb = self.compiled_query.is_duckdb
        query = self.compiled_query.query
        connection_name = self.model_config.get_connection()

        def run_sql_query_on_connection(is_duckdb: bool, query: str, placeholders: dict) -> pl.DataFrame:
            try:
                if is_duckdb:
                    local_conn = conn.cursor()
                    try:
                        self.logger.info(f"Running dbview '{self.name}' on duckdb")
                        return local_conn.sql(query, params=placeholders).pl()
                    except duckdb.CatalogException as e:
                        raise InvalidInputError(409, 'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
                    except Exception as e:
                        raise RuntimeError(e)
                    finally:
                        local_conn.close()
                else:
                    self.logger.info(f"Running dbview '{self.name}' on connection: {connection_name}")
                    return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)
            except RuntimeError as e:
                raise FileExecutionError(f'Failed to run dbview sql model "{self.name}"', e)

        self._log_sql_to_run(query, placeholders)
        result = await asyncio.to_thread(run_sql_query_on_connection, is_duckdb, query, placeholders)
        self.result = result.lazy()

    async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        start = time.time()

        await self._run_sql_model(conn, placeholders)

        self.logger.log_activity_time(
            f"running dbview model '{self.name}'", start,
            additional_data={
                "activity": "running data model",
                "model_name": self.name,
                "model_type": self.model_type.value
            }
        )

        await super().run_model(conn, placeholders)


@dataclass
class FederateModel(QueryModel):
    model_config: mc.FederateModelConfig
    query_file: mq.SqlQueryFile | mq.PyQueryFile
    compiled_query: mq.SqlModelQuery | mq.PyModelQuery | None = field(default=None, init=False)

    @property
    def model_type(self) -> ModelType:
        return ModelType.FEDERATE

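    # The ref() macro exposed to federate SQL resolves a dependency name to a
    # queryable table: build models resolve to "vdl.<name>"; source models
    # resolve to "vdl.<name>" when loaded to the VDL, otherwise to the
    # attached database alias "db_<connection>.<table>"; other models are
    # referenced by bare name (presumably bound to tables or registered
    # dataframes in the local duckdb session).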
    def _get_compile_sql_model_args(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
    ) -> dict[str, Any]:
        kwargs = self._get_compile_sql_model_args_from_ctx_args(ctx, ctx_args)

        def ref(dependent_model_name: str) -> str:
            dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
            dep = models_dict[dependent_model]
            if isinstance(dep, BuildModel):
                return "vdl." + dependent_model
            if isinstance(dep, SourceModel):
                if dep.model_config.load_to_vdl:
                    return "vdl." + dependent_model
                conn_name = dep.model_config.get_connection()
                table_name = dep.model_config.get_table()
                return f"db_{conn_name}.{table_name}"
            return dependent_model

        kwargs["ref"] = ref
        return kwargs

    def _compile_sql_model(
        self, query_file: mq.SqlQueryFile, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
    ) -> mq.SqlModelQuery:
        kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
        compiled_query_str = self._get_compiled_sql_query_str(query_file.raw_query, kwargs)
        compiled_query = mq.SqlModelQuery(compiled_query_str, is_duckdb=True)
        return compiled_query

    def _get_python_model_args(self, ctx: dict[str, Any], ctx_args: ContextArgs) -> ModelArgs:
        dependencies = self.model_config.depends_on
        connections = self.conn_set.get_connections_as_dict()

        def _run_external_sql(connection_name: str, sql_query: str) -> pl.DataFrame:
            return self._run_sql_query_on_connection(connection_name, sql_query, ctx_args._placeholders)

        build_model_args = BuildModelArgs(
            **ctx_args._conn_args.__dict__,
            connections=connections, dependencies=dependencies,
            _ref_func=self._ref_for_python, _run_external_sql_func=_run_external_sql
        )

        # Instantiate ModelArgs with flattened arguments
        combined_args = {
            **ctx_args.__dict__, **build_model_args.__dict__, "ctx": ctx,
        }
        model_args = ModelArgs(**combined_args)
        return model_args

    def _compile_python_model(
        self, query_file: mq.PyQueryFile, ctx: dict[str, Any], ctx_args: ContextArgs
    ) -> mq.PyModelQuery:
        sqrl_args = self._get_python_model_args(ctx, ctx_args)

        def compiled_query() -> pl.LazyFrame | pd.DataFrame:
            try:
                return query_file.raw_query(sqrl_args)
            except Exception as e:
                raise FileExecutionError(f'Failed to run "{c.MAIN_FUNC}" function for python model "{self.name}"', e) from e

        return mq.PyModelQuery(compiled_query)

    def compile(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
    ) -> None:
        if self.compiled_query is not None:
            return
        else:
            self.compiled_query = mq.WorkInProgress()  # type: ignore

        start = time.time()

        if isinstance(self.query_file, mq.SqlQueryFile):
            self.compiled_query = self._compile_sql_model(self.query_file, ctx, ctx_args, models_dict)
        elif isinstance(self.query_file, mq.PyQueryFile):
            self.compiled_query = self._compile_python_model(self.query_file, ctx, ctx_args)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        self.logger.log_activity_time(
            f"compiling federate model '{self.name}'", start,
            additional_data={
                "activity": "compiling data model",
                "model_name": self.name,
                "model_type": self.model_type.value
            }
        )

        if not recurse:
            return

        dependencies = self.model_config.depends_on
        self.wait_count = len(dependencies)

        for name in dependencies:
            dep_model = models_dict[name]
            self._add_upstream(dep_model)
            dep_model.compile(ctx, ctx_args, models_dict, recurse)

    async def _run_sql_model(self, compiled_query: mq.SqlModelQuery, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        local_conn = conn.cursor()
        try:
            self.register_all_upstream_python_df(local_conn)
            query = compiled_query.query

            def create_table(local_conn: duckdb.DuckDBPyConnection):
                # DuckDB doesn't support specifying named parameters that are not used in the query, so filtering them out
                placeholder_exists = lambda key: re.search(r"\$" + key + r"(?!\w)", query)
                existing_placeholders = {key: value for key, value in placeholders.items() if placeholder_exists(key)}

                create_query = self.model_config.get_sql_for_create(self.name, query)
                self._log_sql_to_run(create_query, existing_placeholders)
                try:
                    return local_conn.execute(create_query, existing_placeholders)
                except duckdb.CatalogException as e:
                    if self.name == "__fake_target":
                        raise InvalidInputError(409, "invalid_sql_query", "Provided SQL query depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.")
                    else:
                        raise InvalidInputError(409, 'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
                except Exception as e:
                    if self.name == "__fake_target":
                        raise InvalidInputError(400, "invalid_sql_query", "Failed to run provided SQL query")
                    else:
                        raise FileExecutionError(f'Failed to run federate sql model "{self.name}"', e) from e

            await asyncio.to_thread(create_table, local_conn)
            if self.needs_python_df or self.is_target:
                self.result = await asyncio.to_thread(self._load_duckdb_view_to_python_df, local_conn)
        finally:
            local_conn.close()

    async def _run_python_model(self, compiled_query: mq.PyModelQuery) -> None:
        query_result = await asyncio.to_thread(compiled_query.query)
        if isinstance(query_result, pd.DataFrame):
            query_result = pl.from_pandas(query_result)

        self.result = query_result.lazy()

    async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        start = time.time()

        if isinstance(self.compiled_query, mq.SqlModelQuery):
            await self._run_sql_model(self.compiled_query, conn, placeholders)
        elif isinstance(self.compiled_query, mq.PyModelQuery):
            await self._run_python_model(self.compiled_query)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        self.logger.log_activity_time(
            f"running federate model '{self.name}'", start,
            additional_data={
                "activity": "running data model",
                "model_name": self.name,
                "model_type": self.model_type.value
            }
        )

        await super().run_model(conn, placeholders)


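# BuildModel draws on both base classes: StaticModel contributes the
# build-time DAG fields and VDL materialization hooks, while QueryModel
# contributes SQL/Python query compilation.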
@dataclass
class BuildModel(StaticModel, QueryModel):
    model_config: mc.BuildModelConfig
    query_file: mq.SqlQueryFile | mq.PyQueryFile
    compiled_query: mq.SqlModelQuery | mq.PyModelQuery | None = field(default=None, init=False)

    @property
    def model_type(self) -> ModelType:
        return ModelType.BUILD

    def _add_upstream_for_build(self, other: StaticModel) -> None:
        self.upstreams_for_build[other.name] = other
        other.downstreams_for_build[self.name] = self

        if isinstance(self.query_file, mq.PyQueryFile):
            other.needs_python_df_for_build = True

    def _get_compile_sql_model_args(
        self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
    ) -> dict[str, Any]:
        kwargs: dict[str, Any] = {
            "proj_vars": conn_args.proj_vars, "env_vars": conn_args.env_vars
        }

        def ref_for_build(dependent_model_name: str) -> str:
            dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
            dep = models_dict[dependent_model]
            if isinstance(dep, SourceModel) and not dep.model_config.load_to_vdl:
                conn_name = dep.model_config.get_connection()
                table_name = dep.model_config.get_table()
                return f"db_{conn_name}.{table_name}"
            return dependent_model

        kwargs["ref"] = ref_for_build
        return kwargs

    def _compile_sql_model(
        self, query_file: mq.SqlQueryFile, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
    ) -> mq.SqlModelQuery:
        kwargs = self._get_compile_sql_model_args(conn_args, models_dict)
        compiled_query_str = self._get_compiled_sql_query_str(query_file.raw_query, kwargs)
        compiled_query = mq.SqlModelQuery(compiled_query_str, is_duckdb=True)
        return compiled_query

    def _ref_for_python(self, dependent_model_name: str) -> pl.LazyFrame:
        if dependent_model_name not in self.upstreams_for_build:
            raise u.ConfigurationError(f'Model "{self.name}" must include model "{dependent_model_name}" as a dependency to use')
        df = self.upstreams_for_build[dependent_model_name].result
        assert df is not None
        return df

    def _get_compile_python_model_args(self, conn_args: ConnectionsArgs) -> BuildModelArgs:

        def _run_external_sql(connection_name: str, sql_query: str):
            return self._run_sql_query_on_connection(connection_name, sql_query)

        return BuildModelArgs(
            **conn_args.__dict__,
            connections=self.conn_set.get_connections_as_dict(), dependencies=self.model_config.depends_on,
            _ref_func=self._ref_for_python, _run_external_sql_func=_run_external_sql
        )

    def _compile_python_model(
        self, query_file: mq.PyQueryFile, conn_args: ConnectionsArgs
    ) -> mq.PyModelQuery:
        sqrl_args = self._get_compile_python_model_args(conn_args)

        def compiled_query() -> pl.LazyFrame | pd.DataFrame:
            try:
                return query_file.raw_query(sqrl_args)
            except Exception as e:
                raise FileExecutionError(f'Failed to run "{c.MAIN_FUNC}" function for build model "{self.name}"', e)

        return mq.PyModelQuery(compiled_query)

    def compile_for_build(self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]) -> None:
        start = time.time()

        if isinstance(self.query_file, mq.SqlQueryFile):
            self.compiled_query = self._compile_sql_model(self.query_file, conn_args, models_dict)
        elif isinstance(self.query_file, mq.PyQueryFile):
            self.compiled_query = self._compile_python_model(self.query_file, conn_args)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        self.logger.log_activity_time(
            f"compiling build model '{self.name}'", start,
            additional_data={
                "activity": "compiling data model",
                "model_name": self.name,
                "model_type": self.model_type.value
            }
        )

        dependencies = self.model_config.depends_on
        self.wait_count_for_build = len(dependencies)

        for name in dependencies:
            dep_model = models_dict[name]
            self._add_upstream_for_build(dep_model)

    async def _build_sql_model(self, compiled_query: mq.SqlModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
        query = compiled_query.query

        def create_table():
            create_query = self.model_config.get_sql_for_build(self.name, query)
            local_conn = conn.cursor()
            try:
                return u.run_duckdb_stmt(self.logger, local_conn, create_query, model_name=self.name)
            except Exception as e:
                raise FileExecutionError(f'Failed to build static sql model "{self.name}"', e) from e
            finally:
                local_conn.close()

        # await asyncio.to_thread(create_table)
        create_table()  # without threading

    async def _build_python_model(self, compiled_query: mq.PyModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
        query_result = await asyncio.to_thread(compiled_query.query)
        if isinstance(query_result, pd.DataFrame):
            query_result = pl.from_pandas(query_result).lazy()
        if self.needs_python_df_for_build:
            self.result = query_result.lazy()
        # await asyncio.to_thread(self._create_table_from_df, conn, query_result)
        self._create_table_from_df(conn, query_result)  # without threading

    async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        start = time.time()
        print(f"[{u.get_current_time()}] 🔨 BUILDING: build model '{self.name}'")

        if isinstance(self.compiled_query, mq.SqlModelQuery):
            await self._build_sql_model(self.compiled_query, conn)
        elif isinstance(self.compiled_query, mq.PyModelQuery):
            # First ensure all upstream models have an associated Python dataframe
            def load_df(conn: duckdb.DuckDBPyConnection, dep_model: DataModel):
                if dep_model.result is None:
                    local_conn = conn.cursor()
                    try:
                        dep_model.result = dep_model._load_duckdb_view_to_python_df(local_conn)
                    finally:
                        local_conn.close()

            coroutines = []
            for dep_model in self.upstreams_for_build.values():
                coro = asyncio.to_thread(load_df, conn, dep_model)
                coroutines.append(coro)
            await u.asyncio_gather(coroutines)

            # Then run the model's Python function to build the model
            await self._build_python_model(self.compiled_query, conn)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        print(f"[{u.get_current_time()}] ✅ FINISHED: build model '{self.name}'")
        self.logger.log_activity_time(
            f"building static build model '{self.name}' into VDL", start,
            additional_data={
                "activity": "building data model into VDL",
                "model_name": self.name,
                "model_type": self.model_type.value
            }
        )

        await super().build_model(conn, full_refresh)


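# Orchestrates a single execution of the model graph: applies parameter
# selections, runs context.py, compiles models from the target model down
# through its dependencies, then runs the models on a duckdb connection.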
@dataclass
|
|
972
|
+
class DAG:
|
|
973
|
+
dataset: DatasetConfig | None
|
|
974
|
+
target_model: DataModel
|
|
975
|
+
models_dict: dict[str, DataModel]
|
|
976
|
+
datalake_db_path: str | None = field(default=None)
|
|
977
|
+
logger: u.Logger = field(default_factory=lambda: u.Logger(""))
|
|
978
|
+
parameter_set: ParameterSet | None = field(default=None, init=False) # set in apply_selections
|
|
979
|
+
placeholders: dict[str, Any] = field(init=False, default_factory=dict)
|
|
980
|
+
|
|
981
|
+
def _get_msg_extension(self) -> str:
|
|
982
|
+
return f" for dataset '{self.dataset.name}'" if self.dataset else ""
|
|
983
|
+
|
|
984
|
+
def compile_build_models(self, conn_args: ConnectionsArgs) -> None:
|
|
985
|
+
static_models: dict[str, StaticModel] = {
|
|
986
|
+
k: v for k, v in self.models_dict.items() if isinstance(v, StaticModel)
|
|
987
|
+
}
|
|
988
|
+
for model in static_models.values():
|
|
989
|
+
if isinstance(model, BuildModel):
|
|
990
|
+
model.compile_for_build(conn_args, static_models)
|
|
991
|
+
|
|
992
|
+
def apply_selections(
|
|
993
|
+
self, param_cfg_set: ParameterConfigsSet, user: AbstractUser, selections: dict[str, str]
|
|
994
|
+
) -> None:
|
|
995
|
+
start = time.time()
|
|
996
|
+
|
|
997
|
+
dataset_params = self.dataset.parameters if self.dataset else None
|
|
998
|
+
parameter_set = param_cfg_set.apply_selections(dataset_params, selections, user)
|
|
999
|
+
self.parameter_set = parameter_set
|
|
1000
|
+
msg_extension = self._get_msg_extension()
|
|
1001
|
+
|
|
1002
|
+
dataset_name = self.dataset.name if self.dataset else None
|
|
1003
|
+
self.logger.log_activity_time(
|
|
1004
|
+
"applying selections" + msg_extension, start,
|
|
1005
|
+
additional_data={"activity": "applying selections", "dataset_name": dataset_name}
|
|
1006
|
+
)
|
|
1007
|
+
|
|
1008
|
+    def _compile_context(
+        self, param_args: ParametersArgs, context_func: ContextFunc, user: AbstractUser, configurables: dict[str, str]
+    ) -> tuple[dict[str, Any], ContextArgs]:
+        start = time.time()
+
+        context = {}
+        assert isinstance(self.parameter_set, ParameterSet)
+        prms = self.parameter_set.get_parameters_as_dict()
+        args = ContextArgs(
+            **param_args.__dict__, user=user, prms=prms, configurables=configurables, _conn_args=param_args
+        )
+        msg_extension = self._get_msg_extension()
+
+        try:
+            context_func(context, args)
+        except Exception as e:
+            raise FileExecutionError(f'Failed to run {c.CONTEXT_FILE}' + msg_extension, e) from e
+
+        dataset_name = self.dataset.name if self.dataset else None
+        self.logger.log_activity_time(
+            "running context.py" + msg_extension, start,
+            additional_data={"activity": "running context.py", "dataset_name": dataset_name}
+        )
+        return context, args
+
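Note: _compile_context invokes the project-defined function as context_func(context, args), where args exposes the selected parameters as prms, the authenticated user, and any configurables. A minimal sketch of what a project's pyconfigs/context.py "main" function might look like under that contract; the parameter name "limit" and the context keys are hypothetical, not part of the package:

    def main(ctx: dict, sqrl) -> None:
        # Derive values once here so model queries can reference them from the context.
        ctx["row_limit"] = sqrl.prms.get("limit")  # hypothetical parameter name
        ctx["username"] = getattr(sqrl.user, "username", None)  # assumes the user object carries a username
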
+    def _compile_models(self, context: dict[str, Any], ctx_args: ContextArgs, recurse: bool) -> None:
+        self.target_model.compile(context, ctx_args, self.models_dict, recurse)
+
+    def _get_terminal_nodes(self) -> set[str]:
+        start = time.time()
+        terminal_nodes = self.target_model.get_terminal_nodes(set())
+        for model in self.models_dict.values():
+            model.confirmed_no_cycles = False
+        self.logger.log_activity_time("validating no cycles in model dependencies", start)
+        return terminal_nodes
+
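Note: get_terminal_nodes recurses from the target model down to its dependency-free ancestors, and the confirmed_no_cycles flags memoize the cycle check (reset here so the next run re-validates). A self-contained sketch of that style of traversal, not the package's implementation:

    def terminal_nodes(deps: dict[str, list[str]], node: str, visiting: set[str]) -> set[str]:
        if node in visiting:
            raise ValueError(f"cycle detected at '{node}'")
        if not deps.get(node):
            return {node}  # no dependencies: this is a terminal node
        visiting.add(node)
        result: set[str] = set()
        for child in deps[node]:
            result |= terminal_nodes(deps, child, visiting)
        visiting.discard(node)
        return result

    print(terminal_nodes({"a": ["b", "c"], "b": ["c"], "c": []}, "a", set()))  # {'c'}
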
+    def _attach_connections_with_type_duckdb(self, conn: duckdb.DuckDBPyConnection) -> None:
+        for conn_name, connection in self.target_model.conn_set.get_connections_as_dict().items():
+            if not isinstance(connection, ConnectionProperties):
+                continue
+            attach_uri = connection.attach_uri_for_duckdb
+            if attach_uri is None:
+                continue
+            attach_stmt = f"ATTACH IF NOT EXISTS '{attach_uri}' AS db_{conn_name} (READ_ONLY)"
+            u.run_duckdb_stmt(self.logger, conn, attach_stmt, redacted_values=[attach_uri])
+
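Note: for DuckDB-type connections, each external database is attached read-only under a db_<name> alias before any model runs. The equivalent calls using only the duckdb package, assuming the attached file (name illustrative) already exists:

    import duckdb

    conn = duckdb.connect()  # in-memory stand-in for the datalake connection
    conn.execute("ATTACH IF NOT EXISTS 'expenses.db' AS db_default (READ_ONLY)")
    print(conn.execute("SHOW DATABASES").fetchall())  # lists 'memory' and 'db_default'
    conn.close()
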
+    async def _run_models(self) -> None:
+        terminal_nodes = self._get_terminal_nodes()
+
+        conn = u.create_duckdb_connection(datalake_db_path=self.datalake_db_path)
+        try:
+            self._attach_connections_with_type_duckdb(conn)
+
+            coroutines = []
+            for model_name in terminal_nodes:
+                model = self.models_dict[model_name] if model_name != "__fake_target" else self.target_model
+                coroutines.append(model.run_model(conn, self.placeholders))
+            await u.asyncio_gather(coroutines)
+
+        finally:
+            conn.close()
+
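Note: the terminal models are awaited as a batch, so independent subtrees of the DAG can make progress concurrently. The underlying pattern, sketched with plain asyncio (u.asyncio_gather is assumed here to be a thin wrapper over asyncio.gather):

    import asyncio

    async def run_model(name: str) -> str:
        await asyncio.sleep(0.1)  # stand-in for executing a model's query
        return f"finished {name}"

    async def main() -> None:
        results = await asyncio.gather(*(run_model(n) for n in ("model_a", "model_b")))
        print(results)  # ['finished model_a', 'finished model_b']

    asyncio.run(main())
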
+    async def execute(
+        self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user: AbstractUser, selections: dict[str, str],
+        *, runquery: bool = True, recurse: bool = True, configurables: dict[str, str] = {}
+    ) -> None:
+        recurse = (recurse or runquery)
+
+        self.apply_selections(param_cfg_set, user, selections)
+
+        context, ctx_args = self._compile_context(param_args, context_func, user, configurables)
+
+        self._compile_models(context, ctx_args, recurse)
+
+        self.placeholders = dict(ctx_args._placeholders)
+        if runquery:
+            await self._run_models()
+
+        self.target_model.process_pass_through_columns(self.models_dict)
+
+    def get_all_query_models(self) -> set[str]:
+        all_model_names = set()
+        self.target_model.retrieve_dependent_query_models(all_model_names)
+        return all_model_names
+
+    def get_all_data_models(self) -> list[rm.DataModelItem]:
+        data_models = []
+        for model_name, model in self.models_dict.items():
+            is_queryable = model.is_queryable
+            data_model = rm.DataModelItem(name=model_name, model_type=model.model_type.value, config=model.model_config, is_queryable=is_queryable)
+            data_models.append(data_model)
+        return data_models
+
+    def get_all_model_lineage(self) -> list[rm.LineageRelation]:
+        model_lineage = []
+        for model_name, model in self.models_dict.items():
+            if not isinstance(model, QueryModel):
+                continue
+            for dep_model_name in model.model_config.depends_on:
+                edge_type = "buildtime" if isinstance(model, BuildModel) else "runtime"
+                source_model = rm.LineageNode(name=dep_model_name, type="model")
+                target_model = rm.LineageNode(name=model_name, type="model")
+                model_lineage.append(rm.LineageRelation(type=edge_type, source=source_model, target=target_model))
+        return model_lineage
+
+
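Note: get_all_model_lineage flattens each query model's depends_on list into directed source-to-target edges, tagged "buildtime" or "runtime" by model kind. The same flattening over plain dicts, with model names borrowed from the base project for illustration:

    deps = {"federate_example": ["build_example"], "build_example": []}
    edges = [
        {"type": "runtime", "source": dep, "target": name}
        for name, dep_list in deps.items()
        for dep in dep_list
    ]
    print(edges)  # [{'type': 'runtime', 'source': 'build_example', 'target': 'federate_example'}]
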
+class ModelsIO:
+
+    @classmethod
+    def _load_model_config(cls, filepath: Path, model_type: ModelType, env_vars: SquirrelsEnvVars) -> mc.ModelConfig:
+        yaml_path = filepath.with_suffix('.yml')
+        config_dict = u.load_yaml_config(yaml_path) if yaml_path.exists() else {}
+
+        if model_type == ModelType.DBVIEW:
+            default_conn_name = env_vars.connections_default_name_used
+            config = mc.DbviewModelConfig(**config_dict).finalize_connection(default_conn_name=default_conn_name)
+            return config
+        elif model_type == ModelType.FEDERATE:
+            return mc.FederateModelConfig(**config_dict)
+        elif model_type == ModelType.BUILD:
+            return mc.BuildModelConfig(**config_dict)
+        else:
+            return mc.ModelConfig(**config_dict)
+
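Note: each model file takes its configuration from a sidecar YAML file with the same stem (my_model.sql pairs with my_model.yml), and the YAML's top-level keys become constructor arguments for the matching config class. A sketch of that convention using PyYAML directly; the path is illustrative and the keys depend on the model type:

    from pathlib import Path
    import yaml

    model_path = Path("models/builds/my_model.sql")
    yaml_path = model_path.with_suffix(".yml")
    config_dict = yaml.safe_load(yaml_path.read_text()) if yaml_path.exists() else {}
    print(config_dict)  # e.g. {'depends_on': [...], ...} when the sidecar file exists
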
+    @classmethod
+    def _populate_from_file(
+        cls, raw_queries_by_model: dict[str, mq.QueryFileWithConfig], dp: str, file: str, model_type: ModelType, env_vars: SquirrelsEnvVars
+    ) -> None:
+        filepath = Path(dp, file)
+        file_stem, extension = os.path.splitext(file)
+
+        if extension == '.py':
+            module = pm.PyModule(filepath, project_path=env_vars.project_path)
+            raw_query = module.get_func_or_class(c.MAIN_FUNC)
+            query_file = mq.PyQueryFile(filepath.as_posix(), raw_query)
+        elif extension == '.sql':
+            query_file = mq.SqlQueryFile(filepath.as_posix(), filepath.read_text())
+        else:
+            return  # Skip files that are not query files
+
+        if file_stem in raw_queries_by_model:
+            assert isinstance(prior_query_file := raw_queries_by_model[file_stem].query_file, mq.QueryFile)
+            conflicts = [prior_query_file.filepath, query_file.filepath]
+            raise u.ConfigurationError(f"Multiple models found for '{file_stem}': {conflicts}")
+
+        model_config = cls._load_model_config(filepath, model_type, env_vars)
+        raw_queries_by_model[file_stem] = mq.QueryFileWithConfig(query_file, model_config)
+
+    @classmethod
+    def _populate_raw_queries_for_type(
+        cls, folder_path: Path, model_type: ModelType, env_vars: SquirrelsEnvVars
+    ) -> dict[str, mq.QueryFileWithConfig]:
+        raw_queries_by_model: dict[str, mq.QueryFileWithConfig] = {}
+        for dp, _, filenames in os.walk(folder_path):
+            for file in filenames:
+                cls._populate_from_file(raw_queries_by_model, dp, file, model_type, env_vars)
+        return raw_queries_by_model
+
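Note: model discovery is a recursive walk of the models folder; every .sql or .py file becomes a model keyed by its file stem, and a duplicate stem anywhere in the tree is a configuration error. A standalone sketch of that walk (folder name illustrative):

    import os
    from pathlib import Path

    found: dict[str, Path] = {}
    for dp, _, filenames in os.walk("models/dbviews"):
        for file in filenames:
            stem, ext = os.path.splitext(file)
            if ext not in (".sql", ".py"):
                continue  # skip non-query files
            if stem in found:
                raise ValueError(f"Multiple models found for '{stem}'")
            found[stem] = Path(dp, file)
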
+    @classmethod
+    def load_build_files(cls, logger: u.Logger, env_vars: SquirrelsEnvVars) -> dict[str, mq.QueryFileWithConfig]:
+        start = time.time()
+        builds_path = u.Path(env_vars.project_path, c.MODELS_FOLDER, c.BUILDS_FOLDER)
+        raw_queries_by_model = cls._populate_raw_queries_for_type(builds_path, ModelType.BUILD, env_vars=env_vars)
+        logger.log_activity_time("loading build files", start)
+        return raw_queries_by_model
+
+    @classmethod
+    def load_dbview_files(cls, logger: u.Logger, env_vars: SquirrelsEnvVars) -> dict[str, mq.QueryFileWithConfig]:
+        start = time.time()
+        dbviews_path = u.Path(env_vars.project_path, c.MODELS_FOLDER, c.DBVIEWS_FOLDER)
+        raw_queries_by_model = cls._populate_raw_queries_for_type(dbviews_path, ModelType.DBVIEW, env_vars=env_vars)
+        logger.log_activity_time("loading dbview files", start)
+        return raw_queries_by_model
+
+    @classmethod
+    def load_federate_files(cls, logger: u.Logger, env_vars: SquirrelsEnvVars) -> dict[str, mq.QueryFileWithConfig]:
+        start = time.time()
+        federates_path = u.Path(env_vars.project_path, c.MODELS_FOLDER, c.FEDERATES_FOLDER)
+        raw_queries_by_model = cls._populate_raw_queries_for_type(federates_path, ModelType.FEDERATE, env_vars=env_vars)
+        logger.log_activity_time("loading federate files", start)
+        return raw_queries_by_model
+
+    @classmethod
+    def load_context_func(cls, logger: u.Logger, project_path: str) -> ContextFunc:
+        start = time.time()
+
+        context_path = u.Path(project_path, c.PYCONFIGS_FOLDER, c.CONTEXT_FILE)
+        context_func: ContextFunc = pm.PyModule(
+            context_path, project_path=project_path
+        ).get_func_or_class(c.MAIN_FUNC, default_attr=lambda ctx, sqrl: None)
+
+        logger.log_activity_time("loading file for context.py", start)
+        return context_func
+