squirrels 0.4.0__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of squirrels might be problematic. Click here for more details.
- squirrels/__init__.py +10 -6
- squirrels/_api_response_models.py +93 -44
- squirrels/_api_server.py +571 -219
- squirrels/_auth.py +451 -0
- squirrels/_command_line.py +61 -20
- squirrels/_connection_set.py +38 -25
- squirrels/_constants.py +44 -34
- squirrels/_dashboards_io.py +34 -16
- squirrels/_exceptions.py +57 -0
- squirrels/_initializer.py +117 -44
- squirrels/_manifest.py +124 -62
- squirrels/_model_builder.py +111 -0
- squirrels/_model_configs.py +74 -0
- squirrels/_model_queries.py +52 -0
- squirrels/_models.py +860 -354
- squirrels/_package_loader.py +8 -4
- squirrels/_parameter_configs.py +45 -65
- squirrels/_parameter_sets.py +15 -13
- squirrels/_project.py +561 -0
- squirrels/_py_module.py +4 -3
- squirrels/_seeds.py +35 -16
- squirrels/_sources.py +106 -0
- squirrels/_utils.py +166 -63
- squirrels/_version.py +1 -1
- squirrels/arguments/init_time_args.py +78 -15
- squirrels/arguments/run_time_args.py +62 -101
- squirrels/dashboards.py +4 -4
- squirrels/data_sources.py +94 -162
- squirrels/dataset_result.py +86 -0
- squirrels/dateutils.py +4 -4
- squirrels/package_data/base_project/.env +30 -0
- squirrels/package_data/base_project/.env.example +30 -0
- squirrels/package_data/base_project/.gitignore +3 -2
- squirrels/package_data/base_project/assets/expenses.db +0 -0
- squirrels/package_data/base_project/connections.yml +11 -3
- squirrels/package_data/base_project/dashboards/dashboard_example.py +15 -13
- squirrels/package_data/base_project/dashboards/dashboard_example.yml +22 -0
- squirrels/package_data/base_project/docker/.dockerignore +5 -2
- squirrels/package_data/base_project/docker/Dockerfile +3 -3
- squirrels/package_data/base_project/docker/compose.yml +1 -1
- squirrels/package_data/base_project/duckdb_init.sql +9 -0
- squirrels/package_data/base_project/macros/macros_example.sql +15 -0
- squirrels/package_data/base_project/models/builds/build_example.py +26 -0
- squirrels/package_data/base_project/models/builds/build_example.sql +16 -0
- squirrels/package_data/base_project/models/builds/build_example.yml +55 -0
- squirrels/package_data/base_project/models/dbviews/dbview_example.sql +12 -22
- squirrels/package_data/base_project/models/dbviews/dbview_example.yml +26 -0
- squirrels/package_data/base_project/models/federates/federate_example.py +38 -15
- squirrels/package_data/base_project/models/federates/federate_example.sql +16 -2
- squirrels/package_data/base_project/models/federates/federate_example.yml +65 -0
- squirrels/package_data/base_project/models/sources.yml +39 -0
- squirrels/package_data/base_project/parameters.yml +36 -21
- squirrels/package_data/base_project/pyconfigs/connections.py +6 -11
- squirrels/package_data/base_project/pyconfigs/context.py +20 -33
- squirrels/package_data/base_project/pyconfigs/parameters.py +19 -21
- squirrels/package_data/base_project/pyconfigs/user.py +23 -0
- squirrels/package_data/base_project/seeds/seed_categories.yml +15 -0
- squirrels/package_data/base_project/seeds/seed_subcategories.csv +15 -15
- squirrels/package_data/base_project/seeds/seed_subcategories.yml +21 -0
- squirrels/package_data/base_project/squirrels.yml.j2 +17 -40
- squirrels/parameters.py +20 -20
- {squirrels-0.4.0.dist-info → squirrels-0.5.0rc0.dist-info}/METADATA +31 -32
- squirrels-0.5.0rc0.dist-info/RECORD +70 -0
- {squirrels-0.4.0.dist-info → squirrels-0.5.0rc0.dist-info}/WHEEL +1 -1
- squirrels-0.5.0rc0.dist-info/entry_points.txt +3 -0
- {squirrels-0.4.0.dist-info → squirrels-0.5.0rc0.dist-info/licenses}/LICENSE +1 -1
- squirrels/_authenticator.py +0 -85
- squirrels/_environcfg.py +0 -84
- squirrels/package_data/assets/favicon.ico +0 -0
- squirrels/package_data/assets/index.css +0 -1
- squirrels/package_data/assets/index.js +0 -58
- squirrels/package_data/base_project/dashboards.yml +0 -10
- squirrels/package_data/base_project/env.yml +0 -29
- squirrels/package_data/base_project/models/dbviews/dbview_example.py +0 -47
- squirrels/package_data/base_project/pyconfigs/auth.py +0 -45
- squirrels/package_data/templates/index.html +0 -18
- squirrels/project.py +0 -378
- squirrels/user_base.py +0 -55
- squirrels-0.4.0.dist-info/RECORD +0 -60
- squirrels-0.4.0.dist-info/entry_points.txt +0 -4
squirrels/_models.py
CHANGED
|
@@ -1,147 +1,120 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
-
from typing import
|
|
3
|
-
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Callable, Any
|
|
3
|
+
from dataclasses import dataclass, field, KW_ONLY
|
|
4
4
|
from abc import ABCMeta, abstractmethod
|
|
5
5
|
from enum import Enum
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
|
|
8
|
-
import
|
|
9
|
-
|
|
10
|
-
from . import _constants as c, _utils as u, _py_module as pm
|
|
11
|
-
from .
|
|
12
|
-
from .
|
|
13
|
-
from .
|
|
14
|
-
from .
|
|
7
|
+
import asyncio, os, re, time, duckdb, sqlglot
|
|
8
|
+
import polars as pl, pandas as pd, networkx as nx
|
|
9
|
+
|
|
10
|
+
from . import _constants as c, _utils as u, _py_module as pm, _model_queries as mq, _model_configs as mc, _sources as src, _api_response_models as arm
|
|
11
|
+
from ._exceptions import FileExecutionError, InvalidInputError
|
|
12
|
+
from .arguments.run_time_args import ContextArgs, ModelArgs, BuildModelArgs
|
|
13
|
+
from ._auth import BaseUser
|
|
14
|
+
from ._connection_set import ConnectionsArgs, ConnectionSet, ConnectionProperties
|
|
15
|
+
from ._manifest import DatasetConfig
|
|
15
16
|
from ._parameter_sets import ParameterConfigsSet, ParametersArgs, ParameterSet
|
|
16
17
|
|
|
17
18
|
ContextFunc = Callable[[dict[str, Any], ContextArgs], None]
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
class ModelType(Enum):
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
TABLE = 0
|
|
27
|
-
VIEW = 1
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
@dataclass
|
|
31
|
-
class _SqlModelConfig:
|
|
32
|
-
## Applicable for dbview models
|
|
33
|
-
connection_name: str
|
|
34
|
-
|
|
35
|
-
## Applicable for federated models
|
|
36
|
-
materialized: _Materialization
|
|
37
|
-
|
|
38
|
-
def set_attribute(self, *, connection_name: str | None = None, materialized: str | None = None, **kwargs) -> str:
|
|
39
|
-
if connection_name is not None:
|
|
40
|
-
if not isinstance(connection_name, str):
|
|
41
|
-
raise u.ConfigurationError("The 'connection_name' argument of 'config' macro must be a string")
|
|
42
|
-
self.connection_name = connection_name
|
|
43
|
-
|
|
44
|
-
if materialized is not None:
|
|
45
|
-
if not isinstance(materialized, str):
|
|
46
|
-
raise u.ConfigurationError("The 'materialized' argument of 'config' macro must be a string")
|
|
47
|
-
try:
|
|
48
|
-
self.materialized = _Materialization[materialized.upper()]
|
|
49
|
-
except KeyError as e:
|
|
50
|
-
valid_options = [x.name for x in _Materialization]
|
|
51
|
-
raise u.ConfigurationError(f"The 'materialized' argument value '{materialized}' is not valid. Must be one of: {valid_options}") from e
|
|
52
|
-
return ""
|
|
53
|
-
|
|
54
|
-
def get_sql_for_create(self, model_name: str, select_query: str) -> str:
|
|
55
|
-
create_prefix = f"CREATE {self.materialized.name} {model_name} AS\n"
|
|
56
|
-
return create_prefix + select_query
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
@dataclass(frozen=True)
|
|
60
|
-
class QueryFile:
|
|
61
|
-
filepath: str
|
|
62
|
-
model_type: ModelType
|
|
63
|
-
|
|
64
|
-
@dataclass(frozen=True)
|
|
65
|
-
class SqlQueryFile(QueryFile):
|
|
66
|
-
raw_query: str
|
|
67
|
-
|
|
68
|
-
@dataclass(frozen=True)
|
|
69
|
-
class _RawPyQuery:
|
|
70
|
-
query: Callable[[ModelArgs], pd.DataFrame]
|
|
71
|
-
dependencies_func: Callable[[ModelDepsArgs], Iterable[str]]
|
|
72
|
-
|
|
73
|
-
@dataclass(frozen=True)
|
|
74
|
-
class PyQueryFile(QueryFile):
|
|
75
|
-
raw_query: _RawPyQuery
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
@dataclass
|
|
79
|
-
class _Query(metaclass=ABCMeta):
|
|
80
|
-
query: Any
|
|
81
|
-
|
|
82
|
-
@dataclass
|
|
83
|
-
class _WorkInProgress(_Query):
|
|
84
|
-
query: None = field(default=None, init=False)
|
|
85
|
-
|
|
86
|
-
@dataclass
|
|
87
|
-
class SqlModelQuery(_Query):
|
|
88
|
-
query: str
|
|
89
|
-
config: _SqlModelConfig
|
|
90
|
-
|
|
91
|
-
@dataclass
|
|
92
|
-
class PyModelQuery(_Query):
|
|
93
|
-
query: Callable[[], pd.DataFrame]
|
|
22
|
+
SOURCE = "source"
|
|
23
|
+
DBVIEW = "dbview"
|
|
24
|
+
FEDERATE = "federate"
|
|
25
|
+
SEED = "seed"
|
|
26
|
+
BUILD = "build"
|
|
94
27
|
|
|
95
28
|
|
|
96
29
|
@dataclass
|
|
97
|
-
class
|
|
30
|
+
class DataModel(metaclass=ABCMeta):
|
|
98
31
|
name: str
|
|
32
|
+
model_config: mc.ModelConfig
|
|
99
33
|
is_target: bool = field(default=False, init=False)
|
|
100
34
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
result: pd.DataFrame | None = field(default=None, init=False, repr=False)
|
|
35
|
+
result: pl.LazyFrame | None = field(default=None, init=False, repr=False)
|
|
36
|
+
needs_python_df: bool = field(default=False, init=False)
|
|
104
37
|
|
|
105
38
|
wait_count: int = field(default=0, init=False, repr=False)
|
|
106
39
|
confirmed_no_cycles: bool = field(default=False, init=False)
|
|
107
|
-
upstreams: dict[str,
|
|
108
|
-
downstreams: dict[str,
|
|
40
|
+
upstreams: dict[str, DataModel] = field(default_factory=dict, init=False, repr=False)
|
|
41
|
+
downstreams: dict[str, DataModel] = field(default_factory=dict, init=False, repr=False)
|
|
42
|
+
|
|
43
|
+
_: KW_ONLY
|
|
44
|
+
logger: u.Logger = field(default_factory=lambda: u.Logger(""))
|
|
45
|
+
env_vars: dict[str, str] = field(default_factory=dict)
|
|
46
|
+
conn_set: ConnectionSet = field(default_factory=ConnectionSet)
|
|
109
47
|
|
|
48
|
+
@property
|
|
110
49
|
@abstractmethod
|
|
111
|
-
def
|
|
50
|
+
def model_type(self) -> ModelType:
|
|
112
51
|
pass
|
|
113
52
|
|
|
114
|
-
|
|
115
|
-
|
|
53
|
+
@property
|
|
54
|
+
def is_queryable(self) -> bool:
|
|
55
|
+
return True
|
|
56
|
+
|
|
57
|
+
def compile(
|
|
58
|
+
self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
|
|
116
59
|
) -> None:
|
|
117
60
|
pass
|
|
118
61
|
|
|
119
|
-
@abstractmethod
|
|
120
62
|
def get_terminal_nodes(self, depencency_path: set[str]) -> set[str]:
|
|
121
|
-
|
|
63
|
+
if self.confirmed_no_cycles:
|
|
64
|
+
return set()
|
|
122
65
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
66
|
+
if self.name in depencency_path:
|
|
67
|
+
raise u.ConfigurationError(f'Cycle found in model dependency graph')
|
|
68
|
+
|
|
69
|
+
terminal_nodes = set()
|
|
70
|
+
if len(self.upstreams) == 0:
|
|
71
|
+
terminal_nodes.add(self.name)
|
|
72
|
+
else:
|
|
73
|
+
new_path = set(depencency_path)
|
|
74
|
+
new_path.add(self.name)
|
|
75
|
+
for dep_model in self.upstreams.values():
|
|
76
|
+
terminal_nodes.update(dep_model.get_terminal_nodes(new_path))
|
|
77
|
+
|
|
78
|
+
self.confirmed_no_cycles = True
|
|
79
|
+
return terminal_nodes
|
|
80
|
+
|
|
81
|
+
def _load_duckdb_view_to_python_df(self, conn: duckdb.DuckDBPyConnection, *, use_venv: bool = False) -> pl.LazyFrame:
|
|
82
|
+
table_name = ("venv." if use_venv else "") + self.name
|
|
83
|
+
try:
|
|
84
|
+
return conn.sql(f"FROM {table_name}").pl().lazy()
|
|
85
|
+
except duckdb.CatalogException as e:
|
|
86
|
+
raise u.ConfigurationError(f'Failed to load duckdb table or view "{self.name}" to python dataframe') from e
|
|
129
87
|
|
|
130
|
-
|
|
88
|
+
def _run_sql_query_on_connection(self, connection_name: str, query: str, placeholders: dict = {}) -> pl.DataFrame:
|
|
89
|
+
self.logger.info(f"Running sql query on connection '{connection_name}': {query}")
|
|
90
|
+
return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)
|
|
91
|
+
|
|
92
|
+
async def _trigger(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
131
93
|
self.wait_count -= 1
|
|
132
94
|
if (self.wait_count == 0):
|
|
133
95
|
await self.run_model(conn, placeholders)
|
|
134
96
|
|
|
135
|
-
|
|
136
|
-
async def run_model(self, conn: Connection, placeholders: dict = {}) -> None:
|
|
97
|
+
async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
137
98
|
coroutines = []
|
|
138
99
|
for model in self.downstreams.values():
|
|
139
100
|
coroutines.append(model._trigger(conn, placeholders))
|
|
140
|
-
await
|
|
101
|
+
await u.asyncio_gather(coroutines)
|
|
141
102
|
|
|
142
103
|
def retrieve_dependent_query_models(self, dependent_model_names: set[str]) -> None:
|
|
143
104
|
pass
|
|
144
|
-
|
|
105
|
+
|
|
106
|
+
def _register_all_upstream_python_df_helper(self, conn: duckdb.DuckDBPyConnection, tables_set: set[str]) -> None:
|
|
107
|
+
if self.result is not None and self.name not in tables_set:
|
|
108
|
+
conn.register(self.name, self.result)
|
|
109
|
+
for dep_model in self.upstreams.values():
|
|
110
|
+
dep_model._register_all_upstream_python_df_helper(conn, tables_set)
|
|
111
|
+
|
|
112
|
+
def register_all_upstream_python_df(self, conn: duckdb.DuckDBPyConnection) -> None:
|
|
113
|
+
show_tables_query = f"SHOW TABLES"
|
|
114
|
+
tables_df = conn.sql(show_tables_query).pl()
|
|
115
|
+
tables_set = set(tables_df["name"])
|
|
116
|
+
self._register_all_upstream_python_df_helper(conn, tables_set)
|
|
117
|
+
|
|
145
118
|
def get_max_path_length_to_target(self) -> int | None:
|
|
146
119
|
if not hasattr(self, "max_path_len_to_target"):
|
|
147
120
|
path_lengths = []
|
|
@@ -154,283 +127,748 @@ class Referable(metaclass=ABCMeta):
|
|
|
154
127
|
self.max_path_len_to_target = 0 if self.is_target else None
|
|
155
128
|
return self.max_path_len_to_target
|
|
156
129
|
|
|
130
|
+
async def _trigger_build(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
131
|
+
pass
|
|
132
|
+
|
|
133
|
+
def _create_table_from_df(self, conn: duckdb.DuckDBPyConnection, query_result: pl.LazyFrame | pd.DataFrame):
|
|
134
|
+
local_conn = conn.cursor()
|
|
135
|
+
try:
|
|
136
|
+
local_conn.register("df", query_result)
|
|
137
|
+
local_conn.execute(f"CREATE OR REPLACE TABLE {self.name} AS SELECT * FROM df")
|
|
138
|
+
finally:
|
|
139
|
+
local_conn.close()
|
|
140
|
+
|
|
141
|
+
def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
|
|
142
|
+
pass
|
|
143
|
+
|
|
157
144
|
|
|
158
145
|
@dataclass
|
|
159
|
-
class
|
|
160
|
-
|
|
146
|
+
class StaticModel(DataModel):
|
|
147
|
+
needs_python_df_for_build: bool = field(default=False, init=False)
|
|
148
|
+
wait_count_for_build: int = field(default=0, init=False, repr=False)
|
|
149
|
+
upstreams_for_build: dict[str, StaticModel] = field(default_factory=dict, init=False, repr=False)
|
|
150
|
+
downstreams_for_build: dict[str, StaticModel] = field(default_factory=dict, init=False, repr=False)
|
|
151
|
+
|
|
152
|
+
def get_terminal_nodes_for_build(self, depencency_path: set[str]) -> set[str]:
|
|
153
|
+
if self.confirmed_no_cycles:
|
|
154
|
+
return set()
|
|
155
|
+
|
|
156
|
+
if self.name in depencency_path:
|
|
157
|
+
raise u.ConfigurationError(f'Cycle found in model dependency graph')
|
|
158
|
+
|
|
159
|
+
terminal_nodes = set()
|
|
160
|
+
if len(self.upstreams_for_build) == 0:
|
|
161
|
+
terminal_nodes.add(self.name)
|
|
162
|
+
else:
|
|
163
|
+
new_path = set(depencency_path)
|
|
164
|
+
new_path.add(self.name)
|
|
165
|
+
for dep_model in self.upstreams_for_build.values():
|
|
166
|
+
terminal_nodes.update(dep_model.get_terminal_nodes_for_build(new_path))
|
|
167
|
+
|
|
168
|
+
self.confirmed_no_cycles = True
|
|
169
|
+
return terminal_nodes
|
|
170
|
+
|
|
171
|
+
def _get_result(self, conn: duckdb.DuckDBPyConnection) -> pl.LazyFrame:
|
|
172
|
+
local_conn = conn.cursor()
|
|
173
|
+
try:
|
|
174
|
+
return self._load_duckdb_view_to_python_df(local_conn, use_venv=True)
|
|
175
|
+
except Exception as e:
|
|
176
|
+
raise InvalidInputError(61, f'Model "{self.name}" depends on static data models that cannot be found.')
|
|
177
|
+
finally:
|
|
178
|
+
local_conn.close()
|
|
179
|
+
|
|
180
|
+
async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
181
|
+
start = time.time()
|
|
161
182
|
|
|
162
|
-
|
|
183
|
+
if (self.needs_python_df or self.is_target) and self.result is None:
|
|
184
|
+
self.result = await asyncio.to_thread(self._get_result, conn)
|
|
185
|
+
|
|
186
|
+
self.logger.log_activity_time(f"loading static model '{self.name}'", start)
|
|
187
|
+
|
|
188
|
+
await super().run_model(conn, placeholders)
|
|
189
|
+
|
|
190
|
+
def compile_for_build(
|
|
191
|
+
self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
|
|
192
|
+
) -> None:
|
|
193
|
+
pass
|
|
194
|
+
|
|
195
|
+
async def _trigger_build(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
196
|
+
self.wait_count_for_build -= 1
|
|
197
|
+
if (self.wait_count_for_build == 0):
|
|
198
|
+
await self.build_model(conn, full_refresh)
|
|
199
|
+
|
|
200
|
+
async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
201
|
+
if self.needs_python_df and self.result is None:
|
|
202
|
+
local_conn = conn.cursor()
|
|
203
|
+
try:
|
|
204
|
+
self.result = await asyncio.to_thread(self._load_duckdb_view_to_python_df, local_conn)
|
|
205
|
+
finally:
|
|
206
|
+
local_conn.close()
|
|
207
|
+
|
|
208
|
+
coroutines = []
|
|
209
|
+
for model in self.downstreams_for_build.values():
|
|
210
|
+
coroutines.append(model._trigger_build(conn, full_refresh))
|
|
211
|
+
await u.asyncio_gather(coroutines)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
@dataclass
|
|
215
|
+
class Seed(StaticModel):
|
|
216
|
+
model_config: mc.SeedConfig
|
|
217
|
+
result: pl.LazyFrame
|
|
218
|
+
|
|
219
|
+
@property
|
|
220
|
+
def model_type(self) -> ModelType:
|
|
163
221
|
return ModelType.SEED
|
|
222
|
+
|
|
223
|
+
async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
224
|
+
start = time.time()
|
|
164
225
|
|
|
165
|
-
|
|
166
|
-
|
|
226
|
+
print(f"[{u.get_current_time()}] 🔨 BUILDING: seed model '{self.name}'")
|
|
227
|
+
await asyncio.to_thread(self._create_table_from_df, conn, self.result)
|
|
228
|
+
|
|
229
|
+
print(f"[{u.get_current_time()}] ✅ FINISHED: seed model '{self.name}'")
|
|
230
|
+
self.logger.log_activity_time(f"building seed model '{self.name}' to venv", start)
|
|
231
|
+
|
|
232
|
+
await super().build_model(conn, full_refresh)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
@dataclass
|
|
236
|
+
class SourceModel(StaticModel):
|
|
237
|
+
model_config: src.Source
|
|
238
|
+
|
|
239
|
+
@property
|
|
240
|
+
def model_type(self) -> ModelType:
|
|
241
|
+
return ModelType.SOURCE
|
|
167
242
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
243
|
+
@property
|
|
244
|
+
def is_queryable(self) -> bool:
|
|
245
|
+
return self.model_config.load_to_duckdb
|
|
246
|
+
|
|
247
|
+
def _build_source_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
248
|
+
local_conn = conn.cursor()
|
|
249
|
+
try:
|
|
250
|
+
source = self.model_config
|
|
251
|
+
conn_name = source.get_connection()
|
|
252
|
+
|
|
253
|
+
connection_props = self.conn_set.get_connection(conn_name)
|
|
254
|
+
if isinstance(connection_props, ConnectionProperties):
|
|
255
|
+
dialect = connection_props.dialect
|
|
256
|
+
else:
|
|
257
|
+
raise u.ConfigurationError(f'Unable to use connection "{conn_name}" for source "{self.name}"')
|
|
258
|
+
|
|
259
|
+
result = u.run_duckdb_stmt(self.logger, local_conn, f"FROM (SHOW DATABASES) WHERE database_name = 'db_{conn_name}'").fetchone()
|
|
260
|
+
if result is None:
|
|
261
|
+
return # skip this source if connection is not attached
|
|
262
|
+
|
|
263
|
+
table_name = source.get_table()
|
|
264
|
+
new_table_name = self.name
|
|
172
265
|
|
|
266
|
+
if len(source.columns) == 0:
|
|
267
|
+
stmt = f"CREATE OR REPLACE TABLE {new_table_name} AS SELECT * FROM db_{conn_name}.{table_name}"
|
|
268
|
+
u.run_duckdb_stmt(self.logger, local_conn, stmt)
|
|
269
|
+
return
|
|
270
|
+
|
|
271
|
+
increasing_column = source.update_hints.increasing_column
|
|
272
|
+
recreate_table = full_refresh or increasing_column is None
|
|
273
|
+
if recreate_table:
|
|
274
|
+
u.run_duckdb_stmt(self.logger, local_conn, f"DROP TABLE IF EXISTS {new_table_name}")
|
|
275
|
+
|
|
276
|
+
create_table_cols_clause = source.get_cols_for_create_table_stmt()
|
|
277
|
+
stmt = f"CREATE TABLE IF NOT EXISTS {new_table_name} ({create_table_cols_clause})"
|
|
278
|
+
u.run_duckdb_stmt(self.logger, local_conn, stmt)
|
|
279
|
+
|
|
280
|
+
if not recreate_table:
|
|
281
|
+
if source.update_hints.selective_overwrite_value is not None:
|
|
282
|
+
stmt = f"DELETE FROM {new_table_name} WHERE {increasing_column} >= $value"
|
|
283
|
+
u.run_duckdb_stmt(self.logger, local_conn, stmt, params={"value": source.update_hints.selective_overwrite_value})
|
|
284
|
+
elif not source.update_hints.strictly_increasing:
|
|
285
|
+
stmt = f"DELETE FROM {new_table_name} WHERE {increasing_column} = ({source.get_max_incr_col_query(new_table_name)})"
|
|
286
|
+
u.run_duckdb_stmt(self.logger, local_conn, stmt)
|
|
287
|
+
|
|
288
|
+
max_val_of_incr_col = None
|
|
289
|
+
if increasing_column is not None:
|
|
290
|
+
max_val_of_incr_col_tuple = u.run_duckdb_stmt(self.logger, local_conn, source.get_max_incr_col_query(new_table_name)).fetchone()
|
|
291
|
+
max_val_of_incr_col = max_val_of_incr_col_tuple[0] if isinstance(max_val_of_incr_col_tuple, tuple) else None
|
|
292
|
+
if max_val_of_incr_col is None:
|
|
293
|
+
recreate_table = True
|
|
294
|
+
|
|
295
|
+
insert_cols_clause = source.get_cols_for_insert_stmt()
|
|
296
|
+
insert_replace_clause = source.get_insert_replace_clause()
|
|
297
|
+
query = source.get_query_for_insert(dialect, conn_name, table_name, max_val_of_incr_col, full_refresh=recreate_table)
|
|
298
|
+
stmt = f"INSERT {insert_replace_clause} INTO {new_table_name} ({insert_cols_clause}) {query}"
|
|
299
|
+
u.run_duckdb_stmt(self.logger, local_conn, stmt)
|
|
300
|
+
finally:
|
|
301
|
+
local_conn.close()
|
|
302
|
+
|
|
303
|
+
async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
304
|
+
if self.model_config.load_to_duckdb:
|
|
305
|
+
start = time.time()
|
|
306
|
+
print(f"[{u.get_current_time()}] 🔨 BUILDING: source model '{self.name}'")
|
|
307
|
+
|
|
308
|
+
await asyncio.to_thread(self._build_source_model, conn, full_refresh)
|
|
309
|
+
|
|
310
|
+
print(f"[{u.get_current_time()}] ✅ FINISHED: source model '{self.name}'")
|
|
311
|
+
self.logger.log_activity_time(f"building source model '{self.name}' to venv", start)
|
|
312
|
+
|
|
313
|
+
await super().build_model(conn, full_refresh)
|
|
314
|
+
|
|
173
315
|
|
|
174
316
|
@dataclass
|
|
175
|
-
class
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
317
|
+
class QueryModel(DataModel):
|
|
318
|
+
model_config: mc.QueryModelConfig
|
|
319
|
+
query_file: mq.QueryFile
|
|
320
|
+
compiled_query: mq.Query | None = field(default=None, init=False)
|
|
321
|
+
_: KW_ONLY
|
|
180
322
|
j2_env: u.j2.Environment = field(default_factory=lambda: u.j2.Environment(loader=u.j2.FileSystemLoader(".")))
|
|
181
|
-
compiled_query: _Query | None = field(default=None, init=False)
|
|
182
|
-
|
|
183
|
-
def get_model_type(self) -> ModelType:
|
|
184
|
-
return self.query_file.model_type
|
|
185
323
|
|
|
186
|
-
def _add_upstream(self, other:
|
|
324
|
+
def _add_upstream(self, other: DataModel) -> None:
|
|
187
325
|
self.upstreams[other.name] = other
|
|
188
326
|
other.downstreams[self.name] = self
|
|
189
327
|
|
|
190
|
-
if isinstance(self.query_file,
|
|
191
|
-
other.
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
is_placeholder = lambda placeholder: placeholder in placeholders
|
|
328
|
+
if isinstance(self.query_file, mq.PyQueryFile):
|
|
329
|
+
other.needs_python_df = True
|
|
330
|
+
|
|
331
|
+
def _ref_for_sql(self, dependent_model_name: str, models_dict: dict[str, DataModel]) -> str:
|
|
332
|
+
if dependent_model_name not in models_dict:
|
|
333
|
+
raise u.ConfigurationError(f'Model "{self.name}" references unknown model "{dependent_model_name}"')
|
|
334
|
+
|
|
335
|
+
dep_model = models_dict[dependent_model_name]
|
|
336
|
+
if isinstance(dep_model, SourceModel) and not dep_model.model_config.load_to_duckdb:
|
|
337
|
+
raise u.ConfigurationError(
|
|
338
|
+
f'Model "{self.name}" cannot reference source model "{dependent_model_name}" which has load_to_duckdb=False'
|
|
339
|
+
)
|
|
340
|
+
|
|
341
|
+
self.model_config.depends_on.add(dependent_model_name)
|
|
342
|
+
return dependent_model_name
|
|
343
|
+
|
|
344
|
+
def _ref_for_python(self, dependent_model_name: str) -> pl.LazyFrame:
|
|
345
|
+
if dependent_model_name not in self.upstreams:
|
|
346
|
+
raise u.ConfigurationError(f'Model "{self.name}" must include model "{dependent_model_name}" as a dependency to use')
|
|
347
|
+
df = self.upstreams[dependent_model_name].result
|
|
348
|
+
assert df is not None
|
|
349
|
+
return df
|
|
350
|
+
|
|
351
|
+
def _get_compile_sql_model_args_from_ctx_args(
|
|
352
|
+
self, ctx: dict[str, Any], ctx_args: ContextArgs
|
|
353
|
+
) -> dict[str, Any]:
|
|
354
|
+
is_placeholder = lambda placeholder: placeholder in ctx_args.placeholders
|
|
218
355
|
kwargs = {
|
|
219
356
|
"proj_vars": ctx_args.proj_vars, "env_vars": ctx_args.env_vars, "user": ctx_args.user, "prms": ctx_args.prms,
|
|
220
357
|
"traits": ctx_args.traits, "ctx": ctx, "is_placeholder": is_placeholder, "set_placeholder": ctx_args.set_placeholder,
|
|
221
|
-
"
|
|
358
|
+
"param_exists": ctx_args.param_exists
|
|
222
359
|
}
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
if dependent_model_name not in models_dict:
|
|
227
|
-
raise u.ConfigurationError(f'Model "{self.name}" references unknown model "{dependent_model_name}"')
|
|
228
|
-
dependencies.add(dependent_model_name)
|
|
229
|
-
return dependent_model_name
|
|
230
|
-
kwargs["ref"] = ref
|
|
231
|
-
|
|
360
|
+
return kwargs
|
|
361
|
+
|
|
362
|
+
def _get_compiled_sql_query_str(self, raw_query: str, kwargs: dict[str, Any]) -> str:
|
|
232
363
|
try:
|
|
233
|
-
template = self.j2_env.from_string(
|
|
234
|
-
query =
|
|
364
|
+
template = self.j2_env.from_string(raw_query)
|
|
365
|
+
query = template.render(kwargs)
|
|
235
366
|
except Exception as e:
|
|
236
|
-
raise
|
|
367
|
+
raise FileExecutionError(f'Failed to compile sql model "{self.name}"', e) from e
|
|
368
|
+
return query
|
|
369
|
+
|
|
370
|
+
def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
|
|
371
|
+
if getattr(self, "processed_pass_through_columns", False):
|
|
372
|
+
return
|
|
373
|
+
|
|
374
|
+
for col in self.model_config.columns:
|
|
375
|
+
if col.pass_through:
|
|
376
|
+
# Validate pass-through column has exactly one dependency
|
|
377
|
+
if len(col.depends_on) != 1:
|
|
378
|
+
raise u.ConfigurationError(
|
|
379
|
+
f'Column "{self.name}.{col.name}" has pass_through=true, which must have exactly one depends_on value'
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
# Get the upstream column reference
|
|
383
|
+
upstream_col_ref = next(iter(col.depends_on))
|
|
384
|
+
table_name, col_name = upstream_col_ref.split('.')
|
|
385
|
+
self.model_config.depends_on.add(table_name)
|
|
386
|
+
|
|
387
|
+
# Get the upstream model
|
|
388
|
+
if table_name not in models_dict:
|
|
389
|
+
raise u.ConfigurationError(
|
|
390
|
+
f'Column "{self.name}.{col.name}" depends on unknown model "{table_name}"'
|
|
391
|
+
)
|
|
392
|
+
|
|
393
|
+
# Do not rely on self.upstreams here, as it may not be fully populated for metadata passthrough purposes
|
|
394
|
+
for dep_model_name in self.model_config.depends_on:
|
|
395
|
+
dep_model = models_dict[dep_model_name]
|
|
396
|
+
dep_model.process_pass_through_columns(models_dict)
|
|
397
|
+
|
|
398
|
+
for col in self.model_config.columns:
|
|
399
|
+
if col.pass_through:
|
|
400
|
+
upstream_col_ref = next(iter(col.depends_on))
|
|
401
|
+
table_name, col_name = upstream_col_ref.split('.')
|
|
402
|
+
upstream_model = models_dict[table_name]
|
|
403
|
+
|
|
404
|
+
# Find the upstream column config
|
|
405
|
+
upstream_col = next(
|
|
406
|
+
(c for c in upstream_model.model_config.columns if c.name == col_name),
|
|
407
|
+
None
|
|
408
|
+
)
|
|
409
|
+
if upstream_col is None:
|
|
410
|
+
raise u.ConfigurationError(
|
|
411
|
+
f'Column "{self.name}.{col.name}" depends on unknown column "{upstream_col_ref}"'
|
|
412
|
+
)
|
|
413
|
+
|
|
414
|
+
# Copy metadata from upstream column
|
|
415
|
+
col.type = upstream_col.type if col.type == "" else col.type
|
|
416
|
+
col.condition = upstream_col.condition if col.condition == "" else col.condition
|
|
417
|
+
col.description = upstream_col.description if col.description == "" else col.description
|
|
418
|
+
col.category = upstream_col.category if col.category == mc.ColumnCategory.MISC else col.category
|
|
419
|
+
|
|
420
|
+
self.processed_pass_through_columns = True
|
|
237
421
|
|
|
238
|
-
|
|
239
|
-
|
|
422
|
+
def retrieve_dependent_query_models(self, dependent_model_names: set[str]) -> None:
|
|
423
|
+
if self.name not in dependent_model_names:
|
|
424
|
+
dependent_model_names.add(self.name)
|
|
425
|
+
for dep_model in self.upstreams.values():
|
|
426
|
+
dep_model.retrieve_dependent_query_models(dependent_model_names)
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
@dataclass
|
|
430
|
+
class DbviewModel(QueryModel):
|
|
431
|
+
model_config: mc.DbviewModelConfig
|
|
432
|
+
query_file: mq.SqlQueryFile
|
|
433
|
+
compiled_query: mq.SqlModelQuery | None = field(default=None, init=False)
|
|
434
|
+
sources: dict[str, src.Source] = field(default_factory=dict, init=False)
|
|
435
|
+
|
|
436
|
+
@property
|
|
437
|
+
def model_type(self) -> ModelType:
|
|
438
|
+
return ModelType.DBVIEW
|
|
439
|
+
|
|
440
|
+
def _get_compile_sql_model_args(
|
|
441
|
+
self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
|
|
442
|
+
) -> dict[str, Any]:
|
|
443
|
+
kwargs = self._get_compile_sql_model_args_from_ctx_args(ctx, ctx_args)
|
|
444
|
+
|
|
445
|
+
def source(source_name: str) -> str:
|
|
446
|
+
if source_name not in models_dict or not isinstance(source_model := models_dict[source_name], SourceModel):
|
|
447
|
+
raise u.ConfigurationError(f'Dbview "{self.name}" references unknown source "{source_name}"')
|
|
448
|
+
if source_model.model_config.get_connection() != self.model_config.get_connection():
|
|
449
|
+
raise u.ConfigurationError(f'Dbview "{self.name}" references source "{source_name}" with different connection')
|
|
450
|
+
|
|
451
|
+
# Check if the source model has load_to_duckdb=False but this dbview has translate_to_duckdb=True
|
|
452
|
+
if not source_model.model_config.load_to_duckdb and self.model_config.translate_to_duckdb:
|
|
453
|
+
raise u.ConfigurationError(
|
|
454
|
+
f'Dbview "{self.name}" with translate_to_duckdb=True cannot reference source "{source_name}" '
|
|
455
|
+
f'which has load_to_duckdb=False'
|
|
456
|
+
)
|
|
457
|
+
|
|
458
|
+
self.model_config.depends_on.add(source_name)
|
|
459
|
+
self.sources[source_name] = source_model.model_config
|
|
460
|
+
return "{{ source(\"" + source_name + "\") }}"
|
|
461
|
+
|
|
462
|
+
kwargs["source"] = source
|
|
463
|
+
return kwargs
|
|
464
|
+
|
|
465
|
+
def _get_duckdb_query(self, read_dialect: str, query: str) -> str:
|
|
466
|
+
kwargs = {
|
|
467
|
+
"source": lambda source_name: "venv." + source_name
|
|
468
|
+
}
|
|
469
|
+
compiled_query = self._get_compiled_sql_query_str(query, kwargs)
|
|
470
|
+
return sqlglot.transpile(compiled_query, read=read_dialect, write="duckdb")[0]
|
|
240
471
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
472
|
+
def _compile_sql_model(self, kwargs: dict[str, Any]) -> mq.SqlModelQuery:
|
|
473
|
+
compiled_query_str = self._get_compiled_sql_query_str(self.query_file.raw_query, kwargs)
|
|
474
|
+
|
|
475
|
+
connection_name = self.model_config.get_connection()
|
|
476
|
+
connection_props = self.conn_set.get_connection(connection_name)
|
|
245
477
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
478
|
+
if self.model_config.translate_to_duckdb and isinstance(connection_props, ConnectionProperties):
|
|
479
|
+
macros = {
|
|
480
|
+
"source": lambda source_name: "venv." + source_name
|
|
481
|
+
}
|
|
482
|
+
compiled_query2 = self._get_compiled_sql_query_str(compiled_query_str, macros)
|
|
483
|
+
compiled_query_str = self._get_duckdb_query(connection_props.dialect, compiled_query2)
|
|
484
|
+
is_duckdb = True
|
|
485
|
+
else:
|
|
486
|
+
macros = {
|
|
487
|
+
"source": lambda source_name: self.sources[source_name].get_table()
|
|
488
|
+
}
|
|
489
|
+
compiled_query_str = self._get_compiled_sql_query_str(compiled_query_str, macros)
|
|
490
|
+
is_duckdb = False
|
|
491
|
+
|
|
492
|
+
compiled_query = mq.SqlModelQuery(compiled_query_str, is_duckdb)
|
|
493
|
+
return compiled_query
|
|
494
|
+
|
|
495
|
+
def compile(
|
|
496
|
+
self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
|
|
497
|
+
) -> None:
|
|
498
|
+
if self.compiled_query is not None:
|
|
499
|
+
return
|
|
500
|
+
else:
|
|
501
|
+
self.compiled_query = mq.WorkInProgress() # type: ignore
|
|
256
502
|
|
|
257
|
-
|
|
258
|
-
connections = self.conn_set.get_engines_as_dict()
|
|
503
|
+
start = time.time()
|
|
259
504
|
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
raise u.ConfigurationError(f'Model "{self.name}" must include model "{dependent_model_name}" as a dependency to use')
|
|
263
|
-
return pd.DataFrame(self.upstreams[dependent_model_name].result)
|
|
505
|
+
kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
|
|
506
|
+
self.compiled_query = self._compile_sql_model(kwargs)
|
|
264
507
|
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
508
|
+
self.logger.log_activity_time(f"compiling dbview model '{self.name}'", start)
|
|
509
|
+
|
|
510
|
+
async def _run_sql_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
511
|
+
assert self.compiled_query is not None
|
|
512
|
+
is_duckdb = self.compiled_query.is_duckdb
|
|
513
|
+
query = self.compiled_query.query
|
|
514
|
+
connection_name = self.model_config.get_connection()
|
|
268
515
|
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
516
|
+
def run_sql_query_on_connection(is_duckdb: bool, query: str, placeholders: dict) -> pl.DataFrame:
|
|
517
|
+
try:
|
|
518
|
+
if is_duckdb:
|
|
519
|
+
local_conn = conn.cursor()
|
|
520
|
+
try:
|
|
521
|
+
self.logger.info(f"Running duckdb query: {query}")
|
|
522
|
+
return local_conn.sql(query, params=placeholders).pl()
|
|
523
|
+
except duckdb.CatalogException as e:
|
|
524
|
+
raise InvalidInputError(61, f'Model "{self.name}" depends on static data models that cannot be found.')
|
|
525
|
+
except Exception as e:
|
|
526
|
+
raise RuntimeError(e)
|
|
527
|
+
finally:
|
|
528
|
+
local_conn.close()
|
|
529
|
+
else:
|
|
530
|
+
return self._run_sql_query_on_connection(connection_name, query, placeholders)
|
|
531
|
+
except RuntimeError as e:
|
|
532
|
+
raise FileExecutionError(f'Failed to run dbview sql model "{self.name}"', e)
|
|
533
|
+
|
|
534
|
+
result = await asyncio.to_thread(run_sql_query_on_connection, is_duckdb, query, placeholders)
|
|
535
|
+
self.result = result.lazy()
|
|
536
|
+
|
|
537
|
+
async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
538
|
+
start = time.time()
|
|
539
|
+
|
|
540
|
+
await self._run_sql_model(conn, placeholders)
|
|
541
|
+
|
|
542
|
+
self.logger.log_activity_time(f"running dbview model '{self.name}'", start)
|
|
543
|
+
|
|
544
|
+
await super().run_model(conn, placeholders)
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
@dataclass
|
|
548
|
+
class FederateModel(QueryModel):
|
|
549
|
+
model_config: mc.FederateModelConfig
|
|
550
|
+
query_file: mq.SqlQueryFile | mq.PyQueryFile
|
|
551
|
+
compiled_query: mq.SqlModelQuery | mq.PyModelQuery | None = field(default=None, init=False)
|
|
552
|
+
|
|
553
|
+
@property
|
|
554
|
+
def model_type(self) -> ModelType:
|
|
555
|
+
return ModelType.FEDERATE
|
|
556
|
+
|
|
557
|
+
def _get_compile_sql_model_args(
|
|
558
|
+
self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
|
|
559
|
+
) -> dict[str, Any]:
|
|
560
|
+
kwargs = self._get_compile_sql_model_args_from_ctx_args(ctx, ctx_args)
|
|
561
|
+
|
|
562
|
+
def ref(dependent_model_name: str) -> str:
|
|
563
|
+
dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
|
|
564
|
+
prefix = "venv." if isinstance(models_dict[dependent_model], (SourceModel, BuildModel)) else ""
|
|
565
|
+
return prefix + dependent_model
|
|
566
|
+
|
|
567
|
+
kwargs["ref"] = ref
|
|
568
|
+
return kwargs
|
|
569
|
+
|
|
570
|
+
def _compile_sql_model(
|
|
571
|
+
self, query_file: mq.SqlQueryFile, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
|
|
572
|
+
) -> mq.SqlModelQuery:
|
|
573
|
+
kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
|
|
574
|
+
compiled_query_str = self._get_compiled_sql_query_str(query_file.raw_query, kwargs)
|
|
575
|
+
compiled_query = mq.SqlModelQuery(compiled_query_str, is_duckdb=True)
|
|
576
|
+
return compiled_query
|
|
577
|
+
|
|
578
|
+
def _get_python_model_args(self, ctx: dict[str, Any], ctx_args: ContextArgs) -> ModelArgs:
|
|
579
|
+
dependencies = self.model_config.depends_on
|
|
580
|
+
connections = self.conn_set.get_connections_as_dict()
|
|
581
|
+
|
|
582
|
+
def run_external_sql(connection_name: str, sql_query: str) -> pl.DataFrame:
|
|
583
|
+
return self._run_sql_query_on_connection(connection_name, sql_query, ctx_args.placeholders)
|
|
584
|
+
|
|
585
|
+
conn_args = ConnectionsArgs(ctx_args.project_path, ctx_args.proj_vars, ctx_args.env_vars)
|
|
586
|
+
build_model_args = BuildModelArgs(conn_args, connections, dependencies, self._ref_for_python, run_external_sql)
|
|
587
|
+
return ModelArgs(ctx_args, build_model_args, ctx)
|
|
588
|
+
|
|
589
|
+
def _compile_python_model(
|
|
590
|
+
self, query_file: mq.PyQueryFile, ctx: dict[str, Any], ctx_args: ContextArgs
|
|
591
|
+
) -> mq.PyModelQuery:
|
|
592
|
+
sqrl_args = self._get_python_model_args(ctx, ctx_args)
|
|
274
593
|
|
|
275
|
-
def compiled_query():
|
|
594
|
+
def compiled_query() -> pl.LazyFrame | pd.DataFrame:
|
|
276
595
|
try:
|
|
277
|
-
|
|
278
|
-
raw_query: _RawPyQuery = self.query_file.raw_query
|
|
279
|
-
return raw_query.query(sqrl_args)
|
|
596
|
+
return query_file.raw_query(sqrl_args)
|
|
280
597
|
except Exception as e:
|
|
281
|
-
raise
|
|
598
|
+
raise FileExecutionError(f'Failed to run "{c.MAIN_FUNC}" function for python model "{self.name}"', e) from e
|
|
282
599
|
|
|
283
|
-
return PyModelQuery(compiled_query)
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
self, ctx: dict[str, Any], ctx_args: ContextArgs,
|
|
600
|
+
return mq.PyModelQuery(compiled_query)
|
|
601
|
+
|
|
602
|
+
def compile(
|
|
603
|
+
self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
|
|
287
604
|
) -> None:
|
|
288
605
|
if self.compiled_query is not None:
|
|
289
606
|
return
|
|
290
607
|
else:
|
|
291
|
-
self.compiled_query =
|
|
608
|
+
self.compiled_query = mq.WorkInProgress() # type: ignore
|
|
292
609
|
|
|
293
610
|
start = time.time()
|
|
294
611
|
|
|
295
|
-
if isinstance(self.query_file, SqlQueryFile):
|
|
296
|
-
compiled_query
|
|
297
|
-
elif isinstance(self.query_file, PyQueryFile):
|
|
298
|
-
compiled_query
|
|
612
|
+
if isinstance(self.query_file, mq.SqlQueryFile):
|
|
613
|
+
self.compiled_query = self._compile_sql_model(self.query_file, ctx, ctx_args, models_dict)
|
|
614
|
+
elif isinstance(self.query_file, mq.PyQueryFile):
|
|
615
|
+
self.compiled_query = self._compile_python_model(self.query_file, ctx, ctx_args)
|
|
299
616
|
else:
|
|
300
617
|
raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")
|
|
301
618
|
|
|
302
|
-
self.
|
|
303
|
-
self.wait_count = len(set(dependencies))
|
|
304
|
-
|
|
305
|
-
model_type = self.get_model_type().name.lower()
|
|
306
|
-
self.logger.log_activity_time(f"compiling {model_type} model '{self.name}'", start)
|
|
619
|
+
self.logger.log_activity_time(f"compiling federate model '{self.name}'", start)
|
|
307
620
|
|
|
308
621
|
if not recurse:
|
|
309
622
|
return
|
|
310
623
|
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
624
|
+
dependencies = self.model_config.depends_on
|
|
625
|
+
self.wait_count = len(dependencies)
|
|
626
|
+
|
|
627
|
+
for name in dependencies:
|
|
628
|
+
dep_model = models_dict[name]
|
|
314
629
|
self._add_upstream(dep_model)
|
|
315
|
-
|
|
316
|
-
coroutines.append(coro)
|
|
317
|
-
await asyncio.gather(*coroutines)
|
|
318
|
-
|
|
319
|
-
def get_terminal_nodes(self, depencency_path: set[str]) -> set[str]:
|
|
320
|
-
if self.confirmed_no_cycles:
|
|
321
|
-
return set()
|
|
322
|
-
|
|
323
|
-
if self.name in depencency_path:
|
|
324
|
-
raise u.ConfigurationError(f'Cycle found in model dependency graph')
|
|
630
|
+
dep_model.compile(ctx, ctx_args, models_dict, recurse)
|
|
325
631
|
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
new_path.add(self.name)
|
|
332
|
-
for dep_model in self.upstreams.values():
|
|
333
|
-
terminal_nodes_under_dep = dep_model.get_terminal_nodes(new_path)
|
|
334
|
-
terminal_nodes = terminal_nodes.union(terminal_nodes_under_dep)
|
|
335
|
-
|
|
336
|
-
self.confirmed_no_cycles = True
|
|
337
|
-
return terminal_nodes
|
|
632
|
+
async def _run_sql_model(self, compiled_query: mq.SqlModelQuery, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
633
|
+
local_conn = conn.cursor()
|
|
634
|
+
try:
|
|
635
|
+
self.register_all_upstream_python_df(local_conn)
|
|
636
|
+
query = compiled_query.query
|
|
338
637
|
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
query = self.compiled_query.query
|
|
638
|
+
def create_table(local_conn: duckdb.DuckDBPyConnection):
|
|
639
|
+
placeholer_exists = lambda key: re.search(r"\$" + key + r"(?!\w)", query)
|
|
640
|
+
existing_placeholders = {key: value for key, value in placeholders.items() if placeholer_exists(key)}
|
|
343
641
|
|
|
344
|
-
|
|
345
|
-
def run_sql_query():
|
|
346
|
-
try:
|
|
347
|
-
return self.conn_set.run_sql_query_from_conn_name(query, config.connection_name, placeholders)
|
|
348
|
-
except RuntimeError as e:
|
|
349
|
-
raise u.FileExecutionError(f'Failed to run dbview sql model "{self.name}"', e) from e
|
|
350
|
-
|
|
351
|
-
df = await asyncio.to_thread(run_sql_query)
|
|
352
|
-
await asyncio.to_thread(self._load_pandas_to_table, df, conn)
|
|
353
|
-
if self.needs_pandas or self.is_target:
|
|
354
|
-
self.result = df
|
|
355
|
-
elif self.query_file.model_type == ModelType.FEDERATE:
|
|
356
|
-
def create_table():
|
|
357
|
-
create_query = config.get_sql_for_create(self.name, query)
|
|
642
|
+
create_query = self.model_config.get_sql_for_create(self.name, query)
|
|
358
643
|
try:
|
|
359
|
-
return
|
|
644
|
+
return local_conn.execute(create_query, existing_placeholders)
|
|
645
|
+
except duckdb.CatalogException as e:
|
|
646
|
+
raise InvalidInputError(61, f'Model "{self.name}" depends on static data models that cannot be found.')
|
|
360
647
|
except Exception as e:
|
|
361
|
-
|
|
648
|
+
if self.name == "__fake_target":
|
|
649
|
+
raise InvalidInputError(204, f"Failed to run provided SQL query")
|
|
650
|
+
else:
|
|
651
|
+
raise FileExecutionError(f'Failed to run federate sql model "{self.name}"', e) from e
|
|
362
652
|
|
|
363
|
-
await asyncio.to_thread(create_table)
|
|
364
|
-
if self.
|
|
365
|
-
self.result = await asyncio.to_thread(self.
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
653
|
+
await asyncio.to_thread(create_table, local_conn)
|
|
654
|
+
if self.needs_python_df or self.is_target:
|
|
655
|
+
self.result = await asyncio.to_thread(self._load_duckdb_view_to_python_df, local_conn)
|
|
656
|
+
finally:
|
|
657
|
+
local_conn.close()
|
|
658
|
+
|
|
659
|
+
async def _run_python_model(self, compiled_query: mq.PyModelQuery) -> None:
|
|
660
|
+
query_result = await asyncio.to_thread(compiled_query.query)
|
|
661
|
+
if isinstance(query_result, pd.DataFrame):
|
|
662
|
+
query_result = pl.from_pandas(query_result)
|
|
663
|
+
|
|
664
|
+
self.result = query_result.lazy()
|
|
369
665
|
|
|
370
|
-
|
|
371
|
-
if self.needs_sql_table:
|
|
372
|
-
await asyncio.to_thread(self._load_pandas_to_table, df, conn)
|
|
373
|
-
if self.needs_pandas or self.is_target:
|
|
374
|
-
self.result = df
|
|
375
|
-
|
|
376
|
-
async def run_model(self, conn: Connection, placeholders: dict = {}) -> None:
|
|
666
|
+
async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
377
667
|
start = time.time()
|
|
378
668
|
|
|
379
|
-
if isinstance(self.
|
|
380
|
-
await self._run_sql_model(conn, placeholders)
|
|
381
|
-
elif isinstance(self.
|
|
382
|
-
await self._run_python_model(
|
|
669
|
+
if isinstance(self.compiled_query, mq.SqlModelQuery):
|
|
670
|
+
await self._run_sql_model(self.compiled_query, conn, placeholders)
|
|
671
|
+
elif isinstance(self.compiled_query, mq.PyModelQuery):
|
|
672
|
+
await self._run_python_model(self.compiled_query)
|
|
383
673
|
else:
|
|
384
674
|
raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")
|
|
385
675
|
|
|
386
|
-
|
|
387
|
-
self.logger.log_activity_time(f"running {model_type} model '{self.name}'", start)
|
|
676
|
+
self.logger.log_activity_time(f"running federate model '{self.name}'", start)
|
|
388
677
|
|
|
389
678
|
await super().run_model(conn, placeholders)
|
|
390
679
|
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
680
|
+
|
|
681
|
+
@dataclass
|
|
682
|
+
class BuildModel(StaticModel, QueryModel):
|
|
683
|
+
model_config: mc.BuildModelConfig
|
|
684
|
+
query_file: mq.SqlQueryFile | mq.PyQueryFile
|
|
685
|
+
compiled_query: mq.SqlModelQuery | mq.PyModelQuery | None = field(default=None, init=False)
|
|
686
|
+
|
|
687
|
+
@property
|
|
688
|
+
def model_type(self) -> ModelType:
|
|
689
|
+
return ModelType.BUILD
|
|
690
|
+
|
|
691
|
+
def _add_upstream_for_build(self, other: StaticModel) -> None:
|
|
692
|
+
self.upstreams_for_build[other.name] = other
|
|
693
|
+
other.downstreams_for_build[self.name] = self
|
|
694
|
+
|
|
695
|
+
if isinstance(self.query_file, mq.PyQueryFile):
|
|
696
|
+
other.needs_python_df_for_build = True
|
|
697
|
+
|
|
698
|
+
def _get_compile_sql_model_args(
|
|
699
|
+
self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
|
|
700
|
+
) -> dict[str, Any]:
|
|
701
|
+
kwargs: dict[str, Any] = {
|
|
702
|
+
"proj_vars": conn_args.proj_vars, "env_vars": conn_args.env_vars
|
|
703
|
+
}
|
|
704
|
+
|
|
705
|
+
def ref_for_build(dependent_model_name: str) -> str:
|
|
706
|
+
dependent_model = self._ref_for_sql(dependent_model_name, dict(models_dict))
|
|
707
|
+
return dependent_model
|
|
708
|
+
|
|
709
|
+
kwargs["ref"] = ref_for_build
|
|
710
|
+
return kwargs
|
|
711
|
+
|
|
712
|
+
def _compile_sql_model(
|
|
713
|
+
self, query_file: mq.SqlQueryFile, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
|
|
714
|
+
) -> mq.SqlModelQuery:
|
|
715
|
+
kwargs = self._get_compile_sql_model_args(conn_args, models_dict)
|
|
716
|
+
compiled_query_str = self._get_compiled_sql_query_str(query_file.raw_query, kwargs)
|
|
717
|
+
compiled_query = mq.SqlModelQuery(compiled_query_str, is_duckdb=True)
|
|
718
|
+
return compiled_query
|
|
719
|
+
|
|
720
|
+
def _ref_for_python(self, dependent_model_name: str) -> pl.LazyFrame:
|
|
721
|
+
if dependent_model_name not in self.upstreams_for_build:
|
|
722
|
+
raise u.ConfigurationError(f'Model "{self.name}" must include model "{dependent_model_name}" as a dependency to use')
|
|
723
|
+
df = self.upstreams_for_build[dependent_model_name].result
|
|
724
|
+
assert df is not None
|
|
725
|
+
return df
|
|
726
|
+
|
|
727
|
+
def _get_compile_python_model_args(self, conn_args: ConnectionsArgs) -> BuildModelArgs:
|
|
728
|
+
|
|
729
|
+
def run_external_sql(connection_name: str, sql_query: str):
|
|
730
|
+
return self._run_sql_query_on_connection(connection_name, sql_query)
|
|
731
|
+
|
|
732
|
+
return BuildModelArgs(
|
|
733
|
+
conn_args, self.conn_set.get_connections_as_dict(), self.model_config.depends_on, self._ref_for_python, run_external_sql
|
|
734
|
+
)
|
|
735
|
+
|
|
736
|
+
def _compile_python_model(
|
|
737
|
+
self, query_file: mq.PyQueryFile, conn_args: ConnectionsArgs
|
|
738
|
+
) -> mq.PyModelQuery:
|
|
739
|
+
sqrl_args = self._get_compile_python_model_args(conn_args)
|
|
740
|
+
|
|
741
|
+
def compiled_query() -> pl.LazyFrame | pd.DataFrame:
|
|
742
|
+
try:
|
|
743
|
+
return query_file.raw_query(sqrl_args)
|
|
744
|
+
except Exception as e:
|
|
745
|
+
raise FileExecutionError(f'Failed to run "{c.MAIN_FUNC}" function for build model "{self.name}"', e)
|
|
746
|
+
|
|
747
|
+
return mq.PyModelQuery(compiled_query)
|
|
748
|
+
|
|
749
|
+
def compile_for_build(self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]) -> None:
|
|
750
|
+
start = time.time()
|
|
751
|
+
|
|
752
|
+
if isinstance(self.query_file, mq.SqlQueryFile):
|
|
753
|
+
self.compiled_query = self._compile_sql_model(self.query_file, conn_args, models_dict)
|
|
754
|
+
elif isinstance(self.query_file, mq.PyQueryFile):
|
|
755
|
+
self.compiled_query = self._compile_python_model(self.query_file, conn_args)
|
|
756
|
+
else:
|
|
757
|
+
raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")
|
|
758
|
+
|
|
759
|
+
self.logger.log_activity_time(f"compiling build model '{self.name}'", start)
|
|
760
|
+
|
|
761
|
+
dependencies = self.model_config.depends_on
|
|
762
|
+
self.wait_count_for_build = len(dependencies)
|
|
763
|
+
|
|
764
|
+
for name in dependencies:
|
|
765
|
+
dep_model = models_dict[name]
|
|
766
|
+
self._add_upstream_for_build(dep_model)
|
|
767
|
+
|
|
768
|
+
async def _build_sql_model(self, compiled_query: mq.SqlModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
|
|
769
|
+
query = compiled_query.query
|
|
770
|
+
|
|
771
|
+
def create_table():
|
|
772
|
+
create_query = self.model_config.get_sql_for_build(self.name, query)
|
|
773
|
+
local_conn = conn.cursor()
|
|
774
|
+
try:
|
|
775
|
+
return u.run_duckdb_stmt(self.logger, local_conn, create_query)
|
|
776
|
+
except Exception as e:
|
|
777
|
+
raise FileExecutionError(f'Failed to build static sql model "{self.name}"', e) from e
|
|
778
|
+
finally:
|
|
779
|
+
local_conn.close()
|
|
780
|
+
|
|
781
|
+
await asyncio.to_thread(create_table)
|
|
782
|
+
|
|
783
|
+
async def _build_python_model(self, compiled_query: mq.PyModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
|
|
784
|
+
query_result = await asyncio.to_thread(compiled_query.query)
|
|
785
|
+
if isinstance(query_result, pd.DataFrame):
|
|
786
|
+
query_result = pl.from_pandas(query_result).lazy()
|
|
787
|
+
if self.needs_python_df_for_build:
|
|
788
|
+
self.result = query_result.lazy()
|
|
789
|
+
await asyncio.to_thread(self._create_table_from_df, conn, query_result)
|
|
790
|
+
|
|
791
|
+
async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
792
|
+
start = time.time()
|
|
793
|
+
print(f"[{u.get_current_time()}] 🔨 BUILDING: build model '{self.name}'")
|
|
794
|
+
|
|
795
|
+
if isinstance(self.compiled_query, mq.SqlModelQuery):
|
|
796
|
+
await self._build_sql_model(self.compiled_query, conn)
|
|
797
|
+
elif isinstance(self.compiled_query, mq.PyModelQuery):
|
|
798
|
+
# First ensure all upstream models have an associated Python dataframe
|
|
799
|
+
def load_df(conn: duckdb.DuckDBPyConnection, dep_model: DataModel):
|
|
800
|
+
if dep_model.result is None:
|
|
801
|
+
local_conn = conn.cursor()
|
|
802
|
+
try:
|
|
803
|
+
dep_model.result = dep_model._load_duckdb_view_to_python_df(local_conn)
|
|
804
|
+
finally:
|
|
805
|
+
local_conn.close()
|
|
806
|
+
|
|
807
|
+
coroutines = []
|
|
808
|
+
for dep_model in self.upstreams_for_build.values():
|
|
809
|
+
coro = asyncio.to_thread(load_df, conn, dep_model)
|
|
810
|
+
coroutines.append(coro)
|
|
811
|
+
await u.asyncio_gather(coroutines)
|
|
812
|
+
|
|
813
|
+
# Then run the model's Python function to build the model
|
|
814
|
+
await self._build_python_model(self.compiled_query, conn)
|
|
815
|
+
else:
|
|
816
|
+
raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")
|
|
817
|
+
|
|
818
|
+
print(f"[{u.get_current_time()}] ✅ FINISHED: build model '{self.name}'")
|
|
819
|
+
self.logger.log_activity_time(f"building static build model '{self.name}'", start)
|
|
820
|
+
|
|
821
|
+
await super().build_model(conn, full_refresh)
|
|
396
822
|
|
|
397
823
|
|
|
398
824
|
@dataclass
|
|
399
825
|
class DAG:
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
826
|
+
dataset: DatasetConfig | None
|
|
827
|
+
target_model: DataModel
|
|
828
|
+
models_dict: dict[str, DataModel]
|
|
829
|
+
duckdb_filepath: str = field(default="")
|
|
404
830
|
logger: u.Logger = field(default_factory=lambda: u.Logger(""))
|
|
405
831
|
parameter_set: ParameterSet | None = field(default=None, init=False) # set in apply_selections
|
|
406
832
|
placeholders: dict[str, Any] = field(init=False, default_factory=dict)
|
|
407
833
|
|
|
834
|
+
def _get_msg_extension(self) -> str:
|
|
835
|
+
return f" for dataset '{self.dataset.name}'" if self.dataset else ""
|
|
836
|
+
|
|
837
|
+
def compile_build_models(self, conn_args: ConnectionsArgs) -> None:
|
|
838
|
+
static_models: dict[str, StaticModel] = {k: v for k, v in self.models_dict.items() if isinstance(v, StaticModel)}
|
|
839
|
+
for model in static_models.values():
|
|
840
|
+
if isinstance(model, BuildModel):
|
|
841
|
+
model.compile_for_build(conn_args, static_models)
|
|
842
|
+
|
|
408
843
|
def apply_selections(
|
|
409
|
-
self, param_cfg_set: ParameterConfigsSet, user:
|
|
844
|
+
self, param_cfg_set: ParameterConfigsSet, user: BaseUser | None, selections: dict[str, str]
|
|
410
845
|
) -> None:
|
|
411
846
|
start = time.time()
|
|
412
|
-
dataset_params = self.dataset.parameters
|
|
413
|
-
parameter_set = param_cfg_set.apply_selections(
|
|
414
|
-
dataset_params, selections, user, updates_only=updates_only, request_version=request_version
|
|
415
|
-
)
|
|
847
|
+
dataset_params = self.dataset.parameters if self.dataset else None
|
|
848
|
+
parameter_set = param_cfg_set.apply_selections(dataset_params, selections, user)
|
|
416
849
|
self.parameter_set = parameter_set
|
|
417
|
-
|
|
850
|
+
msg_extension = self._get_msg_extension()
|
|
851
|
+
self.logger.log_activity_time("applying selections" + msg_extension, start)
|
|
418
852
|
|
|
419
|
-
def _compile_context(
|
|
853
|
+
def _compile_context(
|
|
854
|
+
self, param_args: ParametersArgs, context_func: ContextFunc, user: BaseUser | None, default_traits: dict[str, Any]
|
|
855
|
+
) -> tuple[dict[str, Any], ContextArgs]:
|
|
420
856
|
start = time.time()
|
|
421
857
|
context = {}
|
|
422
858
|
assert isinstance(self.parameter_set, ParameterSet)
|
|
423
859
|
prms = self.parameter_set.get_parameters_as_dict()
|
|
424
|
-
|
|
860
|
+
traits = self.dataset.traits if self.dataset else default_traits
|
|
861
|
+
args = ContextArgs(param_args, user, prms, traits)
|
|
862
|
+
msg_extension = self._get_msg_extension()
|
|
425
863
|
try:
|
|
426
864
|
context_func(context, args)
|
|
427
865
|
except Exception as e:
|
|
428
|
-
raise
|
|
429
|
-
self.logger.log_activity_time(
|
|
866
|
+
raise FileExecutionError(f'Failed to run {c.CONTEXT_FILE}' + msg_extension, e) from e
|
|
867
|
+
self.logger.log_activity_time("running context.py" + msg_extension, start)
|
|
430
868
|
return context, args
|
|
431
869
|
|
|
432
|
-
|
|
433
|
-
|
|
870
|
+
def _compile_models(self, context: dict[str, Any], ctx_args: ContextArgs, recurse: bool) -> None:
|
|
871
|
+
self.target_model.compile(context, ctx_args, self.models_dict, recurse)
|
|
434
872
|
|
|
435
873
|
def _get_terminal_nodes(self) -> set[str]:
|
|
436
874
|
start = time.time()
|
|
@@ -440,39 +878,53 @@ class DAG:
|
|
|
440
878
|
self.logger.log_activity_time(f"validating no cycles in model dependencies", start)
|
|
441
879
|
return terminal_nodes
|
|
442
880
|
|
|
443
|
-
async def _run_models(self
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
881
|
+
async def _run_models(self) -> None:
|
|
882
|
+
terminal_nodes = self._get_terminal_nodes()
|
|
883
|
+
|
|
884
|
+
# create an empty duckdb venv file if it does not exist
|
|
885
|
+
try:
|
|
886
|
+
conn = duckdb.connect(self.duckdb_filepath)
|
|
887
|
+
conn.close()
|
|
888
|
+
except duckdb.IOException as e:
|
|
889
|
+
# unable to create duckdb venv file means it's in use and already exists
|
|
890
|
+
# do not throw error here since attaching in read-only mode later may still work
|
|
891
|
+
pass
|
|
447
892
|
|
|
448
|
-
|
|
893
|
+
conn = u.create_duckdb_connection()
|
|
894
|
+
try:
|
|
895
|
+
read_only = "(READ_ONLY)" if self.duckdb_filepath else ""
|
|
896
|
+
try:
|
|
897
|
+
conn.execute(f"ATTACH '{self.duckdb_filepath}' AS venv {read_only}")
|
|
898
|
+
except duckdb.IOException as e:
|
|
899
|
+
self.logger.warn(f"Unable to attach to duckdb venv file: {self.duckdb_filepath}")
|
|
900
|
+
raise e
|
|
901
|
+
|
|
449
902
|
coroutines = []
|
|
450
903
|
for model_name in terminal_nodes:
|
|
451
|
-
model = self.models_dict[model_name]
|
|
452
|
-
coroutines.append(model.run_model(conn, placeholders))
|
|
453
|
-
await
|
|
454
|
-
|
|
455
|
-
|
|
904
|
+
model = self.models_dict[model_name] if model_name != "__fake_target" else self.target_model
|
|
905
|
+
coroutines.append(model.run_model(conn, self.placeholders))
|
|
906
|
+
await u.asyncio_gather(coroutines)
|
|
907
|
+
|
|
908
|
+
finally:
|
|
909
|
+
conn.close()
|
|
456
910
|
|
|
457
911
|
async def execute(
|
|
458
|
-
self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user:
|
|
459
|
-
*,
|
|
460
|
-
) ->
|
|
912
|
+
self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user: BaseUser | None, selections: dict[str, str],
|
|
913
|
+
*, runquery: bool = True, recurse: bool = True, default_traits: dict[str, Any] = {}
|
|
914
|
+
) -> None:
|
|
461
915
|
recurse = (recurse or runquery)
|
|
462
916
|
|
|
463
|
-
self.apply_selections(param_cfg_set, user, selections
|
|
917
|
+
self.apply_selections(param_cfg_set, user, selections)
|
|
464
918
|
|
|
465
|
-
context, ctx_args = self._compile_context(param_args, context_func, user)
|
|
919
|
+
context, ctx_args = self._compile_context(param_args, context_func, user, default_traits)
|
|
466
920
|
|
|
467
|
-
|
|
921
|
+
self._compile_models(context, ctx_args, recurse)
|
|
468
922
|
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
placeholders = ctx_args._placeholders.copy()
|
|
923
|
+
self.placeholders = ctx_args.placeholders
|
|
472
924
|
if runquery:
|
|
473
|
-
await self._run_models(
|
|
474
|
-
|
|
475
|
-
|
|
925
|
+
await self._run_models()
|
|
926
|
+
|
|
927
|
+
self.target_model.process_pass_through_columns(self.models_dict)
|
|
476
928
|
|
|
477
929
|
def get_all_query_models(self) -> set[str]:
|
|
478
930
|
all_model_names = set()
|
|
@@ -483,10 +935,9 @@ class DAG:
|
|
|
483
935
|
G = nx.DiGraph()
|
|
484
936
|
|
|
485
937
|
for model_name, model in self.models_dict.items():
|
|
486
|
-
model_type = model.get_model_type()
|
|
487
938
|
level = model.get_max_path_length_to_target()
|
|
488
939
|
if level is not None:
|
|
489
|
-
G.add_node(model_name, layer=-level, model_type=model_type)
|
|
940
|
+
G.add_node(model_name, layer=-level, model_type=model.model_type)
|
|
490
941
|
|
|
491
942
|
for model_name in G.nodes:
|
|
492
943
|
model = self.models_dict[model_name]
|
|
@@ -494,46 +945,101 @@ class DAG:
|
|
|
494
945
|
G.add_edge(model_name, dep_model_name)
|
|
495
946
|
|
|
496
947
|
return G
|
|
948
|
+
|
|
949
|
+
def get_all_data_models(self) -> list[arm.DataModelItem]:
|
|
950
|
+
data_models = []
|
|
951
|
+
for model_name, model in self.models_dict.items():
|
|
952
|
+
is_queryable = model.is_queryable
|
|
953
|
+
data_model = arm.DataModelItem(name=model_name, model_type=model.model_type.value, config=model.model_config, is_queryable=is_queryable)
|
|
954
|
+
data_models.append(data_model)
|
|
955
|
+
return data_models
|
|
956
|
+
|
|
957
|
+
def get_all_model_lineage(self) -> list[arm.LineageRelation]:
|
|
958
|
+
model_lineage = []
|
|
959
|
+
for model_name, model in self.models_dict.items():
|
|
960
|
+
if not isinstance(model, QueryModel):
|
|
961
|
+
continue
|
|
962
|
+
for dep_model_name in model.model_config.depends_on:
|
|
963
|
+
edge_type = "buildtime" if isinstance(model, BuildModel) else "runtime"
|
|
964
|
+
source_model = arm.LineageNode(name=dep_model_name, type="model")
|
|
965
|
+
target_model = arm.LineageNode(name=model_name, type="model")
|
|
966
|
+
model_lineage.append(arm.LineageRelation(type=edge_type, source=source_model, target=target_model))
|
|
967
|
+
return model_lineage
|
|
497
968
|
|
|
498
969
|
|
|
499
970
|
class ModelsIO:
|
|
500
971
|
|
|
501
972
|
@classmethod
|
|
502
|
-
def
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
973
|
+
def _load_model_config(cls, filepath: Path, model_type: ModelType, env_vars: dict[str, str]) -> mc.ModelConfig:
|
|
974
|
+
yaml_path = filepath.with_suffix('.yml')
|
|
975
|
+
config_dict = u.load_yaml_config(yaml_path) if yaml_path.exists() else {}
|
|
976
|
+
|
|
977
|
+
if model_type == ModelType.DBVIEW:
|
|
978
|
+
config = mc.DbviewModelConfig(**config_dict).finalize_connection(env_vars)
|
|
979
|
+
return config
|
|
980
|
+
elif model_type == ModelType.FEDERATE:
|
|
981
|
+
return mc.FederateModelConfig(**config_dict)
|
|
982
|
+
elif model_type == ModelType.BUILD:
|
|
983
|
+
return mc.BuildModelConfig(**config_dict)
|
|
984
|
+
else:
|
|
985
|
+
return mc.ModelConfig(**config_dict)
|
|
986
|
+
|
|
987
|
+
@classmethod
|
|
988
|
+
def _populate_from_file(
|
|
989
|
+
cls, raw_queries_by_model: dict[str, mq.QueryFileWithConfig], dp: str, file: str, model_type: ModelType, env_vars: dict[str, str]
|
|
990
|
+
) -> None:
|
|
991
|
+
filepath = Path(dp, file)
|
|
992
|
+
file_stem, extension = os.path.splitext(file)
|
|
993
|
+
|
|
994
|
+
if extension == '.py':
|
|
995
|
+
module = pm.PyModule(filepath)
|
|
996
|
+
raw_query = module.get_func_or_class(c.MAIN_FUNC)
|
|
997
|
+
query_file = mq.PyQueryFile(filepath.as_posix(), raw_query)
|
|
998
|
+
elif extension == '.sql':
|
|
999
|
+
query_file = mq.SqlQueryFile(filepath.as_posix(), filepath.read_text())
|
|
1000
|
+
else:
|
|
1001
|
+
return # Skip files that are not query files
|
|
529
1002
|
|
|
1003
|
+
if file_stem in raw_queries_by_model:
|
|
1004
|
+
assert isinstance(prior_query_file := raw_queries_by_model[file_stem].query_file, mq.QueryFile)
|
|
1005
|
+
conflicts = [prior_query_file.filepath, query_file.filepath]
|
|
1006
|
+
raise u.ConfigurationError(f"Multiple models found for '{file_stem}': {conflicts}")
|
|
1007
|
+
|
|
1008
|
+
model_config = cls._load_model_config(filepath, model_type, env_vars)
|
|
1009
|
+
raw_queries_by_model[file_stem] = mq.QueryFileWithConfig(query_file, model_config)
|
|
1010
|
+
|
|
1011
|
+
@classmethod
def _populate_raw_queries_for_type(
    cls, folder_path: Path, model_type: ModelType, *, env_vars: dict[str, str] | None = None
) -> dict[str, mq.QueryFileWithConfig]:
    """
    Recursively scan folder_path and collect every model query file of the given type.

    Args:
        folder_path: Root directory to walk for model files.
        model_type: The kind of model expected in this folder.
        env_vars: Environment variables used when finalizing dbview configs.
            Treated as an empty mapping when not provided.

    Returns:
        Mapping of model name (file stem) to its query file and config.
    """
    # Fix: the previous signature used a mutable default argument (env_vars={}),
    # which is shared across calls and easy to mutate accidentally downstream.
    if env_vars is None:
        env_vars = {}
    raw_queries_by_model: dict[str, mq.QueryFileWithConfig] = {}
    for dirpath, _, filenames in os.walk(folder_path):
        for filename in filenames:
            cls._populate_from_file(raw_queries_by_model, dirpath, filename, model_type, env_vars)
    return raw_queries_by_model
@classmethod
def load_build_files(cls, logger: u.Logger, base_path: str) -> dict[str, mq.QueryFileWithConfig]:
    """Collect all build model query files under the project's builds folder, logging elapsed time."""
    timer_start = time.time()
    folder = u.Path(base_path, c.MODELS_FOLDER, c.BUILDS_FOLDER)
    result = cls._populate_raw_queries_for_type(folder, ModelType.BUILD)
    logger.log_activity_time("loading build files", timer_start)
    return result
@classmethod
def load_dbview_files(cls, logger: u.Logger, base_path: str, env_vars: dict[str, str]) -> dict[str, mq.QueryFileWithConfig]:
    """Collect all dbview model query files under the project's dbviews folder, logging elapsed time."""
    timer_start = time.time()
    folder = u.Path(base_path, c.MODELS_FOLDER, c.DBVIEWS_FOLDER)
    result = cls._populate_raw_queries_for_type(folder, ModelType.DBVIEW, env_vars=env_vars)
    logger.log_activity_time("loading dbview files", timer_start)
    return result
@classmethod
def load_federate_files(cls, logger: u.Logger, base_path: str) -> dict[str, mq.QueryFileWithConfig]:
    """Collect all federate model query files under the project's federates folder, logging elapsed time."""
    timer_start = time.time()
    folder = u.Path(base_path, c.MODELS_FOLDER, c.FEDERATES_FOLDER)
    result = cls._populate_raw_queries_for_type(folder, ModelType.FEDERATE)
    logger.log_activity_time("loading federate files", timer_start)
    return result
@classmethod
|