squirrels 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of squirrels might be problematic. Click here for more details.
- dateutils/__init__.py +6 -0
- dateutils/_enums.py +25 -0
- squirrels/dateutils.py → dateutils/_implementation.py +58 -111
- dateutils/types.py +6 -0
- squirrels/__init__.py +13 -11
- squirrels/_api_routes/__init__.py +5 -0
- squirrels/_api_routes/auth.py +271 -0
- squirrels/_api_routes/base.py +165 -0
- squirrels/_api_routes/dashboards.py +150 -0
- squirrels/_api_routes/data_management.py +145 -0
- squirrels/_api_routes/datasets.py +257 -0
- squirrels/_api_routes/oauth2.py +298 -0
- squirrels/_api_routes/project.py +252 -0
- squirrels/_api_server.py +256 -450
- squirrels/_arguments/__init__.py +0 -0
- squirrels/_arguments/init_time_args.py +108 -0
- squirrels/_arguments/run_time_args.py +147 -0
- squirrels/_auth.py +960 -0
- squirrels/_command_line.py +126 -45
- squirrels/_compile_prompts.py +147 -0
- squirrels/_connection_set.py +48 -26
- squirrels/_constants.py +68 -38
- squirrels/_dashboards.py +160 -0
- squirrels/_data_sources.py +570 -0
- squirrels/_dataset_types.py +84 -0
- squirrels/_exceptions.py +29 -0
- squirrels/_initializer.py +177 -80
- squirrels/_logging.py +115 -0
- squirrels/_manifest.py +208 -79
- squirrels/_model_builder.py +69 -0
- squirrels/_model_configs.py +74 -0
- squirrels/_model_queries.py +52 -0
- squirrels/_models.py +926 -367
- squirrels/_package_data/base_project/.env +42 -0
- squirrels/_package_data/base_project/.env.example +42 -0
- squirrels/_package_data/base_project/assets/expenses.db +0 -0
- squirrels/_package_data/base_project/connections.yml +16 -0
- squirrels/_package_data/base_project/dashboards/dashboard_example.py +34 -0
- squirrels/_package_data/base_project/dashboards/dashboard_example.yml +22 -0
- squirrels/{package_data → _package_data}/base_project/docker/.dockerignore +5 -2
- squirrels/{package_data → _package_data}/base_project/docker/Dockerfile +3 -3
- squirrels/{package_data → _package_data}/base_project/docker/compose.yml +1 -1
- squirrels/_package_data/base_project/duckdb_init.sql +10 -0
- squirrels/{package_data/base_project/.gitignore → _package_data/base_project/gitignore} +3 -2
- squirrels/_package_data/base_project/macros/macros_example.sql +17 -0
- squirrels/_package_data/base_project/models/builds/build_example.py +26 -0
- squirrels/_package_data/base_project/models/builds/build_example.sql +16 -0
- squirrels/_package_data/base_project/models/builds/build_example.yml +57 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.sql +12 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.yml +26 -0
- squirrels/_package_data/base_project/models/federates/federate_example.py +37 -0
- squirrels/_package_data/base_project/models/federates/federate_example.sql +19 -0
- squirrels/_package_data/base_project/models/federates/federate_example.yml +65 -0
- squirrels/_package_data/base_project/models/sources.yml +38 -0
- squirrels/{package_data → _package_data}/base_project/parameters.yml +56 -40
- squirrels/_package_data/base_project/pyconfigs/connections.py +14 -0
- squirrels/{package_data → _package_data}/base_project/pyconfigs/context.py +21 -40
- squirrels/_package_data/base_project/pyconfigs/parameters.py +141 -0
- squirrels/_package_data/base_project/pyconfigs/user.py +44 -0
- squirrels/_package_data/base_project/seeds/seed_categories.yml +15 -0
- squirrels/_package_data/base_project/seeds/seed_subcategories.csv +15 -0
- squirrels/_package_data/base_project/seeds/seed_subcategories.yml +21 -0
- squirrels/_package_data/base_project/squirrels.yml.j2 +61 -0
- squirrels/_package_data/templates/dataset_results.html +112 -0
- squirrels/_package_data/templates/oauth_login.html +271 -0
- squirrels/_package_data/templates/squirrels_studio.html +20 -0
- squirrels/_package_loader.py +8 -4
- squirrels/_parameter_configs.py +104 -103
- squirrels/_parameter_options.py +348 -0
- squirrels/_parameter_sets.py +57 -47
- squirrels/_parameters.py +1664 -0
- squirrels/_project.py +721 -0
- squirrels/_py_module.py +7 -5
- squirrels/_schemas/__init__.py +0 -0
- squirrels/_schemas/auth_models.py +167 -0
- squirrels/_schemas/query_param_models.py +75 -0
- squirrels/{_api_response_models.py → _schemas/response_models.py} +126 -47
- squirrels/_seeds.py +35 -16
- squirrels/_sources.py +110 -0
- squirrels/_utils.py +248 -73
- squirrels/_version.py +1 -1
- squirrels/arguments.py +7 -0
- squirrels/auth.py +4 -0
- squirrels/connections.py +3 -0
- squirrels/dashboards.py +2 -81
- squirrels/data_sources.py +14 -631
- squirrels/parameter_options.py +13 -348
- squirrels/parameters.py +14 -1266
- squirrels/types.py +16 -0
- squirrels-0.5.0.dist-info/METADATA +113 -0
- squirrels-0.5.0.dist-info/RECORD +97 -0
- {squirrels-0.4.0.dist-info → squirrels-0.5.0.dist-info}/WHEEL +1 -1
- squirrels-0.5.0.dist-info/entry_points.txt +3 -0
- {squirrels-0.4.0.dist-info → squirrels-0.5.0.dist-info/licenses}/LICENSE +1 -1
- squirrels/_authenticator.py +0 -85
- squirrels/_dashboards_io.py +0 -61
- squirrels/_environcfg.py +0 -84
- squirrels/arguments/init_time_args.py +0 -40
- squirrels/arguments/run_time_args.py +0 -208
- squirrels/package_data/assets/favicon.ico +0 -0
- squirrels/package_data/assets/index.css +0 -1
- squirrels/package_data/assets/index.js +0 -58
- squirrels/package_data/base_project/assets/expenses.db +0 -0
- squirrels/package_data/base_project/connections.yml +0 -7
- squirrels/package_data/base_project/dashboards/dashboard_example.py +0 -32
- squirrels/package_data/base_project/dashboards.yml +0 -10
- squirrels/package_data/base_project/env.yml +0 -29
- squirrels/package_data/base_project/models/dbviews/dbview_example.py +0 -47
- squirrels/package_data/base_project/models/dbviews/dbview_example.sql +0 -22
- squirrels/package_data/base_project/models/federates/federate_example.py +0 -21
- squirrels/package_data/base_project/models/federates/federate_example.sql +0 -3
- squirrels/package_data/base_project/pyconfigs/auth.py +0 -45
- squirrels/package_data/base_project/pyconfigs/connections.py +0 -19
- squirrels/package_data/base_project/pyconfigs/parameters.py +0 -95
- squirrels/package_data/base_project/seeds/seed_subcategories.csv +0 -15
- squirrels/package_data/base_project/squirrels.yml.j2 +0 -94
- squirrels/package_data/templates/index.html +0 -18
- squirrels/project.py +0 -378
- squirrels/user_base.py +0 -55
- squirrels-0.4.0.dist-info/METADATA +0 -117
- squirrels-0.4.0.dist-info/RECORD +0 -60
- squirrels-0.4.0.dist-info/entry_points.txt +0 -4
- /squirrels/{package_data → _package_data}/base_project/assets/weather.db +0 -0
- /squirrels/{package_data → _package_data}/base_project/seeds/seed_categories.csv +0 -0
- /squirrels/{package_data → _package_data}/base_project/tmp/.gitignore +0 -0
squirrels/_models.py
CHANGED
|
@@ -1,147 +1,121 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
-
from typing import
|
|
3
|
-
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Callable, Any
|
|
3
|
+
from dataclasses import dataclass, field, KW_ONLY
|
|
4
4
|
from abc import ABCMeta, abstractmethod
|
|
5
5
|
from enum import Enum
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
|
|
8
|
-
import
|
|
9
|
-
|
|
10
|
-
from . import _constants as c, _utils as u, _py_module as pm
|
|
11
|
-
from .
|
|
12
|
-
from .
|
|
13
|
-
from .
|
|
14
|
-
from .
|
|
7
|
+
import asyncio, os, re, time, duckdb, sqlglot
|
|
8
|
+
import polars as pl, pandas as pd
|
|
9
|
+
|
|
10
|
+
from . import _constants as c, _utils as u, _py_module as pm, _model_queries as mq, _model_configs as mc, _sources as src
|
|
11
|
+
from ._schemas import response_models as rm
|
|
12
|
+
from ._exceptions import FileExecutionError, InvalidInputError
|
|
13
|
+
from ._arguments.run_time_args import ContextArgs, ModelArgs, BuildModelArgs
|
|
14
|
+
from ._auth import AbstractUser
|
|
15
|
+
from ._connection_set import ConnectionsArgs, ConnectionSet, ConnectionProperties
|
|
16
|
+
from ._manifest import DatasetConfig, ConnectionTypeEnum
|
|
15
17
|
from ._parameter_sets import ParameterConfigsSet, ParametersArgs, ParameterSet
|
|
16
18
|
|
|
17
19
|
ContextFunc = Callable[[dict[str, Any], ContextArgs], None]
|
|
18
20
|
|
|
19
21
|
|
|
20
22
|
class ModelType(Enum):
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
TABLE = 0
|
|
27
|
-
VIEW = 1
|
|
23
|
+
SEED = "seed"
|
|
24
|
+
SOURCE = "source"
|
|
25
|
+
BUILD = "build"
|
|
26
|
+
DBVIEW = "dbview"
|
|
27
|
+
FEDERATE = "federate"
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
@dataclass
|
|
31
|
-
class
|
|
32
|
-
## Applicable for dbview models
|
|
33
|
-
connection_name: str
|
|
34
|
-
|
|
35
|
-
## Applicable for federated models
|
|
36
|
-
materialized: _Materialization
|
|
37
|
-
|
|
38
|
-
def set_attribute(self, *, connection_name: str | None = None, materialized: str | None = None, **kwargs) -> str:
|
|
39
|
-
if connection_name is not None:
|
|
40
|
-
if not isinstance(connection_name, str):
|
|
41
|
-
raise u.ConfigurationError("The 'connection_name' argument of 'config' macro must be a string")
|
|
42
|
-
self.connection_name = connection_name
|
|
43
|
-
|
|
44
|
-
if materialized is not None:
|
|
45
|
-
if not isinstance(materialized, str):
|
|
46
|
-
raise u.ConfigurationError("The 'materialized' argument of 'config' macro must be a string")
|
|
47
|
-
try:
|
|
48
|
-
self.materialized = _Materialization[materialized.upper()]
|
|
49
|
-
except KeyError as e:
|
|
50
|
-
valid_options = [x.name for x in _Materialization]
|
|
51
|
-
raise u.ConfigurationError(f"The 'materialized' argument value '{materialized}' is not valid. Must be one of: {valid_options}") from e
|
|
52
|
-
return ""
|
|
53
|
-
|
|
54
|
-
def get_sql_for_create(self, model_name: str, select_query: str) -> str:
|
|
55
|
-
create_prefix = f"CREATE {self.materialized.name} {model_name} AS\n"
|
|
56
|
-
return create_prefix + select_query
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
@dataclass(frozen=True)
|
|
60
|
-
class QueryFile:
|
|
61
|
-
filepath: str
|
|
62
|
-
model_type: ModelType
|
|
63
|
-
|
|
64
|
-
@dataclass(frozen=True)
|
|
65
|
-
class SqlQueryFile(QueryFile):
|
|
66
|
-
raw_query: str
|
|
67
|
-
|
|
68
|
-
@dataclass(frozen=True)
|
|
69
|
-
class _RawPyQuery:
|
|
70
|
-
query: Callable[[ModelArgs], pd.DataFrame]
|
|
71
|
-
dependencies_func: Callable[[ModelDepsArgs], Iterable[str]]
|
|
72
|
-
|
|
73
|
-
@dataclass(frozen=True)
|
|
74
|
-
class PyQueryFile(QueryFile):
|
|
75
|
-
raw_query: _RawPyQuery
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
@dataclass
|
|
79
|
-
class _Query(metaclass=ABCMeta):
|
|
80
|
-
query: Any
|
|
81
|
-
|
|
82
|
-
@dataclass
|
|
83
|
-
class _WorkInProgress(_Query):
|
|
84
|
-
query: None = field(default=None, init=False)
|
|
85
|
-
|
|
86
|
-
@dataclass
|
|
87
|
-
class SqlModelQuery(_Query):
|
|
88
|
-
query: str
|
|
89
|
-
config: _SqlModelConfig
|
|
90
|
-
|
|
91
|
-
@dataclass
|
|
92
|
-
class PyModelQuery(_Query):
|
|
93
|
-
query: Callable[[], pd.DataFrame]
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
@dataclass
|
|
97
|
-
class Referable(metaclass=ABCMeta):
|
|
31
|
+
class DataModel(metaclass=ABCMeta):
|
|
98
32
|
name: str
|
|
33
|
+
model_config: mc.ModelConfig
|
|
99
34
|
is_target: bool = field(default=False, init=False)
|
|
100
35
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
result: pd.DataFrame | None = field(default=None, init=False, repr=False)
|
|
36
|
+
result: pl.LazyFrame | None = field(default=None, init=False, repr=False)
|
|
37
|
+
needs_python_df: bool = field(default=False, init=False)
|
|
104
38
|
|
|
105
39
|
wait_count: int = field(default=0, init=False, repr=False)
|
|
106
40
|
confirmed_no_cycles: bool = field(default=False, init=False)
|
|
107
|
-
upstreams: dict[str,
|
|
108
|
-
downstreams: dict[str,
|
|
41
|
+
upstreams: dict[str, DataModel] = field(default_factory=dict, init=False, repr=False)
|
|
42
|
+
downstreams: dict[str, DataModel] = field(default_factory=dict, init=False, repr=False)
|
|
43
|
+
|
|
44
|
+
_: KW_ONLY
|
|
45
|
+
logger: u.Logger = field(default_factory=lambda: u.Logger(""))
|
|
46
|
+
env_vars: dict[str, str] = field(default_factory=dict)
|
|
47
|
+
conn_set: ConnectionSet = field(default_factory=ConnectionSet)
|
|
109
48
|
|
|
49
|
+
@property
|
|
110
50
|
@abstractmethod
|
|
111
|
-
def
|
|
51
|
+
def model_type(self) -> ModelType:
|
|
112
52
|
pass
|
|
113
53
|
|
|
114
|
-
|
|
115
|
-
|
|
54
|
+
@property
|
|
55
|
+
def is_queryable(self) -> bool:
|
|
56
|
+
return True
|
|
57
|
+
|
|
58
|
+
def compile(
|
|
59
|
+
self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
|
|
116
60
|
) -> None:
|
|
117
61
|
pass
|
|
118
62
|
|
|
119
|
-
@abstractmethod
|
|
120
63
|
def get_terminal_nodes(self, depencency_path: set[str]) -> set[str]:
|
|
121
|
-
|
|
64
|
+
if self.confirmed_no_cycles:
|
|
65
|
+
return set()
|
|
122
66
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
67
|
+
if self.name in depencency_path:
|
|
68
|
+
raise u.ConfigurationError(f'Cycle found in model dependency graph')
|
|
69
|
+
|
|
70
|
+
terminal_nodes = set()
|
|
71
|
+
if len(self.upstreams) == 0:
|
|
72
|
+
terminal_nodes.add(self.name)
|
|
73
|
+
else:
|
|
74
|
+
new_path = set(depencency_path)
|
|
75
|
+
new_path.add(self.name)
|
|
76
|
+
for dep_model in self.upstreams.values():
|
|
77
|
+
terminal_nodes.update(dep_model.get_terminal_nodes(new_path))
|
|
78
|
+
|
|
79
|
+
self.confirmed_no_cycles = True
|
|
80
|
+
return terminal_nodes
|
|
81
|
+
|
|
82
|
+
def _load_duckdb_view_to_python_df(self, conn: duckdb.DuckDBPyConnection, *, use_datalake: bool = False) -> pl.LazyFrame:
|
|
83
|
+
table_name = ("vdl." if use_datalake else "") + self.name
|
|
84
|
+
try:
|
|
85
|
+
return conn.sql(f"FROM {table_name}").pl().lazy()
|
|
86
|
+
except duckdb.CatalogException as e:
|
|
87
|
+
raise u.ConfigurationError(f'Failed to load duckdb table or view "{self.name}" to python dataframe') from e
|
|
88
|
+
|
|
89
|
+
def _run_sql_query_on_connection(self, connection_name: str, query: str, placeholders: dict = {}) -> pl.DataFrame:
|
|
90
|
+
self.logger.info(f"Running sql query on connection '{connection_name}': {query}")
|
|
91
|
+
return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)
|
|
129
92
|
|
|
130
|
-
async def _trigger(self, conn:
|
|
93
|
+
async def _trigger(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
131
94
|
self.wait_count -= 1
|
|
132
95
|
if (self.wait_count == 0):
|
|
133
96
|
await self.run_model(conn, placeholders)
|
|
134
97
|
|
|
135
|
-
|
|
136
|
-
async def run_model(self, conn: Connection, placeholders: dict = {}) -> None:
|
|
98
|
+
async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
137
99
|
coroutines = []
|
|
138
100
|
for model in self.downstreams.values():
|
|
139
101
|
coroutines.append(model._trigger(conn, placeholders))
|
|
140
|
-
await
|
|
102
|
+
await u.asyncio_gather(coroutines)
|
|
141
103
|
|
|
142
104
|
def retrieve_dependent_query_models(self, dependent_model_names: set[str]) -> None:
|
|
143
105
|
pass
|
|
144
|
-
|
|
106
|
+
|
|
107
|
+
def _register_all_upstream_python_df_helper(self, conn: duckdb.DuckDBPyConnection, tables_set: set[str]) -> None:
|
|
108
|
+
if self.result is not None and self.name not in tables_set:
|
|
109
|
+
conn.register(self.name, self.result)
|
|
110
|
+
for dep_model in self.upstreams.values():
|
|
111
|
+
dep_model._register_all_upstream_python_df_helper(conn, tables_set)
|
|
112
|
+
|
|
113
|
+
def register_all_upstream_python_df(self, conn: duckdb.DuckDBPyConnection) -> None:
|
|
114
|
+
show_tables_query = f"SHOW TABLES"
|
|
115
|
+
tables_df = conn.sql(show_tables_query).pl()
|
|
116
|
+
tables_set = set(tables_df["name"])
|
|
117
|
+
self._register_all_upstream_python_df_helper(conn, tables_set)
|
|
118
|
+
|
|
145
119
|
def get_max_path_length_to_target(self) -> int | None:
|
|
146
120
|
if not hasattr(self, "max_path_len_to_target"):
|
|
147
121
|
path_lengths = []
|
|
@@ -154,283 +128,819 @@ class Referable(metaclass=ABCMeta):
|
|
|
154
128
|
self.max_path_len_to_target = 0 if self.is_target else None
|
|
155
129
|
return self.max_path_len_to_target
|
|
156
130
|
|
|
131
|
+
async def _trigger_build(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
132
|
+
pass
|
|
133
|
+
|
|
134
|
+
def _create_table_from_df(self, conn: duckdb.DuckDBPyConnection, query_result: pl.LazyFrame | pd.DataFrame):
|
|
135
|
+
local_conn = conn.cursor()
|
|
136
|
+
# local_conn = conn
|
|
137
|
+
try:
|
|
138
|
+
assert query_result is not None
|
|
139
|
+
local_conn.execute(f"CREATE OR REPLACE TABLE {self.name} AS FROM query_result")
|
|
140
|
+
finally:
|
|
141
|
+
local_conn.close()
|
|
142
|
+
# pass
|
|
143
|
+
|
|
144
|
+
def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
|
|
145
|
+
pass
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@dataclass
|
|
149
|
+
class StaticModel(DataModel):
|
|
150
|
+
needs_python_df_for_build: bool = field(default=False, init=False)
|
|
151
|
+
wait_count_for_build: int = field(default=0, init=False, repr=False)
|
|
152
|
+
upstreams_for_build: dict[str, StaticModel] = field(default_factory=dict, init=False, repr=False)
|
|
153
|
+
downstreams_for_build: dict[str, StaticModel] = field(default_factory=dict, init=False, repr=False)
|
|
154
|
+
|
|
155
|
+
def get_terminal_nodes_for_build(self, depencency_path: set[str]) -> set[str]:
|
|
156
|
+
if self.confirmed_no_cycles:
|
|
157
|
+
return set()
|
|
158
|
+
|
|
159
|
+
if self.name in depencency_path:
|
|
160
|
+
raise u.ConfigurationError(f'Cycle found in model dependency graph')
|
|
161
|
+
|
|
162
|
+
terminal_nodes = set()
|
|
163
|
+
if len(self.upstreams_for_build) == 0:
|
|
164
|
+
terminal_nodes.add(self.name)
|
|
165
|
+
else:
|
|
166
|
+
new_path = set(depencency_path)
|
|
167
|
+
new_path.add(self.name)
|
|
168
|
+
for dep_model in self.upstreams_for_build.values():
|
|
169
|
+
terminal_nodes.update(dep_model.get_terminal_nodes_for_build(new_path))
|
|
170
|
+
|
|
171
|
+
self.confirmed_no_cycles = True
|
|
172
|
+
return terminal_nodes
|
|
173
|
+
|
|
174
|
+
def _get_result(self, conn: duckdb.DuckDBPyConnection) -> pl.LazyFrame:
|
|
175
|
+
local_conn = conn.cursor()
|
|
176
|
+
try:
|
|
177
|
+
return self._load_duckdb_view_to_python_df(local_conn, use_datalake=True)
|
|
178
|
+
except Exception as e:
|
|
179
|
+
raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
|
|
180
|
+
finally:
|
|
181
|
+
local_conn.close()
|
|
182
|
+
|
|
183
|
+
async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
184
|
+
start = time.time()
|
|
185
|
+
|
|
186
|
+
if (self.needs_python_df or self.is_target) and self.result is None:
|
|
187
|
+
self.result = await asyncio.to_thread(self._get_result, conn)
|
|
188
|
+
|
|
189
|
+
self.logger.log_activity_time(f"loading static model '{self.name}'", start)
|
|
190
|
+
|
|
191
|
+
await super().run_model(conn, placeholders)
|
|
192
|
+
|
|
193
|
+
def compile_for_build(
|
|
194
|
+
self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
|
|
195
|
+
) -> None:
|
|
196
|
+
pass
|
|
197
|
+
|
|
198
|
+
async def _trigger_build(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
199
|
+
self.wait_count_for_build -= 1
|
|
200
|
+
if (self.wait_count_for_build == 0):
|
|
201
|
+
await self.build_model(conn, full_refresh)
|
|
202
|
+
|
|
203
|
+
async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
204
|
+
if self.needs_python_df and self.result is None:
|
|
205
|
+
local_conn = conn.cursor()
|
|
206
|
+
try:
|
|
207
|
+
self.result = await asyncio.to_thread(self._load_duckdb_view_to_python_df, local_conn)
|
|
208
|
+
finally:
|
|
209
|
+
local_conn.close()
|
|
210
|
+
|
|
211
|
+
coroutines = []
|
|
212
|
+
for model in self.downstreams_for_build.values():
|
|
213
|
+
coroutines.append(model._trigger_build(conn, full_refresh))
|
|
214
|
+
await u.asyncio_gather(coroutines)
|
|
215
|
+
|
|
157
216
|
|
|
158
217
|
@dataclass
|
|
159
|
-
class Seed(
|
|
160
|
-
|
|
218
|
+
class Seed(StaticModel):
|
|
219
|
+
model_config: mc.SeedConfig
|
|
220
|
+
result: pl.LazyFrame
|
|
161
221
|
|
|
162
|
-
|
|
222
|
+
@property
|
|
223
|
+
def model_type(self) -> ModelType:
|
|
163
224
|
return ModelType.SEED
|
|
225
|
+
|
|
226
|
+
async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
227
|
+
start = time.time()
|
|
164
228
|
|
|
165
|
-
|
|
166
|
-
|
|
229
|
+
print(f"[{u.get_current_time()}] 🔨 BUILDING: seed model '{self.name}'")
|
|
230
|
+
# await asyncio.to_thread(self._create_table_from_df, conn, self.result)
|
|
231
|
+
self._create_table_from_df(conn, self.result) # without threading
|
|
232
|
+
|
|
233
|
+
print(f"[{u.get_current_time()}] ✅ FINISHED: seed model '{self.name}'")
|
|
234
|
+
self.logger.log_activity_time(f"building seed model '{self.name}' into VDL", start)
|
|
235
|
+
|
|
236
|
+
await super().build_model(conn, full_refresh)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
@dataclass
|
|
240
|
+
class SourceModel(StaticModel):
|
|
241
|
+
model_config: src.Source
|
|
242
|
+
|
|
243
|
+
@property
|
|
244
|
+
def model_type(self) -> ModelType:
|
|
245
|
+
return ModelType.SOURCE
|
|
246
|
+
|
|
247
|
+
@property
|
|
248
|
+
def connection_props(self) -> ConnectionProperties:
|
|
249
|
+
conn_name = self.model_config.get_connection()
|
|
250
|
+
conn_props = self.conn_set.get_connection(conn_name)
|
|
251
|
+
if isinstance(conn_props, ConnectionProperties):
|
|
252
|
+
return conn_props
|
|
253
|
+
raise u.ConfigurationError(f'Unable to use connection "{conn_name}" for source "{self.name}". Connection "{conn_name}" must be a ConnectionProperties object')
|
|
167
254
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
255
|
+
@property
|
|
256
|
+
def is_queryable(self) -> bool:
|
|
257
|
+
connection_props = self.connection_props
|
|
258
|
+
return self.model_config.load_to_vdl or connection_props.type == ConnectionTypeEnum.DUCKDB
|
|
259
|
+
|
|
260
|
+
def _build_source_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
261
|
+
local_conn = conn.cursor()
|
|
262
|
+
# local_conn = conn
|
|
263
|
+
|
|
264
|
+
local_conn.begin()
|
|
265
|
+
try:
|
|
266
|
+
source = self.model_config
|
|
267
|
+
conn_name = source.get_connection()
|
|
268
|
+
|
|
269
|
+
connection_props = self.connection_props
|
|
270
|
+
dialect = connection_props.dialect
|
|
271
|
+
attach_uri = connection_props.attach_uri_for_duckdb
|
|
272
|
+
if attach_uri is None:
|
|
273
|
+
raise u.ConfigurationError(f'Loading to duckdb is not supported for source "{self.name}" since its connection "{conn_name}" uses an unsupported dialect')
|
|
274
|
+
|
|
275
|
+
result = u.run_duckdb_stmt(self.logger, local_conn, f"FROM (SHOW DATABASES) WHERE database_name = 'db_{conn_name}'").fetchone()
|
|
276
|
+
if result is None:
|
|
277
|
+
return # skip this source if connection is not attached
|
|
278
|
+
|
|
279
|
+
table_name = source.get_table()
|
|
280
|
+
new_table_name = self.name
|
|
281
|
+
|
|
282
|
+
if len(source.columns) == 0:
|
|
283
|
+
stmt = f"CREATE OR REPLACE TABLE {new_table_name} AS FROM db_{conn_name}.{table_name}"
|
|
284
|
+
u.run_duckdb_stmt(self.logger, local_conn, stmt)
|
|
285
|
+
local_conn.commit()
|
|
286
|
+
return
|
|
287
|
+
|
|
288
|
+
increasing_column = source.update_hints.increasing_column
|
|
289
|
+
recreate_table = full_refresh or increasing_column is None
|
|
290
|
+
if recreate_table:
|
|
291
|
+
u.run_duckdb_stmt(self.logger, local_conn, f"DROP TABLE IF EXISTS {new_table_name}")
|
|
292
|
+
|
|
293
|
+
create_table_cols_clause = source.get_cols_for_create_table_stmt()
|
|
294
|
+
stmt = f"CREATE TABLE IF NOT EXISTS {new_table_name} ({create_table_cols_clause})"
|
|
295
|
+
u.run_duckdb_stmt(self.logger, local_conn, stmt)
|
|
296
|
+
|
|
297
|
+
if not recreate_table:
|
|
298
|
+
if source.update_hints.selective_overwrite_value is not None:
|
|
299
|
+
stmt = f"DELETE FROM {new_table_name} WHERE {increasing_column} >= $value"
|
|
300
|
+
u.run_duckdb_stmt(self.logger, local_conn, stmt, params={"value": source.update_hints.selective_overwrite_value})
|
|
301
|
+
elif not source.update_hints.strictly_increasing:
|
|
302
|
+
stmt = f"DELETE FROM {new_table_name} WHERE {increasing_column} = ({source.get_max_incr_col_query(new_table_name)})"
|
|
303
|
+
u.run_duckdb_stmt(self.logger, local_conn, stmt)
|
|
304
|
+
|
|
305
|
+
max_val_of_incr_col = None
|
|
306
|
+
if increasing_column is not None:
|
|
307
|
+
max_val_of_incr_col_tuple = u.run_duckdb_stmt(self.logger, local_conn, source.get_max_incr_col_query(new_table_name)).fetchone()
|
|
308
|
+
max_val_of_incr_col = max_val_of_incr_col_tuple[0] if isinstance(max_val_of_incr_col_tuple, tuple) else None
|
|
309
|
+
if max_val_of_incr_col is None:
|
|
310
|
+
recreate_table = True
|
|
311
|
+
|
|
312
|
+
query = source.get_query_for_upsert(dialect, conn_name, table_name, max_val_of_incr_col, full_refresh=recreate_table)
|
|
313
|
+
|
|
314
|
+
primary_keys = ", ".join(source.primary_key) if source.primary_key else ""
|
|
315
|
+
match_condition = f"USING ({primary_keys})" if primary_keys else "ON false"
|
|
316
|
+
stmt = (
|
|
317
|
+
f"MERGE INTO {new_table_name} "
|
|
318
|
+
f"USING ({query}) AS src "
|
|
319
|
+
f"{match_condition} "
|
|
320
|
+
f"WHEN MATCHED THEN UPDATE "
|
|
321
|
+
f"WHEN NOT MATCHED THEN INSERT BY NAME"
|
|
322
|
+
)
|
|
323
|
+
u.run_duckdb_stmt(self.logger, local_conn, stmt)
|
|
324
|
+
|
|
325
|
+
local_conn.commit()
|
|
326
|
+
|
|
327
|
+
finally:
|
|
328
|
+
local_conn.close()
|
|
329
|
+
# pass
|
|
330
|
+
|
|
331
|
+
async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
332
|
+
if self.model_config.load_to_vdl:
|
|
333
|
+
start = time.time()
|
|
334
|
+
print(f"[{u.get_current_time()}] 🔨 BUILDING: source model '{self.name}'")
|
|
335
|
+
|
|
336
|
+
# await asyncio.to_thread(self._build_source_model, conn, full_refresh)
|
|
337
|
+
self._build_source_model(conn, full_refresh) # without threading
|
|
338
|
+
|
|
339
|
+
print(f"[{u.get_current_time()}] ✅ FINISHED: source model '{self.name}'")
|
|
340
|
+
self.logger.log_activity_time(f"building source model '{self.name}' into VDL", start)
|
|
172
341
|
|
|
342
|
+
await super().build_model(conn, full_refresh)
|
|
343
|
+
|
|
173
344
|
|
|
174
345
|
@dataclass
|
|
175
|
-
class
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
346
|
+
class QueryModel(DataModel):
|
|
347
|
+
model_config: mc.QueryModelConfig
|
|
348
|
+
query_file: mq.QueryFile
|
|
349
|
+
compiled_query: mq.Query | None = field(default=None, init=False)
|
|
350
|
+
_: KW_ONLY
|
|
180
351
|
j2_env: u.j2.Environment = field(default_factory=lambda: u.j2.Environment(loader=u.j2.FileSystemLoader(".")))
|
|
181
|
-
compiled_query: _Query | None = field(default=None, init=False)
|
|
182
352
|
|
|
183
|
-
def
|
|
184
|
-
return self.query_file.model_type
|
|
185
|
-
|
|
186
|
-
def _add_upstream(self, other: Referable) -> None:
|
|
353
|
+
def _add_upstream(self, other: DataModel) -> None:
|
|
187
354
|
self.upstreams[other.name] = other
|
|
188
355
|
other.downstreams[self.name] = self
|
|
189
356
|
|
|
190
|
-
if isinstance(self.query_file,
|
|
191
|
-
other.
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
357
|
+
if isinstance(self.query_file, mq.PyQueryFile):
|
|
358
|
+
other.needs_python_df = True
|
|
359
|
+
|
|
360
|
+
def _ref_for_sql(self, dependent_model_name: str, models_dict: dict[str, DataModel]) -> str:
|
|
361
|
+
if dependent_model_name not in models_dict:
|
|
362
|
+
raise u.ConfigurationError(f'Model "{self.name}" references unknown model "{dependent_model_name}"')
|
|
363
|
+
|
|
364
|
+
dep_model = models_dict[dependent_model_name]
|
|
365
|
+
if isinstance(dep_model, SourceModel) and not dep_model.model_config.load_to_vdl:
|
|
366
|
+
# Allow when caller is Build or Federate AND the source connection is duckdb; else error
|
|
367
|
+
conn_name = dep_model.model_config.get_connection()
|
|
368
|
+
conn_props = self.conn_set.get_connection(conn_name)
|
|
369
|
+
is_duckdb_conn = isinstance(conn_props, ConnectionProperties) and conn_props.type == ConnectionTypeEnum.DUCKDB
|
|
370
|
+
if not is_duckdb_conn:
|
|
371
|
+
raise u.ConfigurationError(
|
|
372
|
+
f'Model "{self.name}" cannot reference source model "{dependent_model_name}". '
|
|
373
|
+
'To be referenced by a build or federate model, the source must have load_to_vdl=True or a duckdb connection type.'
|
|
374
|
+
)
|
|
375
|
+
|
|
376
|
+
self.model_config.depends_on.add(dependent_model_name)
|
|
377
|
+
return dependent_model_name
|
|
378
|
+
|
|
379
|
+
def _ref_for_python(self, dependent_model_name: str) -> pl.LazyFrame:
|
|
380
|
+
if dependent_model_name not in self.upstreams:
|
|
381
|
+
raise u.ConfigurationError(f'Model "{self.name}" must include model "{dependent_model_name}" as a dependency to use')
|
|
382
|
+
df = self.upstreams[dependent_model_name].result
|
|
383
|
+
assert df is not None
|
|
384
|
+
return df
|
|
385
|
+
|
|
386
|
+
def _get_compile_sql_model_args_from_ctx_args(
|
|
387
|
+
self, ctx: dict[str, Any], ctx_args: ContextArgs
|
|
388
|
+
) -> dict[str, Any]:
|
|
389
|
+
is_placeholder = lambda placeholder: placeholder in ctx_args._placeholders_copy
|
|
218
390
|
kwargs = {
|
|
219
391
|
"proj_vars": ctx_args.proj_vars, "env_vars": ctx_args.env_vars, "user": ctx_args.user, "prms": ctx_args.prms,
|
|
220
|
-
"
|
|
221
|
-
"
|
|
392
|
+
"configurables": ctx_args.configurables, "ctx": ctx, "is_placeholder": is_placeholder, "set_placeholder": ctx_args.set_placeholder,
|
|
393
|
+
"param_exists": ctx_args.param_exists
|
|
222
394
|
}
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
if dependent_model_name not in models_dict:
|
|
227
|
-
raise u.ConfigurationError(f'Model "{self.name}" references unknown model "{dependent_model_name}"')
|
|
228
|
-
dependencies.add(dependent_model_name)
|
|
229
|
-
return dependent_model_name
|
|
230
|
-
kwargs["ref"] = ref
|
|
231
|
-
|
|
395
|
+
return kwargs
|
|
396
|
+
|
|
397
|
+
def _get_compiled_sql_query_str(self, raw_query: str, kwargs: dict[str, Any]) -> str:
|
|
232
398
|
try:
|
|
233
|
-
template = self.j2_env.from_string(
|
|
234
|
-
query =
|
|
399
|
+
template = self.j2_env.from_string(raw_query)
|
|
400
|
+
query = template.render(kwargs)
|
|
235
401
|
except Exception as e:
|
|
236
|
-
raise
|
|
402
|
+
raise FileExecutionError(f'Failed to compile sql model "{self.name}"', e) from e
|
|
403
|
+
return query
|
|
404
|
+
|
|
405
|
+
def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
    """Copy metadata onto pass-through columns from the upstream columns they depend on.

    Each pass-through column must have exactly one ``depends_on`` value of the form
    ``"model_name.column_name"``. The referenced model is added to this model's dependencies,
    the models this model depends on are processed first, and then any metadata field still at
    its default (``""`` or ``ColumnCategory.MISC``) is taken from the upstream column.
    The method runs at most once per model; repeated calls return immediately.

    Args:
        models_dict: all known models keyed by name, used to resolve upstream references.

    Raises:
        u.ConfigurationError: if a pass-through column does not have exactly one depends_on
            value, if that value is not of the form "model_name.column_name", or if it names
            an unknown model or an unknown column.
    """
    if getattr(self, "processed_pass_through_columns", False):
        return

    # (column, raw reference, upstream model name, upstream column name) per pass-through column
    pass_through_refs = []
    for col in self.model_config.columns:
        if not col.pass_through:
            continue

        # Validate pass-through column has exactly one dependency
        if len(col.depends_on) != 1:
            raise u.ConfigurationError(
                f'Column "{self.name}.{col.name}" has pass_through=true, which must have exactly one depends_on value'
            )

        # Get the upstream column reference; report a malformed one as a configuration
        # problem rather than letting tuple unpacking fail with a bare ValueError
        upstream_col_ref = next(iter(col.depends_on))
        ref_parts = upstream_col_ref.split('.')
        if len(ref_parts) != 2:
            raise u.ConfigurationError(
                f'Column "{self.name}.{col.name}" has pass_through=true, but its depends_on value '
                f'"{upstream_col_ref}" is not of the form "model_name.column_name"'
            )
        table_name, col_name = ref_parts

        # Check the upstream model first so an unknown name is never added to depends_on
        if table_name not in models_dict:
            raise u.ConfigurationError(
                f'Column "{self.name}.{col.name}" depends on unknown model "{table_name}"'
            )

        self.model_config.depends_on.add(table_name)
        pass_through_refs.append((col, upstream_col_ref, table_name, col_name))

    # Do not rely on self.upstreams here, as it may not be fully populated for metadata passthrough purposes
    for dep_model_name in self.model_config.depends_on:
        dep_model = models_dict[dep_model_name]
        dep_model.process_pass_through_columns(models_dict)

    for col, upstream_col_ref, table_name, col_name in pass_through_refs:
        upstream_model = models_dict[table_name]

        # Find the upstream column config
        upstream_col = next(
            (candidate for candidate in upstream_model.model_config.columns if candidate.name == col_name),
            None
        )
        if upstream_col is None:
            raise u.ConfigurationError(
                f'Column "{self.name}.{col.name}" depends on unknown column "{upstream_col_ref}"'
            )

        # Copy metadata from upstream column, keeping any value already set on this column
        col.type = upstream_col.type if col.type == "" else col.type
        col.condition = upstream_col.condition if col.condition == "" else col.condition
        col.description = upstream_col.description if col.description == "" else col.description
        col.category = upstream_col.category if col.category == mc.ColumnCategory.MISC else col.category

    self.processed_pass_through_columns = True
|
|
456
|
+
|
|
457
|
+
def retrieve_dependent_query_models(self, dependent_model_names: set[str]) -> None:
    """Add this model's name, and recursively those of its upstream models, to ``dependent_model_names``.

    The set is updated in place; a model whose name is already present is not visited again.
    """
    if self.name in dependent_model_names:
        return

    dependent_model_names.add(self.name)
    for upstream in self.upstreams.values():
        upstream.retrieve_dependent_query_models(dependent_model_names)
|
|
240
462
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
463
|
+
def _log_sql_to_run(self, sql: str, placeholders: dict[str, Any]) -> None:
|
|
464
|
+
log_msg = f"SQL to run for model '{self.name}':\n{sql}"
|
|
465
|
+
log_msg += f"\n\n(with placeholders: {placeholders})"
|
|
466
|
+
self.logger.debug(log_msg)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
@dataclass
class DbviewModel(QueryModel):
    """Query model compiled from a SQL file and run on its configured connection, or on DuckDB
    when ``translate_to_duckdb`` is set and the connection is a ``ConnectionProperties`` object.
    """
    model_config: mc.DbviewModelConfig
    query_file: mq.SqlQueryFile
    # Set by compile(); holds a WorkInProgress marker while compilation is running
    compiled_query: mq.SqlModelQuery | None = field(default=None, init=False)
    # Model configs of the sources referenced through source()/ref() during compilation, by source name
    sources: dict[str, src.Source] = field(default_factory=dict, init=False)

    @property
    def model_type(self) -> ModelType:
        """The model type of a dbview: ``ModelType.DBVIEW``."""
        return ModelType.DBVIEW

    def _get_compile_sql_model_args(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
    ) -> dict[str, Any]:
        """Return the Jinja variables for compiling this dbview.

        Extends the context-derived variables with a ``source`` macro (also exposed as ``ref``)
        that validates the referenced source and records it as a dependency.
        """
        kwargs = self._get_compile_sql_model_args_from_ctx_args(ctx, ctx_args)

        def source(source_name: str) -> str:
            """Validate and record ``source_name``; return a ``source(...)`` macro call for a later render."""
            if source_name not in models_dict or not isinstance(source_model := models_dict[source_name], SourceModel):
                raise u.ConfigurationError(f'Dbview "{self.name}" references unknown source "{source_name}"')
            if source_model.model_config.get_connection() != self.model_config.get_connection():
                raise u.ConfigurationError(f'Dbview "{self.name}" references source "{source_name}" with different connection')

            # Check if the source model has load_to_vdl=False but this dbview has translate_to_duckdb=True
            if not source_model.model_config.load_to_vdl and self.model_config.translate_to_duckdb:
                raise u.ConfigurationError(
                    f'Dbview "{self.name}" with translate_to_duckdb=True cannot reference source "{source_name}" '
                    f'which has load_to_vdl=False'
                )

            self.model_config.depends_on.add(source_name)
            self.sources[source_name] = source_model.model_config
            # The macro call is emitted verbatim; it is resolved to a table name in _compile_sql_model
            return "{{ source(\"" + source_name + "\") }}"

        kwargs["source"] = source
        kwargs["ref"] = source
        return kwargs

    def _get_duckdb_query(self, read_dialect: str, query: str) -> str:
        """Resolve ``source`` macros to ``vdl.<name>`` and transpile ``query`` from ``read_dialect`` to DuckDB SQL."""
        kwargs = {
            "source": lambda source_name: "vdl." + source_name
        }
        compiled_query = self._get_compiled_sql_query_str(query, kwargs)
        duckdb_query = sqlglot.transpile(compiled_query, read=read_dialect, write="duckdb", pretty=True)[0]
        return "-- translated to duckdb\n" + duckdb_query

    def _compile_sql_model(self, kwargs: dict[str, Any]) -> mq.SqlModelQuery:
        """Render the query file with ``kwargs`` and resolve its sources for the engine that will run it.

        Raises:
            u.ConfigurationError: if translate_to_duckdb is set on a dbview whose connection is duckdb.
        """
        compiled_query_str = self._get_compiled_sql_query_str(self.query_file.raw_query, kwargs)

        connection_name = self.model_config.get_connection()
        connection_props = self.conn_set.get_connection(connection_name)

        if self.model_config.translate_to_duckdb and isinstance(connection_props, ConnectionProperties):
            # Forbid translate_to_duckdb when dbview connection is duckdb
            if connection_props.type == ConnectionTypeEnum.DUCKDB:
                raise u.ConfigurationError(
                    f'Dbview "{self.name}" has translate_to_duckdb=True but its connection is duckdb. Use a federate model instead.'
                )
            macros = {
                "source": lambda source_name: "vdl." + source_name
            }
            # NOTE(review): _get_duckdb_query renders with the same `source` macro again, so this
            # pre-render looks redundant; kept to preserve the existing rendering sequence
            compiled_query2 = self._get_compiled_sql_query_str(compiled_query_str, macros)
            compiled_query_str = self._get_duckdb_query(connection_props.dialect, compiled_query2)
            is_duckdb = True
        else:
            # Run on the source connection: each source resolves to the table of its recorded config
            macros = {
                "source": lambda source_name: self.sources[source_name].get_table()
            }
            compiled_query_str = self._get_compiled_sql_query_str(compiled_query_str, macros)
            is_duckdb = False

        return mq.SqlModelQuery(compiled_query_str, is_duckdb)

    def compile(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
    ) -> None:
        """Compile this dbview once; later calls return immediately. ``recurse`` is not used here."""
        if self.compiled_query is not None:
            return

        # Marks compilation as started so that a repeated compile() call returns early
        self.compiled_query = mq.WorkInProgress() # type: ignore

        start = time.time()

        kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
        self.compiled_query = self._compile_sql_model(kwargs)

        self.logger.log_activity_time(f"compiling dbview model '{self.name}'", start)

    async def _run_sql_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        """Run the compiled query in a worker thread and store the outcome as a lazy frame in ``self.result``.

        Raises:
            InvalidInputError: (409) when DuckDB reports a catalog error for the translated query.
            FileExecutionError: wrapping a RuntimeError raised while running the query.
        """
        assert self.compiled_query is not None
        is_duckdb = self.compiled_query.is_duckdb
        query = self.compiled_query.query
        connection_name = self.model_config.get_connection()

        def run_sql_query_on_connection(is_duckdb: bool, query: str, placeholders: dict) -> pl.DataFrame:
            try:
                if is_duckdb:
                    # Separate cursor for this call, closed in the finally below
                    local_conn = conn.cursor()
                    try:
                        self.logger.info(f"Running dbview '{self.name}' on duckdb")
                        return local_conn.sql(query, params=placeholders).pl()
                    except duckdb.CatalogException as e:
                        raise InvalidInputError(
                            409, 'dependent_data_model_not_found',
                            f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.'
                        ) from e
                    except Exception as e:
                        # Re-raised as RuntimeError so the outer handler wraps it in FileExecutionError
                        raise RuntimeError(e) from e
                    finally:
                        local_conn.close()
                else:
                    self.logger.info(f"Running dbview '{self.name}' on connection: {connection_name}")
                    return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)
            except RuntimeError as e:
                # Chain the cause, as the other FileExecutionError raises in this module do
                raise FileExecutionError(f'Failed to run dbview sql model "{self.name}"', e) from e

        self._log_sql_to_run(query, placeholders)
        result = await asyncio.to_thread(run_sql_query_on_connection, is_duckdb, query, placeholders)
        self.result = result.lazy()

    async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        """Run this dbview, log the elapsed time, then hand over to the parent class's ``run_model``."""
        start = time.time()

        await self._run_sql_model(conn, placeholders)

        self.logger.log_activity_time(f"running dbview model '{self.name}'", start)

        await super().run_model(conn, placeholders)
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
@dataclass
class FederateModel(QueryModel):
    """Query model defined by a SQL or Python file; its SQL form is compiled as a DuckDB query."""
    model_config: mc.FederateModelConfig
    query_file: mq.SqlQueryFile | mq.PyQueryFile
    # Set by compile(); holds a WorkInProgress marker while compilation is running
    compiled_query: mq.SqlModelQuery | mq.PyModelQuery | None = field(default=None, init=False)

    @property
    def model_type(self) -> ModelType:
        """The model type of a federate: ``ModelType.FEDERATE``."""
        return ModelType.FEDERATE

    def _get_compile_sql_model_args(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
    ) -> dict[str, Any]:
        """Return the Jinja variables for compiling this federate, including the ``ref`` macro."""
        kwargs = self._get_compile_sql_model_args_from_ctx_args(ctx, ctx_args)

        def ref(dependent_model_name: str) -> str:
            """Resolve a referenced model to the table name to use in the compiled SQL."""
            dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
            dep = models_dict[dependent_model]
            if isinstance(dep, BuildModel):
                return "vdl." + dependent_model
            if isinstance(dep, SourceModel):
                if dep.model_config.load_to_vdl:
                    return "vdl." + dependent_model
                # Source not loaded into the VDL: address its table through the `db_<connection>` prefix
                conn_name = dep.model_config.get_connection()
                table_name = dep.model_config.get_table()
                return f"db_{conn_name}.{table_name}"
            return dependent_model

        kwargs["ref"] = ref
        return kwargs

    def _compile_sql_model(
        self, query_file: mq.SqlQueryFile, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
    ) -> mq.SqlModelQuery:
        """Render ``query_file`` with this model's Jinja variables into a DuckDB ``SqlModelQuery``."""
        kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
        compiled_query_str = self._get_compiled_sql_query_str(query_file.raw_query, kwargs)
        return mq.SqlModelQuery(compiled_query_str, is_duckdb=True)

    def _get_python_model_args(self, ctx: dict[str, Any], ctx_args: ContextArgs) -> ModelArgs:
        """Assemble the ``ModelArgs`` object passed to a Python model's function."""
        dependencies = self.model_config.depends_on
        connections = self.conn_set.get_connections_as_dict()

        def run_external_sql(connection_name: str, sql_query: str) -> pl.DataFrame:
            return self._run_sql_query_on_connection(connection_name, sql_query, ctx_args._placeholders_copy)

        conn_args = ConnectionsArgs(ctx_args.project_path, ctx_args.proj_vars, ctx_args.env_vars)
        build_model_args = BuildModelArgs(conn_args, connections, dependencies, self._ref_for_python, run_external_sql)
        return ModelArgs(ctx_args, build_model_args, ctx)

    def _compile_python_model(
        self, query_file: mq.PyQueryFile, ctx: dict[str, Any], ctx_args: ContextArgs
    ) -> mq.PyModelQuery:
        """Wrap the Python model function in a zero-argument callable that is invoked at run time."""
        sqrl_args = self._get_python_model_args(ctx, ctx_args)

        def compiled_query() -> pl.LazyFrame | pd.DataFrame:
            try:
                return query_file.raw_query(sqrl_args)
            except Exception as e:
                raise FileExecutionError(f'Failed to run "{c.MAIN_FUNC}" function for python model "{self.name}"', e) from e

        return mq.PyModelQuery(compiled_query)

    def compile(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
    ) -> None:
        """Compile this federate once and, when ``recurse`` is true, link and compile its dependencies.

        Raises:
            NotImplementedError: if the query file is neither a SQL nor a Python query file.
        """
        if self.compiled_query is not None:
            return

        # Marks compilation as started so that a repeated compile() call returns early
        self.compiled_query = mq.WorkInProgress() # type: ignore

        start = time.time()

        if isinstance(self.query_file, mq.SqlQueryFile):
            self.compiled_query = self._compile_sql_model(self.query_file, ctx, ctx_args, models_dict)
        elif isinstance(self.query_file, mq.PyQueryFile):
            self.compiled_query = self._compile_python_model(self.query_file, ctx, ctx_args)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        self.logger.log_activity_time(f"compiling federate model '{self.name}'", start)

        if not recurse:
            return

        dependencies = self.model_config.depends_on
        self.wait_count = len(dependencies)

        for name in dependencies:
            dep_model = models_dict[name]
            self._add_upstream(dep_model)
            dep_model.compile(ctx, ctx_args, models_dict, recurse)

    async def _run_sql_model(self, compiled_query: mq.SqlModelQuery, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        """Execute the create statement for this model on a cursor of ``conn``.

        When the model is the target or ``needs_python_df`` is set, the outcome is also loaded
        into ``self.result``. The cursor is always closed.

        Raises:
            InvalidInputError: (409) on a DuckDB catalog error; (400) on any other failure when
                this model is named "__fake_target".
            FileExecutionError: on any other failure of a regular model.
        """
        local_conn = conn.cursor()
        try:
            self.register_all_upstream_python_df(local_conn)
            query = compiled_query.query

            def create_table(local_conn: duckdb.DuckDBPyConnection):
                # DuckDB doesn't support specifying named parameters that are not used in the query, so filtering them out
                def placeholder_exists(key: str) -> bool:
                    # The key is escaped so it is matched literally, not as a regex fragment
                    return re.search(r"\$" + re.escape(key) + r"(?!\w)", query) is not None

                existing_placeholders = {key: value for key, value in placeholders.items() if placeholder_exists(key)}

                create_query = self.model_config.get_sql_for_create(self.name, query)
                self._log_sql_to_run(create_query, existing_placeholders)
                try:
                    return local_conn.execute(create_query, existing_placeholders)
                except duckdb.CatalogException as e:
                    if self.name == "__fake_target":
                        raise InvalidInputError(
                            409, "invalid_sql_query",
                            "Provided SQL query depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first."
                        ) from e
                    raise InvalidInputError(
                        409, 'dependent_data_model_not_found',
                        f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.'
                    ) from e
                except Exception as e:
                    if self.name == "__fake_target":
                        raise InvalidInputError(400, "invalid_sql_query", "Failed to run provided SQL query") from e
                    raise FileExecutionError(f'Failed to run federate sql model "{self.name}"', e) from e

            await asyncio.to_thread(create_table, local_conn)
            if self.needs_python_df or self.is_target:
                self.result = await asyncio.to_thread(self._load_duckdb_view_to_python_df, local_conn)
        finally:
            local_conn.close()

    async def _run_python_model(self, compiled_query: mq.PyModelQuery) -> None:
        """Run the Python model in a worker thread and store its output, made lazy, in ``self.result``."""
        query_result = await asyncio.to_thread(compiled_query.query)
        if isinstance(query_result, pd.DataFrame):
            query_result = pl.from_pandas(query_result)

        self.result = query_result.lazy()

    async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        """Run the compiled SQL or Python query, log the elapsed time, then call the parent's ``run_model``.

        Raises:
            NotImplementedError: if ``compiled_query`` is neither a SQL nor a Python model query.
        """
        start = time.time()

        if isinstance(self.compiled_query, mq.SqlModelQuery):
            await self._run_sql_model(self.compiled_query, conn, placeholders)
        elif isinstance(self.compiled_query, mq.PyModelQuery):
            await self._run_python_model(self.compiled_query)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        self.logger.log_activity_time(f"running federate model '{self.name}'", start)

        await super().run_model(conn, placeholders)
|
|
390
741
|
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
742
|
+
|
|
743
|
+
@dataclass
class BuildModel(StaticModel, QueryModel):
    """Static model built from a SQL or Python file into a table through a DuckDB connection."""
    model_config: mc.BuildModelConfig
    query_file: mq.SqlQueryFile | mq.PyQueryFile
    # Set by compile_for_build()
    compiled_query: mq.SqlModelQuery | mq.PyModelQuery | None = field(default=None, init=False)

    @property
    def model_type(self) -> ModelType:
        """The model type of a build model: ``ModelType.BUILD``."""
        return ModelType.BUILD

    def _add_upstream_for_build(self, other: StaticModel) -> None:
        """Link ``other`` as a build-time upstream of this model (and this model as its downstream)."""
        self.upstreams_for_build[other.name] = other
        other.downstreams_for_build[self.name] = self

        # A Python model reads its upstreams as dataframes, so flag the upstream accordingly
        if isinstance(self.query_file, mq.PyQueryFile):
            other.needs_python_df_for_build = True

    def _get_compile_sql_model_args(
        self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
    ) -> dict[str, Any]:
        """Return the Jinja variables for compiling this build model: project/env variables and ``ref``."""
        kwargs: dict[str, Any] = {
            "proj_vars": conn_args.proj_vars, "env_vars": conn_args.env_vars
        }

        def ref_for_build(dependent_model_name: str) -> str:
            """Resolve a referenced model to the table name to use in the compiled SQL."""
            dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
            dep = models_dict[dependent_model]
            if isinstance(dep, SourceModel) and not dep.model_config.load_to_vdl:
                # Source not loaded into the VDL: address its table through the `db_<connection>` prefix
                conn_name = dep.model_config.get_connection()
                table_name = dep.model_config.get_table()
                return f"db_{conn_name}.{table_name}"
            return dependent_model

        kwargs["ref"] = ref_for_build
        return kwargs

    def _compile_sql_model(
        self, query_file: mq.SqlQueryFile, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
    ) -> mq.SqlModelQuery:
        """Render ``query_file`` with this model's Jinja variables into a DuckDB ``SqlModelQuery``."""
        kwargs = self._get_compile_sql_model_args(conn_args, models_dict)
        compiled_query_str = self._get_compiled_sql_query_str(query_file.raw_query, kwargs)
        return mq.SqlModelQuery(compiled_query_str, is_duckdb=True)

    def _ref_for_python(self, dependent_model_name: str) -> pl.LazyFrame:
        """Return the stored result of the build-time upstream named ``dependent_model_name``.

        Raises:
            u.ConfigurationError: if the name is not a key of ``self.upstreams_for_build``.
        """
        if dependent_model_name not in self.upstreams_for_build:
            raise u.ConfigurationError(f'Model "{self.name}" must include model "{dependent_model_name}" as a dependency to use')
        df = self.upstreams_for_build[dependent_model_name].result
        assert df is not None
        return df

    def _get_compile_python_model_args(self, conn_args: ConnectionsArgs) -> BuildModelArgs:
        """Assemble the ``BuildModelArgs`` object passed to a Python build model's function."""

        def run_external_sql(connection_name: str, sql_query: str):
            return self._run_sql_query_on_connection(connection_name, sql_query)

        return BuildModelArgs(
            conn_args, self.conn_set.get_connections_as_dict(), self.model_config.depends_on, self._ref_for_python, run_external_sql
        )

    def _compile_python_model(
        self, query_file: mq.PyQueryFile, conn_args: ConnectionsArgs
    ) -> mq.PyModelQuery:
        """Wrap the Python model function in a zero-argument callable that is invoked at build time."""
        sqrl_args = self._get_compile_python_model_args(conn_args)

        def compiled_query() -> pl.LazyFrame | pd.DataFrame:
            try:
                return query_file.raw_query(sqrl_args)
            except Exception as e:
                # Chain the cause, matching the equivalent handler in FederateModel
                raise FileExecutionError(f'Failed to run "{c.MAIN_FUNC}" function for build model "{self.name}"', e) from e

        return mq.PyModelQuery(compiled_query)

    def compile_for_build(self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]) -> None:
        """Compile this build model and link every model it depends on as a build-time upstream.

        Raises:
            NotImplementedError: if the query file is neither a SQL nor a Python query file.
        """
        start = time.time()

        if isinstance(self.query_file, mq.SqlQueryFile):
            self.compiled_query = self._compile_sql_model(self.query_file, conn_args, models_dict)
        elif isinstance(self.query_file, mq.PyQueryFile):
            self.compiled_query = self._compile_python_model(self.query_file, conn_args)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        self.logger.log_activity_time(f"compiling build model '{self.name}'", start)

        dependencies = self.model_config.depends_on
        self.wait_count_for_build = len(dependencies)

        for name in dependencies:
            dep_model = models_dict[name]
            self._add_upstream_for_build(dep_model)

    async def _build_sql_model(self, compiled_query: mq.SqlModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
        """Run the build statement for this model on a cursor of ``conn``; the cursor is always closed.

        Raises:
            FileExecutionError: wrapping any exception raised while running the statement.
        """
        query = compiled_query.query

        def create_table():
            create_query = self.model_config.get_sql_for_build(self.name, query)
            local_conn = conn.cursor()
            try:
                return u.run_duckdb_stmt(self.logger, local_conn, create_query, model_name=self.name)
            except Exception as e:
                raise FileExecutionError(f'Failed to build static sql model "{self.name}"', e) from e
            finally:
                local_conn.close()

        # Called directly, without asyncio.to_thread
        create_table()

    async def _build_python_model(self, compiled_query: mq.PyModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
        """Run the Python model function in a worker thread and create this model's table from its output."""
        query_result = await asyncio.to_thread(compiled_query.query)
        if isinstance(query_result, pd.DataFrame):
            query_result = pl.from_pandas(query_result).lazy()
        if self.needs_python_df_for_build:
            self.result = query_result.lazy()
        # Called directly, without asyncio.to_thread
        self._create_table_from_df(conn, query_result)

    async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        """Build this model, print and log progress, then call the parent's ``build_model``.

        Raises:
            NotImplementedError: if ``compiled_query`` is neither a SQL nor a Python model query.
        """
        start = time.time()
        print(f"[{u.get_current_time()}] 🔨 BUILDING: build model '{self.name}'")

        if isinstance(self.compiled_query, mq.SqlModelQuery):
            await self._build_sql_model(self.compiled_query, conn)
        elif isinstance(self.compiled_query, mq.PyModelQuery):
            # First ensure all upstream models have an associated Python dataframe
            def load_df(conn: duckdb.DuckDBPyConnection, dep_model: DataModel):
                if dep_model.result is None:
                    local_conn = conn.cursor()
                    try:
                        dep_model.result = dep_model._load_duckdb_view_to_python_df(local_conn)
                    finally:
                        local_conn.close()

            coroutines = [
                asyncio.to_thread(load_df, conn, dep_model)
                for dep_model in self.upstreams_for_build.values()
            ]
            await u.asyncio_gather(coroutines)

            # Then run the model's Python function to build the model
            await self._build_python_model(self.compiled_query, conn)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        print(f"[{u.get_current_time()}] ✅ FINISHED: build model '{self.name}'")
        self.logger.log_activity_time(f"building static build model '{self.name}'", start)

        await super().build_model(conn, full_refresh)
|
|
396
895
|
|
|
397
896
|
|
|
398
897
|
@dataclass
|
|
399
898
|
class DAG:
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
899
|
+
dataset: DatasetConfig | None
|
|
900
|
+
target_model: DataModel
|
|
901
|
+
models_dict: dict[str, DataModel]
|
|
902
|
+
datalake_db_path: str | None = field(default=None)
|
|
404
903
|
logger: u.Logger = field(default_factory=lambda: u.Logger(""))
|
|
405
904
|
parameter_set: ParameterSet | None = field(default=None, init=False) # set in apply_selections
|
|
406
905
|
placeholders: dict[str, Any] = field(init=False, default_factory=dict)
|
|
407
906
|
|
|
907
|
+
def _get_msg_extension(self) -> str:
|
|
908
|
+
return f" for dataset '{self.dataset.name}'" if self.dataset else ""
|
|
909
|
+
|
|
910
|
+
def compile_build_models(self, conn_args: ConnectionsArgs) -> None:
    """Compile every build model in ``self.models_dict``, giving each the dict of all static models."""
    static_models: dict[str, StaticModel] = {}
    for model_name, candidate in self.models_dict.items():
        if isinstance(candidate, StaticModel):
            static_models[model_name] = candidate

    for static_model in static_models.values():
        if not isinstance(static_model, BuildModel):
            continue
        static_model.compile_for_build(conn_args, static_models)
|
|
915
|
+
|
|
408
916
|
def apply_selections(
    self, param_cfg_set: ParameterConfigsSet, user: AbstractUser, selections: dict[str, str]
) -> None:
    """Apply ``selections`` for ``user`` through ``param_cfg_set`` and keep the outcome in ``self.parameter_set``.

    The dataset's parameters are passed along when a dataset is set, otherwise ``None``.
    """
    started_at = time.time()

    if self.dataset:
        dataset_params = self.dataset.parameters
    else:
        dataset_params = None

    self.parameter_set = param_cfg_set.apply_selections(dataset_params, selections, user)
    self.logger.log_activity_time("applying selections" + self._get_msg_extension(), started_at)
|
|
418
925
|
|
|
419
|
-
def _compile_context(
    self, param_args: ParametersArgs, context_func: ContextFunc, user: AbstractUser, configurables: dict[str, str]
) -> tuple[dict[str, Any], ContextArgs]:
    """Run ``context_func`` on a fresh context dict and return that dict with the ``ContextArgs`` used.

    ``self.parameter_set`` must already be a ``ParameterSet`` (asserted).

    Raises:
        FileExecutionError: wrapping any exception raised by ``context_func``.
    """
    started_at = time.time()
    ctx: dict[str, Any] = {}
    assert isinstance(self.parameter_set, ParameterSet)
    ctx_args = ContextArgs(param_args, user, self.parameter_set.get_parameters_as_dict(), configurables)
    suffix = self._get_msg_extension()
    try:
        context_func(ctx, ctx_args)
    except Exception as exc:
        raise FileExecutionError(f'Failed to run {c.CONTEXT_FILE}' + suffix, exc) from exc

    self.logger.log_activity_time("running context.py" + suffix, started_at)
    return ctx, ctx_args
|
|
431
941
|
|
|
432
|
-
|
|
433
|
-
|
|
942
|
+
def _compile_models(self, context: dict[str, Any], ctx_args: ContextArgs, recurse: bool) -> None:
|
|
943
|
+
self.target_model.compile(context, ctx_args, self.models_dict, recurse)
|
|
434
944
|
|
|
435
945
|
def _get_terminal_nodes(self) -> set[str]:
|
|
436
946
|
start = time.time()
|
|
@@ -440,100 +950,149 @@ class DAG:
|
|
|
440
950
|
self.logger.log_activity_time(f"validating no cycles in model dependencies", start)
|
|
441
951
|
return terminal_nodes
|
|
442
952
|
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
953
|
+
def _attach_connections_with_type_duckdb(self, conn: duckdb.DuckDBPyConnection) -> None:
|
|
954
|
+
for conn_name, connection in self.target_model.conn_set.get_connections_as_dict().items():
|
|
955
|
+
if not isinstance(connection, ConnectionProperties):
|
|
956
|
+
continue
|
|
957
|
+
attach_uri = connection.attach_uri_for_duckdb
|
|
958
|
+
if attach_uri is None:
|
|
959
|
+
continue
|
|
960
|
+
attach_stmt = f"ATTACH IF NOT EXISTS '{attach_uri}' AS db_{conn_name} (READ_ONLY)"
|
|
961
|
+
u.run_duckdb_stmt(self.logger, conn, attach_stmt, redacted_values=[attach_uri])
|
|
962
|
+
|
|
963
|
+
async def _run_models(self) -> None:
|
|
964
|
+
terminal_nodes = self._get_terminal_nodes()
|
|
965
|
+
|
|
966
|
+
conn = u.create_duckdb_connection(datalake_db_path=self.datalake_db_path)
|
|
967
|
+
try:
|
|
968
|
+
self._attach_connections_with_type_duckdb(conn)
|
|
969
|
+
|
|
449
970
|
coroutines = []
|
|
450
971
|
for model_name in terminal_nodes:
|
|
451
|
-
model = self.models_dict[model_name]
|
|
452
|
-
coroutines.append(model.run_model(conn, placeholders))
|
|
453
|
-
await
|
|
454
|
-
|
|
455
|
-
|
|
972
|
+
model = self.models_dict[model_name] if model_name != "__fake_target" else self.target_model
|
|
973
|
+
coroutines.append(model.run_model(conn, self.placeholders))
|
|
974
|
+
await u.asyncio_gather(coroutines)
|
|
975
|
+
|
|
976
|
+
finally:
|
|
977
|
+
conn.close()
|
|
456
978
|
|
|
457
979
|
async def execute(
|
|
458
|
-
self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user:
|
|
459
|
-
*,
|
|
460
|
-
) ->
|
|
980
|
+
self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user: AbstractUser, selections: dict[str, str],
|
|
981
|
+
*, runquery: bool = True, recurse: bool = True, configurables: dict[str, str] = {}
|
|
982
|
+
) -> None:
|
|
461
983
|
recurse = (recurse or runquery)
|
|
462
984
|
|
|
463
|
-
self.apply_selections(param_cfg_set, user, selections
|
|
985
|
+
self.apply_selections(param_cfg_set, user, selections)
|
|
464
986
|
|
|
465
|
-
context, ctx_args = self._compile_context(param_args, context_func, user)
|
|
987
|
+
context, ctx_args = self._compile_context(param_args, context_func, user, configurables)
|
|
466
988
|
|
|
467
|
-
|
|
989
|
+
self._compile_models(context, ctx_args, recurse)
|
|
468
990
|
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
placeholders = ctx_args._placeholders.copy()
|
|
991
|
+
self.placeholders = ctx_args._placeholders_copy
|
|
472
992
|
if runquery:
|
|
473
|
-
await self._run_models(
|
|
474
|
-
|
|
475
|
-
|
|
993
|
+
await self._run_models()
|
|
994
|
+
|
|
995
|
+
self.target_model.process_pass_through_columns(self.models_dict)
|
|
476
996
|
|
|
477
997
|
def get_all_query_models(self) -> set[str]:
|
|
478
998
|
all_model_names = set()
|
|
479
999
|
self.target_model.retrieve_dependent_query_models(all_model_names)
|
|
480
1000
|
return all_model_names
|
|
481
1001
|
|
|
482
|
-
def
|
|
483
|
-
|
|
484
|
-
|
|
1002
|
+
def get_all_data_models(self) -> list[rm.DataModelItem]:
|
|
1003
|
+
data_models = []
|
|
485
1004
|
for model_name, model in self.models_dict.items():
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
1005
|
+
is_queryable = model.is_queryable
|
|
1006
|
+
data_model = rm.DataModelItem(name=model_name, model_type=model.model_type.value, config=model.model_config, is_queryable=is_queryable)
|
|
1007
|
+
data_models.append(data_model)
|
|
1008
|
+
return data_models
|
|
1009
|
+
|
|
1010
|
+
def get_all_model_lineage(self) -> list[rm.LineageRelation]:
|
|
1011
|
+
model_lineage = []
|
|
1012
|
+
for model_name, model in self.models_dict.items():
|
|
1013
|
+
if not isinstance(model, QueryModel):
|
|
1014
|
+
continue
|
|
1015
|
+
for dep_model_name in model.model_config.depends_on:
|
|
1016
|
+
edge_type = "buildtime" if isinstance(model, BuildModel) else "runtime"
|
|
1017
|
+
source_model = rm.LineageNode(name=dep_model_name, type="model")
|
|
1018
|
+
target_model = rm.LineageNode(name=model_name, type="model")
|
|
1019
|
+
model_lineage.append(rm.LineageRelation(type=edge_type, source=source_model, target=target_model))
|
|
1020
|
+
return model_lineage
|
|
1021
|
+
|
|
1022
|
+
|
|
1023
|
+
class ModelsIO:
|
|
1024
|
+
|
|
1025
|
+
@classmethod
|
|
1026
|
+
def _load_model_config(cls, filepath: Path, model_type: ModelType, env_vars: dict[str, str]) -> mc.ModelConfig:
|
|
1027
|
+
yaml_path = filepath.with_suffix('.yml')
|
|
1028
|
+
config_dict = u.load_yaml_config(yaml_path) if yaml_path.exists() else {}
|
|
490
1029
|
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
1030
|
+
if model_type == ModelType.DBVIEW:
|
|
1031
|
+
config = mc.DbviewModelConfig(**config_dict).finalize_connection(env_vars)
|
|
1032
|
+
return config
|
|
1033
|
+
elif model_type == ModelType.FEDERATE:
|
|
1034
|
+
return mc.FederateModelConfig(**config_dict)
|
|
1035
|
+
elif model_type == ModelType.BUILD:
|
|
1036
|
+
return mc.BuildModelConfig(**config_dict)
|
|
1037
|
+
else:
|
|
1038
|
+
return mc.ModelConfig(**config_dict)
|
|
1039
|
+
|
|
1040
|
+
@classmethod
|
|
1041
|
+
def _populate_from_file(
|
|
1042
|
+
cls, raw_queries_by_model: dict[str, mq.QueryFileWithConfig], dp: str, file: str, model_type: ModelType, env_vars: dict[str, str]
|
|
1043
|
+
) -> None:
|
|
1044
|
+
filepath = Path(dp, file)
|
|
1045
|
+
file_stem, extension = os.path.splitext(file)
|
|
1046
|
+
|
|
1047
|
+
if extension == '.py':
|
|
1048
|
+
module = pm.PyModule(filepath)
|
|
1049
|
+
raw_query = module.get_func_or_class(c.MAIN_FUNC)
|
|
1050
|
+
query_file = mq.PyQueryFile(filepath.as_posix(), raw_query)
|
|
1051
|
+
elif extension == '.sql':
|
|
1052
|
+
query_file = mq.SqlQueryFile(filepath.as_posix(), filepath.read_text())
|
|
1053
|
+
else:
|
|
1054
|
+
return # Skip files that are not query files
|
|
1055
|
+
|
|
1056
|
+
if file_stem in raw_queries_by_model:
|
|
1057
|
+
assert isinstance(prior_query_file := raw_queries_by_model[file_stem].query_file, mq.QueryFile)
|
|
1058
|
+
conflicts = [prior_query_file.filepath, query_file.filepath]
|
|
1059
|
+
raise u.ConfigurationError(f"Multiple models found for '{file_stem}': {conflicts}")
|
|
495
1060
|
|
|
496
|
-
|
|
1061
|
+
model_config = cls._load_model_config(filepath, model_type, env_vars)
|
|
1062
|
+
raw_queries_by_model[file_stem] = mq.QueryFileWithConfig(query_file, model_config)
|
|
497
1063
|
|
|
1064
|
+
@classmethod
|
|
1065
|
+
def _populate_raw_queries_for_type(
|
|
1066
|
+
cls, folder_path: Path, model_type: ModelType, *, env_vars: dict[str, str] = {}
|
|
1067
|
+
) -> dict[str, mq.QueryFileWithConfig]:
|
|
1068
|
+
raw_queries_by_model: dict[str, mq.QueryFileWithConfig] = {}
|
|
1069
|
+
for dp, _, filenames in os.walk(folder_path):
|
|
1070
|
+
for file in filenames:
|
|
1071
|
+
cls._populate_from_file(raw_queries_by_model, dp, file, model_type, env_vars)
|
|
1072
|
+
return raw_queries_by_model
|
|
498
1073
|
|
|
499
|
-
|
|
1074
|
+
@classmethod
|
|
1075
|
+
def load_build_files(cls, logger: u.Logger, base_path: str) -> dict[str, mq.QueryFileWithConfig]:
|
|
1076
|
+
start = time.time()
|
|
1077
|
+
builds_path = u.Path(base_path, c.MODELS_FOLDER, c.BUILDS_FOLDER)
|
|
1078
|
+
raw_queries_by_model = cls._populate_raw_queries_for_type(builds_path, ModelType.BUILD)
|
|
1079
|
+
logger.log_activity_time("loading build files", start)
|
|
1080
|
+
return raw_queries_by_model
|
|
500
1081
|
|
|
501
1082
|
@classmethod
|
|
502
|
-
def
|
|
1083
|
+
def load_dbview_files(cls, logger: u.Logger, base_path: str, env_vars: dict[str, str]) -> dict[str, mq.QueryFileWithConfig]:
|
|
503
1084
|
start = time.time()
|
|
504
|
-
raw_queries_by_model: dict[str, QueryFile] = {}
|
|
505
|
-
|
|
506
|
-
def populate_from_file(dp: str, file: str, model_type: ModelType) -> None:
|
|
507
|
-
filepath = Path(dp, file)
|
|
508
|
-
file_stem, extension = os.path.splitext(file)
|
|
509
|
-
if extension == '.py':
|
|
510
|
-
module = pm.PyModule(filepath)
|
|
511
|
-
dependencies_func = module.get_func_or_class(c.DEP_FUNC, default_attr=lambda sqrl: [])
|
|
512
|
-
raw_query = _RawPyQuery(module.get_func_or_class(c.MAIN_FUNC), dependencies_func)
|
|
513
|
-
query_file = PyQueryFile(filepath.as_posix(), model_type, raw_query)
|
|
514
|
-
elif extension == '.sql':
|
|
515
|
-
query_file = SqlQueryFile(filepath.as_posix(), model_type, filepath.read_text())
|
|
516
|
-
else:
|
|
517
|
-
query_file = None
|
|
518
|
-
|
|
519
|
-
if query_file is not None:
|
|
520
|
-
if file_stem in raw_queries_by_model:
|
|
521
|
-
conflicts = [raw_queries_by_model[file_stem].filepath, filepath]
|
|
522
|
-
raise u.ConfigurationError(f"Multiple models found for '{file_stem}': {conflicts}")
|
|
523
|
-
raw_queries_by_model[file_stem] = query_file
|
|
524
|
-
|
|
525
|
-
def populate_raw_queries_for_type(folder_path: Path, model_type: ModelType) -> None:
|
|
526
|
-
for dp, _, filenames in os.walk(folder_path):
|
|
527
|
-
for file in filenames:
|
|
528
|
-
populate_from_file(dp, file, model_type)
|
|
529
|
-
|
|
530
1085
|
dbviews_path = u.Path(base_path, c.MODELS_FOLDER, c.DBVIEWS_FOLDER)
|
|
531
|
-
|
|
1086
|
+
raw_queries_by_model = cls._populate_raw_queries_for_type(dbviews_path, ModelType.DBVIEW, env_vars=env_vars)
|
|
1087
|
+
logger.log_activity_time("loading dbview files", start)
|
|
1088
|
+
return raw_queries_by_model
|
|
532
1089
|
|
|
1090
|
+
@classmethod
|
|
1091
|
+
def load_federate_files(cls, logger: u.Logger, base_path: str) -> dict[str, mq.QueryFileWithConfig]:
|
|
1092
|
+
start = time.time()
|
|
533
1093
|
federates_path = u.Path(base_path, c.MODELS_FOLDER, c.FEDERATES_FOLDER)
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
logger.log_activity_time("loading files for models", start)
|
|
1094
|
+
raw_queries_by_model = cls._populate_raw_queries_for_type(federates_path, ModelType.FEDERATE)
|
|
1095
|
+
logger.log_activity_time("loading federate files", start)
|
|
537
1096
|
return raw_queries_by_model
|
|
538
1097
|
|
|
539
1098
|
@classmethod
|