squirrels 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of squirrels might be problematic. Click here for more details.

Files changed (125)
  1. dateutils/__init__.py +6 -0
  2. dateutils/_enums.py +25 -0
  3. squirrels/dateutils.py → dateutils/_implementation.py +58 -111
  4. dateutils/types.py +6 -0
  5. squirrels/__init__.py +13 -11
  6. squirrels/_api_routes/__init__.py +5 -0
  7. squirrels/_api_routes/auth.py +271 -0
  8. squirrels/_api_routes/base.py +165 -0
  9. squirrels/_api_routes/dashboards.py +150 -0
  10. squirrels/_api_routes/data_management.py +145 -0
  11. squirrels/_api_routes/datasets.py +257 -0
  12. squirrels/_api_routes/oauth2.py +298 -0
  13. squirrels/_api_routes/project.py +252 -0
  14. squirrels/_api_server.py +256 -450
  15. squirrels/_arguments/__init__.py +0 -0
  16. squirrels/_arguments/init_time_args.py +108 -0
  17. squirrels/_arguments/run_time_args.py +147 -0
  18. squirrels/_auth.py +960 -0
  19. squirrels/_command_line.py +126 -45
  20. squirrels/_compile_prompts.py +147 -0
  21. squirrels/_connection_set.py +48 -26
  22. squirrels/_constants.py +68 -38
  23. squirrels/_dashboards.py +160 -0
  24. squirrels/_data_sources.py +570 -0
  25. squirrels/_dataset_types.py +84 -0
  26. squirrels/_exceptions.py +29 -0
  27. squirrels/_initializer.py +177 -80
  28. squirrels/_logging.py +115 -0
  29. squirrels/_manifest.py +208 -79
  30. squirrels/_model_builder.py +69 -0
  31. squirrels/_model_configs.py +74 -0
  32. squirrels/_model_queries.py +52 -0
  33. squirrels/_models.py +926 -367
  34. squirrels/_package_data/base_project/.env +42 -0
  35. squirrels/_package_data/base_project/.env.example +42 -0
  36. squirrels/_package_data/base_project/assets/expenses.db +0 -0
  37. squirrels/_package_data/base_project/connections.yml +16 -0
  38. squirrels/_package_data/base_project/dashboards/dashboard_example.py +34 -0
  39. squirrels/_package_data/base_project/dashboards/dashboard_example.yml +22 -0
  40. squirrels/{package_data → _package_data}/base_project/docker/.dockerignore +5 -2
  41. squirrels/{package_data → _package_data}/base_project/docker/Dockerfile +3 -3
  42. squirrels/{package_data → _package_data}/base_project/docker/compose.yml +1 -1
  43. squirrels/_package_data/base_project/duckdb_init.sql +10 -0
  44. squirrels/{package_data/base_project/.gitignore → _package_data/base_project/gitignore} +3 -2
  45. squirrels/_package_data/base_project/macros/macros_example.sql +17 -0
  46. squirrels/_package_data/base_project/models/builds/build_example.py +26 -0
  47. squirrels/_package_data/base_project/models/builds/build_example.sql +16 -0
  48. squirrels/_package_data/base_project/models/builds/build_example.yml +57 -0
  49. squirrels/_package_data/base_project/models/dbviews/dbview_example.sql +12 -0
  50. squirrels/_package_data/base_project/models/dbviews/dbview_example.yml +26 -0
  51. squirrels/_package_data/base_project/models/federates/federate_example.py +37 -0
  52. squirrels/_package_data/base_project/models/federates/federate_example.sql +19 -0
  53. squirrels/_package_data/base_project/models/federates/federate_example.yml +65 -0
  54. squirrels/_package_data/base_project/models/sources.yml +38 -0
  55. squirrels/{package_data → _package_data}/base_project/parameters.yml +56 -40
  56. squirrels/_package_data/base_project/pyconfigs/connections.py +14 -0
  57. squirrels/{package_data → _package_data}/base_project/pyconfigs/context.py +21 -40
  58. squirrels/_package_data/base_project/pyconfigs/parameters.py +141 -0
  59. squirrels/_package_data/base_project/pyconfigs/user.py +44 -0
  60. squirrels/_package_data/base_project/seeds/seed_categories.yml +15 -0
  61. squirrels/_package_data/base_project/seeds/seed_subcategories.csv +15 -0
  62. squirrels/_package_data/base_project/seeds/seed_subcategories.yml +21 -0
  63. squirrels/_package_data/base_project/squirrels.yml.j2 +61 -0
  64. squirrels/_package_data/templates/dataset_results.html +112 -0
  65. squirrels/_package_data/templates/oauth_login.html +271 -0
  66. squirrels/_package_data/templates/squirrels_studio.html +20 -0
  67. squirrels/_package_loader.py +8 -4
  68. squirrels/_parameter_configs.py +104 -103
  69. squirrels/_parameter_options.py +348 -0
  70. squirrels/_parameter_sets.py +57 -47
  71. squirrels/_parameters.py +1664 -0
  72. squirrels/_project.py +721 -0
  73. squirrels/_py_module.py +7 -5
  74. squirrels/_schemas/__init__.py +0 -0
  75. squirrels/_schemas/auth_models.py +167 -0
  76. squirrels/_schemas/query_param_models.py +75 -0
  77. squirrels/{_api_response_models.py → _schemas/response_models.py} +126 -47
  78. squirrels/_seeds.py +35 -16
  79. squirrels/_sources.py +110 -0
  80. squirrels/_utils.py +248 -73
  81. squirrels/_version.py +1 -1
  82. squirrels/arguments.py +7 -0
  83. squirrels/auth.py +4 -0
  84. squirrels/connections.py +3 -0
  85. squirrels/dashboards.py +2 -81
  86. squirrels/data_sources.py +14 -631
  87. squirrels/parameter_options.py +13 -348
  88. squirrels/parameters.py +14 -1266
  89. squirrels/types.py +16 -0
  90. squirrels-0.5.0.dist-info/METADATA +113 -0
  91. squirrels-0.5.0.dist-info/RECORD +97 -0
  92. {squirrels-0.4.1.dist-info → squirrels-0.5.0.dist-info}/WHEEL +1 -1
  93. squirrels-0.5.0.dist-info/entry_points.txt +3 -0
  94. {squirrels-0.4.1.dist-info → squirrels-0.5.0.dist-info/licenses}/LICENSE +1 -1
  95. squirrels/_authenticator.py +0 -85
  96. squirrels/_dashboards_io.py +0 -61
  97. squirrels/_environcfg.py +0 -84
  98. squirrels/arguments/init_time_args.py +0 -40
  99. squirrels/arguments/run_time_args.py +0 -208
  100. squirrels/package_data/assets/favicon.ico +0 -0
  101. squirrels/package_data/assets/index.css +0 -1
  102. squirrels/package_data/assets/index.js +0 -58
  103. squirrels/package_data/base_project/assets/expenses.db +0 -0
  104. squirrels/package_data/base_project/connections.yml +0 -7
  105. squirrels/package_data/base_project/dashboards/dashboard_example.py +0 -32
  106. squirrels/package_data/base_project/dashboards.yml +0 -10
  107. squirrels/package_data/base_project/env.yml +0 -29
  108. squirrels/package_data/base_project/models/dbviews/dbview_example.py +0 -47
  109. squirrels/package_data/base_project/models/dbviews/dbview_example.sql +0 -22
  110. squirrels/package_data/base_project/models/federates/federate_example.py +0 -21
  111. squirrels/package_data/base_project/models/federates/federate_example.sql +0 -3
  112. squirrels/package_data/base_project/pyconfigs/auth.py +0 -45
  113. squirrels/package_data/base_project/pyconfigs/connections.py +0 -19
  114. squirrels/package_data/base_project/pyconfigs/parameters.py +0 -95
  115. squirrels/package_data/base_project/seeds/seed_subcategories.csv +0 -15
  116. squirrels/package_data/base_project/squirrels.yml.j2 +0 -94
  117. squirrels/package_data/templates/index.html +0 -18
  118. squirrels/project.py +0 -378
  119. squirrels/user_base.py +0 -55
  120. squirrels-0.4.1.dist-info/METADATA +0 -117
  121. squirrels-0.4.1.dist-info/RECORD +0 -60
  122. squirrels-0.4.1.dist-info/entry_points.txt +0 -4
  123. /squirrels/{package_data → _package_data}/base_project/assets/weather.db +0 -0
  124. /squirrels/{package_data → _package_data}/base_project/seeds/seed_categories.csv +0 -0
  125. /squirrels/{package_data → _package_data}/base_project/tmp/.gitignore +0 -0
squirrels/_models.py CHANGED
@@ -1,147 +1,121 @@
1
1
  from __future__ import annotations
2
- from typing import Iterable, Callable, Any
3
- from dataclasses import dataclass, field
2
+ from typing import Callable, Any
3
+ from dataclasses import dataclass, field, KW_ONLY
4
4
  from abc import ABCMeta, abstractmethod
5
5
  from enum import Enum
6
6
  from pathlib import Path
7
- from sqlalchemy import create_engine, text, Connection
8
- import asyncio, os, time, pandas as pd, networkx as nx
9
-
10
- from . import _constants as c, _utils as u, _py_module as pm
11
- from .arguments.run_time_args import ContextArgs, ModelDepsArgs, ModelArgs
12
- from ._authenticator import User
13
- from ._connection_set import ConnectionSet
14
- from ._manifest import ManifestConfig, DatasetConfig
7
+ import asyncio, os, re, time, duckdb, sqlglot
8
+ import polars as pl, pandas as pd
9
+
10
+ from . import _constants as c, _utils as u, _py_module as pm, _model_queries as mq, _model_configs as mc, _sources as src
11
+ from ._schemas import response_models as rm
12
+ from ._exceptions import FileExecutionError, InvalidInputError
13
+ from ._arguments.run_time_args import ContextArgs, ModelArgs, BuildModelArgs
14
+ from ._auth import AbstractUser
15
+ from ._connection_set import ConnectionsArgs, ConnectionSet, ConnectionProperties
16
+ from ._manifest import DatasetConfig, ConnectionTypeEnum
15
17
  from ._parameter_sets import ParameterConfigsSet, ParametersArgs, ParameterSet
16
18
 
17
19
  ContextFunc = Callable[[dict[str, Any], ContextArgs], None]
18
20
 
19
21
 
20
22
  class ModelType(Enum):
21
- DBVIEW = 1
22
- FEDERATE = 2
23
- SEED = 3
24
-
25
- class _Materialization(Enum):
26
- TABLE = 0
27
- VIEW = 1
23
+ SEED = "seed"
24
+ SOURCE = "source"
25
+ BUILD = "build"
26
+ DBVIEW = "dbview"
27
+ FEDERATE = "federate"
28
28
 
29
29
 
30
30
  @dataclass
31
- class _SqlModelConfig:
32
- ## Applicable for dbview models
33
- connection_name: str
34
-
35
- ## Applicable for federated models
36
- materialized: _Materialization
37
-
38
- def set_attribute(self, *, connection_name: str | None = None, materialized: str | None = None, **kwargs) -> str:
39
- if connection_name is not None:
40
- if not isinstance(connection_name, str):
41
- raise u.ConfigurationError("The 'connection_name' argument of 'config' macro must be a string")
42
- self.connection_name = connection_name
43
-
44
- if materialized is not None:
45
- if not isinstance(materialized, str):
46
- raise u.ConfigurationError("The 'materialized' argument of 'config' macro must be a string")
47
- try:
48
- self.materialized = _Materialization[materialized.upper()]
49
- except KeyError as e:
50
- valid_options = [x.name for x in _Materialization]
51
- raise u.ConfigurationError(f"The 'materialized' argument value '{materialized}' is not valid. Must be one of: {valid_options}") from e
52
- return ""
53
-
54
- def get_sql_for_create(self, model_name: str, select_query: str) -> str:
55
- create_prefix = f"CREATE {self.materialized.name} {model_name} AS\n"
56
- return create_prefix + select_query
57
-
58
-
59
- @dataclass(frozen=True)
60
- class QueryFile:
61
- filepath: str
62
- model_type: ModelType
63
-
64
- @dataclass(frozen=True)
65
- class SqlQueryFile(QueryFile):
66
- raw_query: str
67
-
68
- @dataclass(frozen=True)
69
- class _RawPyQuery:
70
- query: Callable[[ModelArgs], pd.DataFrame]
71
- dependencies_func: Callable[[ModelDepsArgs], Iterable[str]]
72
-
73
- @dataclass(frozen=True)
74
- class PyQueryFile(QueryFile):
75
- raw_query: _RawPyQuery
76
-
77
-
78
- @dataclass
79
- class _Query(metaclass=ABCMeta):
80
- query: Any
81
-
82
- @dataclass
83
- class _WorkInProgress(_Query):
84
- query: None = field(default=None, init=False)
85
-
86
- @dataclass
87
- class SqlModelQuery(_Query):
88
- query: str
89
- config: _SqlModelConfig
90
-
91
- @dataclass
92
- class PyModelQuery(_Query):
93
- query: Callable[[], pd.DataFrame]
94
-
95
-
96
- @dataclass
97
- class Referable(metaclass=ABCMeta):
31
+ class DataModel(metaclass=ABCMeta):
98
32
  name: str
33
+ model_config: mc.ModelConfig
99
34
  is_target: bool = field(default=False, init=False)
100
35
 
101
- needs_sql_table: bool = field(default=False, init=False)
102
- needs_pandas: bool = field(default=False, init=False)
103
- result: pd.DataFrame | None = field(default=None, init=False, repr=False)
36
+ result: pl.LazyFrame | None = field(default=None, init=False, repr=False)
37
+ needs_python_df: bool = field(default=False, init=False)
104
38
 
105
39
  wait_count: int = field(default=0, init=False, repr=False)
106
40
  confirmed_no_cycles: bool = field(default=False, init=False)
107
- upstreams: dict[str, Referable] = field(default_factory=dict, init=False, repr=False)
108
- downstreams: dict[str, Referable] = field(default_factory=dict, init=False, repr=False)
41
+ upstreams: dict[str, DataModel] = field(default_factory=dict, init=False, repr=False)
42
+ downstreams: dict[str, DataModel] = field(default_factory=dict, init=False, repr=False)
43
+
44
+ _: KW_ONLY
45
+ logger: u.Logger = field(default_factory=lambda: u.Logger(""))
46
+ env_vars: dict[str, str] = field(default_factory=dict)
47
+ conn_set: ConnectionSet = field(default_factory=ConnectionSet)
109
48
 
49
+ @property
110
50
  @abstractmethod
111
- def get_model_type(self) -> ModelType:
51
+ def model_type(self) -> ModelType:
112
52
  pass
113
53
 
114
- async def compile(
115
- self, ctx: dict[str, Any], ctx_args: ContextArgs, placeholders: dict[str, Any], models_dict: dict[str, Referable], recurse: bool
54
+ @property
55
+ def is_queryable(self) -> bool:
56
+ return True
57
+
58
+ def compile(
59
+ self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
116
60
  ) -> None:
117
61
  pass
118
62
 
119
- @abstractmethod
120
63
  def get_terminal_nodes(self, depencency_path: set[str]) -> set[str]:
121
- pass
64
+ if self.confirmed_no_cycles:
65
+ return set()
122
66
 
123
- def _load_pandas_to_table(self, df: pd.DataFrame, conn: Connection) -> None:
124
- df.to_sql(self.name, conn, index=False)
125
-
126
- def _load_table_to_pandas(self, conn: Connection) -> pd.DataFrame:
127
- query = f"SELECT * FROM {self.name}"
128
- return pd.read_sql(query, conn)
67
+ if self.name in depencency_path:
68
+ raise u.ConfigurationError(f'Cycle found in model dependency graph')
69
+
70
+ terminal_nodes = set()
71
+ if len(self.upstreams) == 0:
72
+ terminal_nodes.add(self.name)
73
+ else:
74
+ new_path = set(depencency_path)
75
+ new_path.add(self.name)
76
+ for dep_model in self.upstreams.values():
77
+ terminal_nodes.update(dep_model.get_terminal_nodes(new_path))
78
+
79
+ self.confirmed_no_cycles = True
80
+ return terminal_nodes
81
+
82
+ def _load_duckdb_view_to_python_df(self, conn: duckdb.DuckDBPyConnection, *, use_datalake: bool = False) -> pl.LazyFrame:
83
+ table_name = ("vdl." if use_datalake else "") + self.name
84
+ try:
85
+ return conn.sql(f"FROM {table_name}").pl().lazy()
86
+ except duckdb.CatalogException as e:
87
+ raise u.ConfigurationError(f'Failed to load duckdb table or view "{self.name}" to python dataframe') from e
88
+
89
+ def _run_sql_query_on_connection(self, connection_name: str, query: str, placeholders: dict = {}) -> pl.DataFrame:
90
+ self.logger.info(f"Running sql query on connection '{connection_name}': {query}")
91
+ return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)
129
92
 
130
- async def _trigger(self, conn: Connection, placeholders: dict = {}) -> None:
93
+ async def _trigger(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
131
94
  self.wait_count -= 1
132
95
  if (self.wait_count == 0):
133
96
  await self.run_model(conn, placeholders)
134
97
 
135
- @abstractmethod
136
- async def run_model(self, conn: Connection, placeholders: dict = {}) -> None:
98
+ async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
137
99
  coroutines = []
138
100
  for model in self.downstreams.values():
139
101
  coroutines.append(model._trigger(conn, placeholders))
140
- await asyncio.gather(*coroutines)
102
+ await u.asyncio_gather(coroutines)
141
103
 
142
104
  def retrieve_dependent_query_models(self, dependent_model_names: set[str]) -> None:
143
105
  pass
144
-
106
+
107
+ def _register_all_upstream_python_df_helper(self, conn: duckdb.DuckDBPyConnection, tables_set: set[str]) -> None:
108
+ if self.result is not None and self.name not in tables_set:
109
+ conn.register(self.name, self.result)
110
+ for dep_model in self.upstreams.values():
111
+ dep_model._register_all_upstream_python_df_helper(conn, tables_set)
112
+
113
+ def register_all_upstream_python_df(self, conn: duckdb.DuckDBPyConnection) -> None:
114
+ show_tables_query = f"SHOW TABLES"
115
+ tables_df = conn.sql(show_tables_query).pl()
116
+ tables_set = set(tables_df["name"])
117
+ self._register_all_upstream_python_df_helper(conn, tables_set)
118
+
145
119
  def get_max_path_length_to_target(self) -> int | None:
146
120
  if not hasattr(self, "max_path_len_to_target"):
147
121
  path_lengths = []
@@ -154,283 +128,819 @@ class Referable(metaclass=ABCMeta):
154
128
  self.max_path_len_to_target = 0 if self.is_target else None
155
129
  return self.max_path_len_to_target
156
130
 
131
+ async def _trigger_build(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
132
+ pass
133
+
134
+ def _create_table_from_df(self, conn: duckdb.DuckDBPyConnection, query_result: pl.LazyFrame | pd.DataFrame):
135
+ local_conn = conn.cursor()
136
+ # local_conn = conn
137
+ try:
138
+ assert query_result is not None
139
+ local_conn.execute(f"CREATE OR REPLACE TABLE {self.name} AS FROM query_result")
140
+ finally:
141
+ local_conn.close()
142
+ # pass
143
+
144
+ def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
145
+ pass
146
+
147
+
148
+ @dataclass
149
+ class StaticModel(DataModel):
150
+ needs_python_df_for_build: bool = field(default=False, init=False)
151
+ wait_count_for_build: int = field(default=0, init=False, repr=False)
152
+ upstreams_for_build: dict[str, StaticModel] = field(default_factory=dict, init=False, repr=False)
153
+ downstreams_for_build: dict[str, StaticModel] = field(default_factory=dict, init=False, repr=False)
154
+
155
+ def get_terminal_nodes_for_build(self, depencency_path: set[str]) -> set[str]:
156
+ if self.confirmed_no_cycles:
157
+ return set()
158
+
159
+ if self.name in depencency_path:
160
+ raise u.ConfigurationError(f'Cycle found in model dependency graph')
161
+
162
+ terminal_nodes = set()
163
+ if len(self.upstreams_for_build) == 0:
164
+ terminal_nodes.add(self.name)
165
+ else:
166
+ new_path = set(depencency_path)
167
+ new_path.add(self.name)
168
+ for dep_model in self.upstreams_for_build.values():
169
+ terminal_nodes.update(dep_model.get_terminal_nodes_for_build(new_path))
170
+
171
+ self.confirmed_no_cycles = True
172
+ return terminal_nodes
173
+
174
+ def _get_result(self, conn: duckdb.DuckDBPyConnection) -> pl.LazyFrame:
175
+ local_conn = conn.cursor()
176
+ try:
177
+ return self._load_duckdb_view_to_python_df(local_conn, use_datalake=True)
178
+ except Exception as e:
179
+ raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
180
+ finally:
181
+ local_conn.close()
182
+
183
+ async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
184
+ start = time.time()
185
+
186
+ if (self.needs_python_df or self.is_target) and self.result is None:
187
+ self.result = await asyncio.to_thread(self._get_result, conn)
188
+
189
+ self.logger.log_activity_time(f"loading static model '{self.name}'", start)
190
+
191
+ await super().run_model(conn, placeholders)
192
+
193
+ def compile_for_build(
194
+ self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
195
+ ) -> None:
196
+ pass
197
+
198
+ async def _trigger_build(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
199
+ self.wait_count_for_build -= 1
200
+ if (self.wait_count_for_build == 0):
201
+ await self.build_model(conn, full_refresh)
202
+
203
+ async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
204
+ if self.needs_python_df and self.result is None:
205
+ local_conn = conn.cursor()
206
+ try:
207
+ self.result = await asyncio.to_thread(self._load_duckdb_view_to_python_df, local_conn)
208
+ finally:
209
+ local_conn.close()
210
+
211
+ coroutines = []
212
+ for model in self.downstreams_for_build.values():
213
+ coroutines.append(model._trigger_build(conn, full_refresh))
214
+ await u.asyncio_gather(coroutines)
215
+
157
216
 
158
217
  @dataclass
159
- class Seed(Referable):
160
- result: pd.DataFrame
218
+ class Seed(StaticModel):
219
+ model_config: mc.SeedConfig
220
+ result: pl.LazyFrame
161
221
 
162
- def get_model_type(self) -> ModelType:
222
+ @property
223
+ def model_type(self) -> ModelType:
163
224
  return ModelType.SEED
225
+
226
+ async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
227
+ start = time.time()
164
228
 
165
- def get_terminal_nodes(self, depencency_path: set[str]) -> set[str]:
166
- return {self.name}
229
+ print(f"[{u.get_current_time()}] 🔨 BUILDING: seed model '{self.name}'")
230
+ # await asyncio.to_thread(self._create_table_from_df, conn, self.result)
231
+ self._create_table_from_df(conn, self.result) # without threading
232
+
233
+ print(f"[{u.get_current_time()}] ✅ FINISHED: seed model '{self.name}'")
234
+ self.logger.log_activity_time(f"building seed model '{self.name}' into VDL", start)
235
+
236
+ await super().build_model(conn, full_refresh)
237
+
238
+
239
+ @dataclass
240
+ class SourceModel(StaticModel):
241
+ model_config: src.Source
242
+
243
+ @property
244
+ def model_type(self) -> ModelType:
245
+ return ModelType.SOURCE
246
+
247
+ @property
248
+ def connection_props(self) -> ConnectionProperties:
249
+ conn_name = self.model_config.get_connection()
250
+ conn_props = self.conn_set.get_connection(conn_name)
251
+ if isinstance(conn_props, ConnectionProperties):
252
+ return conn_props
253
+ raise u.ConfigurationError(f'Unable to use connection "{conn_name}" for source "{self.name}". Connection "{conn_name}" must be a ConnectionProperties object')
167
254
 
168
- async def run_model(self, conn: Connection, placeholders: dict = {}) -> None:
169
- if self.needs_sql_table:
170
- await asyncio.to_thread(self._load_pandas_to_table, self.result, conn)
171
- await super().run_model(conn, placeholders)
255
+ @property
256
+ def is_queryable(self) -> bool:
257
+ connection_props = self.connection_props
258
+ return self.model_config.load_to_vdl or connection_props.type == ConnectionTypeEnum.DUCKDB
259
+
260
+ def _build_source_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
261
+ local_conn = conn.cursor()
262
+ # local_conn = conn
263
+
264
+ local_conn.begin()
265
+ try:
266
+ source = self.model_config
267
+ conn_name = source.get_connection()
268
+
269
+ connection_props = self.connection_props
270
+ dialect = connection_props.dialect
271
+ attach_uri = connection_props.attach_uri_for_duckdb
272
+ if attach_uri is None:
273
+ raise u.ConfigurationError(f'Loading to duckdb is not supported for source "{self.name}" since its connection "{conn_name}" uses an unsupported dialect')
274
+
275
+ result = u.run_duckdb_stmt(self.logger, local_conn, f"FROM (SHOW DATABASES) WHERE database_name = 'db_{conn_name}'").fetchone()
276
+ if result is None:
277
+ return # skip this source if connection is not attached
278
+
279
+ table_name = source.get_table()
280
+ new_table_name = self.name
281
+
282
+ if len(source.columns) == 0:
283
+ stmt = f"CREATE OR REPLACE TABLE {new_table_name} AS FROM db_{conn_name}.{table_name}"
284
+ u.run_duckdb_stmt(self.logger, local_conn, stmt)
285
+ local_conn.commit()
286
+ return
287
+
288
+ increasing_column = source.update_hints.increasing_column
289
+ recreate_table = full_refresh or increasing_column is None
290
+ if recreate_table:
291
+ u.run_duckdb_stmt(self.logger, local_conn, f"DROP TABLE IF EXISTS {new_table_name}")
292
+
293
+ create_table_cols_clause = source.get_cols_for_create_table_stmt()
294
+ stmt = f"CREATE TABLE IF NOT EXISTS {new_table_name} ({create_table_cols_clause})"
295
+ u.run_duckdb_stmt(self.logger, local_conn, stmt)
296
+
297
+ if not recreate_table:
298
+ if source.update_hints.selective_overwrite_value is not None:
299
+ stmt = f"DELETE FROM {new_table_name} WHERE {increasing_column} >= $value"
300
+ u.run_duckdb_stmt(self.logger, local_conn, stmt, params={"value": source.update_hints.selective_overwrite_value})
301
+ elif not source.update_hints.strictly_increasing:
302
+ stmt = f"DELETE FROM {new_table_name} WHERE {increasing_column} = ({source.get_max_incr_col_query(new_table_name)})"
303
+ u.run_duckdb_stmt(self.logger, local_conn, stmt)
304
+
305
+ max_val_of_incr_col = None
306
+ if increasing_column is not None:
307
+ max_val_of_incr_col_tuple = u.run_duckdb_stmt(self.logger, local_conn, source.get_max_incr_col_query(new_table_name)).fetchone()
308
+ max_val_of_incr_col = max_val_of_incr_col_tuple[0] if isinstance(max_val_of_incr_col_tuple, tuple) else None
309
+ if max_val_of_incr_col is None:
310
+ recreate_table = True
311
+
312
+ query = source.get_query_for_upsert(dialect, conn_name, table_name, max_val_of_incr_col, full_refresh=recreate_table)
313
+
314
+ primary_keys = ", ".join(source.primary_key) if source.primary_key else ""
315
+ match_condition = f"USING ({primary_keys})" if primary_keys else "ON false"
316
+ stmt = (
317
+ f"MERGE INTO {new_table_name} "
318
+ f"USING ({query}) AS src "
319
+ f"{match_condition} "
320
+ f"WHEN MATCHED THEN UPDATE "
321
+ f"WHEN NOT MATCHED THEN INSERT BY NAME"
322
+ )
323
+ u.run_duckdb_stmt(self.logger, local_conn, stmt)
324
+
325
+ local_conn.commit()
326
+
327
+ finally:
328
+ local_conn.close()
329
+ # pass
330
+
331
+ async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
332
+ if self.model_config.load_to_vdl:
333
+ start = time.time()
334
+ print(f"[{u.get_current_time()}] 🔨 BUILDING: source model '{self.name}'")
335
+
336
+ # await asyncio.to_thread(self._build_source_model, conn, full_refresh)
337
+ self._build_source_model(conn, full_refresh) # without threading
338
+
339
+ print(f"[{u.get_current_time()}] ✅ FINISHED: source model '{self.name}'")
340
+ self.logger.log_activity_time(f"building source model '{self.name}' into VDL", start)
172
341
 
342
+ await super().build_model(conn, full_refresh)
343
+
173
344
 
174
345
  @dataclass
175
- class Model(Referable):
176
- query_file: QueryFile
177
- manifest_cfg: ManifestConfig
178
- conn_set: ConnectionSet
179
- logger: u.Logger = field(default_factory=lambda: u.Logger(""))
346
+ class QueryModel(DataModel):
347
+ model_config: mc.QueryModelConfig
348
+ query_file: mq.QueryFile
349
+ compiled_query: mq.Query | None = field(default=None, init=False)
350
+ _: KW_ONLY
180
351
  j2_env: u.j2.Environment = field(default_factory=lambda: u.j2.Environment(loader=u.j2.FileSystemLoader(".")))
181
- compiled_query: _Query | None = field(default=None, init=False)
182
352
 
183
- def get_model_type(self) -> ModelType:
184
- return self.query_file.model_type
185
-
186
- def _add_upstream(self, other: Referable) -> None:
353
+ def _add_upstream(self, other: DataModel) -> None:
187
354
  self.upstreams[other.name] = other
188
355
  other.downstreams[self.name] = self
189
356
 
190
- if isinstance(self.query_file, SqlQueryFile):
191
- other.needs_sql_table = True
192
- elif isinstance(self.query_file, PyQueryFile):
193
- other.needs_pandas = True
194
-
195
- def _get_dbview_conn_name(self) -> str:
196
- dbview_config = self.manifest_cfg.dbviews.get(self.name)
197
- if dbview_config is None or dbview_config.connection_name is None:
198
- return self.manifest_cfg.settings.get(c.DB_CONN_DEFAULT_USED_SETTING, c.DEFAULT_DB_CONN)
199
- return dbview_config.connection_name
200
-
201
- def _get_materialized(self) -> _Materialization:
202
- federate_config = self.manifest_cfg.federates.get(self.name)
203
- if federate_config is None or federate_config.materialized is None:
204
- materialized = self.manifest_cfg.settings.get(c.DEFAULT_MATERIALIZE_SETTING, c.DEFAULT_MATERIALIZE)
205
- else:
206
- materialized = federate_config.materialized
207
- return _Materialization[materialized.upper()]
208
-
209
- async def _compile_sql_model(
210
- self, ctx: dict[str, Any], ctx_args: ContextArgs, placeholders: dict[str, Any], models_dict: dict[str, Referable]
211
- ) -> tuple[SqlModelQuery, set]:
212
- assert isinstance(self.query_file, SqlQueryFile)
213
-
214
- connection_name = self._get_dbview_conn_name()
215
- materialized = self._get_materialized()
216
- configuration = _SqlModelConfig(connection_name, materialized)
217
- is_placeholder = lambda placeholder: placeholder in placeholders
357
+ if isinstance(self.query_file, mq.PyQueryFile):
358
+ other.needs_python_df = True
359
+
360
+ def _ref_for_sql(self, dependent_model_name: str, models_dict: dict[str, DataModel]) -> str:
361
+ if dependent_model_name not in models_dict:
362
+ raise u.ConfigurationError(f'Model "{self.name}" references unknown model "{dependent_model_name}"')
363
+
364
+ dep_model = models_dict[dependent_model_name]
365
+ if isinstance(dep_model, SourceModel) and not dep_model.model_config.load_to_vdl:
366
+ # Allow when caller is Build or Federate AND the source connection is duckdb; else error
367
+ conn_name = dep_model.model_config.get_connection()
368
+ conn_props = self.conn_set.get_connection(conn_name)
369
+ is_duckdb_conn = isinstance(conn_props, ConnectionProperties) and conn_props.type == ConnectionTypeEnum.DUCKDB
370
+ if not is_duckdb_conn:
371
+ raise u.ConfigurationError(
372
+ f'Model "{self.name}" cannot reference source model "{dependent_model_name}". '
373
+ 'To be referenced by a build or federate model, the source must have load_to_vdl=True or a duckdb connection type.'
374
+ )
375
+
376
+ self.model_config.depends_on.add(dependent_model_name)
377
+ return dependent_model_name
378
+
379
+ def _ref_for_python(self, dependent_model_name: str) -> pl.LazyFrame:
380
+ if dependent_model_name not in self.upstreams:
381
+ raise u.ConfigurationError(f'Model "{self.name}" must include model "{dependent_model_name}" as a dependency to use')
382
+ df = self.upstreams[dependent_model_name].result
383
+ assert df is not None
384
+ return df
385
+
386
+ def _get_compile_sql_model_args_from_ctx_args(
387
+ self, ctx: dict[str, Any], ctx_args: ContextArgs
388
+ ) -> dict[str, Any]:
389
+ is_placeholder = lambda placeholder: placeholder in ctx_args._placeholders_copy
218
390
  kwargs = {
219
391
  "proj_vars": ctx_args.proj_vars, "env_vars": ctx_args.env_vars, "user": ctx_args.user, "prms": ctx_args.prms,
220
- "traits": ctx_args.traits, "ctx": ctx, "is_placeholder": is_placeholder, "set_placeholder": ctx_args.set_placeholder,
221
- "config": configuration.set_attribute, "param_exists": ctx_args.param_exists
392
+ "configurables": ctx_args.configurables, "ctx": ctx, "is_placeholder": is_placeholder, "set_placeholder": ctx_args.set_placeholder,
393
+ "param_exists": ctx_args.param_exists
222
394
  }
223
- dependencies = set()
224
- if self.query_file.model_type == ModelType.FEDERATE:
225
- def ref(dependent_model_name):
226
- if dependent_model_name not in models_dict:
227
- raise u.ConfigurationError(f'Model "{self.name}" references unknown model "{dependent_model_name}"')
228
- dependencies.add(dependent_model_name)
229
- return dependent_model_name
230
- kwargs["ref"] = ref
231
-
395
+ return kwargs
396
+
397
+ def _get_compiled_sql_query_str(self, raw_query: str, kwargs: dict[str, Any]) -> str:
232
398
  try:
233
- template = self.j2_env.from_string(self.query_file.raw_query)
234
- query = await asyncio.to_thread(template.render, kwargs)
399
+ template = self.j2_env.from_string(raw_query)
400
+ query = template.render(kwargs)
235
401
  except Exception as e:
236
- raise u.FileExecutionError(f'Failed to compile sql model "{self.name}"', e) from e
402
+ raise FileExecutionError(f'Failed to compile sql model "{self.name}"', e) from e
403
+ return query
404
+
405
def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
    """Fill in metadata of pass-through columns from their upstream columns.

    Runs in three phases: validate each pass-through column and register its
    upstream model as a dependency, process every dependency first, then copy
    the upstream column's type/condition/description/category into any field
    that still holds its empty value. The work is done once per model.

    Raises ``u.ConfigurationError`` when a pass-through column does not have
    exactly one ``depends_on`` value, or names an unknown model or column.
    """
    if getattr(self, "processed_pass_through_columns", False):
        return

    config = self.model_config

    # Phase 1: validate the pass-through declarations and record the dependencies
    for column in config.columns:
        if not column.pass_through:
            continue

        if len(column.depends_on) != 1:
            raise u.ConfigurationError(
                f'Column "{self.name}.{column.name}" has pass_through=true, which must have exactly one depends_on value'
            )

        reference = next(iter(column.depends_on))
        upstream_name, _ = reference.split('.')
        config.depends_on.add(upstream_name)

        if upstream_name not in models_dict:
            raise u.ConfigurationError(
                f'Column "{self.name}.{column.name}" depends on unknown model "{upstream_name}"'
            )

    # Phase 2: resolve the dependencies first.
    # Do not rely on self.upstreams here, as it may not be fully populated for metadata passthrough purposes
    for upstream_name in config.depends_on:
        models_dict[upstream_name].process_pass_through_columns(models_dict)

    # Phase 3: copy metadata from the upstream column wherever this column left it unset
    for column in config.columns:
        if not column.pass_through:
            continue

        reference = next(iter(column.depends_on))
        upstream_name, upstream_col_name = reference.split('.')
        candidates = models_dict[upstream_name].model_config.columns

        source_column = None
        for candidate in candidates:
            if candidate.name == upstream_col_name:
                source_column = candidate
                break
        if source_column is None:
            raise u.ConfigurationError(
                f'Column "{self.name}.{column.name}" depends on unknown column "{reference}"'
            )

        if column.type == "":
            column.type = source_column.type
        if column.condition == "":
            column.condition = source_column.condition
        if column.description == "":
            column.description = source_column.description
        if column.category == mc.ColumnCategory.MISC:
            column.category = source_column.category

    self.processed_pass_through_columns = True
456
+
457
def retrieve_dependent_query_models(self, dependent_model_names: set[str]) -> None:
    """Add this model's name and, recursively, its upstreams' names to the given set.

    A model whose name is already in the set is not visited again.
    """
    if self.name in dependent_model_names:
        return

    dependent_model_names.add(self.name)
    for upstream in self.upstreams.values():
        upstream.retrieve_dependent_query_models(dependent_model_names)
240
462
 
241
- async def _compile_python_model(
242
- self, ctx: dict[str, Any], ctx_args: ContextArgs, placeholders: dict[str, Any], models_dict: dict[str, Referable]
243
- ) -> tuple[PyModelQuery, Iterable]:
244
- assert isinstance(self.query_file, PyQueryFile)
463
+ def _log_sql_to_run(self, sql: str, placeholders: dict[str, Any]) -> None:
464
+ log_msg = f"SQL to run for model '{self.name}':\n{sql}"
465
+ log_msg += f"\n\n(with placeholders: {placeholders})"
466
+ self.logger.debug(log_msg)
467
+
468
+
469
@dataclass
class DbviewModel(QueryModel):
    """A SQL model that runs either on its configured connection or, translated, on DuckDB."""
    model_config: mc.DbviewModelConfig
    query_file: mq.SqlQueryFile
    # None until compile() runs; temporarily a WorkInProgress marker while compiling
    compiled_query: mq.SqlModelQuery | None = field(default=None, init=False)
    # Config objects of the sources referenced through the source()/ref() macro, keyed by source name
    sources: dict[str, src.Source] = field(default_factory=dict, init=False)

    @property
    def model_type(self) -> ModelType:
        return ModelType.DBVIEW

    def _get_compile_sql_model_args(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
    ) -> dict[str, Any]:
        """Return the Jinja variables for the first rendering pass of this dbview.

        Adds a ``source`` macro (also exposed as ``ref``) that validates the
        referenced source, records it on this model, and re-emits a
        ``{{ source("...") }}`` marker to be resolved by a later pass.
        """
        kwargs = self._get_compile_sql_model_args_from_ctx_args(ctx, ctx_args)

        def source(source_name: str) -> str:
            if source_name not in models_dict or not isinstance(source_model := models_dict[source_name], SourceModel):
                raise u.ConfigurationError(f'Dbview "{self.name}" references unknown source "{source_name}"')
            if source_model.model_config.get_connection() != self.model_config.get_connection():
                raise u.ConfigurationError(f'Dbview "{self.name}" references source "{source_name}" with different connection')

            # Check if the source model has load_to_vdl=False but this dbview has translate_to_duckdb=True
            if not source_model.model_config.load_to_vdl and self.model_config.translate_to_duckdb:
                raise u.ConfigurationError(
                    f'Dbview "{self.name}" with translate_to_duckdb=True cannot reference source "{source_name}" '
                    f'which has load_to_vdl=False'
                )

            self.model_config.depends_on.add(source_name)
            self.sources[source_name] = source_model.model_config
            return "{{ source(\"" + source_name + "\") }}"

        kwargs["source"] = source
        kwargs["ref"] = source
        return kwargs

    def _get_duckdb_query(self, read_dialect: str, query: str) -> str:
        """Resolve ``source`` markers to ``vdl.<name>`` and transpile the query to DuckDB SQL."""
        kwargs = {
            "source": lambda source_name: "vdl." + source_name
        }
        compiled_query = self._get_compiled_sql_query_str(query, kwargs)
        duckdb_query = sqlglot.transpile(compiled_query, read=read_dialect, write="duckdb", pretty=True)[0]
        return "-- translated to duckdb\n" + duckdb_query

    def _compile_sql_model(self, kwargs: dict[str, Any]) -> mq.SqlModelQuery:
        """Render the query file and resolve its ``source`` markers.

        With ``translate_to_duckdb`` set and a ``ConnectionProperties``
        connection, the query is translated to DuckDB; a DuckDB connection is
        rejected with ``u.ConfigurationError``. Otherwise each marker becomes
        the table of the recorded source.
        """
        compiled_query_str = self._get_compiled_sql_query_str(self.query_file.raw_query, kwargs)

        connection_name = self.model_config.get_connection()
        connection_props = self.conn_set.get_connection(connection_name)

        if self.model_config.translate_to_duckdb and isinstance(connection_props, ConnectionProperties):
            # Forbid translate_to_duckdb when dbview connection is duckdb
            if connection_props.type == ConnectionTypeEnum.DUCKDB:
                raise u.ConfigurationError(
                    f'Dbview "{self.name}" has translate_to_duckdb=True but its connection is duckdb. Use a federate model instead.'
                )
            # _get_duckdb_query resolves the source markers itself, so no separate pass is needed here
            compiled_query_str = self._get_duckdb_query(connection_props.dialect, compiled_query_str)
            is_duckdb = True
        else:
            macros = {
                "source": lambda source_name: self.sources[source_name].get_table()
            }
            compiled_query_str = self._get_compiled_sql_query_str(compiled_query_str, macros)
            is_duckdb = False

        return mq.SqlModelQuery(compiled_query_str, is_duckdb)

    def compile(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
    ) -> None:
        """Compile this dbview once; later calls return immediately."""
        if self.compiled_query is not None:
            return
        else:
            # Marks the model as being compiled so a re-entrant call returns early
            self.compiled_query = mq.WorkInProgress() # type: ignore

        start = time.time()

        kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
        self.compiled_query = self._compile_sql_model(kwargs)

        self.logger.log_activity_time(f"compiling dbview model '{self.name}'", start)

    async def _run_sql_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        """Run the compiled query in a worker thread and store the result lazily in ``self.result``.

        Raises ``InvalidInputError`` when DuckDB reports a catalog error, and
        ``FileExecutionError`` for a ``RuntimeError`` from either execution path.
        """
        assert self.compiled_query is not None
        is_duckdb = self.compiled_query.is_duckdb
        query = self.compiled_query.query
        connection_name = self.model_config.get_connection()

        def run_sql_query_on_connection(is_duckdb: bool, query: str, placeholders: dict) -> pl.DataFrame:
            try:
                if is_duckdb:
                    local_conn = conn.cursor()
                    try:
                        self.logger.info(f"Running dbview '{self.name}' on duckdb")
                        # NOTE(review): FederateModel filters placeholders down to those used in the
                        # query before executing on DuckDB; here they are passed unfiltered - confirm.
                        return local_conn.sql(query, params=placeholders).pl()
                    except duckdb.CatalogException as e:
                        raise InvalidInputError(409, 'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.') from e
                    except Exception as e:
                        # Normalized to RuntimeError so the outer handler wraps it
                        raise RuntimeError(e) from e
                    finally:
                        local_conn.close()
                else:
                    self.logger.info(f"Running dbview '{self.name}' on connection: {connection_name}")
                    return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)
            except RuntimeError as e:
                raise FileExecutionError(f'Failed to run dbview sql model "{self.name}"', e) from e

        self._log_sql_to_run(query, placeholders)
        result = await asyncio.to_thread(run_sql_query_on_connection, is_duckdb, query, placeholders)
        self.result = result.lazy()

    async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        """Run the dbview, log the elapsed time, then delegate to the parent ``run_model``."""
        start = time.time()

        await self._run_sql_model(conn, placeholders)

        self.logger.log_activity_time(f"running dbview model '{self.name}'", start)

        await super().run_model(conn, placeholders)
594
+
595
+
596
@dataclass
class FederateModel(QueryModel):
    """A SQL or Python model that is compiled per request and run against DuckDB or in Python."""
    model_config: mc.FederateModelConfig
    query_file: mq.SqlQueryFile | mq.PyQueryFile
    # None until compile() runs; temporarily a WorkInProgress marker while compiling
    compiled_query: mq.SqlModelQuery | mq.PyModelQuery | None = field(default=None, init=False)

    @property
    def model_type(self) -> ModelType:
        return ModelType.FEDERATE

    def _get_compile_sql_model_args(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
    ) -> dict[str, Any]:
        """Return the Jinja variables for a federate SQL model, including its ``ref`` macro.

        ``ref`` maps a build model, or a source with ``load_to_vdl`` set, to
        ``vdl.<model>``; a source without it to ``db_<connection>.<table>``;
        and any other model to its own name.
        """
        kwargs = self._get_compile_sql_model_args_from_ctx_args(ctx, ctx_args)

        def ref(dependent_model_name: str) -> str:
            dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
            dep = models_dict[dependent_model]
            if isinstance(dep, BuildModel):
                return "vdl." + dependent_model
            if isinstance(dep, SourceModel):
                if dep.model_config.load_to_vdl:
                    return "vdl." + dependent_model
                conn_name = dep.model_config.get_connection()
                table_name = dep.model_config.get_table()
                return f"db_{conn_name}.{table_name}"
            return dependent_model

        kwargs["ref"] = ref
        return kwargs

    def _compile_sql_model(
        self, query_file: mq.SqlQueryFile, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
    ) -> mq.SqlModelQuery:
        """Render the SQL query file into a query flagged as DuckDB SQL."""
        kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
        compiled_query_str = self._get_compiled_sql_query_str(query_file.raw_query, kwargs)
        return mq.SqlModelQuery(compiled_query_str, is_duckdb=True)

    def _get_python_model_args(self, ctx: dict[str, Any], ctx_args: ContextArgs) -> ModelArgs:
        """Assemble the ``ModelArgs`` object passed to a Python model's main function."""
        dependencies = self.model_config.depends_on
        connections = self.conn_set.get_connections_as_dict()

        def run_external_sql(connection_name: str, sql_query: str) -> pl.DataFrame:
            return self._run_sql_query_on_connection(connection_name, sql_query, ctx_args._placeholders_copy)

        conn_args = ConnectionsArgs(ctx_args.project_path, ctx_args.proj_vars, ctx_args.env_vars)
        build_model_args = BuildModelArgs(conn_args, connections, dependencies, self._ref_for_python, run_external_sql)
        return ModelArgs(ctx_args, build_model_args, ctx)

    def _compile_python_model(
        self, query_file: mq.PyQueryFile, ctx: dict[str, Any], ctx_args: ContextArgs
    ) -> mq.PyModelQuery:
        """Wrap the Python model's function in a zero-argument callable that is run later."""
        sqrl_args = self._get_python_model_args(ctx, ctx_args)

        def compiled_query() -> pl.LazyFrame | pd.DataFrame:
            try:
                return query_file.raw_query(sqrl_args)
            except Exception as e:
                raise FileExecutionError(f'Failed to run "{c.MAIN_FUNC}" function for python model "{self.name}"', e) from e

        return mq.PyModelQuery(compiled_query)

    def compile(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
    ) -> None:
        """Compile this model once and, when ``recurse`` is true, link and compile its dependencies.

        Raises ``NotImplementedError`` for a query file that is neither SQL nor Python.
        """
        if self.compiled_query is not None:
            return
        else:
            # Marks the model as being compiled so a re-entrant call returns early
            self.compiled_query = mq.WorkInProgress() # type: ignore

        start = time.time()

        if isinstance(self.query_file, mq.SqlQueryFile):
            self.compiled_query = self._compile_sql_model(self.query_file, ctx, ctx_args, models_dict)
        elif isinstance(self.query_file, mq.PyQueryFile):
            self.compiled_query = self._compile_python_model(self.query_file, ctx, ctx_args)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        self.logger.log_activity_time(f"compiling federate model '{self.name}'", start)

        if not recurse:
            return

        dependencies = self.model_config.depends_on
        self.wait_count = len(dependencies)

        for name in dependencies:
            dep_model = models_dict[name]
            self._add_upstream(dep_model)
            dep_model.compile(ctx, ctx_args, models_dict, recurse)

    async def _run_sql_model(self, compiled_query: mq.SqlModelQuery, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        """Execute the model's create statement on a cursor of ``conn``.

        When this model needs a Python dataframe or is the target, the result
        is also loaded into ``self.result``. The cursor is always closed.

        Raises ``InvalidInputError`` for a DuckDB catalog error, or for any
        failure of the ``__fake_target`` model; other failures raise
        ``FileExecutionError``.
        """
        local_conn = conn.cursor()
        try:
            self.register_all_upstream_python_df(local_conn)
            query = compiled_query.query

            def create_table(local_conn: duckdb.DuckDBPyConnection):
                # DuckDB doesn't support specifying named parameters that are not used in the query, so filtering them out
                def placeholder_exists(key: str) -> bool:
                    # The key is escaped so regex metacharacters in it are matched literally
                    return re.search(r"\$" + re.escape(key) + r"(?!\w)", query) is not None

                existing_placeholders = {key: value for key, value in placeholders.items() if placeholder_exists(key)}

                create_query = self.model_config.get_sql_for_create(self.name, query)
                self._log_sql_to_run(create_query, existing_placeholders)
                try:
                    return local_conn.execute(create_query, existing_placeholders)
                except duckdb.CatalogException as e:
                    if self.name == "__fake_target":
                        raise InvalidInputError(409, "invalid_sql_query", "Provided SQL query depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.") from e
                    else:
                        raise InvalidInputError(409, 'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.') from e
                except Exception as e:
                    if self.name == "__fake_target":
                        raise InvalidInputError(400, "invalid_sql_query", "Failed to run provided SQL query") from e
                    else:
                        raise FileExecutionError(f'Failed to run federate sql model "{self.name}"', e) from e

            await asyncio.to_thread(create_table, local_conn)
            if self.needs_python_df or self.is_target:
                self.result = await asyncio.to_thread(self._load_duckdb_view_to_python_df, local_conn)
        finally:
            local_conn.close()

    async def _run_python_model(self, compiled_query: mq.PyModelQuery) -> None:
        """Run the Python model in a worker thread and store its result as a lazy frame."""
        query_result = await asyncio.to_thread(compiled_query.query)
        if isinstance(query_result, pd.DataFrame):
            query_result = pl.from_pandas(query_result)

        self.result = query_result.lazy()

    async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        """Run the compiled SQL or Python query, log the elapsed time, then delegate to the parent.

        Raises ``NotImplementedError`` when ``compiled_query`` is neither a SQL nor a Python query.
        """
        start = time.time()

        if isinstance(self.compiled_query, mq.SqlModelQuery):
            await self._run_sql_model(self.compiled_query, conn, placeholders)
        elif isinstance(self.compiled_query, mq.PyModelQuery):
            await self._run_python_model(self.compiled_query)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        self.logger.log_activity_time(f"running federate model '{self.name}'", start)

        await super().run_model(conn, placeholders)
390
741
 
391
- def retrieve_dependent_query_models(self, dependent_model_names: set[str]) -> None:
392
- if self.name not in dependent_model_names:
393
- dependent_model_names.add(self.name)
394
- for dep_model in self.upstreams.values():
395
- dep_model.retrieve_dependent_query_models(dependent_model_names)
742
+
743
@dataclass
class BuildModel(StaticModel, QueryModel):
    """A SQL or Python model that is compiled and built ahead of time via ``build_model``."""
    model_config: mc.BuildModelConfig
    query_file: mq.SqlQueryFile | mq.PyQueryFile
    # Set by compile_for_build()
    compiled_query: mq.SqlModelQuery | mq.PyModelQuery | None = field(default=None, init=False)

    @property
    def model_type(self) -> ModelType:
        return ModelType.BUILD

    def _add_upstream_for_build(self, other: StaticModel) -> None:
        """Link ``other`` as a build-time upstream of this model (and this model as its downstream)."""
        self.upstreams_for_build[other.name] = other
        other.downstreams_for_build[self.name] = self

        # A Python build model reads its upstreams through dataframes
        if isinstance(self.query_file, mq.PyQueryFile):
            other.needs_python_df_for_build = True

    def _get_compile_sql_model_args(
        self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
    ) -> dict[str, Any]:
        """Return the Jinja variables for a build SQL model, including its ``ref`` macro.

        ``ref`` maps a source without ``load_to_vdl`` to
        ``db_<connection>.<table>`` and any other model to its own name.
        """
        kwargs: dict[str, Any] = {
            "proj_vars": conn_args.proj_vars, "env_vars": conn_args.env_vars
        }

        def ref_for_build(dependent_model_name: str) -> str:
            dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
            dep = models_dict[dependent_model]
            if isinstance(dep, SourceModel) and not dep.model_config.load_to_vdl:
                conn_name = dep.model_config.get_connection()
                table_name = dep.model_config.get_table()
                return f"db_{conn_name}.{table_name}"
            return dependent_model

        kwargs["ref"] = ref_for_build
        return kwargs

    def _compile_sql_model(
        self, query_file: mq.SqlQueryFile, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
    ) -> mq.SqlModelQuery:
        """Render the SQL query file into a query flagged as DuckDB SQL."""
        kwargs = self._get_compile_sql_model_args(conn_args, models_dict)
        compiled_query_str = self._get_compiled_sql_query_str(query_file.raw_query, kwargs)
        return mq.SqlModelQuery(compiled_query_str, is_duckdb=True)

    def _ref_for_python(self, dependent_model_name: str) -> pl.LazyFrame:
        """Return the result of a build-time upstream model.

        Raises ``u.ConfigurationError`` when the model is not in ``self.upstreams_for_build``.
        """
        if dependent_model_name not in self.upstreams_for_build:
            raise u.ConfigurationError(f'Model "{self.name}" must include model "{dependent_model_name}" as a dependency to use')
        df = self.upstreams_for_build[dependent_model_name].result
        assert df is not None
        return df

    def _get_compile_python_model_args(self, conn_args: ConnectionsArgs) -> BuildModelArgs:
        """Assemble the ``BuildModelArgs`` object passed to a Python build model's main function."""

        def run_external_sql(connection_name: str, sql_query: str):
            return self._run_sql_query_on_connection(connection_name, sql_query)

        return BuildModelArgs(
            conn_args, self.conn_set.get_connections_as_dict(), self.model_config.depends_on, self._ref_for_python, run_external_sql
        )

    def _compile_python_model(
        self, query_file: mq.PyQueryFile, conn_args: ConnectionsArgs
    ) -> mq.PyModelQuery:
        """Wrap the Python model's function in a zero-argument callable that is run at build time."""
        sqrl_args = self._get_compile_python_model_args(conn_args)

        def compiled_query() -> pl.LazyFrame | pd.DataFrame:
            try:
                return query_file.raw_query(sqrl_args)
            except Exception as e:
                raise FileExecutionError(f'Failed to run "{c.MAIN_FUNC}" function for build model "{self.name}"', e) from e

        return mq.PyModelQuery(compiled_query)

    def compile_for_build(self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]) -> None:
        """Compile the query, then register every ``depends_on`` model as a build-time upstream.

        Raises ``NotImplementedError`` for a query file that is neither SQL nor Python.
        """
        start = time.time()

        if isinstance(self.query_file, mq.SqlQueryFile):
            self.compiled_query = self._compile_sql_model(self.query_file, conn_args, models_dict)
        elif isinstance(self.query_file, mq.PyQueryFile):
            self.compiled_query = self._compile_python_model(self.query_file, conn_args)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        self.logger.log_activity_time(f"compiling build model '{self.name}'", start)

        dependencies = self.model_config.depends_on
        self.wait_count_for_build = len(dependencies)

        for name in dependencies:
            dep_model = models_dict[name]
            self._add_upstream_for_build(dep_model)

    async def _build_sql_model(self, compiled_query: mq.SqlModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
        """Run the model's build statement on a cursor of ``conn``; the cursor is always closed."""
        query = compiled_query.query

        def create_table():
            create_query = self.model_config.get_sql_for_build(self.name, query)
            local_conn = conn.cursor()
            try:
                return u.run_duckdb_stmt(self.logger, local_conn, create_query, model_name=self.name)
            except Exception as e:
                raise FileExecutionError(f'Failed to build static sql model "{self.name}"', e) from e
            finally:
                local_conn.close()

        # Called directly rather than through asyncio.to_thread
        create_table()

    async def _build_python_model(self, compiled_query: mq.PyModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
        """Run the Python model in a worker thread and create its table from the result."""
        query_result = await asyncio.to_thread(compiled_query.query)
        if isinstance(query_result, pd.DataFrame):
            query_result = pl.from_pandas(query_result).lazy()
        if self.needs_python_df_for_build:
            self.result = query_result.lazy()
        # Called directly rather than through asyncio.to_thread
        self._create_table_from_df(conn, query_result)

    async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        """Build this model from its compiled query, then delegate to the parent ``build_model``.

        Raises ``NotImplementedError`` when ``compiled_query`` is neither a SQL nor a Python query.
        """
        start = time.time()
        print(f"[{u.get_current_time()}] 🔨 BUILDING: build model '{self.name}'")

        if isinstance(self.compiled_query, mq.SqlModelQuery):
            await self._build_sql_model(self.compiled_query, conn)
        elif isinstance(self.compiled_query, mq.PyModelQuery):
            # First ensure all upstream models have an associated Python dataframe
            def load_df(conn: duckdb.DuckDBPyConnection, dep_model: DataModel):
                if dep_model.result is None:
                    local_conn = conn.cursor()
                    try:
                        dep_model.result = dep_model._load_duckdb_view_to_python_df(local_conn)
                    finally:
                        local_conn.close()

            coroutines = []
            for dep_model in self.upstreams_for_build.values():
                coro = asyncio.to_thread(load_df, conn, dep_model)
                coroutines.append(coro)
            await u.asyncio_gather(coroutines)

            # Then run the model's Python function to build the model
            await self._build_python_model(self.compiled_query, conn)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        print(f"[{u.get_current_time()}] ✅ FINISHED: build model '{self.name}'")
        self.logger.log_activity_time(f"building static build model '{self.name}'", start)

        await super().build_model(conn, full_refresh)
396
895
 
397
896
 
398
897
  @dataclass
399
898
  class DAG:
400
- manifest_cfg: ManifestConfig
401
- dataset: DatasetConfig
402
- target_model: Referable
403
- models_dict: dict[str, Referable]
899
+ dataset: DatasetConfig | None
900
+ target_model: DataModel
901
+ models_dict: dict[str, DataModel]
902
+ datalake_db_path: str | None = field(default=None)
404
903
  logger: u.Logger = field(default_factory=lambda: u.Logger(""))
405
904
  parameter_set: ParameterSet | None = field(default=None, init=False) # set in apply_selections
406
905
  placeholders: dict[str, Any] = field(init=False, default_factory=dict)
407
906
 
907
+ def _get_msg_extension(self) -> str:
908
+ return f" for dataset '{self.dataset.name}'" if self.dataset else ""
909
+
910
def compile_build_models(self, conn_args: ConnectionsArgs) -> None:
    """Compile every build model, giving each one the dict of all static models."""
    static_models: dict[str, StaticModel] = {}
    for model_name, candidate in self.models_dict.items():
        if isinstance(candidate, StaticModel):
            static_models[model_name] = candidate

    build_models = (model for model in static_models.values() if isinstance(model, BuildModel))
    for build_model in build_models:
        build_model.compile_for_build(conn_args, static_models)
915
+
408
916
  def apply_selections(
409
- self, param_cfg_set: ParameterConfigsSet, user: User | None, selections: dict[str, str], *, updates_only: bool = False, request_version: int | None = None
917
+ self, param_cfg_set: ParameterConfigsSet, user: AbstractUser, selections: dict[str, str]
410
918
  ) -> None:
411
919
  start = time.time()
412
- dataset_params = self.dataset.parameters
413
- parameter_set = param_cfg_set.apply_selections(
414
- dataset_params, selections, user, updates_only=updates_only, request_version=request_version
415
- )
920
+ dataset_params = self.dataset.parameters if self.dataset else None
921
+ parameter_set = param_cfg_set.apply_selections(dataset_params, selections, user)
416
922
  self.parameter_set = parameter_set
417
- self.logger.log_activity_time(f"applying selections for dataset '{self.dataset.name}'", start)
923
+ msg_extension = self._get_msg_extension()
924
+ self.logger.log_activity_time("applying selections" + msg_extension, start)
418
925
 
419
- def _compile_context(self, param_args: ParametersArgs, context_func: ContextFunc, user: User | None) -> tuple[dict[str, Any], ContextArgs]:
926
def _compile_context(
    self, param_args: ParametersArgs, context_func: ContextFunc, user: AbstractUser, configurables: dict[str, str]
) -> tuple[dict[str, Any], ContextArgs]:
    """Run the project's context function and return the populated context with its args.

    Requires ``self.parameter_set`` to already be a ``ParameterSet``. Any
    exception raised by ``context_func`` is wrapped in a ``FileExecutionError``.
    """
    start = time.time()
    context = {}

    assert isinstance(self.parameter_set, ParameterSet)
    parameters = self.parameter_set.get_parameters_as_dict()
    args = ContextArgs(param_args, user, parameters, configurables)

    suffix = self._get_msg_extension()
    try:
        context_func(context, args)
    except Exception as e:
        raise FileExecutionError(f'Failed to run {c.CONTEXT_FILE}' + suffix, e) from e

    self.logger.log_activity_time("running context.py" + suffix, start)
    return context, args
431
941
 
432
- async def _compile_models(self, context: dict[str, Any], ctx_args: ContextArgs, recurse: bool) -> None:
433
- await self.target_model.compile(context, ctx_args, self.placeholders, self.models_dict, recurse)
942
+ def _compile_models(self, context: dict[str, Any], ctx_args: ContextArgs, recurse: bool) -> None:
943
+ self.target_model.compile(context, ctx_args, self.models_dict, recurse)
434
944
 
435
945
  def _get_terminal_nodes(self) -> set[str]:
436
946
  start = time.time()
@@ -440,100 +950,149 @@ class DAG:
440
950
  self.logger.log_activity_time(f"validating no cycles in model dependencies", start)
441
951
  return terminal_nodes
442
952
 
443
- async def _run_models(self, terminal_nodes: set[str], placeholders: dict = {}) -> None:
444
- use_duckdb = self.manifest_cfg.settings_obj.do_use_duckdb()
445
- conn_url = "duckdb:///" if use_duckdb else "sqlite:///?check_same_thread=False"
446
- engine = create_engine(conn_url)
447
-
448
- with engine.connect() as conn:
953
def _attach_connections_with_type_duckdb(self, conn: duckdb.DuckDBPyConnection) -> None:
    """
    Attach external databases to the given DuckDB connection in read-only mode.

    Only entries of the target model's connection set that are ConnectionProperties instances
    with a non-None attach_uri_for_duckdb are attached. Each one is exposed under the alias
    "db_<connection name>".
    """
    connections = self.target_model.conn_set.get_connections_as_dict()
    for conn_name, connection in connections.items():
        is_attachable = isinstance(connection, ConnectionProperties)
        attach_uri = connection.attach_uri_for_duckdb if is_attachable else None
        if attach_uri is not None:
            # NOTE(review): the URI is interpolated into a quoted SQL literal without escaping -
            # confirm that URIs can never contain a single quote.
            attach_stmt = f"ATTACH IF NOT EXISTS '{attach_uri}' AS db_{conn_name} (READ_ONLY)"
            # The URI is passed as a redacted value (presumably to keep it out of logs - verify in run_duckdb_stmt)
            u.run_duckdb_stmt(self.logger, conn, attach_stmt, redacted_values=[attach_uri])
962
+
963
async def _run_models(self) -> None:
    """
    Run every terminal model of the DAG against one shared DuckDB connection.

    The connection is created from self.datalake_db_path, external connections are attached to it,
    and one run_model coroutine per terminal node is gathered. The connection is always closed
    afterwards, whether or not the run succeeded.
    """
    terminal_nodes = self._get_terminal_nodes()

    conn = u.create_duckdb_connection(datalake_db_path=self.datalake_db_path)
    try:
        self._attach_connections_with_type_duckdb(conn)

        # "__fake_target" is not looked up in models_dict; it stands for the target model itself
        pending_runs = [
            (self.target_model if name == "__fake_target" else self.models_dict[name]).run_model(conn, self.placeholders)
            for name in terminal_nodes
        ]
        await u.asyncio_gather(pending_runs)
    finally:
        conn.close()
456
978
 
457
979
async def execute(
    self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user: AbstractUser, selections: dict[str, str],
    *, runquery: bool = True, recurse: bool = True, configurables: dict[str, str] | None = None
) -> None:
    """
    Apply parameter selections, run the context function, compile the models, and optionally run them.

    Arguments:
        param_args: Arguments forwarded to the context compilation step.
        param_cfg_set: Parameter configurations used when applying the selections.
        context_func: The project's context function.
        user: The user the selections are applied for.
        selections: Raw parameter selections.
        runquery: When True, the compiled models are also executed. Implies recurse.
        recurse: Forwarded to model compilation; forced to True when runquery is True.
        configurables: Forwarded to the context compilation step. Defaults to a fresh empty dict per call.
    """
    # A fresh dict per call: a shared `{}` default would leak mutations from one call into the next
    if configurables is None:
        configurables = {}

    recurse = recurse or runquery

    self.apply_selections(param_cfg_set, user, selections)

    context, ctx_args = self._compile_context(param_args, context_func, user, configurables)

    self._compile_models(context, ctx_args, recurse)

    self.placeholders = ctx_args._placeholders_copy
    if runquery:
        await self._run_models()

    self.target_model.process_pass_through_columns(self.models_dict)
476
996
 
477
997
def get_all_query_models(self) -> set[str]:
    """
    Collect the names of the query models the target model depends on.

    An empty set is handed to the target model's retrieve_dependent_query_models, which fills it in place.
    """
    collected: set[str] = set()
    self.target_model.retrieve_dependent_query_models(collected)
    return collected
481
1001
 
482
- def to_networkx_graph(self) -> nx.DiGraph:
483
- G = nx.DiGraph()
484
-
1002
def get_all_data_models(self) -> list[rm.DataModelItem]:
    """
    Describe every model in this DAG as a DataModelItem.

    Each item carries the model's name, the value of its model type, its config, and its is_queryable flag,
    in the iteration order of models_dict.
    """
    return [
        rm.DataModelItem(
            name=model_name,
            model_type=model.model_type.value,
            config=model.model_config,
            is_queryable=model.is_queryable,
        )
        for model_name, model in self.models_dict.items()
    ]
1009
+
1010
def get_all_model_lineage(self) -> list[rm.LineageRelation]:
    """
    Build the dependency edges between models.

    For every QueryModel in models_dict, one LineageRelation is produced per entry of its
    model_config.depends_on, pointing from the dependency to the model. Edges of a BuildModel
    are typed "buildtime"; all other edges are typed "runtime". Models that are not QueryModel
    instances contribute no edges.
    """
    relations: list[rm.LineageRelation] = []
    for model_name, model in self.models_dict.items():
        if isinstance(model, QueryModel):
            # The edge type depends only on the model, not on the individual dependency
            edge_type = "buildtime" if isinstance(model, BuildModel) else "runtime"
            relations.extend(
                rm.LineageRelation(
                    type=edge_type,
                    source=rm.LineageNode(name=dep_model_name, type="model"),
                    target=rm.LineageNode(name=model_name, type="model"),
                )
                for dep_model_name in model.model_config.depends_on
            )
    return relations
1021
+
1022
+
1023
+ class ModelsIO:
1024
+
1025
@classmethod
def _load_model_config(cls, filepath: Path, model_type: ModelType, env_vars: dict[str, str]) -> mc.ModelConfig:
    """
    Load the config for a model file from the ".yml" file that sits next to it.

    When no such file exists, the config is built from an empty dictionary. The config class is
    chosen by model type; dbview configs additionally have finalize_connection(env_vars) applied.
    """
    yaml_path = filepath.with_suffix('.yml')
    if yaml_path.exists():
        config_dict = u.load_yaml_config(yaml_path)
    else:
        config_dict = {}

    if model_type == ModelType.DBVIEW:
        return mc.DbviewModelConfig(**config_dict).finalize_connection(env_vars)
    if model_type == ModelType.FEDERATE:
        return mc.FederateModelConfig(**config_dict)
    if model_type == ModelType.BUILD:
        return mc.BuildModelConfig(**config_dict)
    return mc.ModelConfig(**config_dict)
1039
+
1040
@classmethod
def _populate_from_file(
    cls, raw_queries_by_model: dict[str, mq.QueryFileWithConfig], dp: str, file: str, model_type: ModelType, env_vars: dict[str, str]
) -> None:
    """
    Register one model file (and its config) in raw_queries_by_model, keyed by the file stem.

    Arguments:
        raw_queries_by_model: Mapping that is updated in place.
        dp: Directory containing the file.
        file: File name within dp. Only ".py" and ".sql" files are treated as query files; others are ignored.
        model_type: Model type used to pick the config class.
        env_vars: Environment variables forwarded to the config loader.

    Raises:
        ConfigurationError: If a model with the same file stem was already registered.
    """
    filepath = Path(dp, file)
    file_stem, extension = os.path.splitext(file)

    if extension == '.py':
        module = pm.PyModule(filepath)
        raw_query = module.get_func_or_class(c.MAIN_FUNC)
        query_file = mq.PyQueryFile(filepath.as_posix(), raw_query)
    elif extension == '.sql':
        query_file = mq.SqlQueryFile(filepath.as_posix(), filepath.read_text())
    else:
        return  # Skip files that are not query files

    if file_stem in raw_queries_by_model:
        # Bind outside the assert: assert statements are stripped under "python -O", so a walrus
        # inside one would leave the name undefined and turn this error path into a NameError.
        prior_query_file = raw_queries_by_model[file_stem].query_file
        assert isinstance(prior_query_file, mq.QueryFile)
        conflicts = [prior_query_file.filepath, query_file.filepath]
        raise u.ConfigurationError(f"Multiple models found for '{file_stem}': {conflicts}")

    model_config = cls._load_model_config(filepath, model_type, env_vars)
    raw_queries_by_model[file_stem] = mq.QueryFileWithConfig(query_file, model_config)
497
1063
 
1064
@classmethod
def _populate_raw_queries_for_type(
    cls, folder_path: Path, model_type: ModelType, *, env_vars: dict[str, str] | None = None
) -> dict[str, mq.QueryFileWithConfig]:
    """
    Walk folder_path recursively and load every query file found as a model of the given type.

    Arguments:
        folder_path: Root folder to walk.
        model_type: Model type assigned to every file found.
        env_vars: Environment variables forwarded to the per-file loader. Defaults to a fresh empty dict per call.

    Returns:
        Mapping of file stem to the loaded query file with its config.
    """
    # A fresh dict per call: a shared `{}` default would be the same object for every invocation
    if env_vars is None:
        env_vars = {}

    raw_queries_by_model: dict[str, mq.QueryFileWithConfig] = {}
    for dp, _, filenames in os.walk(folder_path):
        for file in filenames:
            cls._populate_from_file(raw_queries_by_model, dp, file, model_type, env_vars)
    return raw_queries_by_model
498
1073
 
499
- class ModelsIO:
1074
@classmethod
def load_build_files(cls, logger: u.Logger, base_path: str) -> dict[str, mq.QueryFileWithConfig]:
    """
    Load all build model files found under the project's builds folder.

    Returns a mapping of file stem to the loaded query file with its config, and logs how long loading took.
    """
    started_at = time.time()
    folder = u.Path(base_path, c.MODELS_FOLDER, c.BUILDS_FOLDER)
    build_queries = cls._populate_raw_queries_for_type(folder, ModelType.BUILD)
    logger.log_activity_time("loading build files", started_at)
    return build_queries
500
1081
 
501
1082
@classmethod
def load_dbview_files(cls, logger: u.Logger, base_path: str, env_vars: dict[str, str]) -> dict[str, mq.QueryFileWithConfig]:
    """
    Load all dbview model files found under the project's dbviews folder.

    env_vars is forwarded to the loader, which uses it when finalizing each dbview config.
    Returns a mapping of file stem to the loaded query file with its config, and logs how long loading took.
    """
    started_at = time.time()
    folder = u.Path(base_path, c.MODELS_FOLDER, c.DBVIEWS_FOLDER)
    dbview_queries = cls._populate_raw_queries_for_type(folder, ModelType.DBVIEW, env_vars=env_vars)
    logger.log_activity_time("loading dbview files", started_at)
    return dbview_queries
532
1089
 
1090
@classmethod
def load_federate_files(cls, logger: u.Logger, base_path: str) -> dict[str, mq.QueryFileWithConfig]:
    """
    Load all federate model files found under the project's federates folder.

    Returns a mapping of file stem to the loaded query file with its config, and logs how long loading took.
    """
    started_at = time.time()
    folder = u.Path(base_path, c.MODELS_FOLDER, c.FEDERATES_FOLDER)
    federate_queries = cls._populate_raw_queries_for_type(folder, ModelType.FEDERATE)
    logger.log_activity_time("loading federate files", started_at)
    return federate_queries
538
1097
 
539
1098
  @classmethod