squirrels 0.1.0__py3-none-any.whl → 0.6.0.post0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. dateutils/__init__.py +6 -0
  2. dateutils/_enums.py +25 -0
  3. squirrels/dateutils.py → dateutils/_implementation.py +409 -380
  4. dateutils/types.py +6 -0
  5. squirrels/__init__.py +21 -18
  6. squirrels/_api_routes/__init__.py +5 -0
  7. squirrels/_api_routes/auth.py +337 -0
  8. squirrels/_api_routes/base.py +196 -0
  9. squirrels/_api_routes/dashboards.py +156 -0
  10. squirrels/_api_routes/data_management.py +148 -0
  11. squirrels/_api_routes/datasets.py +220 -0
  12. squirrels/_api_routes/project.py +289 -0
  13. squirrels/_api_server.py +552 -134
  14. squirrels/_arguments/__init__.py +0 -0
  15. squirrels/_arguments/init_time_args.py +83 -0
  16. squirrels/_arguments/run_time_args.py +111 -0
  17. squirrels/_auth.py +777 -0
  18. squirrels/_command_line.py +239 -107
  19. squirrels/_compile_prompts.py +147 -0
  20. squirrels/_connection_set.py +94 -0
  21. squirrels/_constants.py +141 -64
  22. squirrels/_dashboards.py +179 -0
  23. squirrels/_data_sources.py +570 -0
  24. squirrels/_dataset_types.py +91 -0
  25. squirrels/_env_vars.py +209 -0
  26. squirrels/_exceptions.py +29 -0
  27. squirrels/_http_error_responses.py +52 -0
  28. squirrels/_initializer.py +319 -110
  29. squirrels/_logging.py +121 -0
  30. squirrels/_manifest.py +357 -187
  31. squirrels/_mcp_server.py +578 -0
  32. squirrels/_model_builder.py +69 -0
  33. squirrels/_model_configs.py +74 -0
  34. squirrels/_model_queries.py +52 -0
  35. squirrels/_models.py +1201 -0
  36. squirrels/_package_data/base_project/.env +7 -0
  37. squirrels/_package_data/base_project/.env.example +44 -0
  38. squirrels/_package_data/base_project/connections.yml +16 -0
  39. squirrels/_package_data/base_project/dashboards/dashboard_example.py +40 -0
  40. squirrels/_package_data/base_project/dashboards/dashboard_example.yml +22 -0
  41. squirrels/_package_data/base_project/docker/.dockerignore +16 -0
  42. squirrels/_package_data/base_project/docker/Dockerfile +16 -0
  43. squirrels/_package_data/base_project/docker/compose.yml +7 -0
  44. squirrels/_package_data/base_project/duckdb_init.sql +10 -0
  45. squirrels/_package_data/base_project/gitignore +13 -0
  46. squirrels/_package_data/base_project/macros/macros_example.sql +17 -0
  47. squirrels/_package_data/base_project/models/builds/build_example.py +26 -0
  48. squirrels/_package_data/base_project/models/builds/build_example.sql +16 -0
  49. squirrels/_package_data/base_project/models/builds/build_example.yml +57 -0
  50. squirrels/_package_data/base_project/models/dbviews/dbview_example.sql +17 -0
  51. squirrels/_package_data/base_project/models/dbviews/dbview_example.yml +32 -0
  52. squirrels/_package_data/base_project/models/federates/federate_example.py +51 -0
  53. squirrels/_package_data/base_project/models/federates/federate_example.sql +21 -0
  54. squirrels/_package_data/base_project/models/federates/federate_example.yml +65 -0
  55. squirrels/_package_data/base_project/models/sources.yml +38 -0
  56. squirrels/_package_data/base_project/parameters.yml +142 -0
  57. squirrels/_package_data/base_project/pyconfigs/connections.py +19 -0
  58. squirrels/_package_data/base_project/pyconfigs/context.py +96 -0
  59. squirrels/_package_data/base_project/pyconfigs/parameters.py +141 -0
  60. squirrels/_package_data/base_project/pyconfigs/user.py +56 -0
  61. squirrels/_package_data/base_project/resources/expenses.db +0 -0
  62. squirrels/_package_data/base_project/resources/public/.gitkeep +0 -0
  63. squirrels/_package_data/base_project/resources/weather.db +0 -0
  64. squirrels/_package_data/base_project/seeds/seed_categories.csv +6 -0
  65. squirrels/_package_data/base_project/seeds/seed_categories.yml +15 -0
  66. squirrels/_package_data/base_project/seeds/seed_subcategories.csv +15 -0
  67. squirrels/_package_data/base_project/seeds/seed_subcategories.yml +21 -0
  68. squirrels/_package_data/base_project/squirrels.yml.j2 +61 -0
  69. squirrels/_package_data/base_project/tmp/.gitignore +2 -0
  70. squirrels/_package_data/templates/login_successful.html +53 -0
  71. squirrels/_package_data/templates/squirrels_studio.html +22 -0
  72. squirrels/_package_loader.py +29 -0
  73. squirrels/_parameter_configs.py +592 -0
  74. squirrels/_parameter_options.py +348 -0
  75. squirrels/_parameter_sets.py +207 -0
  76. squirrels/_parameters.py +1703 -0
  77. squirrels/_project.py +796 -0
  78. squirrels/_py_module.py +122 -0
  79. squirrels/_request_context.py +33 -0
  80. squirrels/_schemas/__init__.py +0 -0
  81. squirrels/_schemas/auth_models.py +83 -0
  82. squirrels/_schemas/query_param_models.py +70 -0
  83. squirrels/_schemas/request_models.py +26 -0
  84. squirrels/_schemas/response_models.py +286 -0
  85. squirrels/_seeds.py +97 -0
  86. squirrels/_sources.py +112 -0
  87. squirrels/_utils.py +540 -149
  88. squirrels/_version.py +1 -3
  89. squirrels/arguments.py +7 -0
  90. squirrels/auth.py +4 -0
  91. squirrels/connections.py +3 -0
  92. squirrels/dashboards.py +3 -0
  93. squirrels/data_sources.py +14 -282
  94. squirrels/parameter_options.py +13 -189
  95. squirrels/parameters.py +14 -801
  96. squirrels/types.py +18 -0
  97. squirrels-0.6.0.post0.dist-info/METADATA +148 -0
  98. squirrels-0.6.0.post0.dist-info/RECORD +101 -0
  99. {squirrels-0.1.0.dist-info → squirrels-0.6.0.post0.dist-info}/WHEEL +1 -2
  100. {squirrels-0.1.0.dist-info → squirrels-0.6.0.post0.dist-info}/entry_points.txt +1 -0
  101. squirrels-0.6.0.post0.dist-info/licenses/LICENSE +201 -0
  102. squirrels/_credentials_manager.py +0 -87
  103. squirrels/_module_loader.py +0 -37
  104. squirrels/_parameter_set.py +0 -151
  105. squirrels/_renderer.py +0 -286
  106. squirrels/_timed_imports.py +0 -37
  107. squirrels/connection_set.py +0 -126
  108. squirrels/package_data/base_project/.gitignore +0 -4
  109. squirrels/package_data/base_project/connections.py +0 -21
  110. squirrels/package_data/base_project/database/sample_database.db +0 -0
  111. squirrels/package_data/base_project/database/seattle_weather.db +0 -0
  112. squirrels/package_data/base_project/datasets/sample_dataset/context.py +0 -8
  113. squirrels/package_data/base_project/datasets/sample_dataset/database_view1.py +0 -23
  114. squirrels/package_data/base_project/datasets/sample_dataset/database_view1.sql.j2 +0 -7
  115. squirrels/package_data/base_project/datasets/sample_dataset/final_view.py +0 -10
  116. squirrels/package_data/base_project/datasets/sample_dataset/final_view.sql.j2 +0 -2
  117. squirrels/package_data/base_project/datasets/sample_dataset/parameters.py +0 -30
  118. squirrels/package_data/base_project/datasets/sample_dataset/selections.cfg +0 -6
  119. squirrels/package_data/base_project/squirrels.yaml +0 -26
  120. squirrels/package_data/static/favicon.ico +0 -0
  121. squirrels/package_data/static/script.js +0 -234
  122. squirrels/package_data/static/style.css +0 -110
  123. squirrels/package_data/templates/index.html +0 -32
  124. squirrels-0.1.0.dist-info/LICENSE +0 -22
  125. squirrels-0.1.0.dist-info/METADATA +0 -67
  126. squirrels-0.1.0.dist-info/RECORD +0 -40
  127. squirrels-0.1.0.dist-info/top_level.txt +0 -1
squirrels/_models.py ADDED
@@ -0,0 +1,1201 @@
1
+ from __future__ import annotations
2
+ from typing import Callable, Any
3
+ from dataclasses import dataclass, field, KW_ONLY
4
+ from abc import ABCMeta, abstractmethod
5
+ from enum import Enum
6
+ from pathlib import Path
7
+ import asyncio, os, re, time, duckdb, sqlglot
8
+ import polars as pl, pandas as pd
9
+
10
+ from . import _constants as c, _utils as u, _py_module as pm, _model_queries as mq, _model_configs as mc, _sources as src
11
+ from ._schemas import response_models as rm
12
+ from ._exceptions import FileExecutionError, InvalidInputError
13
+ from ._arguments.run_time_args import ContextArgs, ModelArgs, BuildModelArgs
14
+ from ._auth import AbstractUser
15
+ from ._connection_set import ConnectionsArgs, ConnectionSet, ConnectionProperties
16
+ from ._manifest import DatasetConfig, ConnectionTypeEnum
17
+ from ._parameter_sets import ParameterConfigsSet, ParametersArgs, ParameterSet
18
+ from ._env_vars import SquirrelsEnvVars
19
+
# Callback signature for context functions: mutates the given context dict
# in place using the run-time ContextArgs; returns nothing.
ContextFunc = Callable[[dict[str, Any], ContextArgs], None]
21
+
22
+
23
class ModelType(Enum):
    """The kinds of data models in a squirrels project.

    The string `.value` is used in log messages and activity metadata.
    """
    SEED = "seed"
    SOURCE = "source"
    BUILD = "build"
    DBVIEW = "dbview"
    FEDERATE = "federate"
29
+
30
+
31
@dataclass
class DataModel(metaclass=ABCMeta):
    """Abstract base class for all data models (seeds, sources, builds, dbviews, federates).

    Each model is a node in a dependency DAG: `upstreams` holds the models this one
    reads from and `downstreams` the models that read from it. Models run
    concurrently; `wait_count` is a countdown latch of upstreams still pending —
    a model runs only after every upstream has triggered it once.
    """
    name: str  # unique model name; also used as the duckdb table/view name
    model_config: mc.ModelConfig
    is_target: bool = field(default=False, init=False)  # True for the model whose result is ultimately requested

    # In-memory result, populated by run_model; kept lazy until collected
    result: pl.LazyFrame | None = field(default=None, init=False, repr=False)
    # Set when a downstream python query model needs this model's result as a dataframe
    needs_python_df: bool = field(default=False, init=False)

    wait_count: int = field(default=0, init=False, repr=False)  # remaining upstream triggers before this model runs
    confirmed_no_cycles: bool = field(default=False, init=False)  # memo flag set by get_terminal_nodes
    upstreams: dict[str, DataModel] = field(default_factory=dict, init=False, repr=False)
    downstreams: dict[str, DataModel] = field(default_factory=dict, init=False, repr=False)

    _: KW_ONLY  # all fields below are keyword-only
    logger: u.Logger = field(default_factory=lambda: u.Logger(""))
    conn_set: ConnectionSet = field(default_factory=ConnectionSet)

    @property
    @abstractmethod
    def model_type(self) -> ModelType:
        """The kind of this model (seed, source, build, dbview, or federate)."""
        pass

    @property
    def is_queryable(self) -> bool:
        """Whether this model can be queried directly; subclasses may restrict this."""
        return True

    def compile(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
    ) -> None:
        """Compile this model's query. No-op here; overridden by query-based models."""
        pass

    def get_terminal_nodes(self, depencency_path: set[str]) -> set[str]:
        """Return names of models with no upstreams that are reachable from this node.

        Doubles as cycle detection: raises ConfigurationError when this model's name
        already appears on the current DFS path (`depencency_path` [sic]). Results
        are memoized via `confirmed_no_cycles` so each node is explored once.
        """
        if self.confirmed_no_cycles:
            return set()

        if self.name in depencency_path:
            raise u.ConfigurationError(f'Cycle found in model dependency graph')

        terminal_nodes = set()
        if len(self.upstreams) == 0:
            # No upstreams: this node is itself terminal
            terminal_nodes.add(self.name)
        else:
            new_path = set(depencency_path)
            new_path.add(self.name)
            for dep_model in self.upstreams.values():
                terminal_nodes.update(dep_model.get_terminal_nodes(new_path))

        self.confirmed_no_cycles = True
        return terminal_nodes

    def _load_duckdb_view_to_python_df(self, conn: duckdb.DuckDBPyConnection, *, use_datalake: bool = False) -> pl.LazyFrame:
        """Read table/view `self.name` (under the `vdl.` catalog when use_datalake=True)
        into a lazy polars frame; raises ConfigurationError if it does not exist."""
        table_name = ("vdl." if use_datalake else "") + self.name
        try:
            return conn.sql(f"FROM {table_name}").pl().lazy()
        except duckdb.CatalogException as e:
            raise u.ConfigurationError(f'Failed to load duckdb table or view "{self.name}" to python dataframe') from e

    def _run_sql_query_on_connection(self, connection_name: str, query: str, placeholders: dict = {}) -> pl.DataFrame:
        """Run `query` on the named connection and return the result as a polars DataFrame."""
        self.logger.debug(f"Running SQL query on connection '{connection_name}':\n{query}")
        return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)

    async def _trigger(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        # Countdown latch: run this model only once every upstream has triggered it
        self.wait_count -= 1
        if (self.wait_count == 0):
            await self.run_model(conn, placeholders)

    async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        """Trigger all downstream models concurrently.

        Subclasses compute/load their own `result` first, then call super().
        """
        coroutines = []
        for model in self.downstreams.values():
            coroutines.append(model._trigger(conn, placeholders))
        await u.asyncio_gather(coroutines)

    def retrieve_dependent_query_models(self, dependent_model_names: set[str]) -> None:
        """Accumulate names of dependent query models. No-op here; overridden by QueryModel."""
        pass

    def _register_all_upstream_python_df_helper(self, conn: duckdb.DuckDBPyConnection, tables_set: set[str]) -> None:
        # Register this model's in-memory result (if any) unless a table with the
        # same name already exists, then recurse through all upstreams.
        if self.result is not None and self.name not in tables_set:
            conn.register(self.name, self.result)
        for dep_model in self.upstreams.values():
            dep_model._register_all_upstream_python_df_helper(conn, tables_set)

    def register_all_upstream_python_df(self, conn: duckdb.DuckDBPyConnection) -> None:
        """Register this model's and all upstream in-memory results as duckdb views,
        skipping any name already present in the connection's table list."""
        show_tables_query = f"SHOW TABLES"
        tables_df = conn.sql(show_tables_query).pl()
        tables_set = set(tables_df["name"])
        self._register_all_upstream_python_df_helper(conn, tables_set)

    def get_max_path_length_to_target(self) -> int | None:
        """Longest path (in edges) from this model to the target model, or None when
        the target is unreachable. Memoized on the `max_path_len_to_target` attribute."""
        if not hasattr(self, "max_path_len_to_target"):
            path_lengths = []
            for child_model in self.downstreams.values():
                # Children are on a path to the target, so their result must be an int
                assert isinstance(child_model_path_length := child_model.get_max_path_length_to_target(), int)
                path_lengths.append(child_model_path_length+1)
            if len(path_lengths) > 0:
                self.max_path_len_to_target = max(path_lengths)
            else:
                # Leaf of the downstream graph: 0 if this IS the target, else unreachable
                self.max_path_len_to_target = 0 if self.is_target else None
        return self.max_path_len_to_target

    async def _trigger_build(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        """Build-time trigger. No-op here; overridden by StaticModel."""
        pass

    def _create_table_from_df(self, conn: duckdb.DuckDBPyConnection, query_result: pl.LazyFrame | pd.DataFrame):
        """Create or replace table `self.name` from the given dataframe.

        duckdb resolves the Python local `query_result` by name via its
        replacement-scan mechanism, so the variable name in the SQL matters.
        """
        local_conn = conn.cursor()
        # local_conn = conn
        try:
            assert query_result is not None
            local_conn.execute(f"CREATE OR REPLACE TABLE {self.name} AS FROM query_result")
        finally:
            local_conn.close()
        # pass

    def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
        """Propagate pass-through column metadata. No-op here; overridden by QueryModel."""
        pass
146
+
147
+
148
@dataclass
class StaticModel(DataModel):
    """Base class for models materialized at build time (seeds and source models).

    Static models participate in two separate DAG traversals: the query-time
    graph inherited from DataModel (upstreams/downstreams) and a build-time
    graph (the *_for_build fields) used when materializing tables into the
    Virtual Data Lake (VDL).
    """
    # Build-time counterparts of DataModel's query-time DAG bookkeeping
    needs_python_df_for_build: bool = field(default=False, init=False)
    wait_count_for_build: int = field(default=0, init=False, repr=False)
    upstreams_for_build: dict[str, StaticModel] = field(default_factory=dict, init=False, repr=False)
    downstreams_for_build: dict[str, StaticModel] = field(default_factory=dict, init=False, repr=False)

    def get_terminal_nodes_for_build(self, depencency_path: set[str]) -> set[str]:
        """Return names of build-graph models with no build-time upstreams.

        Mirrors DataModel.get_terminal_nodes but walks the *_for_build graph;
        raises ConfigurationError when a cycle is detected on the current DFS
        path (`depencency_path` [sic]).
        """
        if self.confirmed_no_cycles:
            return set()

        if self.name in depencency_path:
            raise u.ConfigurationError('Cycle found in model dependency graph')

        terminal_nodes = set()
        if len(self.upstreams_for_build) == 0:
            terminal_nodes.add(self.name)
        else:
            new_path = set(depencency_path)
            new_path.add(self.name)
            for dep_model in self.upstreams_for_build.values():
                terminal_nodes.update(dep_model.get_terminal_nodes_for_build(new_path))

        self.confirmed_no_cycles = True
        return terminal_nodes

    def _get_result(self, conn: duckdb.DuckDBPyConnection) -> pl.LazyFrame:
        """Load this model's table/view from the VDL into a polars LazyFrame.

        Raises InvalidInputError (409) when the VDL does not contain the table,
        e.g. because the VDL has not been built yet.
        """
        local_conn = conn.cursor()
        try:
            return self._load_duckdb_view_to_python_df(local_conn, use_datalake=True)
        except Exception as e:
            # Fix: chain the original exception ("from e") so the root cause is
            # preserved; previously it was silently discarded.
            raise InvalidInputError(
                409, 'dependent_data_model_not_found',
                f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.'
            ) from e
        finally:
            local_conn.close()

    async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        """Load this static model into memory if it is needed, then trigger downstreams."""
        if (self.needs_python_df or self.is_target) and self.result is None:
            start = time.time()

            # Loading from duckdb is blocking work; run it off the event loop
            self.result = await asyncio.to_thread(self._get_result, conn)

            self.logger.log_activity_time(
                f"loading {self.model_type.value} model '{self.name}' into memory", start,
                additional_data={
                    "activity": "loading static data model into memory",
                    "model_name": self.name,
                    "model_type": self.model_type.value
                }
            )

        await super().run_model(conn, placeholders)

    def compile_for_build(
        self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
    ) -> None:
        """Hook for subclasses needing compilation before a build; no-op by default."""
        pass

    async def _trigger_build(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        # Countdown latch: build only after every build-time upstream has finished
        self.wait_count_for_build -= 1
        if (self.wait_count_for_build == 0):
            await self.build_model(conn, full_refresh)

    async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        """Optionally cache this model as a python dataframe, then trigger downstream builds.

        NOTE(review): this checks `needs_python_df` (the query-time flag) rather
        than `needs_python_df_for_build` — confirm that is intended.
        """
        if self.needs_python_df and self.result is None:
            local_conn = conn.cursor()
            try:
                self.result = await asyncio.to_thread(self._load_duckdb_view_to_python_df, local_conn)
            finally:
                local_conn.close()

        coroutines = []
        for model in self.downstreams_for_build.values():
            coroutines.append(model._trigger_build(conn, full_refresh))
        await u.asyncio_gather(coroutines)
222
+
223
+
224
@dataclass
class Seed(StaticModel):
    """A static model whose data comes from a pre-loaded seed dataframe."""
    model_config: mc.SeedConfig
    result: pl.LazyFrame

    @property
    def model_type(self) -> ModelType:
        return ModelType.SEED

    async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        """Materialize the seed dataframe as a table in the VDL, then trigger downstream builds."""
        started_at = time.time()
        print(f"[{u.get_current_time()}] 🔨 BUILDING: seed model '{self.name}'")

        # Executed synchronously on the event loop rather than via asyncio.to_thread
        self._create_table_from_df(conn, self.result)

        print(f"[{u.get_current_time()}] ✅ FINISHED: seed model '{self.name}'")
        activity_details = {
            "activity": "building data model into VDL",
            "model_name": self.name,
            "model_type": self.model_type.value
        }
        self.logger.log_activity_time(
            f"building seed model '{self.name}' into VDL", started_at,
            additional_data=activity_details
        )

        await super().build_model(conn, full_refresh)
251
+
252
+
253
@dataclass
class SourceModel(StaticModel):
    """A static model representing a table behind an external connection,
    optionally loaded into the VDL (as table `self.name`) at build time."""
    model_config: src.Source

    @property
    def model_type(self) -> ModelType:
        return ModelType.SOURCE

    @property
    def connection_props(self) -> ConnectionProperties:
        """Properties of this source's connection.

        Raises ConfigurationError when the named connection is not a
        ConnectionProperties object.
        """
        conn_name = self.model_config.get_connection()
        conn_props = self.conn_set.get_connection(conn_name)
        if isinstance(conn_props, ConnectionProperties):
            return conn_props
        raise u.ConfigurationError(f'Unable to use connection "{conn_name}" for source "{self.name}". Connection "{conn_name}" must be a ConnectionProperties object')

    @property
    def is_queryable(self) -> bool:
        """Queryable only when loaded into the VDL or backed by a duckdb connection."""
        connection_props = self.connection_props
        return self.model_config.load_to_vdl or connection_props.type == ConnectionTypeEnum.DUCKDB

    def _build_source_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        """Copy or incrementally upsert the source table into the VDL.

        Runs inside its own cursor and explicit transaction. Behavior:
        - Skips silently when the connection's database is not attached in duckdb.
        - With no declared columns: straight full copy (CREATE OR REPLACE ... AS FROM).
        - Otherwise: incremental load driven by update hints — optionally deletes
          rows at/above `selective_overwrite_value` or the current max of the
          increasing column, then MERGEs new rows keyed by the primary key
          (or inserts everything when there is no primary key).
        """
        local_conn = conn.cursor()
        # local_conn = conn

        local_conn.begin()
        try:
            source = self.model_config
            conn_name = source.get_connection()

            connection_props = self.connection_props
            dialect = connection_props.dialect
            attach_uri = connection_props.attach_uri_for_duckdb
            if attach_uri is None:
                raise u.ConfigurationError(f'Loading to duckdb is not supported for source "{self.name}" since its connection "{conn_name}" uses an unsupported dialect')

            # The external database must be attached in duckdb as "db_<conn_name>"
            result = u.run_duckdb_stmt(self.logger, local_conn, f"FROM (SHOW DATABASES) WHERE database_name = 'db_{conn_name}'").fetchone()
            if result is None:
                return # skip this source if connection is not attached

            table_name = source.get_table()
            new_table_name = self.name

            # No declared columns: do a simple full copy and finish
            if len(source.columns) == 0:
                stmt = f"CREATE OR REPLACE TABLE {new_table_name} AS FROM db_{conn_name}.{table_name}"
                u.run_duckdb_stmt(self.logger, local_conn, stmt)
                local_conn.commit()
                return

            # Incremental load is only possible with an increasing column hint
            increasing_column = source.update_hints.increasing_column
            recreate_table = full_refresh or increasing_column is None
            if recreate_table:
                u.run_duckdb_stmt(self.logger, local_conn, f"DROP TABLE IF EXISTS {new_table_name}")

            create_table_cols_clause = source.get_cols_for_create_table_stmt()
            stmt = f"CREATE TABLE IF NOT EXISTS {new_table_name} ({create_table_cols_clause})"
            u.run_duckdb_stmt(self.logger, local_conn, stmt)

            if not recreate_table:
                if source.update_hints.selective_overwrite_value is not None:
                    # Re-fetch everything at/above the configured overwrite threshold
                    stmt = f"DELETE FROM {new_table_name} WHERE {increasing_column} >= $value"
                    u.run_duckdb_stmt(self.logger, local_conn, stmt, params={"value": source.update_hints.selective_overwrite_value})
                elif not source.update_hints.strictly_increasing:
                    # Non-strict increase: the max value's rows may be incomplete, so re-fetch them
                    stmt = f"DELETE FROM {new_table_name} WHERE {increasing_column} = ({source.get_max_incr_col_query(new_table_name)})"
                    u.run_duckdb_stmt(self.logger, local_conn, stmt)

            max_val_of_incr_col = None
            if increasing_column is not None:
                max_val_of_incr_col_tuple = u.run_duckdb_stmt(self.logger, local_conn, source.get_max_incr_col_query(new_table_name)).fetchone()
                max_val_of_incr_col = max_val_of_incr_col_tuple[0] if isinstance(max_val_of_incr_col_tuple, tuple) else None
                if max_val_of_incr_col is None:
                    # Empty table: fall back to fetching everything
                    recreate_table = True

            query = source.get_query_for_upsert(dialect, conn_name, table_name, max_val_of_incr_col, full_refresh=recreate_table)

            # With no primary key, "ON false" makes the MERGE insert-only
            primary_keys = ", ".join(source.primary_key) if source.primary_key else ""
            match_condition = f"USING ({primary_keys})" if primary_keys else "ON false"
            stmt = (
                f"MERGE INTO {new_table_name} "
                f"USING ({query}) AS src "
                f"{match_condition} "
                f"WHEN MATCHED THEN UPDATE "
                f"WHEN NOT MATCHED THEN INSERT BY NAME"
            )
            u.run_duckdb_stmt(self.logger, local_conn, stmt)

            local_conn.commit()

        finally:
            local_conn.close()
        # pass

    async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        """Build this source into the VDL when load_to_vdl is set, then trigger downstream builds."""
        if self.model_config.load_to_vdl:
            start = time.time()
            print(f"[{u.get_current_time()}] 🔨 BUILDING: source model '{self.name}'")

            # await asyncio.to_thread(self._build_source_model, conn, full_refresh)
            self._build_source_model(conn, full_refresh) # without threading

            print(f"[{u.get_current_time()}] ✅ FINISHED: source model '{self.name}'")
            self.logger.log_activity_time(
                f"building source model '{self.name}' into VDL", start,
                additional_data={
                    "activity": "building data model into VDL",
                    "model_name": self.name,
                    "model_type": self.model_type.value
                }
            )

        await super().build_model(conn, full_refresh)
364
+
365
+
366
@dataclass
class QueryModel(DataModel):
    """Abstract base for models defined by a query file (dbview and federate models)."""
    model_config: mc.QueryModelConfig
    query_file: mq.QueryFile
    compiled_query: mq.Query | None = field(default=None, init=False)
    _: KW_ONLY
    # Jinja environment used to render the model's SQL template
    j2_env: u.j2.Environment = field(default_factory=lambda: u.j2.Environment(loader=u.j2.FileSystemLoader(".")))

    def _add_upstream(self, other: DataModel) -> None:
        """Link `other` as an upstream of this model (and this as its downstream)."""
        self.upstreams[other.name] = other
        other.downstreams[self.name] = self

        # Python query files consume dependencies as dataframes, not SQL relations
        if isinstance(self.query_file, mq.PyQueryFile):
            other.needs_python_df = True

    def _ref_for_sql(self, dependent_model_name: str, models_dict: dict[str, DataModel]) -> str:
        """Implementation of the ref() macro for SQL query files.

        Records the dependency on `self.model_config.depends_on` and returns the
        referenced model's name to splice into the SQL. Source models that are
        not loaded to the VDL may only be referenced when their connection is
        a duckdb connection.
        """
        if dependent_model_name not in models_dict:
            raise u.ConfigurationError(f'Model "{self.name}" references unknown model "{dependent_model_name}"')

        dep_model = models_dict[dependent_model_name]
        if isinstance(dep_model, SourceModel) and not dep_model.model_config.load_to_vdl:
            # Allow when caller is Build or Federate AND the source connection is duckdb; else error
            conn_name = dep_model.model_config.get_connection()
            conn_props = self.conn_set.get_connection(conn_name)
            is_duckdb_conn = isinstance(conn_props, ConnectionProperties) and conn_props.type == ConnectionTypeEnum.DUCKDB
            if not is_duckdb_conn:
                raise u.ConfigurationError(
                    f'Model "{self.name}" cannot reference source model "{dependent_model_name}". '
                    'To be referenced by a build or federate model, the source must have load_to_vdl=True or a duckdb connection type.'
                )

        self.model_config.depends_on.add(dependent_model_name)
        return dependent_model_name

    def _ref_for_python(self, dependent_model_name: str) -> pl.LazyFrame:
        """Implementation of the ref() function for python query files: returns the
        upstream model's in-memory dataframe; the model must be a declared dependency."""
        if dependent_model_name not in self.upstreams:
            raise u.ConfigurationError(f'Model "{self.name}" must include model "{dependent_model_name}" as a dependency to use')
        df = self.upstreams[dependent_model_name].result
        assert df is not None
        return df

    def _get_compile_sql_model_args_from_ctx_args(
        self, ctx: dict[str, Any], ctx_args: ContextArgs
    ) -> dict[str, Any]:
        """Build the base Jinja kwargs (project/env vars, user, params, ctx, and
        placeholder helpers) shared by all SQL model templates."""
        # Closure over the ContextArgs' private placeholder registry
        is_placeholder = lambda placeholder: placeholder in ctx_args._placeholders
        kwargs = {
            "proj_vars": ctx_args.proj_vars, "env_vars": ctx_args.env_vars, "user": ctx_args.user, "prms": ctx_args.prms,
            "configurables": ctx_args.configurables, "ctx": ctx, "is_placeholder": is_placeholder, "set_placeholder": ctx_args.set_placeholder,
            "param_exists": ctx_args.param_exists
        }
        return kwargs

    def _get_compiled_sql_query_str(self, raw_query: str, kwargs: dict[str, Any]) -> str:
        """Render `raw_query` as a Jinja template with `kwargs`; wraps any rendering
        failure in FileExecutionError."""
        try:
            template = self.j2_env.from_string(raw_query)
            query = template.render(kwargs)
        except Exception as e:
            raise FileExecutionError(f'Failed to compile sql model "{self.name}"', e) from e
        return query

    def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
        """Copy column metadata onto pass-through columns from their upstream columns.

        Two passes: first validate each pass-through column (exactly one depends_on
        entry of the form "model.column") and record the model dependency, then —
        after recursively processing all upstream models so their own metadata is
        resolved — copy type/condition/description/category from the upstream
        column wherever this column leaves the field unset.

        Memoized via the `processed_pass_through_columns` attribute.
        """
        if getattr(self, "processed_pass_through_columns", False):
            return

        for col in self.model_config.columns:
            if col.pass_through:
                # Validate pass-through column has exactly one dependency
                if len(col.depends_on) != 1:
                    raise u.ConfigurationError(
                        f'Column "{self.name}.{col.name}" has pass_through=true, which must have exactly one depends_on value'
                    )

                # Get the upstream column reference
                upstream_col_ref = next(iter(col.depends_on))
                table_name, col_name = upstream_col_ref.split('.')
                self.model_config.depends_on.add(table_name)

                # Get the upstream model
                if table_name not in models_dict:
                    raise u.ConfigurationError(
                        f'Column "{self.name}.{col.name}" depends on unknown model "{table_name}"'
                    )

        # Do not rely on self.upstreams here, as it may not be fully populated for metadata passthrough purposes
        # NOTE(review): this iterates ALL entries of depends_on (not just those added
        # above); an entry not present in models_dict would raise KeyError — confirm
        # depends_on is validated upstream of this call.
        for dep_model_name in self.model_config.depends_on:
            dep_model = models_dict[dep_model_name]
            dep_model.process_pass_through_columns(models_dict)

        for col in self.model_config.columns:
            if col.pass_through:
                upstream_col_ref = next(iter(col.depends_on))
                table_name, col_name = upstream_col_ref.split('.')
                upstream_model = models_dict[table_name]

                # Find the upstream column config
                upstream_col = next(
                    (c for c in upstream_model.model_config.columns if c.name == col_name),
                    None
                )
                if upstream_col is None:
                    raise u.ConfigurationError(
                        f'Column "{self.name}.{col.name}" depends on unknown column "{upstream_col_ref}"'
                    )

                # Copy metadata from upstream column (only where unset locally)
                col.type = upstream_col.type if col.type == "" else col.type
                col.condition = upstream_col.condition if col.condition == [] else col.condition
                col.description = upstream_col.description if col.description == "" else col.description
                col.category = upstream_col.category if col.category == mc.ColumnCategory.MISC else col.category

        self.processed_pass_through_columns = True

    def retrieve_dependent_query_models(self, dependent_model_names: set[str]) -> None:
        """Accumulate this model's name and those of all its upstreams into the given set."""
        if self.name not in dependent_model_names:
            dependent_model_names.add(self.name)
            for dep_model in self.upstreams.values():
                dep_model.retrieve_dependent_query_models(dependent_model_names)

    def _log_sql_to_run(self, sql: str, placeholders: dict[str, Any]) -> None:
        """Debug-log the final SQL for this model together with its placeholder values."""
        log_msg = f"SQL to run for model '{self.name}':\n{sql}"
        log_msg += f"\n\n(with placeholders: {placeholders})"
        self.logger.debug(log_msg)
488
+
489
+
490
+ @dataclass
491
+ class DbviewModel(QueryModel):
492
+ model_config: mc.DbviewModelConfig
493
+ query_file: mq.SqlQueryFile
494
+ compiled_query: mq.SqlModelQuery | None = field(default=None, init=False)
495
+ sources: dict[str, src.Source] = field(default_factory=dict, init=False)
496
+
497
+ @property
498
+ def model_type(self) -> ModelType:
499
+ return ModelType.DBVIEW
500
+
501
+ def _get_compile_sql_model_args(
502
+ self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
503
+ ) -> dict[str, Any]:
504
+ kwargs = self._get_compile_sql_model_args_from_ctx_args(ctx, ctx_args)
505
+
506
+ def source(source_name: str) -> str:
507
+ if source_name not in models_dict or not isinstance(source_model := models_dict[source_name], SourceModel):
508
+ raise u.ConfigurationError(f'Dbview "{self.name}" references unknown source "{source_name}"')
509
+ if source_model.model_config.get_connection() != self.model_config.get_connection():
510
+ raise u.ConfigurationError(f'Dbview "{self.name}" references source "{source_name}" with different connection')
511
+
512
+ # Check if the source model has load_to_vdl=False but this dbview has translate_to_duckdb=True
513
+ if not source_model.model_config.load_to_vdl and self.model_config.translate_to_duckdb:
514
+ raise u.ConfigurationError(
515
+ f'Dbview "{self.name}" with translate_to_duckdb=True cannot reference source "{source_name}" '
516
+ f'which has load_to_vdl=False'
517
+ )
518
+
519
+ self.model_config.depends_on.add(source_name)
520
+ self.sources[source_name] = source_model.model_config
521
+ return "{{ source(\"" + source_name + "\") }}"
522
+
523
+ kwargs["source"] = source
524
+ kwargs["ref"] = source
525
+ return kwargs
526
+
527
+ def _get_duckdb_query(self, read_dialect: str, query: str) -> str:
528
+ kwargs = {
529
+ "source": lambda source_name: "vdl." + source_name
530
+ }
531
+ compiled_query = self._get_compiled_sql_query_str(query, kwargs)
532
+ duckdb_query = sqlglot.transpile(compiled_query, read=read_dialect, write="duckdb", pretty=True)[0]
533
+ return "-- translated to duckdb\n" + duckdb_query
534
+
535
+ def _compile_sql_model(self, kwargs: dict[str, Any]) -> mq.SqlModelQuery:
536
+ compiled_query_str = self._get_compiled_sql_query_str(self.query_file.raw_query, kwargs)
537
+
538
+ connection_name = self.model_config.get_connection()
539
+ connection_props = self.conn_set.get_connection(connection_name)
540
+
541
+ if self.model_config.translate_to_duckdb and isinstance(connection_props, ConnectionProperties):
542
+ # Forbid translate_to_duckdb when dbview connection is duckdb
543
+ if connection_props.type == ConnectionTypeEnum.DUCKDB:
544
+ raise u.ConfigurationError(
545
+ f'Dbview "{self.name}" has translate_to_duckdb=True but its connection is duckdb. Use a federate model instead.'
546
+ )
547
+ macros = {
548
+ "source": lambda source_name: "vdl." + source_name
549
+ }
550
+ compiled_query2 = self._get_compiled_sql_query_str(compiled_query_str, macros)
551
+ compiled_query_str = self._get_duckdb_query(connection_props.dialect, compiled_query2)
552
+ is_duckdb = True
553
+ else:
554
+ macros = {
555
+ "source": lambda source_name: self.sources[source_name].get_table()
556
+ }
557
+ compiled_query_str = self._get_compiled_sql_query_str(compiled_query_str, macros)
558
+ is_duckdb = False
559
+
560
+ compiled_query = mq.SqlModelQuery(compiled_query_str, is_duckdb)
561
+ return compiled_query
562
+
563
+ def compile(
564
+ self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
565
+ ) -> None:
566
+ if self.compiled_query is not None:
567
+ return
568
+ else:
569
+ self.compiled_query = mq.WorkInProgress() # type: ignore
570
+
571
+ start = time.time()
572
+
573
+ kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
574
+ self.compiled_query = self._compile_sql_model(kwargs)
575
+
576
+ self.logger.log_activity_time(
577
+ f"compiling dbview model '{self.name}'", start,
578
+ additional_data={
579
+ "activity": "compiling data model",
580
+ "model_name": self.name,
581
+ "model_type": self.model_type.value
582
+ }
583
+ )
584
+
585
+ async def _run_sql_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
586
+ assert self.compiled_query is not None
587
+ is_duckdb = self.compiled_query.is_duckdb
588
+ query = self.compiled_query.query
589
+ connection_name = self.model_config.get_connection()
590
+
591
+ def run_sql_query_on_connection(is_duckdb: bool, query: str, placeholders: dict) -> pl.DataFrame:
592
+ try:
593
+ if is_duckdb:
594
+ local_conn = conn.cursor()
595
+ try:
596
+ self.logger.info(f"Running dbview '{self.name}' on duckdb")
597
+ return local_conn.sql(query, params=placeholders).pl()
598
+ except duckdb.CatalogException as e:
599
+ raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
600
+ except Exception as e:
601
+ raise RuntimeError(e)
602
+ finally:
603
+ local_conn.close()
604
+ else:
605
+ self.logger.info(f"Running dbview '{self.name}' on connection: {connection_name}")
606
+ return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)
607
+ except RuntimeError as e:
608
+ raise FileExecutionError(f'Failed to run dbview sql model "{self.name}"', e)
609
+
610
+ self._log_sql_to_run(query, placeholders)
611
+ result = await asyncio.to_thread(run_sql_query_on_connection, is_duckdb, query, placeholders)
612
+ self.result = result.lazy()
613
+
614
+ async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
615
+ start = time.time()
616
+
617
+ await self._run_sql_model(conn, placeholders)
618
+
619
+ self.logger.log_activity_time(
620
+ f"running dbview model '{self.name}'", start,
621
+ additional_data={
622
+ "activity": "running data model",
623
+ "model_name": self.name,
624
+ "model_type": self.model_type.value
625
+ }
626
+ )
627
+
628
+ await super().run_model(conn, placeholders)
629
+
630
+
631
@dataclass
class FederateModel(QueryModel):
    """A query model that federates over other models, executed on the embedded DuckDB engine.

    Defined by either a SQL template file or a Python file:
    - SQL models are rendered (with a ``ref`` macro to resolve dependencies) and
      materialized in DuckDB via a CREATE statement.
    - Python models run a user function that returns a polars LazyFrame or pandas DataFrame.
    """
    model_config: mc.FederateModelConfig
    query_file: mq.SqlQueryFile | mq.PyQueryFile
    # None until compile(); briefly holds mq.WorkInProgress as a re-entry guard
    compiled_query: mq.SqlModelQuery | mq.PyModelQuery | None = field(default=None, init=False)

    @property
    def model_type(self) -> ModelType:
        return ModelType.FEDERATE

    def _get_compile_sql_model_args(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
    ) -> dict[str, Any]:
        """Build the template arguments for SQL compilation, including the ``ref`` macro.

        ``ref`` maps a dependency name to the identifier usable in DuckDB SQL:
        build models and VDL-loaded source models become "vdl.<name>"; non-VDL
        source models become "db_<conn>.<table>" on the attached database; any
        other model resolves to its own name (a DuckDB view/table created at run time).
        """
        kwargs = self._get_compile_sql_model_args_from_ctx_args(ctx, ctx_args)

        def ref(dependent_model_name: str) -> str:
            dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
            dep = models_dict[dependent_model]
            if isinstance(dep, BuildModel):
                return "vdl." + dependent_model
            if isinstance(dep, SourceModel):
                if dep.model_config.load_to_vdl:
                    return "vdl." + dependent_model
                conn_name = dep.model_config.get_connection()
                table_name = dep.model_config.get_table()
                return f"db_{conn_name}.{table_name}"
            return dependent_model

        kwargs["ref"] = ref
        return kwargs

    def _compile_sql_model(
        self, query_file: mq.SqlQueryFile, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel]
    ) -> mq.SqlModelQuery:
        """Render the SQL template; federate SQL always targets DuckDB (is_duckdb=True)."""
        kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
        compiled_query_str = self._get_compiled_sql_query_str(query_file.raw_query, kwargs)
        compiled_query = mq.SqlModelQuery(compiled_query_str, is_duckdb=True)
        return compiled_query

    def _get_python_model_args(self, ctx: dict[str, Any], ctx_args: ContextArgs) -> ModelArgs:
        """Assemble the ModelArgs passed to a Python model's main function.

        Flattens the context args and build-model args into one ModelArgs object,
        exposing connections, dependencies, a ``ref`` function, and an
        external-SQL runner bound to the current placeholders.
        """
        dependencies = self.model_config.depends_on
        connections = self.conn_set.get_connections_as_dict()

        def _run_external_sql(connection_name: str, sql_query: str) -> pl.DataFrame:
            return self._run_sql_query_on_connection(connection_name, sql_query, ctx_args._placeholders)

        build_model_args = BuildModelArgs(
            **ctx_args._conn_args.__dict__,
            connections=connections, dependencies=dependencies,
            _ref_func=self._ref_for_python, _run_external_sql_func=_run_external_sql
        )

        # Instantiate ModelArgs with flattened arguments
        combined_args = {
            **ctx_args.__dict__, **build_model_args.__dict__, "ctx": ctx,
        }
        model_args = ModelArgs(**combined_args)
        return model_args

    def _compile_python_model(
        self, query_file: mq.PyQueryFile, ctx: dict[str, Any], ctx_args: ContextArgs
    ) -> mq.PyModelQuery:
        """Wrap the Python model's main function in a zero-arg callable with error wrapping."""
        sqrl_args = self._get_python_model_args(ctx, ctx_args)

        def compiled_query() -> pl.LazyFrame | pd.DataFrame:
            try:
                return query_file.raw_query(sqrl_args)
            except Exception as e:
                raise FileExecutionError(f'Failed to run "{c.MAIN_FUNC}" function for python model "{self.name}"', e) from e

        return mq.PyModelQuery(compiled_query)

    def compile(
        self, ctx: dict[str, Any], ctx_args: ContextArgs, models_dict: dict[str, DataModel], recurse: bool
    ) -> None:
        """Compile this federate model once, then (optionally) recurse into its dependencies.

        A WorkInProgress sentinel guards against re-entry when multiple downstream
        models share this one as a dependency.
        """
        if self.compiled_query is not None:
            return
        else:
            self.compiled_query = mq.WorkInProgress() # type: ignore

        start = time.time()

        if isinstance(self.query_file, mq.SqlQueryFile):
            self.compiled_query = self._compile_sql_model(self.query_file, ctx, ctx_args, models_dict)
        elif isinstance(self.query_file, mq.PyQueryFile):
            self.compiled_query = self._compile_python_model(self.query_file, ctx, ctx_args)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        self.logger.log_activity_time(
            f"compiling federate model '{self.name}'", start,
            additional_data={
                "activity": "compiling data model",
                "model_name": self.name,
                "model_type": self.model_type.value
            }
        )

        if not recurse:
            return

        # Wire up the dependency graph and compile upstream models
        dependencies = self.model_config.depends_on
        self.wait_count = len(dependencies)

        for name in dependencies:
            dep_model = models_dict[name]
            self._add_upstream(dep_model)
            dep_model.compile(ctx, ctx_args, models_dict, recurse)

    async def _run_sql_model(self, compiled_query: mq.SqlModelQuery, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        """Materialize the compiled SQL in DuckDB and, if needed, load the result as a dataframe.

        Upstream Python dataframes are registered on a dedicated cursor first so the
        CREATE statement can reference them. "__fake_target" is the ad-hoc query
        path and gets user-facing (not file-execution) errors.
        """
        local_conn = conn.cursor()
        try:
            self.register_all_upstream_python_df(local_conn)
            query = compiled_query.query

            def create_table(local_conn: duckdb.DuckDBPyConnection):
                # DuckDB doesn't support specifying named parameters that are not used in the query, so filtering them out
                placeholder_exists = lambda key: re.search(r"\$" + key + r"(?!\w)", query)
                existing_placeholders = {key: value for key, value in placeholders.items() if placeholder_exists(key)}

                create_query = self.model_config.get_sql_for_create(self.name, query)
                self._log_sql_to_run(create_query, existing_placeholders)
                try:
                    return local_conn.execute(create_query, existing_placeholders)
                except duckdb.CatalogException as e:
                    # Missing catalog object => a static model hasn't been built into the VDL
                    if self.name == "__fake_target":
                        raise InvalidInputError(409, "invalid_sql_query", f"Provided SQL query depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.")
                    else:
                        raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
                except Exception as e:
                    if self.name == "__fake_target":
                        raise InvalidInputError(400, "invalid_sql_query", f"Failed to run provided SQL query")
                    else:
                        raise FileExecutionError(f'Failed to run federate sql model "{self.name}"', e) from e

            await asyncio.to_thread(create_table, local_conn)
            if self.needs_python_df or self.is_target:
                self.result = await asyncio.to_thread(self._load_duckdb_view_to_python_df, local_conn)
        finally:
            local_conn.close()

    async def _run_python_model(self, compiled_query: mq.PyModelQuery) -> None:
        """Run the Python model in a worker thread and store the result as a LazyFrame."""
        query_result = await asyncio.to_thread(compiled_query.query)
        if isinstance(query_result, pd.DataFrame):
            query_result = pl.from_pandas(query_result)

        self.result = query_result.lazy()

    async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
        """Run the compiled SQL or Python model, log timing, then delegate to the base class."""
        start = time.time()

        if isinstance(self.compiled_query, mq.SqlModelQuery):
            await self._run_sql_model(self.compiled_query, conn, placeholders)
        elif isinstance(self.compiled_query, mq.PyModelQuery):
            await self._run_python_model(self.compiled_query)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        self.logger.log_activity_time(
            f"running federate model '{self.name}'", start,
            additional_data={
                "activity": "running data model",
                "model_name": self.name,
                "model_type": self.model_type.value
            }
        )

        await super().run_model(conn, placeholders)
@dataclass
class BuildModel(StaticModel, QueryModel):
    """A static model materialized into the Virtual Data Lake (VDL) at build time.

    Compiled with connection-level arguments only (no dataset context), then built
    into DuckDB either from rendered SQL or from a Python function's dataframe.
    """
    model_config: mc.BuildModelConfig
    query_file: mq.SqlQueryFile | mq.PyQueryFile
    # None until compile_for_build(); set to the compiled SQL or Python query
    compiled_query: mq.SqlModelQuery | mq.PyModelQuery | None = field(default=None, init=False)

    @property
    def model_type(self) -> ModelType:
        return ModelType.BUILD

    def _add_upstream_for_build(self, other: StaticModel) -> None:
        """Link *other* as a build-time upstream; Python models need upstream dataframes."""
        self.upstreams_for_build[other.name] = other
        other.downstreams_for_build[self.name] = self

        # A Python build model consumes its dependencies as dataframes, not SQL refs
        if isinstance(self.query_file, mq.PyQueryFile):
            other.needs_python_df_for_build = True

    def _get_compile_sql_model_args(
        self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
    ) -> dict[str, Any]:
        """Build the template arguments for build-time SQL compilation.

        ``ref`` resolves non-VDL source models to their attached-database table
        ("db_<conn>.<table>"); everything else resolves to its own name in the VDL.
        """
        kwargs: dict[str, Any] = {
            "proj_vars": conn_args.proj_vars, "env_vars": conn_args.env_vars
        }

        def ref_for_build(dependent_model_name: str) -> str:
            dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
            dep = models_dict[dependent_model]
            if isinstance(dep, SourceModel) and not dep.model_config.load_to_vdl:
                conn_name = dep.model_config.get_connection()
                table_name = dep.model_config.get_table()
                return f"db_{conn_name}.{table_name}"
            return dependent_model

        kwargs["ref"] = ref_for_build
        return kwargs

    def _compile_sql_model(
        self, query_file: mq.SqlQueryFile, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]
    ) -> mq.SqlModelQuery:
        """Render the SQL template; build SQL always targets DuckDB (is_duckdb=True)."""
        kwargs = self._get_compile_sql_model_args(conn_args, models_dict)
        compiled_query_str = self._get_compiled_sql_query_str(query_file.raw_query, kwargs)
        compiled_query = mq.SqlModelQuery(compiled_query_str, is_duckdb=True)
        return compiled_query

    def _ref_for_python(self, dependent_model_name: str) -> pl.LazyFrame:
        """Return the dataframe result of a declared build-time dependency.

        Raises:
            u.ConfigurationError: if the model was not declared in depends_on.
        """
        if dependent_model_name not in self.upstreams_for_build:
            raise u.ConfigurationError(f'Model "{self.name}" must include model "{dependent_model_name}" as a dependency to use')
        df = self.upstreams_for_build[dependent_model_name].result
        assert df is not None
        return df

    def _get_compile_python_model_args(self, conn_args: ConnectionsArgs) -> BuildModelArgs:
        """Assemble the BuildModelArgs passed to a Python build model's main function."""

        def _run_external_sql(connection_name: str, sql_query: str):
            return self._run_sql_query_on_connection(connection_name, sql_query)

        return BuildModelArgs(
            **conn_args.__dict__,
            connections=self.conn_set.get_connections_as_dict(), dependencies=self.model_config.depends_on,
            _ref_func=self._ref_for_python, _run_external_sql_func=_run_external_sql
        )

    def _compile_python_model(
        self, query_file: mq.PyQueryFile, conn_args: ConnectionsArgs
    ) -> mq.PyModelQuery:
        """Wrap the Python build model's main function in a zero-arg callable with error wrapping."""
        sqrl_args = self._get_compile_python_model_args(conn_args)

        def compiled_query() -> pl.LazyFrame | pd.DataFrame:
            try:
                return query_file.raw_query(sqrl_args)
            except Exception as e:
                raise FileExecutionError(f'Failed to run "{c.MAIN_FUNC}" function for build model "{self.name}"', e)

        return mq.PyModelQuery(compiled_query)

    def compile_for_build(self, conn_args: ConnectionsArgs, models_dict: dict[str, StaticModel]) -> None:
        """Compile this build model and register its build-time dependency edges."""
        start = time.time()

        if isinstance(self.query_file, mq.SqlQueryFile):
            self.compiled_query = self._compile_sql_model(self.query_file, conn_args, models_dict)
        elif isinstance(self.query_file, mq.PyQueryFile):
            self.compiled_query = self._compile_python_model(self.query_file, conn_args)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        self.logger.log_activity_time(
            f"compiling build model '{self.name}'", start,
            additional_data={
                "activity": "compiling data model",
                "model_name": self.name,
                "model_type": self.model_type.value
            }
        )

        dependencies = self.model_config.depends_on
        self.wait_count_for_build = len(dependencies)

        for name in dependencies:
            dep_model = models_dict[name]
            self._add_upstream_for_build(dep_model)

    async def _build_sql_model(self, compiled_query: mq.SqlModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
        """Create/refresh this model's table in the VDL from the compiled SQL."""
        query = compiled_query.query

        def create_table():
            create_query = self.model_config.get_sql_for_build(self.name, query)
            local_conn = conn.cursor()
            # local_conn = conn
            try:
                return u.run_duckdb_stmt(self.logger, local_conn, create_query, model_name=self.name)
            except Exception as e:
                raise FileExecutionError(f'Failed to build static sql model "{self.name}"', e) from e
            finally:
                local_conn.close()
                # pass

        # NOTE(review): threading deliberately disabled below — confirm before re-enabling
        # await asyncio.to_thread(create_table)
        create_table() # without threading

    async def _build_python_model(self, compiled_query: mq.PyModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
        """Run the Python build model and write its dataframe into the VDL."""
        query_result = await asyncio.to_thread(compiled_query.query)
        if isinstance(query_result, pd.DataFrame):
            query_result = pl.from_pandas(query_result).lazy()
        if self.needs_python_df_for_build:
            # Keep the dataframe in memory for downstream Python build models
            self.result = query_result.lazy()
        # NOTE(review): threading deliberately disabled below — confirm before re-enabling
        # await asyncio.to_thread(self._create_table_from_df, conn, query_result)
        self._create_table_from_df(conn, query_result) # without threading

    async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
        """Build this model into the VDL, loading upstream dataframes first for Python models."""
        start = time.time()
        print(f"[{u.get_current_time()}] 🔨 BUILDING: build model '{self.name}'")

        if isinstance(self.compiled_query, mq.SqlModelQuery):
            await self._build_sql_model(self.compiled_query, conn)
        elif isinstance(self.compiled_query, mq.PyModelQuery):
            # First ensure all upstream models have an associated Python dataframe
            def load_df(conn: duckdb.DuckDBPyConnection, dep_model: DataModel):
                if dep_model.result is None:
                    local_conn = conn.cursor()
                    # local_conn = conn
                    try:
                        dep_model.result = dep_model._load_duckdb_view_to_python_df(local_conn)
                    finally:
                        local_conn.close()
                        # pass

            coroutines = []
            for dep_model in self.upstreams_for_build.values():
                coro = asyncio.to_thread(load_df, conn, dep_model)
                coroutines.append(coro)
            await u.asyncio_gather(coroutines)

            # Then run the model's Python function to build the model
            await self._build_python_model(self.compiled_query, conn)
        else:
            raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

        print(f"[{u.get_current_time()}] ✅ FINISHED: build model '{self.name}'")
        self.logger.log_activity_time(
            f"building static build model '{self.name}' into VDL", start,
            additional_data={
                "activity": "building data model into VDL",
                "model_name": self.name,
                "model_type": self.model_type.value
            }
        )

        await super().build_model(conn, full_refresh)
@dataclass
class DAG:
    """Dependency graph of data models rooted at a single target model.

    Orchestrates parameter selection, context compilation, model compilation,
    and (optionally) running the models against DuckDB / external connections.
    """
    dataset: DatasetConfig | None          # None for ad-hoc (non-dataset) queries
    target_model: DataModel
    models_dict: dict[str, DataModel]
    datalake_db_path: str | None = field(default=None)
    logger: u.Logger = field(default_factory=lambda: u.Logger(""))
    parameter_set: ParameterSet | None = field(default=None, init=False) # set in apply_selections
    placeholders: dict[str, Any] = field(init=False, default_factory=dict)

    def _get_msg_extension(self) -> str:
        """Return a log-message suffix naming the dataset, or "" when there is none."""
        return f" for dataset '{self.dataset.name}'" if self.dataset else ""

    def compile_build_models(self, conn_args: ConnectionsArgs) -> None:
        """Compile every BuildModel in the graph for a VDL build."""
        static_models: dict[str, StaticModel] = {
            k: v for k, v in self.models_dict.items() if isinstance(v, StaticModel)
        }
        for model in static_models.values():
            if isinstance(model, BuildModel):
                model.compile_for_build(conn_args, static_models)

    def apply_selections(
        self, param_cfg_set: ParameterConfigsSet, user: AbstractUser, selections: dict[str, str]
    ) -> None:
        """Resolve the user's parameter selections into this DAG's parameter set."""
        start = time.time()

        dataset_params = self.dataset.parameters if self.dataset else None
        parameter_set = param_cfg_set.apply_selections(dataset_params, selections, user)
        self.parameter_set = parameter_set
        msg_extension = self._get_msg_extension()

        dataset_name = self.dataset.name if self.dataset else None
        self.logger.log_activity_time(
            "applying selections" + msg_extension, start,
            additional_data={"activity": "applying selections", "dataset_name": dataset_name}
        )

    def _compile_context(
        self, param_args: ParametersArgs, context_func: ContextFunc, user: AbstractUser, configurables: dict[str, str]
    ) -> tuple[dict[str, Any], ContextArgs]:
        """Run the project's context function and return (context dict, ContextArgs).

        Raises:
            FileExecutionError: if the user's context function raises.
        """
        start = time.time()

        context = {}
        assert isinstance(self.parameter_set, ParameterSet)
        prms = self.parameter_set.get_parameters_as_dict()
        args = ContextArgs(
            **param_args.__dict__, user=user, prms=prms, configurables=configurables, _conn_args=param_args
        )
        msg_extension = self._get_msg_extension()

        try:
            context_func(context, args)
        except Exception as e:
            raise FileExecutionError(f'Failed to run {c.CONTEXT_FILE}' + msg_extension, e) from e

        dataset_name = self.dataset.name if self.dataset else None
        self.logger.log_activity_time(
            "running context.py" + msg_extension, start,
            additional_data={"activity": "running context.py", "dataset_name": dataset_name}
        )
        return context, args

    def _compile_models(self, context: dict[str, Any], ctx_args: ContextArgs, recurse: bool) -> None:
        """Compile the target model (and, if recurse, its whole dependency subtree)."""
        self.target_model.compile(context, ctx_args, self.models_dict, recurse)

    def _get_terminal_nodes(self) -> set[str]:
        """Return the names of leaf models to start running from; also validates no cycles."""
        start = time.time()
        terminal_nodes = self.target_model.get_terminal_nodes(set())
        # Reset the cycle-check markers so a later traversal starts clean
        for model in self.models_dict.values():
            model.confirmed_no_cycles = False
        self.logger.log_activity_time("validating no cycles in model dependencies", start)
        return terminal_nodes

    def _attach_connections_with_type_duckdb(self, conn: duckdb.DuckDBPyConnection) -> None:
        """ATTACH every DuckDB-attachable connection as a read-only database "db_<name>"."""
        for conn_name, connection in self.target_model.conn_set.get_connections_as_dict().items():
            if not isinstance(connection, ConnectionProperties):
                continue
            attach_uri = connection.attach_uri_for_duckdb
            if attach_uri is None:
                continue
            attach_stmt = f"ATTACH IF NOT EXISTS '{attach_uri}' AS db_{conn_name} (READ_ONLY)"
            # Redact the URI in logs since it may embed credentials
            u.run_duckdb_stmt(self.logger, conn, attach_stmt, redacted_values=[attach_uri])

    async def _run_models(self) -> None:
        """Open a DuckDB connection and run the graph concurrently from its terminal nodes."""
        terminal_nodes = self._get_terminal_nodes()

        conn = u.create_duckdb_connection(datalake_db_path=self.datalake_db_path)
        try:
            self._attach_connections_with_type_duckdb(conn)

            coroutines = []
            for model_name in terminal_nodes:
                # "__fake_target" is the ad-hoc query path and is not in models_dict
                model = self.models_dict[model_name] if model_name != "__fake_target" else self.target_model
                coroutines.append(model.run_model(conn, self.placeholders))
            await u.asyncio_gather(coroutines)

        finally:
            conn.close()

    async def execute(
        self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user: AbstractUser, selections: dict[str, str],
        *, runquery: bool = True, recurse: bool = True, configurables: dict[str, str] = {}
    ) -> None:
        """End-to-end pipeline: selections -> context -> compile -> (optionally) run.

        Running implies recursing into dependencies, so recurse is forced on
        whenever runquery is True.
        """
        recurse = (recurse or runquery)

        self.apply_selections(param_cfg_set, user, selections)

        context, ctx_args = self._compile_context(param_args, context_func, user, configurables)

        self._compile_models(context, ctx_args, recurse)

        # Snapshot the placeholders gathered during context compilation
        self.placeholders = dict(ctx_args._placeholders)
        if runquery:
            await self._run_models()

        self.target_model.process_pass_through_columns(self.models_dict)

    def get_all_query_models(self) -> set[str]:
        """Return the names of all query models the target model depends on (incl. itself)."""
        all_model_names = set()
        self.target_model.retrieve_dependent_query_models(all_model_names)
        return all_model_names

    def get_all_data_models(self) -> list[rm.DataModelItem]:
        """Return metadata items for every model in the graph."""
        data_models = []
        for model_name, model in self.models_dict.items():
            is_queryable = model.is_queryable
            data_model = rm.DataModelItem(name=model_name, model_type=model.model_type.value, config=model.model_config, is_queryable=is_queryable)
            data_models.append(data_model)
        return data_models

    def get_all_model_lineage(self) -> list[rm.LineageRelation]:
        """Return dependency edges (build-time for BuildModels, run-time otherwise)."""
        model_lineage = []
        for model_name, model in self.models_dict.items():
            if not isinstance(model, QueryModel):
                continue
            for dep_model_name in model.model_config.depends_on:
                edge_type = "buildtime" if isinstance(model, BuildModel) else "runtime"
                source_model = rm.LineageNode(name=dep_model_name, type="model")
                target_model = rm.LineageNode(name=model_name, type="model")
                model_lineage.append(rm.LineageRelation(type=edge_type, source=source_model, target=target_model))
        return model_lineage
class ModelsIO:
    """File-system loaders for model query files (SQL/Python) and their YAML configs."""

    @classmethod
    def _load_model_config(cls, filepath: Path, model_type: ModelType, env_vars: SquirrelsEnvVars) -> mc.ModelConfig:
        """Load the sibling "<model>.yml" config (if any) into the typed config for *model_type*."""
        yaml_path = filepath.with_suffix('.yml')
        config_dict = u.load_yaml_config(yaml_path) if yaml_path.exists() else {}

        if model_type == ModelType.DBVIEW:
            # Dbview configs need a connection; fall back to the project default
            default_conn_name = env_vars.connections_default_name_used
            config = mc.DbviewModelConfig(**config_dict).finalize_connection(default_conn_name=default_conn_name)
            return config
        elif model_type == ModelType.FEDERATE:
            return mc.FederateModelConfig(**config_dict)
        elif model_type == ModelType.BUILD:
            return mc.BuildModelConfig(**config_dict)
        else:
            return mc.ModelConfig(**config_dict)

    @classmethod
    def _populate_from_file(
        cls, raw_queries_by_model: dict[str, mq.QueryFileWithConfig], dp: str, file: str, model_type: ModelType, env_vars: SquirrelsEnvVars
    ) -> None:
        """Register one .sql/.py model file (keyed by file stem) into *raw_queries_by_model*.

        Raises:
            u.ConfigurationError: if two files define the same model name.
        """
        filepath = Path(dp, file)
        file_stem, extension = os.path.splitext(file)

        if extension == '.py':
            module = pm.PyModule(filepath, project_path=env_vars.project_path)
            raw_query = module.get_func_or_class(c.MAIN_FUNC)
            query_file = mq.PyQueryFile(filepath.as_posix(), raw_query)
        elif extension == '.sql':
            query_file = mq.SqlQueryFile(filepath.as_posix(), filepath.read_text())
        else:
            return # Skip files that are not query files

        if file_stem in raw_queries_by_model:
            assert isinstance(prior_query_file := raw_queries_by_model[file_stem].query_file, mq.QueryFile)
            conflicts = [prior_query_file.filepath, query_file.filepath]
            raise u.ConfigurationError(f"Multiple models found for '{file_stem}': {conflicts}")

        model_config = cls._load_model_config(filepath, model_type, env_vars)
        raw_queries_by_model[file_stem] = mq.QueryFileWithConfig(query_file, model_config)

    @classmethod
    def _populate_raw_queries_for_type(
        cls, folder_path: Path, model_type: ModelType, env_vars: SquirrelsEnvVars
    ) -> dict[str, mq.QueryFileWithConfig]:
        """Walk *folder_path* recursively and load every model file of *model_type*."""
        raw_queries_by_model: dict[str, mq.QueryFileWithConfig] = {}
        for dp, _, filenames in os.walk(folder_path):
            for file in filenames:
                cls._populate_from_file(raw_queries_by_model, dp, file, model_type, env_vars)
        return raw_queries_by_model

    @classmethod
    def load_build_files(cls, logger: u.Logger, env_vars: SquirrelsEnvVars) -> dict[str, mq.QueryFileWithConfig]:
        """Load all build model files from the project's models/builds folder."""
        start = time.time()
        builds_path = u.Path(env_vars.project_path, c.MODELS_FOLDER, c.BUILDS_FOLDER)
        raw_queries_by_model = cls._populate_raw_queries_for_type(builds_path, ModelType.BUILD, env_vars=env_vars)
        logger.log_activity_time("loading build files", start)
        return raw_queries_by_model

    @classmethod
    def load_dbview_files(cls, logger: u.Logger, env_vars: SquirrelsEnvVars) -> dict[str, mq.QueryFileWithConfig]:
        """Load all dbview model files from the project's models/dbviews folder."""
        start = time.time()
        dbviews_path = u.Path(env_vars.project_path, c.MODELS_FOLDER, c.DBVIEWS_FOLDER)
        raw_queries_by_model = cls._populate_raw_queries_for_type(dbviews_path, ModelType.DBVIEW, env_vars=env_vars)
        logger.log_activity_time("loading dbview files", start)
        return raw_queries_by_model

    @classmethod
    def load_federate_files(cls, logger: u.Logger, env_vars: SquirrelsEnvVars) -> dict[str, mq.QueryFileWithConfig]:
        """Load all federate model files from the project's models/federates folder."""
        start = time.time()
        federates_path = u.Path(env_vars.project_path, c.MODELS_FOLDER, c.FEDERATES_FOLDER)
        raw_queries_by_model = cls._populate_raw_queries_for_type(federates_path, ModelType.FEDERATE, env_vars=env_vars)
        logger.log_activity_time("loading federate files", start)
        return raw_queries_by_model

    @classmethod
    def load_context_func(cls, logger: u.Logger, project_path: str) -> ContextFunc:
        """Load the project's context.py main function (no-op default if absent)."""
        start = time.time()

        context_path = u.Path(project_path, c.PYCONFIGS_FOLDER, c.CONTEXT_FILE)
        context_func: ContextFunc = pm.PyModule(
            context_path, project_path=project_path
        ).get_func_or_class(c.MAIN_FUNC, default_attr=lambda ctx, sqrl: None)

        logger.log_activity_time("loading file for context.py", start)
        return context_func