squirrels 0.5.0b4__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of squirrels might be problematic. Click here for more details.
- squirrels/__init__.py +2 -0
- squirrels/_api_routes/auth.py +83 -74
- squirrels/_api_routes/base.py +58 -41
- squirrels/_api_routes/dashboards.py +37 -21
- squirrels/_api_routes/data_management.py +72 -27
- squirrels/_api_routes/datasets.py +107 -84
- squirrels/_api_routes/oauth2.py +11 -13
- squirrels/_api_routes/project.py +71 -33
- squirrels/_api_server.py +130 -63
- squirrels/_arguments/run_time_args.py +9 -9
- squirrels/_auth.py +117 -162
- squirrels/_command_line.py +68 -32
- squirrels/_compile_prompts.py +147 -0
- squirrels/_connection_set.py +11 -2
- squirrels/_constants.py +22 -8
- squirrels/_data_sources.py +38 -32
- squirrels/_dataset_types.py +2 -4
- squirrels/_initializer.py +1 -1
- squirrels/_logging.py +117 -0
- squirrels/_manifest.py +125 -58
- squirrels/_model_builder.py +10 -54
- squirrels/_models.py +224 -108
- squirrels/_package_data/base_project/.env +15 -4
- squirrels/_package_data/base_project/.env.example +14 -3
- squirrels/_package_data/base_project/connections.yml +4 -3
- squirrels/_package_data/base_project/dashboards/dashboard_example.py +2 -2
- squirrels/_package_data/base_project/dashboards/dashboard_example.yml +4 -4
- squirrels/_package_data/base_project/duckdb_init.sql +1 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.sql +7 -2
- squirrels/_package_data/base_project/models/dbviews/dbview_example.yml +16 -10
- squirrels/_package_data/base_project/models/federates/federate_example.py +22 -15
- squirrels/_package_data/base_project/models/federates/federate_example.sql +3 -7
- squirrels/_package_data/base_project/models/federates/federate_example.yml +1 -1
- squirrels/_package_data/base_project/models/sources.yml +5 -6
- squirrels/_package_data/base_project/parameters.yml +24 -38
- squirrels/_package_data/base_project/pyconfigs/connections.py +5 -1
- squirrels/_package_data/base_project/pyconfigs/context.py +23 -12
- squirrels/_package_data/base_project/pyconfigs/parameters.py +68 -33
- squirrels/_package_data/base_project/pyconfigs/user.py +11 -18
- squirrels/_package_data/base_project/seeds/seed_categories.yml +1 -1
- squirrels/_package_data/base_project/seeds/seed_subcategories.yml +1 -1
- squirrels/_package_data/base_project/squirrels.yml.j2 +18 -28
- squirrels/_package_data/templates/squirrels_studio.html +20 -0
- squirrels/_parameter_configs.py +43 -22
- squirrels/_parameter_options.py +1 -1
- squirrels/_parameter_sets.py +8 -10
- squirrels/_project.py +351 -234
- squirrels/_request_context.py +33 -0
- squirrels/_schemas/auth_models.py +32 -9
- squirrels/_schemas/query_param_models.py +9 -1
- squirrels/_schemas/response_models.py +36 -10
- squirrels/_seeds.py +1 -1
- squirrels/_sources.py +23 -19
- squirrels/_utils.py +83 -35
- squirrels/_version.py +1 -1
- squirrels/arguments.py +5 -0
- squirrels/auth.py +4 -1
- squirrels/connections.py +2 -0
- squirrels/dashboards.py +3 -1
- squirrels/data_sources.py +6 -0
- squirrels/parameter_options.py +5 -0
- squirrels/parameters.py +5 -0
- squirrels/types.py +6 -1
- {squirrels-0.5.0b4.dist-info → squirrels-0.5.1.dist-info}/METADATA +28 -13
- squirrels-0.5.1.dist-info/RECORD +98 -0
- squirrels-0.5.0b4.dist-info/RECORD +0 -94
- {squirrels-0.5.0b4.dist-info → squirrels-0.5.1.dist-info}/WHEEL +0 -0
- {squirrels-0.5.0b4.dist-info → squirrels-0.5.1.dist-info}/entry_points.txt +0 -0
- {squirrels-0.5.0b4.dist-info → squirrels-0.5.1.dist-info}/licenses/LICENSE +0 -0
squirrels/_models.py
CHANGED
|
@@ -5,26 +5,26 @@ from abc import ABCMeta, abstractmethod
|
|
|
5
5
|
from enum import Enum
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
import asyncio, os, re, time, duckdb, sqlglot
|
|
8
|
-
import polars as pl, pandas as pd
|
|
8
|
+
import polars as pl, pandas as pd
|
|
9
9
|
|
|
10
10
|
from . import _constants as c, _utils as u, _py_module as pm, _model_queries as mq, _model_configs as mc, _sources as src
|
|
11
11
|
from ._schemas import response_models as rm
|
|
12
12
|
from ._exceptions import FileExecutionError, InvalidInputError
|
|
13
13
|
from ._arguments.run_time_args import ContextArgs, ModelArgs, BuildModelArgs
|
|
14
|
-
from ._auth import
|
|
14
|
+
from ._auth import AbstractUser
|
|
15
15
|
from ._connection_set import ConnectionsArgs, ConnectionSet, ConnectionProperties
|
|
16
|
-
from ._manifest import DatasetConfig
|
|
16
|
+
from ._manifest import DatasetConfig, ConnectionTypeEnum
|
|
17
17
|
from ._parameter_sets import ParameterConfigsSet, ParametersArgs, ParameterSet
|
|
18
18
|
|
|
19
19
|
ContextFunc = Callable[[dict[str, Any], ContextArgs], None]
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
class ModelType(Enum):
|
|
23
|
+
SEED = "seed"
|
|
23
24
|
SOURCE = "source"
|
|
25
|
+
BUILD = "build"
|
|
24
26
|
DBVIEW = "dbview"
|
|
25
27
|
FEDERATE = "federate"
|
|
26
|
-
SEED = "seed"
|
|
27
|
-
BUILD = "build"
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
@dataclass
|
|
@@ -79,15 +79,15 @@ class DataModel(metaclass=ABCMeta):
|
|
|
79
79
|
self.confirmed_no_cycles = True
|
|
80
80
|
return terminal_nodes
|
|
81
81
|
|
|
82
|
-
def _load_duckdb_view_to_python_df(self, conn: duckdb.DuckDBPyConnection, *,
|
|
83
|
-
table_name = ("
|
|
82
|
+
def _load_duckdb_view_to_python_df(self, conn: duckdb.DuckDBPyConnection, *, use_datalake: bool = False) -> pl.LazyFrame:
|
|
83
|
+
table_name = ("vdl." if use_datalake else "") + self.name
|
|
84
84
|
try:
|
|
85
85
|
return conn.sql(f"FROM {table_name}").pl().lazy()
|
|
86
86
|
except duckdb.CatalogException as e:
|
|
87
87
|
raise u.ConfigurationError(f'Failed to load duckdb table or view "{self.name}" to python dataframe') from e
|
|
88
88
|
|
|
89
89
|
def _run_sql_query_on_connection(self, connection_name: str, query: str, placeholders: dict = {}) -> pl.DataFrame:
|
|
90
|
-
self.logger.
|
|
90
|
+
self.logger.debug(f"Running SQL query on connection '{connection_name}':\n{query}")
|
|
91
91
|
return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)
|
|
92
92
|
|
|
93
93
|
async def _trigger(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
@@ -133,11 +133,13 @@ class DataModel(metaclass=ABCMeta):
|
|
|
133
133
|
|
|
134
134
|
def _create_table_from_df(self, conn: duckdb.DuckDBPyConnection, query_result: pl.LazyFrame | pd.DataFrame):
|
|
135
135
|
local_conn = conn.cursor()
|
|
136
|
+
# local_conn = conn
|
|
136
137
|
try:
|
|
137
|
-
|
|
138
|
-
local_conn.execute(f"CREATE OR REPLACE TABLE {self.name} AS
|
|
138
|
+
assert query_result is not None
|
|
139
|
+
local_conn.execute(f"CREATE OR REPLACE TABLE {self.name} AS FROM query_result")
|
|
139
140
|
finally:
|
|
140
141
|
local_conn.close()
|
|
142
|
+
# pass
|
|
141
143
|
|
|
142
144
|
def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
|
|
143
145
|
pass
|
|
@@ -172,19 +174,26 @@ class StaticModel(DataModel):
|
|
|
172
174
|
def _get_result(self, conn: duckdb.DuckDBPyConnection) -> pl.LazyFrame:
|
|
173
175
|
local_conn = conn.cursor()
|
|
174
176
|
try:
|
|
175
|
-
return self._load_duckdb_view_to_python_df(local_conn,
|
|
177
|
+
return self._load_duckdb_view_to_python_df(local_conn, use_datalake=True)
|
|
176
178
|
except Exception as e:
|
|
177
|
-
raise InvalidInputError(409, f'
|
|
179
|
+
raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
|
|
178
180
|
finally:
|
|
179
181
|
local_conn.close()
|
|
180
182
|
|
|
181
183
|
async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
182
|
-
start = time.time()
|
|
183
|
-
|
|
184
184
|
if (self.needs_python_df or self.is_target) and self.result is None:
|
|
185
|
+
start = time.time()
|
|
186
|
+
|
|
185
187
|
self.result = await asyncio.to_thread(self._get_result, conn)
|
|
186
|
-
|
|
187
|
-
|
|
188
|
+
|
|
189
|
+
self.logger.log_activity_time(
|
|
190
|
+
f"loading {self.model_type.value} model '{self.name}' into memory", start,
|
|
191
|
+
additional_data={
|
|
192
|
+
"activity": "loading static data model into memory",
|
|
193
|
+
"model_name": self.name,
|
|
194
|
+
"model_type": self.model_type.value
|
|
195
|
+
}
|
|
196
|
+
)
|
|
188
197
|
|
|
189
198
|
await super().run_model(conn, placeholders)
|
|
190
199
|
|
|
@@ -225,10 +234,18 @@ class Seed(StaticModel):
|
|
|
225
234
|
start = time.time()
|
|
226
235
|
|
|
227
236
|
print(f"[{u.get_current_time()}] 🔨 BUILDING: seed model '{self.name}'")
|
|
228
|
-
await asyncio.to_thread(self._create_table_from_df, conn, self.result)
|
|
237
|
+
# await asyncio.to_thread(self._create_table_from_df, conn, self.result)
|
|
238
|
+
self._create_table_from_df(conn, self.result) # without threading
|
|
229
239
|
|
|
230
240
|
print(f"[{u.get_current_time()}] ✅ FINISHED: seed model '{self.name}'")
|
|
231
|
-
self.logger.log_activity_time(
|
|
241
|
+
self.logger.log_activity_time(
|
|
242
|
+
f"building seed model '{self.name}' into VDL", start,
|
|
243
|
+
additional_data={
|
|
244
|
+
"activity": "building data model into VDL",
|
|
245
|
+
"model_name": self.name,
|
|
246
|
+
"model_type": self.model_type.value
|
|
247
|
+
}
|
|
248
|
+
)
|
|
232
249
|
|
|
233
250
|
await super().build_model(conn, full_refresh)
|
|
234
251
|
|
|
@@ -240,24 +257,32 @@ class SourceModel(StaticModel):
|
|
|
240
257
|
@property
|
|
241
258
|
def model_type(self) -> ModelType:
|
|
242
259
|
return ModelType.SOURCE
|
|
260
|
+
|
|
261
|
+
@property
|
|
262
|
+
def connection_props(self) -> ConnectionProperties:
|
|
263
|
+
conn_name = self.model_config.get_connection()
|
|
264
|
+
conn_props = self.conn_set.get_connection(conn_name)
|
|
265
|
+
if isinstance(conn_props, ConnectionProperties):
|
|
266
|
+
return conn_props
|
|
267
|
+
raise u.ConfigurationError(f'Unable to use connection "{conn_name}" for source "{self.name}". Connection "{conn_name}" must be a ConnectionProperties object')
|
|
243
268
|
|
|
244
269
|
@property
|
|
245
270
|
def is_queryable(self) -> bool:
|
|
246
|
-
|
|
271
|
+
connection_props = self.connection_props
|
|
272
|
+
return self.model_config.load_to_vdl or connection_props.type == ConnectionTypeEnum.DUCKDB
|
|
247
273
|
|
|
248
274
|
def _build_source_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
249
275
|
local_conn = conn.cursor()
|
|
276
|
+
# local_conn = conn
|
|
277
|
+
|
|
278
|
+
local_conn.begin()
|
|
250
279
|
try:
|
|
251
280
|
source = self.model_config
|
|
252
281
|
conn_name = source.get_connection()
|
|
253
282
|
|
|
254
|
-
connection_props = self.
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
attach_uri = connection_props.attach_uri_for_duckdb
|
|
258
|
-
else:
|
|
259
|
-
raise u.ConfigurationError(f'Unable to use connection "{conn_name}" for source "{self.name}". Connection "{conn_name}" must be a ConnectionProperties object')
|
|
260
|
-
|
|
283
|
+
connection_props = self.connection_props
|
|
284
|
+
dialect = connection_props.dialect
|
|
285
|
+
attach_uri = connection_props.attach_uri_for_duckdb
|
|
261
286
|
if attach_uri is None:
|
|
262
287
|
raise u.ConfigurationError(f'Loading to duckdb is not supported for source "{self.name}" since its connection "{conn_name}" uses an unsupported dialect')
|
|
263
288
|
|
|
@@ -269,8 +294,9 @@ class SourceModel(StaticModel):
|
|
|
269
294
|
new_table_name = self.name
|
|
270
295
|
|
|
271
296
|
if len(source.columns) == 0:
|
|
272
|
-
stmt = f"CREATE OR REPLACE TABLE {new_table_name} AS
|
|
297
|
+
stmt = f"CREATE OR REPLACE TABLE {new_table_name} AS FROM db_{conn_name}.{table_name}"
|
|
273
298
|
u.run_duckdb_stmt(self.logger, local_conn, stmt)
|
|
299
|
+
local_conn.commit()
|
|
274
300
|
return
|
|
275
301
|
|
|
276
302
|
increasing_column = source.update_hints.increasing_column
|
|
@@ -297,25 +323,44 @@ class SourceModel(StaticModel):
|
|
|
297
323
|
if max_val_of_incr_col is None:
|
|
298
324
|
recreate_table = True
|
|
299
325
|
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
326
|
+
query = source.get_query_for_upsert(dialect, conn_name, table_name, max_val_of_incr_col, full_refresh=recreate_table)
|
|
327
|
+
|
|
328
|
+
primary_keys = ", ".join(source.primary_key) if source.primary_key else ""
|
|
329
|
+
match_condition = f"USING ({primary_keys})" if primary_keys else "ON false"
|
|
330
|
+
stmt = (
|
|
331
|
+
f"MERGE INTO {new_table_name} "
|
|
332
|
+
f"USING ({query}) AS src "
|
|
333
|
+
f"{match_condition} "
|
|
334
|
+
f"WHEN MATCHED THEN UPDATE "
|
|
335
|
+
f"WHEN NOT MATCHED THEN INSERT BY NAME"
|
|
336
|
+
)
|
|
304
337
|
u.run_duckdb_stmt(self.logger, local_conn, stmt)
|
|
338
|
+
|
|
339
|
+
local_conn.commit()
|
|
340
|
+
|
|
305
341
|
finally:
|
|
306
342
|
local_conn.close()
|
|
343
|
+
# pass
|
|
307
344
|
|
|
308
345
|
async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
309
|
-
if self.model_config.
|
|
346
|
+
if self.model_config.load_to_vdl:
|
|
310
347
|
start = time.time()
|
|
311
348
|
print(f"[{u.get_current_time()}] 🔨 BUILDING: source model '{self.name}'")
|
|
312
349
|
|
|
313
|
-
await asyncio.to_thread(self._build_source_model, conn, full_refresh)
|
|
350
|
+
# await asyncio.to_thread(self._build_source_model, conn, full_refresh)
|
|
351
|
+
self._build_source_model(conn, full_refresh) # without threading
|
|
314
352
|
|
|
315
353
|
print(f"[{u.get_current_time()}] ✅ FINISHED: source model '{self.name}'")
|
|
316
|
-
self.logger.log_activity_time(
|
|
354
|
+
self.logger.log_activity_time(
|
|
355
|
+
f"building source model '{self.name}' into VDL", start,
|
|
356
|
+
additional_data={
|
|
357
|
+
"activity": "building data model into VDL",
|
|
358
|
+
"model_name": self.name,
|
|
359
|
+
"model_type": self.model_type.value
|
|
360
|
+
}
|
|
361
|
+
)
|
|
317
362
|
|
|
318
|
-
|
|
363
|
+
await super().build_model(conn, full_refresh)
|
|
319
364
|
|
|
320
365
|
|
|
321
366
|
@dataclass
|
|
@@ -338,10 +383,16 @@ class QueryModel(DataModel):
|
|
|
338
383
|
raise u.ConfigurationError(f'Model "{self.name}" references unknown model "{dependent_model_name}"')
|
|
339
384
|
|
|
340
385
|
dep_model = models_dict[dependent_model_name]
|
|
341
|
-
if isinstance(dep_model, SourceModel) and not dep_model.model_config.
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
)
|
|
386
|
+
if isinstance(dep_model, SourceModel) and not dep_model.model_config.load_to_vdl:
|
|
387
|
+
# Allow when caller is Build or Federate AND the source connection is duckdb; else error
|
|
388
|
+
conn_name = dep_model.model_config.get_connection()
|
|
389
|
+
conn_props = self.conn_set.get_connection(conn_name)
|
|
390
|
+
is_duckdb_conn = isinstance(conn_props, ConnectionProperties) and conn_props.type == ConnectionTypeEnum.DUCKDB
|
|
391
|
+
if not is_duckdb_conn:
|
|
392
|
+
raise u.ConfigurationError(
|
|
393
|
+
f'Model "{self.name}" cannot reference source model "{dependent_model_name}". '
|
|
394
|
+
'To be referenced by a build or federate model, the source must have load_to_vdl=True or a duckdb connection type.'
|
|
395
|
+
)
|
|
345
396
|
|
|
346
397
|
self.model_config.depends_on.add(dependent_model_name)
|
|
347
398
|
return dependent_model_name
|
|
@@ -359,7 +410,7 @@ class QueryModel(DataModel):
|
|
|
359
410
|
is_placeholder = lambda placeholder: placeholder in ctx_args._placeholders_copy
|
|
360
411
|
kwargs = {
|
|
361
412
|
"proj_vars": ctx_args.proj_vars, "env_vars": ctx_args.env_vars, "user": ctx_args.user, "prms": ctx_args.prms,
|
|
362
|
-
"
|
|
413
|
+
"configurables": ctx_args.configurables, "ctx": ctx, "is_placeholder": is_placeholder, "set_placeholder": ctx_args.set_placeholder,
|
|
363
414
|
"param_exists": ctx_args.param_exists
|
|
364
415
|
}
|
|
365
416
|
return kwargs
|
|
@@ -433,7 +484,7 @@ class QueryModel(DataModel):
|
|
|
433
484
|
def _log_sql_to_run(self, sql: str, placeholders: dict[str, Any]) -> None:
|
|
434
485
|
log_msg = f"SQL to run for model '{self.name}':\n{sql}"
|
|
435
486
|
log_msg += f"\n\n(with placeholders: {placeholders})"
|
|
436
|
-
self.logger.
|
|
487
|
+
self.logger.debug(log_msg)
|
|
437
488
|
|
|
438
489
|
|
|
439
490
|
@dataclass
|
|
@@ -458,11 +509,11 @@ class DbviewModel(QueryModel):
|
|
|
458
509
|
if source_model.model_config.get_connection() != self.model_config.get_connection():
|
|
459
510
|
raise u.ConfigurationError(f'Dbview "{self.name}" references source "{source_name}" with different connection')
|
|
460
511
|
|
|
461
|
-
# Check if the source model has
|
|
462
|
-
if not source_model.model_config.
|
|
512
|
+
# Check if the source model has load_to_vdl=False but this dbview has translate_to_duckdb=True
|
|
513
|
+
if not source_model.model_config.load_to_vdl and self.model_config.translate_to_duckdb:
|
|
463
514
|
raise u.ConfigurationError(
|
|
464
515
|
f'Dbview "{self.name}" with translate_to_duckdb=True cannot reference source "{source_name}" '
|
|
465
|
-
f'which has
|
|
516
|
+
f'which has load_to_vdl=False'
|
|
466
517
|
)
|
|
467
518
|
|
|
468
519
|
self.model_config.depends_on.add(source_name)
|
|
@@ -475,10 +526,11 @@ class DbviewModel(QueryModel):
|
|
|
475
526
|
|
|
476
527
|
def _get_duckdb_query(self, read_dialect: str, query: str) -> str:
|
|
477
528
|
kwargs = {
|
|
478
|
-
"source": lambda source_name: "
|
|
529
|
+
"source": lambda source_name: "vdl." + source_name
|
|
479
530
|
}
|
|
480
531
|
compiled_query = self._get_compiled_sql_query_str(query, kwargs)
|
|
481
|
-
|
|
532
|
+
duckdb_query = sqlglot.transpile(compiled_query, read=read_dialect, write="duckdb", pretty=True)[0]
|
|
533
|
+
return "-- translated to duckdb\n" + duckdb_query
|
|
482
534
|
|
|
483
535
|
def _compile_sql_model(self, kwargs: dict[str, Any]) -> mq.SqlModelQuery:
|
|
484
536
|
compiled_query_str = self._get_compiled_sql_query_str(self.query_file.raw_query, kwargs)
|
|
@@ -487,15 +539,20 @@ class DbviewModel(QueryModel):
|
|
|
487
539
|
connection_props = self.conn_set.get_connection(connection_name)
|
|
488
540
|
|
|
489
541
|
if self.model_config.translate_to_duckdb and isinstance(connection_props, ConnectionProperties):
|
|
490
|
-
|
|
491
|
-
|
|
542
|
+
# Forbid translate_to_duckdb when dbview connection is duckdb
|
|
543
|
+
if connection_props.type == ConnectionTypeEnum.DUCKDB:
|
|
544
|
+
raise u.ConfigurationError(
|
|
545
|
+
f'Dbview "{self.name}" has translate_to_duckdb=True but its connection is duckdb. Use a federate model instead.'
|
|
546
|
+
)
|
|
547
|
+
macros = {
|
|
548
|
+
"source": lambda source_name: "vdl." + source_name
|
|
492
549
|
}
|
|
493
550
|
compiled_query2 = self._get_compiled_sql_query_str(compiled_query_str, macros)
|
|
494
551
|
compiled_query_str = self._get_duckdb_query(connection_props.dialect, compiled_query2)
|
|
495
552
|
is_duckdb = True
|
|
496
553
|
else:
|
|
497
|
-
macros = {
|
|
498
|
-
"source": lambda source_name: self.sources[source_name].get_table()
|
|
554
|
+
macros = {
|
|
555
|
+
"source": lambda source_name: self.sources[source_name].get_table()
|
|
499
556
|
}
|
|
500
557
|
compiled_query_str = self._get_compiled_sql_query_str(compiled_query_str, macros)
|
|
501
558
|
is_duckdb = False
|
|
@@ -516,7 +573,14 @@ class DbviewModel(QueryModel):
|
|
|
516
573
|
kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
|
|
517
574
|
self.compiled_query = self._compile_sql_model(kwargs)
|
|
518
575
|
|
|
519
|
-
self.logger.log_activity_time(
|
|
576
|
+
self.logger.log_activity_time(
|
|
577
|
+
f"compiling dbview model '{self.name}'", start,
|
|
578
|
+
additional_data={
|
|
579
|
+
"activity": "compiling data model",
|
|
580
|
+
"model_name": self.name,
|
|
581
|
+
"model_type": self.model_type.value
|
|
582
|
+
}
|
|
583
|
+
)
|
|
520
584
|
|
|
521
585
|
async def _run_sql_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
|
|
522
586
|
assert self.compiled_query is not None
|
|
@@ -532,7 +596,7 @@ class DbviewModel(QueryModel):
|
|
|
532
596
|
self.logger.info(f"Running dbview '{self.name}' on duckdb")
|
|
533
597
|
return local_conn.sql(query, params=placeholders).pl()
|
|
534
598
|
except duckdb.CatalogException as e:
|
|
535
|
-
raise InvalidInputError(409, f'
|
|
599
|
+
raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
|
|
536
600
|
except Exception as e:
|
|
537
601
|
raise RuntimeError(e)
|
|
538
602
|
finally:
|
|
@@ -552,7 +616,14 @@ class DbviewModel(QueryModel):
|
|
|
552
616
|
|
|
553
617
|
await self._run_sql_model(conn, placeholders)
|
|
554
618
|
|
|
555
|
-
self.logger.log_activity_time(
|
|
619
|
+
self.logger.log_activity_time(
|
|
620
|
+
f"running dbview model '{self.name}'", start,
|
|
621
|
+
additional_data={
|
|
622
|
+
"activity": "running data model",
|
|
623
|
+
"model_name": self.name,
|
|
624
|
+
"model_type": self.model_type.value
|
|
625
|
+
}
|
|
626
|
+
)
|
|
556
627
|
|
|
557
628
|
await super().run_model(conn, placeholders)
|
|
558
629
|
|
|
@@ -574,8 +645,16 @@ class FederateModel(QueryModel):
|
|
|
574
645
|
|
|
575
646
|
def ref(dependent_model_name: str) -> str:
|
|
576
647
|
dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
|
|
577
|
-
|
|
578
|
-
|
|
648
|
+
dep = models_dict[dependent_model]
|
|
649
|
+
if isinstance(dep, BuildModel):
|
|
650
|
+
return "vdl." + dependent_model
|
|
651
|
+
if isinstance(dep, SourceModel):
|
|
652
|
+
if dep.model_config.load_to_vdl:
|
|
653
|
+
return "vdl." + dependent_model
|
|
654
|
+
conn_name = dep.model_config.get_connection()
|
|
655
|
+
table_name = dep.model_config.get_table()
|
|
656
|
+
return f"db_{conn_name}.{table_name}"
|
|
657
|
+
return dependent_model
|
|
579
658
|
|
|
580
659
|
kwargs["ref"] = ref
|
|
581
660
|
return kwargs
|
|
@@ -629,7 +708,14 @@ class FederateModel(QueryModel):
|
|
|
629
708
|
else:
|
|
630
709
|
raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")
|
|
631
710
|
|
|
632
|
-
self.logger.log_activity_time(
|
|
711
|
+
self.logger.log_activity_time(
|
|
712
|
+
f"compiling federate model '{self.name}'", start,
|
|
713
|
+
additional_data={
|
|
714
|
+
"activity": "compiling data model",
|
|
715
|
+
"model_name": self.name,
|
|
716
|
+
"model_type": self.model_type.value
|
|
717
|
+
}
|
|
718
|
+
)
|
|
633
719
|
|
|
634
720
|
if not recurse:
|
|
635
721
|
return
|
|
@@ -658,10 +744,13 @@ class FederateModel(QueryModel):
|
|
|
658
744
|
try:
|
|
659
745
|
return local_conn.execute(create_query, existing_placeholders)
|
|
660
746
|
except duckdb.CatalogException as e:
|
|
661
|
-
|
|
747
|
+
if self.name == "__fake_target":
|
|
748
|
+
raise InvalidInputError(409, "invalid_sql_query", f"Provided SQL query depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.")
|
|
749
|
+
else:
|
|
750
|
+
raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
|
|
662
751
|
except Exception as e:
|
|
663
752
|
if self.name == "__fake_target":
|
|
664
|
-
raise InvalidInputError(400, "
|
|
753
|
+
raise InvalidInputError(400, "invalid_sql_query", f"Failed to run provided SQL query")
|
|
665
754
|
else:
|
|
666
755
|
raise FileExecutionError(f'Failed to run federate sql model "{self.name}"', e) from e
|
|
667
756
|
|
|
@@ -688,7 +777,14 @@ class FederateModel(QueryModel):
|
|
|
688
777
|
else:
|
|
689
778
|
raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")
|
|
690
779
|
|
|
691
|
-
self.logger.log_activity_time(
|
|
780
|
+
self.logger.log_activity_time(
|
|
781
|
+
f"running federate model '{self.name}'", start,
|
|
782
|
+
additional_data={
|
|
783
|
+
"activity": "running data model",
|
|
784
|
+
"model_name": self.name,
|
|
785
|
+
"model_type": self.model_type.value
|
|
786
|
+
}
|
|
787
|
+
)
|
|
692
788
|
|
|
693
789
|
await super().run_model(conn, placeholders)
|
|
694
790
|
|
|
@@ -718,7 +814,12 @@ class BuildModel(StaticModel, QueryModel):
|
|
|
718
814
|
}
|
|
719
815
|
|
|
720
816
|
def ref_for_build(dependent_model_name: str) -> str:
|
|
721
|
-
dependent_model = self._ref_for_sql(dependent_model_name,
|
|
817
|
+
dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
|
|
818
|
+
dep = models_dict[dependent_model]
|
|
819
|
+
if isinstance(dep, SourceModel) and not dep.model_config.load_to_vdl:
|
|
820
|
+
conn_name = dep.model_config.get_connection()
|
|
821
|
+
table_name = dep.model_config.get_table()
|
|
822
|
+
return f"db_{conn_name}.{table_name}"
|
|
722
823
|
return dependent_model
|
|
723
824
|
|
|
724
825
|
kwargs["ref"] = ref_for_build
|
|
@@ -771,7 +872,14 @@ class BuildModel(StaticModel, QueryModel):
|
|
|
771
872
|
else:
|
|
772
873
|
raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")
|
|
773
874
|
|
|
774
|
-
self.logger.log_activity_time(
|
|
875
|
+
self.logger.log_activity_time(
|
|
876
|
+
f"compiling build model '{self.name}'", start,
|
|
877
|
+
additional_data={
|
|
878
|
+
"activity": "compiling data model",
|
|
879
|
+
"model_name": self.name,
|
|
880
|
+
"model_type": self.model_type.value
|
|
881
|
+
}
|
|
882
|
+
)
|
|
775
883
|
|
|
776
884
|
dependencies = self.model_config.depends_on
|
|
777
885
|
self.wait_count_for_build = len(dependencies)
|
|
@@ -786,14 +894,17 @@ class BuildModel(StaticModel, QueryModel):
|
|
|
786
894
|
def create_table():
|
|
787
895
|
create_query = self.model_config.get_sql_for_build(self.name, query)
|
|
788
896
|
local_conn = conn.cursor()
|
|
897
|
+
# local_conn = conn
|
|
789
898
|
try:
|
|
790
899
|
return u.run_duckdb_stmt(self.logger, local_conn, create_query, model_name=self.name)
|
|
791
900
|
except Exception as e:
|
|
792
901
|
raise FileExecutionError(f'Failed to build static sql model "{self.name}"', e) from e
|
|
793
902
|
finally:
|
|
794
903
|
local_conn.close()
|
|
904
|
+
# pass
|
|
795
905
|
|
|
796
|
-
await asyncio.to_thread(create_table)
|
|
906
|
+
# await asyncio.to_thread(create_table)
|
|
907
|
+
create_table() # without threading
|
|
797
908
|
|
|
798
909
|
async def _build_python_model(self, compiled_query: mq.PyModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
|
|
799
910
|
query_result = await asyncio.to_thread(compiled_query.query)
|
|
@@ -801,7 +912,8 @@ class BuildModel(StaticModel, QueryModel):
|
|
|
801
912
|
query_result = pl.from_pandas(query_result).lazy()
|
|
802
913
|
if self.needs_python_df_for_build:
|
|
803
914
|
self.result = query_result.lazy()
|
|
804
|
-
await asyncio.to_thread(self._create_table_from_df, conn, query_result)
|
|
915
|
+
# await asyncio.to_thread(self._create_table_from_df, conn, query_result)
|
|
916
|
+
self._create_table_from_df(conn, query_result) # without threading
|
|
805
917
|
|
|
806
918
|
async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
|
|
807
919
|
start = time.time()
|
|
@@ -814,24 +926,33 @@ class BuildModel(StaticModel, QueryModel):
|
|
|
814
926
|
def load_df(conn: duckdb.DuckDBPyConnection, dep_model: DataModel):
|
|
815
927
|
if dep_model.result is None:
|
|
816
928
|
local_conn = conn.cursor()
|
|
929
|
+
# local_conn = conn
|
|
817
930
|
try:
|
|
818
931
|
dep_model.result = dep_model._load_duckdb_view_to_python_df(local_conn)
|
|
819
932
|
finally:
|
|
820
933
|
local_conn.close()
|
|
934
|
+
# pass
|
|
821
935
|
|
|
822
936
|
coroutines = []
|
|
823
937
|
for dep_model in self.upstreams_for_build.values():
|
|
824
938
|
coro = asyncio.to_thread(load_df, conn, dep_model)
|
|
825
939
|
coroutines.append(coro)
|
|
826
940
|
await u.asyncio_gather(coroutines)
|
|
827
|
-
|
|
941
|
+
|
|
828
942
|
# Then run the model's Python function to build the model
|
|
829
943
|
await self._build_python_model(self.compiled_query, conn)
|
|
830
944
|
else:
|
|
831
945
|
raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")
|
|
832
946
|
|
|
833
947
|
print(f"[{u.get_current_time()}] ✅ FINISHED: build model '{self.name}'")
|
|
834
|
-
self.logger.log_activity_time(
|
|
948
|
+
self.logger.log_activity_time(
|
|
949
|
+
f"building static build model '{self.name}' into VDL", start,
|
|
950
|
+
additional_data={
|
|
951
|
+
"activity": "building data model into VDL",
|
|
952
|
+
"model_name": self.name,
|
|
953
|
+
"model_type": self.model_type.value
|
|
954
|
+
}
|
|
955
|
+
)
|
|
835
956
|
|
|
836
957
|
await super().build_model(conn, full_refresh)
|
|
837
958
|
|
|
@@ -841,7 +962,7 @@ class DAG:
|
|
|
841
962
|
dataset: DatasetConfig | None
|
|
842
963
|
target_model: DataModel
|
|
843
964
|
models_dict: dict[str, DataModel]
|
|
844
|
-
|
|
965
|
+
datalake_db_path: str | None = field(default=None)
|
|
845
966
|
logger: u.Logger = field(default_factory=lambda: u.Logger(""))
|
|
846
967
|
parameter_set: ParameterSet | None = field(default=None, init=False) # set in apply_selections
|
|
847
968
|
placeholders: dict[str, Any] = field(init=False, default_factory=dict)
|
|
@@ -850,36 +971,50 @@ class DAG:
|
|
|
850
971
|
return f" for dataset '{self.dataset.name}'" if self.dataset else ""
|
|
851
972
|
|
|
852
973
|
def compile_build_models(self, conn_args: ConnectionsArgs) -> None:
|
|
853
|
-
static_models: dict[str, StaticModel] = {
|
|
974
|
+
static_models: dict[str, StaticModel] = {
|
|
975
|
+
k: v for k, v in self.models_dict.items() if isinstance(v, StaticModel)
|
|
976
|
+
}
|
|
854
977
|
for model in static_models.values():
|
|
855
978
|
if isinstance(model, BuildModel):
|
|
856
979
|
model.compile_for_build(conn_args, static_models)
|
|
857
980
|
|
|
858
981
|
def apply_selections(
|
|
859
|
-
self, param_cfg_set: ParameterConfigsSet, user:
|
|
982
|
+
self, param_cfg_set: ParameterConfigsSet, user: AbstractUser, selections: dict[str, str]
|
|
860
983
|
) -> None:
|
|
861
984
|
start = time.time()
|
|
985
|
+
|
|
862
986
|
dataset_params = self.dataset.parameters if self.dataset else None
|
|
863
987
|
parameter_set = param_cfg_set.apply_selections(dataset_params, selections, user)
|
|
864
988
|
self.parameter_set = parameter_set
|
|
865
989
|
msg_extension = self._get_msg_extension()
|
|
866
|
-
|
|
990
|
+
|
|
991
|
+
dataset_name = self.dataset.name if self.dataset else None
|
|
992
|
+
self.logger.log_activity_time(
|
|
993
|
+
"applying selections" + msg_extension, start,
|
|
994
|
+
additional_data={"activity": "applying selections", "dataset_name": dataset_name}
|
|
995
|
+
)
|
|
867
996
|
|
|
868
997
|
def _compile_context(
|
|
869
|
-
self, param_args: ParametersArgs, context_func: ContextFunc, user:
|
|
998
|
+
self, param_args: ParametersArgs, context_func: ContextFunc, user: AbstractUser, configurables: dict[str, str]
|
|
870
999
|
) -> tuple[dict[str, Any], ContextArgs]:
|
|
871
1000
|
start = time.time()
|
|
1001
|
+
|
|
872
1002
|
context = {}
|
|
873
1003
|
assert isinstance(self.parameter_set, ParameterSet)
|
|
874
1004
|
prms = self.parameter_set.get_parameters_as_dict()
|
|
875
|
-
|
|
876
|
-
args = ContextArgs(param_args, user, prms, traits)
|
|
1005
|
+
args = ContextArgs(param_args, user, prms, configurables)
|
|
877
1006
|
msg_extension = self._get_msg_extension()
|
|
1007
|
+
|
|
878
1008
|
try:
|
|
879
1009
|
context_func(context, args)
|
|
880
1010
|
except Exception as e:
|
|
881
1011
|
raise FileExecutionError(f'Failed to run {c.CONTEXT_FILE}' + msg_extension, e) from e
|
|
882
|
-
|
|
1012
|
+
|
|
1013
|
+
dataset_name = self.dataset.name if self.dataset else None
|
|
1014
|
+
self.logger.log_activity_time(
|
|
1015
|
+
"running context.py" + msg_extension, start,
|
|
1016
|
+
additional_data={"activity": "running context.py", "dataset_name": dataset_name}
|
|
1017
|
+
)
|
|
883
1018
|
return context, args
|
|
884
1019
|
|
|
885
1020
|
def _compile_models(self, context: dict[str, Any], ctx_args: ContextArgs, recurse: bool) -> None:
|
|
@@ -890,29 +1025,25 @@ class DAG:
|
|
|
890
1025
|
terminal_nodes = self.target_model.get_terminal_nodes(set())
|
|
891
1026
|
for model in self.models_dict.values():
|
|
892
1027
|
model.confirmed_no_cycles = False
|
|
893
|
-
self.logger.log_activity_time(
|
|
1028
|
+
self.logger.log_activity_time("validating no cycles in model dependencies", start)
|
|
894
1029
|
return terminal_nodes
|
|
895
1030
|
|
|
1031
|
+
def _attach_connections_with_type_duckdb(self, conn: duckdb.DuckDBPyConnection) -> None:
    """
    Attach every connection that exposes a DuckDB attach URI to the given DuckDB connection.

    Connections that are not ConnectionProperties instances, or whose
    ``attach_uri_for_duckdb`` is None, are skipped. Each remaining connection
    is attached read-only under the alias ``db_<connection name>``.

    Arguments:
        conn: The DuckDB connection that the ATTACH statements are run against
    """
    connections = self.target_model.conn_set.get_connections_as_dict()
    for conn_name, connection in connections.items():
        if not isinstance(connection, ConnectionProperties):
            continue

        attach_uri = connection.attach_uri_for_duckdb
        if attach_uri is None:
            continue

        # The URI is embedded in a single-quoted SQL string literal, so any embedded
        # single quote (e.g. inside a password) must be doubled; otherwise it would
        # terminate the literal early and produce malformed (or injectable) SQL.
        escaped_uri = attach_uri.replace("'", "''")
        attach_stmt = f"ATTACH IF NOT EXISTS '{escaped_uri}' AS db_{conn_name} (READ_ONLY)"

        # NOTE(review): presumably redacted_values are masked out of the logged statement;
        # the escaped form is included too so the URI stays hidden when escaping changed it.
        redacted_values = [attach_uri]
        if escaped_uri != attach_uri:
            redacted_values.append(escaped_uri)
        u.run_duckdb_stmt(self.logger, conn, attach_stmt, redacted_values=redacted_values)
|
|
1040
|
+
|
|
896
1041
|
async def _run_models(self) -> None:
|
|
897
1042
|
terminal_nodes = self._get_terminal_nodes()
|
|
898
1043
|
|
|
899
|
-
|
|
1044
|
+
conn = u.create_duckdb_connection(datalake_db_path=self.datalake_db_path)
|
|
900
1045
|
try:
|
|
901
|
-
|
|
902
|
-
conn.close()
|
|
903
|
-
except duckdb.IOException as e:
|
|
904
|
-
# unable to create duckdb venv file means it's in use and already exists
|
|
905
|
-
# do not throw error here since attaching in read-only mode later may still work
|
|
906
|
-
pass
|
|
907
|
-
|
|
908
|
-
conn = u.create_duckdb_connection()
|
|
909
|
-
try:
|
|
910
|
-
read_only = "(READ_ONLY)" if self.duckdb_filepath else ""
|
|
911
|
-
try:
|
|
912
|
-
conn.execute(f"ATTACH '{self.duckdb_filepath}' AS venv {read_only}")
|
|
913
|
-
except duckdb.IOException as e:
|
|
914
|
-
self.logger.warning(f"Unable to attach to duckdb venv file: {self.duckdb_filepath}")
|
|
915
|
-
raise e
|
|
1046
|
+
self._attach_connections_with_type_duckdb(conn)
|
|
916
1047
|
|
|
917
1048
|
coroutines = []
|
|
918
1049
|
for model_name in terminal_nodes:
|
|
@@ -924,14 +1055,14 @@ class DAG:
|
|
|
924
1055
|
conn.close()
|
|
925
1056
|
|
|
926
1057
|
async def execute(
|
|
927
|
-
self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user:
|
|
928
|
-
*, runquery: bool = True, recurse: bool = True,
|
|
1058
|
+
self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user: AbstractUser, selections: dict[str, str],
|
|
1059
|
+
*, runquery: bool = True, recurse: bool = True, configurables: dict[str, str] = {}
|
|
929
1060
|
) -> None:
|
|
930
1061
|
recurse = (recurse or runquery)
|
|
931
1062
|
|
|
932
1063
|
self.apply_selections(param_cfg_set, user, selections)
|
|
933
1064
|
|
|
934
|
-
context, ctx_args = self._compile_context(param_args, context_func, user,
|
|
1065
|
+
context, ctx_args = self._compile_context(param_args, context_func, user, configurables)
|
|
935
1066
|
|
|
936
1067
|
self._compile_models(context, ctx_args, recurse)
|
|
937
1068
|
|
|
@@ -946,21 +1077,6 @@ class DAG:
|
|
|
946
1077
|
self.target_model.retrieve_dependent_query_models(all_model_names)
|
|
947
1078
|
return all_model_names
|
|
948
1079
|
|
|
949
|
-
def to_networkx_graph(self) -> nx.DiGraph:
|
|
950
|
-
G = nx.DiGraph()
|
|
951
|
-
|
|
952
|
-
for model_name, model in self.models_dict.items():
|
|
953
|
-
level = model.get_max_path_length_to_target()
|
|
954
|
-
if level is not None:
|
|
955
|
-
G.add_node(model_name, layer=-level, model_type=model.model_type)
|
|
956
|
-
|
|
957
|
-
for model_name in G.nodes:
|
|
958
|
-
model = self.models_dict[model_name]
|
|
959
|
-
for dep_model_name in model.downstreams:
|
|
960
|
-
G.add_edge(model_name, dep_model_name)
|
|
961
|
-
|
|
962
|
-
return G
|
|
963
|
-
|
|
964
1080
|
def get_all_data_models(self) -> list[rm.DataModelItem]:
|
|
965
1081
|
data_models = []
|
|
966
1082
|
for model_name, model in self.models_dict.items():
|