squirrels 0.5.0b4__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.

Potentially problematic release: this version of squirrels might be problematic.

Files changed (69)
  1. squirrels/__init__.py +2 -0
  2. squirrels/_api_routes/auth.py +83 -74
  3. squirrels/_api_routes/base.py +58 -41
  4. squirrels/_api_routes/dashboards.py +37 -21
  5. squirrels/_api_routes/data_management.py +72 -27
  6. squirrels/_api_routes/datasets.py +107 -84
  7. squirrels/_api_routes/oauth2.py +11 -13
  8. squirrels/_api_routes/project.py +71 -33
  9. squirrels/_api_server.py +130 -63
  10. squirrels/_arguments/run_time_args.py +9 -9
  11. squirrels/_auth.py +117 -162
  12. squirrels/_command_line.py +68 -32
  13. squirrels/_compile_prompts.py +147 -0
  14. squirrels/_connection_set.py +11 -2
  15. squirrels/_constants.py +22 -8
  16. squirrels/_data_sources.py +38 -32
  17. squirrels/_dataset_types.py +2 -4
  18. squirrels/_initializer.py +1 -1
  19. squirrels/_logging.py +117 -0
  20. squirrels/_manifest.py +125 -58
  21. squirrels/_model_builder.py +10 -54
  22. squirrels/_models.py +224 -108
  23. squirrels/_package_data/base_project/.env +15 -4
  24. squirrels/_package_data/base_project/.env.example +14 -3
  25. squirrels/_package_data/base_project/connections.yml +4 -3
  26. squirrels/_package_data/base_project/dashboards/dashboard_example.py +2 -2
  27. squirrels/_package_data/base_project/dashboards/dashboard_example.yml +4 -4
  28. squirrels/_package_data/base_project/duckdb_init.sql +1 -0
  29. squirrels/_package_data/base_project/models/dbviews/dbview_example.sql +7 -2
  30. squirrels/_package_data/base_project/models/dbviews/dbview_example.yml +16 -10
  31. squirrels/_package_data/base_project/models/federates/federate_example.py +22 -15
  32. squirrels/_package_data/base_project/models/federates/federate_example.sql +3 -7
  33. squirrels/_package_data/base_project/models/federates/federate_example.yml +1 -1
  34. squirrels/_package_data/base_project/models/sources.yml +5 -6
  35. squirrels/_package_data/base_project/parameters.yml +24 -38
  36. squirrels/_package_data/base_project/pyconfigs/connections.py +5 -1
  37. squirrels/_package_data/base_project/pyconfigs/context.py +23 -12
  38. squirrels/_package_data/base_project/pyconfigs/parameters.py +68 -33
  39. squirrels/_package_data/base_project/pyconfigs/user.py +11 -18
  40. squirrels/_package_data/base_project/seeds/seed_categories.yml +1 -1
  41. squirrels/_package_data/base_project/seeds/seed_subcategories.yml +1 -1
  42. squirrels/_package_data/base_project/squirrels.yml.j2 +18 -28
  43. squirrels/_package_data/templates/squirrels_studio.html +20 -0
  44. squirrels/_parameter_configs.py +43 -22
  45. squirrels/_parameter_options.py +1 -1
  46. squirrels/_parameter_sets.py +8 -10
  47. squirrels/_project.py +351 -234
  48. squirrels/_request_context.py +33 -0
  49. squirrels/_schemas/auth_models.py +32 -9
  50. squirrels/_schemas/query_param_models.py +9 -1
  51. squirrels/_schemas/response_models.py +36 -10
  52. squirrels/_seeds.py +1 -1
  53. squirrels/_sources.py +23 -19
  54. squirrels/_utils.py +83 -35
  55. squirrels/_version.py +1 -1
  56. squirrels/arguments.py +5 -0
  57. squirrels/auth.py +4 -1
  58. squirrels/connections.py +2 -0
  59. squirrels/dashboards.py +3 -1
  60. squirrels/data_sources.py +6 -0
  61. squirrels/parameter_options.py +5 -0
  62. squirrels/parameters.py +5 -0
  63. squirrels/types.py +6 -1
  64. {squirrels-0.5.0b4.dist-info → squirrels-0.5.1.dist-info}/METADATA +28 -13
  65. squirrels-0.5.1.dist-info/RECORD +98 -0
  66. squirrels-0.5.0b4.dist-info/RECORD +0 -94
  67. {squirrels-0.5.0b4.dist-info → squirrels-0.5.1.dist-info}/WHEEL +0 -0
  68. {squirrels-0.5.0b4.dist-info → squirrels-0.5.1.dist-info}/entry_points.txt +0 -0
  69. {squirrels-0.5.0b4.dist-info → squirrels-0.5.1.dist-info}/licenses/LICENSE +0 -0
squirrels/_models.py CHANGED
@@ -5,26 +5,26 @@ from abc import ABCMeta, abstractmethod
  from enum import Enum
  from pathlib import Path
  import asyncio, os, re, time, duckdb, sqlglot
- import polars as pl, pandas as pd, networkx as nx
+ import polars as pl, pandas as pd

  from . import _constants as c, _utils as u, _py_module as pm, _model_queries as mq, _model_configs as mc, _sources as src
  from ._schemas import response_models as rm
  from ._exceptions import FileExecutionError, InvalidInputError
  from ._arguments.run_time_args import ContextArgs, ModelArgs, BuildModelArgs
- from ._auth import BaseUser
+ from ._auth import AbstractUser
  from ._connection_set import ConnectionsArgs, ConnectionSet, ConnectionProperties
- from ._manifest import DatasetConfig
+ from ._manifest import DatasetConfig, ConnectionTypeEnum
  from ._parameter_sets import ParameterConfigsSet, ParametersArgs, ParameterSet

  ContextFunc = Callable[[dict[str, Any], ContextArgs], None]


  class ModelType(Enum):
+ SEED = "seed"
  SOURCE = "source"
+ BUILD = "build"
  DBVIEW = "dbview"
  FEDERATE = "federate"
- SEED = "seed"
- BUILD = "build"


  @dataclass
@@ -79,15 +79,15 @@ class DataModel(metaclass=ABCMeta):
  self.confirmed_no_cycles = True
  return terminal_nodes

- def _load_duckdb_view_to_python_df(self, conn: duckdb.DuckDBPyConnection, *, use_venv: bool = False) -> pl.LazyFrame:
- table_name = ("venv." if use_venv else "") + self.name
+ def _load_duckdb_view_to_python_df(self, conn: duckdb.DuckDBPyConnection, *, use_datalake: bool = False) -> pl.LazyFrame:
+ table_name = ("vdl." if use_datalake else "") + self.name
  try:
  return conn.sql(f"FROM {table_name}").pl().lazy()
  except duckdb.CatalogException as e:
  raise u.ConfigurationError(f'Failed to load duckdb table or view "{self.name}" to python dataframe') from e

  def _run_sql_query_on_connection(self, connection_name: str, query: str, placeholders: dict = {}) -> pl.DataFrame:
- self.logger.info(f"Running sql query on connection '{connection_name}': {query}")
+ self.logger.debug(f"Running SQL query on connection '{connection_name}':\n{query}")
  return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)

  async def _trigger(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
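Note: the helper above hands a DuckDB table or view to Python as a lazy Polars frame via conn.sql(...).pl().lazy(). A minimal standalone sketch of that handoff, assuming the duckdb and polars packages are installed (the table name and data are made up for illustration):

    import duckdb

    con = duckdb.connect()  # in-memory database for the sketch
    con.execute("CREATE TABLE my_model AS SELECT 1 AS id, 'a' AS val")

    # A DuckDB relation converts directly to a Polars DataFrame with .pl();
    # .lazy() defers any further work until .collect() is called.
    lf = con.sql("FROM my_model").pl().lazy()
    print(lf.collect())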
@@ -133,11 +133,13 @@ class DataModel(metaclass=ABCMeta):

  def _create_table_from_df(self, conn: duckdb.DuckDBPyConnection, query_result: pl.LazyFrame | pd.DataFrame):
  local_conn = conn.cursor()
+ # local_conn = conn
  try:
- local_conn.register("df", query_result)
- local_conn.execute(f"CREATE OR REPLACE TABLE {self.name} AS SELECT * FROM df")
+ assert query_result is not None
+ local_conn.execute(f"CREATE OR REPLACE TABLE {self.name} AS FROM query_result")
  finally:
  local_conn.close()
+ # pass

  def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
  pass
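Note: the rewritten _create_table_from_df no longer calls register() on the dataframe; the CREATE OR REPLACE TABLE ... AS FROM query_result form relies on DuckDB resolving the Python variable query_result by name (a replacement scan). A minimal sketch of that behaviour, with an illustrative table name and data:

    import duckdb
    import pandas as pd

    query_result = pd.DataFrame({"id": [1, 2], "val": ["a", "b"]})

    con = duckdb.connect()
    # No register() call needed: DuckDB resolves the local variable named
    # "query_result" when the SQL statement references it by name.
    con.execute("CREATE OR REPLACE TABLE example AS FROM query_result")
    print(con.sql("FROM example"))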
@@ -172,19 +174,26 @@ class StaticModel(DataModel):
  def _get_result(self, conn: duckdb.DuckDBPyConnection) -> pl.LazyFrame:
  local_conn = conn.cursor()
  try:
- return self._load_duckdb_view_to_python_df(local_conn, use_venv=True)
+ return self._load_duckdb_view_to_python_df(local_conn, use_datalake=True)
  except Exception as e:
- raise InvalidInputError(409, f'Dependent data model not found.', f'Model "{self.name}" depends on static data models that cannot be found. Trying building the virtual data environment first.')
+ raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
  finally:
  local_conn.close()

  async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
- start = time.time()
-
  if (self.needs_python_df or self.is_target) and self.result is None:
+ start = time.time()
+
  self.result = await asyncio.to_thread(self._get_result, conn)
-
- self.logger.log_activity_time(f"loading static model '{self.name}'", start)
+
+ self.logger.log_activity_time(
+ f"loading {self.model_type.value} model '{self.name}' into memory", start,
+ additional_data={
+ "activity": "loading static data model into memory",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  await super().run_model(conn, placeholders)

@@ -225,10 +234,18 @@ class Seed(StaticModel):
  start = time.time()

  print(f"[{u.get_current_time()}] 🔨 BUILDING: seed model '{self.name}'")
- await asyncio.to_thread(self._create_table_from_df, conn, self.result)
+ # await asyncio.to_thread(self._create_table_from_df, conn, self.result)
+ self._create_table_from_df(conn, self.result) # without threading

  print(f"[{u.get_current_time()}] ✅ FINISHED: seed model '{self.name}'")
- self.logger.log_activity_time(f"building seed model '{self.name}' to venv", start)
+ self.logger.log_activity_time(
+ f"building seed model '{self.name}' into VDL", start,
+ additional_data={
+ "activity": "building data model into VDL",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  await super().build_model(conn, full_refresh)

@@ -240,24 +257,32 @@ class SourceModel(StaticModel):
  @property
  def model_type(self) -> ModelType:
  return ModelType.SOURCE
+
+ @property
+ def connection_props(self) -> ConnectionProperties:
+ conn_name = self.model_config.get_connection()
+ conn_props = self.conn_set.get_connection(conn_name)
+ if isinstance(conn_props, ConnectionProperties):
+ return conn_props
+ raise u.ConfigurationError(f'Unable to use connection "{conn_name}" for source "{self.name}". Connection "{conn_name}" must be a ConnectionProperties object')

  @property
  def is_queryable(self) -> bool:
- return self.model_config.load_to_duckdb
+ connection_props = self.connection_props
+ return self.model_config.load_to_vdl or connection_props.type == ConnectionTypeEnum.DUCKDB

  def _build_source_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
  local_conn = conn.cursor()
+ # local_conn = conn
+
+ local_conn.begin()
  try:
  source = self.model_config
  conn_name = source.get_connection()

- connection_props = self.conn_set.get_connection(conn_name)
- if isinstance(connection_props, ConnectionProperties):
- dialect = connection_props.dialect
- attach_uri = connection_props.attach_uri_for_duckdb
- else:
- raise u.ConfigurationError(f'Unable to use connection "{conn_name}" for source "{self.name}". Connection "{conn_name}" must be a ConnectionProperties object')
-
+ connection_props = self.connection_props
+ dialect = connection_props.dialect
+ attach_uri = connection_props.attach_uri_for_duckdb
  if attach_uri is None:
  raise u.ConfigurationError(f'Loading to duckdb is not supported for source "{self.name}" since its connection "{conn_name}" uses an unsupported dialect')

@@ -269,8 +294,9 @@ class SourceModel(StaticModel):
  new_table_name = self.name

  if len(source.columns) == 0:
- stmt = f"CREATE OR REPLACE TABLE {new_table_name} AS SELECT * FROM db_{conn_name}.{table_name}"
+ stmt = f"CREATE OR REPLACE TABLE {new_table_name} AS FROM db_{conn_name}.{table_name}"
  u.run_duckdb_stmt(self.logger, local_conn, stmt)
+ local_conn.commit()
  return

  increasing_column = source.update_hints.increasing_column
@@ -297,25 +323,44 @@ class SourceModel(StaticModel):
  if max_val_of_incr_col is None:
  recreate_table = True

- insert_cols_clause = source.get_cols_for_insert_stmt()
- insert_replace_clause = source.get_insert_replace_clause()
- query = source.get_query_for_insert(dialect, conn_name, table_name, max_val_of_incr_col, full_refresh=recreate_table)
- stmt = f"INSERT {insert_replace_clause} INTO {new_table_name} ({insert_cols_clause}) {query}"
+ query = source.get_query_for_upsert(dialect, conn_name, table_name, max_val_of_incr_col, full_refresh=recreate_table)
+
+ primary_keys = ", ".join(source.primary_key) if source.primary_key else ""
+ match_condition = f"USING ({primary_keys})" if primary_keys else "ON false"
+ stmt = (
+ f"MERGE INTO {new_table_name} "
+ f"USING ({query}) AS src "
+ f"{match_condition} "
+ f"WHEN MATCHED THEN UPDATE "
+ f"WHEN NOT MATCHED THEN INSERT BY NAME"
+ )
  u.run_duckdb_stmt(self.logger, local_conn, stmt)
+
+ local_conn.commit()
+
  finally:
  local_conn.close()
+ # pass

  async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
- if self.model_config.load_to_duckdb:
+ if self.model_config.load_to_vdl:
  start = time.time()
  print(f"[{u.get_current_time()}] 🔨 BUILDING: source model '{self.name}'")

- await asyncio.to_thread(self._build_source_model, conn, full_refresh)
+ # await asyncio.to_thread(self._build_source_model, conn, full_refresh)
+ self._build_source_model(conn, full_refresh) # without threading

  print(f"[{u.get_current_time()}] ✅ FINISHED: source model '{self.name}'")
- self.logger.log_activity_time(f"building source model '{self.name}' to venv", start)
+ self.logger.log_activity_time(
+ f"building source model '{self.name}' into VDL", start,
+ additional_data={
+ "activity": "building data model into VDL",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

- await super().build_model(conn, full_refresh)
+ await super().build_model(conn, full_refresh)


  @dataclass
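Note: the incremental load now issues a DuckDB MERGE statement keyed on the source's primary key instead of building an INSERT statement. A rough sketch of the upsert behaviour this produces, echoing the statement shape generated above (assumes a DuckDB version with MERGE INTO support; the table, column, and key names are illustrative):

    import duckdb

    con = duckdb.connect()
    con.execute("CREATE TABLE target (id INTEGER, val VARCHAR)")
    con.execute("INSERT INTO target VALUES (1, 'old')")
    con.execute("CREATE TABLE staged (id INTEGER, val VARCHAR)")
    con.execute("INSERT INTO staged VALUES (1, 'new'), (2, 'added')")

    # Rows matching on the key are updated; unmatched rows are inserted
    con.execute("""
        MERGE INTO target
        USING (FROM staged) AS src
        USING (id)
        WHEN MATCHED THEN UPDATE
        WHEN NOT MATCHED THEN INSERT BY NAME
    """)
    print(con.sql("FROM target ORDER BY id"))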
@@ -338,10 +383,16 @@ class QueryModel(DataModel):
  raise u.ConfigurationError(f'Model "{self.name}" references unknown model "{dependent_model_name}"')

  dep_model = models_dict[dependent_model_name]
- if isinstance(dep_model, SourceModel) and not dep_model.model_config.load_to_duckdb:
- raise u.ConfigurationError(
- f'Model "{self.name}" cannot reference source model "{dependent_model_name}" which has load_to_duckdb=False'
- )
+ if isinstance(dep_model, SourceModel) and not dep_model.model_config.load_to_vdl:
+ # Allow when caller is Build or Federate AND the source connection is duckdb; else error
+ conn_name = dep_model.model_config.get_connection()
+ conn_props = self.conn_set.get_connection(conn_name)
+ is_duckdb_conn = isinstance(conn_props, ConnectionProperties) and conn_props.type == ConnectionTypeEnum.DUCKDB
+ if not is_duckdb_conn:
+ raise u.ConfigurationError(
+ f'Model "{self.name}" cannot reference source model "{dependent_model_name}". '
+ 'To be referenced by a build or federate model, the source must have load_to_vdl=True or a duckdb connection type.'
+ )

  self.model_config.depends_on.add(dependent_model_name)
  return dependent_model_name
@@ -359,7 +410,7 @@ class QueryModel(DataModel):
  is_placeholder = lambda placeholder: placeholder in ctx_args._placeholders_copy
  kwargs = {
  "proj_vars": ctx_args.proj_vars, "env_vars": ctx_args.env_vars, "user": ctx_args.user, "prms": ctx_args.prms,
- "traits": ctx_args.traits, "ctx": ctx, "is_placeholder": is_placeholder, "set_placeholder": ctx_args.set_placeholder,
+ "configurables": ctx_args.configurables, "ctx": ctx, "is_placeholder": is_placeholder, "set_placeholder": ctx_args.set_placeholder,
  "param_exists": ctx_args.param_exists
  }
  return kwargs
@@ -433,7 +484,7 @@ class QueryModel(DataModel):
  def _log_sql_to_run(self, sql: str, placeholders: dict[str, Any]) -> None:
  log_msg = f"SQL to run for model '{self.name}':\n{sql}"
  log_msg += f"\n\n(with placeholders: {placeholders})"
- self.logger.info(log_msg)
+ self.logger.debug(log_msg)


  @dataclass
@@ -458,11 +509,11 @@ class DbviewModel(QueryModel):
  if source_model.model_config.get_connection() != self.model_config.get_connection():
  raise u.ConfigurationError(f'Dbview "{self.name}" references source "{source_name}" with different connection')

- # Check if the source model has load_to_duckdb=False but this dbview has translate_to_duckdb=True
- if not source_model.model_config.load_to_duckdb and self.model_config.translate_to_duckdb:
+ # Check if the source model has load_to_vdl=False but this dbview has translate_to_duckdb=True
+ if not source_model.model_config.load_to_vdl and self.model_config.translate_to_duckdb:
  raise u.ConfigurationError(
  f'Dbview "{self.name}" with translate_to_duckdb=True cannot reference source "{source_name}" '
- f'which has load_to_duckdb=False'
+ f'which has load_to_vdl=False'
  )

  self.model_config.depends_on.add(source_name)
@@ -475,10 +526,11 @@ class DbviewModel(QueryModel):

  def _get_duckdb_query(self, read_dialect: str, query: str) -> str:
  kwargs = {
- "source": lambda source_name: "venv." + source_name
+ "source": lambda source_name: "vdl." + source_name
  }
  compiled_query = self._get_compiled_sql_query_str(query, kwargs)
- return sqlglot.transpile(compiled_query, read=read_dialect, write="duckdb")[0]
+ duckdb_query = sqlglot.transpile(compiled_query, read=read_dialect, write="duckdb", pretty=True)[0]
+ return "-- translated to duckdb\n" + duckdb_query

  def _compile_sql_model(self, kwargs: dict[str, Any]) -> mq.SqlModelQuery:
  compiled_query_str = self._get_compiled_sql_query_str(self.query_file.raw_query, kwargs)
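Note: queries for dbviews with translate_to_duckdb enabled are transpiled to the duckdb dialect with sqlglot, now with pretty-printing and a leading comment. For context, sqlglot.transpile takes a read dialect and a write dialect and returns a list of converted statements; a small illustrative example (the input query and dialect are made up):

    import sqlglot

    query = "SELECT TOP 3 name FROM vdl.my_source ORDER BY name"
    duckdb_query = sqlglot.transpile(query, read="tsql", write="duckdb", pretty=True)[0]
    # The T-SQL "TOP 3" becomes a DuckDB "LIMIT 3", formatted over multiple lines
    print(duckdb_query)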
@@ -487,15 +539,20 @@ class DbviewModel(QueryModel):
  connection_props = self.conn_set.get_connection(connection_name)

  if self.model_config.translate_to_duckdb and isinstance(connection_props, ConnectionProperties):
- macros = {
- "source": lambda source_name: "venv." + source_name
+ # Forbid translate_to_duckdb when dbview connection is duckdb
+ if connection_props.type == ConnectionTypeEnum.DUCKDB:
+ raise u.ConfigurationError(
+ f'Dbview "{self.name}" has translate_to_duckdb=True but its connection is duckdb. Use a federate model instead.'
+ )
+ macros = {
+ "source": lambda source_name: "vdl." + source_name
  }
  compiled_query2 = self._get_compiled_sql_query_str(compiled_query_str, macros)
  compiled_query_str = self._get_duckdb_query(connection_props.dialect, compiled_query2)
  is_duckdb = True
  else:
- macros = {
- "source": lambda source_name: self.sources[source_name].get_table()
+ macros = {
+ "source": lambda source_name: self.sources[source_name].get_table()
  }
  compiled_query_str = self._get_compiled_sql_query_str(compiled_query_str, macros)
  is_duckdb = False
@@ -516,7 +573,14 @@ class DbviewModel(QueryModel):
  kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
  self.compiled_query = self._compile_sql_model(kwargs)

- self.logger.log_activity_time(f"compiling dbview model '{self.name}'", start)
+ self.logger.log_activity_time(
+ f"compiling dbview model '{self.name}'", start,
+ additional_data={
+ "activity": "compiling data model",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  async def _run_sql_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
  assert self.compiled_query is not None
@@ -532,7 +596,7 @@ class DbviewModel(QueryModel):
  self.logger.info(f"Running dbview '{self.name}' on duckdb")
  return local_conn.sql(query, params=placeholders).pl()
  except duckdb.CatalogException as e:
- raise InvalidInputError(409, f'Dependent data model not found.', f'Model "{self.name}" depends on static data models that cannot be found. Trying building the virtual data environment first.')
+ raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
  except Exception as e:
  raise RuntimeError(e)
  finally:
@@ -552,7 +616,14 @@ class DbviewModel(QueryModel):

  await self._run_sql_model(conn, placeholders)

- self.logger.log_activity_time(f"running dbview model '{self.name}'", start)
+ self.logger.log_activity_time(
+ f"running dbview model '{self.name}'", start,
+ additional_data={
+ "activity": "running data model",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  await super().run_model(conn, placeholders)

@@ -574,8 +645,16 @@ class FederateModel(QueryModel):

  def ref(dependent_model_name: str) -> str:
  dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
- prefix = "venv." if isinstance(models_dict[dependent_model], (SourceModel, BuildModel)) else ""
- return prefix + dependent_model
+ dep = models_dict[dependent_model]
+ if isinstance(dep, BuildModel):
+ return "vdl." + dependent_model
+ if isinstance(dep, SourceModel):
+ if dep.model_config.load_to_vdl:
+ return "vdl." + dependent_model
+ conn_name = dep.model_config.get_connection()
+ table_name = dep.model_config.get_table()
+ return f"db_{conn_name}.{table_name}"
+ return dependent_model

  kwargs["ref"] = ref
  return kwargs
@@ -629,7 +708,14 @@ class FederateModel(QueryModel):
  else:
  raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

- self.logger.log_activity_time(f"compiling federate model '{self.name}'", start)
+ self.logger.log_activity_time(
+ f"compiling federate model '{self.name}'", start,
+ additional_data={
+ "activity": "compiling data model",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  if not recurse:
  return
@@ -658,10 +744,13 @@ class FederateModel(QueryModel):
  try:
  return local_conn.execute(create_query, existing_placeholders)
  except duckdb.CatalogException as e:
- raise InvalidInputError(409, f'Dependent data model not found.', f'Model "{self.name}" depends on static data models that cannot be found. Trying building the virtual data environment first.')
+ if self.name == "__fake_target":
+ raise InvalidInputError(409, "invalid_sql_query", f"Provided SQL query depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.")
+ else:
+ raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
  except Exception as e:
  if self.name == "__fake_target":
- raise InvalidInputError(400, "Invalid SQL query", f"Failed to run provided SQL query")
+ raise InvalidInputError(400, "invalid_sql_query", f"Failed to run provided SQL query")
  else:
  raise FileExecutionError(f'Failed to run federate sql model "{self.name}"', e) from e

@@ -688,7 +777,14 @@ class FederateModel(QueryModel):
  else:
  raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

- self.logger.log_activity_time(f"running federate model '{self.name}'", start)
+ self.logger.log_activity_time(
+ f"running federate model '{self.name}'", start,
+ additional_data={
+ "activity": "running data model",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  await super().run_model(conn, placeholders)

@@ -718,7 +814,12 @@ class BuildModel(StaticModel, QueryModel):
  }

  def ref_for_build(dependent_model_name: str) -> str:
- dependent_model = self._ref_for_sql(dependent_model_name, dict(models_dict))
+ dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
+ dep = models_dict[dependent_model]
+ if isinstance(dep, SourceModel) and not dep.model_config.load_to_vdl:
+ conn_name = dep.model_config.get_connection()
+ table_name = dep.model_config.get_table()
+ return f"db_{conn_name}.{table_name}"
  return dependent_model

  kwargs["ref"] = ref_for_build
@@ -771,7 +872,14 @@ class BuildModel(StaticModel, QueryModel):
  else:
  raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

- self.logger.log_activity_time(f"compiling build model '{self.name}'", start)
+ self.logger.log_activity_time(
+ f"compiling build model '{self.name}'", start,
+ additional_data={
+ "activity": "compiling data model",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  dependencies = self.model_config.depends_on
  self.wait_count_for_build = len(dependencies)
@@ -786,14 +894,17 @@ class BuildModel(StaticModel, QueryModel):
  def create_table():
  create_query = self.model_config.get_sql_for_build(self.name, query)
  local_conn = conn.cursor()
+ # local_conn = conn
  try:
  return u.run_duckdb_stmt(self.logger, local_conn, create_query, model_name=self.name)
  except Exception as e:
  raise FileExecutionError(f'Failed to build static sql model "{self.name}"', e) from e
  finally:
  local_conn.close()
+ # pass

- await asyncio.to_thread(create_table)
+ # await asyncio.to_thread(create_table)
+ create_table() # without threading

  async def _build_python_model(self, compiled_query: mq.PyModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
  query_result = await asyncio.to_thread(compiled_query.query)
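Note: several build paths in this file now call the blocking build function directly instead of awaiting asyncio.to_thread ("without threading"), while others, like the line above, still offload work to a thread. For reference, asyncio.to_thread runs a blocking callable in a worker thread so the event loop stays responsive; a minimal sketch with a stand-in blocking function:

    import asyncio
    import time

    def blocking_build() -> str:
        time.sleep(0.1)  # stand-in for a blocking DuckDB call
        return "done"

    async def main() -> None:
        # Offloads the blocking call to a thread; a direct call here would
        # block the event loop for the duration of the build instead.
        result = await asyncio.to_thread(blocking_build)
        print(result)

    asyncio.run(main())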
@@ -801,7 +912,8 @@ class BuildModel(StaticModel, QueryModel):
  query_result = pl.from_pandas(query_result).lazy()
  if self.needs_python_df_for_build:
  self.result = query_result.lazy()
- await asyncio.to_thread(self._create_table_from_df, conn, query_result)
+ # await asyncio.to_thread(self._create_table_from_df, conn, query_result)
+ self._create_table_from_df(conn, query_result) # without threading

  async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
  start = time.time()
@@ -814,24 +926,33 @@ class BuildModel(StaticModel, QueryModel):
  def load_df(conn: duckdb.DuckDBPyConnection, dep_model: DataModel):
  if dep_model.result is None:
  local_conn = conn.cursor()
+ # local_conn = conn
  try:
  dep_model.result = dep_model._load_duckdb_view_to_python_df(local_conn)
  finally:
  local_conn.close()
+ # pass

  coroutines = []
  for dep_model in self.upstreams_for_build.values():
  coro = asyncio.to_thread(load_df, conn, dep_model)
  coroutines.append(coro)
  await u.asyncio_gather(coroutines)
-
+
  # Then run the model's Python function to build the model
  await self._build_python_model(self.compiled_query, conn)
  else:
  raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

  print(f"[{u.get_current_time()}] ✅ FINISHED: build model '{self.name}'")
- self.logger.log_activity_time(f"building static build model '{self.name}'", start)
+ self.logger.log_activity_time(
+ f"building static build model '{self.name}' into VDL", start,
+ additional_data={
+ "activity": "building data model into VDL",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  await super().build_model(conn, full_refresh)

@@ -841,7 +962,7 @@ class DAG:
  dataset: DatasetConfig | None
  target_model: DataModel
  models_dict: dict[str, DataModel]
- duckdb_filepath: str = field(default="")
+ datalake_db_path: str | None = field(default=None)
  logger: u.Logger = field(default_factory=lambda: u.Logger(""))
  parameter_set: ParameterSet | None = field(default=None, init=False) # set in apply_selections
  placeholders: dict[str, Any] = field(init=False, default_factory=dict)
@@ -850,36 +971,50 @@ class DAG:
  return f" for dataset '{self.dataset.name}'" if self.dataset else ""

  def compile_build_models(self, conn_args: ConnectionsArgs) -> None:
- static_models: dict[str, StaticModel] = {k: v for k, v in self.models_dict.items() if isinstance(v, StaticModel)}
+ static_models: dict[str, StaticModel] = {
+ k: v for k, v in self.models_dict.items() if isinstance(v, StaticModel)
+ }
  for model in static_models.values():
  if isinstance(model, BuildModel):
  model.compile_for_build(conn_args, static_models)

  def apply_selections(
- self, param_cfg_set: ParameterConfigsSet, user: BaseUser | None, selections: dict[str, str]
+ self, param_cfg_set: ParameterConfigsSet, user: AbstractUser, selections: dict[str, str]
  ) -> None:
  start = time.time()
+
  dataset_params = self.dataset.parameters if self.dataset else None
  parameter_set = param_cfg_set.apply_selections(dataset_params, selections, user)
  self.parameter_set = parameter_set
  msg_extension = self._get_msg_extension()
- self.logger.log_activity_time("applying selections" + msg_extension, start)
+
+ dataset_name = self.dataset.name if self.dataset else None
+ self.logger.log_activity_time(
+ "applying selections" + msg_extension, start,
+ additional_data={"activity": "applying selections", "dataset_name": dataset_name}
+ )

  def _compile_context(
- self, param_args: ParametersArgs, context_func: ContextFunc, user: BaseUser | None, default_traits: dict[str, Any]
+ self, param_args: ParametersArgs, context_func: ContextFunc, user: AbstractUser, configurables: dict[str, str]
  ) -> tuple[dict[str, Any], ContextArgs]:
  start = time.time()
+
  context = {}
  assert isinstance(self.parameter_set, ParameterSet)
  prms = self.parameter_set.get_parameters_as_dict()
- traits = self.dataset.traits if self.dataset else default_traits
- args = ContextArgs(param_args, user, prms, traits)
+ args = ContextArgs(param_args, user, prms, configurables)
  msg_extension = self._get_msg_extension()
+
  try:
  context_func(context, args)
  except Exception as e:
  raise FileExecutionError(f'Failed to run {c.CONTEXT_FILE}' + msg_extension, e) from e
- self.logger.log_activity_time("running context.py" + msg_extension, start)
+
+ dataset_name = self.dataset.name if self.dataset else None
+ self.logger.log_activity_time(
+ "running context.py" + msg_extension, start,
+ additional_data={"activity": "running context.py", "dataset_name": dataset_name}
+ )
  return context, args

  def _compile_models(self, context: dict[str, Any], ctx_args: ContextArgs, recurse: bool) -> None:
@@ -890,29 +1025,25 @@ class DAG:
  terminal_nodes = self.target_model.get_terminal_nodes(set())
  for model in self.models_dict.values():
  model.confirmed_no_cycles = False
- self.logger.log_activity_time(f"validating no cycles in model dependencies", start)
+ self.logger.log_activity_time("validating no cycles in model dependencies", start)
  return terminal_nodes

+ def _attach_connections_with_type_duckdb(self, conn: duckdb.DuckDBPyConnection) -> None:
+ for conn_name, connection in self.target_model.conn_set.get_connections_as_dict().items():
+ if not isinstance(connection, ConnectionProperties):
+ continue
+ attach_uri = connection.attach_uri_for_duckdb
+ if attach_uri is None:
+ continue
+ attach_stmt = f"ATTACH IF NOT EXISTS '{attach_uri}' AS db_{conn_name} (READ_ONLY)"
+ u.run_duckdb_stmt(self.logger, conn, attach_stmt, redacted_values=[attach_uri])
+
  async def _run_models(self) -> None:
  terminal_nodes = self._get_terminal_nodes()

- # create an empty duckdb venv file if it does not exist
+ conn = u.create_duckdb_connection(datalake_db_path=self.datalake_db_path)
  try:
- conn = duckdb.connect(self.duckdb_filepath)
- conn.close()
- except duckdb.IOException as e:
- # unable to create duckdb venv file means it's in use and already exists
- # do not throw error here since attaching in read-only mode later may still work
- pass
-
- conn = u.create_duckdb_connection()
- try:
- read_only = "(READ_ONLY)" if self.duckdb_filepath else ""
- try:
- conn.execute(f"ATTACH '{self.duckdb_filepath}' AS venv {read_only}")
- except duckdb.IOException as e:
- self.logger.warning(f"Unable to attach to duckdb venv file: {self.duckdb_filepath}")
- raise e
+ self._attach_connections_with_type_duckdb(conn)

  coroutines = []
  for model_name in terminal_nodes:
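Note: instead of attaching a single venv database file, the DAG now attaches each connection that exposes a DuckDB attach URI as a read-only database aliased db_<connection>. A minimal sketch of the ATTACH statement this generates (the file path and alias are illustrative):

    import duckdb

    # Create an example database file so the read-only attach has something to open
    duckdb.connect("warehouse.duckdb").close()

    con = duckdb.connect()  # in-memory connection standing in for the data lake
    con.execute("ATTACH IF NOT EXISTS 'warehouse.duckdb' AS db_warehouse (READ_ONLY)")
    print(con.sql("SELECT database_name FROM duckdb_databases()"))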
@@ -924,14 +1055,14 @@ class DAG:
  conn.close()

  async def execute(
- self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user: BaseUser | None, selections: dict[str, str],
- *, runquery: bool = True, recurse: bool = True, default_traits: dict[str, Any] = {}
+ self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user: AbstractUser, selections: dict[str, str],
+ *, runquery: bool = True, recurse: bool = True, configurables: dict[str, str] = {}
  ) -> None:
  recurse = (recurse or runquery)

  self.apply_selections(param_cfg_set, user, selections)

- context, ctx_args = self._compile_context(param_args, context_func, user, default_traits)
+ context, ctx_args = self._compile_context(param_args, context_func, user, configurables)

  self._compile_models(context, ctx_args, recurse)

@@ -946,21 +1077,6 @@ class DAG:
  self.target_model.retrieve_dependent_query_models(all_model_names)
  return all_model_names

- def to_networkx_graph(self) -> nx.DiGraph:
- G = nx.DiGraph()
-
- for model_name, model in self.models_dict.items():
- level = model.get_max_path_length_to_target()
- if level is not None:
- G.add_node(model_name, layer=-level, model_type=model.model_type)
-
- for model_name in G.nodes:
- model = self.models_dict[model_name]
- for dep_model_name in model.downstreams:
- G.add_edge(model_name, dep_model_name)
-
- return G
-
  def get_all_data_models(self) -> list[rm.DataModelItem]:
  data_models = []
  for model_name, model in self.models_dict.items():