squirrels-0.5.0rc0-py3-none-any.whl → squirrels-0.5.1-py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of squirrels might be problematic.
Files changed (108)
  1. dateutils/__init__.py +6 -0
  2. dateutils/_enums.py +25 -0
  3. squirrels/dateutils.py → dateutils/_implementation.py +58 -111
  4. dateutils/types.py +6 -0
  5. squirrels/__init__.py +10 -12
  6. squirrels/_api_routes/__init__.py +5 -0
  7. squirrels/_api_routes/auth.py +271 -0
  8. squirrels/_api_routes/base.py +171 -0
  9. squirrels/_api_routes/dashboards.py +158 -0
  10. squirrels/_api_routes/data_management.py +148 -0
  11. squirrels/_api_routes/datasets.py +265 -0
  12. squirrels/_api_routes/oauth2.py +298 -0
  13. squirrels/_api_routes/project.py +252 -0
  14. squirrels/_api_server.py +245 -781
  15. squirrels/_arguments/__init__.py +0 -0
  16. squirrels/{arguments → _arguments}/init_time_args.py +7 -2
  17. squirrels/{arguments → _arguments}/run_time_args.py +13 -35
  18. squirrels/_auth.py +720 -212
  19. squirrels/_command_line.py +81 -41
  20. squirrels/_compile_prompts.py +147 -0
  21. squirrels/_connection_set.py +16 -7
  22. squirrels/_constants.py +29 -9
  23. squirrels/{_dashboards_io.py → _dashboards.py} +87 -6
  24. squirrels/_data_sources.py +570 -0
  25. squirrels/{dataset_result.py → _dataset_types.py} +2 -4
  26. squirrels/_exceptions.py +9 -37
  27. squirrels/_initializer.py +83 -59
  28. squirrels/_logging.py +117 -0
  29. squirrels/_manifest.py +129 -62
  30. squirrels/_model_builder.py +10 -52
  31. squirrels/_model_configs.py +3 -3
  32. squirrels/_model_queries.py +1 -1
  33. squirrels/_models.py +249 -118
  34. squirrels/{package_data → _package_data}/base_project/.env +16 -4
  35. squirrels/{package_data → _package_data}/base_project/.env.example +15 -3
  36. squirrels/{package_data → _package_data}/base_project/connections.yml +4 -3
  37. squirrels/{package_data → _package_data}/base_project/dashboards/dashboard_example.py +4 -4
  38. squirrels/_package_data/base_project/dashboards/dashboard_example.yml +22 -0
  39. squirrels/{package_data → _package_data}/base_project/duckdb_init.sql +1 -0
  40. squirrels/_package_data/base_project/macros/macros_example.sql +17 -0
  41. squirrels/{package_data → _package_data}/base_project/models/builds/build_example.py +2 -2
  42. squirrels/{package_data → _package_data}/base_project/models/builds/build_example.sql +1 -1
  43. squirrels/{package_data → _package_data}/base_project/models/builds/build_example.yml +2 -0
  44. squirrels/_package_data/base_project/models/dbviews/dbview_example.sql +17 -0
  45. squirrels/_package_data/base_project/models/dbviews/dbview_example.yml +32 -0
  46. squirrels/_package_data/base_project/models/federates/federate_example.py +48 -0
  47. squirrels/_package_data/base_project/models/federates/federate_example.sql +21 -0
  48. squirrels/{package_data → _package_data}/base_project/models/federates/federate_example.yml +7 -7
  49. squirrels/{package_data → _package_data}/base_project/models/sources.yml +5 -6
  50. squirrels/{package_data → _package_data}/base_project/parameters.yml +32 -45
  51. squirrels/_package_data/base_project/pyconfigs/connections.py +18 -0
  52. squirrels/{package_data → _package_data}/base_project/pyconfigs/context.py +31 -22
  53. squirrels/_package_data/base_project/pyconfigs/parameters.py +141 -0
  54. squirrels/_package_data/base_project/pyconfigs/user.py +44 -0
  55. squirrels/{package_data → _package_data}/base_project/seeds/seed_categories.yml +1 -1
  56. squirrels/{package_data → _package_data}/base_project/seeds/seed_subcategories.yml +1 -1
  57. squirrels/_package_data/base_project/squirrels.yml.j2 +61 -0
  58. squirrels/_package_data/templates/dataset_results.html +112 -0
  59. squirrels/_package_data/templates/oauth_login.html +271 -0
  60. squirrels/_package_data/templates/squirrels_studio.html +20 -0
  61. squirrels/_parameter_configs.py +76 -55
  62. squirrels/_parameter_options.py +348 -0
  63. squirrels/_parameter_sets.py +53 -45
  64. squirrels/_parameters.py +1664 -0
  65. squirrels/_project.py +403 -242
  66. squirrels/_py_module.py +3 -2
  67. squirrels/_request_context.py +33 -0
  68. squirrels/_schemas/__init__.py +0 -0
  69. squirrels/_schemas/auth_models.py +167 -0
  70. squirrels/_schemas/query_param_models.py +75 -0
  71. squirrels/{_api_response_models.py → _schemas/response_models.py} +48 -18
  72. squirrels/_seeds.py +1 -1
  73. squirrels/_sources.py +23 -19
  74. squirrels/_utils.py +121 -39
  75. squirrels/_version.py +1 -1
  76. squirrels/arguments.py +7 -0
  77. squirrels/auth.py +4 -0
  78. squirrels/connections.py +3 -0
  79. squirrels/dashboards.py +2 -81
  80. squirrels/data_sources.py +14 -563
  81. squirrels/parameter_options.py +13 -348
  82. squirrels/parameters.py +14 -1266
  83. squirrels/types.py +16 -0
  84. {squirrels-0.5.0rc0.dist-info → squirrels-0.5.1.dist-info}/METADATA +42 -30
  85. squirrels-0.5.1.dist-info/RECORD +98 -0
  86. squirrels/package_data/base_project/dashboards/dashboard_example.yml +0 -22
  87. squirrels/package_data/base_project/macros/macros_example.sql +0 -15
  88. squirrels/package_data/base_project/models/dbviews/dbview_example.sql +0 -12
  89. squirrels/package_data/base_project/models/dbviews/dbview_example.yml +0 -26
  90. squirrels/package_data/base_project/models/federates/federate_example.py +0 -44
  91. squirrels/package_data/base_project/models/federates/federate_example.sql +0 -17
  92. squirrels/package_data/base_project/pyconfigs/connections.py +0 -14
  93. squirrels/package_data/base_project/pyconfigs/parameters.py +0 -93
  94. squirrels/package_data/base_project/pyconfigs/user.py +0 -23
  95. squirrels/package_data/base_project/squirrels.yml.j2 +0 -71
  96. squirrels-0.5.0rc0.dist-info/RECORD +0 -70
  97. /squirrels/{package_data → _package_data}/base_project/assets/expenses.db +0 -0
  98. /squirrels/{package_data → _package_data}/base_project/assets/weather.db +0 -0
  99. /squirrels/{package_data → _package_data}/base_project/docker/.dockerignore +0 -0
  100. /squirrels/{package_data → _package_data}/base_project/docker/Dockerfile +0 -0
  101. /squirrels/{package_data → _package_data}/base_project/docker/compose.yml +0 -0
  102. /squirrels/{package_data/base_project/.gitignore → _package_data/base_project/gitignore} +0 -0
  103. /squirrels/{package_data → _package_data}/base_project/seeds/seed_categories.csv +0 -0
  104. /squirrels/{package_data → _package_data}/base_project/seeds/seed_subcategories.csv +0 -0
  105. /squirrels/{package_data → _package_data}/base_project/tmp/.gitignore +0 -0
  106. {squirrels-0.5.0rc0.dist-info → squirrels-0.5.1.dist-info}/WHEEL +0 -0
  107. {squirrels-0.5.0rc0.dist-info → squirrels-0.5.1.dist-info}/entry_points.txt +0 -0
  108. {squirrels-0.5.0rc0.dist-info → squirrels-0.5.1.dist-info}/licenses/LICENSE +0 -0
squirrels/_models.py CHANGED
@@ -5,25 +5,26 @@ from abc import ABCMeta, abstractmethod
  from enum import Enum
  from pathlib import Path
  import asyncio, os, re, time, duckdb, sqlglot
- import polars as pl, pandas as pd, networkx as nx
+ import polars as pl, pandas as pd

- from . import _constants as c, _utils as u, _py_module as pm, _model_queries as mq, _model_configs as mc, _sources as src, _api_response_models as arm
+ from . import _constants as c, _utils as u, _py_module as pm, _model_queries as mq, _model_configs as mc, _sources as src
+ from ._schemas import response_models as rm
  from ._exceptions import FileExecutionError, InvalidInputError
- from .arguments.run_time_args import ContextArgs, ModelArgs, BuildModelArgs
- from ._auth import BaseUser
+ from ._arguments.run_time_args import ContextArgs, ModelArgs, BuildModelArgs
+ from ._auth import AbstractUser
  from ._connection_set import ConnectionsArgs, ConnectionSet, ConnectionProperties
- from ._manifest import DatasetConfig
+ from ._manifest import DatasetConfig, ConnectionTypeEnum
  from ._parameter_sets import ParameterConfigsSet, ParametersArgs, ParameterSet

  ContextFunc = Callable[[dict[str, Any], ContextArgs], None]


  class ModelType(Enum):
+ SEED = "seed"
  SOURCE = "source"
+ BUILD = "build"
  DBVIEW = "dbview"
  FEDERATE = "federate"
- SEED = "seed"
- BUILD = "build"


  @dataclass
@@ -78,15 +79,15 @@ class DataModel(metaclass=ABCMeta):
  self.confirmed_no_cycles = True
  return terminal_nodes

- def _load_duckdb_view_to_python_df(self, conn: duckdb.DuckDBPyConnection, *, use_venv: bool = False) -> pl.LazyFrame:
- table_name = ("venv." if use_venv else "") + self.name
+ def _load_duckdb_view_to_python_df(self, conn: duckdb.DuckDBPyConnection, *, use_datalake: bool = False) -> pl.LazyFrame:
+ table_name = ("vdl." if use_datalake else "") + self.name
  try:
  return conn.sql(f"FROM {table_name}").pl().lazy()
  except duckdb.CatalogException as e:
  raise u.ConfigurationError(f'Failed to load duckdb table or view "{self.name}" to python dataframe') from e

  def _run_sql_query_on_connection(self, connection_name: str, query: str, placeholders: dict = {}) -> pl.DataFrame:
- self.logger.info(f"Running sql query on connection '{connection_name}': {query}")
+ self.logger.debug(f"Running SQL query on connection '{connection_name}':\n{query}")
  return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)

  async def _trigger(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
@@ -132,11 +133,13 @@ class DataModel(metaclass=ABCMeta):

  def _create_table_from_df(self, conn: duckdb.DuckDBPyConnection, query_result: pl.LazyFrame | pd.DataFrame):
  local_conn = conn.cursor()
+ # local_conn = conn
  try:
- local_conn.register("df", query_result)
- local_conn.execute(f"CREATE OR REPLACE TABLE {self.name} AS SELECT * FROM df")
+ assert query_result is not None
+ local_conn.execute(f"CREATE OR REPLACE TABLE {self.name} AS FROM query_result")
  finally:
  local_conn.close()
+ # pass

  def process_pass_through_columns(self, models_dict: dict[str, DataModel]) -> None:
  pass
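Note (illustrative, not part of the package diff): the rewritten _create_table_from_df relies on DuckDB's replacement scan, which lets a SQL statement reference an in-scope Python dataframe by its variable name instead of an explicit register() call. A minimal sketch of that pattern, with made-up table and variable names:

import duckdb
import polars as pl

query_result = pl.DataFrame({"id": [1, 2], "value": ["a", "b"]})  # stand-in for a model's result

conn = duckdb.connect()
# DuckDB resolves "query_result" via replacement scan of the surrounding Python scope
conn.execute("CREATE OR REPLACE TABLE example_model AS FROM query_result")
print(conn.sql("FROM example_model").pl())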
@@ -171,19 +174,26 @@ class StaticModel(DataModel):
  def _get_result(self, conn: duckdb.DuckDBPyConnection) -> pl.LazyFrame:
  local_conn = conn.cursor()
  try:
- return self._load_duckdb_view_to_python_df(local_conn, use_venv=True)
+ return self._load_duckdb_view_to_python_df(local_conn, use_datalake=True)
  except Exception as e:
- raise InvalidInputError(61, f'Model "{self.name}" depends on static data models that cannot be found.')
+ raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
  finally:
  local_conn.close()

  async def run_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
- start = time.time()
-
  if (self.needs_python_df or self.is_target) and self.result is None:
+ start = time.time()
+
  self.result = await asyncio.to_thread(self._get_result, conn)
-
- self.logger.log_activity_time(f"loading static model '{self.name}'", start)
+
+ self.logger.log_activity_time(
+ f"loading {self.model_type.value} model '{self.name}' into memory", start,
+ additional_data={
+ "activity": "loading static data model into memory",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  await super().run_model(conn, placeholders)

@@ -224,10 +234,18 @@ class Seed(StaticModel):
  start = time.time()

  print(f"[{u.get_current_time()}] 🔨 BUILDING: seed model '{self.name}'")
- await asyncio.to_thread(self._create_table_from_df, conn, self.result)
+ # await asyncio.to_thread(self._create_table_from_df, conn, self.result)
+ self._create_table_from_df(conn, self.result) # without threading

  print(f"[{u.get_current_time()}] ✅ FINISHED: seed model '{self.name}'")
- self.logger.log_activity_time(f"building seed model '{self.name}' to venv", start)
+ self.logger.log_activity_time(
+ f"building seed model '{self.name}' into VDL", start,
+ additional_data={
+ "activity": "building data model into VDL",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  await super().build_model(conn, full_refresh)

@@ -239,22 +257,34 @@ class SourceModel(StaticModel):
  @property
  def model_type(self) -> ModelType:
  return ModelType.SOURCE
+
+ @property
+ def connection_props(self) -> ConnectionProperties:
+ conn_name = self.model_config.get_connection()
+ conn_props = self.conn_set.get_connection(conn_name)
+ if isinstance(conn_props, ConnectionProperties):
+ return conn_props
+ raise u.ConfigurationError(f'Unable to use connection "{conn_name}" for source "{self.name}". Connection "{conn_name}" must be a ConnectionProperties object')

  @property
  def is_queryable(self) -> bool:
- return self.model_config.load_to_duckdb
+ connection_props = self.connection_props
+ return self.model_config.load_to_vdl or connection_props.type == ConnectionTypeEnum.DUCKDB

  def _build_source_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
  local_conn = conn.cursor()
+ # local_conn = conn
+
+ local_conn.begin()
  try:
  source = self.model_config
  conn_name = source.get_connection()

- connection_props = self.conn_set.get_connection(conn_name)
- if isinstance(connection_props, ConnectionProperties):
- dialect = connection_props.dialect
- else:
- raise u.ConfigurationError(f'Unable to use connection "{conn_name}" for source "{self.name}"')
+ connection_props = self.connection_props
+ dialect = connection_props.dialect
+ attach_uri = connection_props.attach_uri_for_duckdb
+ if attach_uri is None:
+ raise u.ConfigurationError(f'Loading to duckdb is not supported for source "{self.name}" since its connection "{conn_name}" uses an unsupported dialect')

  result = u.run_duckdb_stmt(self.logger, local_conn, f"FROM (SHOW DATABASES) WHERE database_name = 'db_{conn_name}'").fetchone()
  if result is None:
@@ -264,8 +294,9 @@ class SourceModel(StaticModel):
  new_table_name = self.name

  if len(source.columns) == 0:
- stmt = f"CREATE OR REPLACE TABLE {new_table_name} AS SELECT * FROM db_{conn_name}.{table_name}"
+ stmt = f"CREATE OR REPLACE TABLE {new_table_name} AS FROM db_{conn_name}.{table_name}"
  u.run_duckdb_stmt(self.logger, local_conn, stmt)
+ local_conn.commit()
  return

  increasing_column = source.update_hints.increasing_column
@@ -292,25 +323,44 @@ class SourceModel(StaticModel):
  if max_val_of_incr_col is None:
  recreate_table = True

- insert_cols_clause = source.get_cols_for_insert_stmt()
- insert_replace_clause = source.get_insert_replace_clause()
- query = source.get_query_for_insert(dialect, conn_name, table_name, max_val_of_incr_col, full_refresh=recreate_table)
- stmt = f"INSERT {insert_replace_clause} INTO {new_table_name} ({insert_cols_clause}) {query}"
+ query = source.get_query_for_upsert(dialect, conn_name, table_name, max_val_of_incr_col, full_refresh=recreate_table)
+
+ primary_keys = ", ".join(source.primary_key) if source.primary_key else ""
+ match_condition = f"USING ({primary_keys})" if primary_keys else "ON false"
+ stmt = (
+ f"MERGE INTO {new_table_name} "
+ f"USING ({query}) AS src "
+ f"{match_condition} "
+ f"WHEN MATCHED THEN UPDATE "
+ f"WHEN NOT MATCHED THEN INSERT BY NAME"
+ )
  u.run_duckdb_stmt(self.logger, local_conn, stmt)
+
+ local_conn.commit()
+
  finally:
  local_conn.close()
+ # pass

  async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
- if self.model_config.load_to_duckdb:
+ if self.model_config.load_to_vdl:
  start = time.time()
  print(f"[{u.get_current_time()}] 🔨 BUILDING: source model '{self.name}'")

- await asyncio.to_thread(self._build_source_model, conn, full_refresh)
+ # await asyncio.to_thread(self._build_source_model, conn, full_refresh)
+ self._build_source_model(conn, full_refresh) # without threading

  print(f"[{u.get_current_time()}] ✅ FINISHED: source model '{self.name}'")
- self.logger.log_activity_time(f"building source model '{self.name}' to venv", start)
+ self.logger.log_activity_time(
+ f"building source model '{self.name}' into VDL", start,
+ additional_data={
+ "activity": "building data model into VDL",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

- await super().build_model(conn, full_refresh)
+ await super().build_model(conn, full_refresh)


  @dataclass
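Note (illustrative, not part of the package diff): the statement assembled above replaces the old INSERT with an upsert via MERGE INTO, matching on the source's declared primary key (or "ON false" when there is none). A hedged sketch of the same shape, assuming a DuckDB version with MERGE INTO support (1.4+) and a hypothetical table keyed on id:

import duckdb

conn = duckdb.connect()
conn.execute("CREATE TABLE my_source (id INTEGER, name VARCHAR)")
conn.execute("INSERT INTO my_source VALUES (1, 'old'), (2, 'keep')")
conn.execute("CREATE TABLE batch AS SELECT * FROM (VALUES (1, 'new'), (3, 'added')) t(id, name)")  # incoming rows

stmt = (
    "MERGE INTO my_source "
    "USING (SELECT * FROM batch) AS src "
    "USING (id) "                           # match condition built from the primary key
    "WHEN MATCHED THEN UPDATE "
    "WHEN NOT MATCHED THEN INSERT BY NAME"
)
conn.execute(stmt)
print(conn.sql("FROM my_source ORDER BY id").fetchall())  # expected: [(1, 'new'), (2, 'keep'), (3, 'added')]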
@@ -333,10 +383,16 @@ class QueryModel(DataModel):
  raise u.ConfigurationError(f'Model "{self.name}" references unknown model "{dependent_model_name}"')

  dep_model = models_dict[dependent_model_name]
- if isinstance(dep_model, SourceModel) and not dep_model.model_config.load_to_duckdb:
- raise u.ConfigurationError(
- f'Model "{self.name}" cannot reference source model "{dependent_model_name}" which has load_to_duckdb=False'
- )
+ if isinstance(dep_model, SourceModel) and not dep_model.model_config.load_to_vdl:
+ # Allow when caller is Build or Federate AND the source connection is duckdb; else error
+ conn_name = dep_model.model_config.get_connection()
+ conn_props = self.conn_set.get_connection(conn_name)
+ is_duckdb_conn = isinstance(conn_props, ConnectionProperties) and conn_props.type == ConnectionTypeEnum.DUCKDB
+ if not is_duckdb_conn:
+ raise u.ConfigurationError(
+ f'Model "{self.name}" cannot reference source model "{dependent_model_name}". '
+ 'To be referenced by a build or federate model, the source must have load_to_vdl=True or a duckdb connection type.'
+ )

  self.model_config.depends_on.add(dependent_model_name)
  return dependent_model_name
@@ -351,10 +407,10 @@ class QueryModel(DataModel):
  def _get_compile_sql_model_args_from_ctx_args(
  self, ctx: dict[str, Any], ctx_args: ContextArgs
  ) -> dict[str, Any]:
- is_placeholder = lambda placeholder: placeholder in ctx_args.placeholders
+ is_placeholder = lambda placeholder: placeholder in ctx_args._placeholders_copy
  kwargs = {
  "proj_vars": ctx_args.proj_vars, "env_vars": ctx_args.env_vars, "user": ctx_args.user, "prms": ctx_args.prms,
- "traits": ctx_args.traits, "ctx": ctx, "is_placeholder": is_placeholder, "set_placeholder": ctx_args.set_placeholder,
+ "configurables": ctx_args.configurables, "ctx": ctx, "is_placeholder": is_placeholder, "set_placeholder": ctx_args.set_placeholder,
  "param_exists": ctx_args.param_exists
  }
  return kwargs
@@ -424,6 +480,11 @@ class QueryModel(DataModel):
  dependent_model_names.add(self.name)
  for dep_model in self.upstreams.values():
  dep_model.retrieve_dependent_query_models(dependent_model_names)
+
+ def _log_sql_to_run(self, sql: str, placeholders: dict[str, Any]) -> None:
+ log_msg = f"SQL to run for model '{self.name}':\n{sql}"
+ log_msg += f"\n\n(with placeholders: {placeholders})"
+ self.logger.debug(log_msg)


  @dataclass
@@ -448,11 +509,11 @@ class DbviewModel(QueryModel):
  if source_model.model_config.get_connection() != self.model_config.get_connection():
  raise u.ConfigurationError(f'Dbview "{self.name}" references source "{source_name}" with different connection')

- # Check if the source model has load_to_duckdb=False but this dbview has translate_to_duckdb=True
- if not source_model.model_config.load_to_duckdb and self.model_config.translate_to_duckdb:
+ # Check if the source model has load_to_vdl=False but this dbview has translate_to_duckdb=True
+ if not source_model.model_config.load_to_vdl and self.model_config.translate_to_duckdb:
  raise u.ConfigurationError(
  f'Dbview "{self.name}" with translate_to_duckdb=True cannot reference source "{source_name}" '
- f'which has load_to_duckdb=False'
+ f'which has load_to_vdl=False'
  )

  self.model_config.depends_on.add(source_name)
@@ -460,14 +521,16 @@ class DbviewModel(QueryModel):
  return "{{ source(\"" + source_name + "\") }}"

  kwargs["source"] = source
+ kwargs["ref"] = source
  return kwargs

  def _get_duckdb_query(self, read_dialect: str, query: str) -> str:
  kwargs = {
- "source": lambda source_name: "venv." + source_name
+ "source": lambda source_name: "vdl." + source_name
  }
  compiled_query = self._get_compiled_sql_query_str(query, kwargs)
- return sqlglot.transpile(compiled_query, read=read_dialect, write="duckdb")[0]
+ duckdb_query = sqlglot.transpile(compiled_query, read=read_dialect, write="duckdb", pretty=True)[0]
+ return "-- translated to duckdb\n" + duckdb_query

  def _compile_sql_model(self, kwargs: dict[str, Any]) -> mq.SqlModelQuery:
  compiled_query_str = self._get_compiled_sql_query_str(self.query_file.raw_query, kwargs)
@@ -476,8 +539,13 @@ class DbviewModel(QueryModel):
  connection_props = self.conn_set.get_connection(connection_name)

  if self.model_config.translate_to_duckdb and isinstance(connection_props, ConnectionProperties):
+ # Forbid translate_to_duckdb when dbview connection is duckdb
+ if connection_props.type == ConnectionTypeEnum.DUCKDB:
+ raise u.ConfigurationError(
+ f'Dbview "{self.name}" has translate_to_duckdb=True but its connection is duckdb. Use a federate model instead.'
+ )
  macros = {
- "source": lambda source_name: "venv." + source_name
+ "source": lambda source_name: "vdl." + source_name
  }
  compiled_query2 = self._get_compiled_sql_query_str(compiled_query_str, macros)
  compiled_query_str = self._get_duckdb_query(connection_props.dialect, compiled_query2)
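Note (illustrative, not part of the package diff): _get_duckdb_query now transpiles the compiled dbview query to the duckdb dialect with pretty-printing. A small sketch of the sqlglot call, assuming a dbview written for a postgres connection and a hypothetical vdl.orders table:

import sqlglot

postgres_sql = "SELECT date_trunc('month', order_date) AS month, sum(amount)::numeric(10, 2) AS total FROM vdl.orders GROUP BY 1"
duckdb_sql = sqlglot.transpile(postgres_sql, read="postgres", write="duckdb", pretty=True)[0]
print("-- translated to duckdb\n" + duckdb_sql)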
@@ -505,7 +573,14 @@ class DbviewModel(QueryModel):
  kwargs = self._get_compile_sql_model_args(ctx, ctx_args, models_dict)
  self.compiled_query = self._compile_sql_model(kwargs)

- self.logger.log_activity_time(f"compiling dbview model '{self.name}'", start)
+ self.logger.log_activity_time(
+ f"compiling dbview model '{self.name}'", start,
+ additional_data={
+ "activity": "compiling data model",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  async def _run_sql_model(self, conn: duckdb.DuckDBPyConnection, placeholders: dict = {}) -> None:
  assert self.compiled_query is not None
@@ -518,19 +593,21 @@ class DbviewModel(QueryModel):
  if is_duckdb:
  local_conn = conn.cursor()
  try:
- self.logger.info(f"Running duckdb query: {query}")
+ self.logger.info(f"Running dbview '{self.name}' on duckdb")
  return local_conn.sql(query, params=placeholders).pl()
  except duckdb.CatalogException as e:
- raise InvalidInputError(61, f'Model "{self.name}" depends on static data models that cannot be found.')
+ raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
  except Exception as e:
  raise RuntimeError(e)
  finally:
  local_conn.close()
  else:
- return self._run_sql_query_on_connection(connection_name, query, placeholders)
+ self.logger.info(f"Running dbview '{self.name}' on connection: {connection_name}")
+ return self.conn_set.run_sql_query_from_conn_name(query, connection_name, placeholders)
  except RuntimeError as e:
  raise FileExecutionError(f'Failed to run dbview sql model "{self.name}"', e)

+ self._log_sql_to_run(query, placeholders)
  result = await asyncio.to_thread(run_sql_query_on_connection, is_duckdb, query, placeholders)
  self.result = result.lazy()

@@ -539,7 +616,14 @@ class DbviewModel(QueryModel):

  await self._run_sql_model(conn, placeholders)

- self.logger.log_activity_time(f"running dbview model '{self.name}'", start)
+ self.logger.log_activity_time(
+ f"running dbview model '{self.name}'", start,
+ additional_data={
+ "activity": "running data model",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  await super().run_model(conn, placeholders)

@@ -561,8 +645,16 @@ class FederateModel(QueryModel):

  def ref(dependent_model_name: str) -> str:
  dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
- prefix = "venv." if isinstance(models_dict[dependent_model], (SourceModel, BuildModel)) else ""
- return prefix + dependent_model
+ dep = models_dict[dependent_model]
+ if isinstance(dep, BuildModel):
+ return "vdl." + dependent_model
+ if isinstance(dep, SourceModel):
+ if dep.model_config.load_to_vdl:
+ return "vdl." + dependent_model
+ conn_name = dep.model_config.get_connection()
+ table_name = dep.model_config.get_table()
+ return f"db_{conn_name}.{table_name}"
+ return dependent_model

  kwargs["ref"] = ref
  return kwargs
@@ -580,7 +672,7 @@ class FederateModel(QueryModel):
  connections = self.conn_set.get_connections_as_dict()

  def run_external_sql(connection_name: str, sql_query: str) -> pl.DataFrame:
- return self._run_sql_query_on_connection(connection_name, sql_query, ctx_args.placeholders)
+ return self._run_sql_query_on_connection(connection_name, sql_query, ctx_args._placeholders_copy)

  conn_args = ConnectionsArgs(ctx_args.project_path, ctx_args.proj_vars, ctx_args.env_vars)
  build_model_args = BuildModelArgs(conn_args, connections, dependencies, self._ref_for_python, run_external_sql)
@@ -616,7 +708,14 @@ class FederateModel(QueryModel):
  else:
  raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

- self.logger.log_activity_time(f"compiling federate model '{self.name}'", start)
+ self.logger.log_activity_time(
+ f"compiling federate model '{self.name}'", start,
+ additional_data={
+ "activity": "compiling data model",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  if not recurse:
  return
@@ -636,17 +735,22 @@ class FederateModel(QueryModel):
  query = compiled_query.query

  def create_table(local_conn: duckdb.DuckDBPyConnection):
- placeholer_exists = lambda key: re.search(r"\$" + key + r"(?!\w)", query)
- existing_placeholders = {key: value for key, value in placeholders.items() if placeholer_exists(key)}
+ # DuckDB doesn't support specifying named parameters that are not used in the query, so filtering them out
+ placeholder_exists = lambda key: re.search(r"\$" + key + r"(?!\w)", query)
+ existing_placeholders = {key: value for key, value in placeholders.items() if placeholder_exists(key)}

  create_query = self.model_config.get_sql_for_create(self.name, query)
+ self._log_sql_to_run(create_query, existing_placeholders)
  try:
  return local_conn.execute(create_query, existing_placeholders)
  except duckdb.CatalogException as e:
- raise InvalidInputError(61, f'Model "{self.name}" depends on static data models that cannot be found.')
+ if self.name == "__fake_target":
+ raise InvalidInputError(409, "invalid_sql_query", f"Provided SQL query depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.")
+ else:
+ raise InvalidInputError(409, f'dependent_data_model_not_found', f'Model "{self.name}" depends on static data models that cannot be found. Try building the Virtual Data Lake (VDL) first.')
  except Exception as e:
  if self.name == "__fake_target":
- raise InvalidInputError(204, f"Failed to run provided SQL query")
+ raise InvalidInputError(400, "invalid_sql_query", f"Failed to run provided SQL query")
  else:
  raise FileExecutionError(f'Failed to run federate sql model "{self.name}"', e) from e

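Note (illustrative, not part of the package diff): the filtering added in create_table exists because DuckDB rejects named parameters that the query never references. A minimal sketch with made-up placeholder names:

import re
import duckdb

query = "SELECT * FROM range(10) t(n) WHERE n >= $min_n"
placeholders = {"min_n": 5, "start_date": "2024-01-01"}  # "start_date" is not referenced by this query

# Keep only placeholders that actually appear as $name in the query
placeholder_exists = lambda key: re.search(r"\$" + key + r"(?!\w)", query)
existing_placeholders = {k: v for k, v in placeholders.items() if placeholder_exists(k)}

conn = duckdb.connect()
print(conn.execute(query, existing_placeholders).fetchall())  # [(5,), (6,), (7,), (8,), (9,)]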
@@ -673,7 +777,14 @@ class FederateModel(QueryModel):
  else:
  raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

- self.logger.log_activity_time(f"running federate model '{self.name}'", start)
+ self.logger.log_activity_time(
+ f"running federate model '{self.name}'", start,
+ additional_data={
+ "activity": "running data model",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  await super().run_model(conn, placeholders)

@@ -703,7 +814,12 @@ class BuildModel(StaticModel, QueryModel):
  }

  def ref_for_build(dependent_model_name: str) -> str:
- dependent_model = self._ref_for_sql(dependent_model_name, dict(models_dict))
+ dependent_model = self._ref_for_sql(dependent_model_name, models_dict)
+ dep = models_dict[dependent_model]
+ if isinstance(dep, SourceModel) and not dep.model_config.load_to_vdl:
+ conn_name = dep.model_config.get_connection()
+ table_name = dep.model_config.get_table()
+ return f"db_{conn_name}.{table_name}"
  return dependent_model

  kwargs["ref"] = ref_for_build
@@ -756,7 +872,14 @@ class BuildModel(StaticModel, QueryModel):
  else:
  raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

- self.logger.log_activity_time(f"compiling build model '{self.name}'", start)
+ self.logger.log_activity_time(
+ f"compiling build model '{self.name}'", start,
+ additional_data={
+ "activity": "compiling data model",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  dependencies = self.model_config.depends_on
  self.wait_count_for_build = len(dependencies)
@@ -771,14 +894,17 @@ class BuildModel(StaticModel, QueryModel):
  def create_table():
  create_query = self.model_config.get_sql_for_build(self.name, query)
  local_conn = conn.cursor()
+ # local_conn = conn
  try:
- return u.run_duckdb_stmt(self.logger, local_conn, create_query)
+ return u.run_duckdb_stmt(self.logger, local_conn, create_query, model_name=self.name)
  except Exception as e:
  raise FileExecutionError(f'Failed to build static sql model "{self.name}"', e) from e
  finally:
  local_conn.close()
+ # pass

- await asyncio.to_thread(create_table)
+ # await asyncio.to_thread(create_table)
+ create_table() # without threading

  async def _build_python_model(self, compiled_query: mq.PyModelQuery, conn: duckdb.DuckDBPyConnection) -> None:
  query_result = await asyncio.to_thread(compiled_query.query)
@@ -786,7 +912,8 @@ class BuildModel(StaticModel, QueryModel):
  query_result = pl.from_pandas(query_result).lazy()
  if self.needs_python_df_for_build:
  self.result = query_result.lazy()
- await asyncio.to_thread(self._create_table_from_df, conn, query_result)
+ # await asyncio.to_thread(self._create_table_from_df, conn, query_result)
+ self._create_table_from_df(conn, query_result) # without threading

  async def build_model(self, conn: duckdb.DuckDBPyConnection, full_refresh: bool) -> None:
  start = time.time()
@@ -799,24 +926,33 @@ class BuildModel(StaticModel, QueryModel):
  def load_df(conn: duckdb.DuckDBPyConnection, dep_model: DataModel):
  if dep_model.result is None:
  local_conn = conn.cursor()
+ # local_conn = conn
  try:
  dep_model.result = dep_model._load_duckdb_view_to_python_df(local_conn)
  finally:
  local_conn.close()
+ # pass

  coroutines = []
  for dep_model in self.upstreams_for_build.values():
  coro = asyncio.to_thread(load_df, conn, dep_model)
  coroutines.append(coro)
  await u.asyncio_gather(coroutines)
-
+
  # Then run the model's Python function to build the model
  await self._build_python_model(self.compiled_query, conn)
  else:
  raise NotImplementedError(f"Query type not supported: {self.query_file.__class__.__name__}")

  print(f"[{u.get_current_time()}] ✅ FINISHED: build model '{self.name}'")
- self.logger.log_activity_time(f"building static build model '{self.name}'", start)
+ self.logger.log_activity_time(
+ f"building static build model '{self.name}' into VDL", start,
+ additional_data={
+ "activity": "building data model into VDL",
+ "model_name": self.name,
+ "model_type": self.model_type.value
+ }
+ )

  await super().build_model(conn, full_refresh)

@@ -826,7 +962,7 @@ class DAG:
  dataset: DatasetConfig | None
  target_model: DataModel
  models_dict: dict[str, DataModel]
- duckdb_filepath: str = field(default="")
+ datalake_db_path: str | None = field(default=None)
  logger: u.Logger = field(default_factory=lambda: u.Logger(""))
  parameter_set: ParameterSet | None = field(default=None, init=False) # set in apply_selections
  placeholders: dict[str, Any] = field(init=False, default_factory=dict)
@@ -835,36 +971,50 @@ class DAG:
  return f" for dataset '{self.dataset.name}'" if self.dataset else ""

  def compile_build_models(self, conn_args: ConnectionsArgs) -> None:
- static_models: dict[str, StaticModel] = {k: v for k, v in self.models_dict.items() if isinstance(v, StaticModel)}
+ static_models: dict[str, StaticModel] = {
+ k: v for k, v in self.models_dict.items() if isinstance(v, StaticModel)
+ }
  for model in static_models.values():
  if isinstance(model, BuildModel):
  model.compile_for_build(conn_args, static_models)

  def apply_selections(
- self, param_cfg_set: ParameterConfigsSet, user: BaseUser | None, selections: dict[str, str]
+ self, param_cfg_set: ParameterConfigsSet, user: AbstractUser, selections: dict[str, str]
  ) -> None:
  start = time.time()
+
  dataset_params = self.dataset.parameters if self.dataset else None
  parameter_set = param_cfg_set.apply_selections(dataset_params, selections, user)
  self.parameter_set = parameter_set
  msg_extension = self._get_msg_extension()
- self.logger.log_activity_time("applying selections" + msg_extension, start)
+
+ dataset_name = self.dataset.name if self.dataset else None
+ self.logger.log_activity_time(
+ "applying selections" + msg_extension, start,
+ additional_data={"activity": "applying selections", "dataset_name": dataset_name}
+ )

  def _compile_context(
- self, param_args: ParametersArgs, context_func: ContextFunc, user: BaseUser | None, default_traits: dict[str, Any]
+ self, param_args: ParametersArgs, context_func: ContextFunc, user: AbstractUser, configurables: dict[str, str]
  ) -> tuple[dict[str, Any], ContextArgs]:
  start = time.time()
+
  context = {}
  assert isinstance(self.parameter_set, ParameterSet)
  prms = self.parameter_set.get_parameters_as_dict()
- traits = self.dataset.traits if self.dataset else default_traits
- args = ContextArgs(param_args, user, prms, traits)
+ args = ContextArgs(param_args, user, prms, configurables)
  msg_extension = self._get_msg_extension()
+
  try:
  context_func(context, args)
  except Exception as e:
  raise FileExecutionError(f'Failed to run {c.CONTEXT_FILE}' + msg_extension, e) from e
- self.logger.log_activity_time("running context.py" + msg_extension, start)
+
+ dataset_name = self.dataset.name if self.dataset else None
+ self.logger.log_activity_time(
+ "running context.py" + msg_extension, start,
+ additional_data={"activity": "running context.py", "dataset_name": dataset_name}
+ )
  return context, args

  def _compile_models(self, context: dict[str, Any], ctx_args: ContextArgs, recurse: bool) -> None:
@@ -875,29 +1025,25 @@ class DAG:
  terminal_nodes = self.target_model.get_terminal_nodes(set())
  for model in self.models_dict.values():
  model.confirmed_no_cycles = False
- self.logger.log_activity_time(f"validating no cycles in model dependencies", start)
+ self.logger.log_activity_time("validating no cycles in model dependencies", start)
  return terminal_nodes

+ def _attach_connections_with_type_duckdb(self, conn: duckdb.DuckDBPyConnection) -> None:
+ for conn_name, connection in self.target_model.conn_set.get_connections_as_dict().items():
+ if not isinstance(connection, ConnectionProperties):
+ continue
+ attach_uri = connection.attach_uri_for_duckdb
+ if attach_uri is None:
+ continue
+ attach_stmt = f"ATTACH IF NOT EXISTS '{attach_uri}' AS db_{conn_name} (READ_ONLY)"
+ u.run_duckdb_stmt(self.logger, conn, attach_stmt, redacted_values=[attach_uri])
+
  async def _run_models(self) -> None:
  terminal_nodes = self._get_terminal_nodes()

- # create an empty duckdb venv file if it does not exist
- try:
- conn = duckdb.connect(self.duckdb_filepath)
- conn.close()
- except duckdb.IOException as e:
- # unable to create duckdb venv file means it's in use and already exists
- # do not throw error here since attaching in read-only mode later may still work
- pass
-
- conn = u.create_duckdb_connection()
+ conn = u.create_duckdb_connection(datalake_db_path=self.datalake_db_path)
  try:
- read_only = "(READ_ONLY)" if self.duckdb_filepath else ""
- try:
- conn.execute(f"ATTACH '{self.duckdb_filepath}' AS venv {read_only}")
- except duckdb.IOException as e:
- self.logger.warn(f"Unable to attach to duckdb venv file: {self.duckdb_filepath}")
- raise e
+ self._attach_connections_with_type_duckdb(conn)

  coroutines = []
  for model_name in terminal_nodes:
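Note (illustrative, not part of the package diff): _attach_connections_with_type_duckdb replaces the old single venv ATTACH with one read-only ATTACH per duckdb-typed connection, exposed under a db_<connection_name> alias. A rough sketch of the resulting DuckDB usage, with a hypothetical expenses connection and transactions table:

import duckdb

# Stand-in for an external duckdb database configured as a connection
setup = duckdb.connect("expenses.db")
setup.execute("CREATE TABLE IF NOT EXISTS transactions (id INTEGER, amount DOUBLE)")
setup.close()

conn = duckdb.connect()  # in the package this comes from u.create_duckdb_connection(...)
conn.execute("ATTACH IF NOT EXISTS 'expenses.db' AS db_expenses (READ_ONLY)")
print(conn.sql("FROM db_expenses.transactions").pl())  # tables are addressed as db_<connection>.<table>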
@@ -909,18 +1055,18 @@ class DAG:
  conn.close()

  async def execute(
- self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user: BaseUser | None, selections: dict[str, str],
- *, runquery: bool = True, recurse: bool = True, default_traits: dict[str, Any] = {}
+ self, param_args: ParametersArgs, param_cfg_set: ParameterConfigsSet, context_func: ContextFunc, user: AbstractUser, selections: dict[str, str],
+ *, runquery: bool = True, recurse: bool = True, configurables: dict[str, str] = {}
  ) -> None:
  recurse = (recurse or runquery)

  self.apply_selections(param_cfg_set, user, selections)

- context, ctx_args = self._compile_context(param_args, context_func, user, default_traits)
+ context, ctx_args = self._compile_context(param_args, context_func, user, configurables)

  self._compile_models(context, ctx_args, recurse)

- self.placeholders = ctx_args.placeholders
+ self.placeholders = ctx_args._placeholders_copy
  if runquery:
  await self._run_models()

@@ -931,39 +1077,24 @@ class DAG:
  self.target_model.retrieve_dependent_query_models(all_model_names)
  return all_model_names

- def to_networkx_graph(self) -> nx.DiGraph:
- G = nx.DiGraph()
-
- for model_name, model in self.models_dict.items():
- level = model.get_max_path_length_to_target()
- if level is not None:
- G.add_node(model_name, layer=-level, model_type=model.model_type)
-
- for model_name in G.nodes:
- model = self.models_dict[model_name]
- for dep_model_name in model.downstreams:
- G.add_edge(model_name, dep_model_name)
-
- return G
-
- def get_all_data_models(self) -> list[arm.DataModelItem]:
+ def get_all_data_models(self) -> list[rm.DataModelItem]:
  data_models = []
  for model_name, model in self.models_dict.items():
  is_queryable = model.is_queryable
- data_model = arm.DataModelItem(name=model_name, model_type=model.model_type.value, config=model.model_config, is_queryable=is_queryable)
+ data_model = rm.DataModelItem(name=model_name, model_type=model.model_type.value, config=model.model_config, is_queryable=is_queryable)
  data_models.append(data_model)
  return data_models

- def get_all_model_lineage(self) -> list[arm.LineageRelation]:
+ def get_all_model_lineage(self) -> list[rm.LineageRelation]:
  model_lineage = []
  for model_name, model in self.models_dict.items():
  if not isinstance(model, QueryModel):
  continue
  for dep_model_name in model.model_config.depends_on:
  edge_type = "buildtime" if isinstance(model, BuildModel) else "runtime"
- source_model = arm.LineageNode(name=dep_model_name, type="model")
- target_model = arm.LineageNode(name=model_name, type="model")
- model_lineage.append(arm.LineageRelation(type=edge_type, source=source_model, target=target_model))
+ source_model = rm.LineageNode(name=dep_model_name, type="model")
+ target_model = rm.LineageNode(name=model_name, type="model")
+ model_lineage.append(rm.LineageRelation(type=edge_type, source=source_model, target=target_model))
  return model_lineage
