squirrels 0.5.0rc0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of squirrels might be problematic. Click here for more details.
- dateutils/__init__.py +6 -0
- dateutils/_enums.py +25 -0
- squirrels/dateutils.py → dateutils/_implementation.py +58 -111
- dateutils/types.py +6 -0
- squirrels/__init__.py +10 -12
- squirrels/_api_routes/__init__.py +5 -0
- squirrels/_api_routes/auth.py +271 -0
- squirrels/_api_routes/base.py +171 -0
- squirrels/_api_routes/dashboards.py +158 -0
- squirrels/_api_routes/data_management.py +148 -0
- squirrels/_api_routes/datasets.py +265 -0
- squirrels/_api_routes/oauth2.py +298 -0
- squirrels/_api_routes/project.py +252 -0
- squirrels/_api_server.py +245 -781
- squirrels/_arguments/__init__.py +0 -0
- squirrels/{arguments → _arguments}/init_time_args.py +7 -2
- squirrels/{arguments → _arguments}/run_time_args.py +13 -35
- squirrels/_auth.py +720 -212
- squirrels/_command_line.py +81 -41
- squirrels/_compile_prompts.py +147 -0
- squirrels/_connection_set.py +16 -7
- squirrels/_constants.py +29 -9
- squirrels/{_dashboards_io.py → _dashboards.py} +87 -6
- squirrels/_data_sources.py +570 -0
- squirrels/{dataset_result.py → _dataset_types.py} +2 -4
- squirrels/_exceptions.py +9 -37
- squirrels/_initializer.py +83 -59
- squirrels/_logging.py +117 -0
- squirrels/_manifest.py +129 -62
- squirrels/_model_builder.py +10 -52
- squirrels/_model_configs.py +3 -3
- squirrels/_model_queries.py +1 -1
- squirrels/_models.py +249 -118
- squirrels/{package_data → _package_data}/base_project/.env +16 -4
- squirrels/{package_data → _package_data}/base_project/.env.example +15 -3
- squirrels/{package_data → _package_data}/base_project/connections.yml +4 -3
- squirrels/{package_data → _package_data}/base_project/dashboards/dashboard_example.py +4 -4
- squirrels/_package_data/base_project/dashboards/dashboard_example.yml +22 -0
- squirrels/{package_data → _package_data}/base_project/duckdb_init.sql +1 -0
- squirrels/_package_data/base_project/macros/macros_example.sql +17 -0
- squirrels/{package_data → _package_data}/base_project/models/builds/build_example.py +2 -2
- squirrels/{package_data → _package_data}/base_project/models/builds/build_example.sql +1 -1
- squirrels/{package_data → _package_data}/base_project/models/builds/build_example.yml +2 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.sql +17 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.yml +32 -0
- squirrels/_package_data/base_project/models/federates/federate_example.py +48 -0
- squirrels/_package_data/base_project/models/federates/federate_example.sql +21 -0
- squirrels/{package_data → _package_data}/base_project/models/federates/federate_example.yml +7 -7
- squirrels/{package_data → _package_data}/base_project/models/sources.yml +5 -6
- squirrels/{package_data → _package_data}/base_project/parameters.yml +32 -45
- squirrels/_package_data/base_project/pyconfigs/connections.py +18 -0
- squirrels/{package_data → _package_data}/base_project/pyconfigs/context.py +31 -22
- squirrels/_package_data/base_project/pyconfigs/parameters.py +141 -0
- squirrels/_package_data/base_project/pyconfigs/user.py +44 -0
- squirrels/{package_data → _package_data}/base_project/seeds/seed_categories.yml +1 -1
- squirrels/{package_data → _package_data}/base_project/seeds/seed_subcategories.yml +1 -1
- squirrels/_package_data/base_project/squirrels.yml.j2 +61 -0
- squirrels/_package_data/templates/dataset_results.html +112 -0
- squirrels/_package_data/templates/oauth_login.html +271 -0
- squirrels/_package_data/templates/squirrels_studio.html +20 -0
- squirrels/_parameter_configs.py +76 -55
- squirrels/_parameter_options.py +348 -0
- squirrels/_parameter_sets.py +53 -45
- squirrels/_parameters.py +1664 -0
- squirrels/_project.py +403 -242
- squirrels/_py_module.py +3 -2
- squirrels/_request_context.py +33 -0
- squirrels/_schemas/__init__.py +0 -0
- squirrels/_schemas/auth_models.py +167 -0
- squirrels/_schemas/query_param_models.py +75 -0
- squirrels/{_api_response_models.py → _schemas/response_models.py} +48 -18
- squirrels/_seeds.py +1 -1
- squirrels/_sources.py +23 -19
- squirrels/_utils.py +121 -39
- squirrels/_version.py +1 -1
- squirrels/arguments.py +7 -0
- squirrels/auth.py +4 -0
- squirrels/connections.py +3 -0
- squirrels/dashboards.py +2 -81
- squirrels/data_sources.py +14 -563
- squirrels/parameter_options.py +13 -348
- squirrels/parameters.py +14 -1266
- squirrels/types.py +16 -0
- {squirrels-0.5.0rc0.dist-info → squirrels-0.5.1.dist-info}/METADATA +42 -30
- squirrels-0.5.1.dist-info/RECORD +98 -0
- squirrels/package_data/base_project/dashboards/dashboard_example.yml +0 -22
- squirrels/package_data/base_project/macros/macros_example.sql +0 -15
- squirrels/package_data/base_project/models/dbviews/dbview_example.sql +0 -12
- squirrels/package_data/base_project/models/dbviews/dbview_example.yml +0 -26
- squirrels/package_data/base_project/models/federates/federate_example.py +0 -44
- squirrels/package_data/base_project/models/federates/federate_example.sql +0 -17
- squirrels/package_data/base_project/pyconfigs/connections.py +0 -14
- squirrels/package_data/base_project/pyconfigs/parameters.py +0 -93
- squirrels/package_data/base_project/pyconfigs/user.py +0 -23
- squirrels/package_data/base_project/squirrels.yml.j2 +0 -71
- squirrels-0.5.0rc0.dist-info/RECORD +0 -70
- /squirrels/{package_data → _package_data}/base_project/assets/expenses.db +0 -0
- /squirrels/{package_data → _package_data}/base_project/assets/weather.db +0 -0
- /squirrels/{package_data → _package_data}/base_project/docker/.dockerignore +0 -0
- /squirrels/{package_data → _package_data}/base_project/docker/Dockerfile +0 -0
- /squirrels/{package_data → _package_data}/base_project/docker/compose.yml +0 -0
- /squirrels/{package_data/base_project/.gitignore → _package_data/base_project/gitignore} +0 -0
- /squirrels/{package_data → _package_data}/base_project/seeds/seed_categories.csv +0 -0
- /squirrels/{package_data → _package_data}/base_project/seeds/seed_subcategories.csv +0 -0
- /squirrels/{package_data → _package_data}/base_project/tmp/.gitignore +0 -0
- {squirrels-0.5.0rc0.dist-info → squirrels-0.5.1.dist-info}/WHEEL +0 -0
- {squirrels-0.5.0rc0.dist-info → squirrels-0.5.1.dist-info}/entry_points.txt +0 -0
- {squirrels-0.5.0rc0.dist-info → squirrels-0.5.1.dist-info}/licenses/LICENSE +0 -0
squirrels/_project.py
CHANGED
|
@@ -1,92 +1,113 @@
|
|
|
1
|
-
from dotenv import dotenv_values
|
|
2
|
-
from
|
|
1
|
+
from dotenv import dotenv_values, load_dotenv
|
|
2
|
+
from pathlib import Path
|
|
3
3
|
import asyncio, typing as t, functools as ft, shutil, json, os
|
|
4
|
-
import
|
|
5
|
-
import sqlglot, sqlglot.expressions
|
|
4
|
+
import sqlglot, sqlglot.expressions, duckdb, polars as pl
|
|
6
5
|
|
|
7
|
-
from ._auth import Authenticator,
|
|
6
|
+
from ._auth import Authenticator, AuthProviderArgs, ProviderFunctionType
|
|
7
|
+
from ._schemas.auth_models import CustomUserFields, AbstractUser, GuestUser, RegisteredUser
|
|
8
|
+
from ._schemas import response_models as rm
|
|
8
9
|
from ._model_builder import ModelBuilder
|
|
9
10
|
from ._exceptions import InvalidInputError, ConfigurationError
|
|
10
|
-
from . import
|
|
11
|
+
from ._py_module import PyModule
|
|
12
|
+
from . import _dashboards as d, _utils as u, _constants as c, _manifest as mf, _connection_set as cs
|
|
11
13
|
from . import _seeds as s, _models as m, _model_configs as mc, _model_queries as mq, _sources as so
|
|
12
|
-
from . import _parameter_sets as ps,
|
|
14
|
+
from . import _parameter_sets as ps, _dataset_types as dr, _logging as l
|
|
13
15
|
|
|
14
|
-
T = t.TypeVar("T", bound=
|
|
16
|
+
T = t.TypeVar("T", bound=d.Dashboard)
|
|
15
17
|
M = t.TypeVar("M", bound=m.DataModel)
|
|
16
18
|
|
|
17
19
|
|
|
18
|
-
class _CustomJsonFormatter(l.Formatter):
|
|
19
|
-
def format(self, record: l.LogRecord) -> str:
|
|
20
|
-
super().format(record)
|
|
21
|
-
info = {
|
|
22
|
-
"timestamp": self.formatTime(record),
|
|
23
|
-
"project_id": record.name,
|
|
24
|
-
"level": record.levelname,
|
|
25
|
-
"message": record.getMessage(),
|
|
26
|
-
"thread": record.thread,
|
|
27
|
-
"thread_name": record.threadName,
|
|
28
|
-
"process": record.process,
|
|
29
|
-
**record.__dict__.get("info", {})
|
|
30
|
-
}
|
|
31
|
-
output = {
|
|
32
|
-
"data": record.__dict__.get("data", {}),
|
|
33
|
-
"info": info
|
|
34
|
-
}
|
|
35
|
-
return json.dumps(output)
|
|
36
|
-
|
|
37
|
-
|
|
38
20
|
class SquirrelsProject:
|
|
39
21
|
"""
|
|
40
22
|
Initiate an instance of this class to interact with a Squirrels project through Python code. For example this can be handy to experiment with the datasets produced by Squirrels in a Jupyter notebook.
|
|
41
23
|
"""
|
|
42
24
|
|
|
43
|
-
def __init__(
|
|
25
|
+
def __init__(
|
|
26
|
+
self, *, filepath: str = ".", load_dotenv_globally: bool = False,
|
|
27
|
+
log_to_file: bool = False, log_level: str | None = None, log_format: str | None = None,
|
|
28
|
+
) -> None:
|
|
44
29
|
"""
|
|
45
30
|
Constructor for SquirrelsProject class. Loads the file contents of the Squirrels project into memory as member fields.
|
|
46
31
|
|
|
47
32
|
Arguments:
|
|
48
33
|
filepath: The path to the Squirrels project file. Defaults to the current working directory.
|
|
49
|
-
log_level: The logging level to use. Options are "DEBUG", "INFO", and "WARNING". Default is "INFO".
|
|
50
|
-
|
|
51
|
-
log_format: The format of the log records. Options are "text" and "json". Default is "text".
|
|
34
|
+
log_level: The logging level to use. Options are "DEBUG", "INFO", and "WARNING". Default is from SQRL_LOGGING__LOG_LEVEL environment variable or "INFO".
|
|
35
|
+
log_to_file: Whether to enable logging to file(s) in the "logs/" folder with rotation and retention policies. Default is False.
|
|
36
|
+
log_format: The format of the log records. Options are "text" and "json". Default is from SQRL_LOGGING__LOG_FORMAT environment variable or "text".
|
|
52
37
|
"""
|
|
53
38
|
self._filepath = filepath
|
|
54
|
-
self.
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
39
|
+
self._load_dotenv_globally = load_dotenv_globally
|
|
40
|
+
self._logger = self._get_logger(filepath, log_to_file, log_level, log_format)
|
|
41
|
+
self._ensure_virtual_datalake_exists(filepath)
|
|
42
|
+
|
|
43
|
+
def _get_logger(self, filepath: str, log_to_file: bool, log_level: str | None, log_format: str | None) -> u.Logger:
|
|
44
|
+
env_vars = self._env_vars
|
|
45
|
+
# CLI arguments take precedence over environment variables
|
|
46
|
+
log_level = log_level if log_level is not None else env_vars.get(c.SQRL_LOGGING_LOG_LEVEL, "INFO")
|
|
47
|
+
log_format = log_format if log_format is not None else env_vars.get(c.SQRL_LOGGING_LOG_FORMAT, "text")
|
|
48
|
+
log_to_file = log_to_file or u.to_bool(env_vars.get(c.SQRL_LOGGING_LOG_TO_FILE, "false"))
|
|
49
|
+
log_file_size_mb = int(env_vars.get(c.SQRL_LOGGING_LOG_FILE_SIZE_MB, 50))
|
|
50
|
+
log_file_backup_count = int(env_vars.get(c.SQRL_LOGGING_LOG_FILE_BACKUP_COUNT, 1))
|
|
51
|
+
return l.get_logger(filepath, log_to_file, log_level, log_format, log_file_size_mb, log_file_backup_count)
|
|
52
|
+
|
|
53
|
+
def _ensure_virtual_datalake_exists(self, project_path: str) -> None:
|
|
54
|
+
target_path = u.Path(project_path, c.TARGET_FOLDER)
|
|
55
|
+
target_path.mkdir(parents=True, exist_ok=True)
|
|
56
|
+
|
|
57
|
+
# Attempt to set up the virtual data lake with DATA_PATH if possible
|
|
58
|
+
try:
|
|
59
|
+
is_ducklake = self._datalake_db_path.startswith("ducklake:")
|
|
71
60
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
61
|
+
data_path = self._env_vars.get(c.SQRL_VDL_DATA_PATH, c.DEFAULT_VDL_DATA_PATH)
|
|
62
|
+
data_path = data_path.format(project_path=project_path)
|
|
63
|
+
|
|
64
|
+
options = f"(DATA_PATH '{data_path}')" if is_ducklake else ""
|
|
65
|
+
attach_stmt = f"ATTACH '{self._datalake_db_path}' AS vdl {options}"
|
|
66
|
+
with duckdb.connect() as conn:
|
|
67
|
+
conn.execute(attach_stmt)
|
|
68
|
+
# TODO: support incremental loads for build models and avoid cleaning up old files all the time
|
|
69
|
+
conn.execute("CALL ducklake_expire_snapshots('vdl', older_than => now())")
|
|
70
|
+
conn.execute("CALL ducklake_cleanup_old_files('vdl', cleanup_all => true)")
|
|
79
71
|
|
|
80
|
-
|
|
72
|
+
except Exception as e:
|
|
73
|
+
if "DATA_PATH parameter" in str(e):
|
|
74
|
+
first_line = str(e).split("\n")[0]
|
|
75
|
+
note = "NOTE: Squirrels does not allow changing the data path for an existing Virtual Data Lake (VDL)"
|
|
76
|
+
raise u.ConfigurationError(f"{first_line}\n\n{note}")
|
|
77
|
+
|
|
78
|
+
if is_ducklake and not any(x in self._datalake_db_path for x in [":sqlite:", ":postgres:", ":mysql:"]):
|
|
79
|
+
extended_error = "\n Note: if you're using DuckDB for the metadata database, only one process can connect to the VDL at a time."
|
|
80
|
+
else:
|
|
81
|
+
extended_error = ""
|
|
82
|
+
|
|
83
|
+
raise u.ConfigurationError(f"Failed to attach Virtual Data Lake (VDL).{extended_error}") from e
|
|
81
84
|
|
|
82
85
|
@ft.cached_property
|
|
83
86
|
def _env_vars(self) -> dict[str, str]:
|
|
84
87
|
dotenv_files = [c.DOTENV_FILE, c.DOTENV_LOCAL_FILE]
|
|
85
88
|
dotenv_vars = {}
|
|
86
89
|
for file in dotenv_files:
|
|
87
|
-
|
|
90
|
+
full_path = u.Path(self._filepath, file)
|
|
91
|
+
if self._load_dotenv_globally:
|
|
92
|
+
load_dotenv(full_path)
|
|
93
|
+
dotenv_vars.update({k: v for k, v in dotenv_values(full_path).items() if v is not None})
|
|
88
94
|
return {**os.environ, **dotenv_vars}
|
|
89
95
|
|
|
96
|
+
@ft.cached_property
|
|
97
|
+
def _elevated_access_level(self) -> u.ACCESS_LEVEL:
|
|
98
|
+
elevated_access_level = self._env_vars.get(c.SQRL_PERMISSIONS_ELEVATED_ACCESS_LEVEL, "admin").lower()
|
|
99
|
+
|
|
100
|
+
if elevated_access_level not in ["admin", "member", "guest"]:
|
|
101
|
+
raise u.ConfigurationError(f"{c.SQRL_PERMISSIONS_ELEVATED_ACCESS_LEVEL} has been set to an invalid access level: {elevated_access_level}")
|
|
102
|
+
|
|
103
|
+
return elevated_access_level
|
|
104
|
+
|
|
105
|
+
@ft.cached_property
|
|
106
|
+
def _datalake_db_path(self) -> str:
|
|
107
|
+
datalake_db_path = self._env_vars.get(c.SQRL_VDL_CATALOG_DB_PATH, c.DEFAULT_VDL_CATALOG_DB_PATH)
|
|
108
|
+
datalake_db_path = datalake_db_path.format(project_path=self._filepath)
|
|
109
|
+
return datalake_db_path
|
|
110
|
+
|
|
90
111
|
@ft.cached_property
|
|
91
112
|
def _manifest_cfg(self) -> mf.ManifestConfig:
|
|
92
113
|
return mf.ManifestIO.load_from_file(self._logger, self._filepath, self._env_vars)
|
|
@@ -127,29 +148,77 @@ class SquirrelsProject:
|
|
|
127
148
|
def _conn_set(self) -> cs.ConnectionSet:
|
|
128
149
|
return cs.ConnectionSetIO.load_from_file(self._logger, self._filepath, self._manifest_cfg, self._conn_args)
|
|
129
150
|
|
|
151
|
+
@ft.cached_property
|
|
152
|
+
def _custom_user_fields_cls_and_provider_functions(self) -> tuple[type[CustomUserFields], list[ProviderFunctionType]]:
|
|
153
|
+
user_module_path = u.Path(self._filepath, c.PYCONFIGS_FOLDER, c.USER_FILE)
|
|
154
|
+
user_module = PyModule(user_module_path)
|
|
155
|
+
|
|
156
|
+
# Load CustomUserFields class (adds to Authenticator.providers as side effect)
|
|
157
|
+
CustomUserFieldsCls = user_module.get_func_or_class("CustomUserFields", default_attr=CustomUserFields)
|
|
158
|
+
provider_functions = Authenticator.providers
|
|
159
|
+
Authenticator.providers = []
|
|
160
|
+
|
|
161
|
+
if not issubclass(CustomUserFieldsCls, CustomUserFields):
|
|
162
|
+
raise ConfigurationError(f"CustomUserFields class in '{c.USER_FILE}' must inherit from CustomUserFields")
|
|
163
|
+
|
|
164
|
+
return CustomUserFieldsCls, provider_functions
|
|
165
|
+
|
|
166
|
+
@ft.cached_property
|
|
167
|
+
def _auth_args(self) -> AuthProviderArgs:
|
|
168
|
+
conn_args = self._conn_args
|
|
169
|
+
return AuthProviderArgs(conn_args.project_path, conn_args.proj_vars, conn_args.env_vars)
|
|
170
|
+
|
|
130
171
|
@ft.cached_property
|
|
131
172
|
def _auth(self) -> Authenticator:
|
|
132
|
-
|
|
173
|
+
CustomUserFieldsCls, provider_functions = self._custom_user_fields_cls_and_provider_functions
|
|
174
|
+
external_only = (self._manifest_cfg.authentication.type == mf.AuthenticationType.EXTERNAL)
|
|
175
|
+
return Authenticator(self._logger, self._filepath, self._auth_args, provider_functions, custom_user_fields_cls=CustomUserFieldsCls, external_only=external_only)
|
|
176
|
+
|
|
177
|
+
@ft.cached_property
|
|
178
|
+
def _guest_user(self) -> AbstractUser:
|
|
179
|
+
custom_fields = self._auth.CustomUserFields()
|
|
180
|
+
return GuestUser(username="", custom_fields=custom_fields)
|
|
181
|
+
|
|
182
|
+
@ft.cached_property
|
|
183
|
+
def _admin_user(self) -> AbstractUser:
|
|
184
|
+
custom_fields = self._auth.CustomUserFields()
|
|
185
|
+
return RegisteredUser(username="", access_level="admin", custom_fields=custom_fields)
|
|
133
186
|
|
|
134
187
|
@ft.cached_property
|
|
135
188
|
def _param_args(self) -> ps.ParametersArgs:
|
|
136
|
-
|
|
189
|
+
conn_args = self._conn_args
|
|
190
|
+
return ps.ParametersArgs(conn_args.project_path, conn_args.proj_vars, conn_args.env_vars)
|
|
137
191
|
|
|
138
192
|
@ft.cached_property
|
|
139
193
|
def _param_cfg_set(self) -> ps.ParameterConfigsSet:
|
|
140
194
|
return ps.ParameterConfigsSetIO.load_from_file(
|
|
141
|
-
self._logger, self._filepath, self._manifest_cfg, self._seeds, self._conn_set, self._param_args
|
|
195
|
+
self._logger, self._filepath, self._manifest_cfg, self._seeds, self._conn_set, self._param_args, self._datalake_db_path
|
|
142
196
|
)
|
|
143
197
|
|
|
144
198
|
@ft.cached_property
|
|
145
199
|
def _j2_env(self) -> u.EnvironmentWithMacros:
|
|
146
|
-
|
|
200
|
+
env = u.EnvironmentWithMacros(self._logger, loader=u.j2.FileSystemLoader(self._filepath))
|
|
201
|
+
|
|
202
|
+
def value_to_str(value: t.Any, attribute: str | None = None) -> str:
|
|
203
|
+
if attribute is None:
|
|
204
|
+
return str(value)
|
|
205
|
+
else:
|
|
206
|
+
return str(getattr(value, attribute))
|
|
207
|
+
|
|
208
|
+
def join(value: list[t.Any], d: str = ", ", attribute: str | None = None) -> str:
|
|
209
|
+
return d.join(map(lambda x: value_to_str(x, attribute), value))
|
|
210
|
+
|
|
211
|
+
def quote(value: t.Any, q: str = "'", attribute: str | None = None) -> str:
|
|
212
|
+
return q + value_to_str(value, attribute) + q
|
|
213
|
+
|
|
214
|
+
def quote_and_join(value: list[t.Any], q: str = "'", d: str = ", ", attribute: str | None = None) -> str:
|
|
215
|
+
return d.join(map(lambda x: quote(x, q, attribute), value))
|
|
216
|
+
|
|
217
|
+
env.filters["join"] = join
|
|
218
|
+
env.filters["quote"] = quote
|
|
219
|
+
env.filters["quote_and_join"] = quote_and_join
|
|
220
|
+
return env
|
|
147
221
|
|
|
148
|
-
@ft.cached_property
|
|
149
|
-
def _duckdb_venv_path(self) -> str:
|
|
150
|
-
duckdb_filepath_setting_val = self._env_vars.get(c.SQRL_DUCKDB_VENV_DB_FILE_PATH, f"{c.TARGET_FOLDER}/{c.DUCKDB_VENV_FILE}")
|
|
151
|
-
return str(u.Path(self._filepath, duckdb_filepath_setting_val))
|
|
152
|
-
|
|
153
222
|
def close(self) -> None:
|
|
154
223
|
"""
|
|
155
224
|
Deliberately close any open resources within the Squirrels project, such as database connections (instead of relying on the garbage collector).
|
|
@@ -184,20 +253,20 @@ class SquirrelsProject:
|
|
|
184
253
|
return models_dict
|
|
185
254
|
|
|
186
255
|
|
|
187
|
-
async def build(self, *, full_refresh: bool = False, select: str | None = None
|
|
256
|
+
async def build(self, *, full_refresh: bool = False, select: str | None = None) -> None:
|
|
188
257
|
"""
|
|
189
|
-
Build the
|
|
258
|
+
Build the Virtual Data Lake (VDL) for the Squirrels project
|
|
190
259
|
|
|
191
260
|
Arguments:
|
|
192
|
-
full_refresh: Whether to drop all tables and rebuild the
|
|
193
|
-
|
|
261
|
+
full_refresh: Whether to drop all tables and rebuild the VDL from scratch. Default is False.
|
|
262
|
+
select: The name of a specific model to build. If None, all models are built. Default is None.
|
|
194
263
|
"""
|
|
195
264
|
models_dict: dict[str, m.StaticModel] = self._get_static_models()
|
|
196
|
-
builder = ModelBuilder(self.
|
|
197
|
-
await builder.build(full_refresh, select
|
|
265
|
+
builder = ModelBuilder(self._datalake_db_path, self._conn_set, models_dict, self._conn_args, self._logger)
|
|
266
|
+
await builder.build(full_refresh, select)
|
|
198
267
|
|
|
199
268
|
def _get_models_dict(self, always_python_df: bool) -> dict[str, m.DataModel]:
|
|
200
|
-
models_dict: dict[str, m.DataModel] =
|
|
269
|
+
models_dict: dict[str, m.DataModel] = self._get_static_models()
|
|
201
270
|
|
|
202
271
|
for name, val in self._dbview_model_files.items():
|
|
203
272
|
self._add_model(models_dict, m.DbviewModel(
|
|
@@ -213,19 +282,18 @@ class SquirrelsProject:
|
|
|
213
282
|
|
|
214
283
|
return models_dict
|
|
215
284
|
|
|
216
|
-
def _generate_dag(self, dataset: str
|
|
217
|
-
models_dict = self._get_models_dict(always_python_df)
|
|
285
|
+
def _generate_dag(self, dataset: str) -> m.DAG:
|
|
286
|
+
models_dict = self._get_models_dict(always_python_df=False)
|
|
218
287
|
|
|
219
288
|
dataset_config = self._manifest_cfg.datasets[dataset]
|
|
220
|
-
|
|
221
|
-
target_model = models_dict[target_model_name]
|
|
289
|
+
target_model = models_dict[dataset_config.model]
|
|
222
290
|
target_model.is_target = True
|
|
223
|
-
dag = m.DAG(dataset_config, target_model, models_dict, self.
|
|
291
|
+
dag = m.DAG(dataset_config, target_model, models_dict, self._datalake_db_path, self._logger)
|
|
224
292
|
|
|
225
293
|
return dag
|
|
226
294
|
|
|
227
|
-
def _generate_dag_with_fake_target(self, sql_query: str | None) -> m.DAG:
|
|
228
|
-
models_dict = self._get_models_dict(always_python_df=
|
|
295
|
+
def _generate_dag_with_fake_target(self, sql_query: str | None, *, always_python_df: bool = False) -> m.DAG:
|
|
296
|
+
models_dict = self._get_models_dict(always_python_df=always_python_df)
|
|
229
297
|
|
|
230
298
|
if sql_query is None:
|
|
231
299
|
dependencies = set(models_dict.keys())
|
|
@@ -235,227 +303,260 @@ class SquirrelsProject:
|
|
|
235
303
|
substitutions = {}
|
|
236
304
|
for model_name in dependencies:
|
|
237
305
|
model = models_dict[model_name]
|
|
238
|
-
if isinstance(model, m.SourceModel) and not model.
|
|
239
|
-
raise InvalidInputError(
|
|
240
|
-
if isinstance(model,
|
|
241
|
-
substitutions[model_name] = f"
|
|
306
|
+
if isinstance(model, m.SourceModel) and not model.is_queryable:
|
|
307
|
+
raise InvalidInputError(400, "cannot_query_source_model", f"Source model '{model_name}' cannot be queried with DuckDB")
|
|
308
|
+
if isinstance(model, m.BuildModel):
|
|
309
|
+
substitutions[model_name] = f"vdl.{model_name}"
|
|
310
|
+
elif isinstance(model, m.SourceModel):
|
|
311
|
+
if model.model_config.load_to_vdl:
|
|
312
|
+
substitutions[model_name] = f"vdl.{model_name}"
|
|
313
|
+
else:
|
|
314
|
+
# DuckDB connection without load_to_vdl - reference via attached database
|
|
315
|
+
conn_name = model.model_config.get_connection()
|
|
316
|
+
table_name = model.model_config.get_table()
|
|
317
|
+
substitutions[model_name] = f"db_{conn_name}.{table_name}"
|
|
242
318
|
|
|
243
319
|
sql_query = parsed.transform(
|
|
244
|
-
lambda node: sqlglot.expressions.Table(this=substitutions[node.name])
|
|
320
|
+
lambda node: sqlglot.expressions.Table(this=substitutions[node.name], alias=node.alias)
|
|
245
321
|
if isinstance(node, sqlglot.expressions.Table) and node.name in substitutions
|
|
246
322
|
else node
|
|
247
323
|
).sql()
|
|
248
324
|
|
|
249
325
|
model_config = mc.FederateModelConfig(depends_on=dependencies)
|
|
250
|
-
query_file = mq.SqlQueryFile("", sql_query or "")
|
|
326
|
+
query_file = mq.SqlQueryFile("", sql_query or "SELECT 1")
|
|
251
327
|
fake_target_model = m.FederateModel(
|
|
252
328
|
"__fake_target", model_config, query_file, logger=self._logger, env_vars=self._env_vars, conn_set=self._conn_set, j2_env=self._j2_env
|
|
253
329
|
)
|
|
254
330
|
fake_target_model.is_target = True
|
|
255
|
-
dag = m.DAG(None, fake_target_model, models_dict, self.
|
|
331
|
+
dag = m.DAG(None, fake_target_model, models_dict, self._datalake_db_path, self._logger)
|
|
256
332
|
return dag
|
|
257
333
|
|
|
258
|
-
def
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
G = dag.to_networkx_graph()
|
|
265
|
-
|
|
266
|
-
fig, _ = plt.subplots()
|
|
267
|
-
pos = nx.multipartite_layout(G, subset_key="layer")
|
|
268
|
-
colors = [color_map[node[1]] for node in G.nodes(data="model_type")] # type: ignore
|
|
269
|
-
nx.draw(G, pos=pos, node_shape='^', node_size=1000, node_color=colors, arrowsize=20)
|
|
270
|
-
|
|
271
|
-
y_values = [val[1] for val in pos.values()]
|
|
272
|
-
scale = max(y_values) - min(y_values) if len(y_values) > 0 else 0
|
|
273
|
-
label_pos = {key: (val[0], val[1]-0.002-0.1*scale) for key, val in pos.items()}
|
|
274
|
-
nx.draw_networkx_labels(G, pos=label_pos, font_size=8)
|
|
275
|
-
|
|
276
|
-
fig.tight_layout()
|
|
277
|
-
plt.margins(x=0.1, y=0.1)
|
|
278
|
-
fig.savefig(u.Path(output_folder, "dag.png"))
|
|
279
|
-
plt.close(fig)
|
|
280
|
-
|
|
281
|
-
async def _get_compiled_dag(self, *, sql_query: str | None = None, selections: dict[str, t.Any] = {}, user: BaseUser | None = None) -> m.DAG:
|
|
282
|
-
dag = self._generate_dag_with_fake_target(sql_query)
|
|
334
|
+
async def _get_compiled_dag(
|
|
335
|
+
self, user: AbstractUser, *, sql_query: str | None = None, selections: dict[str, t.Any] = {}, configurables: dict[str, str] = {},
|
|
336
|
+
always_python_df: bool = False
|
|
337
|
+
) -> m.DAG:
|
|
338
|
+
dag = self._generate_dag_with_fake_target(sql_query, always_python_df=always_python_df)
|
|
283
339
|
|
|
284
|
-
|
|
285
|
-
await dag.execute(
|
|
340
|
+
configurables = {**self._manifest_cfg.get_default_configurables(), **configurables}
|
|
341
|
+
await dag.execute(
|
|
342
|
+
self._param_args, self._param_cfg_set, self._context_func, user, selections,
|
|
343
|
+
runquery=False, configurables=configurables
|
|
344
|
+
)
|
|
286
345
|
return dag
|
|
287
346
|
|
|
288
|
-
def _get_all_connections(self) -> list[
|
|
347
|
+
def _get_all_connections(self) -> list[rm.ConnectionItemModel]:
|
|
289
348
|
connections = []
|
|
290
349
|
for conn_name, conn_props in self._conn_set.get_connections_as_dict().items():
|
|
291
350
|
if isinstance(conn_props, mf.ConnectionProperties):
|
|
292
351
|
label = conn_props.label if conn_props.label is not None else conn_name
|
|
293
|
-
connections.append(
|
|
352
|
+
connections.append(rm.ConnectionItemModel(name=conn_name, label=label))
|
|
294
353
|
return connections
|
|
295
354
|
|
|
296
|
-
def _get_all_data_models(self, compiled_dag: m.DAG) -> list[
|
|
355
|
+
def _get_all_data_models(self, compiled_dag: m.DAG) -> list[rm.DataModelItem]:
|
|
297
356
|
return compiled_dag.get_all_data_models()
|
|
298
357
|
|
|
299
|
-
async def get_all_data_models(self) -> list[
|
|
358
|
+
async def get_all_data_models(self) -> list[rm.DataModelItem]:
|
|
300
359
|
"""
|
|
301
360
|
Get all data models in the project
|
|
302
361
|
|
|
303
362
|
Returns:
|
|
304
363
|
A list of DataModelItem objects
|
|
305
364
|
"""
|
|
306
|
-
compiled_dag = await self._get_compiled_dag()
|
|
365
|
+
compiled_dag = await self._get_compiled_dag(self._admin_user)
|
|
307
366
|
return self._get_all_data_models(compiled_dag)
|
|
308
367
|
|
|
309
|
-
def _get_all_data_lineage(self, compiled_dag: m.DAG) -> list[
|
|
368
|
+
def _get_all_data_lineage(self, compiled_dag: m.DAG) -> list[rm.LineageRelation]:
|
|
310
369
|
all_lineage = compiled_dag.get_all_model_lineage()
|
|
311
370
|
|
|
312
371
|
# Add dataset nodes to the lineage
|
|
313
372
|
for dataset in self._manifest_cfg.datasets.values():
|
|
314
|
-
target_dataset =
|
|
315
|
-
source_model =
|
|
316
|
-
all_lineage.append(
|
|
373
|
+
target_dataset = rm.LineageNode(name=dataset.name, type="dataset")
|
|
374
|
+
source_model = rm.LineageNode(name=dataset.model, type="model")
|
|
375
|
+
all_lineage.append(rm.LineageRelation(type="runtime", source=source_model, target=target_dataset))
|
|
317
376
|
|
|
318
377
|
# Add dashboard nodes to the lineage
|
|
319
378
|
for dashboard in self._dashboards.values():
|
|
320
|
-
target_dashboard =
|
|
379
|
+
target_dashboard = rm.LineageNode(name=dashboard.dashboard_name, type="dashboard")
|
|
321
380
|
datasets = set(x.dataset for x in dashboard.config.depends_on)
|
|
322
381
|
for dataset in datasets:
|
|
323
|
-
source_dataset =
|
|
324
|
-
all_lineage.append(
|
|
382
|
+
source_dataset = rm.LineageNode(name=dataset, type="dataset")
|
|
383
|
+
all_lineage.append(rm.LineageRelation(type="runtime", source=source_dataset, target=target_dashboard))
|
|
325
384
|
|
|
326
385
|
return all_lineage
|
|
327
386
|
|
|
328
|
-
async def get_all_data_lineage(self) -> list[
|
|
387
|
+
async def get_all_data_lineage(self) -> list[rm.LineageRelation]:
|
|
329
388
|
"""
|
|
330
389
|
Get all data lineage in the project
|
|
331
390
|
|
|
332
391
|
Returns:
|
|
333
392
|
A list of LineageRelation objects
|
|
334
393
|
"""
|
|
335
|
-
compiled_dag = await self._get_compiled_dag()
|
|
394
|
+
compiled_dag = await self._get_compiled_dag(self._admin_user)
|
|
336
395
|
return self._get_all_data_lineage(compiled_dag)
|
|
337
396
|
|
|
338
|
-
async def _write_dataset_outputs_given_test_set(
|
|
339
|
-
self, dataset: str, select: str, test_set: str | None, runquery: bool, recurse: bool
|
|
340
|
-
) -> t.Any | None:
|
|
341
|
-
dataset_conf = self._manifest_cfg.datasets[dataset]
|
|
342
|
-
default_test_set_conf = self._manifest_cfg.get_default_test_set(dataset)
|
|
343
|
-
if test_set in self._manifest_cfg.selection_test_sets:
|
|
344
|
-
test_set_conf = self._manifest_cfg.selection_test_sets[test_set]
|
|
345
|
-
elif test_set is None or test_set == default_test_set_conf.name:
|
|
346
|
-
test_set, test_set_conf = default_test_set_conf.name, default_test_set_conf
|
|
347
|
-
else:
|
|
348
|
-
raise ConfigurationError(f"No test set named '{test_set}' was found when compiling dataset '{dataset}'. The test set must be defined if not default for dataset.")
|
|
349
|
-
|
|
350
|
-
error_msg_intro = f"Cannot compile dataset '{dataset}' with test set '{test_set}'."
|
|
351
|
-
if test_set_conf.datasets is not None and dataset not in test_set_conf.datasets:
|
|
352
|
-
raise ConfigurationError(f"{error_msg_intro}\n Applicable datasets for test set '{test_set}' does not include dataset '{dataset}'.")
|
|
353
|
-
|
|
354
|
-
user_attributes = test_set_conf.user_attributes.copy() if test_set_conf.user_attributes is not None else {}
|
|
355
|
-
selections = test_set_conf.parameters.copy()
|
|
356
|
-
username, is_admin = user_attributes.pop("username", ""), user_attributes.pop("is_admin", False)
|
|
357
|
-
if test_set_conf.is_authenticated:
|
|
358
|
-
user = self._auth.User(username=username, is_admin=is_admin, **user_attributes)
|
|
359
|
-
elif dataset_conf.scope == mf.PermissionScope.PUBLIC:
|
|
360
|
-
user = None
|
|
361
|
-
else:
|
|
362
|
-
raise ConfigurationError(f"{error_msg_intro}\n Non-public datasets require a test set with 'user_attributes' section defined")
|
|
363
|
-
|
|
364
|
-
if dataset_conf.scope == mf.PermissionScope.PRIVATE and not is_admin:
|
|
365
|
-
raise ConfigurationError(f"{error_msg_intro}\n Private datasets require a test set with user_attribute 'is_admin' set to true")
|
|
366
|
-
|
|
367
|
-
# always_python_df is set to True for creating CSV files from results (when runquery is True)
|
|
368
|
-
dag = self._generate_dag(dataset, target_model_name=select, always_python_df=runquery)
|
|
369
|
-
await dag.execute(
|
|
370
|
-
self._param_args, self._param_cfg_set, self._context_func, user, selections,
|
|
371
|
-
runquery=runquery, recurse=recurse, default_traits=self._manifest_cfg.get_default_traits()
|
|
372
|
-
)
|
|
373
|
-
|
|
374
|
-
output_folder = u.Path(self._filepath, c.TARGET_FOLDER, c.COMPILE_FOLDER, dataset, test_set)
|
|
375
|
-
if output_folder.exists():
|
|
376
|
-
shutil.rmtree(output_folder)
|
|
377
|
-
output_folder.mkdir(parents=True, exist_ok=True)
|
|
378
|
-
|
|
379
|
-
def write_placeholders() -> None:
|
|
380
|
-
output_filepath = u.Path(output_folder, "placeholders.json")
|
|
381
|
-
with open(output_filepath, 'w') as f:
|
|
382
|
-
json.dump(dag.placeholders, f, indent=4)
|
|
383
|
-
|
|
384
|
-
def write_model_outputs(model: m.DataModel) -> None:
|
|
385
|
-
assert isinstance(model, m.QueryModel)
|
|
386
|
-
subfolder = c.DBVIEWS_FOLDER if model.model_type == m.ModelType.DBVIEW else c.FEDERATES_FOLDER
|
|
387
|
-
subpath = u.Path(output_folder, subfolder)
|
|
388
|
-
subpath.mkdir(parents=True, exist_ok=True)
|
|
389
|
-
if isinstance(model.compiled_query, mq.SqlModelQuery):
|
|
390
|
-
output_filepath = u.Path(subpath, model.name+'.sql')
|
|
391
|
-
query = model.compiled_query.query
|
|
392
|
-
with open(output_filepath, 'w') as f:
|
|
393
|
-
f.write(query)
|
|
394
|
-
if runquery and isinstance(model.result, pl.LazyFrame):
|
|
395
|
-
output_filepath = u.Path(subpath, model.name+'.csv')
|
|
396
|
-
model.result.collect().write_csv(output_filepath)
|
|
397
|
-
|
|
398
|
-
write_placeholders()
|
|
399
|
-
all_model_names = dag.get_all_query_models()
|
|
400
|
-
coroutines = [asyncio.to_thread(write_model_outputs, dag.models_dict[name]) for name in all_model_names]
|
|
401
|
-
await u.asyncio_gather(coroutines)
|
|
402
|
-
|
|
403
|
-
if recurse:
|
|
404
|
-
self._draw_dag(dag, output_folder)
|
|
405
|
-
|
|
406
|
-
if isinstance(dag.target_model, m.QueryModel) and dag.target_model.compiled_query is not None:
|
|
407
|
-
return dag.target_model.compiled_query.query
|
|
408
|
-
|
|
409
397
|
async def compile(
    self, *, selected_model: str | None = None, test_set: str | None = None, do_all_test_sets: bool = False,
    runquery: bool = False, clear: bool = False, buildtime_only: bool = False, runtime_only: bool = False
) -> None:
    """
    Compile models into the "target/compile" folder.

    Behavior:
    - Buildtime outputs: one "<model name>.sql" file per SQL build model, written to the buildtime subfolder.
    - Runtime outputs (one subfolder per test set): "placeholders.json", plus "<model name>.sql" files
      for SQL dbview and federate models in their respective subfolders.
      If runquery=True, the models are also run and each written runtime model's result is saved as a CSV.
    - Python models are reported in the terminal but not compiled.
    - Options: clear entire compile folder first; compile only buildtime or only runtime.

    NOTE(review): an earlier version of this docstring also listed "dag.png" outputs, but nothing in this
    method draws the DAG - confirm whether that output is still expected.

    Arguments:
        selected_model: The name of the model to compile. If specified, only that model's outputs are written and its compiled SQL query (if any) is also printed in the terminal. If None, all models are compiled. Default is None.
        test_set: The name of the test set to compile with. If None, the default test set is used. Ignored if `do_all_test_sets` argument is True. Default is None.
        do_all_test_sets: Whether to compile with every configured test set (plus the default test set). If True, the `test_set` argument is ignored. Default is False.
        runquery: Whether to run the compiled runtime queries and save each result as a CSV file. Default is False.
        clear: Whether to clear the "target/compile/" folder before compiling. Default is False.
        buildtime_only: Whether to compile only buildtime models. Default is False.
        runtime_only: Whether to compile only runtime models. Default is False.
    """
    # NOTE(review): if both buildtime_only and runtime_only are True, neither phase runs and the method
    # only prints the completion message - confirm callers treat the two flags as mutually exclusive.
    border = "=" * 80
    underlines = "-" * len(border)

    compile_root = Path(self._filepath, c.TARGET_FOLDER, c.COMPILE_FOLDER)
    if clear and compile_root.exists():
        shutil.rmtree(compile_root)

    models_dict = self._get_models_dict(always_python_df=False)

    if selected_model is not None:
        selected_model = u.normalize_name(selected_model)
        if selected_model not in models_dict:
            print(f"No such model found: {selected_model}")
            return
        if not isinstance(models_dict[selected_model], m.QueryModel):
            print(f"Model '{selected_model}' is not a query model. Nothing to do.")
            return

    # Remembered so the compiled SQL of the selected model can be echoed at the end
    model_to_compile = None

    # Buildtime compilation
    if not runtime_only:
        print(underlines)
        print("Compiling buildtime models")
        print(underlines)

        buildtime_folder = Path(compile_root, c.COMPILE_BUILDTIME_FOLDER)
        buildtime_folder.mkdir(parents=True, exist_ok=True)

        def write_buildtime_model(model: m.DataModel, static_models: dict[str, m.StaticModel]) -> None:
            # Anything that is not a build model has no buildtime output
            if not isinstance(model, m.BuildModel):
                return

            model.compile_for_build(self._conn_args, static_models)

            if isinstance(model.compiled_query, mq.SqlModelQuery):
                out_path = Path(buildtime_folder, f"{model.name}.sql")
                # Explicit encoding so the output does not depend on the platform's default codec
                with open(out_path, 'w', encoding="utf-8") as f:
                    f.write(model.compiled_query.query)
                print(f"Successfully compiled build model: {model.name}")
            elif isinstance(model.compiled_query, mq.PyModelQuery):
                print(f"The build model '{model.name}' is in Python. Compilation for Python is not supported yet.")

        static_models = self._get_static_models()
        if selected_model is not None:
            model_to_compile = models_dict[selected_model]
            write_buildtime_model(model_to_compile, static_models)
        else:
            # Loop variable deliberately not named "m" to avoid shadowing the models module alias
            coros = [
                asyncio.to_thread(write_buildtime_model, static_model, static_models)
                for static_model in static_models.values()
            ]
            await u.asyncio_gather(coros)

        print(underlines)
        print()

    # Runtime compilation
    if not buildtime_only:
        configured_test_sets = self._manifest_cfg.selection_test_sets
        if do_all_test_sets:
            # Sorted so that the compilation order (and terminal output) is deterministic
            test_set_names = sorted(set(configured_test_sets.keys()) | {c.DEFAULT_TEST_SET_NAME})
        else:
            test_set_names = [test_set or c.DEFAULT_TEST_SET_NAME]

        for ts_name in test_set_names:
            print(underlines)
            print(f"Compiling runtime models (test set '{ts_name}')")
            print(underlines)

            # An unknown test set name falls back to the default configuration; say so instead of
            # silently writing default results into a folder named after the unknown test set.
            if ts_name not in configured_test_sets and ts_name != c.DEFAULT_TEST_SET_NAME:
                print(f"Warning: no test set named '{ts_name}' is defined. Falling back to the default test set configuration.")

            # Build user and selections from test set config if present
            ts_conf = configured_test_sets.get(ts_name, self._manifest_cfg.get_default_test_set())
            # Separate base fields from custom fields
            access_level = ts_conf.user.access_level
            custom_fields = self._auth.CustomUserFields(**ts_conf.user.custom_fields)
            if access_level == "guest":
                user = GuestUser(username="", custom_fields=custom_fields)
            else:
                user = RegisteredUser(username="", access_level=access_level, custom_fields=custom_fields)

            # Generate DAG across all models. When runquery=True, force models to produce Python dataframes so CSVs can be written.
            dag = await self._get_compiled_dag(
                user=user, selections=ts_conf.parameters, configurables=ts_conf.configurables, always_python_df=runquery,
            )
            if runquery:
                await dag._run_models()

            # Prepare output folders
            runtime_folder = Path(compile_root, c.COMPILE_RUNTIME_FOLDER, ts_name)
            dbviews_folder = Path(runtime_folder, c.DBVIEWS_FOLDER)
            federates_folder = Path(runtime_folder, c.FEDERATES_FOLDER)
            dbviews_folder.mkdir(parents=True, exist_ok=True)
            federates_folder.mkdir(parents=True, exist_ok=True)
            with open(Path(runtime_folder, "placeholders.json"), "w", encoding="utf-8") as f:
                json.dump(dag.placeholders, f)

            # Function to write runtime models
            def write_runtime_model(model: m.DataModel) -> None:
                if not isinstance(model, m.QueryModel):
                    return

                if model.model_type not in (m.ModelType.DBVIEW, m.ModelType.FEDERATE):
                    return

                subfolder = dbviews_folder if model.model_type == m.ModelType.DBVIEW else federates_folder
                model_type = "dbview" if model.model_type == m.ModelType.DBVIEW else "federate"

                if isinstance(model.compiled_query, mq.SqlModelQuery):
                    out_sql = Path(subfolder, f"{model.name}.sql")
                    with open(out_sql, 'w', encoding="utf-8") as f:
                        f.write(model.compiled_query.query)
                    print(f"Successfully compiled {model_type} model: {model.name}")
                elif isinstance(model.compiled_query, mq.PyModelQuery):
                    print(f"The {model_type} model '{model.name}' is in Python. Compilation for Python is not supported yet.")

                if runquery and isinstance(model.result, pl.LazyFrame):
                    out_csv = Path(subfolder, f"{model.name}.csv")
                    model.result.collect().write_csv(out_csv)
                    print(f"Successfully created CSV for {model_type} model: {model.name}")

            # If selected_model is provided for runtime, only emit that model's outputs
            if selected_model is not None:
                model_to_compile = dag.models_dict[selected_model]
                write_runtime_model(model_to_compile)
            else:
                coros = [asyncio.to_thread(write_runtime_model, model) for model in dag.models_dict.values()]
                await u.asyncio_gather(coros)

            print(underlines)
            print()

    print(f"All compilations complete! See the '{c.TARGET_FOLDER}/{c.COMPILE_FOLDER}/' folder for results.")
    # With several test sets, model_to_compile refers to the model from the last test set compiled
    if model_to_compile and isinstance(model_to_compile, m.QueryModel) and isinstance(model_to_compile.compiled_query, mq.SqlModelQuery):
        print()
        print(border)
        print(f"Compiled SQL query for model '{model_to_compile.name}':")
        print(underlines)
        print(model_to_compile.compiled_query.query)
        print(border)
        print()
|
|
455
557
|
|
|
456
|
-
def _permission_error(self, user:
|
|
457
|
-
|
|
458
|
-
return InvalidInputError(25, f"User{username} does not have permission to access {scope} {data_type}: {data_name}")
|
|
558
|
+
def _permission_error(self, user: AbstractUser, data_type: str, data_name: str, scope: str) -> InvalidInputError:
    """
    Build (without raising) the error describing that a user may not access a resource.

    Arguments:
        user: The user that was denied access. Interpolated into the message via its string form.
        data_type: The kind of resource (callers in this class pass "dataset" or "dashboard"). Also used as the suffix of the error code.
        data_name: The name of the resource that was requested.
        scope: The name of the permission scope of the resource.

    Returns:
        An InvalidInputError constructed with 403 (presumably the HTTP status code - confirm against InvalidInputError), the error code, and the message.
    """
    error_code = f"unauthorized_access_to_{data_type}"
    message = f"User '{user}' does not have permission to access {scope} {data_type}: {data_name}"
    return InvalidInputError(403, error_code, message)
|
|
459
560
|
|
|
460
561
|
def seed(self, name: str) -> pl.LazyFrame:
|
|
461
562
|
"""
|
|
@@ -491,7 +592,8 @@ class SquirrelsProject:
|
|
|
491
592
|
)
|
|
492
593
|
|
|
493
594
|
async def dataset(
|
|
494
|
-
self, name: str, *, selections: dict[str, t.Any] = {}, user:
|
|
595
|
+
self, name: str, *, selections: dict[str, t.Any] = {}, user: AbstractUser | None = None, require_auth: bool = True,
|
|
596
|
+
configurables: dict[str, str] = {}
|
|
495
597
|
) -> dr.DatasetResult:
|
|
496
598
|
"""
|
|
497
599
|
Async method to retrieve a dataset as a DatasetResult object (with metadata) given parameter selections.
|
|
@@ -504,14 +606,17 @@ class SquirrelsProject:
|
|
|
504
606
|
Returns:
|
|
505
607
|
A DatasetResult object containing the dataset result (as a polars DataFrame), its description, and the column details.
|
|
506
608
|
"""
|
|
609
|
+
if user is None:
|
|
610
|
+
user = self._guest_user
|
|
611
|
+
|
|
507
612
|
scope = self._manifest_cfg.datasets[name].scope
|
|
508
613
|
if require_auth and not self._auth.can_user_access_scope(user, scope):
|
|
509
614
|
raise self._permission_error(user, "dataset", name, scope.name)
|
|
510
615
|
|
|
511
616
|
dag = self._generate_dag(name)
|
|
617
|
+
configurables = {**self._manifest_cfg.get_default_configurables(name), **configurables}
|
|
512
618
|
await dag.execute(
|
|
513
|
-
self._param_args, self._param_cfg_set, self._context_func, user, dict(selections),
|
|
514
|
-
default_traits=self._manifest_cfg.get_default_traits()
|
|
619
|
+
self._param_args, self._param_cfg_set, self._context_func, user, dict(selections), configurables=configurables
|
|
515
620
|
)
|
|
516
621
|
assert isinstance(dag.target_model.result, pl.LazyFrame)
|
|
517
622
|
return dr.DatasetResult(
|
|
@@ -520,7 +625,8 @@ class SquirrelsProject:
|
|
|
520
625
|
)
|
|
521
626
|
|
|
522
627
|
async def dashboard(
|
|
523
|
-
self, name: str, *, selections: dict[str, t.Any] = {}, user:
|
|
628
|
+
self, name: str, *, selections: dict[str, t.Any] = {}, user: AbstractUser | None = None, dashboard_type: t.Type[T] = d.PngDashboard,
|
|
629
|
+
configurables: dict[str, str] = {}
|
|
524
630
|
) -> T:
|
|
525
631
|
"""
|
|
526
632
|
Async method to retrieve a dashboard given parameter selections.
|
|
@@ -534,13 +640,18 @@ class SquirrelsProject:
|
|
|
534
640
|
Returns:
|
|
535
641
|
The dashboard type specified by the "dashboard_type" argument.
|
|
536
642
|
"""
|
|
643
|
+
if user is None:
|
|
644
|
+
user = self._guest_user
|
|
645
|
+
|
|
537
646
|
scope = self._dashboards[name].config.scope
|
|
538
647
|
if not self._auth.can_user_access_scope(user, scope):
|
|
539
648
|
raise self._permission_error(user, "dashboard", name, scope.name)
|
|
540
649
|
|
|
541
650
|
async def get_dataset_df(dataset_name: str, fixed_params: dict[str, t.Any]) -> pl.DataFrame:
|
|
542
651
|
final_selections = {**selections, **fixed_params}
|
|
543
|
-
result = await self.dataset(
|
|
652
|
+
result = await self.dataset(
|
|
653
|
+
dataset_name, selections=final_selections, user=user, require_auth=False, configurables=configurables
|
|
654
|
+
)
|
|
544
655
|
return result.df
|
|
545
656
|
|
|
546
657
|
args = d.DashboardArgs(self._param_args, get_dataset_df)
|
|
@@ -550,12 +661,62 @@ class SquirrelsProject:
|
|
|
550
661
|
raise KeyError(f"No dashboard file found for: {name}")
|
|
551
662
|
|
|
552
663
|
async def query_models(
    self, sql_query: str, *, user: AbstractUser | None = None, selections: dict[str, t.Any] = {}, configurables: dict[str, str] = {}
) -> dr.DatasetResult:
    """
    Async method to run a SQL query against the data models and return the result as a DatasetResult object.

    Arguments:
        sql_query: The SQL query to compile and run.
        user: The user to run the query as. If None, the project's guest user is used. Default is None.
        selections: The parameter selections, forwarded to the DAG compilation. Default is an empty dictionary.
        configurables: The configurable values, forwarded to the DAG compilation. Default is an empty dictionary.

    Returns:
        A DatasetResult object holding the target model's config and the collected result with a 1-based "_row_num" row index column.
    """
    effective_user = self._guest_user if user is None else user

    dag = await self._get_compiled_dag(
        user=effective_user, sql_query=sql_query, selections=selections, configurables=configurables
    )
    await dag._run_models()

    # The target model is expected to hold a lazy frame once the models have run
    assert isinstance(dag.target_model.result, pl.LazyFrame)
    target_config = dag.target_model.model_config
    result_df = dag.target_model.result.collect().with_row_index("_row_num", offset=1)
    return dr.DatasetResult(target_model_config=target_config, df=result_df)
|
|
676
|
+
|
|
677
|
+
async def get_compiled_model_query(
    self, model_name: str, *, user: AbstractUser | None = None, selections: dict[str, t.Any] = {}, configurables: dict[str, str] = {}
) -> rm.CompiledQueryModel:
    """
    Compile the specified data model and return its language and compiled definition.

    Arguments:
        model_name: The name of the data model to compile. It is normalized before lookup.
        user: The user to compile as. If None, the project's guest user is used. Default is None.
        selections: The parameter selections to compile with. Default is an empty dictionary.
        configurables: Configurable values that override the project defaults. Default is an empty dictionary.

    Returns:
        A CompiledQueryModel with the model's language ("sql" or "python"), its compiled definition (a placeholder comment for Python models), and the DAG's placeholders.

    Raises:
        InvalidInputError: If no model has the given name, or the model is not a build, dbview, or federate model.
        NotImplementedError: If the compiled query of a dbview or federate model is neither SQL nor Python.
    """
    if user is None:
        user = self._guest_user

    name = u.normalize_name(model_name)
    models_dict = self._get_models_dict(always_python_df=False)
    if name not in models_dict:
        raise InvalidInputError(404, "model_not_found", f"No data model found with name: {model_name}")

    model = models_dict[name]
    # Only build, dbview, and federate models support runtime compiled definition in this context
    if not isinstance(model, (m.BuildModel, m.DbviewModel, m.FederateModel)):
        raise InvalidInputError(400, "unsupported_model_type", "Only build, dbview, and federate models currently support compiled definition via this endpoint")

    # Build a DAG with this model as the target, without a dataset context
    model.is_target = True
    dag = m.DAG(None, model, models_dict, self._datalake_db_path, self._logger)

    cfg = {**self._manifest_cfg.get_default_configurables(), **configurables}
    # Pass a copy of the selections (as the dataset method does) so the caller's dictionary - or the
    # shared default argument - can never be modified by the DAG execution.
    await dag.execute(
        self._param_args, self._param_cfg_set, self._context_func, user, dict(selections), runquery=False, configurables=cfg
    )

    language = "sql" if isinstance(model.query_file, mq.SqlQueryFile) else "python"
    if isinstance(model, m.BuildModel):
        # Compile SQL build models; Python build models not yet supported
        if isinstance(model.query_file, mq.SqlQueryFile):
            static_models = self._get_static_models()
            compiled = model._compile_sql_model(model.query_file, self._conn_args, static_models)
            definition = compiled.query
        else:
            definition = "# Compiling Python build models is currently not supported. This will be available in a future version of Squirrels..."
    elif isinstance(model.compiled_query, mq.SqlModelQuery):
        definition = model.compiled_query.query
    elif isinstance(model.compiled_query, mq.PyModelQuery):
        definition = "# Compiling Python data models is currently not supported. This will be available in a future version of Squirrels..."
    else:
        raise NotImplementedError(f"Query type not supported: {model.compiled_query.__class__.__name__}")

    return rm.CompiledQueryModel(language=language, definition=definition, placeholders=dag.placeholders)
|
|
722
|
+
|