squirrels 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of squirrels might be problematic. Click here for more details.
- dateutils/__init__.py +6 -0
- dateutils/_enums.py +25 -0
- squirrels/dateutils.py → dateutils/_implementation.py +58 -111
- dateutils/types.py +6 -0
- squirrels/__init__.py +13 -11
- squirrels/_api_routes/__init__.py +5 -0
- squirrels/_api_routes/auth.py +271 -0
- squirrels/_api_routes/base.py +165 -0
- squirrels/_api_routes/dashboards.py +150 -0
- squirrels/_api_routes/data_management.py +145 -0
- squirrels/_api_routes/datasets.py +257 -0
- squirrels/_api_routes/oauth2.py +298 -0
- squirrels/_api_routes/project.py +252 -0
- squirrels/_api_server.py +256 -450
- squirrels/_arguments/__init__.py +0 -0
- squirrels/_arguments/init_time_args.py +108 -0
- squirrels/_arguments/run_time_args.py +147 -0
- squirrels/_auth.py +960 -0
- squirrels/_command_line.py +126 -45
- squirrels/_compile_prompts.py +147 -0
- squirrels/_connection_set.py +48 -26
- squirrels/_constants.py +68 -38
- squirrels/_dashboards.py +160 -0
- squirrels/_data_sources.py +570 -0
- squirrels/_dataset_types.py +84 -0
- squirrels/_exceptions.py +29 -0
- squirrels/_initializer.py +177 -80
- squirrels/_logging.py +115 -0
- squirrels/_manifest.py +208 -79
- squirrels/_model_builder.py +69 -0
- squirrels/_model_configs.py +74 -0
- squirrels/_model_queries.py +52 -0
- squirrels/_models.py +926 -367
- squirrels/_package_data/base_project/.env +42 -0
- squirrels/_package_data/base_project/.env.example +42 -0
- squirrels/_package_data/base_project/assets/expenses.db +0 -0
- squirrels/_package_data/base_project/connections.yml +16 -0
- squirrels/_package_data/base_project/dashboards/dashboard_example.py +34 -0
- squirrels/_package_data/base_project/dashboards/dashboard_example.yml +22 -0
- squirrels/{package_data → _package_data}/base_project/docker/.dockerignore +5 -2
- squirrels/{package_data → _package_data}/base_project/docker/Dockerfile +3 -3
- squirrels/{package_data → _package_data}/base_project/docker/compose.yml +1 -1
- squirrels/_package_data/base_project/duckdb_init.sql +10 -0
- squirrels/{package_data/base_project/.gitignore → _package_data/base_project/gitignore} +3 -2
- squirrels/_package_data/base_project/macros/macros_example.sql +17 -0
- squirrels/_package_data/base_project/models/builds/build_example.py +26 -0
- squirrels/_package_data/base_project/models/builds/build_example.sql +16 -0
- squirrels/_package_data/base_project/models/builds/build_example.yml +57 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.sql +12 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.yml +26 -0
- squirrels/_package_data/base_project/models/federates/federate_example.py +37 -0
- squirrels/_package_data/base_project/models/federates/federate_example.sql +19 -0
- squirrels/_package_data/base_project/models/federates/federate_example.yml +65 -0
- squirrels/_package_data/base_project/models/sources.yml +38 -0
- squirrels/{package_data → _package_data}/base_project/parameters.yml +56 -40
- squirrels/_package_data/base_project/pyconfigs/connections.py +14 -0
- squirrels/{package_data → _package_data}/base_project/pyconfigs/context.py +21 -40
- squirrels/_package_data/base_project/pyconfigs/parameters.py +141 -0
- squirrels/_package_data/base_project/pyconfigs/user.py +44 -0
- squirrels/_package_data/base_project/seeds/seed_categories.yml +15 -0
- squirrels/_package_data/base_project/seeds/seed_subcategories.csv +15 -0
- squirrels/_package_data/base_project/seeds/seed_subcategories.yml +21 -0
- squirrels/_package_data/base_project/squirrels.yml.j2 +61 -0
- squirrels/_package_data/templates/dataset_results.html +112 -0
- squirrels/_package_data/templates/oauth_login.html +271 -0
- squirrels/_package_data/templates/squirrels_studio.html +20 -0
- squirrels/_package_loader.py +8 -4
- squirrels/_parameter_configs.py +104 -103
- squirrels/_parameter_options.py +348 -0
- squirrels/_parameter_sets.py +57 -47
- squirrels/_parameters.py +1664 -0
- squirrels/_project.py +721 -0
- squirrels/_py_module.py +7 -5
- squirrels/_schemas/__init__.py +0 -0
- squirrels/_schemas/auth_models.py +167 -0
- squirrels/_schemas/query_param_models.py +75 -0
- squirrels/{_api_response_models.py → _schemas/response_models.py} +126 -47
- squirrels/_seeds.py +35 -16
- squirrels/_sources.py +110 -0
- squirrels/_utils.py +248 -73
- squirrels/_version.py +1 -1
- squirrels/arguments.py +7 -0
- squirrels/auth.py +4 -0
- squirrels/connections.py +3 -0
- squirrels/dashboards.py +2 -81
- squirrels/data_sources.py +14 -631
- squirrels/parameter_options.py +13 -348
- squirrels/parameters.py +14 -1266
- squirrels/types.py +16 -0
- squirrels-0.5.0.dist-info/METADATA +113 -0
- squirrels-0.5.0.dist-info/RECORD +97 -0
- {squirrels-0.4.1.dist-info → squirrels-0.5.0.dist-info}/WHEEL +1 -1
- squirrels-0.5.0.dist-info/entry_points.txt +3 -0
- {squirrels-0.4.1.dist-info → squirrels-0.5.0.dist-info/licenses}/LICENSE +1 -1
- squirrels/_authenticator.py +0 -85
- squirrels/_dashboards_io.py +0 -61
- squirrels/_environcfg.py +0 -84
- squirrels/arguments/init_time_args.py +0 -40
- squirrels/arguments/run_time_args.py +0 -208
- squirrels/package_data/assets/favicon.ico +0 -0
- squirrels/package_data/assets/index.css +0 -1
- squirrels/package_data/assets/index.js +0 -58
- squirrels/package_data/base_project/assets/expenses.db +0 -0
- squirrels/package_data/base_project/connections.yml +0 -7
- squirrels/package_data/base_project/dashboards/dashboard_example.py +0 -32
- squirrels/package_data/base_project/dashboards.yml +0 -10
- squirrels/package_data/base_project/env.yml +0 -29
- squirrels/package_data/base_project/models/dbviews/dbview_example.py +0 -47
- squirrels/package_data/base_project/models/dbviews/dbview_example.sql +0 -22
- squirrels/package_data/base_project/models/federates/federate_example.py +0 -21
- squirrels/package_data/base_project/models/federates/federate_example.sql +0 -3
- squirrels/package_data/base_project/pyconfigs/auth.py +0 -45
- squirrels/package_data/base_project/pyconfigs/connections.py +0 -19
- squirrels/package_data/base_project/pyconfigs/parameters.py +0 -95
- squirrels/package_data/base_project/seeds/seed_subcategories.csv +0 -15
- squirrels/package_data/base_project/squirrels.yml.j2 +0 -94
- squirrels/package_data/templates/index.html +0 -18
- squirrels/project.py +0 -378
- squirrels/user_base.py +0 -55
- squirrels-0.4.1.dist-info/METADATA +0 -117
- squirrels-0.4.1.dist-info/RECORD +0 -60
- squirrels-0.4.1.dist-info/entry_points.txt +0 -4
- /squirrels/{package_data → _package_data}/base_project/assets/weather.db +0 -0
- /squirrels/{package_data → _package_data}/base_project/seeds/seed_categories.csv +0 -0
- /squirrels/{package_data → _package_data}/base_project/tmp/.gitignore +0 -0
squirrels/_manifest.py
CHANGED
|
@@ -1,22 +1,32 @@
|
|
|
1
|
-
from
|
|
1
|
+
from functools import cached_property
|
|
2
|
+
from typing import Literal, Any
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
from sqlalchemy import Engine, create_engine
|
|
2
5
|
from typing_extensions import Self
|
|
3
6
|
from enum import Enum
|
|
4
7
|
from pydantic import BaseModel, Field, field_validator, model_validator, ValidationInfo, ValidationError
|
|
5
|
-
import yaml, time
|
|
8
|
+
import yaml, time, re
|
|
6
9
|
|
|
7
|
-
from . import _constants as c, _utils as
|
|
8
|
-
from ._environcfg import EnvironConfig
|
|
10
|
+
from . import _constants as c, _utils as u
|
|
9
11
|
|
|
10
12
|
|
|
11
13
|
class ProjectVarsConfig(BaseModel, extra="allow"):
|
|
12
14
|
name: str
|
|
13
15
|
label: str = ""
|
|
16
|
+
description: str = ""
|
|
14
17
|
major_version: int
|
|
15
18
|
|
|
19
|
+
@field_validator("name")
|
|
20
|
+
@classmethod
|
|
21
|
+
def validate_name(cls, v: str) -> str:
|
|
22
|
+
if not re.fullmatch(r"[A-Za-z0-9_-]+", v):
|
|
23
|
+
raise ValueError("Project name must only contain alphanumeric characters, underscores, and dashes.")
|
|
24
|
+
return v
|
|
25
|
+
|
|
16
26
|
@model_validator(mode="after")
|
|
17
27
|
def finalize_label(self) -> Self:
|
|
18
28
|
if self.label == "":
|
|
19
|
-
self.label = self.name
|
|
29
|
+
self.label = u.to_title_case(self.name)
|
|
20
30
|
return self
|
|
21
31
|
|
|
22
32
|
|
|
@@ -36,41 +46,133 @@ class _ConfigWithNameBaseModel(BaseModel):
|
|
|
36
46
|
name: str
|
|
37
47
|
|
|
38
48
|
|
|
39
|
-
class
|
|
40
|
-
|
|
41
|
-
|
|
49
|
+
class ConnectionTypeEnum(Enum):
|
|
50
|
+
SQLALCHEMY = "sqlalchemy"
|
|
51
|
+
CONNECTORX = "connectorx"
|
|
52
|
+
ADBC = "adbc"
|
|
53
|
+
DUCKDB = "duckdb"
|
|
42
54
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
55
|
+
|
|
56
|
+
class ConnectionProperties(BaseModel):
|
|
57
|
+
"""
|
|
58
|
+
A class for holding the properties of a connection
|
|
59
|
+
|
|
60
|
+
Arguments:
|
|
61
|
+
type: The type of connection, one of "sqlalchemy", "connectorx", "adbc", or "duckdb"
|
|
62
|
+
uri: The URI for the connection
|
|
63
|
+
"""
|
|
64
|
+
label: str | None = None
|
|
65
|
+
type: ConnectionTypeEnum = Field(default=ConnectionTypeEnum.SQLALCHEMY)
|
|
66
|
+
uri: str
|
|
67
|
+
sa_create_engine_args: dict[str, Any] = Field(default_factory=dict)
|
|
68
|
+
|
|
69
|
+
@cached_property
|
|
70
|
+
def engine(self) -> Engine:
|
|
71
|
+
"""
|
|
72
|
+
Creates and caches a SQLAlchemy engine if the connection type is sqlalchemy.
|
|
73
|
+
Raises ValueError for other connection types.
|
|
74
|
+
"""
|
|
75
|
+
if self.type == ConnectionTypeEnum.SQLALCHEMY:
|
|
76
|
+
return create_engine(self.uri, **self.sa_create_engine_args)
|
|
77
|
+
else:
|
|
78
|
+
raise ValueError(f'Connection type "{self.type}" does not support engine property')
|
|
79
|
+
|
|
80
|
+
@cached_property
|
|
81
|
+
def dialect(self) -> str:
|
|
82
|
+
default_dialect = None
|
|
83
|
+
if self.type == ConnectionTypeEnum.SQLALCHEMY:
|
|
84
|
+
dialect = self.engine.dialect.name
|
|
85
|
+
elif self.type == ConnectionTypeEnum.DUCKDB:
|
|
86
|
+
dialect = self.uri.split(':')[0]
|
|
87
|
+
default_dialect = 'duckdb'
|
|
88
|
+
else:
|
|
89
|
+
url = urlparse(self.uri)
|
|
90
|
+
dialect = url.scheme
|
|
91
|
+
|
|
92
|
+
processed_dialect = next((d for d in ['sqlite', 'postgres', 'mysql', 'duckdb'] if dialect.lower().startswith(d)), default_dialect)
|
|
93
|
+
dialect = processed_dialect if processed_dialect is not None else dialect
|
|
94
|
+
return dialect
|
|
95
|
+
|
|
96
|
+
@cached_property
|
|
97
|
+
def attach_uri_for_duckdb(self) -> str | None:
|
|
98
|
+
if self.type == ConnectionTypeEnum.DUCKDB:
|
|
99
|
+
return self.uri
|
|
100
|
+
elif self.type == ConnectionTypeEnum.SQLALCHEMY:
|
|
101
|
+
url = self.engine.url
|
|
102
|
+
host = url.host
|
|
103
|
+
port = url.port
|
|
104
|
+
username = url.username
|
|
105
|
+
password = url.password
|
|
106
|
+
database = url.database
|
|
107
|
+
database_as_file = database if database is not None else ""
|
|
108
|
+
else:
|
|
109
|
+
url = urlparse(self.uri)
|
|
110
|
+
host = url.hostname
|
|
111
|
+
port = url.port
|
|
112
|
+
username = url.username
|
|
113
|
+
password = url.password
|
|
114
|
+
database = url.path.lstrip('/')
|
|
115
|
+
database_as_file = self.uri.replace(f"{self.dialect}://", "")
|
|
116
|
+
|
|
117
|
+
if self.dialect in ('postgres', 'mysql'):
|
|
118
|
+
attach_uri = f"{self.dialect}:dbname={database} user={username} password={password} host={host} port={port}"
|
|
119
|
+
elif self.dialect == "sqlite":
|
|
120
|
+
attach_uri = f"{self.dialect}:{database_as_file}"
|
|
121
|
+
elif self.dialect == "duckdb":
|
|
122
|
+
attach_uri = database_as_file
|
|
123
|
+
else:
|
|
124
|
+
attach_uri = None
|
|
125
|
+
|
|
126
|
+
return attach_uri
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class DbConnConfig(ConnectionProperties, _ConfigWithNameBaseModel):
|
|
130
|
+
def finalize_uri(self, base_path: str) -> Self:
|
|
131
|
+
self.uri = self.uri.format(project_path=base_path)
|
|
46
132
|
return self
|
|
47
133
|
|
|
48
134
|
|
|
135
|
+
class DatasetConfigurablesConfig(BaseModel):
|
|
136
|
+
name: str
|
|
137
|
+
default: str
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class ConfigurablesConfig(DatasetConfigurablesConfig):
|
|
141
|
+
label: str = ""
|
|
142
|
+
description: str = ""
|
|
143
|
+
|
|
144
|
+
|
|
49
145
|
class ParametersConfig(BaseModel):
|
|
50
146
|
type: str
|
|
51
147
|
factory: str
|
|
52
148
|
arguments: dict[str, Any]
|
|
53
149
|
|
|
54
150
|
|
|
55
|
-
class
|
|
56
|
-
|
|
151
|
+
class PermissionScope(Enum):
|
|
152
|
+
PUBLIC = 0
|
|
153
|
+
PROTECTED = 1
|
|
154
|
+
PRIVATE = 2
|
|
57
155
|
|
|
58
156
|
|
|
59
|
-
class
|
|
60
|
-
|
|
157
|
+
class AuthenticationEnforcement(Enum):
|
|
158
|
+
REQUIRED = "required"
|
|
159
|
+
OPTIONAL = "optional"
|
|
160
|
+
DISABLED = "disabled"
|
|
61
161
|
|
|
162
|
+
class AuthenticationType(Enum):
|
|
163
|
+
MANAGED = "managed"
|
|
164
|
+
EXTERNAL = "external"
|
|
62
165
|
|
|
63
|
-
class
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
PRIVATE = 2
|
|
166
|
+
class AuthenticationConfig(BaseModel):
|
|
167
|
+
enforcement: AuthenticationEnforcement = AuthenticationEnforcement.OPTIONAL
|
|
168
|
+
type: AuthenticationType = AuthenticationType.MANAGED
|
|
67
169
|
|
|
68
170
|
|
|
69
171
|
class AnalyticsOutputConfig(_ConfigWithNameBaseModel):
|
|
70
172
|
label: str = ""
|
|
71
173
|
description: str = ""
|
|
72
|
-
scope:
|
|
73
|
-
parameters: list[str] = Field(
|
|
174
|
+
scope: PermissionScope = PermissionScope.PUBLIC
|
|
175
|
+
parameters: list[str] | None = Field(default=None, description="The list of parameter names used by the dataset/dashboard")
|
|
74
176
|
|
|
75
177
|
@model_validator(mode="after")
|
|
76
178
|
def finalize_label(self) -> Self:
|
|
@@ -80,19 +182,18 @@ class AnalyticsOutputConfig(_ConfigWithNameBaseModel):
|
|
|
80
182
|
|
|
81
183
|
@field_validator("scope", mode="before")
|
|
82
184
|
@classmethod
|
|
83
|
-
def validate_scope(cls, value: str, info: ValidationInfo) ->
|
|
185
|
+
def validate_scope(cls, value: str, info: ValidationInfo) -> PermissionScope:
|
|
84
186
|
try:
|
|
85
|
-
return
|
|
187
|
+
return PermissionScope[str(value).upper()]
|
|
86
188
|
except KeyError as e:
|
|
87
189
|
name = info.data.get("name")
|
|
88
|
-
scope_list = [scope.name.lower() for scope in
|
|
190
|
+
scope_list = [scope.name.lower() for scope in PermissionScope]
|
|
89
191
|
raise ValueError(f'Scope "{value}" is invalid for dataset/dashboard "{name}". Scope must be one of {scope_list}') from e
|
|
90
192
|
|
|
91
193
|
|
|
92
194
|
class DatasetConfig(AnalyticsOutputConfig):
|
|
93
195
|
model: str = ""
|
|
94
|
-
|
|
95
|
-
default_test_set: str = ""
|
|
196
|
+
configurables: list[DatasetConfigurablesConfig] = Field(default_factory=list)
|
|
96
197
|
|
|
97
198
|
def __hash__(self) -> int:
|
|
98
199
|
return hash("dataset_"+self.name)
|
|
@@ -104,47 +205,27 @@ class DatasetConfig(AnalyticsOutputConfig):
|
|
|
104
205
|
return self
|
|
105
206
|
|
|
106
207
|
|
|
107
|
-
class
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
208
|
+
class TestSetsUserConfig(BaseModel):
|
|
209
|
+
access_level: Literal["admin", "member", "guest"] = "guest"
|
|
210
|
+
custom_fields: dict[str, Any] = Field(default_factory=dict)
|
|
111
211
|
|
|
112
212
|
class TestSetsConfig(_ConfigWithNameBaseModel):
|
|
113
|
-
|
|
114
|
-
is_authenticated: bool = False
|
|
115
|
-
user_attributes: dict[str, Any] = Field(default_factory=dict)
|
|
213
|
+
user: TestSetsUserConfig = Field(default_factory=TestSetsUserConfig)
|
|
116
214
|
parameters: dict[str, Any] = Field(default_factory=dict)
|
|
117
|
-
|
|
118
|
-
@model_validator(mode="after")
|
|
119
|
-
def finalize_is_authenticated(self) -> Self:
|
|
120
|
-
if len(self.user_attributes) > 0:
|
|
121
|
-
self.is_authenticated = True
|
|
122
|
-
return self
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
class Settings(BaseModel):
|
|
126
|
-
data: dict[str, Any]
|
|
127
|
-
|
|
128
|
-
def get_default_connection_name(self) -> str:
|
|
129
|
-
return self.data.get(c.DB_CONN_DEFAULT_USED_SETTING, c.DEFAULT_DB_CONN)
|
|
130
|
-
|
|
131
|
-
def do_use_duckdb(self) -> bool:
|
|
132
|
-
return self.data.get(c.IN_MEMORY_DB_SETTING, c.SQLITE) == c.DUCKDB
|
|
215
|
+
configurables: dict[str, Any] = Field(default_factory=dict)
|
|
133
216
|
|
|
134
217
|
|
|
135
218
|
class ManifestConfig(BaseModel):
|
|
136
|
-
env_cfg: EnvironConfig
|
|
137
219
|
project_variables: ProjectVarsConfig
|
|
220
|
+
authentication: AuthenticationConfig = Field(default_factory=AuthenticationConfig)
|
|
138
221
|
packages: list[PackageConfig] = Field(default_factory=list)
|
|
139
222
|
connections: dict[str, DbConnConfig] = Field(default_factory=dict)
|
|
140
223
|
parameters: list[ParametersConfig] = Field(default_factory=list)
|
|
224
|
+
configurables: dict[str, ConfigurablesConfig] = Field(default_factory=dict)
|
|
141
225
|
selection_test_sets: dict[str, TestSetsConfig] = Field(default_factory=dict)
|
|
142
|
-
dbviews: dict[str, DbviewConfig] = Field(default_factory=dict)
|
|
143
|
-
federates: dict[str, FederateConfig] = Field(default_factory=dict)
|
|
144
226
|
datasets: dict[str, DatasetConfig] = Field(default_factory=dict)
|
|
145
|
-
dashboards: dict[str, DashboardConfig] = Field(default_factory=dict)
|
|
146
|
-
settings: dict[str, Any] = Field(default_factory=dict)
|
|
147
227
|
base_path: str = "."
|
|
228
|
+
env_vars: dict[str, str] = Field(default_factory=dict)
|
|
148
229
|
|
|
149
230
|
@field_validator("packages")
|
|
150
231
|
@classmethod
|
|
@@ -156,13 +237,13 @@ class ManifestConfig(BaseModel):
|
|
|
156
237
|
set_of_directories.add(package.directory)
|
|
157
238
|
return packages
|
|
158
239
|
|
|
159
|
-
@field_validator("connections", "selection_test_sets", "
|
|
240
|
+
@field_validator("connections", "selection_test_sets", "datasets", "configurables", mode="before")
|
|
160
241
|
@classmethod
|
|
161
242
|
def names_are_unique(cls, values: list[dict] | dict[str, dict], info: ValidationInfo) -> dict[str, dict]:
|
|
162
243
|
if isinstance(values, list):
|
|
163
244
|
values_as_dict = {}
|
|
164
245
|
for obj in values:
|
|
165
|
-
name = obj["name"]
|
|
246
|
+
name = u.normalize_name(obj["name"])
|
|
166
247
|
if name in values_as_dict:
|
|
167
248
|
raise ValueError(f'In the {info.field_name} section, the name "{name}" was specified multiple times')
|
|
168
249
|
values_as_dict[name] = obj
|
|
@@ -173,45 +254,93 @@ class ManifestConfig(BaseModel):
|
|
|
173
254
|
@model_validator(mode="after")
|
|
174
255
|
def finalize_connections(self) -> Self:
|
|
175
256
|
for conn in self.connections.values():
|
|
176
|
-
conn.
|
|
257
|
+
conn.finalize_uri(self.base_path)
|
|
177
258
|
return self
|
|
178
259
|
|
|
179
|
-
@
|
|
180
|
-
def
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
260
|
+
@model_validator(mode="after")
|
|
261
|
+
def validate_authentication_and_scopes(self) -> Self:
|
|
262
|
+
"""
|
|
263
|
+
Enforce authentication rules:
|
|
264
|
+
- If authentication.enforcement is "required", no dataset may be PUBLIC.
|
|
265
|
+
"""
|
|
266
|
+
if self.authentication.enforcement == AuthenticationEnforcement.REQUIRED:
|
|
267
|
+
invalid = [name for name, ds in self.datasets.items() if ds.scope == PermissionScope.PUBLIC]
|
|
268
|
+
if invalid:
|
|
269
|
+
raise ValueError(
|
|
270
|
+
"Authentication is required, so datasets cannot be public. "
|
|
271
|
+
f"Update the scope for datasets: {invalid}"
|
|
272
|
+
)
|
|
273
|
+
return self
|
|
274
|
+
|
|
275
|
+
@model_validator(mode="after")
|
|
276
|
+
def validate_dataset_configurables(self) -> Self:
|
|
277
|
+
"""
|
|
278
|
+
Validate that dataset configurables reference valid project-level configurables.
|
|
279
|
+
"""
|
|
280
|
+
for dataset_name, dataset_cfg in self.datasets.items():
|
|
281
|
+
for cfg_override in dataset_cfg.configurables:
|
|
282
|
+
if cfg_override.name not in self.configurables:
|
|
283
|
+
raise ValueError(
|
|
284
|
+
f'Dataset "{dataset_name}" references configurable "{cfg_override.name}" which is not defined '
|
|
285
|
+
f'in the project configurables'
|
|
286
|
+
)
|
|
287
|
+
return self
|
|
288
|
+
|
|
289
|
+
def get_default_test_set(self) -> TestSetsConfig:
|
|
184
290
|
"""
|
|
185
291
|
Returns the selection test set named by the default test set name, or a default-constructed test set with that name if none is defined
|
|
186
292
|
"""
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
default_name = default_name_1 if default_name_1 else default_name_2
|
|
190
|
-
default_test_set = self.selection_test_sets.get(default_name, TestSetsConfig(name=default_name))
|
|
293
|
+
default_default_test_set = TestSetsConfig(name=c.DEFAULT_TEST_SET_NAME)
|
|
294
|
+
default_test_set = self.selection_test_sets.get(c.DEFAULT_TEST_SET_NAME, default_default_test_set)
|
|
191
295
|
return default_test_set
|
|
192
296
|
|
|
193
|
-
def
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
297
|
+
def get_default_configurables(self, dataset_name: str | None = None) -> dict[str, str]:
|
|
298
|
+
"""
|
|
299
|
+
Return a dictionary of configurable name to its default value.
|
|
300
|
+
|
|
301
|
+
If dataset_name is provided, merges project-level defaults with dataset-specific overrides.
|
|
302
|
+
|
|
303
|
+
Project-level configurables are read from a dict keyed by name; dataset-level overrides are read from a list.
|
|
304
|
+
"""
|
|
305
|
+
defaults: dict[str, str] = {}
|
|
306
|
+
for name, cfg in self.configurables.items():
|
|
307
|
+
defaults[name] = str(cfg.default)
|
|
308
|
+
|
|
309
|
+
# Apply dataset-specific overrides if dataset_name is provided
|
|
310
|
+
if dataset_name is not None:
|
|
311
|
+
dataset_cfg = self.datasets.get(dataset_name)
|
|
312
|
+
if dataset_cfg:
|
|
313
|
+
for cfg_override in dataset_cfg.configurables:
|
|
314
|
+
defaults[cfg_override.name] = cfg_override.default
|
|
315
|
+
|
|
316
|
+
return defaults
|
|
199
317
|
|
|
200
318
|
|
|
201
319
|
class ManifestIO:
|
|
202
320
|
|
|
203
321
|
@classmethod
|
|
204
|
-
def load_from_file(cls, logger:
|
|
322
|
+
def load_from_file(cls, logger: u.Logger, base_path: str, env_vars: dict[str, str]) -> ManifestConfig:
|
|
205
323
|
start = time.time()
|
|
206
324
|
|
|
207
|
-
raw_content =
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
325
|
+
raw_content = u.read_file(u.Path(base_path, c.MANIFEST_FILE))
|
|
326
|
+
content = u.render_string(raw_content, base_path=base_path, env_vars=env_vars)
|
|
327
|
+
manifest_content: dict[str, Any] = yaml.safe_load(content)
|
|
328
|
+
|
|
329
|
+
auth_cfg: dict[str, Any] = manifest_content.get("authentication", {})
|
|
330
|
+
is_auth_required = bool(auth_cfg.get("is_required", False))
|
|
331
|
+
|
|
332
|
+
if is_auth_required:
|
|
333
|
+
# If authentication is required, assume PROTECTED when scope is not specified
|
|
334
|
+
# while explicitly forbidding PUBLIC (enforced in model validator)
|
|
335
|
+
datasets_raw = manifest_content.get("datasets", [])
|
|
336
|
+
for ds in datasets_raw:
|
|
337
|
+
if isinstance(ds, dict) and "scope" not in ds:
|
|
338
|
+
ds["scope"] = "protected"
|
|
339
|
+
|
|
211
340
|
try:
|
|
212
|
-
manifest_cfg = ManifestConfig(base_path=base_path,
|
|
341
|
+
manifest_cfg = ManifestConfig(base_path=base_path, **manifest_content)
|
|
213
342
|
except ValidationError as e:
|
|
214
|
-
raise
|
|
343
|
+
raise u.ConfigurationError(f"Failed to process {c.MANIFEST_FILE} file. " + str(e)) from e
|
|
215
344
|
|
|
216
345
|
logger.log_activity_time(f"loading {c.MANIFEST_FILE} file", start)
|
|
217
346
|
return manifest_cfg
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
import duckdb, time
|
|
3
|
+
|
|
4
|
+
from . import _utils as u, _connection_set as cs, _models as m
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
|
|
8
|
+
class ModelBuilder:
|
|
9
|
+
_datalake_db_path: str
|
|
10
|
+
_conn_set: cs.ConnectionSet
|
|
11
|
+
_static_models: dict[str, m.StaticModel]
|
|
12
|
+
_conn_args: cs.ConnectionsArgs = field(default_factory=lambda: cs.ConnectionsArgs(".", {}, {}))
|
|
13
|
+
_logger: u.Logger = field(default_factory=lambda: u.Logger(""))
|
|
14
|
+
|
|
15
|
+
def _attach_connections(self, duckdb_conn: duckdb.DuckDBPyConnection) -> None:
|
|
16
|
+
for conn_name, conn_props in self._conn_set.get_connections_as_dict().items():
|
|
17
|
+
if not isinstance(conn_props, m.ConnectionProperties):
|
|
18
|
+
continue
|
|
19
|
+
attach_uri = conn_props.attach_uri_for_duckdb
|
|
20
|
+
if attach_uri is None:
|
|
21
|
+
continue # skip unsupported dialects
|
|
22
|
+
attach_stmt = f"ATTACH IF NOT EXISTS '{attach_uri}' AS db_{conn_name} (READ_ONLY)"
|
|
23
|
+
u.run_duckdb_stmt(self._logger, duckdb_conn, attach_stmt, redacted_values=[attach_uri])
|
|
24
|
+
|
|
25
|
+
async def _build_models(self, duckdb_conn: duckdb.DuckDBPyConnection, select: str | None, full_refresh: bool) -> None:
|
|
26
|
+
"""
|
|
27
|
+
Compile and construct the build models as DuckDB tables.
|
|
28
|
+
"""
|
|
29
|
+
# Compile the build models
|
|
30
|
+
models_list = self._static_models.values() if select is None else [self._static_models[select]]
|
|
31
|
+
for model in models_list:
|
|
32
|
+
model.compile_for_build(self._conn_args, self._static_models)
|
|
33
|
+
|
|
34
|
+
# Find all terminal nodes
|
|
35
|
+
terminal_nodes = set()
|
|
36
|
+
if select is None:
|
|
37
|
+
for model in models_list:
|
|
38
|
+
terminal_nodes.update(model.get_terminal_nodes_for_build(set()))
|
|
39
|
+
for model in models_list:
|
|
40
|
+
model.confirmed_no_cycles = False
|
|
41
|
+
else:
|
|
42
|
+
terminal_nodes.add(select)
|
|
43
|
+
|
|
44
|
+
# Run the build models
|
|
45
|
+
coroutines = []
|
|
46
|
+
for model_name in terminal_nodes:
|
|
47
|
+
model = self._static_models[model_name]
|
|
48
|
+
# await model.build_model(duckdb_conn, full_refresh)
|
|
49
|
+
coro = model.build_model(duckdb_conn, full_refresh)
|
|
50
|
+
coroutines.append(coro)
|
|
51
|
+
await u.asyncio_gather(coroutines)
|
|
52
|
+
|
|
53
|
+
async def build(self, full_refresh: bool, select: str | None) -> None:
|
|
54
|
+
start = time.time()
|
|
55
|
+
|
|
56
|
+
# Connect directly to DuckLake instead of attaching (supports concurrent connections)
|
|
57
|
+
duckdb_conn = u.create_duckdb_connection(self._datalake_db_path)
|
|
58
|
+
|
|
59
|
+
try:
|
|
60
|
+
# Attach connections
|
|
61
|
+
self._attach_connections(duckdb_conn)
|
|
62
|
+
|
|
63
|
+
# Construct build models
|
|
64
|
+
await self._build_models(duckdb_conn, select, full_refresh)
|
|
65
|
+
|
|
66
|
+
finally:
|
|
67
|
+
duckdb_conn.close()
|
|
68
|
+
|
|
69
|
+
self._logger.log_activity_time("TOTAL TIME to build the Virtual Data Lake (VDL)", start)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from pydantic import BaseModel, Field
|
|
3
|
+
|
|
4
|
+
from . import _constants as c
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ColumnCategory(Enum):
|
|
8
|
+
DIMENSION = "dimension"
|
|
9
|
+
MEASURE = "measure"
|
|
10
|
+
MISC = "misc"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ColumnConfig(BaseModel):
|
|
14
|
+
name: str = Field(description="The name of the column")
|
|
15
|
+
type: str = Field(default="", description="The type of the column such as 'string', 'integer', 'float', 'boolean', 'datetime', etc.")
|
|
16
|
+
condition: str = Field(default="", description="The condition of when the column is included")
|
|
17
|
+
description: str = Field(default="", description="The description of the column")
|
|
18
|
+
category: ColumnCategory = Field(default=ColumnCategory.MISC, description="The category of the column, either 'dimension', 'measure', or 'misc'")
|
|
19
|
+
depends_on: set[str] = Field(default_factory=set, description="List of dependent columns")
|
|
20
|
+
pass_through: bool = Field(default=False, description="Whether the column should be passed through to the federate")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ModelConfig(BaseModel):
|
|
24
|
+
description: str = Field(default="", description="The description of the model")
|
|
25
|
+
columns: list[ColumnConfig] = Field(default_factory=list, description="The columns of the model")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class SeedConfig(ModelConfig):
|
|
29
|
+
cast_column_types: bool = Field(default=False, description="Whether the column types should be cast to the appropriate type")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ConnectionInterface(BaseModel):
|
|
33
|
+
connection: str | None = Field(default=None, description="The connection name of the source model / database view")
|
|
34
|
+
|
|
35
|
+
def finalize_connection(self, env_vars: dict[str, str]):
|
|
36
|
+
if self.connection is None:
|
|
37
|
+
self.connection = env_vars.get(c.SQRL_CONNECTIONS_DEFAULT_NAME_USED, "default")
|
|
38
|
+
return self
|
|
39
|
+
|
|
40
|
+
def get_connection(self) -> str:
|
|
41
|
+
assert self.connection is not None, "Connection must be set"
|
|
42
|
+
return self.connection
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class QueryModelConfig(ModelConfig):
|
|
46
|
+
depends_on: set[str] = Field(default_factory=set, description="The dependencies of the model")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class BuildModelConfig(QueryModelConfig):
|
|
50
|
+
materialization: str = Field(default="VIEW", description="The materialization of the model (ignored if Python model which is always a table)")
|
|
51
|
+
|
|
52
|
+
def get_sql_for_build(self, model_name: str, select_query: str) -> str:
|
|
53
|
+
if self.materialization.upper() == "TABLE":
|
|
54
|
+
materialization = "TABLE"
|
|
55
|
+
elif self.materialization.upper() == "VIEW":
|
|
56
|
+
materialization = "VIEW"
|
|
57
|
+
else:
|
|
58
|
+
raise ValueError(f"Invalid materialization: {self.materialization}")
|
|
59
|
+
|
|
60
|
+
create_prefix = f"CREATE OR REPLACE {materialization} {model_name} AS\n\n"
|
|
61
|
+
return create_prefix + select_query
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class DbviewModelConfig(ConnectionInterface, QueryModelConfig):
|
|
65
|
+
translate_to_duckdb: bool = Field(default=False, description="Whether to translate the query to DuckDB and use DuckDB tables at runtime")
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class FederateModelConfig(QueryModelConfig):
|
|
69
|
+
eager: bool = Field(default=False, description="Whether the model should be materialized for SQL models")
|
|
70
|
+
|
|
71
|
+
def get_sql_for_create(self, model_name: str, select_query: str) -> str:
|
|
72
|
+
materialization = "TABLE" if self.eager else "VIEW"
|
|
73
|
+
create_prefix = f"CREATE {materialization} {model_name} AS\n\n"
|
|
74
|
+
return create_prefix + select_query
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from abc import ABCMeta
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import Callable, Generic, TypeVar, Any
|
|
4
|
+
import polars as pl, pandas as pd
|
|
5
|
+
|
|
6
|
+
from ._arguments.run_time_args import BuildModelArgs
|
|
7
|
+
from ._model_configs import ModelConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Input query file classes
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
|
|
13
|
+
class QueryFile(metaclass=ABCMeta):
|
|
14
|
+
filepath: str
|
|
15
|
+
raw_query: Any
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
|
|
18
|
+
class SqlQueryFile(QueryFile):
|
|
19
|
+
raw_query: str
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
|
|
22
|
+
class PyQueryFile(QueryFile):
|
|
23
|
+
raw_query: Callable[[BuildModelArgs], pl.LazyFrame | pd.DataFrame]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
Q = TypeVar('Q', bound=QueryFile)
|
|
27
|
+
M = TypeVar('M', bound=ModelConfig)
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
|
|
30
|
+
class QueryFileWithConfig(Generic[Q, M]):
|
|
31
|
+
query_file: Q
|
|
32
|
+
config: M
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Compiled query classes
|
|
36
|
+
|
|
37
|
+
@dataclass
|
|
38
|
+
class Query(metaclass=ABCMeta):
|
|
39
|
+
query: Any
|
|
40
|
+
|
|
41
|
+
@dataclass
|
|
42
|
+
class WorkInProgress(Query):
|
|
43
|
+
query: None = field(default=None, init=False)
|
|
44
|
+
|
|
45
|
+
@dataclass
|
|
46
|
+
class SqlModelQuery(Query):
|
|
47
|
+
query: str
|
|
48
|
+
is_duckdb: bool
|
|
49
|
+
|
|
50
|
+
@dataclass
|
|
51
|
+
class PyModelQuery(Query):
|
|
52
|
+
query: Callable[[], pl.LazyFrame | pd.DataFrame]
|