squirrels 0.3.3__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of squirrels might be problematic. Click here for more details.
- squirrels/__init__.py +7 -3
- squirrels/_api_response_models.py +96 -72
- squirrels/_api_server.py +375 -201
- squirrels/_authenticator.py +23 -22
- squirrels/_command_line.py +70 -46
- squirrels/_connection_set.py +23 -25
- squirrels/_constants.py +29 -78
- squirrels/_dashboards_io.py +61 -0
- squirrels/_environcfg.py +53 -50
- squirrels/_initializer.py +184 -141
- squirrels/_manifest.py +168 -195
- squirrels/_models.py +159 -292
- squirrels/_package_loader.py +7 -8
- squirrels/_parameter_configs.py +173 -141
- squirrels/_parameter_sets.py +49 -38
- squirrels/_py_module.py +7 -7
- squirrels/_seeds.py +13 -12
- squirrels/_utils.py +114 -54
- squirrels/_version.py +1 -1
- squirrels/arguments/init_time_args.py +16 -10
- squirrels/arguments/run_time_args.py +89 -24
- squirrels/dashboards.py +82 -0
- squirrels/data_sources.py +212 -232
- squirrels/dateutils.py +29 -26
- squirrels/package_data/assets/index.css +1 -1
- squirrels/package_data/assets/index.js +27 -18
- squirrels/package_data/base_project/.gitignore +2 -2
- squirrels/package_data/base_project/connections.yml +1 -1
- squirrels/package_data/base_project/dashboards/dashboard_example.py +32 -0
- squirrels/package_data/base_project/dashboards.yml +10 -0
- squirrels/package_data/base_project/docker/.dockerignore +9 -4
- squirrels/package_data/base_project/docker/Dockerfile +7 -6
- squirrels/package_data/base_project/docker/compose.yml +1 -1
- squirrels/package_data/base_project/env.yml +2 -2
- squirrels/package_data/base_project/models/dbviews/{database_view1.py → dbview_example.py} +2 -1
- squirrels/package_data/base_project/models/dbviews/{database_view1.sql → dbview_example.sql} +3 -2
- squirrels/package_data/base_project/models/federates/{dataset_example.py → federate_example.py} +6 -6
- squirrels/package_data/base_project/models/federates/{dataset_example.sql → federate_example.sql} +1 -1
- squirrels/package_data/base_project/parameters.yml +6 -4
- squirrels/package_data/base_project/pyconfigs/auth.py +1 -1
- squirrels/package_data/base_project/pyconfigs/connections.py +1 -1
- squirrels/package_data/base_project/pyconfigs/context.py +38 -10
- squirrels/package_data/base_project/pyconfigs/parameters.py +15 -7
- squirrels/package_data/base_project/squirrels.yml.j2 +14 -7
- squirrels/package_data/templates/index.html +3 -3
- squirrels/parameter_options.py +103 -106
- squirrels/parameters.py +347 -195
- squirrels/project.py +378 -0
- squirrels/user_base.py +14 -6
- {squirrels-0.3.3.dist-info → squirrels-0.4.1.dist-info}/METADATA +9 -21
- squirrels-0.4.1.dist-info/RECORD +60 -0
- squirrels/_timer.py +0 -23
- squirrels-0.3.3.dist-info/RECORD +0 -56
- {squirrels-0.3.3.dist-info → squirrels-0.4.1.dist-info}/LICENSE +0 -0
- {squirrels-0.3.3.dist-info → squirrels-0.4.1.dist-info}/WHEEL +0 -0
- {squirrels-0.3.3.dist-info → squirrels-0.4.1.dist-info}/entry_points.txt +0 -0
squirrels/_parameter_sets.py
CHANGED
|
@@ -2,15 +2,14 @@ from __future__ import annotations
|
|
|
2
2
|
from typing import Optional, Sequence
|
|
3
3
|
from dataclasses import dataclass, field
|
|
4
4
|
from collections import OrderedDict
|
|
5
|
-
import concurrent.futures, pandas as pd
|
|
5
|
+
import time, concurrent.futures, pandas as pd
|
|
6
6
|
|
|
7
|
-
from . import _utils as u, _constants as c, parameters as p, _parameter_configs as
|
|
7
|
+
from . import _utils as u, _constants as c, parameters as p, _parameter_configs as _pc, _py_module as pm, _api_response_models as arm
|
|
8
8
|
from .arguments.init_time_args import ParametersArgs
|
|
9
|
-
from ._manifest import
|
|
10
|
-
from ._connection_set import
|
|
11
|
-
from ._seeds import
|
|
9
|
+
from ._manifest import ParametersConfig, ManifestConfig
|
|
10
|
+
from ._connection_set import ConnectionSet, ConnectionsArgs
|
|
11
|
+
from ._seeds import Seeds
|
|
12
12
|
from .user_base import User
|
|
13
|
-
from ._timer import timer, time
|
|
14
13
|
|
|
15
14
|
|
|
16
15
|
@dataclass
|
|
@@ -31,25 +30,25 @@ class ParameterSet:
|
|
|
31
30
|
|
|
32
31
|
|
|
33
32
|
@dataclass
|
|
34
|
-
class
|
|
33
|
+
class ParameterConfigsSet:
|
|
35
34
|
"""
|
|
36
35
|
Pool of parameter configs, can create multiple for unit testing purposes
|
|
37
36
|
"""
|
|
38
|
-
_data: dict[str,
|
|
39
|
-
_data_source_params: dict[str,
|
|
37
|
+
_data: dict[str, _pc.ParameterConfigBase] = field(default_factory=OrderedDict)
|
|
38
|
+
_data_source_params: dict[str, _pc.DataSourceParameterConfig] = field(default_factory=dict)
|
|
40
39
|
|
|
41
|
-
def get(self, name: Optional[str]) -> Optional[
|
|
40
|
+
def get(self, name: Optional[str]) -> Optional[_pc.ParameterConfigBase]:
|
|
42
41
|
try:
|
|
43
42
|
return self._data[name] if name is not None else None
|
|
44
43
|
except KeyError as e:
|
|
45
44
|
raise u.ConfigurationError(f'Unable to find parameter named "{name}"') from e
|
|
46
45
|
|
|
47
|
-
def add(self, param_config:
|
|
46
|
+
def add(self, param_config: _pc.ParameterConfigBase) -> None:
|
|
48
47
|
self._data[param_config.name] = param_config
|
|
49
|
-
if isinstance(param_config,
|
|
48
|
+
if isinstance(param_config, _pc.DataSourceParameterConfig):
|
|
50
49
|
self._data_source_params[param_config.name] = param_config
|
|
51
50
|
|
|
52
|
-
def _get_all_ds_param_configs(self) -> Sequence[
|
|
51
|
+
def _get_all_ds_param_configs(self) -> Sequence[_pc.DataSourceParameterConfig]:
|
|
53
52
|
return list(self._data_source_params.values())
|
|
54
53
|
|
|
55
54
|
def __convert_datasource_params(self, df_dict: dict[str, pd.DataFrame]) -> None:
|
|
@@ -60,11 +59,12 @@ class _ParameterConfigsSet:
|
|
|
60
59
|
name = stack[-1]
|
|
61
60
|
if name not in done:
|
|
62
61
|
param = self._data_source_params.get(name, self.get(name))
|
|
62
|
+
assert param is not None
|
|
63
63
|
parent_name = param.parent_name
|
|
64
64
|
if parent_name is not None and parent_name not in done:
|
|
65
65
|
stack.append(parent_name)
|
|
66
66
|
continue
|
|
67
|
-
if isinstance(param,
|
|
67
|
+
if isinstance(param, _pc.DataSourceParameterConfig):
|
|
68
68
|
if name not in df_dict:
|
|
69
69
|
raise u.ConfigurationError(f'No reference data found for parameter "{name}"')
|
|
70
70
|
self._data[name] = param.convert(df_dict[name])
|
|
@@ -73,12 +73,12 @@ class _ParameterConfigsSet:
|
|
|
73
73
|
|
|
74
74
|
def __validate_param_relationships(self) -> None:
|
|
75
75
|
for param_config in self._data.values():
|
|
76
|
-
assert isinstance(param_config,
|
|
76
|
+
assert isinstance(param_config, _pc.ParameterConfig)
|
|
77
77
|
parent_name = param_config.parent_name
|
|
78
78
|
parent = self.get(parent_name)
|
|
79
79
|
if parent:
|
|
80
|
-
if not isinstance(param_config,
|
|
81
|
-
if not isinstance(parent,
|
|
80
|
+
if not isinstance(param_config, _pc.SelectionParameterConfig):
|
|
81
|
+
if not isinstance(parent, _pc.SingleSelectParameterConfig):
|
|
82
82
|
raise u.ConfigurationError(f'Only single-select parameters can be parents of non-select parameters. ' +
|
|
83
83
|
f'Parameter "{parent_name}" is the parent of non-select parameter ' +
|
|
84
84
|
f'"{param_config.name}" but "{parent_name}" is not a single-select parameter.')
|
|
@@ -92,7 +92,7 @@ class _ParameterConfigsSet:
|
|
|
92
92
|
f'among the options of non-select parameter "{param_config.name}".')
|
|
93
93
|
seen.update(lookup_keys)
|
|
94
94
|
|
|
95
|
-
if not isinstance(parent,
|
|
95
|
+
if not isinstance(parent, _pc.SelectionParameterConfig):
|
|
96
96
|
raise u.ConfigurationError(f'Only selection parameters can be parents. Parameter "{parent_name}" is the parent of ' +
|
|
97
97
|
f'"{param_config.name}" but "{parent_name}" is not a selection parameter.')
|
|
98
98
|
|
|
@@ -107,7 +107,7 @@ class _ParameterConfigsSet:
|
|
|
107
107
|
*, updates_only: bool = False, request_version: Optional[int] = None
|
|
108
108
|
) -> ParameterSet:
|
|
109
109
|
if dataset_params is None:
|
|
110
|
-
dataset_params = self._data.keys()
|
|
110
|
+
dataset_params = list(self._data.keys())
|
|
111
111
|
|
|
112
112
|
parameters_by_name: dict[str, p.Parameter] = {}
|
|
113
113
|
params_to_process = selections.keys() if selections and updates_only else dataset_params
|
|
@@ -119,6 +119,7 @@ class _ParameterConfigsSet:
|
|
|
119
119
|
children = []
|
|
120
120
|
if curr_name not in parameters_by_name:
|
|
121
121
|
param_conf = self.get(curr_name)
|
|
122
|
+
assert isinstance(param_conf, _pc.ParameterConfig)
|
|
122
123
|
parent_name = param_conf.parent_name
|
|
123
124
|
if parent_name is None:
|
|
124
125
|
parent = None
|
|
@@ -127,9 +128,10 @@ class _ParameterConfigsSet:
|
|
|
127
128
|
continue
|
|
128
129
|
else:
|
|
129
130
|
parent = parameters_by_name.get(parent_name)
|
|
131
|
+
assert isinstance(parent, p._SelectionParameter) or parent is None
|
|
130
132
|
param = param_conf.with_selection(selections.get(curr_name), user, parent)
|
|
131
133
|
parameters_by_name[curr_name] = param
|
|
132
|
-
if isinstance(param_conf,
|
|
134
|
+
if isinstance(param_conf, _pc.SelectionParameterConfig):
|
|
133
135
|
children = list(x for x in param_conf.children.keys() if x in dataset_params)
|
|
134
136
|
stack.pop()
|
|
135
137
|
stack.extend(children)
|
|
@@ -137,21 +139,24 @@ class _ParameterConfigsSet:
|
|
|
137
139
|
ordered_parameters = OrderedDict((key, parameters_by_name[key]) for key in dataset_params if key in parameters_by_name)
|
|
138
140
|
return ParameterSet(ordered_parameters)
|
|
139
141
|
|
|
140
|
-
def get_all_api_field_info(self) -> dict[str,
|
|
141
|
-
|
|
142
|
+
def get_all_api_field_info(self) -> dict[str, _pc.APIParamFieldInfo]:
|
|
143
|
+
api_field_infos = {}
|
|
144
|
+
for param, config in self._data.items():
|
|
145
|
+
assert isinstance(config, _pc.ParameterConfig)
|
|
146
|
+
api_field_infos[param] = config.get_api_field_info()
|
|
147
|
+
return api_field_infos
|
|
142
148
|
|
|
143
149
|
|
|
144
150
|
class ParameterConfigsSetIO:
|
|
145
151
|
"""
|
|
146
|
-
Static class for the singleton object of
|
|
152
|
+
Static class for the singleton object of ParameterConfigsSet
|
|
147
153
|
"""
|
|
148
|
-
|
|
149
|
-
obj: _ParameterConfigsSet
|
|
154
|
+
obj: ParameterConfigsSet # this is static (set in load_from_file) to simplify development experience for pyconfigs/parameters.py
|
|
150
155
|
|
|
151
156
|
@classmethod
|
|
152
|
-
def
|
|
153
|
-
def get_dataframe(ds_param_config:
|
|
154
|
-
return ds_param_config.name, ds_param_config.get_dataframe(
|
|
157
|
+
def _get_df_dict_from_data_sources(cls, default_conn_name: str, seeds: Seeds, conn_set: ConnectionSet) -> dict[str, pd.DataFrame]:
|
|
158
|
+
def get_dataframe(ds_param_config: _pc.DataSourceParameterConfig) -> tuple[str, pd.DataFrame]:
|
|
159
|
+
return ds_param_config.name, ds_param_config.get_dataframe(default_conn_name, conn_set, seeds)
|
|
155
160
|
|
|
156
161
|
ds_param_configs = cls.obj._get_all_ds_param_configs()
|
|
157
162
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
@@ -160,24 +165,30 @@ class ParameterConfigsSetIO:
|
|
|
160
165
|
return df_dict
|
|
161
166
|
|
|
162
167
|
@classmethod
|
|
163
|
-
def
|
|
168
|
+
def _add_from_dict(cls, param_config: ParametersConfig) -> None:
|
|
164
169
|
ptype = getattr(p, param_config.type)
|
|
165
170
|
factory = getattr(ptype, param_config.factory)
|
|
166
171
|
factory(**param_config.arguments)
|
|
167
172
|
|
|
168
173
|
@classmethod
|
|
169
|
-
def
|
|
174
|
+
def get_param_args(cls, conn_args: ConnectionsArgs) -> ParametersArgs:
|
|
175
|
+
return ParametersArgs(conn_args.proj_vars, conn_args.env_vars)
|
|
176
|
+
|
|
177
|
+
@classmethod
|
|
178
|
+
def load_from_file(
|
|
179
|
+
cls, logger: u.Logger, base_path: str, manifest_cfg: ManifestConfig, seeds: Seeds, conn_set: ConnectionSet, param_args: ParametersArgs
|
|
180
|
+
) -> ParameterConfigsSet:
|
|
170
181
|
start = time.time()
|
|
171
|
-
cls.obj =
|
|
182
|
+
cls.obj = ParameterConfigsSet()
|
|
172
183
|
|
|
173
|
-
for param_as_dict in
|
|
174
|
-
cls.
|
|
184
|
+
for param_as_dict in manifest_cfg.parameters:
|
|
185
|
+
cls._add_from_dict(param_as_dict)
|
|
175
186
|
|
|
176
|
-
|
|
177
|
-
cls.args = ParametersArgs(conn_args.proj_vars, conn_args.env_vars)
|
|
178
|
-
pm.run_pyconfig_main(c.PARAMETERS_FILE, {"sqrl": cls.args})
|
|
187
|
+
pm.run_pyconfig_main(base_path, c.PARAMETERS_FILE, {"sqrl": param_args})
|
|
179
188
|
|
|
180
|
-
|
|
189
|
+
default_conn_name = manifest_cfg.settings_obj.get_default_connection_name()
|
|
190
|
+
df_dict = cls._get_df_dict_from_data_sources(default_conn_name, seeds, conn_set)
|
|
181
191
|
cls.obj._post_process_params(df_dict)
|
|
182
192
|
|
|
183
|
-
|
|
193
|
+
logger.log_activity_time("loading parameters", start)
|
|
194
|
+
return cls.obj
|
squirrels/_py_module.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
from typing import Type, Optional, Any
|
|
2
|
-
from types import ModuleType
|
|
3
2
|
import importlib.util
|
|
4
3
|
|
|
5
4
|
from . import _constants as c, _utils as u
|
|
@@ -10,25 +9,26 @@ class PyModule:
|
|
|
10
9
|
"""
|
|
11
10
|
Constructor for PyModule, an abstract module for a file that may or may not exist
|
|
12
11
|
|
|
13
|
-
|
|
12
|
+
Arguments:
|
|
14
13
|
filepath (str | pathlib.Path): The file path to the python module
|
|
15
14
|
is_required: If true, throw an error if the file path doesn't exist
|
|
16
15
|
"""
|
|
17
16
|
self.filepath = str(filepath)
|
|
18
17
|
try:
|
|
19
18
|
spec = importlib.util.spec_from_file_location(self.filepath, self.filepath)
|
|
19
|
+
assert spec is not None and spec.loader is not None
|
|
20
20
|
self.module = importlib.util.module_from_spec(spec)
|
|
21
21
|
spec.loader.exec_module(self.module)
|
|
22
22
|
except FileNotFoundError as e:
|
|
23
23
|
if is_required:
|
|
24
24
|
raise u.ConfigurationError(f"Required file not found: '{self.filepath}'") from e
|
|
25
|
-
self.module
|
|
25
|
+
self.module = default_class
|
|
26
26
|
|
|
27
27
|
def get_func_or_class(self, attr_name: str, *, default_attr: Any = None, is_required: bool = True) -> Any:
|
|
28
28
|
"""
|
|
29
29
|
Get an attribute of the module. Usually a python function or class.
|
|
30
30
|
|
|
31
|
-
|
|
31
|
+
Arguments:
|
|
32
32
|
attr_name: The attribute name
|
|
33
33
|
default_attr: The default function or class to use if the attribute cannot be found
|
|
34
34
|
is_required: If true, throw an error if the attribute cannot be found, unless default_attr is not None
|
|
@@ -42,15 +42,15 @@ class PyModule:
|
|
|
42
42
|
return func_or_class
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
def run_pyconfig_main(filename: str, kwargs: dict[str, Any] = {}) -> None:
|
|
45
|
+
def run_pyconfig_main(base_path: str, filename: str, kwargs: dict[str, Any] = {}) -> None:
|
|
46
46
|
"""
|
|
47
47
|
Given a python file in the 'pyconfigs' folder, run its main function
|
|
48
48
|
|
|
49
|
-
|
|
49
|
+
Arguments:
|
|
50
50
|
filename: The name of the file to run main function
|
|
51
51
|
kwargs: Dictionary of the main function arguments
|
|
52
52
|
"""
|
|
53
|
-
filepath = u.
|
|
53
|
+
filepath = u.Path(base_path, c.PYCONFIGS_FOLDER, filename)
|
|
54
54
|
module = PyModule(filepath)
|
|
55
55
|
main_function = module.get_func_or_class(c.MAIN_FUNC, is_required=False)
|
|
56
56
|
if main_function:
|
squirrels/_seeds.py
CHANGED
|
@@ -1,38 +1,39 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
|
-
import os, glob, pandas as pd
|
|
2
|
+
import os, time, glob, pandas as pd
|
|
3
3
|
|
|
4
|
-
from .
|
|
5
|
-
from .
|
|
6
|
-
from . import _utils as u, _constants as c
|
|
4
|
+
from ._manifest import ManifestConfig
|
|
5
|
+
from . import _utils as _u, _constants as c
|
|
7
6
|
|
|
8
7
|
|
|
9
8
|
@dataclass
|
|
10
9
|
class Seeds:
|
|
11
10
|
_data: dict[str, pd.DataFrame]
|
|
11
|
+
_manifest_cfg: ManifestConfig
|
|
12
12
|
|
|
13
13
|
def run_query(self, sql_query: str) -> pd.DataFrame:
|
|
14
|
-
|
|
14
|
+
use_duckdb = self._manifest_cfg.settings_obj.do_use_duckdb()
|
|
15
|
+
return _u.run_sql_on_dataframes(sql_query, self._data, use_duckdb)
|
|
15
16
|
|
|
16
17
|
def get_dataframes(self) -> dict[str, pd.DataFrame]:
|
|
17
18
|
return self._data.copy()
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
class SeedsIO:
|
|
21
|
-
obj: Seeds
|
|
22
22
|
|
|
23
23
|
@classmethod
|
|
24
|
-
def
|
|
24
|
+
def load_files(cls, logger: _u.Logger, base_path: str, manifest_cfg: ManifestConfig) -> Seeds:
|
|
25
25
|
start = time.time()
|
|
26
|
-
infer_schema: bool =
|
|
27
|
-
na_values: list[str] =
|
|
26
|
+
infer_schema: bool = manifest_cfg.settings.get(c.SEEDS_INFER_SCHEMA_SETTING, True)
|
|
27
|
+
na_values: list[str] = manifest_cfg.settings.get(c.SEEDS_NA_VALUES_SETTING, ["NA"])
|
|
28
28
|
csv_dtype = None if infer_schema else str
|
|
29
29
|
|
|
30
30
|
seeds_dict = {}
|
|
31
|
-
csv_files = glob.glob(os.path.join(c.SEEDS_FOLDER, '**/*.csv'), recursive=True)
|
|
31
|
+
csv_files = glob.glob(os.path.join(base_path, c.SEEDS_FOLDER, '**/*.csv'), recursive=True)
|
|
32
32
|
for csv_file in csv_files:
|
|
33
33
|
file_stem = os.path.splitext(os.path.basename(csv_file))[0]
|
|
34
34
|
df = pd.read_csv(csv_file, dtype=csv_dtype, keep_default_na=False, na_values=na_values)
|
|
35
35
|
seeds_dict[file_stem] = df
|
|
36
36
|
|
|
37
|
-
|
|
38
|
-
|
|
37
|
+
seeds = Seeds(seeds_dict, manifest_cfg)
|
|
38
|
+
logger.log_activity_time("loading seed files", start)
|
|
39
|
+
return seeds
|
squirrels/_utils.py
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
from typing import Sequence, Optional, Union, TypeVar, Callable
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
|
|
3
|
+
from pandas.api import types as pd_types
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
import os, time, logging, json, sqlite3, pandas as pd
|
|
6
|
+
import jinja2 as j2, jinja2.nodes as j2_nodes
|
|
4
7
|
|
|
5
8
|
from . import _constants as c
|
|
6
9
|
|
|
@@ -21,75 +24,114 @@ class ConfigurationError(Exception):
|
|
|
21
24
|
"""
|
|
22
25
|
pass
|
|
23
26
|
|
|
24
|
-
class FileExecutionError(
|
|
27
|
+
class FileExecutionError(Exception):
|
|
25
28
|
def __init__(self, message: str, error: Exception, *args) -> None:
|
|
26
|
-
|
|
29
|
+
t = " "
|
|
30
|
+
new_message = f"\n" + message + f"\n{t}Produced error message:\n{t}{t}{error} (see above for more details on handled exception)"
|
|
27
31
|
super().__init__(new_message, *args)
|
|
32
|
+
self.error = error
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
## Other utility classes
|
|
36
|
+
|
|
37
|
+
class Logger(logging.Logger):
|
|
38
|
+
def log_activity_time(self, activity: str, start_timestamp: float, *, request_id: str | None = None) -> None:
|
|
39
|
+
end_timestamp = time.time()
|
|
40
|
+
time_taken = round((end_timestamp-start_timestamp) * 10**3, 3)
|
|
41
|
+
data = { "activity": activity, "start_timestamp": start_timestamp, "end_timestamp": end_timestamp, "time_taken_ms": time_taken }
|
|
42
|
+
info = { "request_id": request_id } if request_id else {}
|
|
43
|
+
self.debug(f'Time taken for "{activity}": {time_taken}ms', extra={"data": data, "info": info})
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class EnvironmentWithMacros(j2.Environment):
|
|
47
|
+
def __init__(self, logger: logging.Logger, loader: j2.FileSystemLoader, *args, **kwargs):
|
|
48
|
+
super().__init__(*args, loader=loader, **kwargs)
|
|
49
|
+
self._logger = logger
|
|
50
|
+
self._macros = self._load_macro_templates(logger)
|
|
51
|
+
|
|
52
|
+
def _load_macro_templates(self, logger: logging.Logger) -> str:
|
|
53
|
+
macros_dirs = self._get_macro_folders_from_packages()
|
|
54
|
+
macro_templates = []
|
|
55
|
+
for macros_dir in macros_dirs:
|
|
56
|
+
for root, _, files in os.walk(macros_dir):
|
|
57
|
+
files: list[str]
|
|
58
|
+
for filename in files:
|
|
59
|
+
if any(filename.endswith(x) for x in [".sql", ".j2", ".jinja", ".jinja2"]):
|
|
60
|
+
filepath = Path(root, filename)
|
|
61
|
+
logger.info(f"Loaded macros from: {filepath}")
|
|
62
|
+
with open(filepath, 'r') as f:
|
|
63
|
+
content = f.read()
|
|
64
|
+
macro_templates.append(content)
|
|
65
|
+
return '\n'.join(macro_templates)
|
|
66
|
+
|
|
67
|
+
def _get_macro_folders_from_packages(self) -> list[Path]:
|
|
68
|
+
assert isinstance(self.loader, j2.FileSystemLoader)
|
|
69
|
+
packages_folder = Path(self.loader.searchpath[0], c.PACKAGES_FOLDER)
|
|
70
|
+
|
|
71
|
+
subdirectories = []
|
|
72
|
+
if os.path.exists(packages_folder):
|
|
73
|
+
for item in os.listdir(packages_folder):
|
|
74
|
+
item_path = Path(packages_folder, item)
|
|
75
|
+
if os.path.isdir(item_path):
|
|
76
|
+
subdirectories.append(Path(item_path, c.MACROS_FOLDER))
|
|
77
|
+
|
|
78
|
+
subdirectories.append(Path(self.loader.searchpath[0], c.MACROS_FOLDER))
|
|
79
|
+
return subdirectories
|
|
28
80
|
|
|
81
|
+
def _parse(self, source: str, name: str | None, filename: str | None) -> j2_nodes.Template:
|
|
82
|
+
source = self._macros + source
|
|
83
|
+
return super()._parse(source, name, filename)
|
|
29
84
|
|
|
30
|
-
## Utility functions/variables
|
|
31
|
-
|
|
32
|
-
def join_paths(*paths: FilePath) -> Path:
|
|
33
|
-
"""
|
|
34
|
-
Joins paths together.
|
|
35
|
-
|
|
36
|
-
Parameters:
|
|
37
|
-
paths (str | pathlib.Path): The paths to join.
|
|
38
85
|
|
|
39
|
-
|
|
40
|
-
(pathlib.Path) The joined path.
|
|
41
|
-
"""
|
|
42
|
-
return Path(*paths)
|
|
86
|
+
## Utility functions/variables
|
|
43
87
|
|
|
88
|
+
def log_activity_time(logger: logging.Logger, activity: str, start_timestamp: float, *, request_id: str | None = None) -> None:
|
|
89
|
+
end_timestamp = time.time()
|
|
90
|
+
time_taken = round((end_timestamp-start_timestamp) * 10**3, 3)
|
|
91
|
+
data = { "activity": activity, "start_timestamp": start_timestamp, "end_timestamp": end_timestamp, "time_taken_ms": time_taken }
|
|
92
|
+
info = { "request_id": request_id } if request_id else {}
|
|
93
|
+
logger.debug(f'Time taken for "{activity}": {time_taken}ms', extra={"data": data, "info": info})
|
|
44
94
|
|
|
45
|
-
_j2_env = j2.Environment(loader=j2.FileSystemLoader('.'))
|
|
46
95
|
|
|
47
|
-
def render_string(raw_str: str, **kwargs
|
|
96
|
+
def render_string(raw_str: str, *, base_path: str = ".", **kwargs) -> str:
|
|
48
97
|
"""
|
|
49
98
|
Given a template string, render it with the given keyword arguments
|
|
50
99
|
|
|
51
|
-
|
|
100
|
+
Arguments:
|
|
52
101
|
raw_str: The template string
|
|
53
102
|
kwargs: The keyword arguments
|
|
54
103
|
|
|
55
104
|
Returns:
|
|
56
105
|
The rendered string
|
|
57
106
|
"""
|
|
58
|
-
|
|
107
|
+
j2_env = j2.Environment(loader=j2.FileSystemLoader(base_path))
|
|
108
|
+
template = j2_env.from_string(raw_str)
|
|
59
109
|
return template.render(kwargs)
|
|
60
110
|
|
|
61
111
|
|
|
62
|
-
|
|
63
|
-
def __process_file_handler(file_handler: Callable[[FilePath], T], filepath: FilePath, is_required: bool) -> Optional[T]:
|
|
64
|
-
try:
|
|
65
|
-
return file_handler(filepath)
|
|
66
|
-
except FileNotFoundError as e:
|
|
67
|
-
if is_required:
|
|
68
|
-
raise ConfigurationError(f"Required file not found: '{str(filepath)}'") from e
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def read_file(filepath: FilePath, *, is_required: bool = True) -> Optional[str]:
|
|
112
|
+
def read_file(filepath: FilePath) -> str:
|
|
72
113
|
"""
|
|
73
114
|
Reads a file and return its content if required
|
|
74
115
|
|
|
75
|
-
|
|
116
|
+
Arguments:
|
|
76
117
|
filepath (str | pathlib.Path): The path to the file to read
|
|
77
118
|
is_required: If true, throw error if file doesn't exist
|
|
78
119
|
|
|
79
120
|
Returns:
|
|
80
121
|
Content of the file, or None if doesn't exist and not required
|
|
81
122
|
"""
|
|
82
|
-
|
|
123
|
+
try:
|
|
83
124
|
with open(filepath, 'r') as f:
|
|
84
125
|
return f.read()
|
|
85
|
-
|
|
126
|
+
except FileNotFoundError as e:
|
|
127
|
+
raise ConfigurationError(f"Required file not found: '{str(filepath)}'") from e
|
|
86
128
|
|
|
87
129
|
|
|
88
130
|
def normalize_name(name: str) -> str:
|
|
89
131
|
"""
|
|
90
132
|
Normalizes names to the convention of the squirrels manifest file.
|
|
91
133
|
|
|
92
|
-
|
|
134
|
+
Arguments:
|
|
93
135
|
name: The name to normalize.
|
|
94
136
|
|
|
95
137
|
Returns:
|
|
@@ -102,7 +144,7 @@ def normalize_name_for_api(name: str) -> str:
|
|
|
102
144
|
"""
|
|
103
145
|
Normalizes names to the REST API convention.
|
|
104
146
|
|
|
105
|
-
|
|
147
|
+
Arguments:
|
|
106
148
|
name: The name to normalize.
|
|
107
149
|
|
|
108
150
|
Returns:
|
|
@@ -115,7 +157,7 @@ def load_json_or_comma_delimited_str_as_list(input_str: Union[str, Sequence]) ->
|
|
|
115
157
|
"""
|
|
116
158
|
Given a string, load it as a list either by json string or comma delimited value
|
|
117
159
|
|
|
118
|
-
|
|
160
|
+
Arguments:
|
|
119
161
|
input_str: The input string
|
|
120
162
|
|
|
121
163
|
Returns:
|
|
@@ -143,7 +185,7 @@ def process_if_not_none(input_val: Optional[X], processor: Callable[[X], Y]) ->
|
|
|
143
185
|
"""
|
|
144
186
|
Given a input value and a function that processes the value, return the output of the function unless input is None
|
|
145
187
|
|
|
146
|
-
|
|
188
|
+
Arguments:
|
|
147
189
|
input_val: The input value
|
|
148
190
|
processor: The function that processes the input value
|
|
149
191
|
|
|
@@ -155,42 +197,60 @@ def process_if_not_none(input_val: Optional[X], processor: Callable[[X], Y]) ->
|
|
|
155
197
|
return processor(input_val)
|
|
156
198
|
|
|
157
199
|
|
|
158
|
-
def
|
|
159
|
-
"""
|
|
160
|
-
Determines whether to use DuckDB instead of SQLite for embedded database
|
|
161
|
-
|
|
162
|
-
Returns:
|
|
163
|
-
A boolean
|
|
164
|
-
"""
|
|
165
|
-
from ._manifest import ManifestIO
|
|
166
|
-
return (ManifestIO.obj.settings.get(c.IN_MEMORY_DB_SETTING, c.SQLITE) == c.DUCKDB)
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
def run_sql_on_dataframes(sql_query: str, dataframes: dict[str, pd.DataFrame]) -> pd.DataFrame:
|
|
200
|
+
def run_sql_on_dataframes(sql_query: str, dataframes: dict[str, pd.DataFrame], do_use_duckdb: bool) -> pd.DataFrame:
|
|
170
201
|
"""
|
|
171
202
|
Runs a SQL query against a collection of dataframes
|
|
172
203
|
|
|
173
|
-
|
|
204
|
+
Arguments:
|
|
174
205
|
sql_query: The SQL query to run
|
|
175
206
|
dataframes: A dictionary of table names to their pandas Dataframe
|
|
176
207
|
|
|
177
208
|
Returns:
|
|
178
209
|
The result as a pandas Dataframe from running the query
|
|
179
210
|
"""
|
|
180
|
-
do_use_duckdb = use_duckdb()
|
|
181
211
|
if do_use_duckdb:
|
|
182
212
|
import duckdb
|
|
183
|
-
|
|
213
|
+
duckdb_conn = duckdb.connect()
|
|
184
214
|
else:
|
|
185
215
|
conn = sqlite3.connect(":memory:")
|
|
186
216
|
|
|
187
217
|
try:
|
|
188
218
|
for name, df in dataframes.items():
|
|
189
219
|
if do_use_duckdb:
|
|
190
|
-
|
|
220
|
+
duckdb_conn.execute(f"CREATE TABLE {name} AS FROM df")
|
|
191
221
|
else:
|
|
192
222
|
df.to_sql(name, conn, index=False)
|
|
193
223
|
|
|
194
|
-
return
|
|
224
|
+
return duckdb_conn.execute(sql_query).df() if do_use_duckdb else pd.read_sql(sql_query, conn)
|
|
195
225
|
finally:
|
|
196
|
-
conn.close()
|
|
226
|
+
duckdb_conn.close() if do_use_duckdb else conn.close()
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def df_to_json0(df: pd.DataFrame, dimensions: list[str] | None = None) -> dict:
|
|
230
|
+
"""
|
|
231
|
+
Convert a pandas DataFrame to the response format that the dataset result API of Squirrels outputs.
|
|
232
|
+
|
|
233
|
+
Arguments:
|
|
234
|
+
df: The dataframe to convert into an API response
|
|
235
|
+
dimensions: The list of declared dimensions. If None, all non-numeric columns are assumed as dimensions
|
|
236
|
+
|
|
237
|
+
Returns:
|
|
238
|
+
The response of a Squirrels dataset result API
|
|
239
|
+
"""
|
|
240
|
+
in_df_json = json.loads(df.to_json(orient='table', index=False))
|
|
241
|
+
out_fields = []
|
|
242
|
+
non_numeric_fields = []
|
|
243
|
+
for in_column in in_df_json["schema"]["fields"]:
|
|
244
|
+
col_name: str = in_column["name"]
|
|
245
|
+
out_column = { "name": col_name, "type": in_column["type"] }
|
|
246
|
+
out_fields.append(out_column)
|
|
247
|
+
|
|
248
|
+
if not pd_types.is_numeric_dtype(df[col_name].dtype):
|
|
249
|
+
non_numeric_fields.append(col_name)
|
|
250
|
+
|
|
251
|
+
out_dimensions = non_numeric_fields if dimensions is None else dimensions
|
|
252
|
+
dataset_json = {
|
|
253
|
+
"schema": { "fields": out_fields, "dimensions": out_dimensions },
|
|
254
|
+
"data": in_df_json["data"]
|
|
255
|
+
}
|
|
256
|
+
return dataset_json
|
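squirrels/_version.py
CHANGED
(diff body for this file, +1 -1, is not present in this extract)
squirrels/arguments/init_time_args.py
CHANGED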
|
@@ -1,32 +1,38 @@
|
|
|
1
|
-
from typing import
|
|
1
|
+
from typing import Callable, Any
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
@dataclass
|
|
6
6
|
class BaseArguments:
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
_proj_vars: dict[str, Any]
|
|
8
|
+
_env_vars: dict[str, Any]
|
|
9
|
+
|
|
10
|
+
@property
|
|
11
|
+
def proj_vars(self) -> dict[str, Any]:
|
|
12
|
+
return self._proj_vars.copy()
|
|
13
|
+
|
|
14
|
+
@property
|
|
15
|
+
def env_vars(self) -> dict[str, Any]:
|
|
16
|
+
return self._env_vars.copy()
|
|
9
17
|
|
|
10
18
|
|
|
11
19
|
@dataclass
|
|
12
20
|
class ConnectionsArgs(BaseArguments):
|
|
13
|
-
_get_credential: Callable[[str], tuple[str, str]]
|
|
14
|
-
|
|
15
|
-
def __post_init__(self):
|
|
16
|
-
self.get_credential = self._get_credential
|
|
21
|
+
_get_credential: Callable[[str | None], tuple[str, str]]
|
|
17
22
|
|
|
18
|
-
def get_credential(self, key:
|
|
23
|
+
def get_credential(self, key: str | None) -> tuple[str, str]:
|
|
19
24
|
"""
|
|
20
|
-
Return (username, password) tuple configured for credentials key in
|
|
25
|
+
Return (username, password) tuple configured for credentials key in env.yaml
|
|
21
26
|
|
|
22
27
|
If key is None, returns tuple of empty strings ("", "")
|
|
23
28
|
|
|
24
|
-
|
|
29
|
+
Arguments:
|
|
25
30
|
key: The credentials key
|
|
26
31
|
|
|
27
32
|
Returns:
|
|
28
33
|
A tuple of 2 strings
|
|
29
34
|
"""
|
|
35
|
+
return self._get_credential(key)
|
|
30
36
|
|
|
31
37
|
|
|
32
38
|
@dataclass
|