squirrels 0.1.0__py3-none-any.whl → 0.6.0.post0__py3-none-any.whl
This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- dateutils/__init__.py +6 -0
- dateutils/_enums.py +25 -0
- squirrels/dateutils.py → dateutils/_implementation.py +409 -380
- dateutils/types.py +6 -0
- squirrels/__init__.py +21 -18
- squirrels/_api_routes/__init__.py +5 -0
- squirrels/_api_routes/auth.py +337 -0
- squirrels/_api_routes/base.py +196 -0
- squirrels/_api_routes/dashboards.py +156 -0
- squirrels/_api_routes/data_management.py +148 -0
- squirrels/_api_routes/datasets.py +220 -0
- squirrels/_api_routes/project.py +289 -0
- squirrels/_api_server.py +552 -134
- squirrels/_arguments/__init__.py +0 -0
- squirrels/_arguments/init_time_args.py +83 -0
- squirrels/_arguments/run_time_args.py +111 -0
- squirrels/_auth.py +777 -0
- squirrels/_command_line.py +239 -107
- squirrels/_compile_prompts.py +147 -0
- squirrels/_connection_set.py +94 -0
- squirrels/_constants.py +141 -64
- squirrels/_dashboards.py +179 -0
- squirrels/_data_sources.py +570 -0
- squirrels/_dataset_types.py +91 -0
- squirrels/_env_vars.py +209 -0
- squirrels/_exceptions.py +29 -0
- squirrels/_http_error_responses.py +52 -0
- squirrels/_initializer.py +319 -110
- squirrels/_logging.py +121 -0
- squirrels/_manifest.py +357 -187
- squirrels/_mcp_server.py +578 -0
- squirrels/_model_builder.py +69 -0
- squirrels/_model_configs.py +74 -0
- squirrels/_model_queries.py +52 -0
- squirrels/_models.py +1201 -0
- squirrels/_package_data/base_project/.env +7 -0
- squirrels/_package_data/base_project/.env.example +44 -0
- squirrels/_package_data/base_project/connections.yml +16 -0
- squirrels/_package_data/base_project/dashboards/dashboard_example.py +40 -0
- squirrels/_package_data/base_project/dashboards/dashboard_example.yml +22 -0
- squirrels/_package_data/base_project/docker/.dockerignore +16 -0
- squirrels/_package_data/base_project/docker/Dockerfile +16 -0
- squirrels/_package_data/base_project/docker/compose.yml +7 -0
- squirrels/_package_data/base_project/duckdb_init.sql +10 -0
- squirrels/_package_data/base_project/gitignore +13 -0
- squirrels/_package_data/base_project/macros/macros_example.sql +17 -0
- squirrels/_package_data/base_project/models/builds/build_example.py +26 -0
- squirrels/_package_data/base_project/models/builds/build_example.sql +16 -0
- squirrels/_package_data/base_project/models/builds/build_example.yml +57 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.sql +17 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.yml +32 -0
- squirrels/_package_data/base_project/models/federates/federate_example.py +51 -0
- squirrels/_package_data/base_project/models/federates/federate_example.sql +21 -0
- squirrels/_package_data/base_project/models/federates/federate_example.yml +65 -0
- squirrels/_package_data/base_project/models/sources.yml +38 -0
- squirrels/_package_data/base_project/parameters.yml +142 -0
- squirrels/_package_data/base_project/pyconfigs/connections.py +19 -0
- squirrels/_package_data/base_project/pyconfigs/context.py +96 -0
- squirrels/_package_data/base_project/pyconfigs/parameters.py +141 -0
- squirrels/_package_data/base_project/pyconfigs/user.py +56 -0
- squirrels/_package_data/base_project/resources/expenses.db +0 -0
- squirrels/_package_data/base_project/resources/public/.gitkeep +0 -0
- squirrels/_package_data/base_project/resources/weather.db +0 -0
- squirrels/_package_data/base_project/seeds/seed_categories.csv +6 -0
- squirrels/_package_data/base_project/seeds/seed_categories.yml +15 -0
- squirrels/_package_data/base_project/seeds/seed_subcategories.csv +15 -0
- squirrels/_package_data/base_project/seeds/seed_subcategories.yml +21 -0
- squirrels/_package_data/base_project/squirrels.yml.j2 +61 -0
- squirrels/_package_data/base_project/tmp/.gitignore +2 -0
- squirrels/_package_data/templates/login_successful.html +53 -0
- squirrels/_package_data/templates/squirrels_studio.html +22 -0
- squirrels/_package_loader.py +29 -0
- squirrels/_parameter_configs.py +592 -0
- squirrels/_parameter_options.py +348 -0
- squirrels/_parameter_sets.py +207 -0
- squirrels/_parameters.py +1703 -0
- squirrels/_project.py +796 -0
- squirrels/_py_module.py +122 -0
- squirrels/_request_context.py +33 -0
- squirrels/_schemas/__init__.py +0 -0
- squirrels/_schemas/auth_models.py +83 -0
- squirrels/_schemas/query_param_models.py +70 -0
- squirrels/_schemas/request_models.py +26 -0
- squirrels/_schemas/response_models.py +286 -0
- squirrels/_seeds.py +97 -0
- squirrels/_sources.py +112 -0
- squirrels/_utils.py +540 -149
- squirrels/_version.py +1 -3
- squirrels/arguments.py +7 -0
- squirrels/auth.py +4 -0
- squirrels/connections.py +3 -0
- squirrels/dashboards.py +3 -0
- squirrels/data_sources.py +14 -282
- squirrels/parameter_options.py +13 -189
- squirrels/parameters.py +14 -801
- squirrels/types.py +18 -0
- squirrels-0.6.0.post0.dist-info/METADATA +148 -0
- squirrels-0.6.0.post0.dist-info/RECORD +101 -0
- {squirrels-0.1.0.dist-info → squirrels-0.6.0.post0.dist-info}/WHEEL +1 -2
- {squirrels-0.1.0.dist-info → squirrels-0.6.0.post0.dist-info}/entry_points.txt +1 -0
- squirrels-0.6.0.post0.dist-info/licenses/LICENSE +201 -0
- squirrels/_credentials_manager.py +0 -87
- squirrels/_module_loader.py +0 -37
- squirrels/_parameter_set.py +0 -151
- squirrels/_renderer.py +0 -286
- squirrels/_timed_imports.py +0 -37
- squirrels/connection_set.py +0 -126
- squirrels/package_data/base_project/.gitignore +0 -4
- squirrels/package_data/base_project/connections.py +0 -21
- squirrels/package_data/base_project/database/sample_database.db +0 -0
- squirrels/package_data/base_project/database/seattle_weather.db +0 -0
- squirrels/package_data/base_project/datasets/sample_dataset/context.py +0 -8
- squirrels/package_data/base_project/datasets/sample_dataset/database_view1.py +0 -23
- squirrels/package_data/base_project/datasets/sample_dataset/database_view1.sql.j2 +0 -7
- squirrels/package_data/base_project/datasets/sample_dataset/final_view.py +0 -10
- squirrels/package_data/base_project/datasets/sample_dataset/final_view.sql.j2 +0 -2
- squirrels/package_data/base_project/datasets/sample_dataset/parameters.py +0 -30
- squirrels/package_data/base_project/datasets/sample_dataset/selections.cfg +0 -6
- squirrels/package_data/base_project/squirrels.yaml +0 -26
- squirrels/package_data/static/favicon.ico +0 -0
- squirrels/package_data/static/script.js +0 -234
- squirrels/package_data/static/style.css +0 -110
- squirrels/package_data/templates/index.html +0 -32
- squirrels-0.1.0.dist-info/LICENSE +0 -22
- squirrels-0.1.0.dist-info/METADATA +0 -67
- squirrels-0.1.0.dist-info/RECORD +0 -40
- squirrels-0.1.0.dist-info/top_level.txt +0 -1
squirrels/_parameter_set.py
DELETED
@@ -1,151 +0,0 @@
from __future__ import annotations
from typing import Sequence, Dict, Any
from collections import OrderedDict

from squirrels import data_sources as d, parameters as p
from squirrels._timed_imports import pandas as pd


class ParameterSetBase:
    def __init__(self) -> None:
        """
        Constructor for ParameterSetBase, the base class for ParameterSet. Similar to ParameterSet but without
        a separate collection for DataSourceParameter's, and does not pre-set the parameters in constructor.
        """
        self._parameters_dict: OrderedDict[str, p.Parameter] = OrderedDict()

    def add_parameter(self, parameter: p.Parameter) -> None:
        """
        Adds a parameter to the "parameter collection"

        Parameters:
            parameter: The parameter to add
        """
        self._parameters_dict[parameter.name] = parameter

    def get_parameter(self, param_name: str) -> p.Parameter:
        """
        Gets the Parameter object given the parameter name

        Parameters:
            param_name: The parameter name

        Returns:
            The Parameter object corresponding to the parameter name
        """
        if param_name in self._parameters_dict:
            return self._parameters_dict[param_name]
        else:
            raise KeyError(f'No such parameter exists called "{param_name}"')

    def __getitem__(self, param_name: str) -> p.Parameter:
        return self.get_parameter(param_name)

    def get_parameters_as_ordered_dict(self) -> OrderedDict[str, p.Parameter]:
        """
        Returns the inner dictionary of the "parameter collection"

        Returns:
            A dictionary where key are the assigned names and values are the Parameter objects
        """
        return OrderedDict(self._parameters_dict)

    def merge(self, other: ParameterSetBase) -> ParameterSetBase:
        """
        Merges the "parameter collection" of this and another ParameterSetBase

        Parameters:
            other: The other ParameterSetBase

        Returns:
            A new copy of the ParameterSetBase as a result of the merge
        """
        new_param_set = ParameterSetBase()
        new_param_set._parameters_dict = OrderedDict(self._parameters_dict)
        new_param_set._parameters_dict.update(other._parameters_dict)
        return new_param_set

    def to_json_dict(self, debug: bool = False) -> Dict[str, Any]:
        """
        Converts this object, and all parameters contained, into a JSON dictionary

        Parameters:
            debug: Set to True to make the "hidden" parameters show as part of the result

        Returns:
            A collection of parameters as a JSON dictionary used for the "parameters" endpoint
        """
        parameters = []
        for x in self._parameters_dict.values():
            if not x.is_hidden or debug:
                parameters.append(x.to_json_dict())

        output = {
            "response_version": 0,
            "parameters": parameters
        }
        return output


class ParameterSet(ParameterSetBase):
    def __init__(self, parameters: Sequence[p.Parameter]):
        """
        Constructor for ParameterSet, a wrapper class for a sequence of parameters,
        and stores the DataSourceParameters as a separate field as well

        Parameters:
            parameters: A sequence of parameters
        """
        super().__init__()
        self._data_source_params: OrderedDict[str, p.DataSourceParameter] = OrderedDict()
        for param in parameters:
            self._parameters_dict[param.name] = param
            if isinstance(param, p.DataSourceParameter):
                self._data_source_params[param.name] = param

    def merge(self, other: ParameterSetBase) -> ParameterSet:
        """
        Merges this object with another ParameterSet (by combining the parameters) to create a new ParameterSet.

        The _parameters_dict are merged (with the other ParameterSet taking precedence when a name exist in both dict),
        while the _data_source_params are only taken from this object. This object and the other ParameterSet remain
        unchanged.

        Parameters:
            other: The other parameter set

        Returns:
            A new ParameterSet that contains all the parameters from this and the other parameter set.
        """
        new_param_set_base = super().merge(other)
        new_param_set = ParameterSet(())
        new_param_set._parameters_dict = new_param_set_base._parameters_dict
        new_param_set._data_source_params = self._data_source_params
        return new_param_set

    def get_datasources(self) -> Dict[str, d.DataSource]:
        """
        Gets all the DataSource objects as values to a dictionary where keys are the DataSource parameter names.

        Each DataSource object represents a lookup table with table name, connection name, corresponding columns to ID, label, etc.

        Returns:
            A dictionary where keys are the names of DataSourceParameter's and values are the corresponding DataSource.
        """
        new_dict = {}
        for param_name, ds_param in self._data_source_params.items():
            new_dict[param_name] = ds_param.data_source
        return new_dict

    def convert_datasource_params(self, df_dict: Dict[str, pd.DataFrame]) -> None:
        """
        Changes all the DataSourceParameters into other Parameter types. The _data_source_params field gets cleared.

        Parameters:
            df_dict: A dictionary of DataSourceParameter name to the pandas DataFrame of the lookup table data.
        """
        # Done sequentially since parents must be converted first before children
        for key, ds_param in self._data_source_params.items():
            ds_param.parent = self.get_parameter(ds_param.parent.name) if ds_param.parent is not None else None
            self._parameters_dict[key] = ds_param.convert(df_dict[key])
        self._data_source_params.clear()
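
For reference, a minimal usage sketch of the removed ParameterSetBase API, assuming squirrels 0.1.0 is installed. StubParameter below is a hypothetical stand-in, not part of the package; it defines only the attributes ParameterSetBase reads (name, is_hidden, to_json_dict).

# Illustrative sketch only (squirrels 0.1.0); StubParameter is hypothetical.
from squirrels._parameter_set import ParameterSetBase

class StubParameter:
    def __init__(self, name: str, is_hidden: bool = False):
        self.name = name            # used as the key in the parameter collection
        self.is_hidden = is_hidden  # hidden parameters are excluded from to_json_dict()

    def to_json_dict(self):
        return {"name": self.name}

base = ParameterSetBase()
base.add_parameter(StubParameter("upper_bound"))

other = ParameterSetBase()
other.add_parameter(StubParameter("group_by", is_hidden=True))

merged = base.merge(other)              # "other" takes precedence when names clash
print(merged.to_json_dict())            # hidden parameters omitted by default
print(merged.to_json_dict(debug=True))  # debug=True includes hidden parameters
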
squirrels/_renderer.py
DELETED
@@ -1,286 +0,0 @@
from typing import Dict, Tuple, Optional, Union, Callable, Any
from functools import partial
from configparser import ConfigParser
import concurrent.futures, os, json, time

from squirrels import _constants as c, _manifest as mf, _utils
from squirrels.connection_set import ConnectionSet, sqldf
from squirrels.data_sources import DataSource
from squirrels._parameter_set import ParameterSet
from squirrels._utils import ConfigurationError
from squirrels._timed_imports import pandas as pd, timer

ContextFunc = Optional[Callable[..., Dict[str, Any]]]
DatabaseViews = Optional[Dict[str, pd.DataFrame]]
Query = Union[Callable[..., pd.DataFrame], str]


class Renderer:
    def __init__(self, dataset: str, manifest: mf.Manifest, conn_set: ConnectionSet, raw_param_set: ParameterSet,
                 context_func: Callable[..., Dict[str, Any]], raw_query_by_db_view: Dict[str, Query],
                 raw_final_view_query: Query, excel_file: Optional[pd.ExcelFile] = None):
        self.dataset = dataset
        self.manifest = manifest
        self.conn_set = conn_set
        self.context_func = context_func
        self.raw_query_by_db_view = raw_query_by_db_view
        self.raw_final_view_query = raw_final_view_query

        start = time.time()
        self.param_set: ParameterSet = self._convert_param_set_datasources(raw_param_set, excel_file)
        timer.add_activity_time(f"convert datasources - dataset {dataset}", start)

    def _convert_param_set_datasources(self, param_set: ParameterSet, excel_file: Optional[pd.ExcelFile] = None) -> ParameterSet:
        datasources = param_set.get_datasources()
        if excel_file is not None:
            df_dict = pd.read_excel(excel_file, None)
            for key in datasources:
                if key not in df_dict:
                    raise ConfigurationError('No sheet found for parameter "{key}" in the Excel workbook')
        else:
            def get_dataframe_from_query(item: Tuple[str, DataSource]) -> pd.DataFrame:
                key, datasource = item
                df = self.conn_set.get_dataframe_from_query(datasource.connection_name, datasource.get_query())
                return key, df

            with concurrent.futures.ThreadPoolExecutor() as executor:
                df_dict = dict(executor.map(get_dataframe_from_query, datasources.items()))

        param_set.convert_datasource_params(df_dict)
        return param_set

    def apply_selections(self, selections: Dict[str, str], updates_only: bool = False) -> ParameterSet:
        start = time.time()
        parameter_set = self.param_set
        parameters_dict = parameter_set.get_parameters_as_ordered_dict()

        # iterating through parameters dict instead of query_params since order matters for cascading parameters
        for param_name, parameter in parameters_dict.items():
            if param_name in selections:
                value = selections[param_name]
                parameter = parameter_set.get_parameter(param_name).with_selection(value)
                updates = parameter.get_all_dependent_params()
                if updates_only:
                    parameter_set = updates
                    break
                parameter_set = parameter_set.merge(updates)
        timer.add_activity_time(f"apply selections - dataset {self.dataset}", start)

        return parameter_set

    def _render_context(self, context_func: ContextFunc, param_set: ParameterSet) -> Dict[str, Any]:
        try:
            return context_func(prms=param_set.get_parameters_as_ordered_dict()) if context_func is not None else {}
        except Exception as e:
            raise ConfigurationError(f'Error in the {c.CONTEXT_FILE} function for dataset "{self.dataset}"') from e

    def _get_args(self, param_set: ParameterSet, context: Dict[str, Any], db_view: str = None) -> Dict:
        if db_view is not None:
            args = self.manifest.get_view_args(self.dataset, db_view)
        else:
            args = self.manifest.get_view_args(self.dataset)
        return {
            'prms': param_set.get_parameters_as_ordered_dict(),
            'ctx': context,
            'args': args
        }

    def _render_query_from_raw(self, raw_query: Query, args: Dict) -> Query:
        if isinstance(raw_query, str):
            template = _utils.j2_env.from_string(raw_query)
            return template.render(args)
        else:
            return partial(raw_query, **args)

    def _render_dataframe_from_sql(self, db_view_name: str, sql_str: str,
                                   database_views: DatabaseViews = None) -> pd.DataFrame:
        if database_views is not None:
            return sqldf(sql_str, database_views)
        else:
            conn_name = self.manifest.get_database_view_db_connection(self.dataset, db_view_name)
            return self.conn_set.get_dataframe_from_query(conn_name, sql_str)

    def _render_dataframe_from_py_func(self, db_view_name: str, py_func: Callable[[Any], pd.DataFrame],
                                       database_views: DatabaseViews = None) -> pd.DataFrame:
        if database_views is not None:
            try:
                return py_func(database_views=database_views)
            except Exception as e:
                raise ConfigurationError(f'Error in the final view python function for dataset "{self.dataset}"') from e
        else:
            conn_name = self.manifest.get_database_view_db_connection(self.dataset, db_view_name)
            connection_pool = self.conn_set.get_connection_pool(conn_name)
            try:
                return py_func(connection_pool=connection_pool, connection_set=self.conn_set)
            except Exception as e:
                raise ConfigurationError(f'Error in the python function for database view "{db_view_name}" in dataset "{self.dataset}"') from e

    def _render_db_view_dataframes(self, query_by_db_view: Dict[str, Query]) -> Dict[str, pd.DataFrame]:
        def run_single_query(item: Tuple[str, Query]) -> Tuple[str, pd.DataFrame]:
            view_name, query = item
            if isinstance(query, str):
                return view_name, self._render_dataframe_from_sql(view_name, query)
            else:
                return view_name, self._render_dataframe_from_py_func(view_name, query)

        with concurrent.futures.ThreadPoolExecutor() as executor:
            df_by_view_name = executor.map(run_single_query, query_by_db_view.items())

        return dict(df_by_view_name)

    def _render_final_view_dataframe(self, df_by_db_views: Dict[str, pd.DataFrame],
                                     final_view_query: Optional[Query]) -> pd.DataFrame:
        if final_view_query in df_by_db_views:
            return df_by_db_views[final_view_query]
        elif isinstance(final_view_query, str):
            return self._render_dataframe_from_sql("final_view", final_view_query, df_by_db_views)
        else:
            return self._render_dataframe_from_py_func("final_view", final_view_query, df_by_db_views)

    def load_results(self, selections: Dict[str, str], run_query: bool = True) \
            -> Tuple[ParameterSet, Dict[str, Query], Query, Dict[str, pd.DataFrame], Optional[pd.DataFrame]]:

        # apply selections and render context
        param_set = self.apply_selections(selections)
        start = time.time()
        context = self._render_context(self.context_func, param_set)
        timer.add_activity_time(f"render context - dataset {self.dataset}", start)

        # render database view queries
        start = time.time()
        query_by_db_view = {}
        for db_view, raw_query in self.raw_query_by_db_view.items():
            args = self._get_args(param_set, context, db_view)
            query_by_db_view[db_view] = self._render_query_from_raw(raw_query, args)
        timer.add_activity_time(f"render database view queries - dataset {self.dataset}", start)

        # render final view query
        start = time.time()
        args = self._get_args(param_set, context)
        final_view_query = self._render_query_from_raw(self.raw_final_view_query, args)
        timer.add_activity_time(f"render final view query - dataset {self.dataset}", start)

        # render all dataframes if "run_query" is enabled
        df_by_db_views = {}
        final_view_df = None
        if run_query:
            start = time.time()
            df_by_db_views = self._render_db_view_dataframes(query_by_db_view)
            timer.add_activity_time(f"execute dataview view queries - dataset {self.dataset}", start)

            start = time.time()
            final_view_df = self._render_final_view_dataframe(df_by_db_views, final_view_query)
            timer.add_activity_time(f"execute final view query - dataset {self.dataset}", start)

        return param_set, query_by_db_view, final_view_query, df_by_db_views, final_view_df


def default_context_func(*args, **kwargs):
    return {}


class RendererIOWrapper:
    def __init__(self, dataset: str, manifest: mf.Manifest, conn_set: ConnectionSet, excel_file_name: Optional[str] = None):
        dataset_folder = manifest.get_dataset_folder(dataset)
        parameters_path = _utils.join_paths(dataset_folder, c.PARAMETERS_FILE)
        args = manifest.get_dataset_args(dataset)
        parameters_module = _utils.import_file_as_module(parameters_path)
        try:
            parameter_set = ParameterSet(parameters_module.main(args=args))
        except Exception as e:
            raise ConfigurationError(f'Error in the {c.PARAMETERS_FILE} function for dataset "{dataset}"') from e

        context_path = _utils.join_paths(dataset_folder, c.CONTEXT_FILE)
        try:
            context_module = _utils.import_file_as_module(context_path)
            context_func = partial(context_module.main, args=args)
        except FileNotFoundError:
            context_func = default_context_func

        excel_file = None
        if excel_file_name is not None:
            excel_file_path = _utils.join_paths(dataset_folder, excel_file_name)
            excel_file = pd.ExcelFile(excel_file_path)

        db_views = manifest.get_all_database_view_names(dataset)
        raw_query_by_db_view = {}
        for db_view in db_views:
            db_view_template_path = str(manifest.get_database_view_file(dataset, db_view))
            raw_query_by_db_view[db_view] = self._get_raw_query(db_view_template_path)

        final_view_path = str(manifest.get_dataset_final_view_file(dataset))
        if final_view_path in db_views:
            raw_final_view_query = final_view_path
        else:
            raw_final_view_query = self._get_raw_query(final_view_path)

        self.dataset_folder = dataset_folder
        self.output_folder = _utils.join_paths(c.OUTPUTS_FOLDER, dataset)
        self.renderer = Renderer(dataset, manifest, conn_set, parameter_set, context_func,
                                 raw_query_by_db_view, raw_final_view_query, excel_file)

    def _get_raw_query(self, template_path: str) -> Dict[str, Query]:
        if template_path.endswith(".py"):
            return _utils.import_file_as_module(template_path).main
        else:
            with open(template_path, 'r') as f:
                sql_template = f.read()
            return sql_template

    def _get_selections(self, selection_cfg_file: Optional[str]) -> Dict[str, str]:
        if selection_cfg_file is not None:
            selection_cfg_path = _utils.join_paths(self.dataset_folder, selection_cfg_file)
            config = ConfigParser()
            config.read(selection_cfg_path)
            if config.has_section(c.PARAMETERS_SECTION):
                config_section = config[c.PARAMETERS_SECTION]
                return dict(config_section.items())
        return {}

    def _write_sql_file(self, view_name: str, query: Any):
        if isinstance(query, str):
            db_view_sql_output_path = _utils.join_paths(self.output_folder, view_name+'.sql')
            with open(db_view_sql_output_path, 'w') as f:
                f.write(query)

    def write_outputs(self, selection_cfg_file: Optional[str], run_query: bool) -> None:
        # create output folder if it doesn't exist
        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        # clear everything in output folder
        files = os.listdir(self.output_folder)
        for file in files:
            file_path = _utils.join_paths(self.output_folder, file)
            os.remove(file_path)

        # apply selections and render outputs
        selections = self._get_selections(selection_cfg_file)
        result = self.renderer.load_results(selections, run_query)
        param_set, query_by_db_view, final_view_query, df_by_db_views, final_view_df = result

        # write the parameters response
        param_set_dict = param_set.to_json_dict()
        parameter_json_output_path = _utils.join_paths(self.output_folder, c.PARAMETERS_OUTPUT)
        with open(parameter_json_output_path, 'w') as f:
            json.dump(param_set_dict, f, indent=4)

        # write the rendered sql queries for database views
        for db_view, query in query_by_db_view.items():
            self._write_sql_file(db_view, query)

        # write the rendered sql query for final view
        if final_view_query not in query_by_db_view:
            self._write_sql_file(c.FINAL_VIEW_OUT_STEM, final_view_query)

        # Run the sql queries and write output
        if run_query:
            for db_view, df in df_by_db_views.items():
                csv_file = _utils.join_paths(self.output_folder, db_view+'.csv')
                df.to_csv(csv_file, index=False)

            final_csv_path = _utils.join_paths(self.output_folder, c.FINAL_VIEW_OUT_STEM+'.csv')
            final_view_df.to_csv(final_csv_path, index=False)

            final_json_path = _utils.join_paths(self.output_folder, c.FINAL_VIEW_OUT_STEM+'.json')
            final_view_df.to_json(final_json_path, orient='table', index=False, indent=4)
squirrels/_timed_imports.py
DELETED
@@ -1,37 +0,0 @@
from typing import Dict, List
import time


class Timer:
    def __init__(self, verbose: bool = False):
        self.times: Dict[str, List[float]] = dict()
        self.verbose = verbose

    def add_activity_time(self, activity: str, start: float):
        if self.verbose:
            time_taken = (time.time()-start) * 10**3
            times_list = self.times.setdefault(activity, list())
            times_list.append(time_taken)
            print(f'Time taken for "{activity}": {time_taken}ms')

    def report_times(self):
        if self.verbose:
            for activity, times_list in self.times.items():
                total_time = sum(times_list)
                avg_time = total_time / len(times_list)
                print()
                print(f'Time statistics for "{activity}":')
                print(f' Total time: {total_time}ms')
                print(f' Average time: {avg_time}ms')

timer = Timer()


start = time.time()
import pandas
from pandas.api import types as pd_types
timer.add_activity_time("import pandas", start)

start = time.time()
import jinja2
timer.add_activity_time("import jinja", start)
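
For reference, a small usage sketch of the removed Timer class, assuming squirrels 0.1.0 is installed; timings are only recorded and printed when the timer is constructed with verbose=True.

# Illustrative sketch only (squirrels 0.1.0): record elapsed time per activity.
import time
from squirrels._timed_imports import Timer

t = Timer(verbose=True)        # with verbose=False (the default) nothing is recorded
start = time.time()
total = sum(range(1_000_000))  # arbitrary work to measure
t.add_activity_time("sum range", start)
t.report_times()               # prints total and average time per activity
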
squirrels/connection_set.py
DELETED
@@ -1,126 +0,0 @@
from typing import Dict, Union
from importlib.machinery import SourceFileLoader
from sqlalchemy import Engine, Pool
import sqlite3

from squirrels import _constants as c, _manifest as mf
from squirrels._timed_imports import pandas as pd
from squirrels._utils import ConfigurationError

ConnectionPool = Union[Engine, Pool]


class ConnectionSet:
    def __init__(self, conn_pools: Dict[str, ConnectionPool]) -> None:
        """
        Constructor for ConnectionSet, a wrapper class around a collection of Connection Pools or Sqlalchemy Engines

        Parameters:
            conn_pools: A dictionary of connection pool name to the corresponding Pool or Engine from sqlalchemy
        """
        self._conn_pools = conn_pools

    def get_connection_pool(self, conn_name: str = "default") -> ConnectionPool:
        """
        Gets to sqlalchemy Pool or Engine from the database connection name

        Parameters:
            conn_name: Name of Pool or Engine. If not provided, defaults to "default"

        Returns:
            A sqlalchemy Pool or Engine
        """
        try:
            connection_pool = self._conn_pools[conn_name]
        except KeyError as e:
            raise ConfigurationError(f'Connection name "{conn_name}" was not configured') from e
        return connection_pool

    def __getitem__(self, conn_name: str) -> ConnectionPool:
        """
        Same as get_connection_pool
        """
        return self.get_connection_pool(conn_name)

    def get_dataframe_from_query(self, conn_name: str, query: str) -> pd.DataFrame:
        """
        Runs a SQL query on a database connection name, and returns the results as pandas DataFrame

        Parameters:
            conn_name: Name of Pool or Engine
            query: The SQL query to run

        Returns:
            A pandas DataFrame
        """
        connector = self.get_connection_pool(conn_name)
        if isinstance(connector, Pool):
            conn = connector.connect()
        elif isinstance(connector, Engine):
            conn = connector.raw_connection()
        else:
            raise TypeError(f'Type for connection name "{conn_name}" not supported')

        try:
            cur = conn.cursor()
            cur.execute(query)
            df = pd.DataFrame(data=cur.fetchall(), columns=[x[0] for x in cur.description])
        finally:
            conn.close()

        return df

    def _dispose(self) -> None:
        """
        Disposes of all the connection pools in this ConnectionSet
        """
        for pool in self._conn_pools.values():
            pool.dispose()


def _from_file(manifest: mf.Manifest) -> ConnectionSet:
    """
    Takes the DB Connections from both the squirrels.yaml and connections.py files and merges them
    into a single ConnectionSet

    Parameters:
        manifest: The object of Manifest class, the interface for the squirrels.yaml file

    Returns:
        A ConnectionSet with the DB connections from both squirrels.yaml and connections.py
    """
    connections = manifest.get_db_connections()
    try:
        module = SourceFileLoader(c.CONNECTIONS_FILE, c.CONNECTIONS_FILE).load_module()
    except FileNotFoundError:
        module = None

    if module is not None:
        proj_vars = manifest.get_proj_vars()
        try:
            conn_from_py_file = module.main(proj_vars)
        except Exception as e:
            raise ConfigurationError(f'Error in the {c.CONNECTIONS_FILE} file') from e
    else:
        conn_from_py_file = {}
    return ConnectionSet({**connections, **conn_from_py_file})


def sqldf(query: str, df_by_db_views: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Uses a dictionary of dataframes to execute a SQL query in an in-memory sqlite database

    Parameters:
        query: The SQL query to run using sqlite
        df_by_db_views: A dictionary of table names to their pandas Dataframe

    Returns:
        The result as a pandas Dataframe from running the query
    """
    conn = sqlite3.connect(":memory:")
    try:
        for db_view, df in df_by_db_views.items():
            df.to_sql(db_view, conn, index=False)
        return pd.read_sql(query, conn)
    finally:
        conn.close()
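
For reference, a usage sketch of the removed sqldf helper, assuming squirrels 0.1.0 and pandas are installed; it loads each DataFrame into an in-memory sqlite table and runs the query against those tables.

# Illustrative sketch only (squirrels 0.1.0): query in-memory DataFrames via sqlite.
import pandas as pd
from squirrels.connection_set import sqldf

orders = pd.DataFrame({"id": [1, 2, 3], "amount": [10.0, 25.5, 7.25]})
result = sqldf(
    "SELECT count(*) AS n, sum(amount) AS total FROM orders",
    {"orders": orders},  # table name -> DataFrame
)
print(result)
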
squirrels/package_data/base_project/connections.py
DELETED
@@ -1,21 +0,0 @@
from typing import Dict, Union, Any
from sqlalchemy import create_engine, Engine, Pool, QueuePool

from squirrels import get_credential


# Note: all connections must be shareable across multiple thread. No writes will occur on them
def main(proj: Dict[str, Any], *p_args, **kwargs) -> Dict[str, Union[Engine, Pool]]:

    # ## Example of getting the username and password set with "$ squirrels set-credential [key]"
    # cred = get_credential('my_key')
    # # Use cred.username and cred.password to access the username and password

    # Create a connection pool / engine
    pool = create_engine('sqlite:///./database/sample_database.db')

    # ## Example of using QueuePool instead for a custom db connector:
    # connection_creator = lambda: sqlite3.connect('./database/sample_database.db', check_same_thread=False)
    # pool = QueuePool(connection_creator)

    return {'default': pool}
squirrels/package_data/base_project/database/sample_database.db
DELETED
Binary file

squirrels/package_data/base_project/database/seattle_weather.db
DELETED
Binary file
squirrels/package_data/base_project/datasets/sample_dataset/context.py
DELETED
@@ -1,8 +0,0 @@
from typing import Dict, Any
import squirrels as sr


def main(prms: Dict[str, sr.Parameter], args: Dict[str, Any], *p_args, **kwargs) -> Dict[str, Any]:
    limit_parameter: sr.NumberParameter = prms['upper_bound']
    limit: str = limit_parameter.get_selected_value()
    return {'limit': limit}
squirrels/package_data/base_project/datasets/sample_dataset/database_view1.py
DELETED
@@ -1,23 +0,0 @@
from typing import Dict, Any
import pandas as pd

import squirrels as sr


def main(connection_set: sr.ConnectionSet,
         prms: Dict[str, sr.Parameter], ctx: Dict[str, Any], args: Dict[str, Any],
         *p_args, **kwargs) -> pd.DataFrame:
    # pool = connection_set.get_connection_pool("default")
    # conn = pool.connect() # use this to get a DBAPI connection from a Pool or sqlalchemy connection from an Engine
    # conn = pool.raw_connection() # use this to get a DBAPI connection from an Engine

    df = pd.DataFrame({
        'dim1': ['a', 'b', 'c', 'd', 'e', 'f'],
        'metric1': [1, 2, 3, 4, 5, 6],
        'metric2': [2, 4, 5, 1, 7, 3]
    })
    limit_parameter: sr.NumberParameter = prms['upper_bound']
    limit = limit_parameter.get_selected_value()
    # limit: str = ctx['limit'] # use this instead if context.py is defined

    return df.query(f'metric1 <= {limit}')
squirrels/package_data/base_project/datasets/sample_dataset/database_view1.sql.j2
DELETED
@@ -1,7 +0,0 @@
-- %USE some_db -- TBA: this line is optional when connecting to the "default" db_connection

-- note: if context.py is defined, you can use "ctx['limit']" instead of "prms['number_example'].get_selected_value()"
SELECT dim1, avg(metric1) as metric1, avg(metric2) as metric2
FROM fact_table
WHERE metric1 <= {{ prms['upper_bound'].get_selected_value() }}
GROUP BY dim1