squirrels 0.1.1.post1-py3-none-any.whl → 0.2.0.dev0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of squirrels may be problematic.
- squirrels/__init__.py +10 -16
- squirrels/_api_server.py +234 -80
- squirrels/_authenticator.py +84 -0
- squirrels/_command_line.py +60 -72
- squirrels/_connection_set.py +96 -0
- squirrels/_constants.py +114 -33
- squirrels/_environcfg.py +77 -0
- squirrels/_initializer.py +126 -67
- squirrels/_manifest.py +195 -168
- squirrels/_models.py +495 -0
- squirrels/_package_loader.py +26 -0
- squirrels/_parameter_configs.py +401 -0
- squirrels/_parameter_sets.py +188 -0
- squirrels/_py_module.py +60 -0
- squirrels/_timer.py +36 -0
- squirrels/_utils.py +81 -49
- squirrels/_version.py +2 -2
- squirrels/arguments/init_time_args.py +32 -0
- squirrels/arguments/run_time_args.py +82 -0
- squirrels/data_sources.py +380 -155
- squirrels/dateutils.py +86 -57
- squirrels/package_data/base_project/Dockerfile +15 -0
- squirrels/package_data/base_project/connections.yml +7 -0
- squirrels/package_data/base_project/database/{sample_database.db → expenses.db} +0 -0
- squirrels/package_data/base_project/environcfg.yml +29 -0
- squirrels/package_data/base_project/ignores/.dockerignore +8 -0
- squirrels/package_data/base_project/ignores/.gitignore +7 -0
- squirrels/package_data/base_project/models/dbviews/database_view1.py +36 -0
- squirrels/package_data/base_project/models/dbviews/database_view1.sql +15 -0
- squirrels/package_data/base_project/models/federates/dataset_example.py +20 -0
- squirrels/package_data/base_project/models/federates/dataset_example.sql +3 -0
- squirrels/package_data/base_project/parameters.yml +109 -0
- squirrels/package_data/base_project/pyconfigs/auth.py +47 -0
- squirrels/package_data/base_project/pyconfigs/connections.py +28 -0
- squirrels/package_data/base_project/pyconfigs/context.py +45 -0
- squirrels/package_data/base_project/pyconfigs/parameters.py +55 -0
- squirrels/package_data/base_project/seeds/mocks/category.csv +3 -0
- squirrels/package_data/base_project/seeds/mocks/max_filter.csv +2 -0
- squirrels/package_data/base_project/seeds/mocks/subcategory.csv +6 -0
- squirrels/package_data/base_project/squirrels.yml.j2 +57 -0
- squirrels/package_data/base_project/tmp/.gitignore +2 -0
- squirrels/package_data/static/script.js +159 -63
- squirrels/package_data/static/style.css +79 -15
- squirrels/package_data/static/widgets.js +133 -0
- squirrels/package_data/templates/index.html +65 -23
- squirrels/package_data/templates/index2.html +22 -0
- squirrels/parameter_options.py +216 -119
- squirrels/parameters.py +407 -478
- squirrels/user_base.py +58 -0
- squirrels-0.2.0.dev0.dist-info/METADATA +126 -0
- squirrels-0.2.0.dev0.dist-info/RECORD +56 -0
- {squirrels-0.1.1.post1.dist-info → squirrels-0.2.0.dev0.dist-info}/WHEEL +1 -2
- squirrels-0.2.0.dev0.dist-info/entry_points.txt +3 -0
- squirrels/_credentials_manager.py +0 -87
- squirrels/_module_loader.py +0 -37
- squirrels/_parameter_set.py +0 -151
- squirrels/_renderer.py +0 -286
- squirrels/_timed_imports.py +0 -37
- squirrels/connection_set.py +0 -126
- squirrels/package_data/base_project/.gitignore +0 -4
- squirrels/package_data/base_project/connections.py +0 -20
- squirrels/package_data/base_project/datasets/sample_dataset/context.py +0 -22
- squirrels/package_data/base_project/datasets/sample_dataset/database_view1.py +0 -29
- squirrels/package_data/base_project/datasets/sample_dataset/database_view1.sql.j2 +0 -12
- squirrels/package_data/base_project/datasets/sample_dataset/final_view.py +0 -11
- squirrels/package_data/base_project/datasets/sample_dataset/final_view.sql.j2 +0 -3
- squirrels/package_data/base_project/datasets/sample_dataset/parameters.py +0 -47
- squirrels/package_data/base_project/datasets/sample_dataset/selections.cfg +0 -9
- squirrels/package_data/base_project/squirrels.yaml +0 -22
- squirrels-0.1.1.post1.dist-info/METADATA +0 -67
- squirrels-0.1.1.post1.dist-info/RECORD +0 -40
- squirrels-0.1.1.post1.dist-info/entry_points.txt +0 -2
- squirrels-0.1.1.post1.dist-info/top_level.txt +0 -1
- {squirrels-0.1.1.post1.dist-info → squirrels-0.2.0.dev0.dist-info}/LICENSE +0 -0
squirrels/_renderer.py
DELETED
@@ -1,286 +0,0 @@
-from typing import Dict, Tuple, Optional, Union, Callable, Any
-from functools import partial
-from configparser import ConfigParser
-import concurrent.futures, os, json, time
-
-from squirrels import _constants as c, _manifest as mf, _utils
-from squirrels.connection_set import ConnectionSet, sqldf
-from squirrels.data_sources import DataSource
-from squirrels._parameter_set import ParameterSet
-from squirrels._utils import ConfigurationError
-from squirrels._timed_imports import pandas as pd, timer
-
-ContextFunc = Optional[Callable[..., Dict[str, Any]]]
-DatabaseViews = Optional[Dict[str, pd.DataFrame]]
-Query = Union[Callable[..., pd.DataFrame], str]
-
-
-class Renderer:
-    def __init__(self, dataset: str, manifest: mf.Manifest, conn_set: ConnectionSet, raw_param_set: ParameterSet,
-                 context_func: Callable[..., Dict[str, Any]], raw_query_by_db_view: Dict[str, Query],
-                 raw_final_view_query: Query, excel_file: Optional[pd.ExcelFile] = None):
-        self.dataset = dataset
-        self.manifest = manifest
-        self.conn_set = conn_set
-        self.context_func = context_func
-        self.raw_query_by_db_view = raw_query_by_db_view
-        self.raw_final_view_query = raw_final_view_query
-
-        start = time.time()
-        self.param_set: ParameterSet = self._convert_param_set_datasources(raw_param_set, excel_file)
-        timer.add_activity_time(f"convert datasources - dataset {dataset}", start)
-
-    def _convert_param_set_datasources(self, param_set: ParameterSet, excel_file: Optional[pd.ExcelFile] = None) -> ParameterSet:
-        datasources = param_set.get_datasources()
-        if excel_file is not None:
-            df_dict = pd.read_excel(excel_file, None)
-            for key in datasources:
-                if key not in df_dict:
-                    raise ConfigurationError('No sheet found for parameter "{key}" in the Excel workbook')
-        else:
-            def get_dataframe_from_query(item: Tuple[str, DataSource]) -> pd.DataFrame:
-                key, datasource = item
-                df = self.conn_set.get_dataframe_from_query(datasource.connection_name, datasource.get_query())
-                return key, df
-
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                df_dict = dict(executor.map(get_dataframe_from_query, datasources.items()))
-
-        param_set.convert_datasource_params(df_dict)
-        return param_set
-
-    def apply_selections(self, selections: Dict[str, str], updates_only: bool = False) -> ParameterSet:
-        start = time.time()
-        parameter_set = self.param_set
-        parameters_dict = parameter_set.get_parameters_as_ordered_dict()
-
-        # iterating through parameters dict instead of query_params since order matters for cascading parameters
-        for param_name, parameter in parameters_dict.items():
-            if param_name in selections:
-                value = selections[param_name]
-                parameter = parameter_set.get_parameter(param_name).with_selection(value)
-                updates = parameter.get_all_dependent_params()
-                if updates_only:
-                    parameter_set = updates
-                    break
-                parameter_set = parameter_set.merge(updates)
-        timer.add_activity_time(f"apply selections - dataset {self.dataset}", start)
-
-        return parameter_set
-
-    def _render_context(self, context_func: ContextFunc, param_set: ParameterSet) -> Dict[str, Any]:
-        try:
-            return context_func(prms=param_set.get_parameters_as_ordered_dict()) if context_func is not None else {}
-        except Exception as e:
-            raise ConfigurationError(f'Error in the {c.CONTEXT_FILE} function for dataset "{self.dataset}"') from e
-
-    def _get_args(self, param_set: ParameterSet, context: Dict[str, Any], db_view: str = None) -> Dict:
-        if db_view is not None:
-            args = self.manifest.get_view_args(self.dataset, db_view)
-        else:
-            args = self.manifest.get_view_args(self.dataset)
-        return {
-            'prms': param_set.get_parameters_as_ordered_dict(),
-            'ctx': context,
-            'args': args
-        }
-
-    def _render_query_from_raw(self, raw_query: Query, args: Dict) -> Query:
-        if isinstance(raw_query, str):
-            template = _utils.j2_env.from_string(raw_query)
-            return template.render(args)
-        else:
-            return partial(raw_query, **args)
-
-    def _render_dataframe_from_sql(self, db_view_name: str, sql_str: str,
-                                   database_views: DatabaseViews = None) -> pd.DataFrame:
-        if database_views is not None:
-            return sqldf(sql_str, database_views)
-        else:
-            conn_name = self.manifest.get_database_view_db_connection(self.dataset, db_view_name)
-            return self.conn_set.get_dataframe_from_query(conn_name, sql_str)
-
-    def _render_dataframe_from_py_func(self, db_view_name: str, py_func: Callable[[Any], pd.DataFrame],
-                                       database_views: DatabaseViews = None) -> pd.DataFrame:
-        if database_views is not None:
-            try:
-                return py_func(database_views=database_views)
-            except Exception as e:
-                raise ConfigurationError(f'Error in the final view python function for dataset "{self.dataset}"') from e
-        else:
-            conn_name = self.manifest.get_database_view_db_connection(self.dataset, db_view_name)
-            connection_pool = self.conn_set.get_connection_pool(conn_name)
-            try:
-                return py_func(connection_pool=connection_pool, connection_set=self.conn_set)
-            except Exception as e:
-                raise ConfigurationError(f'Error in the python function for database view "{db_view_name}" in dataset "{self.dataset}"') from e
-
-    def _render_db_view_dataframes(self, query_by_db_view: Dict[str, Query]) -> Dict[str, pd.DataFrame]:
-        def run_single_query(item: Tuple[str, Query]) -> Tuple[str, pd.DataFrame]:
-            view_name, query = item
-            if isinstance(query, str):
-                return view_name, self._render_dataframe_from_sql(view_name, query)
-            else:
-                return view_name, self._render_dataframe_from_py_func(view_name, query)
-
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            df_by_view_name = executor.map(run_single_query, query_by_db_view.items())
-
-        return dict(df_by_view_name)
-
-    def _render_final_view_dataframe(self, df_by_db_views: Dict[str, pd.DataFrame],
-                                     final_view_query: Optional[Query]) -> pd.DataFrame:
-        if final_view_query in df_by_db_views:
-            return df_by_db_views[final_view_query]
-        elif isinstance(final_view_query, str):
-            return self._render_dataframe_from_sql("final_view", final_view_query, df_by_db_views)
-        else:
-            return self._render_dataframe_from_py_func("final_view", final_view_query, df_by_db_views)
-
-    def load_results(self, selections: Dict[str, str], run_query: bool = True) \
-            -> Tuple[ParameterSet, Dict[str, Query], Query, Dict[str, pd.DataFrame], Optional[pd.DataFrame]]:
-
-        # apply selections and render context
-        param_set = self.apply_selections(selections)
-        start = time.time()
-        context = self._render_context(self.context_func, param_set)
-        timer.add_activity_time(f"render context - dataset {self.dataset}", start)
-
-        # render database view queries
-        start = time.time()
-        query_by_db_view = {}
-        for db_view, raw_query in self.raw_query_by_db_view.items():
-            args = self._get_args(param_set, context, db_view)
-            query_by_db_view[db_view] = self._render_query_from_raw(raw_query, args)
-        timer.add_activity_time(f"render database view queries - dataset {self.dataset}", start)
-
-        # render final view query
-        start = time.time()
-        args = self._get_args(param_set, context)
-        final_view_query = self._render_query_from_raw(self.raw_final_view_query, args)
-        timer.add_activity_time(f"render final view query - dataset {self.dataset}", start)
-
-        # render all dataframes if "run_query" is enabled
-        df_by_db_views = {}
-        final_view_df = None
-        if run_query:
-            start = time.time()
-            df_by_db_views = self._render_db_view_dataframes(query_by_db_view)
-            timer.add_activity_time(f"execute dataview view queries - dataset {self.dataset}", start)
-
-            start = time.time()
-            final_view_df = self._render_final_view_dataframe(df_by_db_views, final_view_query)
-            timer.add_activity_time(f"execute final view query - dataset {self.dataset}", start)
-
-        return param_set, query_by_db_view, final_view_query, df_by_db_views, final_view_df
-
-
-def default_context_func(*args, **kwargs):
-    return {}
-
-
-class RendererIOWrapper:
-    def __init__(self, dataset: str, manifest: mf.Manifest, conn_set: ConnectionSet, excel_file_name: Optional[str] = None):
-        dataset_folder = manifest.get_dataset_folder(dataset)
-        parameters_path = _utils.join_paths(dataset_folder, c.PARAMETERS_FILE)
-        args = manifest.get_dataset_args(dataset)
-        parameters_module = _utils.import_file_as_module(parameters_path)
-        try:
-            parameter_set = ParameterSet(parameters_module.main(args=args))
-        except Exception as e:
-            raise ConfigurationError(f'Error in the {c.PARAMETERS_FILE} function for dataset "{dataset}"') from e
-
-        context_path = _utils.join_paths(dataset_folder, c.CONTEXT_FILE)
-        try:
-            context_module = _utils.import_file_as_module(context_path)
-            context_func = partial(context_module.main, args=args)
-        except FileNotFoundError:
-            context_func = default_context_func
-
-        excel_file = None
-        if excel_file_name is not None:
-            excel_file_path = _utils.join_paths(dataset_folder, excel_file_name)
-            excel_file = pd.ExcelFile(excel_file_path)
-
-        db_views = manifest.get_all_database_view_names(dataset)
-        raw_query_by_db_view = {}
-        for db_view in db_views:
-            db_view_template_path = str(manifest.get_database_view_file(dataset, db_view))
-            raw_query_by_db_view[db_view] = self._get_raw_query(db_view_template_path)
-
-        final_view_path = str(manifest.get_dataset_final_view_file(dataset))
-        if final_view_path in db_views:
-            raw_final_view_query = final_view_path
-        else:
-            raw_final_view_query = self._get_raw_query(final_view_path)
-
-        self.dataset_folder = dataset_folder
-        self.output_folder = _utils.join_paths(c.OUTPUTS_FOLDER, dataset)
-        self.renderer = Renderer(dataset, manifest, conn_set, parameter_set, context_func,
-                                 raw_query_by_db_view, raw_final_view_query, excel_file)
-
-    def _get_raw_query(self, template_path: str) -> Dict[str, Query]:
-        if template_path.endswith(".py"):
-            return _utils.import_file_as_module(template_path).main
-        else:
-            with open(template_path, 'r') as f:
-                sql_template = f.read()
-            return sql_template
-
-    def _get_selections(self, selection_cfg_file: Optional[str]) -> Dict[str, str]:
-        if selection_cfg_file is not None:
-            selection_cfg_path = _utils.join_paths(self.dataset_folder, selection_cfg_file)
-            config = ConfigParser()
-            config.read(selection_cfg_path)
-            if config.has_section(c.PARAMETERS_SECTION):
-                config_section = config[c.PARAMETERS_SECTION]
-                return dict(config_section.items())
-        return {}
-
-    def _write_sql_file(self, view_name: str, query: Any):
-        if isinstance(query, str):
-            db_view_sql_output_path = _utils.join_paths(self.output_folder, view_name+'.sql')
-            with open(db_view_sql_output_path, 'w') as f:
-                f.write(query)
-
-    def write_outputs(self, selection_cfg_file: Optional[str], run_query: bool) -> None:
-        # create output folder if it doesn't exist
-        if not os.path.exists(self.output_folder):
-            os.makedirs(self.output_folder)
-
-        # clear everything in output folder
-        files = os.listdir(self.output_folder)
-        for file in files:
-            file_path = _utils.join_paths(self.output_folder, file)
-            os.remove(file_path)
-
-        # apply selections and render outputs
-        selections = self._get_selections(selection_cfg_file)
-        result = self.renderer.load_results(selections, run_query)
-        param_set, query_by_db_view, final_view_query, df_by_db_views, final_view_df = result
-
-        # write the parameters response
-        param_set_dict = param_set.to_json_dict()
-        parameter_json_output_path = _utils.join_paths(self.output_folder, c.PARAMETERS_OUTPUT)
-        with open(parameter_json_output_path, 'w') as f:
-            json.dump(param_set_dict, f, indent=4)
-
-        # write the rendered sql queries for database views
-        for db_view, query in query_by_db_view.items():
-            self._write_sql_file(db_view, query)
-
-        # write the rendered sql query for final view
-        if final_view_query not in query_by_db_view:
-            self._write_sql_file(c.FINAL_VIEW_OUT_STEM, final_view_query)
-
-        # Run the sql queries and write output
-        if run_query:
-            for db_view, df in df_by_db_views.items():
-                csv_file = _utils.join_paths(self.output_folder, db_view+'.csv')
-                df.to_csv(csv_file, index=False)
-
-            final_csv_path = _utils.join_paths(self.output_folder, c.FINAL_VIEW_OUT_STEM+'.csv')
-            final_view_df.to_csv(final_csv_path, index=False)
-
-            final_json_path = _utils.join_paths(self.output_folder, c.FINAL_VIEW_OUT_STEM+'.json')
-            final_view_df.to_json(final_json_path, orient='table', index=False, indent=4)
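
For orientation, the core of the deleted Renderer is a two-step pipeline per SQL database view: render the Jinja template with the prms/ctx/args dictionaries (as _render_query_from_raw does), then execute the rendered SQL, either against a database connection or against other dataframes loaded into an in-memory SQLite database (as sqldf does). A minimal standalone sketch of that pattern; the template, context value, and table below are illustrative and not part of the package:

import sqlite3
import pandas as pd
from jinja2 import Environment

# Step 1: render a Jinja SQL template with the same argument names the Renderer passed (prms, ctx, args)
raw_query = 'SELECT category, SUM(amount) AS total FROM transactions WHERE amount >= {{ ctx["min_amount"] }} GROUP BY category'
sql = Environment().from_string(raw_query).render({"prms": {}, "ctx": {"min_amount": 10}, "args": {}})

# Step 2: execute the rendered SQL over dataframes loaded into an in-memory SQLite database
conn = sqlite3.connect(":memory:")
try:
    pd.DataFrame({"category": ["a", "b"], "amount": [5, 20]}).to_sql("transactions", conn, index=False)
    print(pd.read_sql(sql, conn))
finally:
    conn.close()

Judging from the file listing above, this responsibility appears to move to squirrels/_models.py and the models/dbviews and models/federates folders in 0.2.0.dev0.
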
squirrels/_timed_imports.py
DELETED
@@ -1,37 +0,0 @@
-from typing import Dict, List
-import time
-
-
-class Timer:
-    def __init__(self, verbose: bool = False):
-        self.times: Dict[str, List[float]] = dict()
-        self.verbose = verbose
-
-    def add_activity_time(self, activity: str, start: float):
-        if self.verbose:
-            time_taken = (time.time()-start) * 10**3
-            times_list = self.times.setdefault(activity, list())
-            times_list.append(time_taken)
-            print(f'Time taken for "{activity}": {time_taken}ms')
-
-    def report_times(self):
-        if self.verbose:
-            for activity, times_list in self.times.items():
-                total_time = sum(times_list)
-                avg_time = total_time / len(times_list)
-                print()
-                print(f'Time statistics for "{activity}":')
-                print(f'  Total time: {total_time}ms')
-                print(f'  Average time: {avg_time}ms')
-
-timer = Timer()
-
-
-start = time.time()
-import pandas
-from pandas.api import types as pd_types
-timer.add_activity_time("import pandas", start)
-
-start = time.time()
-import jinja2
-timer.add_activity_time("import jinja", start)
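
The Renderer above wraps each phase with this Timer (via timer.add_activity_time). A small usage sketch, assuming the Timer class from the deleted module is in scope; the activity name and workload are illustrative:

import time

timer = Timer(verbose=True)  # verbose=False (the default) turns all timing into a no-op

start = time.time()
sum(range(1_000_000))  # stand-in for a timed activity
timer.add_activity_time("example activity", start)  # records and prints the elapsed milliseconds
timer.report_times()  # prints total and average time per recorded activity

The new squirrels/_timer.py added in this release appears to take over this role.
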
squirrels/connection_set.py
DELETED
@@ -1,126 +0,0 @@
-from typing import Dict, Union
-from importlib.machinery import SourceFileLoader
-from sqlalchemy import Engine, Pool
-import sqlite3
-
-from squirrels import _constants as c, _manifest as mf
-from squirrels._timed_imports import pandas as pd
-from squirrels._utils import ConfigurationError
-
-ConnectionPool = Union[Engine, Pool]
-
-
-class ConnectionSet:
-    def __init__(self, conn_pools: Dict[str, ConnectionPool]) -> None:
-        """
-        Constructor for ConnectionSet, a wrapper class around a collection of Connection Pools or Sqlalchemy Engines
-
-        Parameters:
-            conn_pools: A dictionary of connection pool name to the corresponding Pool or Engine from sqlalchemy
-        """
-        self._conn_pools = conn_pools
-
-    def get_connection_pool(self, conn_name: str = "default") -> ConnectionPool:
-        """
-        Gets to sqlalchemy Pool or Engine from the database connection name
-
-        Parameters:
-            conn_name: Name of Pool or Engine. If not provided, defaults to "default"
-
-        Returns:
-            A sqlalchemy Pool or Engine
-        """
-        try:
-            connection_pool = self._conn_pools[conn_name]
-        except KeyError as e:
-            raise ConfigurationError(f'Connection name "{conn_name}" was not configured') from e
-        return connection_pool
-
-    def __getitem__(self, conn_name: str) -> ConnectionPool:
-        """
-        Same as get_connection_pool
-        """
-        return self.get_connection_pool(conn_name)
-
-    def get_dataframe_from_query(self, conn_name: str, query: str) -> pd.DataFrame:
-        """
-        Runs a SQL query on a database connection name, and returns the results as pandas DataFrame
-
-        Parameters:
-            conn_name: Name of Pool or Engine
-            query: The SQL query to run
-
-        Returns:
-            A pandas DataFrame
-        """
-        connector = self.get_connection_pool(conn_name)
-        if isinstance(connector, Pool):
-            conn = connector.connect()
-        elif isinstance(connector, Engine):
-            conn = connector.raw_connection()
-        else:
-            raise TypeError(f'Type for connection name "{conn_name}" not supported')
-
-        try:
-            cur = conn.cursor()
-            cur.execute(query)
-            df = pd.DataFrame(data=cur.fetchall(), columns=[x[0] for x in cur.description])
-        finally:
-            conn.close()
-
-        return df
-
-    def _dispose(self) -> None:
-        """
-        Disposes of all the connection pools in this ConnectionSet
-        """
-        for pool in self._conn_pools.values():
-            pool.dispose()
-
-
-def _from_file(manifest: mf.Manifest) -> ConnectionSet:
-    """
-    Takes the DB Connections from both the squirrels.yaml and connections.py files and merges them
-    into a single ConnectionSet
-
-    Parameters:
-        manifest: The object of Manifest class, the interface for the squirrels.yaml file
-
-    Returns:
-        A ConnectionSet with the DB connections from both squirrels.yaml and connections.py
-    """
-    connections = manifest.get_db_connections()
-    try:
-        module = SourceFileLoader(c.CONNECTIONS_FILE, c.CONNECTIONS_FILE).load_module()
-    except FileNotFoundError:
-        module = None
-
-    if module is not None:
-        proj_vars = manifest.get_proj_vars()
-        try:
-            conn_from_py_file = module.main(proj_vars)
-        except Exception as e:
-            raise ConfigurationError(f'Error in the {c.CONNECTIONS_FILE} file') from e
-    else:
-        conn_from_py_file = {}
-    return ConnectionSet({**connections, **conn_from_py_file})
-
-
-def sqldf(query: str, df_by_db_views: Dict[str, pd.DataFrame]) -> pd.DataFrame:
-    """
-    Uses a dictionary of dataframes to execute a SQL query in an in-memory sqlite database
-
-    Parameters:
-        query: The SQL query to run using sqlite
-        df_by_db_views: A dictionary of table names to their pandas Dataframe
-
-    Returns:
-        The result as a pandas Dataframe from running the query
-    """
-    conn = sqlite3.connect(":memory:")
-    try:
-        for db_view, df in df_by_db_views.items():
-            df.to_sql(db_view, conn, index=False)
-        return pd.read_sql(query, conn)
-    finally:
-        conn.close()
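
The sqldf helper above is what let a dataset's final view query the database views as if they were tables. A self-contained sketch of the same pattern without squirrels; the function name run_sql_on_dataframes and the sample data are illustrative only:

from typing import Dict
import sqlite3
import pandas as pd

def run_sql_on_dataframes(query: str, df_by_table: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    # Load each dataframe into an in-memory SQLite database as a table, then run the query against those tables
    conn = sqlite3.connect(":memory:")
    try:
        for table_name, df in df_by_table.items():
            df.to_sql(table_name, conn, index=False)
        return pd.read_sql(query, conn)
    finally:
        conn.close()

expenses = pd.DataFrame({"category": ["food", "food", "rent"], "amount": [10, 20, 800]})
print(run_sql_on_dataframes("SELECT category, SUM(amount) AS total FROM expenses GROUP BY category", {"expenses": expenses}))

Per the file listing above, connection handling in 0.2.0.dev0 lives in squirrels/_connection_set.py and pyconfigs/connections.py instead.
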
squirrels/package_data/base_project/connections.py
DELETED
@@ -1,20 +0,0 @@
-from typing import Dict, Union, Any
-from sqlalchemy import create_engine, Engine, Pool, QueuePool
-
-from squirrels import get_credential
-
-
-# Note: all connections must be shareable across multiple thread. No writes will occur on them
-def main(proj: Dict[str, Any], *p_args, **kwargs) -> Dict[str, Union[Engine, Pool]]:
-
-    ## Example of getting the username and password set with "$ squirrels set-credential [key]"
-    # cred = get_credential('my_key') # then use cred.username and cred.password to access the username and password
-
-    # Create a connection pool / engine
-    pool = create_engine('sqlite:///./database/sample_database.db')
-
-    ## Example of using QueuePool instead for a custom db connector:
-    # connection_creator = lambda: sqlite3.connect('./database/sample_database.db', check_same_thread=False)
-    # pool = QueuePool(connection_creator)
-
-    return {'default': pool}
squirrels/package_data/base_project/datasets/sample_dataset/context.py
DELETED
@@ -1,22 +0,0 @@
-from typing import Dict, Any
-import squirrels as sr
-
-
-def main(prms: Dict[str, sr.Parameter], args: Dict[str, Any], *p_args, **kwargs) -> Dict[str, Any]:
-    group_by_param: sr.SingleSelectParameter = prms["group_by"]
-    start_date_param: sr.DateParameter = prms["start_date"]
-    end_date_param: sr.DateParameter = prms["end_date"]
-    category_param: sr.MultiSelectParameter = prms["category"]
-    subcategory_param: sr.MultiSelectParameter = prms["subcategory"]
-    min_amount_filter: sr.NumberParameter = prms["min_filter"]
-    max_amount_filter: sr.NumberParameter = prms["max_filter"]
-
-    return {
-        "group_by_cols": group_by_param.get_selected("columns"),
-        "start_date": start_date_param.get_selected_date_quoted(),
-        "end_date": end_date_param.get_selected_date_quoted(),
-        "categories": category_param.get_selected_labels_quoted_joined(),
-        "subcategories": subcategory_param.get_selected_labels_quoted_joined(),
-        "min_amount": min_amount_filter.get_selected_value(),
-        "max_amount": max_amount_filter.get_selected_value()
-    }
squirrels/package_data/base_project/datasets/sample_dataset/database_view1.py
DELETED
@@ -1,29 +0,0 @@
-from typing import Dict, Any
-from sqlalchemy import text
-import pandas as pd
-
-import squirrels as sr
-
-
-def main(connection_set: sr.ConnectionSet,
-         prms: Dict[str, sr.Parameter], ctx: Dict[str, Any], args: Dict[str, Any],
-         *p_args, **kwargs) -> pd.DataFrame:
-
-    query = f"""
-        SELECT {ctx["group_by_cols"]}
-            , sum(-Amount) as Total_Amount
-        FROM transactions
-        WHERE Category IN ({ctx["categories"]})
-            AND Subcategory IN ({ctx["subcategories"]})
-            AND "Date" >= {ctx["start_date"]}
-            AND "Date" <= {ctx["end_date"]}
-            AND -Amount >= {ctx["min_amount"]}
-            AND -Amount <= {ctx["max_amount"]}
-        GROUP BY {ctx["group_by_cols"]}
-    """
-
-    engine = connection_set.get_connection_pool("default")
-    conn = engine.raw_connection()
-    df = pd.read_sql(query, conn)
-    conn.close()
-    return df
squirrels/package_data/base_project/datasets/sample_dataset/database_view1.sql.j2
DELETED
@@ -1,12 +0,0 @@
--- %USE some_db -- TBA: this line is optional when connecting to the "default" db_connection
-
-SELECT {{ prms["group_by"].get_selected("columns") }} -- {{ ctx["group_by_cols"] }}
-    , sum(-Amount) as Total_Amount
-FROM transactions
-WHERE Category IN ({{ prms["category"].get_selected_labels_quoted_joined() }}) -- ({{ ctx["categories"] }})
-    AND Subcategory IN ({{ prms["subcategory"].get_selected_labels_quoted_joined() }}) -- ({{ ctx["subcategories"] }})
-    AND "Date" >= {{ prms["start_date"].get_selected_date_quoted() }} -- {{ ctx["start_date"] }}
-    AND "Date" <= {{ prms["end_date"].get_selected_date_quoted() }} -- {{ ctx["end_date"] }}
-    AND -Amount >= {{ prms["min_filter"].get_selected_value() }} -- {{ ctx["min_amount"] }}
-    AND -Amount <= {{ prms["max_filter"].get_selected_value() }} -- {{ ctx["max_amount"] }}
-GROUP BY {{ prms["group_by"].get_selected("columns") }} -- {{ ctx["group_by_cols"] }}
squirrels/package_data/base_project/datasets/sample_dataset/final_view.py
DELETED
@@ -1,11 +0,0 @@
-from typing import Dict, Any
-import pandas as pd
-import squirrels as sr
-
-
-def main(database_views: Dict[str, pd.DataFrame],
-         prms: Dict[str, sr.Parameter], ctx: Dict[str, Any], args: Dict[str, Any],
-         *p_args, **kwargs) -> pd.DataFrame:
-    df = database_views['database_view1']
-    dim_cols = [x.strip() for x in ctx["group_by_cols"].split(",")]
-    return df.sort_values(dim_cols)
squirrels/package_data/base_project/datasets/sample_dataset/parameters.py
DELETED
@@ -1,47 +0,0 @@
-from typing import Dict, Sequence, Any
-import squirrels as sr
-
-
-def main(args: Dict[str, Any], *p_args, **kwargs) -> Sequence[sr.Parameter]:
-
-    ## Example of creating SingleSelectParameter (similar for MultiSelectParameter)
-    group_by_options = [
-        sr.SelectParameterOption("g0", "Transaction", columns="ID,Date"),
-        sr.SelectParameterOption("g1", "Date", columns="Date"),
-        sr.SelectParameterOption("g2", "Category", columns="Category"),
-        sr.SelectParameterOption("g3", "Subcategory", columns="Category,Subcategory"),
-    ]
-    group_by_param = sr.SingleSelectParameter("group_by", "Group By", group_by_options)
-
-    ## Example of creating DateParameter
-    start_date_param = sr.DateParameter("start_date", "Start Date", "2023-01-01")
-
-    ## Example of creating DateParameter from lookup query/table
-    end_date_ds = sr.DateDataSource("SELECT max(Date) as date FROM transactions", "date")
-    end_date_param = sr.DataSourceParameter(sr.DateParameter, "end_date", "End Date", end_date_ds)
-
-    ## Example of creating MultiSelectParameter from lookup query/table
-    category_ds = sr.SelectionDataSource("SELECT DISTINCT Category_ID, Category FROM categories", "Category_ID", "Category")
-    category_filter = sr.DataSourceParameter(sr.MultiSelectParameter, "category", "Category Filter", category_ds)
-
-    ## Example of creating MultiSelectParameter with parent from lookup query/table
-    subcategory_ds = sr.SelectionDataSource("categories", "Subcategory_ID", "Subcategory", parent_id_col="Category_ID")
-    subcategory_filter = sr.DataSourceParameter(sr.MultiSelectParameter, "subcategory", "Subcategory Filter", subcategory_ds, parent=category_filter)
-
-    ## Example of creating NumberParameter
-    min_amount_filter = sr.NumberParameter("min_filter", "Amounts Greater Than", 0, 500, 10)
-
-    ## Example of creating NumberParameter from lookup query/table
-    query = """
-        SELECT 0 as min_value, max(-Amount) as max_value, 10 as increment \
-        FROM transactions WHERE Category <> 'Income'
-    """
-    max_amount_ds = sr.NumberDataSource(query, "min_value", "max_value", "increment", default_value_col="max_value")
-    max_amount_filter = sr.DataSourceParameter(sr.NumberParameter, "max_filter", "Amounts Less Than", max_amount_ds)
-
-    return [
-        group_by_param,
-        start_date_param, end_date_param,
-        category_filter, subcategory_filter,
-        min_amount_filter, max_amount_filter
-    ]
squirrels/package_data/base_project/squirrels.yaml
DELETED
@@ -1,22 +0,0 @@
-modules: []
-
-project_variables:
-  product: sample
-  major_version: 1
-  minor_version: 0
-
-db_connections:
-  default:
-    credential_key: null
-    url: 'sqlite://${username}:${password}@/./database/sample_database.db'
-
-datasets:
-  sample_dataset:
-    label: Sample Dataset
-    database_views:
-      database_view1:
-        file: database_view1.sql.j2
-        db_connection: default
-    final_view: database_view1
-
-settings: {}