squirrels 0.1.0__py3-none-any.whl → 0.6.0.post0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dateutils/__init__.py +6 -0
- dateutils/_enums.py +25 -0
- squirrels/dateutils.py → dateutils/_implementation.py +409 -380
- dateutils/types.py +6 -0
- squirrels/__init__.py +21 -18
- squirrels/_api_routes/__init__.py +5 -0
- squirrels/_api_routes/auth.py +337 -0
- squirrels/_api_routes/base.py +196 -0
- squirrels/_api_routes/dashboards.py +156 -0
- squirrels/_api_routes/data_management.py +148 -0
- squirrels/_api_routes/datasets.py +220 -0
- squirrels/_api_routes/project.py +289 -0
- squirrels/_api_server.py +552 -134
- squirrels/_arguments/__init__.py +0 -0
- squirrels/_arguments/init_time_args.py +83 -0
- squirrels/_arguments/run_time_args.py +111 -0
- squirrels/_auth.py +777 -0
- squirrels/_command_line.py +239 -107
- squirrels/_compile_prompts.py +147 -0
- squirrels/_connection_set.py +94 -0
- squirrels/_constants.py +141 -64
- squirrels/_dashboards.py +179 -0
- squirrels/_data_sources.py +570 -0
- squirrels/_dataset_types.py +91 -0
- squirrels/_env_vars.py +209 -0
- squirrels/_exceptions.py +29 -0
- squirrels/_http_error_responses.py +52 -0
- squirrels/_initializer.py +319 -110
- squirrels/_logging.py +121 -0
- squirrels/_manifest.py +357 -187
- squirrels/_mcp_server.py +578 -0
- squirrels/_model_builder.py +69 -0
- squirrels/_model_configs.py +74 -0
- squirrels/_model_queries.py +52 -0
- squirrels/_models.py +1201 -0
- squirrels/_package_data/base_project/.env +7 -0
- squirrels/_package_data/base_project/.env.example +44 -0
- squirrels/_package_data/base_project/connections.yml +16 -0
- squirrels/_package_data/base_project/dashboards/dashboard_example.py +40 -0
- squirrels/_package_data/base_project/dashboards/dashboard_example.yml +22 -0
- squirrels/_package_data/base_project/docker/.dockerignore +16 -0
- squirrels/_package_data/base_project/docker/Dockerfile +16 -0
- squirrels/_package_data/base_project/docker/compose.yml +7 -0
- squirrels/_package_data/base_project/duckdb_init.sql +10 -0
- squirrels/_package_data/base_project/gitignore +13 -0
- squirrels/_package_data/base_project/macros/macros_example.sql +17 -0
- squirrels/_package_data/base_project/models/builds/build_example.py +26 -0
- squirrels/_package_data/base_project/models/builds/build_example.sql +16 -0
- squirrels/_package_data/base_project/models/builds/build_example.yml +57 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.sql +17 -0
- squirrels/_package_data/base_project/models/dbviews/dbview_example.yml +32 -0
- squirrels/_package_data/base_project/models/federates/federate_example.py +51 -0
- squirrels/_package_data/base_project/models/federates/federate_example.sql +21 -0
- squirrels/_package_data/base_project/models/federates/federate_example.yml +65 -0
- squirrels/_package_data/base_project/models/sources.yml +38 -0
- squirrels/_package_data/base_project/parameters.yml +142 -0
- squirrels/_package_data/base_project/pyconfigs/connections.py +19 -0
- squirrels/_package_data/base_project/pyconfigs/context.py +96 -0
- squirrels/_package_data/base_project/pyconfigs/parameters.py +141 -0
- squirrels/_package_data/base_project/pyconfigs/user.py +56 -0
- squirrels/_package_data/base_project/resources/expenses.db +0 -0
- squirrels/_package_data/base_project/resources/public/.gitkeep +0 -0
- squirrels/_package_data/base_project/resources/weather.db +0 -0
- squirrels/_package_data/base_project/seeds/seed_categories.csv +6 -0
- squirrels/_package_data/base_project/seeds/seed_categories.yml +15 -0
- squirrels/_package_data/base_project/seeds/seed_subcategories.csv +15 -0
- squirrels/_package_data/base_project/seeds/seed_subcategories.yml +21 -0
- squirrels/_package_data/base_project/squirrels.yml.j2 +61 -0
- squirrels/_package_data/base_project/tmp/.gitignore +2 -0
- squirrels/_package_data/templates/login_successful.html +53 -0
- squirrels/_package_data/templates/squirrels_studio.html +22 -0
- squirrels/_package_loader.py +29 -0
- squirrels/_parameter_configs.py +592 -0
- squirrels/_parameter_options.py +348 -0
- squirrels/_parameter_sets.py +207 -0
- squirrels/_parameters.py +1703 -0
- squirrels/_project.py +796 -0
- squirrels/_py_module.py +122 -0
- squirrels/_request_context.py +33 -0
- squirrels/_schemas/__init__.py +0 -0
- squirrels/_schemas/auth_models.py +83 -0
- squirrels/_schemas/query_param_models.py +70 -0
- squirrels/_schemas/request_models.py +26 -0
- squirrels/_schemas/response_models.py +286 -0
- squirrels/_seeds.py +97 -0
- squirrels/_sources.py +112 -0
- squirrels/_utils.py +540 -149
- squirrels/_version.py +1 -3
- squirrels/arguments.py +7 -0
- squirrels/auth.py +4 -0
- squirrels/connections.py +3 -0
- squirrels/dashboards.py +3 -0
- squirrels/data_sources.py +14 -282
- squirrels/parameter_options.py +13 -189
- squirrels/parameters.py +14 -801
- squirrels/types.py +18 -0
- squirrels-0.6.0.post0.dist-info/METADATA +148 -0
- squirrels-0.6.0.post0.dist-info/RECORD +101 -0
- {squirrels-0.1.0.dist-info → squirrels-0.6.0.post0.dist-info}/WHEEL +1 -2
- {squirrels-0.1.0.dist-info → squirrels-0.6.0.post0.dist-info}/entry_points.txt +1 -0
- squirrels-0.6.0.post0.dist-info/licenses/LICENSE +201 -0
- squirrels/_credentials_manager.py +0 -87
- squirrels/_module_loader.py +0 -37
- squirrels/_parameter_set.py +0 -151
- squirrels/_renderer.py +0 -286
- squirrels/_timed_imports.py +0 -37
- squirrels/connection_set.py +0 -126
- squirrels/package_data/base_project/.gitignore +0 -4
- squirrels/package_data/base_project/connections.py +0 -21
- squirrels/package_data/base_project/database/sample_database.db +0 -0
- squirrels/package_data/base_project/database/seattle_weather.db +0 -0
- squirrels/package_data/base_project/datasets/sample_dataset/context.py +0 -8
- squirrels/package_data/base_project/datasets/sample_dataset/database_view1.py +0 -23
- squirrels/package_data/base_project/datasets/sample_dataset/database_view1.sql.j2 +0 -7
- squirrels/package_data/base_project/datasets/sample_dataset/final_view.py +0 -10
- squirrels/package_data/base_project/datasets/sample_dataset/final_view.sql.j2 +0 -2
- squirrels/package_data/base_project/datasets/sample_dataset/parameters.py +0 -30
- squirrels/package_data/base_project/datasets/sample_dataset/selections.cfg +0 -6
- squirrels/package_data/base_project/squirrels.yaml +0 -26
- squirrels/package_data/static/favicon.ico +0 -0
- squirrels/package_data/static/script.js +0 -234
- squirrels/package_data/static/style.css +0 -110
- squirrels/package_data/templates/index.html +0 -32
- squirrels-0.1.0.dist-info/LICENSE +0 -22
- squirrels-0.1.0.dist-info/METADATA +0 -67
- squirrels-0.1.0.dist-info/RECORD +0 -40
- squirrels-0.1.0.dist-info/top_level.txt +0 -1
squirrels/_seeds.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
import time
|
|
5
|
+
import glob
|
|
6
|
+
import json
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
|
|
10
|
+
from ._exceptions import ConfigurationError
|
|
11
|
+
from . import _utils as u, _constants as c, _model_configs as mc
|
|
12
|
+
from ._env_vars import SquirrelsEnvVars
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class Seed:
    """A seed's configuration paired with its lazily evaluated data.

    When ``config.cast_column_types`` is truthy, every column listed in
    ``config.columns`` is cast to the polars dtype named by its ``type``
    right after the instance is created.
    """

    config: mc.SeedConfig
    df: pl.LazyFrame

    def __post_init__(self):
        """Cast the configured columns of ``df`` when casting is enabled.

        Raises:
            ConfigurationError: If a configured column type is neither a
                decimal type nor a key of ``u.sqrl_dtypes_to_polars_dtypes``.
        """
        if self.config.cast_column_types:
            exprs = []
            for col_config in self.config.columns:
                # Type names are matched case-insensitively.
                col_type = col_config.type.lower()
                if col_type.startswith("decimal"):
                    # Decimal types may carry "(precision, scale)" and so
                    # cannot be resolved through the plain lookup table.
                    polars_dtype = self._parse_decimal_type(col_type)
                else:
                    try:
                        polars_dtype = u.sqrl_dtypes_to_polars_dtypes[col_type]
                    except KeyError as e:
                        raise ConfigurationError(f"Unknown column type: '{col_type}'") from e

                exprs.append(pl.col(col_config.name).cast(polars_dtype))

            self.df = self.df.with_columns(*exprs)

    @staticmethod
    def _parse_decimal_type(col_type: str) -> pl.Decimal:
        """Parse a decimal type string and return the matching polars Decimal type.

        Supports formats: "decimal" or "decimal(precision, scale)". A bare
        "decimal" maps to precision 18 and scale 2.

        Args:
            col_type: The lower-cased column type string.

        Returns:
            The polars ``Decimal`` dtype with the parsed precision and scale.

        Raises:
            ConfigurationError: If ``col_type`` is in neither supported format.
        """
        # fullmatch (rather than match) rejects trailing garbage such as
        # "decimal(10, 2)xyz"; trailing whitespace is still tolerated.
        match = re.fullmatch(r"decimal\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)\s*", col_type)
        if match:
            precision = int(match.group(1))
            scale = int(match.group(2))
            return pl.Decimal(precision=precision, scale=scale)

        if col_type == "decimal":
            return pl.Decimal(precision=18, scale=2)

        raise ConfigurationError(f"Unknown column type: '{col_type}'")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class Seeds:
    """Container mapping string keys to ``Seed`` objects."""

    _data: dict[str, Seed]

    def run_query(self, sql_query: str) -> pl.DataFrame:
        """Run ``sql_query`` via ``u.run_sql_on_dataframes`` over the seeds' frames.

        Each seed's ``df`` is passed under the same key it has in ``_data``.
        """
        frames_by_key = {}
        for key, seed in self._data.items():
            frames_by_key[key] = seed.df
        result = u.run_sql_on_dataframes(sql_query, frames_by_key)
        return result

    def get_dataframes(self) -> dict[str, Seed]:
        """Return a shallow copy of the key-to-``Seed`` mapping."""
        snapshot = self._data.copy()
        return snapshot
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class SeedsIO:
    """Builds a ``Seeds`` collection from the CSV files of a project."""

    @classmethod
    def load_files(cls, logger: u.Logger, env_vars: SquirrelsEnvVars) -> Seeds:
        """Read every CSV file under the project's seeds folder.

        For each ``<name>.csv`` found (recursively), a sibling ``<name>.yml``
        is loaded as the seed's configuration when it exists; otherwise the
        ``mc.SeedConfig`` defaults apply. Seeds are keyed by the CSV file
        name without its extension.

        Args:
            logger: Receives the elapsed-time record for this load.
            env_vars: Supplies the project path and the seed CSV settings.

        Returns:
            The ``Seeds`` collection of all files that were read.
        """
        start = time.time()
        infer_schema_setting: bool = env_vars.seeds_infer_schema
        na_values_setting: list[str] = env_vars.seeds_na_values
        csv_pattern = os.path.join(env_vars.project_path, c.SEEDS_FOLDER, '**/*.csv')

        loaded = {}
        for csv_path in glob.glob(csv_pattern, recursive=True):
            yml_path = os.path.splitext(csv_path)[0] + '.yml'
            if os.path.exists(yml_path):
                raw_config = u.load_yaml_config(yml_path)
            else:
                raw_config = {}
            seed_config = mc.SeedConfig(**raw_config)

            # Schema inference is turned off whenever the seed's columns are
            # going to be cast explicitly from its configuration.
            if seed_config.cast_column_types:
                infer_schema = False
            else:
                infer_schema = infer_schema_setting

            frame = pl.read_csv(
                csv_path,
                try_parse_dates=True,
                infer_schema=infer_schema,
                null_values=na_values_setting,
            )

            seed_name = os.path.splitext(os.path.basename(csv_path))[0]
            loaded[seed_name] = Seed(seed_config, frame.lazy())

        seeds = Seeds(loaded)
        logger.log_activity_time("loading seed files", start)
        return seeds
|
squirrels/_sources.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
from pydantic import BaseModel, Field, model_validator
|
|
3
|
+
import time, yaml
|
|
4
|
+
|
|
5
|
+
from . import _utils as u, _constants as c, _model_configs as mc
|
|
6
|
+
from ._env_vars import SquirrelsEnvVars
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class UpdateHints(BaseModel):
    # Optional hints attached to a source (see the ``update_hints`` field of
    # ``Source``). All three fields have defaults, so an empty mapping is valid.

    # Name of the increasing column, or None when no such column is declared.
    increasing_column: str | None = None

    strictly_increasing: bool = Field(
        default=True,
        description="Delete the max value of the increasing column, ignored if selective_overwrite_value is set",
    )

    selective_overwrite_value: Any = Field(
        default=None,
        description="Delete all values of the increasing column greater than or equal to this value",
    )
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Source(mc.ConnectionInterface, mc.ModelConfig):
    # Configuration of a single source table. Fields and helpers not declared
    # here (for example ``columns``) come from the two ``mc`` base classes.
    table: str | None = Field(default=None)
    load_to_vdl: bool = Field(default=False, description="Whether to load the data to the 'virtual data lake' (VDL)")
    primary_key: list[str] = Field(default_factory=list)
    update_hints: UpdateHints = Field(default_factory=UpdateHints)

    def finalize_table(self, source_name: str):
        """Fall back to ``source_name`` as the table name when none is set.

        Returns:
            This instance, so calls can be chained.
        """
        if self.table is None:
            self.table = source_name
        return self

    def get_table(self) -> str:
        """Return the table name; it must already be set."""
        assert self.table is not None, "Table must be set"
        return self.table

    def get_cols_for_create_table_stmt(self) -> str:
        """Return the columns as a comma-separated list of ``name type`` pairs."""
        return ", ".join(f"{col.name} {col.type}" for col in self.columns)

    def get_max_incr_col_query(self, source_name: str) -> str:
        """Return a query selecting the max of the increasing column from ``source_name``."""
        return f"SELECT max({self.update_hints.increasing_column}) FROM {source_name}"

    def get_query_for_upsert(self, dialect: str, conn_name: str, table_name: str, max_value_of_increasing_col: Any | None, *, full_refresh: bool = True) -> str:
        """Build the SELECT statement that reads rows from the source table.

        Args:
            dialect: Currently unused; kept for the pushdown idea noted below.
            conn_name: Connection name; the table is read from ``db_<conn_name>``.
            table_name: Name of the table to read from.
            max_value_of_increasing_col: Upper bound of the increasing column
                that has already been seen, or None when there is none.
            full_refresh: When true, all rows are selected.

        Returns:
            A query over all rows when ``full_refresh`` is true or no max value
            is given; otherwise a query restricted to rows whose increasing
            column is greater than ``max_value_of_increasing_col``.

        Raises:
            ConfigurationError: If an incremental query is requested but the
                increasing column is not one of the source's columns.
        """
        select_cols = ", ".join(col.name for col in self.columns)
        base_query = f"SELECT {select_cols} FROM db_{conn_name}.{table_name}"
        if full_refresh or max_value_of_increasing_col is None:
            return base_query

        increasing_col = self.update_hints.increasing_column
        matching_cols = [col for col in self.columns if col.name == increasing_col]
        if not matching_cols:
            # Fail with a clear message instead of a bare StopIteration.
            raise u.ConfigurationError(
                f"Increasing column '{increasing_col}' is not defined in the columns of the source"
            )
        increasing_col_type = matching_cols[0].type

        # The value is embedded in a single-quoted SQL literal, so any single
        # quote inside it must be doubled to keep the statement well-formed.
        escaped_max_value = str(max_value_of_increasing_col).replace("'", "''")
        where_cond = f"{increasing_col}::{increasing_col_type} > '{escaped_max_value}'::{increasing_col_type}"

        # TODO: figure out if using pushdown query is worth it
        # if dialect in ['postgres', 'mysql']:
        #     pushdown_query = f"SELECT {select_cols} FROM {table_name} WHERE {where_cond}"
        #     transpiled_query = sqlglot.transpile(pushdown_query, read='duckdb', write=dialect)[0].replace("'", "''")
        #     return f"FROM {dialect}_query('db_{conn_name}', '{transpiled_query}')"

        return f"{base_query} WHERE {where_cond}"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class Sources(BaseModel):
    # All configured sources, keyed by source name.
    sources: dict[str, Source] = Field(default_factory=dict)

    @model_validator(mode="before")
    @classmethod
    def convert_sources_list_to_dict(cls, data: dict[str, Any]) -> dict[str, Any]:
        """Turn a list of named sources into a mapping keyed by name.

        The raw input is left untouched: the converted structure is built from
        fresh dictionaries. Input that is not a dict, or whose "sources" entry
        is not a list, is returned unchanged.

        Raises:
            ConfigurationError: If a list entry has no "name" field or if two
                entries share the same name.
        """
        if not isinstance(data, dict) or not isinstance(data.get("sources"), list):
            return data

        sources_dict: dict[str, Any] = {}
        for source in data["sources"]:
            if not (isinstance(source, dict) and "name" in source):
                raise u.ConfigurationError("All sources must have a name field in sources file")

            name = source["name"]
            if name in sources_dict:
                raise u.ConfigurationError(f"Duplicate source name found: {name}")

            # The name becomes the key, so it is left out of the source config.
            sources_dict[name] = {key: value for key, value in source.items() if key != "name"}

        return {**data, "sources": sources_dict}

    @model_validator(mode="after")
    def validate_column_types(self):
        """Require every column of every source to have a type.

        Raises:
            ConfigurationError: If a column's type is missing or empty.
        """
        for source_name, source in self.sources.items():
            for col in source.columns:
                if not col.type:
                    raise u.ConfigurationError(f"Column '{col.name}' in source '{source_name}' must have a type specified")
        return self

    def finalize_null_fields(self, env_vars: SquirrelsEnvVars):
        """Finalize the connection and table of every source.

        The default connection name is read from ``env_vars``, and each
        source's own name is passed to ``finalize_table``.

        Returns:
            This instance, so calls can be chained.
        """
        default_conn_name = env_vars.connections_default_name_used
        for source_name, source in self.sources.items():
            source.finalize_connection(default_conn_name=default_conn_name)
            source.finalize_table(source_name)
        return self
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class SourcesIO:
    """Loads the project's sources file into a ``Sources`` model."""

    @classmethod
    def load_file(cls, logger: u.Logger, env_vars: SquirrelsEnvVars, env_vars_unformatted: dict[str, str]) -> Sources:
        """Read, render and parse the sources file of the project.

        The file content is passed through ``u.render_string`` before being
        parsed as YAML. A missing or empty file yields a ``Sources`` model
        built from an empty mapping.

        Args:
            logger: Receives the elapsed-time record for this load.
            env_vars: Supplies the project path and default connection name.
            env_vars_unformatted: Passed to the renderer as ``env_vars``.

        Returns:
            The ``Sources`` model after ``finalize_null_fields`` has run.

        Raises:
            ConfigurationError: If the parsed YAML content is not a dictionary.
        """
        start = time.time()

        sources_data = {}
        sources_path = u.Path(env_vars.project_path, c.MODELS_FOLDER, c.SOURCES_FILE)
        if sources_path.exists():
            file_text = u.read_file(sources_path)
            rendered_text = u.render_string(
                file_text,
                project_path=env_vars.project_path,
                env_vars=env_vars_unformatted,
            )
            # An empty document parses to None, which is treated as no sources.
            sources_data = yaml.safe_load(rendered_text) or {}

        if not isinstance(sources_data, dict):
            raise u.ConfigurationError(
                f"Parsed content from YAML file must be a dictionary. Got: {sources_data}"
            )

        model = Sources(**sources_data)
        sources = model.finalize_null_fields(env_vars)

        logger.log_activity_time("loading sources", start)
        return sources
|