sibi-dst 0.3.44__py3-none-any.whl → 0.3.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +38 -0
- sibi_dst/{df_helper → v1/df_helper}/_artifact_updater_multi_wrapper.py +1 -1
- sibi_dst/{df_helper → v1/df_helper}/_df_helper.py +3 -3
- sibi_dst/{df_helper → v1/df_helper}/_parquet_artifact.py +3 -3
- sibi_dst/{df_helper → v1/df_helper}/_parquet_reader.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/django/_load_from_db.py +3 -3
- sibi_dst/{df_helper → v1/df_helper}/backends/http/_http_config.py +1 -1
- sibi_dst/{df_helper → v1/df_helper}/backends/parquet/_filter_handler.py +1 -1
- sibi_dst/{df_helper → v1/df_helper}/backends/parquet/_parquet_options.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_io_dask.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_load_from_db.py +2 -2
- sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_sql_model_builder.py +2 -1
- sibi_dst/{df_helper → v1/df_helper}/core/_filter_handler.py +1 -1
- sibi_dst/v1/osmnx_helper/__init__.py +6 -0
- sibi_dst/{tests → v1/tests}/test_data_wrapper_class.py +11 -10
- sibi_dst/{utils → v1/utils}/__init__.py +2 -0
- sibi_dst/{utils → v1/utils}/clickhouse_writer.py +1 -1
- sibi_dst/v1/utils/data_from_http_source.py +49 -0
- sibi_dst/{utils → v1/utils}/data_utils.py +5 -3
- sibi_dst/{utils → v1/utils}/data_wrapper.py +3 -1
- sibi_dst/{utils → v1/utils}/date_utils.py +1 -1
- sibi_dst/{utils → v1/utils}/file_utils.py +1 -1
- sibi_dst/{utils → v1/utils}/filepath_generator.py +1 -1
- sibi_dst/{utils → v1/utils}/parquet_saver.py +1 -1
- sibi_dst/v1/utils/storage_config.py +28 -0
- sibi_dst/v2/df_helper/__init__.py +7 -0
- sibi_dst/v2/df_helper/_df_helper.py +214 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +10 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +82 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +135 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +142 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +297 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +9 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +78 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +122 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +142 -0
- sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +283 -0
- sibi_dst/v2/df_helper/core/__init__.py +9 -0
- sibi_dst/v2/df_helper/core/_filter_handler.py +236 -0
- sibi_dst/v2/df_helper/core/_params_config.py +139 -0
- sibi_dst/v2/df_helper/core/_query_config.py +17 -0
- sibi_dst/v2/utils/__init__.py +5 -0
- sibi_dst/v2/utils/log_utils.py +120 -0
- {sibi_dst-0.3.44.dist-info → sibi_dst-0.3.46.dist-info}/METADATA +3 -2
- sibi_dst-0.3.46.dist-info/RECORD +80 -0
- sibi_dst/osmnx_helper/__init__.py +0 -9
- sibi_dst/osmnx_helper/v2/base_osm_map.py +0 -153
- sibi_dst/osmnx_helper/v2/basemaps/utils.py +0 -0
- sibi_dst-0.3.44.dist-info/RECORD +0 -62
- /sibi_dst/{df_helper/backends → v1}/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/__init__.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/df_helper/backends}/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/_db_connection.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/_io_dask.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/django/_sql_model_builder.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/http/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/parquet/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_db_connection.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_filter_handler.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/__init__.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/_defaults.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/_params_config.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/core/_query_config.py +0 -0
- /sibi_dst/{df_helper → v1/df_helper}/data_cleaner.py +0 -0
- /sibi_dst/{geopy_helper → v1/geopy_helper}/__init__.py +0 -0
- /sibi_dst/{geopy_helper → v1/geopy_helper}/geo_location_service.py +0 -0
- /sibi_dst/{geopy_helper → v1/geopy_helper}/utils.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/base_osm_map.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/__init__.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/calendar_html.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/router_plotter.py +0 -0
- /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/utils.py +0 -0
- /sibi_dst/{osmnx_helper/v2 → v1/tests}/__init__.py +0 -0
- /sibi_dst/{utils → v1/utils}/airflow_manager.py +0 -0
- /sibi_dst/{utils → v1/utils}/credentials.py +0 -0
- /sibi_dst/{utils → v1/utils}/df_utils.py +0 -0
- /sibi_dst/{utils → v1/utils}/log_utils.py +0 -0
- /sibi_dst/{utils → v1/utils}/phone_formatter.py +0 -0
- /sibi_dst/{utils → v1/utils}/storage_manager.py +0 -0
- /sibi_dst/{osmnx_helper/v2/basemaps → v2}/__init__.py +0 -0
- /sibi_dst/{tests → v2/df_helper/backends}/__init__.py +0 -0
- {sibi_dst-0.3.44.dist-info → sibi_dst-0.3.46.dist-info}/WHEEL +0 -0
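Most of this release is a namespace reorganization: the existing df_helper, osmnx_helper, geopy_helper, utils, and tests packages move under sibi_dst.v1, and a new sibi_dst.v2 tree introduces a rewritten df_helper with SQLAlchemy and SQLModel backends. Below is a minimal, illustrative sketch of the import-path impact, assuming the moved modules are not re-exported from their old locations; the expanded top-level sibi_dst/__init__.py (+38 lines) may provide aliases, and the exported class names shown are assumptions to verify against the released wheel.

# Illustrative only: module paths follow the file moves listed above; class
# names and any backward-compatibility aliases are assumptions.

# 0.3.44-style imports:
# from sibi_dst.df_helper import DfHelper
# from sibi_dst.utils import Logger

# 0.3.46, v1 namespace (relocated modules):
from sibi_dst.v1.df_helper import DfHelper   # assumed export name
from sibi_dst.v1.utils import Logger         # assumed export name

# 0.3.46, new v2 backends:
from sibi_dst.v2.df_helper import DfHelper as DfHelperV2   # assumed export name
from sibi_dst.v2.utils import Logger as LoggerV2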
sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py (new file)
@@ -0,0 +1,283 @@
import re
from collections import defaultdict
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple, Type, get_args, get_origin

from sqlalchemy import and_, inspect, cast, func
from sqlalchemy.exc import ArgumentError, NoForeignKeysError
from sqlalchemy.orm import relationship, foreign, configure_mappers, clear_mappers
from sqlalchemy.sql.sqltypes import Integer, String, Float, DateTime, Boolean, Numeric, Text

from sqlmodel import SQLModel, create_engine
from sibi_dst.v2.utils import Logger

APPS_LABEL = "datacubes"
RESERVED_COLUMN_NAMES = {"metadata", "class_", "table"}
RESERVED_KEYWORDS = {"class", "def", "return", "yield", "global"}

MODEL_REGISTRY: Dict[str, Type] = {}


class SQLModelModelBuilder:
    """
    Dynamically builds an ORM model for a single table by reflecting its columns
    and reverse-engineering its relationships from foreign key metadata using SQLModel.
    The generated model is mapped solely via its reflected __table__ attribute.
    """

    def __init__(
        self,
        engine,
        table_name: str,
        add_relationships: bool = False,
        debug: bool = False,
        logger: Optional[Logger] = None,
    ) -> None:
        self.engine = engine
        self.table_name = table_name
        self.add_relationships = add_relationships
        self.debug = debug
        self.logger = logger or Logger.default_logger(logger_name="sqlmodel_model_builder", debug=self.debug)
        # Use SQLModel's shared metadata.
        self.metadata = SQLModel.metadata
        self.metadata.bind = self.engine

        try:
            self.metadata.reflect(only=[table_name], bind=self.engine)
        except Exception as e:
            self.logger.warning(f"Could not reflect table '{table_name}': {e}. Skipping model build.")
            self.table = None
        else:
            self.table = self.metadata.tables.get(table_name)
            if self.table is None:
                self.logger.warning(f"Table '{table_name}' not found in the database. Skipping model build.")
        self.model_name: str = self.normalize_class_name(table_name)
        if self.debug:
            self.logger.debug(f"Reflected table for '{table_name}': {self.table}")

    def build_model(self) -> Optional[Type]:
        try:
            self.metadata.reflect(only=[self.table_name], bind=self.engine)
        except Exception as e:
            self.logger.warning(f"Could not reflect table '{self.table_name}': {e}. Skipping model build.")
            return None

        self.table = self.metadata.tables.get(self.table_name)
        if self.table is None:
            self.logger.warning(f"Table '{self.table_name}' not found in the database. Skipping model build.")
            return None

        # Force registration of the reflected table in the metadata.
        try:
            self.metadata._add_table(self.table_name, None, self.table)
        except Exception as e:
            self.logger.debug(f"Error forcing table registration: {e}")

        columns, annotations = self.get_columns(self.table)
        # Build the mapping dictionary using only __table__.
        attrs: Dict[str, Any] = {
            "__table__": self.table,
            "__module__": f"{APPS_LABEL}.models",
            "__mapper_args__": {"eager_defaults": True},
            "__annotations__": annotations,
        }
        attrs.update(columns)
        if self.add_relationships:
            self._add_relationships(attrs, self.table)
        model = type(self.model_name, (SQLModel,), attrs)
        MODEL_REGISTRY[self.table_name] = model

        try:
            configure_mappers()
            self.logger.debug(f"Configured mappers for model {self.model_name}.")
        except Exception as e:
            self.logger.error(f"Mapper configuration error for model {self.model_name}: {e}")
            raise ValueError(f"Invalid mapping in model {self.model_name}: {e}") from e

        # Register the mapping.
        SQLModel.metadata.create_all(self.engine)
        self.logger.debug(f"Created model {self.model_name} for table {self.table_name}.")
        return model

    def get_columns(self, table: Any) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        cols: Dict[str, Any] = {}
        annotations: Dict[str, Any] = {}
        for column in table.columns:
            norm_name = self.normalize_column_name(column.name)
            if norm_name in RESERVED_COLUMN_NAMES:
                continue
            if norm_name in cols:
                self.logger.warning(f"Duplicate normalized column name '{norm_name}'; skipping duplicate for column '{column.name}'.")
                continue
            cols[norm_name] = column
            annotations[norm_name] = self._python_type_for_column(column)
        return cols, annotations

    def _python_type_for_column(self, column: Any) -> Any:
        col_type = type(column.type)
        if issubclass(col_type, Integer):
            return int
        elif issubclass(col_type, (String, Text)):
            return str
        elif issubclass(col_type, (Float, Numeric)):
            return float
        elif issubclass(col_type, DateTime):
            return datetime
        elif issubclass(col_type, Boolean):
            return bool
        else:
            return Any

    def _add_relationships(self, attrs: Dict[str, Any], table: Any) -> None:
        inspector = inspect(self.engine)
        fk_info_list = inspector.get_foreign_keys(self.table.name)
        fk_groups = defaultdict(list)
        for fk_info in fk_info_list:
            referred_table = fk_info.get("referred_table")
            if referred_table:
                fk_groups[referred_table].append(fk_info)

        for related_table_name, fk_dicts in fk_groups.items():
            try:
                if related_table_name not in MODEL_REGISTRY:
                    self.logger.debug(f"Building missing model for related table {related_table_name}.")
                    remote_model = SQLModelModelBuilder(
                        self.engine,
                        related_table_name,
                        add_relationships=False,
                        debug=self.debug,
                        logger=self.logger,
                    ).build_model()
                    if related_table_name not in MODEL_REGISTRY or remote_model is None:
                        raise ValueError(f"Failed to build model for table {related_table_name}.")
                else:
                    remote_model = MODEL_REGISTRY[related_table_name]
            except Exception as e:
                self.logger.warning(f"Could not build model for table {related_table_name}: {e}")
                continue

            remote_table = remote_model.__table__
            join_conditions = []
            local_foreign_keys = []
            remote_side_keys = []
            for fk_info in fk_dicts:
                local_cols = fk_info.get("constrained_columns", [])
                remote_cols = fk_info.get("referred_columns", [])
                if not local_cols or not remote_cols:
                    self.logger.warning(f"Incomplete FK definition for {related_table_name} in {self.table_name}.")
                    continue
                local_col_name = local_cols[0]
                remote_col_name = remote_cols[0]
                try:
                    local_col = self.table.c[local_col_name]
                except KeyError:
                    self.logger.warning(f"Local column {local_col_name} not found in {self.table_name}.")
                    continue
                try:
                    remote_col = remote_table.columns[remote_col_name]
                except KeyError:
                    self.logger.warning(f"Remote column {remote_col_name} not found in model {remote_model.__name__}.")
                    continue
                if not local_col.foreign_keys:
                    self.logger.warning(f"Column {local_col_name} in {self.table_name} is not defined as a foreign key.")
                    continue
                if remote_col.name not in remote_model.__table__.columns.keys():
                    self.logger.warning(f"Remote column {remote_col.name} not in table for model {remote_model.__name__}.")
                    continue
                join_conditions.append(foreign(local_col) == remote_col)
                local_foreign_keys.append(local_col)
                remote_side_keys.append(remote_col)
            if not join_conditions:
                self.logger.warning(f"No valid join conditions for relationship from {self.table_name} to {related_table_name}.")
                continue
            primaryjoin_expr = join_conditions[0] if len(join_conditions) == 1 else and_(*join_conditions)
            relationship_name = self.normalize_column_name(related_table_name)
            if relationship_name in attrs:
                continue
            try:
                rel = relationship(
                    lambda rt=related_table_name: MODEL_REGISTRY[rt],
                    primaryjoin=primaryjoin_expr,
                    foreign_keys=local_foreign_keys,
                    remote_side=remote_side_keys,
                    lazy="joined",
                    viewonly=True,
                )
                attrs[relationship_name] = rel
                attrs.setdefault("__annotations__", {})[relationship_name] = List[remote_model]
                self.logger.debug(f"Added relationship '{relationship_name}' referencing {related_table_name}.")
            except (ArgumentError, NoForeignKeysError) as e:
                self.logger.error(f"Error creating relationship '{relationship_name}' on model {self.model_name}: {e}")
                continue
            try:
                configure_mappers()
                self.logger.debug(f"Validated relationship '{relationship_name}' on model {self.model_name}.")
            except Exception as e:
                self.logger.error(f"Relationship '{relationship_name}' on model {self.model_name} failed configuration: {e}")
                del attrs[relationship_name]
                self.logger.debug(f"Removed relationship '{relationship_name}' from model {self.model_name}.")
                clear_mappers()
                continue

    @staticmethod
    def normalize_class_name(table_name: str) -> str:
        return "".join(word.capitalize() for word in table_name.split("_"))

    def normalize_column_name(self, column_name: Any) -> str:
        try:
            s = str(column_name)
        except Exception as e:
            self.logger.debug(f"Failed to convert column name {column_name} to string: {e}")
            s = ""
        norm_name = re.sub(r"\W|^(?=\d)", "_", s)
        if norm_name in RESERVED_KEYWORDS:
            norm_name += "_field"
        return norm_name

    @staticmethod
    def export_models_to_file(filename: str) -> None:
        reserved_attrs = {"metadata", "__tablename__", "__sqlmodel_relationships__", "__name__"}
        import re
        import typing

        with open(filename, "w") as f:
            f.write("from sqlmodel import SQLModel, Field, Relationship, Column\n")
            f.write("from sqlalchemy import ForeignKey\n")
            f.write("from sqlalchemy.sql.elements import DefaultClause\n")
            f.write("from sqlalchemy.sql.sqltypes import INTEGER, DATE, VARCHAR, SMALLINT, FLOAT, CHAR, TEXT, DATETIME\n")
            f.write("from sqlalchemy.dialects.mysql import TINYINT\n")
            f.write("from typing import Any, List, Optional, Union\n")
            f.write("import typing\n")
            f.write("import sqlalchemy\n\n\n")

            f.write("class Base(SQLModel):\n")
            f.write("    class Config:\n")
            f.write("        arbitrary_types_allowed = True\n\n\n")

            for table_name, model in MODEL_REGISTRY.items():
                f.write(f"class {model.__name__}(SQLModel, table=True):\n")
                f.write(f"    __tablename__ = '{table_name}'\n")
                for column in model.__table__.columns:
                    col_repr = repr(column)
                    col_repr = re.sub(r", table=<[^>]+>", "", col_repr)
                    col_repr = re.sub(r",\s*server_default=DefaultClause\([^)]*\)", "", col_repr)
                    col_repr = re.sub(r",\s*display_width=\d+", "", col_repr)
                    f.write(f"    {column.name}: Any = Field(sa_column={col_repr})\n")
                annotations = typing.get_type_hints(model)
                col_names = {col.name for col in model.__table__.columns}
                for key, type_hint in annotations.items():
                    if key in col_names or key in reserved_attrs or key.startswith("__"):
                        continue
                    origin = get_origin(type_hint)
                    if origin in (list, List):
                        remote_model = get_args(type_hint)[0]
                        remote_model_name = remote_model.__name__
                    elif origin is Optional:
                        args = get_args(type_hint)
                        non_none = [arg for arg in args if arg is not type(None)]
                        remote_model_name = non_none[0].__name__ if non_none else "Any"
                    else:
                        remote_model_name = type_hint.__name__ if hasattr(type_hint, '__name__') else str(type_hint)
                    f.write(f"    {key}: {type_hint} = Relationship(\"{remote_model_name}\")\n")
                f.write("\n\n")
        print(f"Models exported to {filename}")
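As orientation for the builder above, here is a minimal usage sketch. The connection URL, table name, and output filename are hypothetical, and the module path mirrors the file location in the listing; the package __init__.py may expose a shorter import.

from sqlmodel import create_engine

from sibi_dst.v2.df_helper.backends.sqlmodel._model_builder import SQLModelModelBuilder

engine = create_engine("sqlite:///example.db")   # placeholder connection URL

builder = SQLModelModelBuilder(
    engine,
    table_name="customers",      # hypothetical table name
    add_relationships=True,      # also reverse-engineer FK-based relationships
    debug=True,
)
CustomersModel = builder.build_model()   # reflected SQLModel class, or None if the table is missing

if CustomersModel is not None:
    # Write every model collected in MODEL_REGISTRY out as importable source code.
    SQLModelModelBuilder.export_models_to_file("generated_models.py")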
sibi_dst/v2/df_helper/core/_filter_handler.py (new file)
@@ -0,0 +1,236 @@
import datetime
import itertools
import dask.dataframe as dd
import pandas as pd
from sqlalchemy import func, cast
from sqlalchemy.sql.sqltypes import Date, Time
from sibi_dst.v2.utils import Logger
import typing


class FilterHandler:
    """
    Handles the application of filters to data sources with support for SQLAlchemy, SQLModel, and Dask backends.

    This class abstracts the process of applying filters to various backends, specifically
    SQLAlchemy/SQLModel queries and Dask DataFrames. It supports multiple filtering operations,
    including exact matches, comparisons, and string-related operations such as contains and regex.
    """

    def __init__(self, backend, logger=None, debug=False):
        """
        Initialize the FilterHandler.

        Args:
            backend: The backend to use ('sqlalchemy', 'sqlmodel', or 'dask').
            logger: Optional logger for debugging purposes.
        """
        self.backend = backend
        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
        self.backend_methods = self._get_backend_methods(backend)

    def apply_filters(self, query_or_df, model=None, filters=None):
        """
        Apply filters to the data source based on the backend.

        Args:
            query_or_df: A SQLAlchemy/SQLModel query or Dask DataFrame.
            model: SQLAlchemy/SQLModel model (required for SQLAlchemy/SQLModel backend).
            filters: Dictionary of filters.

        Returns:
            Filtered query or DataFrame.
        """
        filters = filters or {}
        for key, value in filters.items():
            field_name, casting, operation = self._parse_filter_key(key)
            parsed_value = self._parse_filter_value(casting, value)
            # For both SQLAlchemy and SQLModel, use the same backend methods.
            if self.backend in ("sqlalchemy", "sqlmodel"):
                column = self.backend_methods["get_column"](field_name, model, casting)
                condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
                query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
            elif self.backend == "dask":
                column = self.backend_methods["get_column"](query_or_df, field_name, casting)
                condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
                query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
            else:
                raise ValueError(f"Unsupported backend: {self.backend}")

        return query_or_df

    @staticmethod
    def _parse_filter_key(key):
        parts = key.split("__")
        field_name = parts[0]
        casting = None
        operation = "exact"

        if len(parts) == 3:
            _, casting, operation = parts
        elif len(parts) == 2:
            if parts[1] in FilterHandler._comparison_operators():
                operation = parts[1]
            elif parts[1] in FilterHandler._dt_operators() + FilterHandler._date_operators():
                casting = parts[1]

        return field_name, casting, operation

    def _parse_filter_value(self, casting, value):
        """
        Convert filter value to an appropriate type based on the casting (e.g., date).
        """
        if casting == "date":
            if isinstance(value, str):
                return pd.Timestamp(value)  # Convert to datetime64[ns]
            if isinstance(value, list):
                return [pd.Timestamp(v) for v in value]
        elif casting == "time" and isinstance(value, str):
            parsed = datetime.time.fromisoformat(value)
            self.logger.debug(f"Parsed value (time): {parsed}")
            return parsed
        return value

    @staticmethod
    def _get_backend_methods(backend):
        if backend in ("sqlalchemy", "sqlmodel"):
            return {
                "get_column": FilterHandler._get_sqlalchemy_column,
                "apply_operation": FilterHandler._apply_operation_sqlalchemy,
                "apply_condition": lambda query, condition: query.filter(condition),
            }
        elif backend == "dask":
            return {
                "get_column": FilterHandler._get_dask_column,
                "apply_operation": FilterHandler._apply_operation_dask,
                "apply_condition": lambda df, condition: df[condition],
            }
        else:
            raise ValueError(f"Unsupported backend: {backend}")

    @staticmethod
    def _get_sqlalchemy_column(field_name, model, casting):
        """
        Retrieve and cast a column for SQLAlchemy/SQLModel based on the field name and casting.

        Args:
            field_name: The name of the field/column.
            model: The SQLAlchemy/SQLModel model.
            casting: The casting type ('date', 'time', etc.).

        Returns:
            The SQLAlchemy column object, optionally cast or transformed.
        """
        column = getattr(model, field_name, None)
        if not column:
            raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
        if casting == "date":
            column = cast(column, Date)
        elif casting == "time":
            column = cast(column, Time)
        elif casting in FilterHandler._date_operators():
            column = func.extract(casting, column)
        return column

    @staticmethod
    def _get_dask_column(df, field_name, casting):
        """
        Retrieve and optionally cast a column for Dask based on the field name and casting.

        Args:
            df: The Dask DataFrame.
            field_name: The name of the field/column.
            casting: The casting type ('date', 'time', etc.).

        Returns:
            The Dask Series, optionally cast or transformed.
        """
        column = dd.to_datetime(df[field_name], errors="coerce") if casting in FilterHandler._dt_operators() else df[field_name]
        if casting == "date":
            column = column.dt.floor("D")
        elif casting in FilterHandler._date_operators():
            column = getattr(column.dt, casting)
        return column

    @staticmethod
    def _apply_operation_sqlalchemy(column, operation, value):
        operation_map = FilterHandler._operation_map_sqlalchemy()
        if operation not in operation_map:
            raise ValueError(f"Unsupported operation: {operation}")
        return operation_map[operation](column, value)

    @staticmethod
    def _apply_operation_dask(column, operation, value):
        operation_map = FilterHandler._operation_map_dask()
        if operation not in operation_map:
            raise ValueError(f"Unsupported operation: {operation}")
        return operation_map[operation](column, value)

    @staticmethod
    def _operation_map_sqlalchemy():
        return {
            "exact": lambda col, val: col == val,
            "gt": lambda col, val: col > val,
            "gte": lambda col, val: col >= val,
            "lt": lambda col, val: col < val,
            "lte": lambda col, val: col <= val,
            "in": lambda col, val: col.in_(val),
            "range": lambda col, val: col.between(val[0], val[1]),
            "contains": lambda col, val: col.like(f"%{val}%"),
            "startswith": lambda col, val: col.like(f"{val}%"),
            "endswith": lambda col, val: col.like(f"%{val}"),
            "isnull": lambda col, val: col.is_(None) if val else col.isnot(None),
            "not_exact": lambda col, val: col != val,
            "not_contains": lambda col, val: ~col.like(f"%{val}%"),
            "not_in": lambda col, val: ~col.in_(val),
            "regex": lambda col, val: col.op("~")(val),
            "icontains": lambda col, val: col.ilike(f"%{val}%"),
            "istartswith": lambda col, val: col.ilike(f"{val}%"),
            "iendswith": lambda col, val: col.ilike(f"%{val}"),
            "iexact": lambda col, val: col.ilike(val),
            "iregex": lambda col, val: col.op("~*")(val),
        }

    @staticmethod
    def _operation_map_dask():
        return {
            "exact": lambda col, val: col == val,
            "gt": lambda col, val: col > val,
            "gte": lambda col, val: col >= val,
            "lt": lambda col, val: col < val,
            "lte": lambda col, val: col <= val,
            "in": lambda col, val: col.isin(val),
            "range": lambda col, val: (col >= val[0]) & (col <= val[1]),
            "contains": lambda col, val: col.str.contains(val, regex=True),
            "startswith": lambda col, val: col.str.startswith(val),
            "endswith": lambda col, val: col.str.endswith(val),
            "isnull": lambda col, val: col.isnull() if val else col.notnull(),
            "not_exact": lambda col, val: col != val,
            "not_contains": lambda col, val: ~col.str.contains(val, regex=True),
            "not_in": lambda col, val: ~col.isin(val),
            "regex": lambda col, val: col.str.contains(val, regex=True),
            "icontains": lambda col, val: col.str.contains(val, case=False, regex=True),
            "istartswith": lambda col, val: col.str.startswith(val, case=False),
            "iendswith": lambda col, val: col.str.endswith(val, case=False),
            "iexact": lambda col, val: col.str.contains(f"^{val}$", case=False, regex=True),
            "iregex": lambda col, val: col.str.contains(val, case=False, regex=True),
        }

    @staticmethod
    def _dt_operators():
        return ["date", "time"]

    @staticmethod
    def _date_operators():
        return ["year", "month", "day", "hour", "minute", "second", "week_day"]

    @staticmethod
    def _comparison_operators():
        return [
            "gte", "lte", "gt", "lt", "exact", "in", "range",
            "contains", "startswith", "endswith", "isnull",
            "not_exact", "not_contains", "not_in",
            "regex", "icontains", "istartswith", "iendswith",
            "iexact", "iregex"
        ]
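The filter keys accepted by FilterHandler follow a Django-style field__casting__operation grammar. A small hypothetical sketch against a Dask DataFrame follows; the column names and data are invented, and the module path is taken from the file listing above.

import pandas as pd
import dask.dataframe as dd

from sibi_dst.v2.df_helper.core._filter_handler import FilterHandler

# Hypothetical data: an "orders" frame with a status and a created_at timestamp.
pdf = pd.DataFrame({
    "status": ["open", "closed", "open"],
    "created_at": pd.to_datetime(["2024-01-05", "2024-02-10", "2024-03-01"]),
})
ddf = dd.from_pandas(pdf, npartitions=1)

handler = FilterHandler(backend="dask", debug=True)
filtered = handler.apply_filters(
    ddf,
    filters={
        "status__icontains": "open",              # case-insensitive contains
        "created_at__date__gte": "2024-02-01",    # field__casting__operation
    },
)
print(filtered.compute())   # rows matching both filters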
sibi_dst/v2/df_helper/core/_params_config.py (new file)
@@ -0,0 +1,139 @@
from typing import Optional, Dict, Union, List

from pydantic import BaseModel, model_validator, Field

dataframe_params: Dict[str, Union[None, str, bool, int, None]] = {
    "fieldnames": None,
    "index_col": None,
    "coerce_float": False,
    "verbose": True,
    "datetime_index": False,
    "column_names": None,
    "chunk_size": 1000,
}
# dataframe_options is a dictionary that provides additional options for modifying a pandas DataFrame.
# These options include parameters for handling duplicate values, sorting, grouping, and other DataFrame operations.

dataframe_options: Dict[str, Union[bool, str, int, None]] = {
    "debug": False,  # Whether to print debug information
    "duplicate_expr": None,  # Expression for identifying duplicate values
    "duplicate_keep": 'last',  # How to handle duplicate values ('first', 'last', or False)
    "sort_field": None,  # Field to use for sorting the DataFrame
    "group_by_expr": None,  # Expression for grouping the DataFrame
    "group_expr": None  # Expression for aggregating functions to the grouped DataFrame
}

LOOKUP_SEP = "__"


class ParamsConfig(BaseModel):
    """
    Defines a configuration model for parameters with functionality for parsing,
    validation, and conversion of legacy filters.

    This class extends BaseModel from Pydantic and is designed to handle multiple
    sets of configurations, including field mappings, filters, dataframe parameters,
    and dataframe options. It allows for flexible parsing of parameters across a
    variety of supported structures and ensures that legacy filters can be
    appropriately converted for compatibility.

    :ivar field_map: Maps field names to their equivalent legacy field names.
    :type field_map: Optional[Dict]
    :ivar legacy_filters: Indicates whether legacy filters should be processed.
    :type legacy_filters: bool
    :ivar sticky_filters: Stores additional filters as key-value pairs that persist
        across parameter parsing.
    :type sticky_filters: Dict[str, Union[str, bool, int, float, list, tuple]]
    :ivar filters: Holds all the current filters including sticky and dynamically
        parsed filters.
    :type filters: Dict[str, Union[str, Dict, bool, int, float, list, tuple]]
    :ivar df_params: Contains parameters related to dataframe configurations in a
        structured format.
    :type df_params: Dict[str, Union[tuple, str, bool, None]]
    :ivar df_options: Stores optional configurations for a dataframe, allowing for
        additional behavior customization.
    :type df_options: Dict[str, Union[bool, str, None]]
    :ivar params: Dictionary of parameters provided for configuration, supporting
        both basic and nested structures.
    :type params: Dict[str, Union[str, bool, int, float, List[Union[str, int, bool, float]]]]
    """
    field_map: Optional[Dict] = Field(default_factory=dict)
    legacy_filters: bool = False
    sticky_filters: Dict[str, Union[str, bool, int, float, list, tuple]] = Field(default_factory=dict)
    filters: Dict[str, Union[str, Dict, bool, int, float, list, tuple]] = Field(default_factory=dict)
    df_params: Dict[str, Union[tuple, str, bool, None]] = Field(default_factory=dict)
    df_options: Dict[str, Union[bool, str, None]] = Field(default_factory=dict)
    params: Dict[str, Union[str, bool, int, float, List[Union[str, int, bool, float]]]] = Field(default_factory=dict)

    @model_validator(mode='after')
    def check_params(self):
        if self.params is not None:
            self.parse_params(self.params)
        return self

    def parse_params(self, params):
        """
        Parses and separates the given parameters into specific categories such as dataframe parameters,
        dataframe options, and filters. Updates existing class attributes with the parsed values,
        retaining any sticky filters. Also handles the legacy filters if provided.

        :param params: Dictionary containing parameters to process. These parameters can include specific
            keys relevant for dataframe configuration (e.g., dataframe parameters, dataframe options)
            as well as arbitrary filter settings.
        :type params: dict
        :return: None
        """
        self.legacy_filters = params.pop('legacy_filters', self.legacy_filters)
        self.field_map = params.pop('field_map', self.field_map)
        self.sticky_filters = params.pop('params', self.sticky_filters)
        df_params, df_options, filters = {}, {}, {}
        for k, v in params.items():
            if k in dataframe_params.keys():
                df_params.update({k: v})
            elif k in dataframe_options.keys():
                df_options.update({k: v})
            else:
                filters.update({k: v})
        self.filters = {**self.sticky_filters, **filters}
        self.df_params = {**self.df_params, **df_params}
        self.df_options = {**self.df_options, **df_options}
        if self.legacy_filters:
            self.convert_legacy_filters()

    def convert_legacy_filters(self):
        """
        Converts legacy filter fields in the `self.filters` dictionary to their
        modern equivalents using the mappings provided in `self.field_map`.
        This method ensures backward compatibility for filters by automatically
        translating the old field names into the current system.

        The function first verifies that the required dictionaries (`legacy_filters`,
        `field_map`, `filters`) are valid. It creates a reverse map of `field_map` for
        efficient lookup, processes the key names within `self.filters`, and updates
        them to reflect the legacy mapping.

        :raises KeyError: If any required dictionary key is missing during processing.

        :param self.legacy_filters: A boolean flag indicating whether legacy filters
            are being used.
        :type self.legacy_filters: bool

        """
        if not self.legacy_filters or not self.field_map or not self.filters:
            return
        # create a reverse map of the field_map
        reverse_map = {v: k for k, v in self.field_map.items()}

        new_filters = {}
        for filter_field, value in self.filters.items():
            # split the filter_field if LOOKUP_SEP exists
            parts = filter_field.split(LOOKUP_SEP, 1)

            # replace each part with its legacy equivalent if it exists
            new_parts = [reverse_map.get(part, part) for part in parts]

            # join the parts back together and add to the new filters
            new_filter_field = LOOKUP_SEP.join(new_parts)
            new_filters[new_filter_field] = value

        self.filters = new_filters