sibi-dst 0.3.15__tar.gz → 0.3.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/PKG-INFO +2 -1
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/pyproject.toml +2 -1
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/_df_helper.py +36 -20
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/core/__init__.py +6 -4
- sibi_dst-0.3.17/sibi_dst/df_helper/core/_filter_handler.py +216 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/django/_django_load_from_db.py +32 -20
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/django/_io_dask.py +0 -3
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/http/_http_config.py +4 -4
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +9 -9
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +4 -2
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +8 -6
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +1 -1
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py +2 -3
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/utils/__init__.py +2 -2
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/utils/_clickhouse_writer.py +16 -16
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/utils/_data_utils.py +5 -4
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/utils/_data_wrapper.py +8 -5
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/utils/_df_utils.py +5 -5
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/utils/_log_utils.py +3 -0
- sibi_dst-0.3.17/sibi_dst/utils/_parquet_saver.py +106 -0
- sibi_dst-0.3.15/sibi_dst/utils/_parquet_saver.py +0 -211
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/README.md +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/__init__.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/django/__init__.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/django/_django_db_connection.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/django/_io_dask_alt.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/http/__init__.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/parquet/__init__.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/parquet/_parquet_options.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/sql_alchemy/__init__.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/sql_model/__init__.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/utils/_airflow_manager.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/utils/_credentials.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/utils/_date_utils.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/utils/_file_utils.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/utils/_filepath_generator.py +0 -0
- {sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/utils/_storage_manager.py +0 -0
{sibi_dst-0.3.15 → sibi_dst-0.3.17}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.15
+Version: 0.3.17
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -13,6 +13,7 @@ Requires-Dist: chardet (>=5.2.0,<6.0.0)
 Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
+Requires-Dist: dask-expr (>=1.1.20,<2.0.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
 Requires-Dist: django (>=5.1.4,<6.0.0)
 Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
{sibi_dst-0.3.15 → sibi_dst-0.3.17}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.15"
+version = "0.3.17"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -33,6 +33,7 @@ charset-normalizer = "^3.4.0"
 uvicorn = "^0.32.1"
 sqlalchemy = "^2.0.36"
 djangorestframework = "^3.15.2"
+dask-expr = "^1.1.20"
 
 
 [build-system]
{sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/_df_helper.py
RENAMED
@@ -1,13 +1,16 @@
 import asyncio
 import datetime
+import logging
+import warnings
 from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 from pydantic import BaseModel
 
-from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
+from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
 from sibi_dst.utils import Logger
 from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from .plugins.django import *
@@ -18,6 +21,12 @@ from .plugins.sql_alchemy import *
 # Define a generic type variable for BaseModel subclasses
 T = TypeVar("T", bound=BaseModel)
 
+# It is considered acceptable in Django to access protected class members
+warnings.filterwarnings(
+    "ignore",
+    message="Access to a protected member _meta",
+    category=UserWarning,
+)
 
 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
@@ -36,9 +45,12 @@
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
         self.source = source
-        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.debug = kwargs.setdefault("debug", False)
-        self.
+        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
+        # Configure logger level
+        self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
+        # Configure logger level
+        self.logger.debug("Logger initialized in DEBUG mode.")
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
         self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
@@ -47,7 +59,7 @@
         self.post_init(**kwargs)
 
     def post_init(self, **kwargs):
-        self.logger.
+        self.logger.debug(f"Source used: {self.source}")
         self.plugin_query = self.__get_config(QueryConfig, kwargs)
         self.plugin_params = self.__get_config(ParamsConfig, kwargs)
         if self.source == 'django_db':
@@ -93,16 +105,15 @@
             return self._load_from_parquet(**options)
         elif self.source == 'http':
             if asyncio.get_event_loop().is_running():
-                self.logger.
+                self.logger.debug("Running as a task from an event loop")
                 return asyncio.create_task(self._load_from_http(**options))
             else:
-                self.logger.
+                self.logger.debug("Regular asyncio run...")
                 return asyncio.run(self._load_from_http(**options))
 
     def _load_from_sqlalchemy(self, **options):
         try:
             options.setdefault("debug", self.debug)
-            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = SqlAlchemyLoadFromDb(
                 self.plugin_sqlalchemy,
                 self.plugin_query,
@@ -113,9 +124,9 @@
             self.df = db_loader.build_and_load()
             self._process_loaded_data()
             self._post_process_df()
-            self.logger.
+            self.logger.debug("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
-            self.logger.
+            self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
@@ -123,7 +134,6 @@
     def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         try:
             options.setdefault("debug", self.debug)
-            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = DjangoLoadFromDb(
                 self.plugin_django_connection,
                 self.plugin_query,
@@ -134,9 +144,9 @@
             self.df = db_loader.build_and_load()
             self._process_loaded_data()
             self._post_process_df()
-            self.logger.
+            self.logger.debug("Data successfully loaded from django database.")
         except Exception as e:
-            self.logger.
+            self.logger.debug(f"Failed to load data from django database: {e}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
@@ -144,12 +154,12 @@
     async def _load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Delegate asynchronous HTTP data loading to HttpDataSource plugin."""
         if not self.plugin_http:
-            self.logger.
+            self.logger.debug("HTTP plugin not configured properly.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
         try:
             self.df = await self.plugin_http.fetch_data(**options)
         except Exception as e:
-            self.logger.
+            self.logger.debug(f"Failed to load data from http plugin: {e}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         return self.df
 
@@ -190,10 +200,10 @@
         if datetime_index and self.df.index.dtype != 'datetime64[ns]':
             self.df = self.df.map_partitions(lambda df: df.set_index(pd.to_datetime(df.index, errors='coerce')))
 
-        self.logger.
+        self.logger.debug("Post-processing of DataFrame completed.")
 
     def _process_loaded_data(self):
-        self.logger.
+        self.logger.debug(f"Type of self.df: {type(self.df)}")
         if self.df.map_partitions(len).compute().sum() > 0:
             field_map = self.plugin_params.field_map or {}
             if isinstance(field_map, dict):
@@ -211,25 +221,30 @@
             # Apply renaming
             self.df = self.df.map_partitions(rename_columns, mapping=rename_mapping)
 
-        self.logger.
+        self.logger.debug("Processing of loaded data completed.")
 
     def save_to_parquet(self, parquet_filename: Optional[str] = None):
         ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
         ps.save_to_parquet(parquet_filename)
-        self.logger.
+        self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
 
     def save_to_clickhouse(self, **credentials):
         if self.df.map_partitions(len).compute().sum() == 0:
-            self.logger.
+            self.logger.debug("Cannot write to clickhouse since Dataframe is empty")
             return
         cs = ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
-        self.logger.
+        self.logger.debug("Save to ClickHouse completed.")
 
     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.plugin_parquet.load_files()
         if options:
+            """
+            deprecated specific filter handling to a generic one
             self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
+
+            """
+            self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
         return self.df
 
     def load_period(self, **kwargs):
@@ -294,6 +309,7 @@
         elif is_datetime_field:
             kwargs[f"{mapped_field}__date__gte"] = start
             kwargs[f"{mapped_field}__date__lte"] = end
+        self.logger.debug(f"load_period kwargs: {kwargs}")
         return self.load(**kwargs)
 
     @staticmethod
{sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/core/__init__.py
RENAMED
@@ -1,12 +1,13 @@
 from __future__ import annotations
 
-from ._params_config import ParamsConfig
-from ._query_config import QueryConfig
 from ._defaults import (
     django_field_conversion_map_pandas,
     django_field_conversion_map_dask,
     sqlalchemy_field_conversion_map_dask,
     normalize_sqlalchemy_type)
+from ._filter_handler import FilterHandler
+from ._params_config import ParamsConfig
+from ._query_config import QueryConfig
 
 __all__ = [
     "ParamsConfig",
@@ -14,5 +15,6 @@ __all__ = [
     "django_field_conversion_map_pandas",
     "django_field_conversion_map_dask",
     "sqlalchemy_field_conversion_map_dask",
-    "normalize_sqlalchemy_type"
-]
+    "normalize_sqlalchemy_type",
+    "FilterHandler",
+]
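With this change the new handler is re-exported from the core package, so downstream code can import it alongside the existing config classes. A minimal sketch of the import, assuming nothing beyond what the diff above shows:

    from sibi_dst.df_helper.core import FilterHandler, ParamsConfig, QueryConfig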
sibi_dst-0.3.17/sibi_dst/df_helper/core/_filter_handler.py
@@ -0,0 +1,216 @@
+import datetime
+import dask.dataframe as dd
+import pandas as pd
+from sqlalchemy import func, cast
+from sqlalchemy.sql.sqltypes import Date, Time
+from sibi_dst.utils import Logger
+
+class FilterHandler:
+    def __init__(self, backend, logger=None):
+        """
+        Initialize the FilterHandler.
+
+        Args:
+            backend: The backend to use ('sqlalchemy' or 'dask').
+            logger: Optional logger for debugging purposes.
+        """
+        self.backend = backend
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)  # No-op logger if none provided
+        self.backend_methods = self._get_backend_methods(backend)
+
+    def apply_filters(self, query_or_df, model=None, filters=None):
+        """
+        Apply filters to the data source based on the backend.
+
+        Args:
+            query_or_df: SQLAlchemy query or Dask DataFrame.
+            model: SQLAlchemy model (required for SQLAlchemy backend).
+            filters: Dictionary of filters.
+
+        Returns:
+            Filtered query or DataFrame.
+        """
+        filters = filters or {}
+        for key, value in filters.items():
+            field_name, casting, operation = self._parse_filter_key(key)
+            parsed_value = self._parse_filter_value(casting, value)
+            #print(field_name, casting, operation, parsed_value)
+            # Get the column and apply backend-specific transformations
+            if self.backend == "sqlalchemy":
+                column = self.backend_methods["get_column"](field_name, model, casting)
+                condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
+                query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
+
+            elif self.backend == "dask":
+                column = self.backend_methods["get_column"](query_or_df, field_name, casting)
+                condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
+                query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
+            else:
+                raise ValueError(f"Unsupported backend: {self.backend}")
+
+        return query_or_df
+
+    @staticmethod
+    def _parse_filter_key(key):
+        parts = key.split("__")
+        field_name = parts[0]
+        casting = None
+        operation = "exact"
+
+        if len(parts) == 3:
+            _, casting, operation = parts
+        elif len(parts) == 2:
+            if parts[1] in FilterHandler._comparison_operators():
+                operation = parts[1]
+            elif parts[1] in FilterHandler._dt_operators() + FilterHandler._date_operators():
+                casting = parts[1]
+
+        return field_name, casting, operation
+
+
+    def _parse_filter_value(self, casting, value):
+        """
+        Convert filter value to appropriate type based on the casting (e.g., date).
+        """
+        if casting == "date":
+            if isinstance(value, str):
+                parsed = pd.Timestamp(value)  # Convert to datetime64[ns]
+                return parsed
+            if isinstance(value, list):
+                parsed = [pd.Timestamp(v) for v in value]
+                return parsed
+        elif casting == "time" and isinstance(value, str):
+            parsed = datetime.time.fromisoformat(value)
+            self.logger.debug(f"Parsed value (time): {parsed}")
+            return parsed
+        return value
+
+    @staticmethod
+    def _get_backend_methods(backend):
+        if backend == "sqlalchemy":
+            return {
+                "get_column": FilterHandler._get_sqlalchemy_column,
+                "apply_operation": FilterHandler._apply_operation_sqlalchemy,
+                "apply_condition": lambda query, condition: query.filter(condition),
+            }
+        elif backend == "dask":
+            return {
+                "get_column": FilterHandler._get_dask_column,
+                "apply_operation": FilterHandler._apply_operation_dask,
+                "apply_condition": lambda df, condition: df[condition],
+            }
+        else:
+            raise ValueError(f"Unsupported backend: {backend}")
+
+    @staticmethod
+    def _get_sqlalchemy_column(field_name, model, casting):
+        """
+        Retrieve and cast a column for SQLAlchemy based on the field name and casting.
+
+        Args:
+            field_name: The name of the field/column in the model.
+            model: The SQLAlchemy model.
+            casting: The casting type ('date', 'time', etc.).
+
+        Returns:
+            The SQLAlchemy column object, optionally cast or transformed.
+        """
+        column = getattr(model, field_name, None)
+        if not column:
+            raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
+
+        if casting == "date":
+            # Cast the column to Date for whole-date comparisons
+            column = cast(column, Date)
+        elif casting == "time":
+            # Cast the column to Time for time-specific comparisons
+            column = cast(column, Time)
+        elif casting in FilterHandler._date_operators():
+            # Extract date part (e.g., year, month) using SQLAlchemy functions
+            column = func.extract(casting, column)
+
+        return column
+
+    @staticmethod
+    def _get_dask_column(df, field_name, casting):
+        """
+        Retrieve and optionally cast a column for Dask based on the field name and casting.
+
+        Args:
+            df: The Dask DataFrame.
+            field_name: The name of the field/column in the DataFrame.
+            casting: The casting type ('date', 'time', etc.).
+
+        Returns:
+            The Dask Series object, optionally cast or transformed.
+        """
+        column = dd.to_datetime(df[field_name], errors="coerce") if casting in FilterHandler._dt_operators() else df[
+            field_name]
+
+        if casting == "date":
+            column = column.dt.floor("D")  # Ensure truncation to the date level
+        elif casting in FilterHandler._date_operators():
+            column = getattr(column.dt, casting)
+
+        return column
+
+    @staticmethod
+    def _apply_operation_sqlalchemy(column, operation, value):
+        operation_map = FilterHandler._operation_map_sqlalchemy()
+        if operation not in operation_map:
+            raise ValueError(f"Unsupported operation: {operation}")
+        return operation_map[operation](column, value)
+
+    @staticmethod
+    def _apply_operation_dask(column, operation, value):
+        operation_map = FilterHandler._operation_map_dask()
+        if operation not in operation_map:
+            raise ValueError(f"Unsupported operation: {operation}")
+        return operation_map[operation](column, value)
+
+    @staticmethod
+    def _operation_map_sqlalchemy():
+        return {
+            "exact": lambda col, val: col == val,
+            "gt": lambda col, val: col > val,
+            "gte": lambda col, val: col >= val,
+            "lt": lambda col, val: col < val,
+            "lte": lambda col, val: col <= val,
+            "in": lambda col, val: col.in_(val),
+            "range": lambda col, val: col.between(val[0], val[1]),
+            "contains": lambda col, val: col.like(f"%{val}%"),
+            "startswith": lambda col, val: col.like(f"{val}%"),
+            "endswith": lambda col, val: col.like(f"%{val}"),
+            "isnull": lambda col, val: col.is_(None) if val else col.isnot(None),
+        }
+
+    @staticmethod
+    def _operation_map_dask():
+        return {
+            "exact": lambda col, val: col == val,
+            "gt": lambda col, val: col > val,
+            "gte": lambda col, val: col >= val,
+            "lt": lambda col, val: col < val,
+            "lte": lambda col, val: col <= val,
+            "in": lambda col, val: col.isin(val),
+            "range": lambda col, val: (col >= val[0]) & (col <= val[1]),
+            "contains": lambda col, val: col.str.contains(val, regex=True),
+            "startswith": lambda col, val: col.str.startswith(val),
+            "endswith": lambda col, val: col.str.endswith(val),
+            "isnull": lambda col, val: col.isnull() if val else col.notnull(),
+        }
+
+    @staticmethod
+    def _dt_operators():
+        return ["date", "time"]
+
+    @staticmethod
+    def _date_operators():
+        return ["year", "month", "day", "hour", "minute", "second", "week_day"]
+
+    @staticmethod
+    def _comparison_operators():
+        return [
+            "gte", "lte", "gt", "lt", "exact", "in", "range",
+            "contains", "startswith", "endswith", "isnull",
+        ]
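The new FilterHandler accepts Django-style lookup keys, either <field>, <field>__<operation>, or <field>__<casting>__<operation>, and resolves them against a SQLAlchemy query or a Dask DataFrame. A minimal sketch of the Dask path follows; the frame, column names, and values are illustrative only, while the filter-key grammar comes from the module above:

    import dask.dataframe as dd
    import pandas as pd

    from sibi_dst.df_helper.core import FilterHandler

    # Hypothetical data built in-memory purely to exercise the filter syntax.
    pdf = pd.DataFrame({
        "status": ["open", "closed", "open"],
        "created_at": pd.to_datetime(["2024-01-05", "2024-02-10", "2024-03-15"]),
    })
    ddf = dd.from_pandas(pdf, npartitions=1)

    handler = FilterHandler(backend="dask")
    filtered = handler.apply_filters(
        ddf,
        filters={
            "status__exact": "open",                # <field>__<operation>
            "created_at__date__gte": "2024-01-01",  # string is parsed into a pd.Timestamp
            "created_at__year": 2024,               # casting only; operation defaults to 'exact'
        },
    )
    print(filtered.compute())

Each key is split on "__" by _parse_filter_key; an operation missing from the backend's operation map raises ValueError, so typos in lookup names fail loudly rather than silently returning unfiltered data.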
{sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/django/_django_load_from_db.py
RENAMED
@@ -1,5 +1,8 @@
+import warnings
+
 import dask.dataframe as dd
 import pandas as pd
+from IPython.core.hooks import deprecated
 from django.db.models import Q
 
 from sibi_dst.df_helper.plugins.django import ReadFrameDask
@@ -12,13 +15,11 @@ class DjangoLoadFromDb:
     def __init__(self, db_connection, db_query, db_params, logger, **kwargs):
         self.connection_config = db_connection
         self.debug = kwargs.pop('debug', False)
-        self.verbose_debug = kwargs.pop('verbose_debug', False)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         if self.connection_config.model is None:
             if self.debug:
-                self.logger.
-
-                print('Model must be specified')
+                self.logger.debug('Model must be specified')
+
             raise ValueError('Model must be specified')
 
         self.query_config = db_query
@@ -45,7 +46,7 @@
             try:
                 self.df = ReadFrameDask(queryset, **self.params_config.df_params).read_frame()
             except Exception as e:
-                self.logger.
+                self.logger.debug(f'Error loading query: {str(queryset.query)}, error message: {e}')
                 self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         else:
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
@@ -69,16 +70,28 @@
        :param df: Dask DataFrame whose columns' data types are to be converted.
        :return: Dask DataFrame with converted column data types.
        """
-
-
-
-
-
-
-
-
+        """
+        [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+
+        :param df: Dask DataFrame whose columns' data types are to be converted.
+        :return: Dask DataFrame with converted column data types.
+        """
+        # Emit deprecation warning
+        warnings.warn(
+            "_convert_columns is deprecated and will be removed in a future release. "
+            "Consider using <new_method_name> instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        # Log deprecation message if debug mode is enabled
         if self.debug:
-            self.logger.
+            self.logger.warning(
+                "[DEPRECATION NOTICE] The `_convert_columns` method is deprecated and will be removed in a future release. "
+                "Consider using <new_method_name> instead."
+            )
+
+        self.logger.debug(f'Converting columns: {list(df.columns)}')
 
         # Get field information from the Django model
         model_fields = self.connection_config.model._meta.get_fields()
@@ -87,13 +100,13 @@
         for field_name, field_type in field_type_map.items():
             if field_name not in df.columns:
 
-
+                self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
                 continue
 
             conversion_func = django_field_conversion_map_dask.get(field_type)
             if not conversion_func:
                 message=f"Field type '{field_type}' not found in conversion_map."
-
+                self.logger.debug(message)
                 continue
 
             def apply_conversion(partition):
@@ -104,7 +117,7 @@
                     if field_name in partition.columns:
                         partition[field_name] = conversion_func(partition[field_name])
                 except Exception as e:
                    self.logger.
+                    self.logger.debug(f"Error converting column '{field_name}' in partition: {str(e)}")
                 return partition
 
             try:
@@ -113,9 +126,8 @@
                    apply_conversion,
                    meta=df,
                )
-
-                is_verbose=True)
+                self.logger.debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.")
            except Exception as e:
-
+                self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")
 
        return df
{sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/http/_http_config.py
RENAMED
@@ -31,17 +31,17 @@ class HttpConfig(BaseModel):
             # Set up headers with API key if provided
             headers = {"Authorization": f"Bearer {self.api_key.get_secret_value()}"} if self.api_key else {}
 
-            self.logger.
+            self.logger.debug(f"Fetching data from {formatted_url} with params {self.params}")
             async with httpx.AsyncClient() as client:
                 response = await client.get(formatted_url, params=self.params, headers=headers, timeout=self.timeout)
                 response.raise_for_status()
                 data = response.json()
                 df = dd.from_pandas(pd.json_normalize(data), npartitions=1)
-                self.logger.
+                self.logger.debug("Data successfully loaded from HTTP JSON source.")
                 return df
         except httpx.RequestError as e:
-            self.logger.
+            self.logger.debug(f"HTTP request error: {e}")
             raise
         except ValueError as e:
-            self.logger.
+            self.logger.debug(f"Error parsing JSON data: {e}")
             raise
{sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py
RENAMED
@@ -8,7 +8,7 @@ from sqlalchemy.orm import sessionmaker
 
 from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
 from sibi_dst.utils import Logger
-
+from sibi_dst.df_helper.core import FilterHandler
 
 class SQLAlchemyDask:
     def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
@@ -38,7 +38,6 @@
         """
         mapper = inspect(model)
         sqlalchemy_to_dask_dtype = {
-            #'INTEGER': pd.to_numeric(x, errors="coerce"),
             'INTEGER': 'Int64',
             'SMALLINT': 'Int64',
             'BIGINT': 'Int64',
@@ -72,11 +71,15 @@
             # Build query
             self.query = select(self.model)
             if self.filters:
-
+                """
+                deprecated specific filter handling to a generic one
+                #self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+                """
+                self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query, model=self.model, filters=self.filters)
             else:
                 n_records = 100
                 self.query = self.query.limit(n_records)
-
+            self.logger.debug(f"query:{self.query}")
             # Infer dtypes
             dtypes = self.infer_dtypes_from_model(self.model)
             # Get the column order from the SQLAlchemy model
@@ -124,14 +127,11 @@
             else:
                 dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
 
-
-            self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
-
-            if isinstance(dask_df, dask_expr._collection.DataFrame):
-                dask_df = dask_df.to_legacy_dataframe()
+            self.logger.debug(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
 
             return dask_df
 
         except Exception as e:
             self.logger.error(f"Error executing query: {str(e)}")
+            self.logger.error(self.query)
             return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
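On the SQLAlchemy side the same handler filters a select() statement, which is what the hunk above now does inside SQLAlchemyDask. A sketch under stated assumptions: Order is a stand-in declarative model invented for this illustration and is not part of sibi-dst.

    import datetime

    from sqlalchemy import select
    from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

    from sibi_dst.df_helper.core import FilterHandler

    class Base(DeclarativeBase):
        pass

    class Order(Base):  # hypothetical model, for illustration only
        __tablename__ = "orders"
        id: Mapped[int] = mapped_column(primary_key=True)
        created_at: Mapped[datetime.datetime] = mapped_column()

    query = select(Order)
    query = FilterHandler(backend="sqlalchemy").apply_filters(
        query,
        model=Order,
        filters={"created_at__date__range": ["2024-01-01", "2024-01-31"]},
    )
    # The 'date' casting wraps the column in CAST(... AS DATE) and 'range' compiles to BETWEEN.
    print(query)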
{sibi_dst-0.3.15 → sibi_dst-0.3.17}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py
RENAMED
@@ -1,7 +1,9 @@
-from sqlalchemy import and_, or_, not_, func, cast
-from sqlalchemy.sql.sqltypes import Date, Time
 import datetime
 
+from sqlalchemy import func, cast
+from sqlalchemy.sql.sqltypes import Date, Time
+
+
 class SqlAlchemyFilterHandler:
     @staticmethod
     def apply_filters_sqlalchemy(query, model, filters):
|