sibi-dst 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +42 -30
- sibi_dst/df_helper/core/__init__.py +6 -4
- sibi_dst/df_helper/core/_filter_handler.py +216 -0
- sibi_dst/df_helper/plugins/django/_django_load_from_db.py +32 -20
- sibi_dst/df_helper/plugins/django/_io_dask.py +0 -3
- sibi_dst/df_helper/plugins/http/_http_config.py +4 -4
- sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +9 -9
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +4 -2
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +8 -6
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +5 -2
- sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py +2 -3
- sibi_dst/utils/_clickhouse_writer.py +16 -16
- sibi_dst/utils/_data_utils.py +40 -81
- sibi_dst/utils/_data_wrapper.py +8 -4
- sibi_dst/utils/_df_utils.py +5 -5
- sibi_dst/utils/_log_utils.py +3 -0
- sibi_dst/utils/_parquet_saver.py +3 -108
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.16.dist-info}/METADATA +2 -1
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.16.dist-info}/RECORD +20 -19
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.16.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -1,5 +1,7 @@
 import asyncio
 import datetime
+import logging
+import warnings
 from typing import Any, Dict, TypeVar
 from typing import Union, Optional

@@ -8,9 +10,9 @@ import dask_expr
 import pandas as pd
 from pydantic import BaseModel

-from sibi_dst.
-from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
+from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
 from sibi_dst.utils import Logger
+from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from .plugins.django import *
 from .plugins.http import HttpConfig
 from .plugins.parquet import ParquetConfig, ParquetFilterHandler
@@ -19,6 +21,13 @@ from .plugins.sql_alchemy import *
 # Define a generic type variable for BaseModel subclasses
 T = TypeVar("T", bound=BaseModel)

+# It is considered acceptable in Django to access protected class members
+warnings.filterwarnings(
+    "ignore",
+    message="Access to a protected member _meta",
+    category=UserWarning,
+)
+
 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
     plugin_django_connection: Optional[DjangoConnectionConfig] = None
@@ -36,19 +45,21 @@ class DfHelper:
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
         self.source = source
-        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.debug = kwargs.setdefault("debug", False)
-        self.
+        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
+        # Configure logger level
+        self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
+        # Configure logger level
+        self.logger.debug("Logger initialized in DEBUG mode.")
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
-        self.dt_field=kwargs.setdefault("dt_field", None)
+        self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)

-
     def post_init(self, **kwargs):
-        self.logger.
+        self.logger.debug(f"Source used: {self.source}")
         self.plugin_query = self.__get_config(QueryConfig, kwargs)
         self.plugin_params = self.__get_config(ParamsConfig, kwargs)
         if self.source == 'django_db':
@@ -59,7 +70,7 @@ class DfHelper:
         elif self.source == 'http':
             self.plugin_http = HttpConfig(**kwargs)
         elif self.source == 'sqlalchemy':
-            self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig,kwargs)
+            self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)

     @staticmethod
     def __get_config(model: [T], kwargs: Dict[str, Any]) -> Union[T]:
@@ -94,17 +105,15 @@ class DfHelper:
             return self._load_from_parquet(**options)
         elif self.source == 'http':
             if asyncio.get_event_loop().is_running():
-                self.logger.
+                self.logger.debug("Running as a task from an event loop")
                 return asyncio.create_task(self._load_from_http(**options))
             else:
-                self.logger.
+                self.logger.debug("Regular asyncio run...")
                 return asyncio.run(self._load_from_http(**options))

-
     def _load_from_sqlalchemy(self, **options):
         try:
             options.setdefault("debug", self.debug)
-            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = SqlAlchemyLoadFromDb(
                 self.plugin_sqlalchemy,
                 self.plugin_query,
@@ -115,9 +124,9 @@ class DfHelper:
             self.df = db_loader.build_and_load()
             self._process_loaded_data()
             self._post_process_df()
-            self.logger.
+            self.logger.debug("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
-            self.logger.
+            self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)

         return self.df
@@ -125,7 +134,6 @@ class DfHelper:
     def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         try:
             options.setdefault("debug", self.debug)
-            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = DjangoLoadFromDb(
                 self.plugin_django_connection,
                 self.plugin_query,
@@ -136,26 +144,25 @@ class DfHelper:
             self.df = db_loader.build_and_load()
             self._process_loaded_data()
             self._post_process_df()
-            self.logger.
+            self.logger.debug("Data successfully loaded from django database.")
         except Exception as e:
-            self.logger.
-            self.df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            self.logger.debug(f"Failed to load data from django database: {e}")
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)

         return self.df

     async def _load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Delegate asynchronous HTTP data loading to HttpDataSource plugin."""
         if not self.plugin_http:
-            self.logger.
+            self.logger.debug("HTTP plugin not configured properly.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
         try:
             self.df = await self.plugin_http.fetch_data(**options)
         except Exception as e:
-            self.logger.
-            self.df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            self.logger.debug(f"Failed to load data from http plugin: {e}")
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         return self.df

-
     def _post_process_df(self):
         """
         Efficiently process the DataFrame by filtering, renaming, and setting indices.
@@ -193,10 +200,10 @@ class DfHelper:
         if datetime_index and self.df.index.dtype != 'datetime64[ns]':
             self.df = self.df.map_partitions(lambda df: df.set_index(pd.to_datetime(df.index, errors='coerce')))

-        self.logger.
+        self.logger.debug("Post-processing of DataFrame completed.")

     def _process_loaded_data(self):
-        self.logger.
+        self.logger.debug(f"Type of self.df: {type(self.df)}")
         if self.df.map_partitions(len).compute().sum() > 0:
             field_map = self.plugin_params.field_map or {}
             if isinstance(field_map, dict):
@@ -214,25 +221,30 @@ class DfHelper:
             # Apply renaming
             self.df = self.df.map_partitions(rename_columns, mapping=rename_mapping)

-        self.logger.
+        self.logger.debug("Processing of loaded data completed.")

     def save_to_parquet(self, parquet_filename: Optional[str] = None):
         ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
         ps.save_to_parquet(parquet_filename)
-        self.logger.
+        self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")

     def save_to_clickhouse(self, **credentials):
         if self.df.map_partitions(len).compute().sum() == 0:
-            self.logger.
+            self.logger.debug("Cannot write to clickhouse since Dataframe is empty")
             return
-        cs=ClickHouseWriter(logger=self.logger, **credentials)
+        cs = ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
-        self.logger.
+        self.logger.debug("Save to ClickHouse completed.")

     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.plugin_parquet.load_files()
         if options:
+            """
+            deprecated specific filter handling to a generic one
             self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
+
+            """
+            self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
         return self.df

     def load_period(self, **kwargs):
@@ -297,9 +309,9 @@ class DfHelper:
         elif is_datetime_field:
             kwargs[f"{mapped_field}__date__gte"] = start
             kwargs[f"{mapped_field}__date__lte"] = end
+        self.logger.debug(f"load_period kwargs: {kwargs}")
         return self.load(**kwargs)

     @staticmethod
     def parse_date(date_str: str) -> Union[datetime.datetime, datetime.date]:
         try:
sibi_dst/df_helper/core/__init__.py
CHANGED
@@ -1,12 +1,13 @@
 from __future__ import annotations

-from ._params_config import ParamsConfig
-from ._query_config import QueryConfig
 from ._defaults import (
     django_field_conversion_map_pandas,
     django_field_conversion_map_dask,
     sqlalchemy_field_conversion_map_dask,
     normalize_sqlalchemy_type)
+from ._filter_handler import FilterHandler
+from ._params_config import ParamsConfig
+from ._query_config import QueryConfig

 __all__ = [
     "ParamsConfig",
@@ -14,5 +15,6 @@ __all__ = [
     "django_field_conversion_map_pandas",
     "django_field_conversion_map_dask",
     "sqlalchemy_field_conversion_map_dask",
-    "normalize_sqlalchemy_type"
-
+    "normalize_sqlalchemy_type",
+    "FilterHandler",
+]
sibi_dst/df_helper/core/_filter_handler.py
ADDED
@@ -0,0 +1,216 @@
+import datetime
+import dask.dataframe as dd
+import pandas as pd
+from sqlalchemy import func, cast
+from sqlalchemy.sql.sqltypes import Date, Time
+from sibi_dst.utils import Logger
+
+class FilterHandler:
+    def __init__(self, backend, logger=None):
+        """
+        Initialize the FilterHandler.
+
+        Args:
+            backend: The backend to use ('sqlalchemy' or 'dask').
+            logger: Optional logger for debugging purposes.
+        """
+        self.backend = backend
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)  # No-op logger if none provided
+        self.backend_methods = self._get_backend_methods(backend)
+
+    def apply_filters(self, query_or_df, model=None, filters=None):
+        """
+        Apply filters to the data source based on the backend.
+
+        Args:
+            query_or_df: SQLAlchemy query or Dask DataFrame.
+            model: SQLAlchemy model (required for SQLAlchemy backend).
+            filters: Dictionary of filters.
+
+        Returns:
+            Filtered query or DataFrame.
+        """
+        filters = filters or {}
+        for key, value in filters.items():
+            field_name, casting, operation = self._parse_filter_key(key)
+            parsed_value = self._parse_filter_value(casting, value)
+            #print(field_name, casting, operation, parsed_value)
+            # Get the column and apply backend-specific transformations
+            if self.backend == "sqlalchemy":
+                column = self.backend_methods["get_column"](field_name, model, casting)
+                condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
+                query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
+
+            elif self.backend == "dask":
+                column = self.backend_methods["get_column"](query_or_df, field_name, casting)
+                condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
+                query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
+            else:
+                raise ValueError(f"Unsupported backend: {self.backend}")
+
+        return query_or_df
+
+    @staticmethod
+    def _parse_filter_key(key):
+        parts = key.split("__")
+        field_name = parts[0]
+        casting = None
+        operation = "exact"
+
+        if len(parts) == 3:
+            _, casting, operation = parts
+        elif len(parts) == 2:
+            if parts[1] in FilterHandler._comparison_operators():
+                operation = parts[1]
+            elif parts[1] in FilterHandler._dt_operators() + FilterHandler._date_operators():
+                casting = parts[1]
+
+        return field_name, casting, operation
+
+
+    def _parse_filter_value(self, casting, value):
+        """
+        Convert filter value to appropriate type based on the casting (e.g., date).
+        """
+        if casting == "date":
+            if isinstance(value, str):
+                parsed = pd.Timestamp(value)  # Convert to datetime64[ns]
+                return parsed
+            if isinstance(value, list):
+                parsed = [pd.Timestamp(v) for v in value]
+                return parsed
+        elif casting == "time" and isinstance(value, str):
+            parsed = datetime.time.fromisoformat(value)
+            self.logger.debug(f"Parsed value (time): {parsed}")
+            return parsed
+        return value
+
+    @staticmethod
+    def _get_backend_methods(backend):
+        if backend == "sqlalchemy":
+            return {
+                "get_column": FilterHandler._get_sqlalchemy_column,
+                "apply_operation": FilterHandler._apply_operation_sqlalchemy,
+                "apply_condition": lambda query, condition: query.filter(condition),
+            }
+        elif backend == "dask":
+            return {
+                "get_column": FilterHandler._get_dask_column,
+                "apply_operation": FilterHandler._apply_operation_dask,
+                "apply_condition": lambda df, condition: df[condition],
+            }
+        else:
+            raise ValueError(f"Unsupported backend: {backend}")
+
+    @staticmethod
+    def _get_sqlalchemy_column(field_name, model, casting):
+        """
+        Retrieve and cast a column for SQLAlchemy based on the field name and casting.
+
+        Args:
+            field_name: The name of the field/column in the model.
+            model: The SQLAlchemy model.
+            casting: The casting type ('date', 'time', etc.).
+
+        Returns:
+            The SQLAlchemy column object, optionally cast or transformed.
+        """
+        column = getattr(model, field_name, None)
+        if not column:
+            raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
+
+        if casting == "date":
+            # Cast the column to Date for whole-date comparisons
+            column = cast(column, Date)
+        elif casting == "time":
+            # Cast the column to Time for time-specific comparisons
+            column = cast(column, Time)
+        elif casting in FilterHandler._date_operators():
+            # Extract date part (e.g., year, month) using SQLAlchemy functions
+            column = func.extract(casting, column)
+
+        return column
+
+    @staticmethod
+    def _get_dask_column(df, field_name, casting):
+        """
+        Retrieve and optionally cast a column for Dask based on the field name and casting.
+
+        Args:
+            df: The Dask DataFrame.
+            field_name: The name of the field/column in the DataFrame.
+            casting: The casting type ('date', 'time', etc.).
+
+        Returns:
+            The Dask Series object, optionally cast or transformed.
+        """
+        column = dd.to_datetime(df[field_name], errors="coerce") if casting in FilterHandler._dt_operators() else df[
+            field_name]
+
+        if casting == "date":
+            column = column.dt.floor("D")  # Ensure truncation to the date level
+        elif casting in FilterHandler._date_operators():
+            column = getattr(column.dt, casting)
+
+        return column
+
+    @staticmethod
+    def _apply_operation_sqlalchemy(column, operation, value):
+        operation_map = FilterHandler._operation_map_sqlalchemy()
+        if operation not in operation_map:
+            raise ValueError(f"Unsupported operation: {operation}")
+        return operation_map[operation](column, value)
+
+    @staticmethod
+    def _apply_operation_dask(column, operation, value):
+        operation_map = FilterHandler._operation_map_dask()
+        if operation not in operation_map:
+            raise ValueError(f"Unsupported operation: {operation}")
+        return operation_map[operation](column, value)
+
+    @staticmethod
+    def _operation_map_sqlalchemy():
+        return {
+            "exact": lambda col, val: col == val,
+            "gt": lambda col, val: col > val,
+            "gte": lambda col, val: col >= val,
+            "lt": lambda col, val: col < val,
+            "lte": lambda col, val: col <= val,
+            "in": lambda col, val: col.in_(val),
+            "range": lambda col, val: col.between(val[0], val[1]),
+            "contains": lambda col, val: col.like(f"%{val}%"),
+            "startswith": lambda col, val: col.like(f"{val}%"),
+            "endswith": lambda col, val: col.like(f"%{val}"),
+            "isnull": lambda col, val: col.is_(None) if val else col.isnot(None),
+        }
+
+    @staticmethod
+    def _operation_map_dask():
+        return {
+            "exact": lambda col, val: col == val,
+            "gt": lambda col, val: col > val,
+            "gte": lambda col, val: col >= val,
+            "lt": lambda col, val: col < val,
+            "lte": lambda col, val: col <= val,
+            "in": lambda col, val: col.isin(val),
+            "range": lambda col, val: (col >= val[0]) & (col <= val[1]),
+            "contains": lambda col, val: col.str.contains(val, regex=True),
+            "startswith": lambda col, val: col.str.startswith(val),
+            "endswith": lambda col, val: col.str.endswith(val),
+            "isnull": lambda col, val: col.isnull() if val else col.notnull(),
+        }
+
+    @staticmethod
+    def _dt_operators():
+        return ["date", "time"]
+
+    @staticmethod
+    def _date_operators():
+        return ["year", "month", "day", "hour", "minute", "second", "week_day"]
+
+    @staticmethod
+    def _comparison_operators():
+        return [
+            "gte", "lte", "gt", "lt", "exact", "in", "range",
+            "contains", "startswith", "endswith", "isnull",
+        ]
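Since _filter_handler.py is new in this release, a short standalone sketch may help illustrate the filter-key grammar it parses (field, field__operation, or field__casting__operation). The DataFrame and column names below are purely illustrative:

import pandas as pd
import dask.dataframe as dd
from sibi_dst.df_helper.core import FilterHandler

pdf = pd.DataFrame({
    "created_at": pd.to_datetime(["2024-01-01 08:00", "2024-02-15 12:30", "2024-03-20 17:45"]),
    "status": ["open", "closed", "open"],
})
ddf = dd.from_pandas(pdf, npartitions=1)

handler = FilterHandler(backend="dask")
filtered = handler.apply_filters(
    ddf,
    filters={
        "created_at__date__range": ["2024-01-01", "2024-02-28"],  # casting + operation
        "status__in": ["open"],                                   # operation only
    },
)
print(filtered.compute())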
sibi_dst/df_helper/plugins/django/_django_load_from_db.py
CHANGED
@@ -1,5 +1,8 @@
+import warnings
+
 import dask.dataframe as dd
 import pandas as pd
+from IPython.core.hooks import deprecated
 from django.db.models import Q

 from sibi_dst.df_helper.plugins.django import ReadFrameDask
@@ -12,13 +15,11 @@ class DjangoLoadFromDb:
     def __init__(self, db_connection, db_query, db_params, logger, **kwargs):
         self.connection_config = db_connection
         self.debug = kwargs.pop('debug', False)
-        self.verbose_debug = kwargs.pop('verbose_debug', False)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         if self.connection_config.model is None:
             if self.debug:
-                self.logger.
-
-                print('Model must be specified')
+                self.logger.debug('Model must be specified')
+
             raise ValueError('Model must be specified')

         self.query_config = db_query
@@ -45,7 +46,7 @@ class DjangoLoadFromDb:
             try:
                 self.df = ReadFrameDask(queryset, **self.params_config.df_params).read_frame()
             except Exception as e:
-                self.logger.
+                self.logger.debug(f'Error loading query: {str(queryset.query)}, error message: {e}')
                 self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         else:
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
@@ -69,16 +70,28 @@ class DjangoLoadFromDb:
         :param df: Dask DataFrame whose columns' data types are to be converted.
         :return: Dask DataFrame with converted column data types.
         """
-
-
-
-
-
-
-
-
+        """
+        [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+
+        :param df: Dask DataFrame whose columns' data types are to be converted.
+        :return: Dask DataFrame with converted column data types.
+        """
+        # Emit deprecation warning
+        warnings.warn(
+            "_convert_columns is deprecated and will be removed in a future release. "
+            "Consider using <new_method_name> instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        # Log deprecation message if debug mode is enabled
         if self.debug:
-            self.logger.
+            self.logger.warning(
+                "[DEPRECATION NOTICE] The `_convert_columns` method is deprecated and will be removed in a future release. "
+                "Consider using <new_method_name> instead."
+            )
+
+        self.logger.debug(f'Converting columns: {list(df.columns)}')

         # Get field information from the Django model
         model_fields = self.connection_config.model._meta.get_fields()
@@ -87,13 +100,13 @@ class DjangoLoadFromDb:
         for field_name, field_type in field_type_map.items():
             if field_name not in df.columns:

-
+                self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
                 continue

             conversion_func = django_field_conversion_map_dask.get(field_type)
             if not conversion_func:
                 message=f"Field type '{field_type}' not found in conversion_map."
-
+                self.logger.debug(message)
                 continue

             def apply_conversion(partition):
@@ -104,7 +117,7 @@ class DjangoLoadFromDb:
                     if field_name in partition.columns:
                         partition[field_name] = conversion_func(partition[field_name])
                 except Exception as e:
-                    self.logger.
+                    self.logger.debug(f"Error converting column '{field_name}' in partition: {str(e)}")
                 return partition

             try:
@@ -113,9 +126,8 @@ class DjangoLoadFromDb:
                     apply_conversion,
                     meta=df,
                 )
-
-                    is_verbose=True)
+                self.logger.debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.")
             except Exception as e:
-
+                self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")

         return df
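One practical note on the new warnings.warn call: Python hides DeprecationWarning by default unless the warning is triggered from code in __main__ (test runners often re-enable it), so callers who want to see the _convert_columns notice without turning on debug logging would typically opt in themselves, for example:

import warnings

# Re-enable DeprecationWarning so deprecated helpers such as _convert_columns are reported.
warnings.simplefilter("default", DeprecationWarning)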
sibi_dst/df_helper/plugins/http/_http_config.py
CHANGED
@@ -31,17 +31,17 @@ class HttpConfig(BaseModel):
             # Set up headers with API key if provided
             headers = {"Authorization": f"Bearer {self.api_key.get_secret_value()}"} if self.api_key else {}

-            self.logger.
+            self.logger.debug(f"Fetching data from {formatted_url} with params {self.params}")
             async with httpx.AsyncClient() as client:
                 response = await client.get(formatted_url, params=self.params, headers=headers, timeout=self.timeout)
                 response.raise_for_status()
                 data = response.json()
                 df = dd.from_pandas(pd.json_normalize(data), npartitions=1)
-                self.logger.
+                self.logger.debug("Data successfully loaded from HTTP JSON source.")
                 return df
         except httpx.RequestError as e:
-            self.logger.
+            self.logger.debug(f"HTTP request error: {e}")
             raise
         except ValueError as e:
-            self.logger.
+            self.logger.debug(f"Error parsing JSON data: {e}")
             raise
sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py
CHANGED
@@ -8,7 +8,7 @@ from sqlalchemy.orm import sessionmaker

 from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
 from sibi_dst.utils import Logger
-
+from sibi_dst.df_helper.core import FilterHandler

 class SQLAlchemyDask:
     def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
@@ -38,7 +38,6 @@ class SQLAlchemyDask:
         """
         mapper = inspect(model)
         sqlalchemy_to_dask_dtype = {
-            #'INTEGER': pd.to_numeric(x, errors="coerce"),
             'INTEGER': 'Int64',
             'SMALLINT': 'Int64',
             'BIGINT': 'Int64',
@@ -72,11 +71,15 @@ class SQLAlchemyDask:
         # Build query
         self.query = select(self.model)
         if self.filters:
-
+            """
+            deprecated specific filter handling to a generic one
+            #self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+            """
+            self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query, model=self.model, filters=self.filters)
         else:
             n_records = 100
             self.query = self.query.limit(n_records)
-
+        self.logger.debug(f"query:{self.query}")
         # Infer dtypes
         dtypes = self.infer_dtypes_from_model(self.model)
         # Get the column order from the SQLAlchemy model
@@ -124,14 +127,11 @@ class SQLAlchemyDask:
             else:
                 dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)

-
-            self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
-
-            if isinstance(dask_df, dask_expr._collection.DataFrame):
-                dask_df = dask_df.to_legacy_dataframe()
+            self.logger.debug(f"Loaded {len(dask_df)} rows into Dask DataFrame.")

             return dask_df

         except Exception as e:
             self.logger.error(f"Error executing query: {str(e)}")
+            self.logger.error(self.query)
             return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
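The same FilterHandler grammar now drives this SQLAlchemy path. A minimal sketch of the translation into query conditions, using a hypothetical declarative model (table, columns, and filter values are illustrative only):

from sqlalchemy import Column, DateTime, Integer, String, select
from sqlalchemy.orm import declarative_base
from sibi_dst.df_helper.core import FilterHandler

Base = declarative_base()

class Order(Base):  # hypothetical model for illustration
    __tablename__ = "orders"
    id = Column(Integer, primary_key=True)
    created_at = Column(DateTime)
    status = Column(String(20))

query = select(Order)
query = FilterHandler(backend="sqlalchemy").apply_filters(
    query,
    model=Order,
    filters={"created_at__date__gte": "2024-01-01", "status__exact": "open"},
)
print(query)  # renders the SELECT with the translated WHERE clauses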
sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py
CHANGED
@@ -1,7 +1,9 @@
-from sqlalchemy import and_, or_, not_, func, cast
-from sqlalchemy.sql.sqltypes import Date, Time
 import datetime

+from sqlalchemy import func, cast
+from sqlalchemy.sql.sqltypes import Date, Time
+
+
 class SqlAlchemyFilterHandler:
     @staticmethod
     def apply_filters_sqlalchemy(query, model, filters):