sibi-dst 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,7 @@
  import asyncio
  import datetime
+ import logging
+ import warnings
  from typing import Any, Dict, TypeVar
  from typing import Union, Optional

@@ -8,9 +10,9 @@ import dask_expr
  import pandas as pd
  from pydantic import BaseModel

- from sibi_dst.utils import ParquetSaver, ClickHouseWriter
- from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
+ from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
  from sibi_dst.utils import Logger
+ from sibi_dst.utils import ParquetSaver, ClickHouseWriter
  from .plugins.django import *
  from .plugins.http import HttpConfig
  from .plugins.parquet import ParquetConfig, ParquetFilterHandler
@@ -19,6 +21,13 @@ from .plugins.sql_alchemy import *
  # Define a generic type variable for BaseModel subclasses
  T = TypeVar("T", bound=BaseModel)

+ # It is considered acceptable in Django to access protected class members
+ warnings.filterwarnings(
+     "ignore",
+     message="Access to a protected member _meta",
+     category=UserWarning,
+ )
+
  class DfHelper:
      df: Union[dd.DataFrame, pd.DataFrame] = None
      plugin_django_connection: Optional[DjangoConnectionConfig] = None
@@ -36,19 +45,21 @@ class DfHelper:
          self.default_config = self.default_config or {}
          kwargs = {**self.default_config.copy(), **kwargs}
          self.source = source
-         self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
          self.debug = kwargs.setdefault("debug", False)
-         self.verbose_debug = kwargs.setdefault("verbose_debug", False)
+         self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
+         # Configure logger level
+         self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
+         # Configure logger level
+         self.logger.debug("Logger initialized in DEBUG mode.")
          self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
-         self.dt_field=kwargs.setdefault("dt_field", None)
+         self.dt_field = kwargs.setdefault("dt_field", None)
          self.as_pandas = kwargs.setdefault("as_pandas", False)
          kwargs.setdefault("live", True)
          kwargs.setdefault("logger", self.logger)
          self.post_init(**kwargs)

-
      def post_init(self, **kwargs):
-         self.logger.info(f"Source used: {self.source}")
+         self.logger.debug(f"Source used: {self.source}")
          self.plugin_query = self.__get_config(QueryConfig, kwargs)
          self.plugin_params = self.__get_config(ParamsConfig, kwargs)
          if self.source == 'django_db':
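As a quick orientation (not shown in the diff itself), the net effect of the constructor change is that the logger level now follows the debug flag. A minimal, hypothetical call; the source value and kwargs are illustrative, assuming `source` is a constructor parameter as `self.source = source` indicates:

helper = DfHelper(
    source='parquet',                        # supported sources in this diff: 'django_db', 'sqlalchemy', 'http', 'parquet'
    parquet_storage_path='/data/parquet',    # illustrative path
    debug=True,                              # logger is set to logging.DEBUG instead of logging.INFO
)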
@@ -59,7 +70,7 @@ class DfHelper:
          elif self.source == 'http':
              self.plugin_http = HttpConfig(**kwargs)
          elif self.source == 'sqlalchemy':
-             self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig,kwargs)
+             self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)

      @staticmethod
      def __get_config(model: [T], kwargs: Dict[str, Any]) -> Union[T]:
@@ -94,17 +105,15 @@ class DfHelper:
              return self._load_from_parquet(**options)
          elif self.source == 'http':
              if asyncio.get_event_loop().is_running():
-                 self.logger.info("Running as a task from an event loop")
+                 self.logger.debug("Running as a task from an event loop")
                  return asyncio.create_task(self._load_from_http(**options))
              else:
-                 self.logger.info("Regular asyncio run...")
+                 self.logger.debug("Regular asyncio run...")
                  return asyncio.run(self._load_from_http(**options))

-
      def _load_from_sqlalchemy(self, **options):
          try:
              options.setdefault("debug", self.debug)
-             options.setdefault("verbose_debug", self.verbose_debug)
              db_loader = SqlAlchemyLoadFromDb(
                  self.plugin_sqlalchemy,
                  self.plugin_query,
@@ -115,9 +124,9 @@ class DfHelper:
              self.df = db_loader.build_and_load()
              self._process_loaded_data()
              self._post_process_df()
-             self.logger.info("Data successfully loaded from sqlalchemy database.")
+             self.logger.debug("Data successfully loaded from sqlalchemy database.")
          except Exception as e:
-             self.logger.error(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
+             self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
              self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)

          return self.df
@@ -125,7 +134,6 @@ class DfHelper:
      def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
          try:
              options.setdefault("debug", self.debug)
-             options.setdefault("verbose_debug", self.verbose_debug)
              db_loader = DjangoLoadFromDb(
                  self.plugin_django_connection,
                  self.plugin_query,
@@ -136,26 +144,25 @@ class DfHelper:
              self.df = db_loader.build_and_load()
              self._process_loaded_data()
              self._post_process_df()
-             self.logger.info("Data successfully loaded from django database.")
+             self.logger.debug("Data successfully loaded from django database.")
          except Exception as e:
-             self.logger.error(f"Failed to load data from django database: {e}")
-             self.df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+             self.logger.debug(f"Failed to load data from django database: {e}")
+             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)

          return self.df

      async def _load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
          """Delegate asynchronous HTTP data loading to HttpDataSource plugin."""
          if not self.plugin_http:
-             self.logger.error("HTTP plugin not configured properly.")
+             self.logger.debug("HTTP plugin not configured properly.")
              return dd.from_pandas(pd.DataFrame(), npartitions=1)
          try:
              self.df = await self.plugin_http.fetch_data(**options)
          except Exception as e:
-             self.logger.error(f"Failed to load data from http plugin: {e}")
-             self.df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+             self.logger.debug(f"Failed to load data from http plugin: {e}")
+             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
          return self.df

-
      def _post_process_df(self):
          """
          Efficiently process the DataFrame by filtering, renaming, and setting indices.
@@ -193,10 +200,10 @@ class DfHelper:
          if datetime_index and self.df.index.dtype != 'datetime64[ns]':
              self.df = self.df.map_partitions(lambda df: df.set_index(pd.to_datetime(df.index, errors='coerce')))

-         self.logger.info("Post-processing of DataFrame completed.")
+         self.logger.debug("Post-processing of DataFrame completed.")

      def _process_loaded_data(self):
-         self.logger.info(f"Type of self.df: {type(self.df)}")
+         self.logger.debug(f"Type of self.df: {type(self.df)}")
          if self.df.map_partitions(len).compute().sum() > 0:
              field_map = self.plugin_params.field_map or {}
              if isinstance(field_map, dict):
@@ -214,25 +221,30 @@ class DfHelper:
              # Apply renaming
              self.df = self.df.map_partitions(rename_columns, mapping=rename_mapping)

-         self.logger.info("Processing of loaded data completed.")
+         self.logger.debug("Processing of loaded data completed.")

      def save_to_parquet(self, parquet_filename: Optional[str] = None):
          ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
          ps.save_to_parquet(parquet_filename)
-         self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
+         self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")

      def save_to_clickhouse(self, **credentials):
          if self.df.map_partitions(len).compute().sum() == 0:
-             self.logger.info("Cannot write to clickhouse since Dataframe is empty")
+             self.logger.debug("Cannot write to clickhouse since Dataframe is empty")
              return
-         cs=ClickHouseWriter(logger=self.logger, **credentials)
+         cs = ClickHouseWriter(logger=self.logger, **credentials)
          cs.save_to_clickhouse(self.df)
-         self.logger.info("Save to ClickHouse completed.")
+         self.logger.debug("Save to ClickHouse completed.")

      def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
          self.df = self.plugin_parquet.load_files()
          if options:
+             """
+             deprecated specific filter handling to a generic one
              self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
+
+             """
+             self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
          return self.df

      def load_period(self, **kwargs):
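Because `_load_from_parquet` now delegates to the generic FilterHandler, parquet loads accept the same Django-style double-underscore keys as the other backends. A hedged sketch, reusing the helper from the earlier example (column names and values are invented):

df = helper.load(
    status__in=['open', 'pending'],          # plain comparison on an illustrative column
    created_at__date__gte='2024-01-01',      # value parsed to a timestamp, column floored to the day
)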
@@ -297,9 +309,9 @@ class DfHelper:
          elif is_datetime_field:
              kwargs[f"{mapped_field}__date__gte"] = start
              kwargs[f"{mapped_field}__date__lte"] = end
+         self.logger.debug(f"load_period kwargs: {kwargs}")
          return self.load(**kwargs)

-
      @staticmethod
      def parse_date(date_str: str) -> Union[datetime.datetime, datetime.date]:
          try:
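For context, a hypothetical load_period call that would produce the kwargs logged above; this assumes, as the surrounding code suggests, that the method accepts a dt_field together with start and end bounds:

df = helper.load_period(
    dt_field='created_at',    # expanded to created_at__date__gte / created_at__date__lte
    start='2024-01-01',
    end='2024-01-31',
)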
@@ -1,12 +1,13 @@
  from __future__ import annotations

- from ._params_config import ParamsConfig
- from ._query_config import QueryConfig
  from ._defaults import (
      django_field_conversion_map_pandas,
      django_field_conversion_map_dask,
      sqlalchemy_field_conversion_map_dask,
      normalize_sqlalchemy_type)
+ from ._filter_handler import FilterHandler
+ from ._params_config import ParamsConfig
+ from ._query_config import QueryConfig

  __all__ = [
      "ParamsConfig",
@@ -14,5 +15,6 @@ __all__ = [
      "django_field_conversion_map_pandas",
      "django_field_conversion_map_dask",
      "sqlalchemy_field_conversion_map_dask",
-     "normalize_sqlalchemy_type"
-     ]
+     "normalize_sqlalchemy_type",
+     "FilterHandler",
+     ]
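Since the new handler is re-exported from the core package, downstream code can import it directly; a minimal sketch:

from sibi_dst.df_helper.core import FilterHandler

handler = FilterHandler(backend='dask')   # or backend='sqlalchemy'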
@@ -0,0 +1,216 @@
+ import datetime
+ import dask.dataframe as dd
+ import pandas as pd
+ from sqlalchemy import func, cast
+ from sqlalchemy.sql.sqltypes import Date, Time
+ from sibi_dst.utils import Logger
+
+ class FilterHandler:
+     def __init__(self, backend, logger=None):
+         """
+         Initialize the FilterHandler.
+
+         Args:
+             backend: The backend to use ('sqlalchemy' or 'dask').
+             logger: Optional logger for debugging purposes.
+         """
+         self.backend = backend
+         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__) # No-op logger if none provided
+         self.backend_methods = self._get_backend_methods(backend)
+
+     def apply_filters(self, query_or_df, model=None, filters=None):
+         """
+         Apply filters to the data source based on the backend.
+
+         Args:
+             query_or_df: SQLAlchemy query or Dask DataFrame.
+             model: SQLAlchemy model (required for SQLAlchemy backend).
+             filters: Dictionary of filters.
+
+         Returns:
+             Filtered query or DataFrame.
+         """
+         filters = filters or {}
+         for key, value in filters.items():
+             field_name, casting, operation = self._parse_filter_key(key)
+             parsed_value = self._parse_filter_value(casting, value)
+             #print(field_name, casting, operation, parsed_value)
+             # Get the column and apply backend-specific transformations
+             if self.backend == "sqlalchemy":
+                 column = self.backend_methods["get_column"](field_name, model, casting)
+                 condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
+                 query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
+
+             elif self.backend == "dask":
+                 column = self.backend_methods["get_column"](query_or_df, field_name, casting)
+                 condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
+                 query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
+             else:
+                 raise ValueError(f"Unsupported backend: {self.backend}")
+
+         return query_or_df
+
+     @staticmethod
+     def _parse_filter_key(key):
+         parts = key.split("__")
+         field_name = parts[0]
+         casting = None
+         operation = "exact"
+
+         if len(parts) == 3:
+             _, casting, operation = parts
+         elif len(parts) == 2:
+             if parts[1] in FilterHandler._comparison_operators():
+                 operation = parts[1]
+             elif parts[1] in FilterHandler._dt_operators() + FilterHandler._date_operators():
+                 casting = parts[1]
+
+         return field_name, casting, operation
+
+
+     def _parse_filter_value(self, casting, value):
+         """
+         Convert filter value to appropriate type based on the casting (e.g., date).
+         """
+         if casting == "date":
+             if isinstance(value, str):
+                 parsed = pd.Timestamp(value) # Convert to datetime64[ns]
+                 return parsed
+             if isinstance(value, list):
+                 parsed = [pd.Timestamp(v) for v in value]
+                 return parsed
+         elif casting == "time" and isinstance(value, str):
+             parsed = datetime.time.fromisoformat(value)
+             self.logger.debug(f"Parsed value (time): {parsed}")
+             return parsed
+         return value
+
+     @staticmethod
+     def _get_backend_methods(backend):
+         if backend == "sqlalchemy":
+             return {
+                 "get_column": FilterHandler._get_sqlalchemy_column,
+                 "apply_operation": FilterHandler._apply_operation_sqlalchemy,
+                 "apply_condition": lambda query, condition: query.filter(condition),
+             }
+         elif backend == "dask":
+             return {
+                 "get_column": FilterHandler._get_dask_column,
+                 "apply_operation": FilterHandler._apply_operation_dask,
+                 "apply_condition": lambda df, condition: df[condition],
+             }
+         else:
+             raise ValueError(f"Unsupported backend: {backend}")
+
+     @staticmethod
+     def _get_sqlalchemy_column(field_name, model, casting):
+         """
+         Retrieve and cast a column for SQLAlchemy based on the field name and casting.
+
+         Args:
+             field_name: The name of the field/column in the model.
+             model: The SQLAlchemy model.
+             casting: The casting type ('date', 'time', etc.).
+
+         Returns:
+             The SQLAlchemy column object, optionally cast or transformed.
+         """
+         column = getattr(model, field_name, None)
+         if not column:
+             raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
+
+         if casting == "date":
+             # Cast the column to Date for whole-date comparisons
+             column = cast(column, Date)
+         elif casting == "time":
+             # Cast the column to Time for time-specific comparisons
+             column = cast(column, Time)
+         elif casting in FilterHandler._date_operators():
+             # Extract date part (e.g., year, month) using SQLAlchemy functions
+             column = func.extract(casting, column)
+
+         return column
+
+     @staticmethod
+     def _get_dask_column(df, field_name, casting):
+         """
+         Retrieve and optionally cast a column for Dask based on the field name and casting.
+
+         Args:
+             df: The Dask DataFrame.
+             field_name: The name of the field/column in the DataFrame.
+             casting: The casting type ('date', 'time', etc.).
+
+         Returns:
+             The Dask Series object, optionally cast or transformed.
+         """
+         column = dd.to_datetime(df[field_name], errors="coerce") if casting in FilterHandler._dt_operators() else df[
+             field_name]
+
+         if casting == "date":
+             column = column.dt.floor("D") # Ensure truncation to the date level
+         elif casting in FilterHandler._date_operators():
+             column = getattr(column.dt, casting)
+
+         return column
+
+     @staticmethod
+     def _apply_operation_sqlalchemy(column, operation, value):
+         operation_map = FilterHandler._operation_map_sqlalchemy()
+         if operation not in operation_map:
+             raise ValueError(f"Unsupported operation: {operation}")
+         return operation_map[operation](column, value)
+
+     @staticmethod
+     def _apply_operation_dask(column, operation, value):
+         operation_map = FilterHandler._operation_map_dask()
+         if operation not in operation_map:
+             raise ValueError(f"Unsupported operation: {operation}")
+         return operation_map[operation](column, value)
+
+     @staticmethod
+     def _operation_map_sqlalchemy():
+         return {
+             "exact": lambda col, val: col == val,
+             "gt": lambda col, val: col > val,
+             "gte": lambda col, val: col >= val,
+             "lt": lambda col, val: col < val,
+             "lte": lambda col, val: col <= val,
+             "in": lambda col, val: col.in_(val),
+             "range": lambda col, val: col.between(val[0], val[1]),
+             "contains": lambda col, val: col.like(f"%{val}%"),
+             "startswith": lambda col, val: col.like(f"{val}%"),
+             "endswith": lambda col, val: col.like(f"%{val}"),
+             "isnull": lambda col, val: col.is_(None) if val else col.isnot(None),
+         }
+
+     @staticmethod
+     def _operation_map_dask():
+         return {
+             "exact": lambda col, val: col == val,
+             "gt": lambda col, val: col > val,
+             "gte": lambda col, val: col >= val,
+             "lt": lambda col, val: col < val,
+             "lte": lambda col, val: col <= val,
+             "in": lambda col, val: col.isin(val),
+             "range": lambda col, val: (col >= val[0]) & (col <= val[1]),
+             "contains": lambda col, val: col.str.contains(val, regex=True),
+             "startswith": lambda col, val: col.str.startswith(val),
+             "endswith": lambda col, val: col.str.endswith(val),
+             "isnull": lambda col, val: col.isnull() if val else col.notnull(),
+         }
+
+     @staticmethod
+     def _dt_operators():
+         return ["date", "time"]
+
+     @staticmethod
+     def _date_operators():
+         return ["year", "month", "day", "hour", "minute", "second", "week_day"]
+
+     @staticmethod
+     def _comparison_operators():
+         return [
+             "gte", "lte", "gt", "lt", "exact", "in", "range",
+             "contains", "startswith", "endswith", "isnull",
+         ]
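To make the new handler concrete, here is a brief, hypothetical sketch (not part of the package) applying Django-style filter keys to a Dask DataFrame; the column names and values are invented for illustration:

import dask.dataframe as dd
import pandas as pd
from sibi_dst.df_helper.core import FilterHandler

pdf = pd.DataFrame({
    'status': ['open', 'closed', 'open'],
    'created_at': pd.to_datetime(['2024-01-01', '2024-02-15', '2024-03-01']),
})
ddf = dd.from_pandas(pdf, npartitions=1)

# Keys follow the field__cast__operation convention parsed by _parse_filter_key.
handler = FilterHandler(backend='dask')
filtered = handler.apply_filters(
    ddf,
    filters={'status__in': ['open'], 'created_at__date__gte': '2024-02-01'},
)
print(filtered.compute())   # rows with status 'open' on or after 2024-02-01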
@@ -1,5 +1,8 @@
+ import warnings
+
  import dask.dataframe as dd
  import pandas as pd
+ from IPython.core.hooks import deprecated
  from django.db.models import Q

  from sibi_dst.df_helper.plugins.django import ReadFrameDask
@@ -12,13 +15,11 @@ class DjangoLoadFromDb:
      def __init__(self, db_connection, db_query, db_params, logger, **kwargs):
          self.connection_config = db_connection
          self.debug = kwargs.pop('debug', False)
-         self.verbose_debug = kwargs.pop('verbose_debug', False)
          self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
          if self.connection_config.model is None:
              if self.debug:
-                 self.logger.critical('Model must be specified')
-                 if self.verbose_debug:
-                     print('Model must be specified')
+                 self.logger.debug('Model must be specified')
+
              raise ValueError('Model must be specified')

          self.query_config = db_query
@@ -45,7 +46,7 @@ class DjangoLoadFromDb:
              try:
                  self.df = ReadFrameDask(queryset, **self.params_config.df_params).read_frame()
              except Exception as e:
-                 self.logger.critical(f'Error loading query: {str(queryset.query)}, error message: {e}')
+                 self.logger.debug(f'Error loading query: {str(queryset.query)}, error message: {e}')
                  self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
          else:
              self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
@@ -69,16 +70,28 @@ class DjangoLoadFromDb:
          :param df: Dask DataFrame whose columns' data types are to be converted.
          :return: Dask DataFrame with converted column data types.
          """
-
-         def log_debug(message: str, is_verbose: bool = False):
-             """Helper to handle debug and verbose debug logging."""
-             if self.debug:
-                 self.logger.debug(message)
-             if is_verbose and self.verbose_debug:
-                 print(message)
-
+         """
+         [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+
+         :param df: Dask DataFrame whose columns' data types are to be converted.
+         :return: Dask DataFrame with converted column data types.
+         """
+         # Emit deprecation warning
+         warnings.warn(
+             "_convert_columns is deprecated and will be removed in a future release. "
+             "Consider using <new_method_name> instead.",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+
+         # Log deprecation message if debug mode is enabled
          if self.debug:
-             self.logger.info(f'Converting columns: {list(df.columns)}')
+             self.logger.warning(
+                 "[DEPRECATION NOTICE] The `_convert_columns` method is deprecated and will be removed in a future release. "
+                 "Consider using <new_method_name> instead."
+             )
+
+         self.logger.debug(f'Converting columns: {list(df.columns)}')

          # Get field information from the Django model
          model_fields = self.connection_config.model._meta.get_fields()
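One practical note (not from the diff): Python hides DeprecationWarning by default outside __main__, so callers who want to see the new warning emitted by `_convert_columns` have to opt in. A small, hypothetical sketch, where `loader` stands for a DjangoLoadFromDb instance:

import warnings

with warnings.catch_warnings():
    warnings.simplefilter('always', DeprecationWarning)
    df = loader._convert_columns(df)   # illustrative call; surfaces the DeprecationWarning above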
@@ -87,13 +100,13 @@ class DjangoLoadFromDb:
          for field_name, field_type in field_type_map.items():
              if field_name not in df.columns:

-                 log_debug(f"Column '{field_name}' not found in DataFrame columns.")
+                 self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
                  continue

              conversion_func = django_field_conversion_map_dask.get(field_type)
              if not conversion_func:
                  message=f"Field type '{field_type}' not found in conversion_map."
-                 log_debug(message, is_verbose=True)
+                 self.logger.debug(message)
                  continue

              def apply_conversion(partition):
@@ -104,7 +117,7 @@ class DjangoLoadFromDb:
                      if field_name in partition.columns:
                          partition[field_name] = conversion_func(partition[field_name])
                  except Exception as e:
-                     self.logger.error(f"Error converting column '{field_name}' in partition: {str(e)}")
+                     self.logger.debug(f"Error converting column '{field_name}' in partition: {str(e)}")
                  return partition

              try:
@@ -113,9 +126,8 @@ class DjangoLoadFromDb:
                      apply_conversion,
                      meta=df,
                  )
-                 log_debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.",
-                           is_verbose=True)
+                 self.logger.debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.")
              except Exception as e:
-                 log_debug(f"Failed to queue conversion for column '{field_name}': {str(e)}", is_verbose=True)
+                 self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")

          return df
@@ -240,7 +240,4 @@ class ReadFrameDask:
          if verbose:
              self.update_with_verbose(dask_df, fieldnames, fields)

-         if isinstance(dask_df, dask_expr._collection.DataFrame):
-             dask_df = dask_df.to_legacy_dataframe()
-
          return dask_df
@@ -31,17 +31,17 @@ class HttpConfig(BaseModel):
              # Set up headers with API key if provided
              headers = {"Authorization": f"Bearer {self.api_key.get_secret_value()}"} if self.api_key else {}

-             self.logger.info(f"Fetching data from {formatted_url} with params {self.params}")
+             self.logger.debug(f"Fetching data from {formatted_url} with params {self.params}")
              async with httpx.AsyncClient() as client:
                  response = await client.get(formatted_url, params=self.params, headers=headers, timeout=self.timeout)
                  response.raise_for_status()
                  data = response.json()
                  df = dd.from_pandas(pd.json_normalize(data), npartitions=1)
-                 self.logger.info("Data successfully loaded from HTTP JSON source.")
+                 self.logger.debug("Data successfully loaded from HTTP JSON source.")
                  return df
          except httpx.RequestError as e:
-             self.logger.error(f"HTTP request error: {e}")
+             self.logger.debug(f"HTTP request error: {e}")
              raise
          except ValueError as e:
-             self.logger.error(f"Error parsing JSON data: {e}")
+             self.logger.debug(f"Error parsing JSON data: {e}")
              raise
@@ -8,7 +8,7 @@ from sqlalchemy.orm import sessionmaker

  from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
  from sibi_dst.utils import Logger
-
+ from sibi_dst.df_helper.core import FilterHandler

  class SQLAlchemyDask:
      def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
@@ -38,7 +38,6 @@ class SQLAlchemyDask:
          """
          mapper = inspect(model)
          sqlalchemy_to_dask_dtype = {
-             #'INTEGER': pd.to_numeric(x, errors="coerce"),
              'INTEGER': 'Int64',
              'SMALLINT': 'Int64',
              'BIGINT': 'Int64',
@@ -72,11 +71,15 @@ class SQLAlchemyDask:
          # Build query
          self.query = select(self.model)
          if self.filters:
-             self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+             """
+             deprecated specific filter handling to a generic one
+             #self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+             """
+             self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query, model=self.model, filters=self.filters)
          else:
              n_records = 100
              self.query = self.query.limit(n_records)
-
+         self.logger.debug(f"query:{self.query}")
          # Infer dtypes
          dtypes = self.infer_dtypes_from_model(self.model)
          # Get the column order from the SQLAlchemy model
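For illustration only (not from the package), and assuming a SQLAlchemy 1.4+ declarative model with a DateTime column, a date-cast filter routed through the generic FilterHandler produces roughly the following WHERE clause:

from sqlalchemy import Column, DateTime, Integer, select
from sqlalchemy.orm import declarative_base
from sibi_dst.df_helper.core import FilterHandler

Base = declarative_base()

class Event(Base):                 # hypothetical model, for illustration only
    __tablename__ = 'events'
    id = Column(Integer, primary_key=True)
    created_at = Column(DateTime)

query = FilterHandler(backend='sqlalchemy').apply_filters(
    select(Event), model=Event, filters={'created_at__date__gte': '2024-01-01'},
)
# str(query) renders roughly:
#   SELECT events.id, events.created_at FROM events
#   WHERE CAST(events.created_at AS DATE) >= :param_1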
@@ -124,14 +127,11 @@ class SQLAlchemyDask:
              else:
                  dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)

-             if self.debug:
-                 self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
-
-             if isinstance(dask_df, dask_expr._collection.DataFrame):
-                 dask_df = dask_df.to_legacy_dataframe()
+             self.logger.debug(f"Loaded {len(dask_df)} rows into Dask DataFrame.")

              return dask_df

          except Exception as e:
              self.logger.error(f"Error executing query: {str(e)}")
+             self.logger.error(self.query)
              return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
@@ -1,7 +1,9 @@
- from sqlalchemy import and_, or_, not_, func, cast
- from sqlalchemy.sql.sqltypes import Date, Time
  import datetime

+ from sqlalchemy import func, cast
+ from sqlalchemy.sql.sqltypes import Date, Time
+
+
  class SqlAlchemyFilterHandler:
      @staticmethod
      def apply_filters_sqlalchemy(query, model, filters):