sibi-dst 0.3.15__tar.gz → 0.3.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/PKG-INFO +2 -1
  2. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/pyproject.toml +2 -1
  3. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/_df_helper.py +36 -20
  4. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/core/__init__.py +6 -4
  5. sibi_dst-0.3.16/sibi_dst/df_helper/core/_filter_handler.py +216 -0
  6. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/django/_django_load_from_db.py +32 -20
  7. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/django/_io_dask.py +0 -3
  8. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/http/_http_config.py +4 -4
  9. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +9 -9
  10. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +4 -2
  11. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +8 -6
  12. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +1 -1
  13. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py +2 -3
  14. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/utils/_clickhouse_writer.py +16 -16
  15. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/utils/_data_utils.py +5 -4
  16. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/utils/_data_wrapper.py +8 -4
  17. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/utils/_df_utils.py +5 -5
  18. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/utils/_log_utils.py +3 -0
  19. sibi_dst-0.3.16/sibi_dst/utils/_parquet_saver.py +106 -0
  20. sibi_dst-0.3.15/sibi_dst/utils/_parquet_saver.py +0 -211
  21. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/README.md +0 -0
  22. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/__init__.py +0 -0
  23. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/__init__.py +0 -0
  24. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  25. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/core/_defaults.py +0 -0
  26. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/core/_params_config.py +0 -0
  27. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/core/_query_config.py +0 -0
  28. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/__init__.py +0 -0
  29. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/django/__init__.py +0 -0
  30. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/django/_django_db_connection.py +0 -0
  31. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py +0 -0
  32. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/django/_io_dask_alt.py +0 -0
  33. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/http/__init__.py +0 -0
  34. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/parquet/__init__.py +0 -0
  35. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py +0 -0
  36. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/parquet/_parquet_options.py +0 -0
  37. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/sql_alchemy/__init__.py +0 -0
  38. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
  39. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/sql_model/__init__.py +0 -0
  40. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py +0 -0
  41. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/utils/__init__.py +0 -0
  42. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/utils/_airflow_manager.py +0 -0
  43. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/utils/_credentials.py +0 -0
  44. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/utils/_date_utils.py +0 -0
  45. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/utils/_file_utils.py +0 -0
  46. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/utils/_filepath_generator.py +0 -0
  47. {sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/utils/_storage_manager.py +0 -0
{sibi_dst-0.3.15 → sibi_dst-0.3.16}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.15
+Version: 0.3.16
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -13,6 +13,7 @@ Requires-Dist: chardet (>=5.2.0,<6.0.0)
 Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
+Requires-Dist: dask-expr (>=1.1.20,<2.0.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
 Requires-Dist: django (>=5.1.4,<6.0.0)
 Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
{sibi_dst-0.3.15 → sibi_dst-0.3.16}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.15"
+version = "0.3.16"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -33,6 +33,7 @@ charset-normalizer = "^3.4.0"
 uvicorn = "^0.32.1"
 sqlalchemy = "^2.0.36"
 djangorestframework = "^3.15.2"
+dask-expr = "^1.1.20"


 [build-system]
{sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/_df_helper.py
@@ -1,13 +1,16 @@
 import asyncio
 import datetime
+import logging
+import warnings
 from typing import Any, Dict, TypeVar
 from typing import Union, Optional

 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 from pydantic import BaseModel

-from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
+from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
 from sibi_dst.utils import Logger
 from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from .plugins.django import *
@@ -18,6 +21,12 @@ from .plugins.sql_alchemy import *
 # Define a generic type variable for BaseModel subclasses
 T = TypeVar("T", bound=BaseModel)

+# It is considered acceptable in Django to access protected class members
+warnings.filterwarnings(
+    "ignore",
+    message="Access to a protected member _meta",
+    category=UserWarning,
+)

 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
@@ -36,9 +45,12 @@ class DfHelper:
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
         self.source = source
-        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.debug = kwargs.setdefault("debug", False)
-        self.verbose_debug = kwargs.setdefault("verbose_debug", False)
+        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
+        # Configure logger level
+        self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
+        # Configure logger level
+        self.logger.debug("Logger initialized in DEBUG mode.")
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
         self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
@@ -47,7 +59,7 @@ class DfHelper:
         self.post_init(**kwargs)

     def post_init(self, **kwargs):
-        self.logger.info(f"Source used: {self.source}")
+        self.logger.debug(f"Source used: {self.source}")
         self.plugin_query = self.__get_config(QueryConfig, kwargs)
         self.plugin_params = self.__get_config(ParamsConfig, kwargs)
         if self.source == 'django_db':
@@ -93,16 +105,15 @@ class DfHelper:
             return self._load_from_parquet(**options)
         elif self.source == 'http':
             if asyncio.get_event_loop().is_running():
-                self.logger.info("Running as a task from an event loop")
+                self.logger.debug("Running as a task from an event loop")
                 return asyncio.create_task(self._load_from_http(**options))
             else:
-                self.logger.info("Regular asyncio run...")
+                self.logger.debug("Regular asyncio run...")
                 return asyncio.run(self._load_from_http(**options))

     def _load_from_sqlalchemy(self, **options):
         try:
             options.setdefault("debug", self.debug)
-            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = SqlAlchemyLoadFromDb(
                 self.plugin_sqlalchemy,
                 self.plugin_query,
@@ -113,9 +124,9 @@ class DfHelper:
             self.df = db_loader.build_and_load()
             self._process_loaded_data()
             self._post_process_df()
-            self.logger.info("Data successfully loaded from sqlalchemy database.")
+            self.logger.debug("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
-            self.logger.error(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
+            self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)

         return self.df
@@ -123,7 +134,6 @@ class DfHelper:
     def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         try:
             options.setdefault("debug", self.debug)
-            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = DjangoLoadFromDb(
                 self.plugin_django_connection,
                 self.plugin_query,
@@ -134,9 +144,9 @@ class DfHelper:
             self.df = db_loader.build_and_load()
             self._process_loaded_data()
             self._post_process_df()
-            self.logger.info("Data successfully loaded from django database.")
+            self.logger.debug("Data successfully loaded from django database.")
         except Exception as e:
-            self.logger.error(f"Failed to load data from django database: {e}")
+            self.logger.debug(f"Failed to load data from django database: {e}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)

         return self.df
@@ -144,12 +154,12 @@ class DfHelper:
     async def _load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Delegate asynchronous HTTP data loading to HttpDataSource plugin."""
         if not self.plugin_http:
-            self.logger.error("HTTP plugin not configured properly.")
+            self.logger.debug("HTTP plugin not configured properly.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
         try:
             self.df = await self.plugin_http.fetch_data(**options)
         except Exception as e:
-            self.logger.error(f"Failed to load data from http plugin: {e}")
+            self.logger.debug(f"Failed to load data from http plugin: {e}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         return self.df

@@ -190,10 +200,10 @@ class DfHelper:
         if datetime_index and self.df.index.dtype != 'datetime64[ns]':
            self.df = self.df.map_partitions(lambda df: df.set_index(pd.to_datetime(df.index, errors='coerce')))

-        self.logger.info("Post-processing of DataFrame completed.")
+        self.logger.debug("Post-processing of DataFrame completed.")

     def _process_loaded_data(self):
-        self.logger.info(f"Type of self.df: {type(self.df)}")
+        self.logger.debug(f"Type of self.df: {type(self.df)}")
         if self.df.map_partitions(len).compute().sum() > 0:
             field_map = self.plugin_params.field_map or {}
             if isinstance(field_map, dict):
@@ -211,25 +221,30 @@ class DfHelper:
                 # Apply renaming
                 self.df = self.df.map_partitions(rename_columns, mapping=rename_mapping)

-        self.logger.info("Processing of loaded data completed.")
+        self.logger.debug("Processing of loaded data completed.")

     def save_to_parquet(self, parquet_filename: Optional[str] = None):
         ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
         ps.save_to_parquet(parquet_filename)
-        self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
+        self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")

     def save_to_clickhouse(self, **credentials):
         if self.df.map_partitions(len).compute().sum() == 0:
-            self.logger.info("Cannot write to clickhouse since Dataframe is empty")
+            self.logger.debug("Cannot write to clickhouse since Dataframe is empty")
             return
         cs = ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
-        self.logger.info("Save to ClickHouse completed.")
+        self.logger.debug("Save to ClickHouse completed.")

     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.plugin_parquet.load_files()
         if options:
+            """
+            deprecated specific filter handling to a generic one
             self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
+
+            """
+            self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
         return self.df

     def load_period(self, **kwargs):
@@ -294,6 +309,7 @@ class DfHelper:
         elif is_datetime_field:
             kwargs[f"{mapped_field}__date__gte"] = start
             kwargs[f"{mapped_field}__date__lte"] = end
+        self.logger.debug(f"load_period kwargs: {kwargs}")
         return self.load(**kwargs)

     @staticmethod
{sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/core/__init__.py
@@ -1,12 +1,13 @@
 from __future__ import annotations

-from ._params_config import ParamsConfig
-from ._query_config import QueryConfig
 from ._defaults import (
     django_field_conversion_map_pandas,
     django_field_conversion_map_dask,
     sqlalchemy_field_conversion_map_dask,
     normalize_sqlalchemy_type)
+from ._filter_handler import FilterHandler
+from ._params_config import ParamsConfig
+from ._query_config import QueryConfig

 __all__ = [
     "ParamsConfig",
@@ -14,5 +15,6 @@ __all__ = [
     "django_field_conversion_map_pandas",
     "django_field_conversion_map_dask",
     "sqlalchemy_field_conversion_map_dask",
-    "normalize_sqlalchemy_type"
-]
+    "normalize_sqlalchemy_type",
+    "FilterHandler",
+]
sibi_dst-0.3.16/sibi_dst/df_helper/core/_filter_handler.py (new file)
@@ -0,0 +1,216 @@
+import datetime
+import dask.dataframe as dd
+import pandas as pd
+from sqlalchemy import func, cast
+from sqlalchemy.sql.sqltypes import Date, Time
+from sibi_dst.utils import Logger
+
+class FilterHandler:
+    def __init__(self, backend, logger=None):
+        """
+        Initialize the FilterHandler.
+
+        Args:
+            backend: The backend to use ('sqlalchemy' or 'dask').
+            logger: Optional logger for debugging purposes.
+        """
+        self.backend = backend
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)  # No-op logger if none provided
+        self.backend_methods = self._get_backend_methods(backend)
+
+    def apply_filters(self, query_or_df, model=None, filters=None):
+        """
+        Apply filters to the data source based on the backend.
+
+        Args:
+            query_or_df: SQLAlchemy query or Dask DataFrame.
+            model: SQLAlchemy model (required for SQLAlchemy backend).
+            filters: Dictionary of filters.
+
+        Returns:
+            Filtered query or DataFrame.
+        """
+        filters = filters or {}
+        for key, value in filters.items():
+            field_name, casting, operation = self._parse_filter_key(key)
+            parsed_value = self._parse_filter_value(casting, value)
+            # print(field_name, casting, operation, parsed_value)
+            # Get the column and apply backend-specific transformations
+            if self.backend == "sqlalchemy":
+                column = self.backend_methods["get_column"](field_name, model, casting)
+                condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
+                query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
+
+            elif self.backend == "dask":
+                column = self.backend_methods["get_column"](query_or_df, field_name, casting)
+                condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
+                query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
+            else:
+                raise ValueError(f"Unsupported backend: {self.backend}")
+
+        return query_or_df
+
+    @staticmethod
+    def _parse_filter_key(key):
+        parts = key.split("__")
+        field_name = parts[0]
+        casting = None
+        operation = "exact"
+
+        if len(parts) == 3:
+            _, casting, operation = parts
+        elif len(parts) == 2:
+            if parts[1] in FilterHandler._comparison_operators():
+                operation = parts[1]
+            elif parts[1] in FilterHandler._dt_operators() + FilterHandler._date_operators():
+                casting = parts[1]
+
+        return field_name, casting, operation
+
+
+    def _parse_filter_value(self, casting, value):
+        """
+        Convert filter value to appropriate type based on the casting (e.g., date).
+        """
+        if casting == "date":
+            if isinstance(value, str):
+                parsed = pd.Timestamp(value)  # Convert to datetime64[ns]
+                return parsed
+            if isinstance(value, list):
+                parsed = [pd.Timestamp(v) for v in value]
+                return parsed
+        elif casting == "time" and isinstance(value, str):
+            parsed = datetime.time.fromisoformat(value)
+            self.logger.debug(f"Parsed value (time): {parsed}")
+            return parsed
+        return value
+
+    @staticmethod
+    def _get_backend_methods(backend):
+        if backend == "sqlalchemy":
+            return {
+                "get_column": FilterHandler._get_sqlalchemy_column,
+                "apply_operation": FilterHandler._apply_operation_sqlalchemy,
+                "apply_condition": lambda query, condition: query.filter(condition),
+            }
+        elif backend == "dask":
+            return {
+                "get_column": FilterHandler._get_dask_column,
+                "apply_operation": FilterHandler._apply_operation_dask,
+                "apply_condition": lambda df, condition: df[condition],
+            }
+        else:
+            raise ValueError(f"Unsupported backend: {backend}")
+
+    @staticmethod
+    def _get_sqlalchemy_column(field_name, model, casting):
+        """
+        Retrieve and cast a column for SQLAlchemy based on the field name and casting.
+
+        Args:
+            field_name: The name of the field/column in the model.
+            model: The SQLAlchemy model.
+            casting: The casting type ('date', 'time', etc.).
+
+        Returns:
+            The SQLAlchemy column object, optionally cast or transformed.
+        """
+        column = getattr(model, field_name, None)
+        if not column:
+            raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
+
+        if casting == "date":
+            # Cast the column to Date for whole-date comparisons
+            column = cast(column, Date)
+        elif casting == "time":
+            # Cast the column to Time for time-specific comparisons
+            column = cast(column, Time)
+        elif casting in FilterHandler._date_operators():
+            # Extract date part (e.g., year, month) using SQLAlchemy functions
+            column = func.extract(casting, column)
+
+        return column
+
+    @staticmethod
+    def _get_dask_column(df, field_name, casting):
+        """
+        Retrieve and optionally cast a column for Dask based on the field name and casting.
+
+        Args:
+            df: The Dask DataFrame.
+            field_name: The name of the field/column in the DataFrame.
+            casting: The casting type ('date', 'time', etc.).
+
+        Returns:
+            The Dask Series object, optionally cast or transformed.
+        """
+        column = dd.to_datetime(df[field_name], errors="coerce") if casting in FilterHandler._dt_operators() else df[
+            field_name]
+
+        if casting == "date":
+            column = column.dt.floor("D")  # Ensure truncation to the date level
+        elif casting in FilterHandler._date_operators():
+            column = getattr(column.dt, casting)
+
+        return column
+
+    @staticmethod
+    def _apply_operation_sqlalchemy(column, operation, value):
+        operation_map = FilterHandler._operation_map_sqlalchemy()
+        if operation not in operation_map:
+            raise ValueError(f"Unsupported operation: {operation}")
+        return operation_map[operation](column, value)
+
+    @staticmethod
+    def _apply_operation_dask(column, operation, value):
+        operation_map = FilterHandler._operation_map_dask()
+        if operation not in operation_map:
+            raise ValueError(f"Unsupported operation: {operation}")
+        return operation_map[operation](column, value)
+
+    @staticmethod
+    def _operation_map_sqlalchemy():
+        return {
+            "exact": lambda col, val: col == val,
+            "gt": lambda col, val: col > val,
+            "gte": lambda col, val: col >= val,
+            "lt": lambda col, val: col < val,
+            "lte": lambda col, val: col <= val,
+            "in": lambda col, val: col.in_(val),
+            "range": lambda col, val: col.between(val[0], val[1]),
+            "contains": lambda col, val: col.like(f"%{val}%"),
+            "startswith": lambda col, val: col.like(f"{val}%"),
+            "endswith": lambda col, val: col.like(f"%{val}"),
+            "isnull": lambda col, val: col.is_(None) if val else col.isnot(None),
+        }
+
+    @staticmethod
+    def _operation_map_dask():
+        return {
+            "exact": lambda col, val: col == val,
+            "gt": lambda col, val: col > val,
+            "gte": lambda col, val: col >= val,
+            "lt": lambda col, val: col < val,
+            "lte": lambda col, val: col <= val,
+            "in": lambda col, val: col.isin(val),
+            "range": lambda col, val: (col >= val[0]) & (col <= val[1]),
+            "contains": lambda col, val: col.str.contains(val, regex=True),
+            "startswith": lambda col, val: col.str.startswith(val),
+            "endswith": lambda col, val: col.str.endswith(val),
+            "isnull": lambda col, val: col.isnull() if val else col.notnull(),
+        }
+
+    @staticmethod
+    def _dt_operators():
+        return ["date", "time"]
+
+    @staticmethod
+    def _date_operators():
+        return ["year", "month", "day", "hour", "minute", "second", "week_day"]
+
+    @staticmethod
+    def _comparison_operators():
+        return [
+            "gte", "lte", "gt", "lt", "exact", "in", "range",
+            "contains", "startswith", "endswith", "isnull",
+        ]
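
Illustration (not part of the released diff): a minimal sketch of how the FilterHandler introduced above might be driven with the dask backend. The DataFrame, column names, and filter values are hypothetical; the filter-key convention is the one implemented by _parse_filter_key, where a key such as created_at__date__gte splits into field created_at, casting date, and operation gte, while status__in is a plain comparison.

import pandas as pd
import dask.dataframe as dd
from sibi_dst.df_helper.core import FilterHandler

# Hypothetical sample data
pdf = pd.DataFrame({
    "status": ["open", "closed", "open"],
    "created_at": pd.to_datetime(["2024-01-05", "2024-02-10", "2024-03-15"]),
})
ddf = dd.from_pandas(pdf, npartitions=1)

handler = FilterHandler(backend="dask")
filtered = handler.apply_filters(
    ddf,
    filters={
        "created_at__date__gte": "2024-02-01",  # parsed to pd.Timestamp, column floored to the day
        "status__in": ["open"],                 # maps to Series.isin
    },
)
print(filtered.compute())

Because each condition is applied by successive df[condition] masking, the result keeps only rows that satisfy every filter, which mirrors how _load_from_parquet in DfHelper now applies its options.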
{sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/django/_django_load_from_db.py
@@ -1,5 +1,8 @@
+import warnings
+
 import dask.dataframe as dd
 import pandas as pd
+from IPython.core.hooks import deprecated
 from django.db.models import Q

 from sibi_dst.df_helper.plugins.django import ReadFrameDask
@@ -12,13 +15,11 @@ class DjangoLoadFromDb:
     def __init__(self, db_connection, db_query, db_params, logger, **kwargs):
         self.connection_config = db_connection
         self.debug = kwargs.pop('debug', False)
-        self.verbose_debug = kwargs.pop('verbose_debug', False)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         if self.connection_config.model is None:
             if self.debug:
-                self.logger.critical('Model must be specified')
-                if self.verbose_debug:
-                    print('Model must be specified')
+                self.logger.debug('Model must be specified')
+
             raise ValueError('Model must be specified')

         self.query_config = db_query
@@ -45,7 +46,7 @@ class DjangoLoadFromDb:
             try:
                 self.df = ReadFrameDask(queryset, **self.params_config.df_params).read_frame()
             except Exception as e:
-                self.logger.critical(f'Error loading query: {str(queryset.query)}, error message: {e}')
+                self.logger.debug(f'Error loading query: {str(queryset.query)}, error message: {e}')
                 self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         else:
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
@@ -69,16 +70,28 @@ class DjangoLoadFromDb:
         :param df: Dask DataFrame whose columns' data types are to be converted.
         :return: Dask DataFrame with converted column data types.
         """
-
-        def log_debug(message: str, is_verbose: bool = False):
-            """Helper to handle debug and verbose debug logging."""
-            if self.debug:
-                self.logger.debug(message)
-            if is_verbose and self.verbose_debug:
-                print(message)
-
+        """
+        [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+
+        :param df: Dask DataFrame whose columns' data types are to be converted.
+        :return: Dask DataFrame with converted column data types.
+        """
+        # Emit deprecation warning
+        warnings.warn(
+            "_convert_columns is deprecated and will be removed in a future release. "
+            "Consider using <new_method_name> instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        # Log deprecation message if debug mode is enabled
         if self.debug:
-            self.logger.info(f'Converting columns: {list(df.columns)}')
+            self.logger.warning(
+                "[DEPRECATION NOTICE] The `_convert_columns` method is deprecated and will be removed in a future release. "
+                "Consider using <new_method_name> instead."
+            )
+
+        self.logger.debug(f'Converting columns: {list(df.columns)}')

         # Get field information from the Django model
         model_fields = self.connection_config.model._meta.get_fields()
@@ -87,13 +100,13 @@ class DjangoLoadFromDb:
         for field_name, field_type in field_type_map.items():
             if field_name not in df.columns:

-                log_debug(f"Column '{field_name}' not found in DataFrame columns.")
+                self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
                 continue

             conversion_func = django_field_conversion_map_dask.get(field_type)
             if not conversion_func:
                 message=f"Field type '{field_type}' not found in conversion_map."
-                log_debug(message, is_verbose=True)
+                self.logger.debug(message)
                 continue

             def apply_conversion(partition):
@@ -104,7 +117,7 @@ class DjangoLoadFromDb:
                     if field_name in partition.columns:
                         partition[field_name] = conversion_func(partition[field_name])
                 except Exception as e:
-                    self.logger.error(f"Error converting column '{field_name}' in partition: {str(e)}")
+                    self.logger.debug(f"Error converting column '{field_name}' in partition: {str(e)}")
                 return partition

             try:
@@ -113,9 +126,8 @@ class DjangoLoadFromDb:
                     apply_conversion,
                     meta=df,
                 )
-                log_debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.",
-                          is_verbose=True)
+                self.logger.debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.")
             except Exception as e:
-                log_debug(f"Failed to queue conversion for column '{field_name}': {str(e)}", is_verbose=True)
+                self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")

         return df
{sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/django/_io_dask.py
@@ -240,7 +240,4 @@ class ReadFrameDask:
         if verbose:
             self.update_with_verbose(dask_df, fieldnames, fields)

-        if isinstance(dask_df, dask_expr._collection.DataFrame):
-            dask_df = dask_df.to_legacy_dataframe()
-
         return dask_df
{sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/http/_http_config.py
@@ -31,17 +31,17 @@ class HttpConfig(BaseModel):
             # Set up headers with API key if provided
             headers = {"Authorization": f"Bearer {self.api_key.get_secret_value()}"} if self.api_key else {}

-            self.logger.info(f"Fetching data from {formatted_url} with params {self.params}")
+            self.logger.debug(f"Fetching data from {formatted_url} with params {self.params}")
             async with httpx.AsyncClient() as client:
                 response = await client.get(formatted_url, params=self.params, headers=headers, timeout=self.timeout)
                 response.raise_for_status()
                 data = response.json()
                 df = dd.from_pandas(pd.json_normalize(data), npartitions=1)
-                self.logger.info("Data successfully loaded from HTTP JSON source.")
+                self.logger.debug("Data successfully loaded from HTTP JSON source.")
                 return df
         except httpx.RequestError as e:
-            self.logger.error(f"HTTP request error: {e}")
+            self.logger.debug(f"HTTP request error: {e}")
             raise
         except ValueError as e:
-            self.logger.error(f"Error parsing JSON data: {e}")
+            self.logger.debug(f"Error parsing JSON data: {e}")
             raise
{sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py
@@ -8,7 +8,7 @@ from sqlalchemy.orm import sessionmaker

 from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
 from sibi_dst.utils import Logger
-
+from sibi_dst.df_helper.core import FilterHandler

 class SQLAlchemyDask:
     def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
@@ -38,7 +38,6 @@ class SQLAlchemyDask:
         """
         mapper = inspect(model)
         sqlalchemy_to_dask_dtype = {
-            #'INTEGER': pd.to_numeric(x, errors="coerce"),
             'INTEGER': 'Int64',
             'SMALLINT': 'Int64',
             'BIGINT': 'Int64',
@@ -72,11 +71,15 @@ class SQLAlchemyDask:
             # Build query
             self.query = select(self.model)
             if self.filters:
-                self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+                """
+                deprecated specific filter handling to a generic one
+                #self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+                """
+                self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query, model=self.model, filters=self.filters)
             else:
                 n_records = 100
                 self.query = self.query.limit(n_records)
-
+            self.logger.debug(f"query:{self.query}")
             # Infer dtypes
             dtypes = self.infer_dtypes_from_model(self.model)
             # Get the column order from the SQLAlchemy model
@@ -124,14 +127,11 @@ class SQLAlchemyDask:
             else:
                 dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)

-            if self.debug:
-                self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
-
-            if isinstance(dask_df, dask_expr._collection.DataFrame):
-                dask_df = dask_df.to_legacy_dataframe()
+            self.logger.debug(f"Loaded {len(dask_df)} rows into Dask DataFrame.")

             return dask_df

         except Exception as e:
             self.logger.error(f"Error executing query: {str(e)}")
+            self.logger.error(self.query)
             return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
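
Illustration (not part of the released diff): a minimal sketch of the sqlalchemy path that SQLAlchemyDask now takes when filters are present. The Order model and filter values are hypothetical; apply_filters relies on the query object exposing .filter(), which is the case for the select() construct used above in SQLAlchemy 1.4+/2.x.

from sqlalchemy import Column, DateTime, Integer, String, select
from sqlalchemy.orm import declarative_base

from sibi_dst.df_helper.core import FilterHandler

Base = declarative_base()

class Order(Base):  # hypothetical model, for illustration only
    __tablename__ = "orders"
    id = Column(Integer, primary_key=True)
    status = Column(String)
    created_at = Column(DateTime)

query = select(Order)
handler = FilterHandler(backend="sqlalchemy")
query = handler.apply_filters(
    query,
    model=Order,
    filters={
        "created_at__year": 2024,  # casting "year" -> func.extract("year", Order.created_at)
        "status__exact": "open",   # plain equality comparison
    },
)
print(query)  # inspect the generated SELECT ... WHERE clause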
{sibi_dst-0.3.15 → sibi_dst-0.3.16}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py
@@ -1,7 +1,9 @@
-from sqlalchemy import and_, or_, not_, func, cast
-from sqlalchemy.sql.sqltypes import Date, Time
 import datetime

+from sqlalchemy import func, cast
+from sqlalchemy.sql.sqltypes import Date, Time
+
+
 class SqlAlchemyFilterHandler:
     @staticmethod
     def apply_filters_sqlalchemy(query, model, filters):