sibi-dst 0.3.15__py3-none-any.whl → 0.3.17__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

sibi_dst/df_helper/_df_helper.py

@@ -1,13 +1,16 @@
  import asyncio
  import datetime
+ import logging
+ import warnings
  from typing import Any, Dict, TypeVar
  from typing import Union, Optional

  import dask.dataframe as dd
+ import dask_expr
  import pandas as pd
  from pydantic import BaseModel

- from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
+ from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
  from sibi_dst.utils import Logger
  from sibi_dst.utils import ParquetSaver, ClickHouseWriter
  from .plugins.django import *
@@ -18,6 +21,12 @@ from .plugins.sql_alchemy import *
  # Define a generic type variable for BaseModel subclasses
  T = TypeVar("T", bound=BaseModel)

+ # It is considered acceptable in Django to access protected class members
+ warnings.filterwarnings(
+     "ignore",
+     message="Access to a protected member _meta",
+     category=UserWarning,
+ )

  class DfHelper:
      df: Union[dd.DataFrame, pd.DataFrame] = None
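
Editor's note: the module-level filter added above suppresses only UserWarnings whose message begins with "Access to a protected member _meta"; warnings.filterwarnings treats the message argument as a regular expression matched against the start of the warning text. A minimal, self-contained sketch of that behaviour (not part of sibi_dst):

    import warnings

    warnings.filterwarnings(
        "ignore",
        message="Access to a protected member _meta",
        category=UserWarning,
    )

    # Suppressed: the message starts with the filtered pattern.
    warnings.warn("Access to a protected member _meta of a client class", UserWarning)

    # Still emitted: an unrelated UserWarning does not match the filter.
    warnings.warn("something else worth flagging", UserWarning)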
@@ -36,9 +45,12 @@ class DfHelper:
          self.default_config = self.default_config or {}
          kwargs = {**self.default_config.copy(), **kwargs}
          self.source = source
-         self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
          self.debug = kwargs.setdefault("debug", False)
-         self.verbose_debug = kwargs.setdefault("verbose_debug", False)
+         self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
+         # Configure logger level
+         self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
+         # Configure logger level
+         self.logger.debug("Logger initialized in DEBUG mode.")
          self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
          self.dt_field = kwargs.setdefault("dt_field", None)
          self.as_pandas = kwargs.setdefault("as_pandas", False)
@@ -47,7 +59,7 @@ class DfHelper:
          self.post_init(**kwargs)

      def post_init(self, **kwargs):
-         self.logger.info(f"Source used: {self.source}")
+         self.logger.debug(f"Source used: {self.source}")
          self.plugin_query = self.__get_config(QueryConfig, kwargs)
          self.plugin_params = self.__get_config(ParamsConfig, kwargs)
          if self.source == 'django_db':
@@ -93,16 +105,15 @@ class DfHelper:
              return self._load_from_parquet(**options)
          elif self.source == 'http':
              if asyncio.get_event_loop().is_running():
-                 self.logger.info("Running as a task from an event loop")
+                 self.logger.debug("Running as a task from an event loop")
                  return asyncio.create_task(self._load_from_http(**options))
              else:
-                 self.logger.info("Regular asyncio run...")
+                 self.logger.debug("Regular asyncio run...")
                  return asyncio.run(self._load_from_http(**options))

      def _load_from_sqlalchemy(self, **options):
          try:
              options.setdefault("debug", self.debug)
-             options.setdefault("verbose_debug", self.verbose_debug)
              db_loader = SqlAlchemyLoadFromDb(
                  self.plugin_sqlalchemy,
                  self.plugin_query,
@@ -113,9 +124,9 @@ class DfHelper:
              self.df = db_loader.build_and_load()
              self._process_loaded_data()
              self._post_process_df()
-             self.logger.info("Data successfully loaded from sqlalchemy database.")
+             self.logger.debug("Data successfully loaded from sqlalchemy database.")
          except Exception as e:
-             self.logger.error(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
+             self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
              self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)

          return self.df
@@ -123,7 +134,6 @@ class DfHelper:
      def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
          try:
              options.setdefault("debug", self.debug)
-             options.setdefault("verbose_debug", self.verbose_debug)
              db_loader = DjangoLoadFromDb(
                  self.plugin_django_connection,
                  self.plugin_query,
@@ -134,9 +144,9 @@ class DfHelper:
              self.df = db_loader.build_and_load()
              self._process_loaded_data()
              self._post_process_df()
-             self.logger.info("Data successfully loaded from django database.")
+             self.logger.debug("Data successfully loaded from django database.")
          except Exception as e:
-             self.logger.error(f"Failed to load data from django database: {e}")
+             self.logger.debug(f"Failed to load data from django database: {e}")
              self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)

          return self.df
@@ -144,12 +154,12 @@ class DfHelper:
      async def _load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
          """Delegate asynchronous HTTP data loading to HttpDataSource plugin."""
          if not self.plugin_http:
-             self.logger.error("HTTP plugin not configured properly.")
+             self.logger.debug("HTTP plugin not configured properly.")
              return dd.from_pandas(pd.DataFrame(), npartitions=1)
          try:
              self.df = await self.plugin_http.fetch_data(**options)
          except Exception as e:
-             self.logger.error(f"Failed to load data from http plugin: {e}")
+             self.logger.debug(f"Failed to load data from http plugin: {e}")
              self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
          return self.df

@@ -190,10 +200,10 @@ class DfHelper:
          if datetime_index and self.df.index.dtype != 'datetime64[ns]':
              self.df = self.df.map_partitions(lambda df: df.set_index(pd.to_datetime(df.index, errors='coerce')))

-         self.logger.info("Post-processing of DataFrame completed.")
+         self.logger.debug("Post-processing of DataFrame completed.")

      def _process_loaded_data(self):
-         self.logger.info(f"Type of self.df: {type(self.df)}")
+         self.logger.debug(f"Type of self.df: {type(self.df)}")
          if self.df.map_partitions(len).compute().sum() > 0:
              field_map = self.plugin_params.field_map or {}
              if isinstance(field_map, dict):
@@ -211,25 +221,30 @@ class DfHelper:
                  # Apply renaming
                  self.df = self.df.map_partitions(rename_columns, mapping=rename_mapping)

-         self.logger.info("Processing of loaded data completed.")
+         self.logger.debug("Processing of loaded data completed.")

      def save_to_parquet(self, parquet_filename: Optional[str] = None):
          ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
          ps.save_to_parquet(parquet_filename)
-         self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
+         self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")

      def save_to_clickhouse(self, **credentials):
          if self.df.map_partitions(len).compute().sum() == 0:
-             self.logger.info("Cannot write to clickhouse since Dataframe is empty")
+             self.logger.debug("Cannot write to clickhouse since Dataframe is empty")
              return
          cs = ClickHouseWriter(logger=self.logger, **credentials)
          cs.save_to_clickhouse(self.df)
-         self.logger.info("Save to ClickHouse completed.")
+         self.logger.debug("Save to ClickHouse completed.")

      def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
          self.df = self.plugin_parquet.load_files()
          if options:
+             """
+             deprecated specific filter handling to a generic one
              self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
+
+             """
+             self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
          return self.df

      def load_period(self, **kwargs):
@@ -294,6 +309,7 @@ class DfHelper:
          elif is_datetime_field:
              kwargs[f"{mapped_field}__date__gte"] = start
              kwargs[f"{mapped_field}__date__lte"] = end
+         self.logger.debug(f"load_period kwargs: {kwargs}")
          return self.load(**kwargs)

      @staticmethod
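
Editor's note: with the constructor changes above, the existing debug keyword now selects the logger level through the new Logger.setLevel helper (added in the sibi_dst/utils/_log_utils.py hunk further down), and most per-step messages were demoted from info to debug. A small sketch of the resulting pattern, using only names visible in this diff; the logger name is arbitrary:

    import logging
    from sibi_dst.utils import Logger

    debug = True  # plays the same role as the DfHelper "debug" keyword
    logger = Logger.default_logger(logger_name="my_helper")
    logger.setLevel(logging.DEBUG if debug else logging.INFO)
    logger.debug("Logger initialized in DEBUG mode.")  # visible only when debug is True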

sibi_dst/df_helper/core/__init__.py

@@ -1,12 +1,13 @@
  from __future__ import annotations

- from ._params_config import ParamsConfig
- from ._query_config import QueryConfig
  from ._defaults import (
      django_field_conversion_map_pandas,
      django_field_conversion_map_dask,
      sqlalchemy_field_conversion_map_dask,
      normalize_sqlalchemy_type)
+ from ._filter_handler import FilterHandler
+ from ._params_config import ParamsConfig
+ from ._query_config import QueryConfig

  __all__ = [
      "ParamsConfig",
@@ -14,5 +15,6 @@ __all__ = [
      "django_field_conversion_map_pandas",
      "django_field_conversion_map_dask",
      "sqlalchemy_field_conversion_map_dask",
-     "normalize_sqlalchemy_type"
-     ]
+     "normalize_sqlalchemy_type",
+     "FilterHandler",
+     ]

sibi_dst/df_helper/core/_filter_handler.py (new file)

@@ -0,0 +1,216 @@
+ import datetime
+ import dask.dataframe as dd
+ import pandas as pd
+ from sqlalchemy import func, cast
+ from sqlalchemy.sql.sqltypes import Date, Time
+ from sibi_dst.utils import Logger
+
+ class FilterHandler:
+     def __init__(self, backend, logger=None):
+         """
+         Initialize the FilterHandler.
+
+         Args:
+             backend: The backend to use ('sqlalchemy' or 'dask').
+             logger: Optional logger for debugging purposes.
+         """
+         self.backend = backend
+         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)  # No-op logger if none provided
+         self.backend_methods = self._get_backend_methods(backend)
+
+     def apply_filters(self, query_or_df, model=None, filters=None):
+         """
+         Apply filters to the data source based on the backend.
+
+         Args:
+             query_or_df: SQLAlchemy query or Dask DataFrame.
+             model: SQLAlchemy model (required for SQLAlchemy backend).
+             filters: Dictionary of filters.
+
+         Returns:
+             Filtered query or DataFrame.
+         """
+         filters = filters or {}
+         for key, value in filters.items():
+             field_name, casting, operation = self._parse_filter_key(key)
+             parsed_value = self._parse_filter_value(casting, value)
+             #print(field_name, casting, operation, parsed_value)
+             # Get the column and apply backend-specific transformations
+             if self.backend == "sqlalchemy":
+                 column = self.backend_methods["get_column"](field_name, model, casting)
+                 condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
+                 query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
+
+             elif self.backend == "dask":
+                 column = self.backend_methods["get_column"](query_or_df, field_name, casting)
+                 condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
+                 query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
+             else:
+                 raise ValueError(f"Unsupported backend: {self.backend}")
+
+         return query_or_df
+
+     @staticmethod
+     def _parse_filter_key(key):
+         parts = key.split("__")
+         field_name = parts[0]
+         casting = None
+         operation = "exact"
+
+         if len(parts) == 3:
+             _, casting, operation = parts
+         elif len(parts) == 2:
+             if parts[1] in FilterHandler._comparison_operators():
+                 operation = parts[1]
+             elif parts[1] in FilterHandler._dt_operators() + FilterHandler._date_operators():
+                 casting = parts[1]
+
+         return field_name, casting, operation
+
+
+     def _parse_filter_value(self, casting, value):
+         """
+         Convert filter value to appropriate type based on the casting (e.g., date).
+         """
+         if casting == "date":
+             if isinstance(value, str):
+                 parsed = pd.Timestamp(value)  # Convert to datetime64[ns]
+                 return parsed
+             if isinstance(value, list):
+                 parsed = [pd.Timestamp(v) for v in value]
+                 return parsed
+         elif casting == "time" and isinstance(value, str):
+             parsed = datetime.time.fromisoformat(value)
+             self.logger.debug(f"Parsed value (time): {parsed}")
+             return parsed
+         return value
+
+     @staticmethod
+     def _get_backend_methods(backend):
+         if backend == "sqlalchemy":
+             return {
+                 "get_column": FilterHandler._get_sqlalchemy_column,
+                 "apply_operation": FilterHandler._apply_operation_sqlalchemy,
+                 "apply_condition": lambda query, condition: query.filter(condition),
+             }
+         elif backend == "dask":
+             return {
+                 "get_column": FilterHandler._get_dask_column,
+                 "apply_operation": FilterHandler._apply_operation_dask,
+                 "apply_condition": lambda df, condition: df[condition],
+             }
+         else:
+             raise ValueError(f"Unsupported backend: {backend}")
+
+     @staticmethod
+     def _get_sqlalchemy_column(field_name, model, casting):
+         """
+         Retrieve and cast a column for SQLAlchemy based on the field name and casting.
+
+         Args:
+             field_name: The name of the field/column in the model.
+             model: The SQLAlchemy model.
+             casting: The casting type ('date', 'time', etc.).
+
+         Returns:
+             The SQLAlchemy column object, optionally cast or transformed.
+         """
+         column = getattr(model, field_name, None)
+         if not column:
+             raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
+
+         if casting == "date":
+             # Cast the column to Date for whole-date comparisons
+             column = cast(column, Date)
+         elif casting == "time":
+             # Cast the column to Time for time-specific comparisons
+             column = cast(column, Time)
+         elif casting in FilterHandler._date_operators():
+             # Extract date part (e.g., year, month) using SQLAlchemy functions
+             column = func.extract(casting, column)
+
+         return column
+
+     @staticmethod
+     def _get_dask_column(df, field_name, casting):
+         """
+         Retrieve and optionally cast a column for Dask based on the field name and casting.
+
+         Args:
+             df: The Dask DataFrame.
+             field_name: The name of the field/column in the DataFrame.
+             casting: The casting type ('date', 'time', etc.).
+
+         Returns:
+             The Dask Series object, optionally cast or transformed.
+         """
+         column = dd.to_datetime(df[field_name], errors="coerce") if casting in FilterHandler._dt_operators() else df[
+             field_name]
+
+         if casting == "date":
+             column = column.dt.floor("D")  # Ensure truncation to the date level
+         elif casting in FilterHandler._date_operators():
+             column = getattr(column.dt, casting)
+
+         return column
+
+     @staticmethod
+     def _apply_operation_sqlalchemy(column, operation, value):
+         operation_map = FilterHandler._operation_map_sqlalchemy()
+         if operation not in operation_map:
+             raise ValueError(f"Unsupported operation: {operation}")
+         return operation_map[operation](column, value)
+
+     @staticmethod
+     def _apply_operation_dask(column, operation, value):
+         operation_map = FilterHandler._operation_map_dask()
+         if operation not in operation_map:
+             raise ValueError(f"Unsupported operation: {operation}")
+         return operation_map[operation](column, value)
+
+     @staticmethod
+     def _operation_map_sqlalchemy():
+         return {
+             "exact": lambda col, val: col == val,
+             "gt": lambda col, val: col > val,
+             "gte": lambda col, val: col >= val,
+             "lt": lambda col, val: col < val,
+             "lte": lambda col, val: col <= val,
+             "in": lambda col, val: col.in_(val),
+             "range": lambda col, val: col.between(val[0], val[1]),
+             "contains": lambda col, val: col.like(f"%{val}%"),
+             "startswith": lambda col, val: col.like(f"{val}%"),
+             "endswith": lambda col, val: col.like(f"%{val}"),
+             "isnull": lambda col, val: col.is_(None) if val else col.isnot(None),
+         }
+
+     @staticmethod
+     def _operation_map_dask():
+         return {
+             "exact": lambda col, val: col == val,
+             "gt": lambda col, val: col > val,
+             "gte": lambda col, val: col >= val,
+             "lt": lambda col, val: col < val,
+             "lte": lambda col, val: col <= val,
+             "in": lambda col, val: col.isin(val),
+             "range": lambda col, val: (col >= val[0]) & (col <= val[1]),
+             "contains": lambda col, val: col.str.contains(val, regex=True),
+             "startswith": lambda col, val: col.str.startswith(val),
+             "endswith": lambda col, val: col.str.endswith(val),
+             "isnull": lambda col, val: col.isnull() if val else col.notnull(),
+         }
+
+     @staticmethod
+     def _dt_operators():
+         return ["date", "time"]
+
+     @staticmethod
+     def _date_operators():
+         return ["year", "month", "day", "hour", "minute", "second", "week_day"]
+
+     @staticmethod
+     def _comparison_operators():
+         return [
+             "gte", "lte", "gt", "lt", "exact", "in", "range",
+             "contains", "startswith", "endswith", "isnull",
+         ]
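
Editor's note: the new FilterHandler accepts Django-ORM-style filter keys: field, field__operation (e.g. status__in), or field__casting__operation (e.g. created_at__date__gte). A short usage sketch against the Dask backend; the sample frame and column names are invented for illustration:

    import dask.dataframe as dd
    import pandas as pd
    from sibi_dst.df_helper.core import FilterHandler

    pdf = pd.DataFrame({
        "status": ["open", "closed", "open"],
        "created_at": pd.to_datetime(["2024-01-05", "2024-02-10", "2024-03-01"]),
    })
    df = dd.from_pandas(pdf, npartitions=1)

    handler = FilterHandler(backend="dask")
    filtered = handler.apply_filters(
        df,
        filters={
            "status__in": ["open"],                 # field__operation
            "created_at__date__gte": "2024-02-01",  # field__casting__operation
        },
    )
    print(filtered.compute())  # only the 2024-03-01 "open" row remains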

sibi_dst/df_helper/plugins/django/_django_load_from_db.py

@@ -1,5 +1,8 @@
+ import warnings
+
  import dask.dataframe as dd
  import pandas as pd
+ from IPython.core.hooks import deprecated
  from django.db.models import Q

  from sibi_dst.df_helper.plugins.django import ReadFrameDask
@@ -12,13 +15,11 @@ class DjangoLoadFromDb:
      def __init__(self, db_connection, db_query, db_params, logger, **kwargs):
          self.connection_config = db_connection
          self.debug = kwargs.pop('debug', False)
-         self.verbose_debug = kwargs.pop('verbose_debug', False)
          self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
          if self.connection_config.model is None:
              if self.debug:
-                 self.logger.critical('Model must be specified')
-                 if self.verbose_debug:
-                     print('Model must be specified')
+                 self.logger.debug('Model must be specified')
+
              raise ValueError('Model must be specified')

          self.query_config = db_query
@@ -45,7 +46,7 @@ class DjangoLoadFromDb:
          try:
              self.df = ReadFrameDask(queryset, **self.params_config.df_params).read_frame()
          except Exception as e:
-             self.logger.critical(f'Error loading query: {str(queryset.query)}, error message: {e}')
+             self.logger.debug(f'Error loading query: {str(queryset.query)}, error message: {e}')
              self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
          else:
              self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
@@ -69,16 +70,28 @@ class DjangoLoadFromDb:
          :param df: Dask DataFrame whose columns' data types are to be converted.
          :return: Dask DataFrame with converted column data types.
          """
-
-         def log_debug(message: str, is_verbose: bool = False):
-             """Helper to handle debug and verbose debug logging."""
-             if self.debug:
-                 self.logger.debug(message)
-             if is_verbose and self.verbose_debug:
-                 print(message)
-
+         """
+         [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+
+         :param df: Dask DataFrame whose columns' data types are to be converted.
+         :return: Dask DataFrame with converted column data types.
+         """
+         # Emit deprecation warning
+         warnings.warn(
+             "_convert_columns is deprecated and will be removed in a future release. "
+             "Consider using <new_method_name> instead.",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+
+         # Log deprecation message if debug mode is enabled
          if self.debug:
-             self.logger.info(f'Converting columns: {list(df.columns)}')
+             self.logger.warning(
+                 "[DEPRECATION NOTICE] The `_convert_columns` method is deprecated and will be removed in a future release. "
+                 "Consider using <new_method_name> instead."
+             )
+
+         self.logger.debug(f'Converting columns: {list(df.columns)}')

          # Get field information from the Django model
          model_fields = self.connection_config.model._meta.get_fields()
@@ -87,13 +100,13 @@ class DjangoLoadFromDb:
          for field_name, field_type in field_type_map.items():
              if field_name not in df.columns:

-                 log_debug(f"Column '{field_name}' not found in DataFrame columns.")
+                 self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
                  continue

              conversion_func = django_field_conversion_map_dask.get(field_type)
              if not conversion_func:
                  message=f"Field type '{field_type}' not found in conversion_map."
-                 log_debug(message, is_verbose=True)
+                 self.logger.debug(message)
                  continue

              def apply_conversion(partition):
@@ -104,7 +117,7 @@ class DjangoLoadFromDb:
                      if field_name in partition.columns:
                          partition[field_name] = conversion_func(partition[field_name])
                  except Exception as e:
-                     self.logger.error(f"Error converting column '{field_name}' in partition: {str(e)}")
+                     self.logger.debug(f"Error converting column '{field_name}' in partition: {str(e)}")
                  return partition

              try:
@@ -113,9 +126,8 @@ class DjangoLoadFromDb:
                      apply_conversion,
                      meta=df,
                  )
-                 log_debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.",
-                           is_verbose=True)
+                 self.logger.debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.")
              except Exception as e:
-                 log_debug(f"Failed to queue conversion for column '{field_name}': {str(e)}", is_verbose=True)
+                 self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")

          return df
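
Editor's note: _convert_columns now raises a DeprecationWarning with stacklevel=2 so the warning points at the caller. CPython hides DeprecationWarning from non-__main__ code by default, so callers who want to see it must opt in. A generic sketch (hypothetical function, independent of sibi_dst):

    import warnings

    def legacy_helper():
        warnings.warn(
            "legacy_helper is deprecated and will be removed in a future release.",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the caller, as _convert_columns does
        )

    warnings.simplefilter("always", DeprecationWarning)  # opt in to seeing it
    legacy_helper()  # the warning now points at this call site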

sibi_dst/df_helper/plugins/django/_io_dask.py

@@ -240,7 +240,4 @@ class ReadFrameDask:
          if verbose:
              self.update_with_verbose(dask_df, fieldnames, fields)

-         if isinstance(dask_df, dask_expr._collection.DataFrame):
-             dask_df = dask_df.to_legacy_dataframe()
-
          return dask_df

sibi_dst/df_helper/plugins/http/_http_config.py

@@ -31,17 +31,17 @@ class HttpConfig(BaseModel):
              # Set up headers with API key if provided
              headers = {"Authorization": f"Bearer {self.api_key.get_secret_value()}"} if self.api_key else {}

-             self.logger.info(f"Fetching data from {formatted_url} with params {self.params}")
+             self.logger.debug(f"Fetching data from {formatted_url} with params {self.params}")
              async with httpx.AsyncClient() as client:
                  response = await client.get(formatted_url, params=self.params, headers=headers, timeout=self.timeout)
                  response.raise_for_status()
                  data = response.json()
                  df = dd.from_pandas(pd.json_normalize(data), npartitions=1)
-                 self.logger.info("Data successfully loaded from HTTP JSON source.")
+                 self.logger.debug("Data successfully loaded from HTTP JSON source.")
                  return df
          except httpx.RequestError as e:
-             self.logger.error(f"HTTP request error: {e}")
+             self.logger.debug(f"HTTP request error: {e}")
              raise
          except ValueError as e:
-             self.logger.error(f"Error parsing JSON data: {e}")
+             self.logger.debug(f"Error parsing JSON data: {e}")
              raise

sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py

@@ -8,7 +8,7 @@ from sqlalchemy.orm import sessionmaker

  from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
  from sibi_dst.utils import Logger
-
+ from sibi_dst.df_helper.core import FilterHandler

  class SQLAlchemyDask:
      def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
@@ -38,7 +38,6 @@ class SQLAlchemyDask:
          """
          mapper = inspect(model)
          sqlalchemy_to_dask_dtype = {
-             #'INTEGER': pd.to_numeric(x, errors="coerce"),
              'INTEGER': 'Int64',
              'SMALLINT': 'Int64',
              'BIGINT': 'Int64',
@@ -72,11 +71,15 @@ class SQLAlchemyDask:
              # Build query
              self.query = select(self.model)
              if self.filters:
-                 self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+                 """
+                 deprecated specific filter handling to a generic one
+                 #self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+                 """
+                 self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query, model=self.model, filters=self.filters)
              else:
                  n_records = 100
                  self.query = self.query.limit(n_records)
-
+             self.logger.debug(f"query:{self.query}")
              # Infer dtypes
              dtypes = self.infer_dtypes_from_model(self.model)
              # Get the column order from the SQLAlchemy model
@@ -124,14 +127,11 @@ class SQLAlchemyDask:
              else:
                  dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)

-             if self.debug:
-                 self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
-
-             if isinstance(dask_df, dask_expr._collection.DataFrame):
-                 dask_df = dask_df.to_legacy_dataframe()
+             self.logger.debug(f"Loaded {len(dask_df)} rows into Dask DataFrame.")

              return dask_df

          except Exception as e:
              self.logger.error(f"Error executing query: {str(e)}")
+             self.logger.error(self.query)
              return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
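
Editor's note: with the change above, SQLAlchemyDask delegates filtering to the same generic FilterHandler used for the Dask/parquet path, so one filter dictionary drives both backends. A hedged sketch of the SQLAlchemy backend; the Order model is hypothetical and only stands in for a model built elsewhere in the package:

    from sqlalchemy import Column, DateTime, Integer, String, select
    from sqlalchemy.orm import declarative_base
    from sibi_dst.df_helper.core import FilterHandler

    Base = declarative_base()

    class Order(Base):  # illustrative model, not part of sibi_dst
        __tablename__ = "orders"
        id = Column(Integer, primary_key=True)
        status = Column(String)
        created_at = Column(DateTime)

    query = select(Order)
    query = FilterHandler(backend="sqlalchemy").apply_filters(
        query,
        model=Order,
        filters={"status__exact": "open", "created_at__date__gte": "2024-02-01"},
    )
    print(query)  # the rendered SELECT now carries the corresponding WHERE clauses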

sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py

@@ -1,7 +1,9 @@
- from sqlalchemy import and_, or_, not_, func, cast
- from sqlalchemy.sql.sqltypes import Date, Time
  import datetime

+ from sqlalchemy import func, cast
+ from sqlalchemy.sql.sqltypes import Date, Time
+
+
  class SqlAlchemyFilterHandler:
      @staticmethod
      def apply_filters_sqlalchemy(query, model, filters):

sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py

@@ -1,4 +1,5 @@
  import dask.dataframe as dd
+ import dask_expr
  import pandas as pd

  from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
@@ -28,7 +29,6 @@ class SqlAlchemyLoadFromDb:
          self.query_config = plugin_query
          self.params_config = plugin_params
          self.debug = kwargs.pop("debug", False)
-         self.verbose_debug = kwargs.pop("verbose_debug", False)

      def build_and_load(self) -> dd.DataFrame:
          """
@@ -40,7 +40,6 @@ class SqlAlchemyLoadFromDb:
      def _build_and_load(self) -> dd.DataFrame:

          try:
-             # reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, debug=self.debug)
              self.df = SQLAlchemyDask(
                  model=self.model,
                  filters=self.params_config.filters,
@@ -49,10 +48,13 @@ class SqlAlchemyLoadFromDb:
                  chunk_size=1000,
                  debug=self.debug).read_frame()
              if self.df is None or len(self.df.head().index) == 0:
-                 self.logger.warning("Query returned no results.")
-                 return dd.from_pandas(pd.DataFrame(), npartitions=1)
+                 self.logger.debug("Query returned no results.")
+                 dask_df=dd.from_pandas(pd.DataFrame(), npartitions=1)

+                 return dask_df
              return self.df
          except Exception as e:
-             self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
-             return dd.from_pandas(pd.DataFrame(), npartitions=1)
+             self.logger.debug(f"Failed to load data into Dask DataFrame.{e}")
+             dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+             return dask_df

sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py

@@ -59,7 +59,7 @@ class SqlAlchemyModelBuilder:
          attrs = {
              "__tablename__": self.table_name,
              "__table__": self.table,
-             #"__module__": f"{apps_label}.models",
+             "__module__": f"{apps_label}.models",
              "__mapper_args__": {"eager_defaults": True},
          }


sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py

@@ -26,7 +26,6 @@ class SQLModelLoadFromDb:
          self.query_config = db_query or {}
          self.params_config = db_params or {}
          self.debug = kwargs.pop("debug", False)
-         self.verbose_debug = kwargs.pop("verbose_debug", False)

      def _default_logger(self):
          """Create a default logger."""
@@ -69,7 +68,7 @@ class SQLModelLoadFromDb:
              query = query.limit(n_records)

          # Debug: Log the SQL query
-         self.logger.info(f"Executing query: {str(query)}")
+         self.logger.debug(f"Executing query: {str(query)}")

          # Execute the query
          results = session.exec(query).fetchall()
@@ -79,7 +78,7 @@ class SQLModelLoadFromDb:
          if results:
              df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
          else:
-             self.logger.warning("Query returned no results.")
+             self.logger.debug("Query returned no results.")
              df = dd.from_pandas(pd.DataFrame(), npartitions=1)

      except Exception as e:

sibi_dst/utils/__init__.py

@@ -1,7 +1,7 @@
  from __future__ import annotations
- from ._credentials import ConfigManager, ConfigLoader
+ from ._credentials import *
  from ._log_utils import Logger
- from ._date_utils import DateUtils, BusinessDays
+ from ._date_utils import *
  from ._data_utils import DataUtils
  from ._file_utils import FileUtils
  from ._filepath_generator import FilePathGenerator

sibi_dst/utils/_clickhouse_writer.py

@@ -34,7 +34,7 @@ class ClickHouseWriter:
          self.df = df.copy()
          self.order_by = kwargs.setdefault('order_by',self.order_by)
          if len(self.df.head().index) == 0:
-             self.logger.info("Dataframe is empty")
+             self.logger.debug("Dataframe is empty")
              return
          self._handle_missing_values()
          self._connect()
@@ -51,7 +51,7 @@ class ClickHouseWriter:
                  user=self.clickhouse_user,
                  password=self.clickhouse_password
              )
-             self.logger.info("Connected to ClickHouse")
+             self.logger.debug("Connected to ClickHouse")
          except Exception as e:
              self.logger.error(e)
              raise
@@ -80,7 +80,7 @@ class ClickHouseWriter:
      def _drop_table(self):
          if self.client:
              self.client.command('DROP TABLE IF EXISTS {}'.format(self.clickhouse_table))
-             self.logger.info(f"Dropped table {self.clickhouse_table}")
+             self.logger.debug(f"Dropped table {self.clickhouse_table}")

      def _create_table_from_dask(self, engine=None):
          if engine is None:
@@ -88,18 +88,18 @@ class ClickHouseWriter:
          dtypes = self.df.dtypes
          clickhouse_schema = self._generate_clickhouse_schema(dtypes,self.dtype_to_clickhouse)
          create_table_sql= f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
-         self.logger.info(f"Creating table SQL:{create_table_sql}")
+         self.logger.debug(f"Creating table SQL:{create_table_sql}")
          if self.client:
              self.client.command(create_table_sql)
-             self.logger.info("Created table '{}'".format(self.clickhouse_table))
+             self.logger.debug("Created table '{}'".format(self.clickhouse_table))

      def _handle_missing_values(self):
          """
          Handle missing values in the Dask DataFrame before writing to ClickHouse.
          """
-         self.logger.info("Checking for missing values...")
+         self.logger.debug("Checking for missing values...")
          missing_counts = self.df.isnull().sum().compute()
-         self.logger.info(f"Missing values per column:\n{missing_counts}")
+         self.logger.debug(f"Missing values per column:\n{missing_counts}")

          # Replace missing values based on column types
          def replace_missing_values(df):
@@ -116,14 +116,14 @@ class ClickHouseWriter:

          # Apply replacement
          self.df = replace_missing_values(self.df)
-         self.logger.info("Missing values replaced.")
+         self.logger.debug("Missing values replaced.")

      def _write_data(self):
          """
          Writes the Dask DataFrame to a ClickHouse table partition by partition.
          """
          if len(self.df.head().index) == 0:
-             self.logger.info("No data found. Nothing written.")
+             self.logger.debug("No data found. Nothing written.")
              return

          for i, partition in enumerate(self.df.to_delayed()):
@@ -132,10 +132,10 @@ class ClickHouseWriter:
              df = partition.compute()

              if df.empty:
-                 self.logger.info(f"Partition {i} is empty. Skipping...")
+                 self.logger.debug(f"Partition {i} is empty. Skipping...")
                  continue

-             self.logger.info(f"Writing partition {i} with {len(df)} rows to ClickHouse.")
+             self.logger.debug(f"Writing partition {i} with {len(df)} rows to ClickHouse.")

              # Write the partition to the ClickHouse table
              self.client.insert_df(self.clickhouse_table, df)
@@ -148,7 +148,7 @@ class ClickHouseWriter:
          Ensures a separate client instance is used per thread to avoid session conflicts.
          """
          if len(self.df.index) == 0:
-             self.logger.info("No data found. Nothing written.")
+             self.logger.debug("No data found. Nothing written.")
              return

          def create_client():
@@ -170,13 +170,13 @@ class ClickHouseWriter:
              Write a single partition to ClickHouse using a separate client instance.
              """
              try:
-                 self.logger.info(f"Starting to process partition {index}")
+                 self.logger.debug(f"Starting to process partition {index}")
                  client = create_client()  # Create a new client for the thread

                  # Compute the Dask partition into a Pandas DataFrame
                  df = partition.compute()
                  if df.empty:
-                     self.logger.info(f"Partition {index} is empty. Skipping...")
+                     self.logger.debug(f"Partition {index} is empty. Skipping...")
                      return

                  # Convert DataFrame to list of tuples
@@ -184,7 +184,7 @@ class ClickHouseWriter:
                  columns = df.columns.tolist()

                  # Perform the insert
-                 self.logger.info(f"Writing partition {index} with {len(df)} rows to ClickHouse.")
+                 self.logger.debug(f"Writing partition {index} with {len(df)} rows to ClickHouse.")
                  client.execute(f"INSERT INTO {self.clickhouse_table} ({', '.join(columns)}) VALUES", data)

              except Exception as e:
@@ -192,7 +192,7 @@ class ClickHouseWriter:
              finally:
                  if 'client' in locals() and hasattr(client, 'close'):
                      client.close()
-                     self.logger.info(f"Closed client for partition {index}")
+                     self.logger.debug(f"Closed client for partition {index}")

          try:
              # Get delayed partitions and enumerate them

sibi_dst/utils/_data_utils.py

@@ -68,6 +68,7 @@ class DataUtils:
          - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
          """
          # Return early if the DataFrame is empty
+         debug = kwargs.setdefault("debug", False)
          if self.is_dataframe_empty(df):
              return df

@@ -88,7 +89,7 @@ class DataUtils:
          column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])

          if source_col not in df.columns:
-             self.logger.info(f"{source_col} not in DataFrame columns")
+             self.logger.debug(f"{source_col} not in DataFrame columns")
              return df

          # Get unique IDs from source column
@@ -99,7 +100,7 @@ class DataUtils:

          # Check if any IDs are found
          if not len(ids):
-             self.logger.info(f"No IDs found in the source column: {source_col}")
+             self.logger.debug(f"No IDs found in the source column: {source_col}")
              return df

          # Convert to a list only if necessary and sort
@@ -114,10 +115,10 @@ class DataUtils:
              f'{lookup_col}__in': ids
          })
          # Load lookup data
-         lookup_instance = classname(debug=True, verbose_debug=True)
+         lookup_instance = classname(debug=debug)
          result = lookup_instance.load(**load_kwargs)
          if len(result.index) == 0:
-             self.logger.info(f"No IDs found in the source column: {source_col}")
+             self.logger.debug(f"No IDs found in the source column: {source_col}")
              return df
          # Determine the join column on the result DataFrame
          temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col

sibi_dst/utils/_data_wrapper.py

@@ -1,12 +1,15 @@
  import datetime
  from typing import Type, Any, Dict, Optional
+
  import fsspec
  import pandas as pd
  from IPython.display import display
- from sibi_dst.utils import Logger
  from tqdm import tqdm
+
+ from sibi_dst.utils import Logger
  from sibi_dst.utils import ParquetSaver

+
  class DataWrapper:
      DEFAULT_MAX_AGE_MINUTES = 1440
      DEFAULT_HISTORY_DAYS_THRESHOLD = 30
@@ -112,7 +115,7 @@ class DataWrapper:
          file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60

          if self.verbose:
-             self.logger.info(
+             self.logger.debug(
                  f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
                  f"(threshold: {self.max_age_minutes} minutes)"
              )
@@ -129,14 +132,14 @@ class DataWrapper:
          start_time = datetime.datetime.now()

          if self.verbose:
-             self.logger.info(f"Processing {full_parquet_filename}...")
+             self.logger.debug(f"Processing {full_parquet_filename}...")

          data_object = self.dataclass(**self.class_params)
          df = data_object.load_period(dt_field=self.date_field, start=date, end=date)

          if len(df.index)==0:
              if self.verbose:
-                 self.logger.info("No data found for the specified date.")
+                 self.logger.debug("No data found for the specified date.")
              return

          parquet_saver = ParquetSaver(df, folder, self.logger)
@@ -146,7 +149,7 @@ class DataWrapper:
          duration_seconds = (end_time - start_time).total_seconds()

          if self.verbose:
-             self.logger.info(
+             self.logger.debug(
                  f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
              )


sibi_dst/utils/_df_utils.py

@@ -85,7 +85,7 @@ class DfUtils:
          # Ensure all specified columns exist in the DataFrame
          missing_columns = [col for col, _, _ in conditions if col not in df.columns]
          if missing_columns:
-             self.logger.info(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
+             self.logger.debug(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
              return df

          # Build the combined filtering condition
@@ -117,7 +117,7 @@ class DfUtils:
              DataFrame: Grouped DataFrame with counts.
          """
          if debug:
-             self.logger.info(f"Grouping by: {group_by_expr}")
+             self.logger.debug(f"Grouping by: {group_by_expr}")

          df_grouped = df.groupby(by=group_by_expr).size().reset_index(name=group_expr)
          return df_grouped
@@ -141,7 +141,7 @@ class DfUtils:

          if debug:
              df_duplicates = df[df.duplicated(subset=duplicate_expr)]
-             self.logger.info(f"Duplicate Rows based on columns {duplicate_expr} are:\n{df_duplicates}")
+             self.logger.debug(f"Duplicate Rows based on columns {duplicate_expr} are:\n{df_duplicates}")

          if sort_field:
              if isinstance(df, dd.DataFrame):
@@ -224,9 +224,9 @@ class DfUtils:
          Returns:
              DataFrame: Resampled pivot table.
          """
-         if isinstance(df, dd.DataFrame):
+         if isinstance(df, dd.core.DataFrame):
              # Implement Dask-compatible pivot and resample
-             self.logger.info("Performing summarization with Dask DataFrame.")
+             self.logger.debug("Performing summarization with Dask DataFrame.")
              # Ensure the index is a datetime for resampling
              if not isinstance(df.index, (pd.DatetimeIndex, dd.core.DatetimeIndex)):
                  self.logger.warning("Index is not a DatetimeIndex. Converting index to datetime.")

sibi_dst/utils/_log_utils.py

@@ -55,6 +55,9 @@ class Logger:
          log_file = log_file or logger_name
          return cls(log_dir=log_dir, logger_name=logger_name, log_file=log_file)

+     def setLevel(self, level):
+         self.logger.setLevel(level)
+
      def debug(self, msg):
          self.logger.debug(msg)


sibi_dst/utils/_parquet_saver.py

@@ -1,18 +1,16 @@
- import datetime
  from pathlib import Path
  from typing import Optional

- import dask.dataframe as dd
+ import dask_expr
  import fsspec
- import pandas as pd
  import pyarrow as pa
+
  from sibi_dst.utils import Logger

+
  class ParquetSaver:
      def __init__(self, df_result, parquet_storage_path, logger=None):
          # Ensure df_result is a Dask DataFrame
-         if not isinstance(df_result, dd.DataFrame):
-             df_result = dd.from_pandas(df_result, npartitions=1)
          self.df_result = df_result
          self.parquet_storage_path = parquet_storage_path
          self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -106,106 +104,3 @@ class ParquetSaver:
              str(full_path), engine="pyarrow", schema=schema, write_index=False
          )

- # import datetime
- # from pathlib import Path
- # from typing import Optional
- #
- # import dask.dataframe as dd
- # import fsspec
- # import pandas as pd
- # import pyarrow as pa
- # from sibi_dst.utils import Logger
- #
- # class ParquetSaver:
- #     def __init__(self, df_result, parquet_storage_path, logger):
- #         self.df_result = df_result
- #         self.parquet_storage_path = parquet_storage_path
- #         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
- #
- #
- #     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
- #         full_path = self._construct_full_path(parquet_filename)
- #
- #         if len(self.df_result) == 0:
- #             self.logger.warning('No data to save')
- #             return  # Exit early if there's no data to save
- #
- #         # Ensure directory exists and clear if necessary
- #         self._ensure_directory_exists(full_path, clear_existing=True)
- #
- #         # Define schema and save DataFrame to parquet
- #         schema = self._define_schema()
- #         self._convert_dtypes(schema)
- #         self._save_dataframe_to_parquet(full_path, schema)
- #
- #     def _define_schema(self) -> pa.Schema:
- #         """Define a PyArrow schema dynamically based on df_result column types."""
- #         pandas_dtype_to_pa = {
- #             'object': pa.string(),
- #             'string': pa.string(),
- #             'Int64': pa.int64(),
- #             'int64': pa.int64(),
- #             'float64': pa.float64(),
- #             'bool': pa.bool_(),
- #             'boolean': pa.bool_(),  # pandas nullable boolean
- #             'datetime64[ns]': pa.timestamp('ns'),
- #             'timedelta[ns]': pa.duration('ns')
- #         }
- #
- #         fields = [
- #             pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
- #             for col, dtype in self.df_result.dtypes.items()
- #         ]
- #         return pa.schema(fields)
- #
- #     def _convert_dtypes(self, schema: pa.Schema):
- #         """Convert DataFrame columns to match the specified schema."""
- #         dtype_mapping = {}
- #         for field in schema:
- #             col_name = field.name
- #             if col_name in self.df_result.columns:
- #                 if pa.types.is_string(field.type):
- #                     dtype_mapping[col_name] = 'string'
- #                 elif pa.types.is_int64(field.type):
- #                     dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
- #                 elif pa.types.is_float64(field.type):
- #                     dtype_mapping[col_name] = 'float64'
- #                 elif pa.types.is_boolean(field.type):
- #                     dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
- #                 elif pa.types.is_timestamp(field.type):
- #                     dtype_mapping[col_name] = 'datetime64[ns]'
- #                 else:
- #                     dtype_mapping[col_name] = 'object'  # Fallback to object
- #         self.df_result = self.df_result.astype(dtype_mapping)
- #
- #     def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
- #         """Construct and return the full path for the parquet file."""
- #         fs, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
- #         parquet_filename = parquet_filename or "default.parquet"
- #         return Path(base_path) / parquet_filename
- #
- #     @staticmethod
- #     def _ensure_directory_exists(full_path: Path, clear_existing=False):
- #         """Ensure that the directory for the path exists, clearing it if specified."""
- #         fs, _ = fsspec.core.url_to_fs(str(full_path))
- #         directory = str(full_path.parent)
- #
- #         if fs.exists(directory):
- #             if clear_existing:
- #                 fs.rm(directory, recursive=True)
- #         else:
- #             fs.mkdirs(directory, exist_ok=True)
- #
- #     def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
- #         """Save the DataFrame to parquet with fsspec using specified schema."""
- #         fs, _ = fsspec.core.url_to_fs(str(full_path))
- #         if fs.exists(full_path):
- #             fs.rm(full_path, recursive=True)
- #         if isinstance(self.df_result, dd.DataFrame):
- #             self.df_result.to_parquet(
- #                 str(full_path), engine="pyarrow", schema=schema, write_index=False
- #             )
- #         elif isinstance(self.df_result, pd.DataFrame):
- #             dd.from_pandas(self.df_result, npartitions=1).to_parquet(
- #                 str(full_path), engine="pyarrow", schema=schema, write_index=False
- #             )

sibi_dst-0.3.17.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 0.3.15
+ Version: 0.3.17
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
@@ -13,6 +13,7 @@ Requires-Dist: chardet (>=5.2.0,<6.0.0)
  Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
  Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
  Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
+ Requires-Dist: dask-expr (>=1.1.20,<2.0.0)
  Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
  Requires-Dist: django (>=5.1.4,<6.0.0)
  Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)

sibi_dst-0.3.17.dist-info/RECORD

@@ -1,45 +1,46 @@
  sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
  sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
- sibi_dst/df_helper/_df_helper.py,sha256=ZWhPj9K5q_amJ7eBOrvwAvncxRnI-baveKWWQWfyND8,13354
+ sibi_dst/df_helper/_df_helper.py,sha256=yOapAc3MLQnylGKs0TG4Nmf8gaLdM7Nvzt4H1bEp8ik,13898
  sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
- sibi_dst/df_helper/core/__init__.py,sha256=NSYY_evzq6XEkO06Nz6xLH5KznzRGI44cLbrnN3zHXQ,503
+ sibi_dst/df_helper/core/__init__.py,sha256=o4zDwgVmaijde3oix0ezb6KLxI5QFy-SGUhFTDVFLT4,569
  sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
+ sibi_dst/df_helper/core/_filter_handler.py,sha256=SYZqpX4Vt6GAGR0L0LohlDOdjLLWQXJDiWWqFG-lSu0,8563
  sibi_dst/df_helper/core/_params_config.py,sha256=hO-PddoaGjFebqJFgtn76WwVHcCjzPW3z5i3NyK6mDw,3475
  sibi_dst/df_helper/core/_query_config.py,sha256=HEiyR_fBJjIMum-PSQroY3KaefQ2SpW1w1SQS8oT-NU,489
  sibi_dst/df_helper/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sibi_dst/df_helper/plugins/django/__init__.py,sha256=VkvYql-PUyCKOfoKx5aGdUAki8X-kULfhnCvgSQzHDQ,311
  sibi_dst/df_helper/plugins/django/_django_db_connection.py,sha256=9fGvXXgqPk_TC7BvaSljxTgNDWoXi_LZxKabEgExznM,1654
- sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=NrkBb5LM1A_vo3wAotqj2sVVYIuTfFGrQqIXk3xOoDs,5162
+ sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=NSPNCNzvEAR-4AuTSPQWJsidBvT9zRHAN6L3JC1xRV0,5591
  sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py,sha256=GprCh2c6PFlRBUCir4hh8pmf4Eqb-4OIK6Vz_xXQjMw,14864
- sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=aGaHziEMWK4zk9kkMNq2QtVevqVOCWqoAlXT1lVgRok,9198
+ sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=P3WmkuFzmWRzFchjsVD2OElIR3stuevwDH9G6Mu8IWE,9080
  sibi_dst/df_helper/plugins/django/_io_dask_alt.py,sha256=zDjLyYxBeL0ffn3yfE_7vqMLMpeEEk2o-zMr66sKkDw,6827
  sibi_dst/df_helper/plugins/http/__init__.py,sha256=AG9JSDRyVna2r1yxCQ9HcY32EaGnzWsfKgNLgPpSXjY,102
- sibi_dst/df_helper/plugins/http/_http_config.py,sha256=WH0d4vsxfZRhWrWI4iTVAnhsdY3421SBr9kXYZVfeYQ,2126
+ sibi_dst/df_helper/plugins/http/_http_config.py,sha256=NN3bol7NgBTDv70yOX7hJkazt1-dAAdFWVkYyHdIXsI,2128
  sibi_dst/df_helper/plugins/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPjLDTVHW2NQqqfQwWAw,187
  sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=6iFvblnVq0qj89QvieQuYxe_2RPX5ArKfq5zBcEIj90,3660
  sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=suJC7LfNEWAo-7_R62YTMSRku3k8orysft83VxRUems,4394
  sibi_dst/df_helper/plugins/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
- sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=1WQ390XBFWOSXo0ea6-hz1LM6Ppi-j6ToZYr7sQBldE,5330
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=H8ypUjLKzYYl9BerfJjX_Uv9qBVkBR-wZiQlh3uRQXg,4669
+ sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=fna8xZL8Ij6uCM_tZINO8vPdpJZaXs41gGzR4xn5zd8,5531
+ sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=KShsLJYGVxN0ps9Wot7fF0nR0wW9WzcPIcWZ9f5vdBo,4654
  sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=balWGKun0NKIfhLZW-_DCOhKuTzTo_C2NwZoKFwuSJo,2329
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=rzzZdcRB5TS9uJ3ZIGQiNf04e3u2akqJEsoGCuyPE3c,4467
+ sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=DBIM4kk86GxWkyiEZ4dSl_DdKa9SMvANCbympfzOqgQ,2169
+ sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=IQK2jOXMNJRQOSD0VQ0p11BeDGlvxD8NfFRilw9Go80,4466
  sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
  sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
- sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=bLD4tEcGDKkJCfSO4b13_89tzVJcpz55I6uw9D4ERnE,3751
- sibi_dst/utils/__init__.py,sha256=nkX7tASNn57kw998YdqQQGY8qXv2J4LC4-g0GoQSiic,831
+ sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=jYwkIz7_E9Z6Mqw1a9TCWKWD146Tbx7mcQFxIpmKgKU,3686
+ sibi_dst/utils/__init__.py,sha256=TV229dPIIEzU5qCLI1G6fnCZW-VirUwSuffp7z7OTFg,783
  sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
- sibi_dst/utils/_clickhouse_writer.py,sha256=kNBQeDn3D4Javrz5L8uU_5itf8Mrvm9l29uxcmcKlbg,8555
+ sibi_dst/utils/_clickhouse_writer.py,sha256=JcnWN2635ATCOaFiB6NYglNXDwqKw0jC7Urs9WOZE20,8571
  sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
- sibi_dst/utils/_data_utils.py,sha256=uw0SW9G4GrvTX4IdUd8fmsMTMEG5aXOFcWOv4Au3H5g,7016
- sibi_dst/utils/_data_wrapper.py,sha256=SmNv1UoZLq7ovRVy4wipsWLMidKJXcRTp4HtxmaCQdk,9399
+ sibi_dst/utils/_data_utils.py,sha256=ch4j5FEs8ZnniUzpbeLO-b4Yco_6nwCu71xHaVqMGi4,7050
+ sibi_dst/utils/_data_wrapper.py,sha256=_hLZhKqSxcfXe8IyWM2paBxtW2JlOCq2jYhNGcInPi4,9406
  sibi_dst/utils/_date_utils.py,sha256=KYB07puKDrSG8tOm_i1HGX0TjLNUtSWjwfsCYBmW9co,10619
- sibi_dst/utils/_df_utils.py,sha256=9_dNYoZ9_ofU0t_sxMdsXALWCuh02gvqUrei-6Lhr6w,10910
+ sibi_dst/utils/_df_utils.py,sha256=pjEfkof9hggXQgYerG0p4DXrwBeIRynJFg4IX3Yrb4c,10919
  sibi_dst/utils/_file_utils.py,sha256=5EN90c8N1n9d-_xwz2RzaYcXRMQY_rws2Q3EA3pNAog,1254
  sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixWnn_68_n0,6686
- sibi_dst/utils/_log_utils.py,sha256=AAenyubYUjk77WqiaNkjgkxws3dnAMIdaGl2Ryz_cA4,2245
- sibi_dst/utils/_parquet_saver.py,sha256=-A0o_vucyYe7wlwiby_0_yS-ZfT2GHwImyQHrCIBNwk,9051
+ sibi_dst/utils/_log_utils.py,sha256=rPp8z1UglwvqzBOOAvMOct0syQZ-54gGYafnJDRYZN4,2313
+ sibi_dst/utils/_parquet_saver.py,sha256=3BK0XXgMOOAdIw4OzbwMxmDrzDw3_MKi8RTpulIVUe0,4367
  sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
- sibi_dst-0.3.15.dist-info/METADATA,sha256=0XU32Bgt1RYV7Y12lmDxq_YmHaXya5d2qMYfYP8Yic0,2090
- sibi_dst-0.3.15.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- sibi_dst-0.3.15.dist-info/RECORD,,
+ sibi_dst-0.3.17.dist-info/METADATA,sha256=Aw__Wr7myZwJfWGRNFy0Ye5FLXUnGf6b14GW5KBDGtE,2133
+ sibi_dst-0.3.17.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ sibi_dst-0.3.17.dist-info/RECORD,,