sibi-dst 0.3.63__py3-none-any.whl → 2025.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. sibi_dst/df_helper/_df_helper.py +186 -591
  2. sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
  3. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +161 -115
  4. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +291 -97
  5. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +34 -105
  6. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +175 -162
  7. sibi_dst/df_helper/core/__init__.py +0 -4
  8. sibi_dst/df_helper/core/_defaults.py +1 -50
  9. sibi_dst/df_helper/core/_query_config.py +2 -2
  10. sibi_dst/utils/__init__.py +0 -2
  11. sibi_dst/utils/data_wrapper.py +9 -12
  12. sibi_dst/utils/log_utils.py +15 -11
  13. sibi_dst/utils/update_planner.py +2 -0
  14. sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +325 -50
  15. sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
  16. sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +330 -51
  17. sibi_dst/v3/__init__.py +0 -0
  18. sibi_dst/v3/backends/__init__.py +0 -0
  19. sibi_dst/v3/df_helper/__init__.py +0 -0
  20. sibi_dst/v3/df_helper/_df_helper.py +91 -0
  21. sibi_dst-2025.1.1.dist-info/METADATA +55 -0
  22. {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/RECORD +23 -26
  23. sibi_dst/df_helper/backends/django/__init__.py +0 -11
  24. sibi_dst/df_helper/backends/django/_db_connection.py +0 -88
  25. sibi_dst/df_helper/backends/django/_io_dask.py +0 -450
  26. sibi_dst/df_helper/backends/django/_load_from_db.py +0 -227
  27. sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -493
  28. sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
  29. sibi_dst/utils/airflow_manager.py +0 -212
  30. sibi_dst-0.3.63.dist-info/METADATA +0 -90
  31. {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
@@ -1,637 +1,232 @@
  from __future__ import annotations

- import asyncio
- import datetime
- import logging
  import warnings
- from typing import Any, Dict, TypeVar
- from typing import Union, Optional
+ from typing import Any, Dict, Optional, Union, TypeVar

  import dask.dataframe as dd
  import fsspec
  import pandas as pd
- from dask import delayed, compute
  from pydantic import BaseModel

  from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
- from sibi_dst.utils import Logger
- from sibi_dst.utils import ParquetSaver, ClickHouseWriter
- from .backends.django import *
+ from sibi_dst.utils import Logger, ParquetSaver, ClickHouseWriter
  from .backends.http import HttpConfig
  from .backends.parquet import ParquetConfig
- from .backends.sqlalchemy import *
+ from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb

- # Define a generic type variable for BaseModel subclasses
+ warnings.filterwarnings("ignore")
  T = TypeVar("T", bound=BaseModel)

- # It is considered acceptable in Django to access protected class members
- warnings.filterwarnings(
- "ignore",
- message="Access to a protected member _meta",
- category=UserWarning,
- )

+ # --- Backend Strategy Pattern Implementation ---
+
+ class BaseBackend:
+ """Abstract base class defining clear sync and async loading interfaces."""
+
+ def __init__(self, helper: DfHelper):
+ self.helper = helper
+ self.logger = helper.logger
+ self.debug = helper.debug
+
+ def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+ """Synchronous data loading method. Must be implemented by sync backends."""
+ raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
+
+ async def aload(self, **options) -> dd.DataFrame | pd.DataFrame:
+ """Asynchronous data loading method. By default, it calls the sync version."""
+ return self.load(**options)
+
+
+ class SqlAlchemyBackend(BaseBackend):
+ def load(self, **options) -> dd.DataFrame:
+ try:
+ # Process incoming filter options into the ParamsConfig object
+ if options and hasattr(self.helper._backend_params, 'parse_params'):
+ self.helper._backend_params.parse_params(options)
+
+ db_loader = SqlAlchemyLoadFromDb(
+ plugin_sqlalchemy=self.helper.backend_db_connection,
+ plugin_query=self.helper._backend_query,
+ plugin_params=self.helper._backend_params,
+ logger=self.logger,
+ debug= self.debug
+ )
+ return db_loader.build_and_load()
+ except Exception as e:
+ self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
+ return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+
+ class ParquetBackend(BaseBackend):
+ """This backend is also purely synchronous."""
+
+ def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+ try:
+ df = self.helper.backend_parquet.load_files()
+ if options and df is not None:
+ df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
+ return df
+ except Exception as e:
+ self.logger.error(f"Failed to load data from parquet: {e}", exc_info=True)
+ return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+
+ class HttpBackend(BaseBackend):
+ """This backend is purely asynchronous."""
+
+ def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+ # This will correctly fail by raising NotImplementedError from the base class.
+ return self.helper.backend_http.fetch_data(**options)
+
+ async def aload(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+ if not self.helper.backend_http:
+ self.logger.warning("HTTP plugin not configured properly.")
+ return dd.from_pandas(pd.DataFrame(), npartitions=1)
+ return await self.helper.backend_http.fetch_data(**options)
+
+
+ # --- Main DfHelper Facade Class ---

  class DfHelper:
  """
- DfHelper is a utility class for managing, loading, and processing data from
- various backends, such as Django databases, Parquet files, HTTP sources, and
- SQLAlchemy-based databases. The class abstracts the complexities of handling
- different backends and provides a unified interface for data operations.
-
- The class is particularly useful for projects that require flexibility in
- data source configuration and seamless integration with both Dask and Pandas
- for handling data frames. It includes robust mechanisms for post-processing
- data, filtering columns, renaming, and setting indices.
-
- :ivar df: The DataFrame currently being processed or loaded.
- :type df: Union[dd.DataFrame, pd.DataFrame]
- :type backend_connection: Optional[DjangoConnectionConfig | SqlAlchemyConnectionConfig]
- :ivar _backend_query: Internal configuration for query handling.
- :type _backend_query: Optional[QueryConfig]
- :ivar _backend_params: Internal parameters configuration for DataFrame handling.
- :type _backend_params: Optional[ParamsConfig]
- :ivar backend_parquet: Configuration for Parquet file handling.
- :type backend_parquet: Optional[ParquetConfig]
- :ivar backend_http: Configuration for interacting with HTTP-based backends.
- :type backend_http: Optional[HttpConfig]
- :ivar parquet_filename: The filename for a Parquet file, if applicable.
- :type parquet_filename: str
- :ivar logger: Logger instance used for debugging and information logging.
- :type logger: Logger
- :ivar default_config: Default configuration dictionary that can be overridden.
- :type default_config: Dict
+ A reusable utility for loading data. It provides both sync (`load`) and
+ async (`aload`) methods to accommodate different backends.
  """
- df: Union[dd.DataFrame, pd.DataFrame] = None
- backend_db_connection: Optional[Union[DjangoConnectionConfig | SqlAlchemyConnectionConfig]] = None
- _backend_query: Optional[QueryConfig] = None
- _backend_params: Optional[ParamsConfig] = None
- backend_parquet: Optional[ParquetConfig] = None
- backend_http: Optional[HttpConfig] = None
- parquet_filename: str = None
- logger: Logger
+ _BACKEND_STRATEGIES = {
+ 'sqlalchemy': SqlAlchemyBackend,
+ 'parquet': ParquetBackend,
+ 'http': HttpBackend,
+ }
+
  default_config: Dict = None

  def __init__(self, backend='sqlalchemy', **kwargs):
- # Ensure default_config is not shared across instances
  self.default_config = self.default_config or {}
  kwargs = {**self.default_config.copy(), **kwargs}
  self.backend = backend
- self.debug = kwargs.setdefault("debug", False)
+ self.debug = kwargs.get("debug", False)
  self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
- # Configure logger level
  self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
- self.logger.debug("Logger initialized in DEBUG mode.")
- self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
- self.parquet_filename = kwargs.setdefault("parquet_filename", None)
- self.dt_field = kwargs.setdefault("dt_field", None)
- self.as_pandas = kwargs.setdefault("as_pandas", False)
- self.filesystem = kwargs.pop('filesystem', 'file')
- self.filesystem_options = kwargs.pop('filesystem_options', {})
- kwargs.setdefault("live", True)
+ self.fs = kwargs.get("fs", fsspec.filesystem('file'))
+ kwargs.setdefault("fs", self.fs)
  kwargs.setdefault("logger", self.logger)
- self.fs = kwargs.setdefault("fs", fsspec.filesystem('file'))
- self.__post_init(**kwargs)
-
- def __str__(self):
- return self.__class__.__name__
+ self._backend_query = self._get_config(QueryConfig, kwargs)
+ self._backend_params = self._get_config(ParamsConfig, kwargs)
+ self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
+ self.backend_parquet: Optional[ParquetConfig] = None
+ self.backend_http: Optional[HttpConfig] = None
+
+ if self.backend == 'sqlalchemy':
+ self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
+ elif self.backend == 'parquet':
+ self.backend_parquet = self._get_config(ParquetConfig, kwargs)
+ elif self.backend == 'http':
+ self.backend_http = self._get_config(HttpConfig, kwargs)

- def __call__(self, **options):
- return self.load(**options)
+ strategy_class = self._BACKEND_STRATEGIES.get(self.backend)
+ if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
+ self.backend_strategy = strategy_class(self)

  def __enter__(self):
  return self

  def __exit__(self, exc_type, exc_value, traceback):
- self.__cleanup()
- return False
-
- def __cleanup(self):
- """
- Clean up resources when exiting the context manager.
- This method is called when the context manager exits.
- """
-
- if self.backend_db_connection:
- if getattr(self.backend_db_connection, "dispose_idle_connections", None):
- self.backend_db_connection.dispose_idle_connections()
- if getattr(self.backend_db_connection, "close", None):
- self.backend_db_connection.close()
-
- self.backend_db_connection = None
-
- if self.backend_parquet:
- self.backend_parquet = None
- if self.backend_http:
- self.backend_http = None
- self._backend_query = None
- self._backend_params = None
-
- def __post_init(self, **kwargs):
- """
- Initializes backend-specific configurations based on the provided backend type and other
- parameters. This method performs configuration setup dependent on the selected backend,
- such as 'django_db', 'parquet', 'http', or 'sqlalchemy'. Configuration for each backend
- type is fetched or instantiated as necessary using provided parameters or default values.
-
- :param kwargs: Dictionary of arguments passed during initialization of backend configurations.
- Additional parameters for specific backend types are extracted here.
- :return: None
- """
- # self.logger.debug(f"backend used: {self.backend}")
- # self.logger.debug(f"kwargs passed to backend plugins: {kwargs}")
- self._backend_query = self.__get_config(QueryConfig, kwargs)
- self._backend_params = self.__get_config(ParamsConfig, kwargs)
- if self.backend == 'django_db':
- self.backend_db_connection = self.__get_config(DjangoConnectionConfig, kwargs)
- elif self.backend == 'parquet':
- self.parquet_filename = kwargs.setdefault("parquet_filename", None)
- self.backend_parquet = ParquetConfig(**kwargs)
- elif self.backend == 'http':
- self.backend_http = HttpConfig(**kwargs)
- elif self.backend == 'sqlalchemy':
- self.backend_db_connection = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
-
- def __get_config(self, model: [T], kwargs: Dict[str, Any]) -> Union[T]:
- """
- Initializes a Pydantic model with the keys it recognizes from the kwargs,
- and removes those keys from the kwargs dictionary.
- :param model: The Pydantic model class to initialize.
- :param kwargs: The dictionary of keyword arguments.
- :return: The initialized Pydantic model instance.
- """
- kwargs.setdefault("debug", self.debug)
- kwargs.setdefault("logger", self.logger)
- # Extract keys that the model can accept
- recognized_keys = set(model.__annotations__.keys())
- self.logger.debug(f"recognized keys: {recognized_keys}")
- model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
- # self.logger.debug(f"model_kwargs: {model_kwargs}")
+ self._cleanup()
+
+ def _cleanup(self):
+ active_config = getattr(self, f"backend_{self.backend}", None)
+ if active_config and hasattr(active_config, "close"):
+ self.logger.debug(f"Closing resources for '{self.backend}' backend.")
+ active_config.close()
+
+ def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
+ recognized_keys = set(model.model_fields.keys())
+ model_kwargs = {k: kwargs[k] for k in recognized_keys if k in kwargs}
  return model(**model_kwargs)

- def load_parallel(self, **options):
- """
- Executes the `load` method in parallel using Dask, allowing multiple instances
- to run concurrently. This function leverages Dask's `delayed` and `compute`
- methods to schedule and process tasks in parallel. It is designed to handle
- concurrent workloads efficiently by utilizing up to 4 parallel executions of
- the `load` function.
-
- :param options: Keyword arguments to be passed to the `load` method. These options
- will be applied to all parallel instances of the `load` method.
- :return: A list of results, where each element represents the output
- from one of the parallel executions of the `load` method.
- """
- # Define tasks using Dask's delayed
- tasks = [delayed(self.load)(**options) for _ in range(4)]
- results = compute(*tasks)
- return results
-
- def load(self, **options):
- """
- Loads data from a dataframe backend, ensuring compatibility with multiple
- data processing backends. Provides the data in a pandas dataframe format
- if the `as_pandas` attribute is set to True.
-
- :param options: Arbitrary keyword arguments for dataframe loading customization.
- :type options: dict
- :return: The loaded dataframe, computed as a pandas dataframe if
- `as_pandas` is set to True, or kept in its native backend format otherwise.
- """
- # this will be the universal method to load data from a df irrespective of the backend
- self.df = self.__load(**options)
- if self.as_pandas:
- return self.df.compute()
- return self.df
-
- def __load(self, **options):
- """
- Private method responsible for loading data using a specified backend. This method
- abstracts away the details of interacting with the backend and dynamically calls the
- appropriate function depending on the backend type. It supports multiple backend
- types, such as `django_db`, `sqlalchemy`, `parquet`, and `http`. If the `http` backend
- is selected, it checks whether the asyncio event loop is running and either runs the
- process as a new asyncio task or synchronously.
-
- :param options: Arbitrary keyword arguments provided for backend-specific configurations.
- These should align with the requirements of the chosen backend.
- :type options: dict
-
- :return: The data loaded from the specified backend. The return type is dependent on
- the particular backend being used.
- :rtype: Depending on backend implementation; could be `Task`, `List`, `Dict`, or
- another format defined by the backend.
- """
- if self.backend == 'django_db':
- self._backend_params.parse_params(options)
- return self.__load_from_django_db(**options)
- elif self.backend == 'sqlalchemy':
- self._backend_params.parse_params(options)
- return self.__load_from_sqlalchemy(**options)
- elif self.backend == 'parquet':
- return self.__load_from_parquet(**options)
- elif self.backend == 'http':
- if asyncio.get_event_loop().is_running():
- self.logger.debug("Running as a task from an event loop")
- return asyncio.create_task(self.__load_from_http(**options))
- else:
- self.logger.debug("Regular asyncio run...")
- return asyncio.run(self.__load_from_http(**options))
- return dd.from_pandas(pd.DataFrame(), npartitions=1)
-
- def __load_from_sqlalchemy(self, **options):
- """
- Loads data from an SQLAlchemy database source into a dataframe. The method processes
- the loaded data and applies post-processing to transform it into the desired structure.
- If the operation fails, an empty pandas DataFrame is created as a fallback.
-
- :param options: Additional keyword arguments to configure the data loading process.
- These options can include configurations such as 'debug' and other parameters
- required by the `SqlAlchemyLoadFromDb` class.
- :type options: dict
- :return: A dataframe containing the data loaded from the SQLAlchemy database.
- :rtype: dask.dataframe.DataFrame
- """
- try:
- options.setdefault("debug", self.debug)
- db_loader = SqlAlchemyLoadFromDb(
- self.backend_db_connection,
- self._backend_query,
- self._backend_params,
- self.logger,
- **options
- )
- self.df = db_loader.build_and_load()
- self.__process_loaded_data()
- self.__post_process_df()
- self.backend_db_connection.close()
- self.logger.debug("Data successfully loaded from sqlalchemy database.")
- except Exception as e:
- self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
- self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
- return self.df
-
- def __load_from_django_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
- """
- Loads data from a Django database using a specific backend query mechanism. Processes the loaded data
- and applies further post-processing before returning the dataframe. If the operation fails, an
- empty dataframe with a single partition is returned instead.
-
- :param options: Additional settings for the database loading process, which include optional configurations
- like debug mode, among others.
- :type options: dict
- :return: A dataframe containing the loaded data either as a Pandas or Dask dataframe.
- :rtype: Union[pd.DataFrame, dd.DataFrame]
- """
- try:
- options.setdefault("debug", self.debug)
- db_loader = DjangoLoadFromDb(
- self.backend_db_connection,
- self._backend_query,
- self._backend_params,
- self.logger,
- **options
- )
- self.df = db_loader.build_and_load()
- self.__process_loaded_data()
- self.__post_process_df()
- self.logger.debug("Data successfully loaded from django database.")
- except Exception as e:
- self.logger.debug(f"Failed to load data from django database: {e}")
- self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
- return self.df
-
- async def __load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
- """
- Loads data asynchronously from an HTTP source using the configured HTTP plugin.
- If the HTTP plugin is not properly configured, this method logs a debug message and
- returns an empty Dask DataFrame. If an exception occurs during data fetching, the error
- is logged and an empty Dask DataFrame with one partition is returned.
-
- :param options: Additional keyword arguments that are passed to the HTTP plugin for
- fetching the data.
- :returns: A DataFrame object that can either be a pandas or a Dask DataFrame. When the
- fetching operation fails, it defaults to returning an empty Dask DataFrame
- with a single partition.
- """
- if not self.backend_http:
- self.logger.debug("HTTP plugin not configured properly.")
- return dd.from_pandas(pd.DataFrame(), npartitions=1)
- try:
- self.df = await self.backend_http.fetch_data(**options)
- except Exception as e:
- self.logger.debug(f"Failed to load data from http plugin: {e}")
- self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
- return self.df
-
- def __post_process_df(self):
- """
- Processes a DataFrame according to the provided parameters defined within the
- `self._backend_params.df_params` dictionary. This involves filtering columns,
- renaming columns, setting an index column, and handling datetime indexing.
- The method modifies the DataFrame in place.
-
- :raises ValueError: If the lengths of `fieldnames` and `column_names` do not match,
- or if the specified `index_col` is not found in the DataFrame.
- """
- self.logger.debug("Post-processing DataFrame.")
+ def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+ """Loads data synchronously. Fails if backend is async-only."""
+ self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
+ df = self.backend_strategy.load(**options)
+ df = self._process_loaded_data(df)
+ df = self._post_process_df(df)
+ return df.compute() if as_pandas else df
+
+ async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+ """Loads data asynchronously from any backend."""
+ df = await self.backend_strategy.aload(**options)
+ df = self._process_loaded_data(df)
+ df = self._post_process_df(df)
+ return df.compute() if as_pandas else df
+
+ def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
  df_params = self._backend_params.df_params
- fieldnames = df_params.get("fieldnames", None)
- index_col = df_params.get("index_col", None)
- datetime_index = df_params.get("datetime_index", False)
- column_names = df_params.get("column_names", None)
-
- # Filter columns
+ if not df_params: return df
+ fieldnames, column_names, index_col = (df_params.get("fieldnames"), df_params.get("column_names"),
+ df_params.get("index_col"))
+ if not any([fieldnames, column_names, index_col]): return df
+ self.logger.debug("Post-processing DataFrame.")
  if fieldnames:
- existing_columns = set(self.df.columns)
- valid_fieldnames = list(filter(existing_columns.__contains__, fieldnames))
- self.df = self.df[valid_fieldnames]
-
- # Rename columns
- if column_names is not None:
- if len(fieldnames) != len(column_names):
- raise ValueError(
- f"Length mismatch: fieldnames ({len(fieldnames)}) and column_names ({len(column_names)}) must match."
- )
- rename_mapping = dict(zip(fieldnames, column_names))
- self.df = self.df.map_partitions(lambda df: df.rename(columns=rename_mapping))
-
- # Set index column
- if index_col is not None:
- if index_col in self.df.columns:
- self.df = self.df.set_index(index_col)
- else:
- raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
-
- # Handle datetime index
- if datetime_index and self.df.index.dtype != 'datetime64[ns]':
- self.df = self.df.map_partitions(lambda df: df.set_index(pd.to_datetime(df.index, errors='coerce')))
-
- self.logger.debug("Post-processing of DataFrame completed.")
-
- def __process_loaded_data(self):
- """
- Processes the dataframe by applying renaming logic based on the given field map
- configuration. Inspects the dataframe for missing columns referenced in the field
- map and flags them with a warning. Applies renaming only for columns that exist
- in the dataframe while ensuring that no operations take place if the dataframe
- is empty.
-
- :param self: The instance of the class where the dataframe is being processed.
- :type self: object with attributes `df`, `_backend_params`, and `logger`.
-
- :raises Warning: Logs a warning if specified columns in the `field_map` are not
- present in the dataframe.
-
- :return: None
- """
- self.logger.debug(f"Processing loaded data...")
- if self.df.map_partitions(len).compute().sum() > 0:
- field_map = self._backend_params.field_map or {}
- if isinstance(field_map, dict) and field_map != {}:
- rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
- missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
-
- if missing_columns:
- self.logger.warning(
- f"The following columns in field_map are not in the DataFrame: {missing_columns}, field map: {field_map}")
-
- def rename_columns(df, mapping):
- return df.rename(columns=mapping)
-
- if rename_mapping:
- # Apply renaming
- self.df = self.df.map_partitions(rename_columns, mapping=rename_mapping)
-
- self.logger.debug("Processing of loaded data completed.")
- else:
- self.logger.debug("DataFrame is empty, skipping processing.")
-
- def save_to_parquet(self, parquet_filename: Optional[str] = None, **kwargs):
- """
- Save the dataframe result to a Parquet file using specified configurations.
-
- This method leverages the ParquetSaver class to store the dataframe result
- into a Parquet file. It also provides functionality for overriding the default
- filesystem (`fs`) and storage path (`parquet_storage_path`). The method logs
- details about the saving operation for debugging purposes.
-
- :param parquet_filename: The name of the Parquet file to save the dataframe to.
- If not provided, a default name will be used.
- :param kwargs: Additional arguments to customize the saving process. These may
- include:
- - `fs`: Filesystem to be used for saving Parquet files. If not
- provided, defaults to the instance's filesystem attribute.
- - `parquet_storage_path`: The root path in the filesystem where
- Parquet files should be saved. If not provided, defaults to
- the instance's attribute for storage path.
- :return: None
- """
- if self.df.map_partitions(len).compute().sum() == 0:
- self.logger.debug("Cannot save to parquet since DataFrame is empty")
+ valid_fieldnames = [f for f in fieldnames if f in df.columns]
+ if len(valid_fieldnames) < len(fieldnames): self.logger.warning(
+ f"Missing columns for filtering: {set(fieldnames) - set(valid_fieldnames)}")
+ df = df[valid_fieldnames]
+ if column_names:
+ if len(df.columns) != len(column_names): raise ValueError(
+ f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided.")
+ df = df.rename(columns=dict(zip(df.columns, column_names)))
+ if index_col:
+ if index_col not in df.columns: raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
+ df = df.set_index(index_col)
+ return df
+
+ def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
+ field_map = self._backend_params.field_map or {}
+ if not isinstance(field_map, dict) or not field_map: return df
+ if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)): return df
+ self.logger.debug("Processing loaded data...")
+ rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
+ if rename_mapping: df = df.rename(columns=rename_mapping)
+ return df
+
+ def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
+ if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
+ self.logger.warning("Cannot save to parquet; DataFrame is empty.")
  return
  fs = kwargs.pop('fs', self.fs)
- parquet_filename = parquet_filename or self.parquet_filename
- parquet_storage_path = kwargs.pop('parquet_storage_path', self.parquet_storage_path)
- ps = ParquetSaver(df_result=self.df, parquet_storage_path=parquet_storage_path, logger=self.logger, fs=fs)
- ps.save_to_parquet(parquet_filename)
- self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {parquet_storage_path}.")
-
- def save_to_clickhouse(self, **credentials):
- """
- Saves the current DataFrame to ClickHouse using the provided credentials. This
- method first checks if the DataFrame is empty. If it is empty, the method logs
- a debug message and does not proceed with saving. Otherwise, it initializes
- a ClickHouseWriter instance and uses it to save the DataFrame to ClickHouse,
- logging a debug message upon successful completion.
-
- :param credentials: Credentials required to connect to ClickHouse as keyword
- arguments.
- :type credentials: dict
- :return: None
- """
- if self.df.map_partitions(len).compute().sum() == 0:
- self.logger.debug("Cannot write to clickhouse since Dataframe is empty")
+ path = kwargs.pop('parquet_storage_path', self.backend_parquet.parquet_storage_path)
+ ParquetSaver(df, path, self.logger, fs).save_to_parquet(parquet_filename)
+ self.logger.debug(f"Parquet saved to {parquet_filename} in path: {path}.")
+
+ def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
+ if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
+ self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
  return
- cs = ClickHouseWriter(logger=self.logger, **credentials)
- cs.save_to_clickhouse(self.df)
+ ClickHouseWriter(self.logger, **credentials).save_to_clickhouse(df)
  self.logger.debug("Save to ClickHouse completed.")

- def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
- """
- Loads data from parquet files into a DataFrame, applies provided filters, and handles exceptions.
-
- This method leverages a backend-specific implementation to load data from parquet files into a
- DataFrame. If additional options are provided and the data is successfully loaded, filters are
- applied to the DataFrame using a filter handler. Errors during this process are handled gracefully
- by logging the issue and returning an empty Dask DataFrame.
-
- :param options: A dictionary of filter options to be applied to the DataFrame.
- :type options: dict
-
- :return: A DataFrame containing the loaded and filtered data. If the operation fails, an empty
- Dask DataFrame is returned.
- :rtype: Union[pd.DataFrame, dd.DataFrame]
- """
- try:
- self.df = self.backend_parquet.load_files()
- if options and self.df is not None:
- """
- deprecated specific filter handling to a generic one
- self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
-
- """
- self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
- return self.df
- except Exception as e:
- self.logger.debug(f"Failed to load data from parquet: {e}")
- return dd.from_pandas(pd.DataFrame(), npartitions=1)
-
- def load_period(self, **kwargs):
- """
- Loads a period with specified parameters.
-
- This method acts as a wrapper around the private ``__load_period`` method. It
- accepts arbitrary keyword arguments that are passed directly to the private
- method for execution. The purpose of allowing keyword arguments is to permit
- flexible configuration or parameterization for loading a specific period, based
- on the internal implementation of the private ``__load_period`` method.
-
- Note:
- The arguments and return values are entirely determined by the private
- method's behavior. This method is intentionally designed to mask details
- of the internal logic behind the abstraction.
-
- :param kwargs: Arbitrary keyword arguments to parameterize the internal logic
- of loading a period. The specific keys and values expected by the
- ``__load_period`` method depend on its own internal implementation.
- :return: The result of calling the private ``__load_period`` method with the
- provided keyword arguments. The return type is dependent on the internal
- implementation of ``__load_period``.
- """
- return self.__load_period(**kwargs)
-
- def __load_period(self, **kwargs):
- """
- Validates and processes the temporal filtering parameters `start` and `end` for querying,
- ensuring correctness and compatibility with a specified backend (Django or SQLAlchemy).
- This method dynamically maps and validates the provided datetime or date field from the
- model according to the configured backend, and applies the appropriate filters to query objects.
-
- This function enforces that both `start` and `end` are provided and checks if the start date
- is earlier or the same as the end date. It supports parsing string representations of dates
- and validates them against the date or datetime fields associated with the chosen backend.
- If the backend or field is incompatible or missing, appropriate errors are raised.
-
- The resulting filter conditions are integrated into `kwargs` for querying with the
- appropriate backend model.
-
- :param kwargs: Keyword arguments, including temporal filtering parameters and optionally a
- datetime or date field name. Supported parameters include:
- - **dt_field**: The name of the date or datetime field to use in filtering. Defaults
- to an internally set field if not explicitly provided.
- - **start**: The starting date or datetime for the query range. Can be a `str` or
- `datetime.date/datetime.datetime` object.
- - **end**: The ending date or datetime for the query range. Can be a `str` or
- `datetime.date/datetime.datetime` object.
-
- :return: Queryset or result of the load function with the applied temporal filters.
- :rtype: Any
-
- :raises ValueError: If the `dt_field` is not provided, if `start` or `end`
- are missing, if the `start` date is later than `end`, or if the `dt_field`
- does not exist in the backend model or its metadata.
- """
- dt_field = kwargs.pop("dt_field", self.dt_field)
- if dt_field is None:
- raise ValueError("dt_field must be provided")
-
- start = kwargs.pop("start", None)
- end = kwargs.pop("end", None)
-
- # Ensure start and end are provided
- if start is None or end is None:
- raise ValueError("Both 'start' and 'end' must be provided.")
-
- # Parse string dates
- if isinstance(start, str):
- start = self.parse_date(start)
- if isinstance(end, str):
- end = self.parse_date(end)
-
- # Validate that start <= end
- if start > end:
- raise ValueError("The 'start' date cannot be later than the 'end' date.")
-
- # Reverse map to original field name
- field_map = getattr(self._backend_params, 'field_map', {}) or {}
- reverse_map = {v: k for k, v in field_map.items()}
+ def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
+ """Synchronous convenience method for loading a date range."""
+ final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+ return self.load(**final_kwargs)
+
+ async def aload_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
+ """Asynchronous convenience method for loading a date range."""
+ final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+ return await self.aload(**final_kwargs)
+
+ def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
+ start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
+ if start_date > end_date: raise ValueError("'start' date cannot be later than 'end' date.")
+ field_map = self._backend_params.field_map or {}
+ reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
+ if len(reverse_map) != len(field_map): self.logger.warning(
+ "field_map values are not unique; reverse mapping may be unreliable.")
  mapped_field = reverse_map.get(dt_field, dt_field)
-
- # Common logic for Django and SQLAlchemy
- if self.backend == 'django_db':
- model_fields = {field.name: field for field in self.backend_db_connection.model._meta.get_fields()}
- if mapped_field not in model_fields:
- raise ValueError(f"Field '{dt_field}' does not exist in the Django model.")
- field_type = type(model_fields[mapped_field]).__name__
- is_date_field = field_type == 'DateField'
- is_datetime_field = field_type == 'DateTimeField'
- elif self.backend == 'sqlalchemy':
- model = self.backend_db_connection.model
- fields = [column.name for column in model.__table__.columns]
- if mapped_field not in fields:
- raise ValueError(f"Field '{dt_field}' does not exist in the SQLAlchemy model.")
- column = getattr(model, mapped_field)
- field_type = str(column.type).upper()
- is_date_field = field_type == 'DATE'
- is_datetime_field = field_type == 'DATETIME'
- else:
- raise ValueError(f"Unsupported backend '{self.backend}'")
- # Build query filters
- if start == end:
- if is_date_field:
- kwargs[mapped_field] = start
- elif is_datetime_field:
- kwargs[f"{mapped_field}__date"] = start
+ if start_date == end_date:
+ kwargs[f"{mapped_field}__date"] = start_date
  else:
- if is_date_field:
- kwargs[f"{mapped_field}__gte"] = start
- kwargs[f"{mapped_field}__lte"] = end
- elif is_datetime_field:
- kwargs[f"{mapped_field}__date__gte"] = start
- kwargs[f"{mapped_field}__date__lte"] = end
- self.logger.debug(f"load_period kwargs: {kwargs}")
- return self.load(**kwargs)
-
- @staticmethod
- def parse_date(date_str: str) -> Union[datetime.datetime, datetime.date]:
- """
- Parses a date string and converts it to a `datetime.datetime` or
- `datetime.date` object.
-
- This method attempts to parse the given string in two distinct formats:
- 1. First, it tries to interpret the string as a datetime with the format
- ``%Y-%m-%d %H:%M:%S``. If successful, it returns a `datetime.datetime`
- object.
- 2. If the first format parsing fails, it attempts to parse the string as
- a date with the format ``%Y-%m-%d``. If successful, it returns a
- `datetime.date` object.
-
- If the string cannot be parsed in either of these formats, the method will
- raise a `ValueError`.
-
- :param date_str: The date string to be parsed. Expected to match one of the
- formats: ``%Y-%m-%d %H:%M:%S`` or ``%Y-%m-%d``.
- :type date_str: str
- :return: A `datetime.datetime` object if the string matches the first format,
- or a `datetime.date` object if the string matches the second format.
- :rtype: Union[datetime.datetime, datetime.date]
- :raises ValueError: Raised if neither date format can be successfully parsed
- from the provided string.
- """
- try:
- return datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
- except ValueError:
- return datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
+ kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
+ self.logger.debug(f"Period load generated filters: {kwargs}")
+ return kwargs
+
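Below is a minimal, illustrative usage sketch of the refactored `DfHelper` facade that ships in 2025.1.1, based only on the method signatures visible in the diff above (`load`, `aload`, `load_period`, `aload_period`, and the backend strategy selection). The public import path, the connection keyword arguments (`connection_url`, `table`, `base_url`), and the example filter names are assumptions for illustration, not values taken from the package.

```python
import asyncio

import pandas as pd

# Assumed public import path; the class itself lives in
# sibi_dst/df_helper/_df_helper.py according to the diff above.
from sibi_dst.df_helper import DfHelper

# Synchronous backends ('sqlalchemy', 'parquet') are used via load()/load_period().
# The connection kwargs below are placeholders for whatever SqlAlchemyConnectionConfig
# actually accepts in this release.
with DfHelper(backend="sqlalchemy",
              connection_url="sqlite:///example.db",  # assumed kwarg
              table="orders",                         # assumed kwarg
              debug=True) as helper:
    # Django-style filter kwargs are forwarded to ParamsConfig.parse_params().
    open_orders = helper.load(as_pandas=True, status="open")  # 'status' is a placeholder filter

    # load_period() builds '<field>__date' / '<field>__date__range' filters
    # (see _prepare_period_filters) and then delegates to load().
    january = helper.load_period(dt_field="created_at",
                                 start="2025-01-01", end="2025-01-31")

# The HTTP backend is documented as async-only in the new design,
# so callers await aload() / aload_period() instead of calling load().
async def fetch_remote() -> pd.DataFrame:
    helper = DfHelper(backend="http", base_url="https://api.example.com")  # assumed kwarg
    return await helper.aload(as_pandas=True)

if __name__ == "__main__":
    remote_df = asyncio.run(fetch_remote())
```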