sibi-dst 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- sibi_dst/df_helper/_df_helper.py +184 -591
- sibi_dst/df_helper/_parquet_artifact.py +2 -0
- sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +161 -115
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +141 -97
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +34 -105
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +175 -162
- sibi_dst/df_helper/core/_query_config.py +2 -2
- sibi_dst/utils/data_wrapper.py +2 -2
- sibi_dst/utils/log_utils.py +15 -11
- sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +325 -50
- sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
- sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +330 -51
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +91 -0
- {sibi_dst-0.3.62.dist-info → sibi_dst-0.3.64.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.62.dist-info → sibi_dst-0.3.64.dist-info}/RECORD +20 -17
- sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
- {sibi_dst-0.3.62.dist-info → sibi_dst-0.3.64.dist-info}/WHEEL +0 -0
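The headline change in this release is the rewrite of sibi_dst/df_helper/_df_helper.py shown below: the Django backend and the old monolithic load path are removed, and loading is dispatched through small strategy classes (BaseBackend, SqlAlchemyBackend, ParquetBackend, HttpBackend) behind the DfHelper facade, which now exposes load/aload and load_period/aload_period. The following minimal sketch illustrates how the new facade might be driven; it assumes DfHelper is re-exported from sibi_dst.df_helper and that the configuration keys shown (parquet_storage_path, base_url) are accepted by the parquet and HTTP configs, which this diff does not confirm.

    import asyncio
    from sibi_dst.df_helper import DfHelper  # assumed re-export; the class is defined in _df_helper.py

    # Synchronous: parquet-backed load with Django-style filter kwargs and the
    # date-range helper. 'parquet_storage_path' is an assumed ParquetConfig key.
    helper = DfHelper(backend='parquet', parquet_storage_path='/data/warehouse', debug=True)
    ddf = helper.load_period(dt_field='created_at', start='2024-01-01', end='2024-01-31')
    pdf = helper.load(as_pandas=True)  # as_pandas=True computes the Dask frame to pandas

    # Asynchronous: the HTTP backend only implements aload(); calling load() raises
    # NotImplementedError from BaseBackend. 'base_url' is an assumed HttpConfig key.
    async def fetch():
        http_helper = DfHelper(backend='http', base_url='https://example.com/api')
        return await http_helper.aload()

    # asyncio.run(fetch())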
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -1,637 +1,230 @@
 from __future__ import annotations
 
-import asyncio
-import datetime
-import logging
 import warnings
-from typing import Any, Dict, TypeVar
-from typing import Union, Optional
+from typing import Any, Dict, Optional, Union, TypeVar
 
 import dask.dataframe as dd
 import fsspec
 import pandas as pd
-from dask import delayed, compute
 from pydantic import BaseModel
 
 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
-from sibi_dst.utils import Logger
-from sibi_dst.utils import ParquetSaver, ClickHouseWriter
-from .backends.django import *
+from sibi_dst.utils import Logger, ParquetSaver, ClickHouseWriter
 from .backends.http import HttpConfig
 from .backends.parquet import ParquetConfig
-from .backends.sqlalchemy import
+from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
 
-
+warnings.filterwarnings("ignore")
 T = TypeVar("T", bound=BaseModel)
 
-# It is considered acceptable in Django to access protected class members
-warnings.filterwarnings(
-    "ignore",
-    message="Access to a protected member _meta",
-    category=UserWarning,
-)
 
+# --- Backend Strategy Pattern Implementation ---
+
+class BaseBackend:
+    """Abstract base class defining clear sync and async loading interfaces."""
+
+    def __init__(self, helper: DfHelper):
+        self.helper = helper
+        self.logger = helper.logger
+
+    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+        """Synchronous data loading method. Must be implemented by sync backends."""
+        raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
+
+    async def aload(self, **options) -> dd.DataFrame | pd.DataFrame:
+        """Asynchronous data loading method. By default, it calls the sync version."""
+        return self.load(**options)
+
+
+class SqlAlchemyBackend(BaseBackend):
+    def load(self, **options) -> dd.DataFrame:
+        try:
+            # Process incoming filter options into the ParamsConfig object
+            if options and hasattr(self.helper._backend_params, 'parse_params'):
+                self.helper._backend_params.parse_params(options)
+
+            db_loader = SqlAlchemyLoadFromDb(
+                plugin_sqlalchemy=self.helper.backend_db_connection,
+                plugin_query=self.helper._backend_query,
+                plugin_params=self.helper._backend_params,
+                logger=self.logger
+            )
+            return db_loader.build_and_load()
+        except Exception as e:
+            self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+
+class ParquetBackend(BaseBackend):
+    """This backend is also purely synchronous."""
+
+    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+        try:
+            df = self.helper.backend_parquet.load_files()
+            if options and df is not None:
+                df = FilterHandler('dask', self.logger).apply_filters(df, filters=options)
+            return df
+        except Exception as e:
+            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+
+class HttpBackend(BaseBackend):
+    """This backend is purely asynchronous."""
+
+    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+        # This will correctly fail by raising NotImplementedError from the base class.
+        return self.helper.backend_http.fetch_data(**options)
+
+    async def aload(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+        if not self.helper.backend_http:
+            self.logger.warning("HTTP plugin not configured properly.")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+        return await self.helper.backend_http.fetch_data(**options)
+
+
+# --- Main DfHelper Facade Class ---
 
 class DfHelper:
     """
-
-
-    SQLAlchemy-based databases. The class abstracts the complexities of handling
-    different backends and provides a unified interface for data operations.
-
-    The class is particularly useful for projects that require flexibility in
-    data source configuration and seamless integration with both Dask and Pandas
-    for handling data frames. It includes robust mechanisms for post-processing
-    data, filtering columns, renaming, and setting indices.
-
-    :ivar df: The DataFrame currently being processed or loaded.
-    :type df: Union[dd.DataFrame, pd.DataFrame]
-    :type backend_connection: Optional[DjangoConnectionConfig | SqlAlchemyConnectionConfig]
-    :ivar _backend_query: Internal configuration for query handling.
-    :type _backend_query: Optional[QueryConfig]
-    :ivar _backend_params: Internal parameters configuration for DataFrame handling.
-    :type _backend_params: Optional[ParamsConfig]
-    :ivar backend_parquet: Configuration for Parquet file handling.
-    :type backend_parquet: Optional[ParquetConfig]
-    :ivar backend_http: Configuration for interacting with HTTP-based backends.
-    :type backend_http: Optional[HttpConfig]
-    :ivar parquet_filename: The filename for a Parquet file, if applicable.
-    :type parquet_filename: str
-    :ivar logger: Logger instance used for debugging and information logging.
-    :type logger: Logger
-    :ivar default_config: Default configuration dictionary that can be overridden.
-    :type default_config: Dict
+    A reusable utility for loading data. It provides both sync (`load`) and
+    async (`aload`) methods to accommodate different backends.
     """
-
-
-
-
-
-
-    parquet_filename: str = None
-    logger: Logger
+    _BACKEND_STRATEGIES = {
+        'sqlalchemy': SqlAlchemyBackend,
+        'parquet': ParquetBackend,
+        'http': HttpBackend,
+    }
+
     default_config: Dict = None
 
     def __init__(self, backend='sqlalchemy', **kwargs):
-        # Ensure default_config is not shared across instances
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
         self.backend = backend
-        self.debug = kwargs.
+        self.debug = kwargs.get("debug", False)
         self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
-        # Configure logger level
        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-        self.
-
-        self.parquet_filename = kwargs.setdefault("parquet_filename", None)
-        self.dt_field = kwargs.setdefault("dt_field", None)
-        self.as_pandas = kwargs.setdefault("as_pandas", False)
-        self.filesystem = kwargs.pop('filesystem', 'file')
-        self.filesystem_options = kwargs.pop('filesystem_options', {})
-        kwargs.setdefault("live", True)
+        self.fs = kwargs.get("fs", fsspec.filesystem('file'))
+        kwargs.setdefault("fs", self.fs)
         kwargs.setdefault("logger", self.logger)
-        self.
-        self.
-
-
-
+        self._backend_query = self._get_config(QueryConfig, kwargs)
+        self._backend_params = self._get_config(ParamsConfig, kwargs)
+        self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
+        self.backend_parquet: Optional[ParquetConfig] = None
+        self.backend_http: Optional[HttpConfig] = None
+
+        if self.backend == 'sqlalchemy':
+            self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
+        elif self.backend == 'parquet':
+            self.backend_parquet = self._get_config(ParquetConfig, kwargs)
+        elif self.backend == 'http':
+            self.backend_http = self._get_config(HttpConfig, kwargs)
 
-
-
+        strategy_class = self._BACKEND_STRATEGIES.get(self.backend)
+        if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
+        self.backend_strategy = strategy_class(self)
 
     def __enter__(self):
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-        self.
-
-
-
-        ""
-
-
-
-
-
-
-            self.backend_db_connection.dispose_idle_connections()
-            if getattr(self.backend_db_connection, "close", None):
-                self.backend_db_connection.close()
-
-            self.backend_db_connection = None
-
-        if self.backend_parquet:
-            self.backend_parquet = None
-        if self.backend_http:
-            self.backend_http = None
-        self._backend_query = None
-        self._backend_params = None
-
-    def __post_init(self, **kwargs):
-        """
-        Initializes backend-specific configurations based on the provided backend type and other
-        parameters. This method performs configuration setup dependent on the selected backend,
-        such as 'django_db', 'parquet', 'http', or 'sqlalchemy'. Configuration for each backend
-        type is fetched or instantiated as necessary using provided parameters or default values.
-
-        :param kwargs: Dictionary of arguments passed during initialization of backend configurations.
-            Additional parameters for specific backend types are extracted here.
-        :return: None
-        """
-        # self.logger.debug(f"backend used: {self.backend}")
-        # self.logger.debug(f"kwargs passed to backend plugins: {kwargs}")
-        self._backend_query = self.__get_config(QueryConfig, kwargs)
-        self._backend_params = self.__get_config(ParamsConfig, kwargs)
-        if self.backend == 'django_db':
-            self.backend_db_connection = self.__get_config(DjangoConnectionConfig, kwargs)
-        elif self.backend == 'parquet':
-            self.parquet_filename = kwargs.setdefault("parquet_filename", None)
-            self.backend_parquet = ParquetConfig(**kwargs)
-        elif self.backend == 'http':
-            self.backend_http = HttpConfig(**kwargs)
-        elif self.backend == 'sqlalchemy':
-            self.backend_db_connection = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
-
-    def __get_config(self, model: [T], kwargs: Dict[str, Any]) -> Union[T]:
-        """
-        Initializes a Pydantic model with the keys it recognizes from the kwargs,
-        and removes those keys from the kwargs dictionary.
-        :param model: The Pydantic model class to initialize.
-        :param kwargs: The dictionary of keyword arguments.
-        :return: The initialized Pydantic model instance.
-        """
-        kwargs.setdefault("debug", self.debug)
-        kwargs.setdefault("logger", self.logger)
-        # Extract keys that the model can accept
-        recognized_keys = set(model.__annotations__.keys())
-        self.logger.debug(f"recognized keys: {recognized_keys}")
-        model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
-        # self.logger.debug(f"model_kwargs: {model_kwargs}")
+        self._cleanup()
+
+    def _cleanup(self):
+        active_config = getattr(self, f"backend_{self.backend}", None)
+        if active_config and hasattr(active_config, "close"):
+            self.logger.debug(f"Closing resources for '{self.backend}' backend.")
+            active_config.close()
+
+    def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
+        recognized_keys = set(model.model_fields.keys())
+        model_kwargs = {k: kwargs[k] for k in recognized_keys if k in kwargs}
         return model(**model_kwargs)
 
-    def
-        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return results
-
-    def load(self, **options):
-        """
-        Loads data from a dataframe backend, ensuring compatibility with multiple
-        data processing backends. Provides the data in a pandas dataframe format
-        if the `as_pandas` attribute is set to True.
-
-        :param options: Arbitrary keyword arguments for dataframe loading customization.
-        :type options: dict
-        :return: The loaded dataframe, computed as a pandas dataframe if
-            `as_pandas` is set to True, or kept in its native backend format otherwise.
-        """
-        # this will be the universal method to load data from a df irrespective of the backend
-        self.df = self.__load(**options)
-        if self.as_pandas:
-            return self.df.compute()
-        return self.df
-
-    def __load(self, **options):
-        """
-        Private method responsible for loading data using a specified backend. This method
-        abstracts away the details of interacting with the backend and dynamically calls the
-        appropriate function depending on the backend type. It supports multiple backend
-        types, such as `django_db`, `sqlalchemy`, `parquet`, and `http`. If the `http` backend
-        is selected, it checks whether the asyncio event loop is running and either runs the
-        process as a new asyncio task or synchronously.
-
-        :param options: Arbitrary keyword arguments provided for backend-specific configurations.
-            These should align with the requirements of the chosen backend.
-        :type options: dict
-
-        :return: The data loaded from the specified backend. The return type is dependent on
-            the particular backend being used.
-        :rtype: Depending on backend implementation; could be `Task`, `List`, `Dict`, or
-            another format defined by the backend.
-        """
-        if self.backend == 'django_db':
-            self._backend_params.parse_params(options)
-            return self.__load_from_django_db(**options)
-        elif self.backend == 'sqlalchemy':
-            self._backend_params.parse_params(options)
-            return self.__load_from_sqlalchemy(**options)
-        elif self.backend == 'parquet':
-            return self.__load_from_parquet(**options)
-        elif self.backend == 'http':
-            if asyncio.get_event_loop().is_running():
-                self.logger.debug("Running as a task from an event loop")
-                return asyncio.create_task(self.__load_from_http(**options))
-            else:
-                self.logger.debug("Regular asyncio run...")
-                return asyncio.run(self.__load_from_http(**options))
-        return dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-    def __load_from_sqlalchemy(self, **options):
-        """
-        Loads data from an SQLAlchemy database source into a dataframe. The method processes
-        the loaded data and applies post-processing to transform it into the desired structure.
-        If the operation fails, an empty pandas DataFrame is created as a fallback.
-
-        :param options: Additional keyword arguments to configure the data loading process.
-            These options can include configurations such as 'debug' and other parameters
-            required by the `SqlAlchemyLoadFromDb` class.
-        :type options: dict
-        :return: A dataframe containing the data loaded from the SQLAlchemy database.
-        :rtype: dask.dataframe.DataFrame
-        """
-        try:
-            options.setdefault("debug", self.debug)
-            db_loader = SqlAlchemyLoadFromDb(
-                self.backend_db_connection,
-                self._backend_query,
-                self._backend_params,
-                self.logger,
-                **options
-            )
-            self.df = db_loader.build_and_load()
-            self.__process_loaded_data()
-            self.__post_process_df()
-            self.backend_db_connection.close()
-            self.logger.debug("Data successfully loaded from sqlalchemy database.")
-        except Exception as e:
-            self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
-            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-        return self.df
-
-    def __load_from_django_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        """
-        Loads data from a Django database using a specific backend query mechanism. Processes the loaded data
-        and applies further post-processing before returning the dataframe. If the operation fails, an
-        empty dataframe with a single partition is returned instead.
-
-        :param options: Additional settings for the database loading process, which include optional configurations
-            like debug mode, among others.
-        :type options: dict
-        :return: A dataframe containing the loaded data either as a Pandas or Dask dataframe.
-        :rtype: Union[pd.DataFrame, dd.DataFrame]
-        """
-        try:
-            options.setdefault("debug", self.debug)
-            db_loader = DjangoLoadFromDb(
-                self.backend_db_connection,
-                self._backend_query,
-                self._backend_params,
-                self.logger,
-                **options
-            )
-            self.df = db_loader.build_and_load()
-            self.__process_loaded_data()
-            self.__post_process_df()
-            self.logger.debug("Data successfully loaded from django database.")
-        except Exception as e:
-            self.logger.debug(f"Failed to load data from django database: {e}")
-            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-        return self.df
-
-    async def __load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        """
-        Loads data asynchronously from an HTTP source using the configured HTTP plugin.
-        If the HTTP plugin is not properly configured, this method logs a debug message and
-        returns an empty Dask DataFrame. If an exception occurs during data fetching, the error
-        is logged and an empty Dask DataFrame with one partition is returned.
-
-        :param options: Additional keyword arguments that are passed to the HTTP plugin for
-            fetching the data.
-        :returns: A DataFrame object that can either be a pandas or a Dask DataFrame. When the
-            fetching operation fails, it defaults to returning an empty Dask DataFrame
-            with a single partition.
-        """
-        if not self.backend_http:
-            self.logger.debug("HTTP plugin not configured properly.")
-            return dd.from_pandas(pd.DataFrame(), npartitions=1)
-        try:
-            self.df = await self.backend_http.fetch_data(**options)
-        except Exception as e:
-            self.logger.debug(f"Failed to load data from http plugin: {e}")
-            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-        return self.df
-
-    def __post_process_df(self):
-        """
-        Processes a DataFrame according to the provided parameters defined within the
-        `self._backend_params.df_params` dictionary. This involves filtering columns,
-        renaming columns, setting an index column, and handling datetime indexing.
-        The method modifies the DataFrame in place.
-
-        :raises ValueError: If the lengths of `fieldnames` and `column_names` do not match,
-            or if the specified `index_col` is not found in the DataFrame.
-        """
-        self.logger.debug("Post-processing DataFrame.")
+    def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+        """Loads data synchronously. Fails if backend is async-only."""
+        self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
+        df = self.backend_strategy.load(**options)
+        df = self._process_loaded_data(df)
+        df = self._post_process_df(df)
+        return df.compute() if as_pandas else df
+
+    async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+        """Loads data asynchronously from any backend."""
+        df = await self.backend_strategy.aload(**options)
+        df = self._process_loaded_data(df)
+        df = self._post_process_df(df)
+        return df.compute() if as_pandas else df
+
+    def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
         df_params = self._backend_params.df_params
-
-        index_col = df_params.get("
-
-
-
-        # Filter columns
+        if not df_params: return df
+        fieldnames, column_names, index_col = (df_params.get("fieldnames"), df_params.get("column_names"),
+                                               df_params.get("index_col"))
+        if not any([fieldnames, column_names, index_col]): return df
+        self.logger.debug("Post-processing DataFrame.")
         if fieldnames:
-
-            valid_fieldnames
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def __process_loaded_data(self):
-        """
-        Processes the dataframe by applying renaming logic based on the given field map
-        configuration. Inspects the dataframe for missing columns referenced in the field
-        map and flags them with a warning. Applies renaming only for columns that exist
-        in the dataframe while ensuring that no operations take place if the dataframe
-        is empty.
-
-        :param self: The instance of the class where the dataframe is being processed.
-        :type self: object with attributes `df`, `_backend_params`, and `logger`.
-
-        :raises Warning: Logs a warning if specified columns in the `field_map` are not
-            present in the dataframe.
-
-        :return: None
-        """
-        self.logger.debug(f"Processing loaded data...")
-        if self.df.map_partitions(len).compute().sum() > 0:
-            field_map = self._backend_params.field_map or {}
-            if isinstance(field_map, dict) and field_map != {}:
-                rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
-                missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
-
-                if missing_columns:
-                    self.logger.warning(
-                        f"The following columns in field_map are not in the DataFrame: {missing_columns}, field map: {field_map}")
-
-                def rename_columns(df, mapping):
-                    return df.rename(columns=mapping)
-
-                if rename_mapping:
-                    # Apply renaming
-                    self.df = self.df.map_partitions(rename_columns, mapping=rename_mapping)
-
-            self.logger.debug("Processing of loaded data completed.")
-        else:
-            self.logger.debug("DataFrame is empty, skipping processing.")
-
-    def save_to_parquet(self, parquet_filename: Optional[str] = None, **kwargs):
-        """
-        Save the dataframe result to a Parquet file using specified configurations.
-
-        This method leverages the ParquetSaver class to store the dataframe result
-        into a Parquet file. It also provides functionality for overriding the default
-        filesystem (`fs`) and storage path (`parquet_storage_path`). The method logs
-        details about the saving operation for debugging purposes.
-
-        :param parquet_filename: The name of the Parquet file to save the dataframe to.
-            If not provided, a default name will be used.
-        :param kwargs: Additional arguments to customize the saving process. These may
-            include:
-            - `fs`: Filesystem to be used for saving Parquet files. If not
-              provided, defaults to the instance's filesystem attribute.
-            - `parquet_storage_path`: The root path in the filesystem where
-              Parquet files should be saved. If not provided, defaults to
-              the instance's attribute for storage path.
-        :return: None
-        """
-        if self.df.map_partitions(len).compute().sum() == 0:
-            self.logger.debug("Cannot save to parquet since DataFrame is empty")
+            valid_fieldnames = [f for f in fieldnames if f in df.columns]
+            if len(valid_fieldnames) < len(fieldnames): self.logger.warning(
+                f"Missing columns for filtering: {set(fieldnames) - set(valid_fieldnames)}")
+            df = df[valid_fieldnames]
+        if column_names:
+            if len(df.columns) != len(column_names): raise ValueError(
+                f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided.")
+            df = df.rename(columns=dict(zip(df.columns, column_names)))
+        if index_col:
+            if index_col not in df.columns: raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
+            df = df.set_index(index_col)
+        return df
+
+    def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
+        field_map = self._backend_params.field_map or {}
+        if not isinstance(field_map, dict) or not field_map: return df
+        if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)): return df
+        self.logger.debug("Processing loaded data...")
+        rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
+        if rename_mapping: df = df.rename(columns=rename_mapping)
+        return df
+
+    def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
+        if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
+            self.logger.warning("Cannot save to parquet; DataFrame is empty.")
             return
         fs = kwargs.pop('fs', self.fs)
-
-
-
-
-
-
-
-        """
-        Saves the current DataFrame to ClickHouse using the provided credentials. This
-        method first checks if the DataFrame is empty. If it is empty, the method logs
-        a debug message and does not proceed with saving. Otherwise, it initializes
-        a ClickHouseWriter instance and uses it to save the DataFrame to ClickHouse,
-        logging a debug message upon successful completion.
-
-        :param credentials: Credentials required to connect to ClickHouse as keyword
-            arguments.
-        :type credentials: dict
-        :return: None
-        """
-        if self.df.map_partitions(len).compute().sum() == 0:
-            self.logger.debug("Cannot write to clickhouse since Dataframe is empty")
+        path = kwargs.pop('parquet_storage_path', self.backend_parquet.parquet_storage_path)
+        ParquetSaver(df, path, self.logger, fs).save_to_parquet(parquet_filename)
+        self.logger.debug(f"Parquet saved to {parquet_filename} in path: {path}.")
+
+    def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
+        if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
+            self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
            return
-
-        cs.save_to_clickhouse(self.df)
+        ClickHouseWriter(self.logger, **credentials).save_to_clickhouse(df)
         self.logger.debug("Save to ClickHouse completed.")
 
-    def
-        """
-
-
-
-
-
-
-
-
-
-
-
-        :
-
-
-            self.df = self.backend_parquet.load_files()
-            if options and self.df is not None:
-                """
-                deprecated specific filter handling to a generic one
-                self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
-
-                """
-                self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
-            return self.df
-        except Exception as e:
-            self.logger.debug(f"Failed to load data from parquet: {e}")
-            return dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-    def load_period(self, **kwargs):
-        """
-        Loads a period with specified parameters.
-
-        This method acts as a wrapper around the private ``__load_period`` method. It
-        accepts arbitrary keyword arguments that are passed directly to the private
-        method for execution. The purpose of allowing keyword arguments is to permit
-        flexible configuration or parameterization for loading a specific period, based
-        on the internal implementation of the private ``__load_period`` method.
-
-        Note:
-            The arguments and return values are entirely determined by the private
-            method's behavior. This method is intentionally designed to mask details
-            of the internal logic behind the abstraction.
-
-        :param kwargs: Arbitrary keyword arguments to parameterize the internal logic
-            of loading a period. The specific keys and values expected by the
-            ``__load_period`` method depend on its own internal implementation.
-        :return: The result of calling the private ``__load_period`` method with the
-            provided keyword arguments. The return type is dependent on the internal
-            implementation of ``__load_period``.
-        """
-        return self.__load_period(**kwargs)
-
-    def __load_period(self, **kwargs):
-        """
-        Validates and processes the temporal filtering parameters `start` and `end` for querying,
-        ensuring correctness and compatibility with a specified backend (Django or SQLAlchemy).
-        This method dynamically maps and validates the provided datetime or date field from the
-        model according to the configured backend, and applies the appropriate filters to query objects.
-
-        This function enforces that both `start` and `end` are provided and checks if the start date
-        is earlier or the same as the end date. It supports parsing string representations of dates
-        and validates them against the date or datetime fields associated with the chosen backend.
-        If the backend or field is incompatible or missing, appropriate errors are raised.
-
-        The resulting filter conditions are integrated into `kwargs` for querying with the
-        appropriate backend model.
-
-        :param kwargs: Keyword arguments, including temporal filtering parameters and optionally a
-            datetime or date field name. Supported parameters include:
-            - **dt_field**: The name of the date or datetime field to use in filtering. Defaults
-              to an internally set field if not explicitly provided.
-            - **start**: The starting date or datetime for the query range. Can be a `str` or
-              `datetime.date/datetime.datetime` object.
-            - **end**: The ending date or datetime for the query range. Can be a `str` or
-              `datetime.date/datetime.datetime` object.
-
-        :return: Queryset or result of the load function with the applied temporal filters.
-        :rtype: Any
-
-        :raises ValueError: If the `dt_field` is not provided, if `start` or `end`
-            are missing, if the `start` date is later than `end`, or if the `dt_field`
-            does not exist in the backend model or its metadata.
-        """
-        dt_field = kwargs.pop("dt_field", self.dt_field)
-        if dt_field is None:
-            raise ValueError("dt_field must be provided")
-
-        start = kwargs.pop("start", None)
-        end = kwargs.pop("end", None)
-
-        # Ensure start and end are provided
-        if start is None or end is None:
-            raise ValueError("Both 'start' and 'end' must be provided.")
-
-        # Parse string dates
-        if isinstance(start, str):
-            start = self.parse_date(start)
-        if isinstance(end, str):
-            end = self.parse_date(end)
-
-        # Validate that start <= end
-        if start > end:
-            raise ValueError("The 'start' date cannot be later than the 'end' date.")
-
-        # Reverse map to original field name
-        field_map = getattr(self._backend_params, 'field_map', {}) or {}
-        reverse_map = {v: k for k, v in field_map.items()}
+    def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
+        """Synchronous convenience method for loading a date range."""
+        final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+        return self.load(**final_kwargs)
+
+    async def aload_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
+        """Asynchronous convenience method for loading a date range."""
+        final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+        return await self.aload(**final_kwargs)
+
+    def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
+        start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
+        if start_date > end_date: raise ValueError("'start' date cannot be later than 'end' date.")
+        field_map = self._backend_params.field_map or {}
+        reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
+        if len(reverse_map) != len(field_map): self.logger.warning(
+            "field_map values are not unique; reverse mapping may be unreliable.")
         mapped_field = reverse_map.get(dt_field, dt_field)
-
-
-        if self.backend == 'django_db':
-            model_fields = {field.name: field for field in self.backend_db_connection.model._meta.get_fields()}
-            if mapped_field not in model_fields:
-                raise ValueError(f"Field '{dt_field}' does not exist in the Django model.")
-            field_type = type(model_fields[mapped_field]).__name__
-            is_date_field = field_type == 'DateField'
-            is_datetime_field = field_type == 'DateTimeField'
-        elif self.backend == 'sqlalchemy':
-            model = self.backend_db_connection.model
-            fields = [column.name for column in model.__table__.columns]
-            if mapped_field not in fields:
-                raise ValueError(f"Field '{dt_field}' does not exist in the SQLAlchemy model.")
-            column = getattr(model, mapped_field)
-            field_type = str(column.type).upper()
-            is_date_field = field_type == 'DATE'
-            is_datetime_field = field_type == 'DATETIME'
-        else:
-            raise ValueError(f"Unsupported backend '{self.backend}'")
-        # Build query filters
-        if start == end:
-            if is_date_field:
-                kwargs[mapped_field] = start
-            elif is_datetime_field:
-                kwargs[f"{mapped_field}__date"] = start
+        if start_date == end_date:
+            kwargs[f"{mapped_field}__date"] = start_date
         else:
-
-
-
-
-            kwargs[f"{mapped_field}__date__gte"] = start
-            kwargs[f"{mapped_field}__date__lte"] = end
-        self.logger.debug(f"load_period kwargs: {kwargs}")
-        return self.load(**kwargs)
-
-    @staticmethod
-    def parse_date(date_str: str) -> Union[datetime.datetime, datetime.date]:
-        """
-        Parses a date string and converts it to a `datetime.datetime` or
-        `datetime.date` object.
-
-        This method attempts to parse the given string in two distinct formats:
-        1. First, it tries to interpret the string as a datetime with the format
-           ``%Y-%m-%d %H:%M:%S``. If successful, it returns a `datetime.datetime`
-           object.
-        2. If the first format parsing fails, it attempts to parse the string as
-           a date with the format ``%Y-%m-%d``. If successful, it returns a
-           `datetime.date` object.
-
-        If the string cannot be parsed in either of these formats, the method will
-        raise a `ValueError`.
-
-        :param date_str: The date string to be parsed. Expected to match one of the
-            formats: ``%Y-%m-%d %H:%M:%S`` or ``%Y-%m-%d``.
-        :type date_str: str
-        :return: A `datetime.datetime` object if the string matches the first format,
-            or a `datetime.date` object if the string matches the second format.
-        :rtype: Union[datetime.datetime, datetime.date]
-        :raises ValueError: Raised if neither date format can be successfully parsed
-            from the provided string.
-        """
-        try:
-            return datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
-        except ValueError:
-            return datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
+            kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
+        self.logger.debug(f"Period load generated filters: {kwargs}")
+        return kwargs
+