sibi-dst 0.3.17__tar.gz → 0.3.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/PKG-INFO +1 -1
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/pyproject.toml +1 -1
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/df_helper/_df_helper.py +49 -51
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/df_helper/_parquet_artifact.py +1 -1
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/django/_django_load_from_db.py +1 -1
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_alchemy/_io_sqlalchemy_dask.py +2 -3
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_alchemy/_sqlalchemy_load_from_db.py +5 -2
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/README.md +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/__init__.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/django/__init__.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/django/_django_db_connection.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/django/_django_sql_model_builder.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/django/_io_dask.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/django/_io_dask_alt.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/http/__init__.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/http/_http_config.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/parquet/__init__.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/parquet/_parquet_filter_handler.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/parquet/_parquet_options.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_alchemy/__init__.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_alchemy/_sqlachemy_filter_handler.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_alchemy/_sqlalchemy_model_builder.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_model/__init__.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_model/_sqlmodel_db_connection.py +0 -0
- {sibi_dst-0.3.17/sibi_dst/df_helper/plugins → sibi_dst-0.3.18/sibi_dst/df_helper/backends}/sql_model/_sqlmodel_load_from_db.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/_airflow_manager.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/_clickhouse_writer.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/_credentials.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/_data_utils.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/_data_wrapper.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/_date_utils.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/_df_utils.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/_file_utils.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/_filepath_generator.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/_log_utils.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/_parquet_saver.py +0 -0
- {sibi_dst-0.3.17 → sibi_dst-0.3.18}/sibi_dst/utils/_storage_manager.py +0 -0
@@ -6,17 +6,16 @@ from typing import Any, Dict, TypeVar
|
|
6
6
|
from typing import Union, Optional
|
7
7
|
|
8
8
|
import dask.dataframe as dd
|
9
|
-
import dask_expr
|
10
9
|
import pandas as pd
|
11
10
|
from pydantic import BaseModel
|
12
11
|
|
13
12
|
from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
|
14
13
|
from sibi_dst.utils import Logger
|
15
14
|
from sibi_dst.utils import ParquetSaver, ClickHouseWriter
|
16
|
-
from .
|
17
|
-
from .
|
18
|
-
from .
|
19
|
-
from .
|
15
|
+
from .backends.django import *
|
16
|
+
from .backends.http import HttpConfig
|
17
|
+
from .backends.parquet import ParquetConfig
|
18
|
+
from .backends.sql_alchemy import *
|
20
19
|
|
21
20
|
# Define a generic type variable for BaseModel subclasses
|
22
21
|
T = TypeVar("T", bound=BaseModel)
|
@@ -30,26 +29,25 @@ warnings.filterwarnings(
|
|
30
29
|
|
31
30
|
class DfHelper:
|
32
31
|
df: Union[dd.DataFrame, pd.DataFrame] = None
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
32
|
+
backend_django: Optional[DjangoConnectionConfig] = None
|
33
|
+
backend_query: Optional[QueryConfig] = None
|
34
|
+
backend_params: Optional[ParamsConfig] = None
|
35
|
+
backend_parquet: Optional[ParquetConfig] = None
|
36
|
+
backend_http: Optional[HttpConfig] = None
|
37
|
+
backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig] = None
|
39
38
|
parquet_filename: str = None
|
40
39
|
logger: Logger
|
41
40
|
default_config: Dict = None
|
42
41
|
|
43
|
-
def __init__(self,
|
42
|
+
def __init__(self, backend='django_db', **kwargs):
|
44
43
|
# Ensure default_config is not shared across instances
|
45
44
|
self.default_config = self.default_config or {}
|
46
45
|
kwargs = {**self.default_config.copy(), **kwargs}
|
47
|
-
self.
|
46
|
+
self.backend = backend
|
48
47
|
self.debug = kwargs.setdefault("debug", False)
|
49
|
-
self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
|
48
|
+
self.logger = kwargs.get("logger",Logger.default_logger(logger_name=self.__class__.__name__))
|
50
49
|
# Configure logger level
|
51
50
|
self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
|
52
|
-
# Configure logger level
|
53
51
|
self.logger.debug("Logger initialized in DEBUG mode.")
|
54
52
|
self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
|
55
53
|
self.dt_field = kwargs.setdefault("dt_field", None)
|
@@ -59,18 +57,18 @@ class DfHelper:
|
|
59
57
|
self.post_init(**kwargs)
|
60
58
|
|
61
59
|
def post_init(self, **kwargs):
|
62
|
-
self.logger.debug(f"
|
63
|
-
self.
|
64
|
-
self.
|
65
|
-
if self.
|
66
|
-
self.
|
67
|
-
elif self.
|
60
|
+
self.logger.debug(f"backend used: {self.backend}")
|
61
|
+
self.backend_query = self.__get_config(QueryConfig, kwargs)
|
62
|
+
self.backend_params = self.__get_config(ParamsConfig, kwargs)
|
63
|
+
if self.backend == 'django_db':
|
64
|
+
self.backend_django = self.__get_config(DjangoConnectionConfig, kwargs)
|
65
|
+
elif self.backend == 'parquet':
|
68
66
|
self.parquet_filename = kwargs.setdefault("parquet_filename", None)
|
69
|
-
self.
|
70
|
-
elif self.
|
71
|
-
self.
|
72
|
-
elif self.
|
73
|
-
self.
|
67
|
+
self.backend_parquet = ParquetConfig(**kwargs)
|
68
|
+
elif self.backend == 'http':
|
69
|
+
self.backend_http = HttpConfig(**kwargs)
|
70
|
+
elif self.backend == 'sqlalchemy':
|
71
|
+
self.backend_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
|
74
72
|
|
75
73
|
@staticmethod
|
76
74
|
def __get_config(model: [T], kwargs: Dict[str, Any]) -> Union[T]:
|
@@ -87,7 +85,7 @@ class DfHelper:
|
|
87
85
|
return model(**model_kwargs)
|
88
86
|
|
89
87
|
def load(self, **options):
|
90
|
-
# this will be the universal method to load data from a df irrespective of the
|
88
|
+
# this will be the universal method to load data from a df irrespective of the backend
|
91
89
|
df = self._load(**options)
|
92
90
|
if self.as_pandas:
|
93
91
|
return df.compute()
|
@@ -95,15 +93,15 @@ class DfHelper:
|
|
95
93
|
|
96
94
|
def _load(self, **options):
|
97
95
|
|
98
|
-
if self.
|
99
|
-
self.
|
96
|
+
if self.backend == 'django_db':
|
97
|
+
self.backend_params.parse_params(options)
|
100
98
|
return self._load_from_db(**options)
|
101
|
-
elif self.
|
102
|
-
self.
|
99
|
+
elif self.backend == 'sqlalchemy':
|
100
|
+
self.backend_params.parse_params(options)
|
103
101
|
return self._load_from_sqlalchemy(**options)
|
104
|
-
elif self.
|
102
|
+
elif self.backend == 'parquet':
|
105
103
|
return self._load_from_parquet(**options)
|
106
|
-
elif self.
|
104
|
+
elif self.backend == 'http':
|
107
105
|
if asyncio.get_event_loop().is_running():
|
108
106
|
self.logger.debug("Running as a task from an event loop")
|
109
107
|
return asyncio.create_task(self._load_from_http(**options))
|
@@ -115,9 +113,9 @@ class DfHelper:
|
|
115
113
|
try:
|
116
114
|
options.setdefault("debug", self.debug)
|
117
115
|
db_loader = SqlAlchemyLoadFromDb(
|
118
|
-
self.
|
119
|
-
self.
|
120
|
-
self.
|
116
|
+
self.backend_sqlalchemy,
|
117
|
+
self.backend_query,
|
118
|
+
self.backend_params,
|
121
119
|
self.logger,
|
122
120
|
**options
|
123
121
|
)
|
@@ -135,9 +133,9 @@ class DfHelper:
|
|
135
133
|
try:
|
136
134
|
options.setdefault("debug", self.debug)
|
137
135
|
db_loader = DjangoLoadFromDb(
|
138
|
-
self.
|
139
|
-
self.
|
140
|
-
self.
|
136
|
+
self.backend_django,
|
137
|
+
self.backend_query,
|
138
|
+
self.backend_params,
|
141
139
|
self.logger,
|
142
140
|
**options
|
143
141
|
)
|
@@ -152,12 +150,12 @@ class DfHelper:
|
|
152
150
|
return self.df
|
153
151
|
|
154
152
|
async def _load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
|
155
|
-
"""Delegate asynchronous HTTP data loading to
|
156
|
-
if not self.
|
153
|
+
"""Delegate asynchronous HTTP data loading to HttpDatabackend plugin."""
|
154
|
+
if not self.backend_http:
|
157
155
|
self.logger.debug("HTTP plugin not configured properly.")
|
158
156
|
return dd.from_pandas(pd.DataFrame(), npartitions=1)
|
159
157
|
try:
|
160
|
-
self.df = await self.
|
158
|
+
self.df = await self.backend_http.fetch_data(**options)
|
161
159
|
except Exception as e:
|
162
160
|
self.logger.debug(f"Failed to load data from http plugin: {e}")
|
163
161
|
self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
|
@@ -168,7 +166,7 @@ class DfHelper:
|
|
168
166
|
Efficiently process the DataFrame by filtering, renaming, and setting indices.
|
169
167
|
Optimized for large datasets with Dask compatibility.
|
170
168
|
"""
|
171
|
-
df_params = self.
|
169
|
+
df_params = self.backend_params.df_params
|
172
170
|
fieldnames = df_params.get("fieldnames", None)
|
173
171
|
index_col = df_params.get("index_col", None)
|
174
172
|
datetime_index = df_params.get("datetime_index", False)
|
@@ -205,7 +203,7 @@ class DfHelper:
|
|
205
203
|
def _process_loaded_data(self):
|
206
204
|
self.logger.debug(f"Type of self.df: {type(self.df)}")
|
207
205
|
if self.df.map_partitions(len).compute().sum() > 0:
|
208
|
-
field_map = self.
|
206
|
+
field_map = self.backend_params.field_map or {}
|
209
207
|
if isinstance(field_map, dict):
|
210
208
|
rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
|
211
209
|
missing_columns = [k for k in field_map.keys() if k not in self.df.columns]
|
@@ -237,7 +235,7 @@ class DfHelper:
|
|
237
235
|
self.logger.debug("Save to ClickHouse completed.")
|
238
236
|
|
239
237
|
def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
|
240
|
-
self.df = self.
|
238
|
+
self.df = self.backend_parquet.load_files()
|
241
239
|
if options:
|
242
240
|
"""
|
243
241
|
deprecated specific filter handling to a generic one
|
@@ -273,20 +271,20 @@ class DfHelper:
|
|
273
271
|
raise ValueError("The 'start' date cannot be later than the 'end' date.")
|
274
272
|
|
275
273
|
# Reverse map to original field name
|
276
|
-
field_map = getattr(self.
|
274
|
+
field_map = getattr(self.backend_params, 'field_map', {}) or {}
|
277
275
|
reverse_map = {v: k for k, v in field_map.items()}
|
278
276
|
mapped_field = reverse_map.get(dt_field, dt_field)
|
279
277
|
|
280
278
|
# Common logic for Django and SQLAlchemy
|
281
|
-
if self.
|
282
|
-
model_fields = {field.name: field for field in self.
|
279
|
+
if self.backend == 'django_db':
|
280
|
+
model_fields = {field.name: field for field in self.backend_django.model._meta.get_fields()}
|
283
281
|
if mapped_field not in model_fields:
|
284
282
|
raise ValueError(f"Field '{dt_field}' does not exist in the Django model.")
|
285
283
|
field_type = type(model_fields[mapped_field]).__name__
|
286
284
|
is_date_field = field_type == 'DateField'
|
287
285
|
is_datetime_field = field_type == 'DateTimeField'
|
288
|
-
elif self.
|
289
|
-
model = self.
|
286
|
+
elif self.backend == 'sqlalchemy':
|
287
|
+
model = self.backend_sqlalchemy.model
|
290
288
|
fields = [column.name for column in model.__table__.columns]
|
291
289
|
if mapped_field not in fields:
|
292
290
|
raise ValueError(f"Field '{dt_field}' does not exist in the SQLAlchemy model.")
|
@@ -295,7 +293,7 @@ class DfHelper:
|
|
295
293
|
is_date_field = field_type == 'DATE'
|
296
294
|
is_datetime_field = field_type == 'DATETIME'
|
297
295
|
else:
|
298
|
-
raise ValueError(f"Unsupported
|
296
|
+
raise ValueError(f"Unsupported backend '{self.backend}'")
|
299
297
|
# Build query filters
|
300
298
|
if start == end:
|
301
299
|
if is_date_field:
|
@@ -5,7 +5,7 @@ import pandas as pd
|
|
5
5
|
from IPython.core.hooks import deprecated
|
6
6
|
from django.db.models import Q
|
7
7
|
|
8
|
-
from sibi_dst.df_helper.
|
8
|
+
from sibi_dst.df_helper.backends.django import ReadFrameDask
|
9
9
|
from sibi_dst.utils import Logger
|
10
10
|
from sibi_dst.df_helper.core import django_field_conversion_map_dask
|
11
11
|
|
@@ -1,14 +1,13 @@
|
|
1
1
|
import itertools
|
2
2
|
|
3
3
|
import dask.dataframe as dd
|
4
|
-
import dask_expr
|
5
4
|
import pandas as pd
|
6
5
|
from sqlalchemy import create_engine, inspect, select
|
7
6
|
from sqlalchemy.orm import sessionmaker
|
8
7
|
|
9
|
-
from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
|
10
|
-
from sibi_dst.utils import Logger
|
11
8
|
from sibi_dst.df_helper.core import FilterHandler
|
9
|
+
from sibi_dst.utils import Logger
|
10
|
+
|
12
11
|
|
13
12
|
class SQLAlchemyDask:
|
14
13
|
def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
|
@@ -29,6 +29,7 @@ class SqlAlchemyLoadFromDb:
|
|
29
29
|
self.query_config = plugin_query
|
30
30
|
self.params_config = plugin_params
|
31
31
|
self.debug = kwargs.pop("debug", False)
|
32
|
+
self.chunk_size = kwargs.pop("chunk_size", 1000)
|
32
33
|
|
33
34
|
def build_and_load(self) -> dd.DataFrame:
|
34
35
|
"""
|
@@ -45,8 +46,10 @@ class SqlAlchemyLoadFromDb:
|
|
45
46
|
filters=self.params_config.filters,
|
46
47
|
engine_url=self.engine.url,
|
47
48
|
logger=self.logger,
|
48
|
-
chunk_size=
|
49
|
-
debug=self.debug
|
49
|
+
chunk_size=self.chunk_size,
|
50
|
+
debug=self.debug
|
51
|
+
).read_frame()
|
52
|
+
|
50
53
|
if self.df is None or len(self.df.head().index) == 0:
|
51
54
|
self.logger.debug("Query returned no results.")
|
52
55
|
dask_df=dd.from_pandas(pd.DataFrame(), npartitions=1)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|