sibi-dst 0.3.15__py3-none-any.whl → 0.3.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +36 -20
- sibi_dst/df_helper/core/__init__.py +6 -4
- sibi_dst/df_helper/core/_filter_handler.py +216 -0
- sibi_dst/df_helper/plugins/django/_django_load_from_db.py +32 -20
- sibi_dst/df_helper/plugins/django/_io_dask.py +0 -3
- sibi_dst/df_helper/plugins/http/_http_config.py +4 -4
- sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +9 -9
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +4 -2
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +8 -6
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +1 -1
- sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py +2 -3
- sibi_dst/utils/__init__.py +2 -2
- sibi_dst/utils/_clickhouse_writer.py +16 -16
- sibi_dst/utils/_data_utils.py +5 -4
- sibi_dst/utils/_data_wrapper.py +8 -5
- sibi_dst/utils/_df_utils.py +5 -5
- sibi_dst/utils/_log_utils.py +3 -0
- sibi_dst/utils/_parquet_saver.py +3 -108
- {sibi_dst-0.3.15.dist-info → sibi_dst-0.3.17.dist-info}/METADATA +2 -1
- {sibi_dst-0.3.15.dist-info → sibi_dst-0.3.17.dist-info}/RECORD +21 -20
- {sibi_dst-0.3.15.dist-info → sibi_dst-0.3.17.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -1,13 +1,16 @@
 import asyncio
 import datetime
+import logging
+import warnings
 from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 from pydantic import BaseModel
 
-from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
+from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
 from sibi_dst.utils import Logger
 from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from .plugins.django import *
@@ -18,6 +21,12 @@ from .plugins.sql_alchemy import *
 # Define a generic type variable for BaseModel subclasses
 T = TypeVar("T", bound=BaseModel)
 
+# It is considered acceptable in Django to access protected class members
+warnings.filterwarnings(
+    "ignore",
+    message="Access to a protected member _meta",
+    category=UserWarning,
+)
 
 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
@@ -36,9 +45,12 @@ class DfHelper:
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
         self.source = source
-        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.debug = kwargs.setdefault("debug", False)
-        self.
+        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
+        # Configure logger level
+        self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
+        # Configure logger level
+        self.logger.debug("Logger initialized in DEBUG mode.")
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
         self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
@@ -47,7 +59,7 @@ class DfHelper:
         self.post_init(**kwargs)
 
     def post_init(self, **kwargs):
-        self.logger.
+        self.logger.debug(f"Source used: {self.source}")
        self.plugin_query = self.__get_config(QueryConfig, kwargs)
        self.plugin_params = self.__get_config(ParamsConfig, kwargs)
        if self.source == 'django_db':
@@ -93,16 +105,15 @@ class DfHelper:
             return self._load_from_parquet(**options)
         elif self.source == 'http':
             if asyncio.get_event_loop().is_running():
-                self.logger.
+                self.logger.debug("Running as a task from an event loop")
                 return asyncio.create_task(self._load_from_http(**options))
             else:
-                self.logger.
+                self.logger.debug("Regular asyncio run...")
                 return asyncio.run(self._load_from_http(**options))
 
     def _load_from_sqlalchemy(self, **options):
         try:
             options.setdefault("debug", self.debug)
-            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = SqlAlchemyLoadFromDb(
                 self.plugin_sqlalchemy,
                 self.plugin_query,
@@ -113,9 +124,9 @@ class DfHelper:
             self.df = db_loader.build_and_load()
             self._process_loaded_data()
             self._post_process_df()
-            self.logger.
+            self.logger.debug("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
-            self.logger.
+            self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
@@ -123,7 +134,6 @@ class DfHelper:
     def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         try:
             options.setdefault("debug", self.debug)
-            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = DjangoLoadFromDb(
                 self.plugin_django_connection,
                 self.plugin_query,
@@ -134,9 +144,9 @@ class DfHelper:
             self.df = db_loader.build_and_load()
             self._process_loaded_data()
             self._post_process_df()
-            self.logger.
+            self.logger.debug("Data successfully loaded from django database.")
         except Exception as e:
-            self.logger.
+            self.logger.debug(f"Failed to load data from django database: {e}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
@@ -144,12 +154,12 @@ class DfHelper:
     async def _load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Delegate asynchronous HTTP data loading to HttpDataSource plugin."""
         if not self.plugin_http:
-            self.logger.
+            self.logger.debug("HTTP plugin not configured properly.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
         try:
             self.df = await self.plugin_http.fetch_data(**options)
         except Exception as e:
-            self.logger.
+            self.logger.debug(f"Failed to load data from http plugin: {e}")
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         return self.df
 
@@ -190,10 +200,10 @@ class DfHelper:
         if datetime_index and self.df.index.dtype != 'datetime64[ns]':
             self.df = self.df.map_partitions(lambda df: df.set_index(pd.to_datetime(df.index, errors='coerce')))
 
-        self.logger.
+        self.logger.debug("Post-processing of DataFrame completed.")
 
     def _process_loaded_data(self):
-        self.logger.
+        self.logger.debug(f"Type of self.df: {type(self.df)}")
         if self.df.map_partitions(len).compute().sum() > 0:
             field_map = self.plugin_params.field_map or {}
             if isinstance(field_map, dict):
@@ -211,25 +221,30 @@ class DfHelper:
             # Apply renaming
             self.df = self.df.map_partitions(rename_columns, mapping=rename_mapping)
 
-        self.logger.
+        self.logger.debug("Processing of loaded data completed.")
 
     def save_to_parquet(self, parquet_filename: Optional[str] = None):
         ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
         ps.save_to_parquet(parquet_filename)
-        self.logger.
+        self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")
 
     def save_to_clickhouse(self, **credentials):
         if self.df.map_partitions(len).compute().sum() == 0:
-            self.logger.
+            self.logger.debug("Cannot write to clickhouse since Dataframe is empty")
             return
         cs = ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
-        self.logger.
+        self.logger.debug("Save to ClickHouse completed.")
 
     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.plugin_parquet.load_files()
         if options:
+            """
+            deprecated specific filter handling to a generic one
             self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
+
+            """
+            self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
         return self.df
 
     def load_period(self, **kwargs):
@@ -294,6 +309,7 @@ class DfHelper:
         elif is_datetime_field:
             kwargs[f"{mapped_field}__date__gte"] = start
             kwargs[f"{mapped_field}__date__lte"] = end
+        self.logger.debug(f"load_period kwargs: {kwargs}")
         return self.load(**kwargs)
 
     @staticmethod
sibi_dst/df_helper/core/__init__.py
CHANGED
@@ -1,12 +1,13 @@
 from __future__ import annotations
 
-from ._params_config import ParamsConfig
-from ._query_config import QueryConfig
 from ._defaults import (
     django_field_conversion_map_pandas,
     django_field_conversion_map_dask,
     sqlalchemy_field_conversion_map_dask,
     normalize_sqlalchemy_type)
+from ._filter_handler import FilterHandler
+from ._params_config import ParamsConfig
+from ._query_config import QueryConfig
 
 __all__ = [
     "ParamsConfig",
@@ -14,5 +15,6 @@ __all__ = [
     "django_field_conversion_map_pandas",
     "django_field_conversion_map_dask",
     "sqlalchemy_field_conversion_map_dask",
-    "normalize_sqlalchemy_type"
-
+    "normalize_sqlalchemy_type",
+    "FilterHandler",
+]
sibi_dst/df_helper/core/_filter_handler.py
ADDED
@@ -0,0 +1,216 @@
+import datetime
+import dask.dataframe as dd
+import pandas as pd
+from sqlalchemy import func, cast
+from sqlalchemy.sql.sqltypes import Date, Time
+from sibi_dst.utils import Logger
+
+class FilterHandler:
+    def __init__(self, backend, logger=None):
+        """
+        Initialize the FilterHandler.
+
+        Args:
+            backend: The backend to use ('sqlalchemy' or 'dask').
+            logger: Optional logger for debugging purposes.
+        """
+        self.backend = backend
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)  # No-op logger if none provided
+        self.backend_methods = self._get_backend_methods(backend)
+
+    def apply_filters(self, query_or_df, model=None, filters=None):
+        """
+        Apply filters to the data source based on the backend.
+
+        Args:
+            query_or_df: SQLAlchemy query or Dask DataFrame.
+            model: SQLAlchemy model (required for SQLAlchemy backend).
+            filters: Dictionary of filters.
+
+        Returns:
+            Filtered query or DataFrame.
+        """
+        filters = filters or {}
+        for key, value in filters.items():
+            field_name, casting, operation = self._parse_filter_key(key)
+            parsed_value = self._parse_filter_value(casting, value)
+            #print(field_name, casting, operation, parsed_value)
+            # Get the column and apply backend-specific transformations
+            if self.backend == "sqlalchemy":
+                column = self.backend_methods["get_column"](field_name, model, casting)
+                condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
+                query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
+
+            elif self.backend == "dask":
+                column = self.backend_methods["get_column"](query_or_df, field_name, casting)
+                condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
+                query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
+            else:
+                raise ValueError(f"Unsupported backend: {self.backend}")
+
+        return query_or_df
+
+    @staticmethod
+    def _parse_filter_key(key):
+        parts = key.split("__")
+        field_name = parts[0]
+        casting = None
+        operation = "exact"
+
+        if len(parts) == 3:
+            _, casting, operation = parts
+        elif len(parts) == 2:
+            if parts[1] in FilterHandler._comparison_operators():
+                operation = parts[1]
+            elif parts[1] in FilterHandler._dt_operators() + FilterHandler._date_operators():
+                casting = parts[1]
+
+        return field_name, casting, operation
+
+
+    def _parse_filter_value(self, casting, value):
+        """
+        Convert filter value to appropriate type based on the casting (e.g., date).
+        """
+        if casting == "date":
+            if isinstance(value, str):
+                parsed = pd.Timestamp(value)  # Convert to datetime64[ns]
+                return parsed
+            if isinstance(value, list):
+                parsed = [pd.Timestamp(v) for v in value]
+                return parsed
+        elif casting == "time" and isinstance(value, str):
+            parsed = datetime.time.fromisoformat(value)
+            self.logger.debug(f"Parsed value (time): {parsed}")
+            return parsed
+        return value
+
+    @staticmethod
+    def _get_backend_methods(backend):
+        if backend == "sqlalchemy":
+            return {
+                "get_column": FilterHandler._get_sqlalchemy_column,
+                "apply_operation": FilterHandler._apply_operation_sqlalchemy,
+                "apply_condition": lambda query, condition: query.filter(condition),
+            }
+        elif backend == "dask":
+            return {
+                "get_column": FilterHandler._get_dask_column,
+                "apply_operation": FilterHandler._apply_operation_dask,
+                "apply_condition": lambda df, condition: df[condition],
+            }
+        else:
+            raise ValueError(f"Unsupported backend: {backend}")
+
+    @staticmethod
+    def _get_sqlalchemy_column(field_name, model, casting):
+        """
+        Retrieve and cast a column for SQLAlchemy based on the field name and casting.
+
+        Args:
+            field_name: The name of the field/column in the model.
+            model: The SQLAlchemy model.
+            casting: The casting type ('date', 'time', etc.).
+
+        Returns:
+            The SQLAlchemy column object, optionally cast or transformed.
+        """
+        column = getattr(model, field_name, None)
+        if not column:
+            raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
+
+        if casting == "date":
+            # Cast the column to Date for whole-date comparisons
+            column = cast(column, Date)
+        elif casting == "time":
+            # Cast the column to Time for time-specific comparisons
+            column = cast(column, Time)
+        elif casting in FilterHandler._date_operators():
+            # Extract date part (e.g., year, month) using SQLAlchemy functions
+            column = func.extract(casting, column)
+
+        return column
+
+    @staticmethod
+    def _get_dask_column(df, field_name, casting):
+        """
+        Retrieve and optionally cast a column for Dask based on the field name and casting.
+
+        Args:
+            df: The Dask DataFrame.
+            field_name: The name of the field/column in the DataFrame.
+            casting: The casting type ('date', 'time', etc.).
+
+        Returns:
+            The Dask Series object, optionally cast or transformed.
+        """
+        column = dd.to_datetime(df[field_name], errors="coerce") if casting in FilterHandler._dt_operators() else df[
+            field_name]
+
+        if casting == "date":
+            column = column.dt.floor("D")  # Ensure truncation to the date level
+        elif casting in FilterHandler._date_operators():
+            column = getattr(column.dt, casting)
+
+        return column
+
+    @staticmethod
+    def _apply_operation_sqlalchemy(column, operation, value):
+        operation_map = FilterHandler._operation_map_sqlalchemy()
+        if operation not in operation_map:
+            raise ValueError(f"Unsupported operation: {operation}")
+        return operation_map[operation](column, value)
+
+    @staticmethod
+    def _apply_operation_dask(column, operation, value):
+        operation_map = FilterHandler._operation_map_dask()
+        if operation not in operation_map:
+            raise ValueError(f"Unsupported operation: {operation}")
+        return operation_map[operation](column, value)
+
+    @staticmethod
+    def _operation_map_sqlalchemy():
+        return {
+            "exact": lambda col, val: col == val,
+            "gt": lambda col, val: col > val,
+            "gte": lambda col, val: col >= val,
+            "lt": lambda col, val: col < val,
+            "lte": lambda col, val: col <= val,
+            "in": lambda col, val: col.in_(val),
+            "range": lambda col, val: col.between(val[0], val[1]),
+            "contains": lambda col, val: col.like(f"%{val}%"),
+            "startswith": lambda col, val: col.like(f"{val}%"),
+            "endswith": lambda col, val: col.like(f"%{val}"),
+            "isnull": lambda col, val: col.is_(None) if val else col.isnot(None),
+        }
+
+    @staticmethod
+    def _operation_map_dask():
+        return {
+            "exact": lambda col, val: col == val,
+            "gt": lambda col, val: col > val,
+            "gte": lambda col, val: col >= val,
+            "lt": lambda col, val: col < val,
+            "lte": lambda col, val: col <= val,
+            "in": lambda col, val: col.isin(val),
+            "range": lambda col, val: (col >= val[0]) & (col <= val[1]),
+            "contains": lambda col, val: col.str.contains(val, regex=True),
+            "startswith": lambda col, val: col.str.startswith(val),
+            "endswith": lambda col, val: col.str.endswith(val),
+            "isnull": lambda col, val: col.isnull() if val else col.notnull(),
+        }
+
+    @staticmethod
+    def _dt_operators():
+        return ["date", "time"]
+
+    @staticmethod
+    def _date_operators():
+        return ["year", "month", "day", "hour", "minute", "second", "week_day"]
+
+    @staticmethod
+    def _comparison_operators():
+        return [
+            "gte", "lte", "gt", "lt", "exact", "in", "range",
+            "contains", "startswith", "endswith", "isnull",
+        ]
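A minimal usage sketch for the new FilterHandler with the Dask backend, inferred only from the code above; the DataFrame, its columns (status, created_at) and the filter values are hypothetical and not taken from the package.

import dask.dataframe as dd
import pandas as pd

from sibi_dst.df_helper.core import FilterHandler

# Hypothetical input frame; column names and values are illustrative only.
pdf = pd.DataFrame({
    "status": ["open", "closed", "open"],
    "created_at": pd.to_datetime(["2024-01-02", "2024-02-10", "2024-03-05"]),
})
ddf = dd.from_pandas(pdf, npartitions=1)

# Keys follow the field__[casting__]operation convention parsed by _parse_filter_key:
# "status__exact" -> equality; "created_at__date__gte" -> floor to date, then >=.
filtered = FilterHandler(backend="dask").apply_filters(
    ddf,
    filters={"status__exact": "open", "created_at__date__gte": "2024-02-01"},
)
print(filtered.compute())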
sibi_dst/df_helper/plugins/django/_django_load_from_db.py
CHANGED
@@ -1,5 +1,8 @@
+import warnings
+
 import dask.dataframe as dd
 import pandas as pd
+from IPython.core.hooks import deprecated
 from django.db.models import Q
 
 from sibi_dst.df_helper.plugins.django import ReadFrameDask
@@ -12,13 +15,11 @@ class DjangoLoadFromDb:
     def __init__(self, db_connection, db_query, db_params, logger, **kwargs):
         self.connection_config = db_connection
         self.debug = kwargs.pop('debug', False)
-        self.verbose_debug = kwargs.pop('verbose_debug', False)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         if self.connection_config.model is None:
             if self.debug:
-                self.logger.
-
-                print('Model must be specified')
+                self.logger.debug('Model must be specified')
+
             raise ValueError('Model must be specified')
 
         self.query_config = db_query
@@ -45,7 +46,7 @@ class DjangoLoadFromDb:
             try:
                 self.df = ReadFrameDask(queryset, **self.params_config.df_params).read_frame()
             except Exception as e:
-                self.logger.
+                self.logger.debug(f'Error loading query: {str(queryset.query)}, error message: {e}')
                 self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         else:
             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
@@ -69,16 +70,28 @@ class DjangoLoadFromDb:
         :param df: Dask DataFrame whose columns' data types are to be converted.
         :return: Dask DataFrame with converted column data types.
         """
-
-
-
-
-
-
-
-
+        """
+        [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+
+        :param df: Dask DataFrame whose columns' data types are to be converted.
+        :return: Dask DataFrame with converted column data types.
+        """
+        # Emit deprecation warning
+        warnings.warn(
+            "_convert_columns is deprecated and will be removed in a future release. "
+            "Consider using <new_method_name> instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        # Log deprecation message if debug mode is enabled
         if self.debug:
-            self.logger.
+            self.logger.warning(
+                "[DEPRECATION NOTICE] The `_convert_columns` method is deprecated and will be removed in a future release. "
+                "Consider using <new_method_name> instead."
+            )
+
+        self.logger.debug(f'Converting columns: {list(df.columns)}')
 
         # Get field information from the Django model
         model_fields = self.connection_config.model._meta.get_fields()
@@ -87,13 +100,13 @@ class DjangoLoadFromDb:
         for field_name, field_type in field_type_map.items():
             if field_name not in df.columns:
 
-
+                self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
                 continue
 
             conversion_func = django_field_conversion_map_dask.get(field_type)
             if not conversion_func:
                 message=f"Field type '{field_type}' not found in conversion_map."
-
+                self.logger.debug(message)
                 continue
 
             def apply_conversion(partition):
@@ -104,7 +117,7 @@ class DjangoLoadFromDb:
                     if field_name in partition.columns:
                         partition[field_name] = conversion_func(partition[field_name])
                 except Exception as e:
-                    self.logger.
+                    self.logger.debug(f"Error converting column '{field_name}' in partition: {str(e)}")
                 return partition
 
             try:
@@ -113,9 +126,8 @@ class DjangoLoadFromDb:
                     apply_conversion,
                     meta=df,
                 )
-
-                is_verbose=True)
+                self.logger.debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.")
             except Exception as e:
-
+                self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")
 
         return df
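A small, self-contained sketch of the deprecation pattern used above (warnings.warn with DeprecationWarning and stacklevel=2). The legacy() function is a stand-in, not part of sibi_dst; the snippet only shows how a caller can surface such warnings, since Python ignores DeprecationWarning by default outside __main__.

import warnings

def legacy():
    # Mirrors the pattern used by _convert_columns above.
    warnings.warn(
        "legacy() is deprecated and will be removed in a future release.",
        DeprecationWarning,
        stacklevel=2,
    )

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always", DeprecationWarning)
    legacy()

for w in caught:
    print(f"{w.category.__name__}: {w.message}")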
sibi_dst/df_helper/plugins/http/_http_config.py
CHANGED
@@ -31,17 +31,17 @@ class HttpConfig(BaseModel):
             # Set up headers with API key if provided
             headers = {"Authorization": f"Bearer {self.api_key.get_secret_value()}"} if self.api_key else {}
 
-            self.logger.
+            self.logger.debug(f"Fetching data from {formatted_url} with params {self.params}")
             async with httpx.AsyncClient() as client:
                 response = await client.get(formatted_url, params=self.params, headers=headers, timeout=self.timeout)
                 response.raise_for_status()
                 data = response.json()
                 df = dd.from_pandas(pd.json_normalize(data), npartitions=1)
-                self.logger.
+                self.logger.debug("Data successfully loaded from HTTP JSON source.")
                 return df
         except httpx.RequestError as e:
-            self.logger.
+            self.logger.debug(f"HTTP request error: {e}")
             raise
         except ValueError as e:
-            self.logger.
+            self.logger.debug(f"Error parsing JSON data: {e}")
             raise
sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py
CHANGED
@@ -8,7 +8,7 @@ from sqlalchemy.orm import sessionmaker
 
 from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
 from sibi_dst.utils import Logger
-
+from sibi_dst.df_helper.core import FilterHandler
 
 class SQLAlchemyDask:
     def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
@@ -38,7 +38,6 @@ class SQLAlchemyDask:
         """
         mapper = inspect(model)
         sqlalchemy_to_dask_dtype = {
-            #'INTEGER': pd.to_numeric(x, errors="coerce"),
             'INTEGER': 'Int64',
             'SMALLINT': 'Int64',
             'BIGINT': 'Int64',
@@ -72,11 +71,15 @@ class SQLAlchemyDask:
             # Build query
             self.query = select(self.model)
             if self.filters:
-
+                """
+                deprecated specific filter handling to a generic one
+                #self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+                """
+                self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query, model=self.model, filters=self.filters)
             else:
                 n_records = 100
                 self.query = self.query.limit(n_records)
-
+            self.logger.debug(f"query:{self.query}")
             # Infer dtypes
             dtypes = self.infer_dtypes_from_model(self.model)
             # Get the column order from the SQLAlchemy model
@@ -124,14 +127,11 @@ class SQLAlchemyDask:
             else:
                 dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
 
-
-            self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
-
-            if isinstance(dask_df, dask_expr._collection.DataFrame):
-                dask_df = dask_df.to_legacy_dataframe()
+            self.logger.debug(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
 
             return dask_df
 
         except Exception as e:
             self.logger.error(f"Error executing query: {str(e)}")
+            self.logger.error(self.query)
             return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
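For the SQLAlchemy backend, the handler is applied to a Select the same way SQLAlchemyDask does above. A hedged sketch follows; the Order model, its columns, and the filter values are hypothetical, and only the FilterHandler call itself mirrors the package code.

from sqlalchemy import Column, DateTime, Integer, String, select
from sqlalchemy.orm import declarative_base

from sibi_dst.df_helper.core import FilterHandler

Base = declarative_base()

class Order(Base):
    # Hypothetical model, used only to illustrate the call signature.
    __tablename__ = "orders"
    id = Column(Integer, primary_key=True)
    status = Column(String)
    created_at = Column(DateTime)

query = select(Order)
query = FilterHandler(backend="sqlalchemy").apply_filters(
    query,
    model=Order,
    filters={"status__in": ["open", "pending"], "created_at__date__gte": "2024-02-01"},
)
# Both conditions are attached to the WHERE clause via query.filter(condition).
print(query)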
sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py
CHANGED
@@ -1,7 +1,9 @@
-from sqlalchemy import and_, or_, not_, func, cast
-from sqlalchemy.sql.sqltypes import Date, Time
 import datetime
 
+from sqlalchemy import func, cast
+from sqlalchemy.sql.sqltypes import Date, Time
+
+
 class SqlAlchemyFilterHandler:
     @staticmethod
     def apply_filters_sqlalchemy(query, model, filters):
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py
CHANGED
@@ -1,4 +1,5 @@
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 
 from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
@@ -28,7 +29,6 @@ class SqlAlchemyLoadFromDb:
         self.query_config = plugin_query
         self.params_config = plugin_params
         self.debug = kwargs.pop("debug", False)
-        self.verbose_debug = kwargs.pop("verbose_debug", False)
 
     def build_and_load(self) -> dd.DataFrame:
         """
@@ -40,7 +40,6 @@ class SqlAlchemyLoadFromDb:
     def _build_and_load(self) -> dd.DataFrame:
 
         try:
-            # reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, debug=self.debug)
             self.df = SQLAlchemyDask(
                 model=self.model,
                 filters=self.params_config.filters,
@@ -49,10 +48,13 @@ class SqlAlchemyLoadFromDb:
                 chunk_size=1000,
                 debug=self.debug).read_frame()
             if self.df is None or len(self.df.head().index) == 0:
-                self.logger.
-
+                self.logger.debug("Query returned no results.")
+                dask_df=dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+                return dask_df
             return self.df
         except Exception as e:
-            self.logger.
-
+            self.logger.debug(f"Failed to load data into Dask DataFrame.{e}")
+            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+            return dask_df
sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py
CHANGED
@@ -26,7 +26,6 @@ class SQLModelLoadFromDb:
         self.query_config = db_query or {}
         self.params_config = db_params or {}
         self.debug = kwargs.pop("debug", False)
-        self.verbose_debug = kwargs.pop("verbose_debug", False)
 
     def _default_logger(self):
         """Create a default logger."""
@@ -69,7 +68,7 @@ class SQLModelLoadFromDb:
                 query = query.limit(n_records)
 
             # Debug: Log the SQL query
-            self.logger.
+            self.logger.debug(f"Executing query: {str(query)}")
 
             # Execute the query
             results = session.exec(query).fetchall()
@@ -79,7 +78,7 @@ class SQLModelLoadFromDb:
             if results:
                 df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
             else:
-                self.logger.
+                self.logger.debug("Query returned no results.")
                 df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         except Exception as e:
sibi_dst/utils/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
-from ._credentials import
+from ._credentials import *
 from ._log_utils import Logger
-from ._date_utils import
+from ._date_utils import *
 from ._data_utils import DataUtils
 from ._file_utils import FileUtils
 from ._filepath_generator import FilePathGenerator
sibi_dst/utils/_clickhouse_writer.py
CHANGED
@@ -34,7 +34,7 @@ class ClickHouseWriter:
         self.df = df.copy()
         self.order_by = kwargs.setdefault('order_by',self.order_by)
         if len(self.df.head().index) == 0:
-            self.logger.
+            self.logger.debug("Dataframe is empty")
             return
         self._handle_missing_values()
         self._connect()
@@ -51,7 +51,7 @@ class ClickHouseWriter:
                 user=self.clickhouse_user,
                 password=self.clickhouse_password
             )
-            self.logger.
+            self.logger.debug("Connected to ClickHouse")
         except Exception as e:
             self.logger.error(e)
             raise
@@ -80,7 +80,7 @@ class ClickHouseWriter:
     def _drop_table(self):
         if self.client:
             self.client.command('DROP TABLE IF EXISTS {}'.format(self.clickhouse_table))
-            self.logger.
+            self.logger.debug(f"Dropped table {self.clickhouse_table}")
 
     def _create_table_from_dask(self, engine=None):
         if engine is None:
@@ -88,18 +88,18 @@ class ClickHouseWriter:
         dtypes = self.df.dtypes
         clickhouse_schema = self._generate_clickhouse_schema(dtypes,self.dtype_to_clickhouse)
         create_table_sql= f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
-        self.logger.
+        self.logger.debug(f"Creating table SQL:{create_table_sql}")
         if self.client:
             self.client.command(create_table_sql)
-            self.logger.
+            self.logger.debug("Created table '{}'".format(self.clickhouse_table))
 
     def _handle_missing_values(self):
         """
         Handle missing values in the Dask DataFrame before writing to ClickHouse.
         """
-        self.logger.
+        self.logger.debug("Checking for missing values...")
         missing_counts = self.df.isnull().sum().compute()
-        self.logger.
+        self.logger.debug(f"Missing values per column:\n{missing_counts}")
 
         # Replace missing values based on column types
         def replace_missing_values(df):
@@ -116,14 +116,14 @@ class ClickHouseWriter:
 
         # Apply replacement
         self.df = replace_missing_values(self.df)
-        self.logger.
+        self.logger.debug("Missing values replaced.")
 
     def _write_data(self):
         """
         Writes the Dask DataFrame to a ClickHouse table partition by partition.
         """
         if len(self.df.head().index) == 0:
-            self.logger.
+            self.logger.debug("No data found. Nothing written.")
             return
 
         for i, partition in enumerate(self.df.to_delayed()):
@@ -132,10 +132,10 @@ class ClickHouseWriter:
             df = partition.compute()
 
             if df.empty:
-                self.logger.
+                self.logger.debug(f"Partition {i} is empty. Skipping...")
                 continue
 
-            self.logger.
+            self.logger.debug(f"Writing partition {i} with {len(df)} rows to ClickHouse.")
 
             # Write the partition to the ClickHouse table
             self.client.insert_df(self.clickhouse_table, df)
@@ -148,7 +148,7 @@ class ClickHouseWriter:
         Ensures a separate client instance is used per thread to avoid session conflicts.
         """
         if len(self.df.index) == 0:
-            self.logger.
+            self.logger.debug("No data found. Nothing written.")
             return
 
         def create_client():
@@ -170,13 +170,13 @@ class ClickHouseWriter:
             Write a single partition to ClickHouse using a separate client instance.
             """
             try:
-                self.logger.
+                self.logger.debug(f"Starting to process partition {index}")
                 client = create_client()  # Create a new client for the thread
 
                 # Compute the Dask partition into a Pandas DataFrame
                 df = partition.compute()
                 if df.empty:
-                    self.logger.
+                    self.logger.debug(f"Partition {index} is empty. Skipping...")
                     return
 
                 # Convert DataFrame to list of tuples
@@ -184,7 +184,7 @@ class ClickHouseWriter:
                 columns = df.columns.tolist()
 
                 # Perform the insert
-                self.logger.
+                self.logger.debug(f"Writing partition {index} with {len(df)} rows to ClickHouse.")
                 client.execute(f"INSERT INTO {self.clickhouse_table} ({', '.join(columns)}) VALUES", data)
 
             except Exception as e:
@@ -192,7 +192,7 @@ class ClickHouseWriter:
             finally:
                 if 'client' in locals() and hasattr(client, 'close'):
                     client.close()
-                    self.logger.
+                    self.logger.debug(f"Closed client for partition {index}")
 
         try:
             # Get delayed partitions and enumerate them
sibi_dst/utils/_data_utils.py
CHANGED
@@ -68,6 +68,7 @@ class DataUtils:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
         """
         # Return early if the DataFrame is empty
+        debug = kwargs.setdefault("debug", False)
         if self.is_dataframe_empty(df):
             return df
 
@@ -88,7 +89,7 @@ class DataUtils:
         column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])
 
         if source_col not in df.columns:
-            self.logger.
+            self.logger.debug(f"{source_col} not in DataFrame columns")
             return df
 
         # Get unique IDs from source column
@@ -99,7 +100,7 @@ class DataUtils:
 
         # Check if any IDs are found
         if not len(ids):
-            self.logger.
+            self.logger.debug(f"No IDs found in the source column: {source_col}")
             return df
 
         # Convert to a list only if necessary and sort
@@ -114,10 +115,10 @@ class DataUtils:
             f'{lookup_col}__in': ids
         })
         # Load lookup data
-        lookup_instance = classname(debug=
+        lookup_instance = classname(debug=debug)
         result = lookup_instance.load(**load_kwargs)
         if len(result.index) == 0:
-            self.logger.
+            self.logger.debug(f"No IDs found in the source column: {source_col}")
             return df
         # Determine the join column on the result DataFrame
         temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col
sibi_dst/utils/_data_wrapper.py
CHANGED
@@ -1,12 +1,15 @@
 import datetime
 from typing import Type, Any, Dict, Optional
+
 import fsspec
 import pandas as pd
 from IPython.display import display
-from sibi_dst.utils import Logger
 from tqdm import tqdm
+
+from sibi_dst.utils import Logger
 from sibi_dst.utils import ParquetSaver
 
+
 class DataWrapper:
     DEFAULT_MAX_AGE_MINUTES = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD = 30
@@ -112,7 +115,7 @@ class DataWrapper:
         file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
 
         if self.verbose:
-            self.logger.
+            self.logger.debug(
                 f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
                 f"(threshold: {self.max_age_minutes} minutes)"
             )
@@ -129,14 +132,14 @@ class DataWrapper:
         start_time = datetime.datetime.now()
 
         if self.verbose:
-            self.logger.
+            self.logger.debug(f"Processing {full_parquet_filename}...")
 
         data_object = self.dataclass(**self.class_params)
         df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
         if len(df.index)==0:
             if self.verbose:
-                self.logger.
+                self.logger.debug("No data found for the specified date.")
             return
 
         parquet_saver = ParquetSaver(df, folder, self.logger)
@@ -146,7 +149,7 @@ class DataWrapper:
         duration_seconds = (end_time - start_time).total_seconds()
 
         if self.verbose:
-            self.logger.
+            self.logger.debug(
                 f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
             )
 
sibi_dst/utils/_df_utils.py
CHANGED
@@ -85,7 +85,7 @@ class DfUtils:
         # Ensure all specified columns exist in the DataFrame
         missing_columns = [col for col, _, _ in conditions if col not in df.columns]
         if missing_columns:
-            self.logger.
+            self.logger.debug(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
             return df
 
         # Build the combined filtering condition
@@ -117,7 +117,7 @@ class DfUtils:
             DataFrame: Grouped DataFrame with counts.
         """
         if debug:
-            self.logger.
+            self.logger.debug(f"Grouping by: {group_by_expr}")
 
         df_grouped = df.groupby(by=group_by_expr).size().reset_index(name=group_expr)
         return df_grouped
@@ -141,7 +141,7 @@ class DfUtils:
 
         if debug:
             df_duplicates = df[df.duplicated(subset=duplicate_expr)]
-            self.logger.
+            self.logger.debug(f"Duplicate Rows based on columns {duplicate_expr} are:\n{df_duplicates}")
 
         if sort_field:
             if isinstance(df, dd.DataFrame):
@@ -224,9 +224,9 @@ class DfUtils:
         Returns:
             DataFrame: Resampled pivot table.
         """
-        if isinstance(df, dd.DataFrame):
+        if isinstance(df, dd.core.DataFrame):
             # Implement Dask-compatible pivot and resample
-            self.logger.
+            self.logger.debug("Performing summarization with Dask DataFrame.")
             # Ensure the index is a datetime for resampling
             if not isinstance(df.index, (pd.DatetimeIndex, dd.core.DatetimeIndex)):
                 self.logger.warning("Index is not a DatetimeIndex. Converting index to datetime.")
sibi_dst/utils/_log_utils.py
CHANGED
sibi_dst/utils/_parquet_saver.py
CHANGED
@@ -1,18 +1,16 @@
-import datetime
 from pathlib import Path
 from typing import Optional
 
-import
+import dask_expr
 import fsspec
-import pandas as pd
 import pyarrow as pa
+
 from sibi_dst.utils import Logger
 
+
 class ParquetSaver:
     def __init__(self, df_result, parquet_storage_path, logger=None):
         # Ensure df_result is a Dask DataFrame
-        if not isinstance(df_result, dd.DataFrame):
-            df_result = dd.from_pandas(df_result, npartitions=1)
         self.df_result = df_result
         self.parquet_storage_path = parquet_storage_path
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -106,106 +104,3 @@ class ParquetSaver:
             str(full_path), engine="pyarrow", schema=schema, write_index=False
         )
 
-# import datetime
-# from pathlib import Path
-# from typing import Optional
-#
-# import dask.dataframe as dd
-# import fsspec
-# import pandas as pd
-# import pyarrow as pa
-# from sibi_dst.utils import Logger
-#
-# class ParquetSaver:
-#     def __init__(self, df_result, parquet_storage_path, logger):
-#         self.df_result = df_result
-#         self.parquet_storage_path = parquet_storage_path
-#         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-#
-#
-#     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
-#         full_path = self._construct_full_path(parquet_filename)
-#
-#         if len(self.df_result) == 0:
-#             self.logger.warning('No data to save')
-#             return  # Exit early if there's no data to save
-#
-#         # Ensure directory exists and clear if necessary
-#         self._ensure_directory_exists(full_path, clear_existing=True)
-#
-#         # Define schema and save DataFrame to parquet
-#         schema = self._define_schema()
-#         self._convert_dtypes(schema)
-#         self._save_dataframe_to_parquet(full_path, schema)
-#
-#     def _define_schema(self) -> pa.Schema:
-#         """Define a PyArrow schema dynamically based on df_result column types."""
-#         pandas_dtype_to_pa = {
-#             'object': pa.string(),
-#             'string': pa.string(),
-#             'Int64': pa.int64(),
-#             'int64': pa.int64(),
-#             'float64': pa.float64(),
-#             'bool': pa.bool_(),
-#             'boolean': pa.bool_(),  # pandas nullable boolean
-#             'datetime64[ns]': pa.timestamp('ns'),
-#             'timedelta[ns]': pa.duration('ns')
-#         }
-#
-#         fields = [
-#             pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
-#             for col, dtype in self.df_result.dtypes.items()
-#         ]
-#         return pa.schema(fields)
-#
-#     def _convert_dtypes(self, schema: pa.Schema):
-#         """Convert DataFrame columns to match the specified schema."""
-#         dtype_mapping = {}
-#         for field in schema:
-#             col_name = field.name
-#             if col_name in self.df_result.columns:
-#                 if pa.types.is_string(field.type):
-#                     dtype_mapping[col_name] = 'string'
-#                 elif pa.types.is_int64(field.type):
-#                     dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
-#                 elif pa.types.is_float64(field.type):
-#                     dtype_mapping[col_name] = 'float64'
-#                 elif pa.types.is_boolean(field.type):
-#                     dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
-#                 elif pa.types.is_timestamp(field.type):
-#                     dtype_mapping[col_name] = 'datetime64[ns]'
-#                 else:
-#                     dtype_mapping[col_name] = 'object'  # Fallback to object
-#         self.df_result = self.df_result.astype(dtype_mapping)
-#
-#     def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
-#         """Construct and return the full path for the parquet file."""
-#         fs, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
-#         parquet_filename = parquet_filename or "default.parquet"
-#         return Path(base_path) / parquet_filename
-#
-#     @staticmethod
-#     def _ensure_directory_exists(full_path: Path, clear_existing=False):
-#         """Ensure that the directory for the path exists, clearing it if specified."""
-#         fs, _ = fsspec.core.url_to_fs(str(full_path))
-#         directory = str(full_path.parent)
-#
-#         if fs.exists(directory):
-#             if clear_existing:
-#                 fs.rm(directory, recursive=True)
-#         else:
-#             fs.mkdirs(directory, exist_ok=True)
-#
-#     def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
-#         """Save the DataFrame to parquet with fsspec using specified schema."""
-#         fs, _ = fsspec.core.url_to_fs(str(full_path))
-#         if fs.exists(full_path):
-#             fs.rm(full_path, recursive=True)
-#         if isinstance(self.df_result, dd.DataFrame):
-#             self.df_result.to_parquet(
-#                 str(full_path), engine="pyarrow", schema=schema, write_index=False
-#             )
-#         elif isinstance(self.df_result, pd.DataFrame):
-#             dd.from_pandas(self.df_result, npartitions=1).to_parquet(
-#                 str(full_path), engine="pyarrow", schema=schema, write_index=False
-#             )
{sibi_dst-0.3.15.dist-info → sibi_dst-0.3.17.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.
+Version: 0.3.17
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -13,6 +13,7 @@ Requires-Dist: chardet (>=5.2.0,<6.0.0)
 Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
+Requires-Dist: dask-expr (>=1.1.20,<2.0.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
 Requires-Dist: django (>=5.1.4,<6.0.0)
 Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
{sibi_dst-0.3.15.dist-info → sibi_dst-0.3.17.dist-info}/RECORD
CHANGED
@@ -1,45 +1,46 @@
 sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
 sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
-sibi_dst/df_helper/_df_helper.py,sha256=
+sibi_dst/df_helper/_df_helper.py,sha256=yOapAc3MLQnylGKs0TG4Nmf8gaLdM7Nvzt4H1bEp8ik,13898
 sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
-sibi_dst/df_helper/core/__init__.py,sha256=
+sibi_dst/df_helper/core/__init__.py,sha256=o4zDwgVmaijde3oix0ezb6KLxI5QFy-SGUhFTDVFLT4,569
 sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
+sibi_dst/df_helper/core/_filter_handler.py,sha256=SYZqpX4Vt6GAGR0L0LohlDOdjLLWQXJDiWWqFG-lSu0,8563
 sibi_dst/df_helper/core/_params_config.py,sha256=hO-PddoaGjFebqJFgtn76WwVHcCjzPW3z5i3NyK6mDw,3475
 sibi_dst/df_helper/core/_query_config.py,sha256=HEiyR_fBJjIMum-PSQroY3KaefQ2SpW1w1SQS8oT-NU,489
 sibi_dst/df_helper/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/df_helper/plugins/django/__init__.py,sha256=VkvYql-PUyCKOfoKx5aGdUAki8X-kULfhnCvgSQzHDQ,311
 sibi_dst/df_helper/plugins/django/_django_db_connection.py,sha256=9fGvXXgqPk_TC7BvaSljxTgNDWoXi_LZxKabEgExznM,1654
-sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=
+sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=NSPNCNzvEAR-4AuTSPQWJsidBvT9zRHAN6L3JC1xRV0,5591
 sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py,sha256=GprCh2c6PFlRBUCir4hh8pmf4Eqb-4OIK6Vz_xXQjMw,14864
-sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=
+sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=P3WmkuFzmWRzFchjsVD2OElIR3stuevwDH9G6Mu8IWE,9080
 sibi_dst/df_helper/plugins/django/_io_dask_alt.py,sha256=zDjLyYxBeL0ffn3yfE_7vqMLMpeEEk2o-zMr66sKkDw,6827
 sibi_dst/df_helper/plugins/http/__init__.py,sha256=AG9JSDRyVna2r1yxCQ9HcY32EaGnzWsfKgNLgPpSXjY,102
-sibi_dst/df_helper/plugins/http/_http_config.py,sha256=
+sibi_dst/df_helper/plugins/http/_http_config.py,sha256=NN3bol7NgBTDv70yOX7hJkazt1-dAAdFWVkYyHdIXsI,2128
 sibi_dst/df_helper/plugins/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPjLDTVHW2NQqqfQwWAw,187
 sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=6iFvblnVq0qj89QvieQuYxe_2RPX5ArKfq5zBcEIj90,3660
 sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=suJC7LfNEWAo-7_R62YTMSRku3k8orysft83VxRUems,4394
 sibi_dst/df_helper/plugins/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
-sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=
+sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=fna8xZL8Ij6uCM_tZINO8vPdpJZaXs41gGzR4xn5zd8,5531
+sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=KShsLJYGVxN0ps9Wot7fF0nR0wW9WzcPIcWZ9f5vdBo,4654
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=
+sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=DBIM4kk86GxWkyiEZ4dSl_DdKa9SMvANCbympfzOqgQ,2169
+sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=IQK2jOXMNJRQOSD0VQ0p11BeDGlvxD8NfFRilw9Go80,4466
 sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
-sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=
-sibi_dst/utils/__init__.py,sha256=
+sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=jYwkIz7_E9Z6Mqw1a9TCWKWD146Tbx7mcQFxIpmKgKU,3686
+sibi_dst/utils/__init__.py,sha256=TV229dPIIEzU5qCLI1G6fnCZW-VirUwSuffp7z7OTFg,783
 sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
-sibi_dst/utils/_clickhouse_writer.py,sha256=
+sibi_dst/utils/_clickhouse_writer.py,sha256=JcnWN2635ATCOaFiB6NYglNXDwqKw0jC7Urs9WOZE20,8571
 sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
-sibi_dst/utils/_data_utils.py,sha256=
-sibi_dst/utils/_data_wrapper.py,sha256=
+sibi_dst/utils/_data_utils.py,sha256=ch4j5FEs8ZnniUzpbeLO-b4Yco_6nwCu71xHaVqMGi4,7050
+sibi_dst/utils/_data_wrapper.py,sha256=_hLZhKqSxcfXe8IyWM2paBxtW2JlOCq2jYhNGcInPi4,9406
 sibi_dst/utils/_date_utils.py,sha256=KYB07puKDrSG8tOm_i1HGX0TjLNUtSWjwfsCYBmW9co,10619
-sibi_dst/utils/_df_utils.py,sha256=
+sibi_dst/utils/_df_utils.py,sha256=pjEfkof9hggXQgYerG0p4DXrwBeIRynJFg4IX3Yrb4c,10919
 sibi_dst/utils/_file_utils.py,sha256=5EN90c8N1n9d-_xwz2RzaYcXRMQY_rws2Q3EA3pNAog,1254
 sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixWnn_68_n0,6686
-sibi_dst/utils/_log_utils.py,sha256=
-sibi_dst/utils/_parquet_saver.py,sha256
+sibi_dst/utils/_log_utils.py,sha256=rPp8z1UglwvqzBOOAvMOct0syQZ-54gGYafnJDRYZN4,2313
+sibi_dst/utils/_parquet_saver.py,sha256=3BK0XXgMOOAdIw4OzbwMxmDrzDw3_MKi8RTpulIVUe0,4367
 sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
-sibi_dst-0.3.
-sibi_dst-0.3.
-sibi_dst-0.3.
+sibi_dst-0.3.17.dist-info/METADATA,sha256=Aw__Wr7myZwJfWGRNFy0Ye5FLXUnGf6b14GW5KBDGtE,2133
+sibi_dst-0.3.17.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sibi_dst-0.3.17.dist-info/RECORD,,
{sibi_dst-0.3.15.dist-info → sibi_dst-0.3.17.dist-info}/WHEEL
File without changes