sibi-dst 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +9 -2
- sibi_dst/df_helper/core/__init__.py +9 -2
- sibi_dst/df_helper/core/_defaults.py +99 -3
- sibi_dst/df_helper/plugins/django/_django_load_from_db.py +60 -9
- sibi_dst/df_helper/plugins/http/_http_config.py +2 -3
- sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py +4 -0
- sibi_dst/df_helper/plugins/parquet/_parquet_options.py +2 -0
- sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +118 -128
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +2 -0
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +28 -52
- sibi_dst/utils/__init__.py +2 -1
- sibi_dst/utils/_data_utils.py +116 -88
- sibi_dst/utils/_data_wrapper.py +2 -320
- sibi_dst/utils/_date_utils.py +130 -0
- sibi_dst/utils/_df_utils.py +91 -0
- {sibi_dst-0.3.10.dist-info → sibi_dst-0.3.12.dist-info}/METADATA +5 -2
- {sibi_dst-0.3.10.dist-info → sibi_dst-0.3.12.dist-info}/RECORD +18 -18
- {sibi_dst-0.3.10.dist-info → sibi_dst-0.3.12.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py
CHANGED
@@ -1,23 +1,30 @@
+from typing import Dict
+
 import dask.dataframe as dd
-from sqlmodel import Session, select
-from typing import Any, Dict, Optional
-import logging
 import pandas as pd
+from sqlalchemy.inspection import inspect
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import select
+#from sqlmodel import Session, select
+
+from sibi_dst.df_helper.core import ParamsConfig, QueryConfig, sqlalchemy_field_conversion_map_dask, \
+    normalize_sqlalchemy_type
 from sibi_dst.utils import Logger
-from .
-from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
+from ._io_sqlalchemy_dask import SQLAlchemyDask
 from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
+from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
+


 class SqlAlchemyLoadFromDb:
     df: dd.DataFrame

     def __init__(
-
-
-
-
-
-
+            self,
+            plugin_sqlalchemy: SqlAlchemyConnectionConfig,  # Expected to be an instance of SqlAlchemyConnection
+            plugin_query: QueryConfig = None,
+            plugin_params: ParamsConfig = None,
+            logger: Logger = None,
+            **kwargs,
     ):
         """
         Initialize the loader with database connection, query, and parameters.
@@ -27,7 +34,7 @@ class SqlAlchemyLoadFromDb:
         self.model = self.db_connection.model
         self.engine = self.db_connection.engine
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.query_config =
+        self.query_config = plugin_query
         self.params_config = plugin_params
         self.debug = kwargs.pop("debug", False)
         self.verbose_debug = kwargs.pop("verbose_debug", False)
@@ -40,44 +47,13 @@ class SqlAlchemyLoadFromDb:
         return self.df

     def _build_and_load(self) -> dd.DataFrame:
-
-
-
-
-
-
-        filters = self.params_config.filters
-        if filters:
-            n_records = 0
-            query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(query, self.model,self.params_config.filters)
-        else:
-            n_records = self.query_config.n_records or 100
-
-        if n_records:
-            query = query.limit(n_records)
-
-        # Debug: Log the SQL query
-        if self.debug:
-            self.logger.info(f"Executing query: {str(query)}")
-
-        # Execute the query
-        try:
-            results = session.exec(query).fetchall()
-            if results:
-                records = [
-                    {key: getattr(result, key) for key in result.__table__.columns.keys()}
-                    for result in results
-                ]
-                df = dd.from_pandas(pd.DataFrame(records), npartitions=1)
-            except Exception as e:
-                self.logger.info(results)
-                self.logger.warning("Query returned no results.")
-                df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-        except Exception as e:
-            print(query)
-            self.logger.error(f"Error loading data: {e}")
-            df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
+        try:
+            reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, verbose=self.debug)
+            df = reader.read_frame()
+            if df is None or len(df.index) == 0:
+                self.logger.warning("Query returned no results.")
+                return dd.from_pandas(pd.DataFrame(), npartitions=1)
             return df
-
+        except Exception as e:
+            self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
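
In this release, _build_and_load no longer builds and executes a sqlmodel query inline; it delegates chunked reading to the new SQLAlchemyDask reader and falls back to an empty single-partition Dask DataFrame when the query yields nothing or raises. The sketch below illustrates only that fallback pattern; load_or_empty and the demo reader are hypothetical names, not part of sibi_dst.

    import logging

    import dask.dataframe as dd
    import pandas as pd


    def load_or_empty(read_frame, logger):
        """Fallback pattern mirrored from _build_and_load: call a reader and
        return an empty single-partition Dask DataFrame on no data or error."""
        try:
            df = read_frame()  # stands in for SQLAlchemyDask(...).read_frame()
            if df is None or len(df.index) == 0:
                logger.warning("Query returned no results.")
                return dd.from_pandas(pd.DataFrame(), npartitions=1)
            return df
        except Exception as exc:
            logger.error(f"Failed to load data into Dask DataFrame. {exc}")
            return dd.from_pandas(pd.DataFrame(), npartitions=1)


    if __name__ == "__main__":
        logger = logging.getLogger("demo")
        # Simulate a reader that returns a small Dask DataFrame.
        demo = load_or_empty(
            lambda: dd.from_pandas(pd.DataFrame({"id": [1, 2, 3]}), npartitions=1),
            logger,
        )
        print(len(demo.index))  # 3
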
sibi_dst/utils/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from ._credentials import ConfigManager, ConfigLoader
 from ._log_utils import Logger
-from ._date_utils import DateUtils
+from ._date_utils import DateUtils, BusinessDays
 from ._data_utils import DataUtils
 from ._file_utils import FileUtils
 from ._filepath_generator import FilePathGenerator
@@ -17,6 +17,7 @@ __all__=[
     "ConfigLoader",
     "Logger",
     "DateUtils",
+    "BusinessDays",
     "FileUtils",
     "DataWrapper",
     "DataUtils",
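
The only functional change here is that BusinessDays (added to _date_utils.py in this release) is now re-exported from the package namespace alongside DateUtils. A minimal sketch of the new import path; how BusinessDays is constructed or used is defined in _date_utils.py, which is not shown in this diff.

    # Only the import path below is taken from the diff; BusinessDays' own
    # API lives in sibi_dst/utils/_date_utils.py and is not shown here.
    from sibi_dst.utils import BusinessDays, DateUtils

    print(BusinessDays, DateUtils)
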
sibi_dst/utils/_data_utils.py
CHANGED
@@ -7,13 +7,35 @@ class DataUtils:
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

+    def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
+        if not columns:
+            self.logger.warning('No columns specified')
+
+        columns = [column for column in columns if column in df.columns]
+        for col in columns:
+            if isinstance(df, dd.DataFrame):
+                # Replace NaN with 0, then convert to boolean
+                df[col] = df[col].map_partitions(
+                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                    .fillna(fill_value)  # Replace NaN with 0
+                    .astype(dtype),
+                    meta=(col, dtype)
+                )
+            else:
+                # For Pandas DataFrame, handle mixed types and invalid values
+                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
+                df[col] = df[col].fillna(fill_value).astype(dtype)
+
+        return df
+
     @staticmethod
-    def transform_numeric_columns(df, fill_value=0, transform_func=None):
+    def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
         """
         Transform numeric columns in a DataFrame (Pandas or Dask), handling missing values and applying optional transformations.

         Parameters:
         - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
+        - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
         - fill_value (int or float): The value to replace NA values with.
         - transform_func (callable, optional): The transformation function to apply.
           If None, no additional transformation is applied.
@@ -21,68 +43,64 @@ class DataUtils:
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed numeric columns.
         """
-
-
+        if columns is None:
+            # Detect numeric columns
+            columns = df.select_dtypes(include=['number']).columns.tolist()

-        if not
+        if not columns:
             return df

+        columns = [column for column in columns if column in df.columns]
         # Default transformation function (identity) if none is provided
         if transform_func is None:
             transform_func = lambda x: x

-        #
-
-
-
-
-
-
-
+        # Batch processing for Dask
+        if isinstance(df, dd.DataFrame):
+            def transform_partition(partition):
+                # Apply transformations for all numeric columns in a single pass
+                partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
+                return partition
+
+            # Apply the transformation function to all specified columns
+            df = df.map_partitions(transform_partition, meta=df)
+        else:
+            # Pandas: Vectorized operations for all specified columns
+            df[columns] = df[columns].fillna(fill_value).map(transform_func)

-            df[col] = df[col].fillna(fill_value).astype(meta_type)
-            if isinstance(df, dd.DataFrame):
-                df[col] = df[col].map_partitions(
-                    lambda s: s.apply(transform_func), meta=(col, meta_type)
-                )
-            else:
-                df[col] = df[col].apply(transform_func)
         return df

     @staticmethod
-    def transform_boolean_columns(df,
+    def transform_boolean_columns(df, columns=None):
         """
-
+        Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
+        and convert them to boolean. Detection is performed using a sample.

         Parameters:
         - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-        -
+        - columns (list of str): List of columns to check and transform.
+        - sample_size (int): Number of rows to sample for detection. Ignored for Pandas DataFrames.

         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed boolean columns.
         """
+        # Apply transformation to each specified column
+        for col in columns:
+            if col in df.columns:
+                if isinstance(df, dd.DataFrame):
+                    # Replace NaN with 0, then convert to boolean
+                    df[col] = df[col].map_partitions(
+                        lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                        .fillna(0)  # Replace NaN with 0
+                        .astype(int)  # Ensure integer type
+                        .astype(bool),  # Convert to boolean
+                        meta=(col, 'bool')
+                    )
+                else:
+                    # For Pandas DataFrame, handle mixed types and invalid values
+                    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
+                    df[col] = df[col].fillna(0).astype(int).astype(bool)

-        def is_boolean_like(col):
-            """
-            Check if a column is boolean-like (contains only two unique values).
-            """
-            unique_values = col.dropna().unique()
-            if isinstance(col, dd.Series):
-                unique_values = unique_values.compute()
-            return len(unique_values) <= 2 and set(unique_values).issubset({0, 1, True, False})
-
-        # Detect boolean-like columns
-        boolean_columns = [col for col in df.columns if is_boolean_like(df[col])]
-
-        # Apply transformation to each detected column
-        for col in boolean_columns:
-            if isinstance(df, dd.DataFrame):
-                df[col] = df[col].map_partitions(
-                    lambda s: s == threshold, meta=(col, 'bool')
-                )
-            else:
-                df[col] = df[col] == threshold
         return df

     def merge_lookup_data(self, classname, df, **kwargs):
@@ -97,73 +115,67 @@ class DataUtils:
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
         """
-        #
+        # Return early if the DataFrame is empty
         if self.is_dataframe_empty(df):
             return df

-        # Extract required parameters
-
-
-
-
-
-
-
+        # Extract and validate required parameters
+        required_params = ['source_col', 'lookup_col', 'lookup_description_col', 'source_description_alias']
+        missing_params = [param for param in required_params if param not in kwargs]
+        if missing_params:
+            raise ValueError(f"Missing required parameters: {', '.join(missing_params)}")
+
+        source_col = kwargs.pop('source_col')
+        lookup_col = kwargs.pop('lookup_col')
+        lookup_description_col = kwargs.pop('lookup_description_col')
+        source_description_alias = kwargs.pop('source_description_alias')

-        #
-
-
-
-        )
+        # Optional parameters with default values
+        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
+        fieldnames = kwargs.pop('fieldnames', (lookup_col, lookup_description_col))
+        column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])

         if source_col not in df.columns:
-            self.logger.info(f
+            self.logger.info(f"{source_col} not in DataFrame columns")
             return df

         # Get unique IDs from source column
         ids = df[source_col].dropna().unique()
         if isinstance(ids, dd.Series):
             ids = ids.compute()
-
-
-        if not ids:
-            self.logger.info(f'No IDs found in the source column: {source_col}')
+        if not len(ids):
+            self.logger.info(f"No IDs found in the source column: {source_col}")
             return df
-
-        # Set default fieldnames and column_names if not provided
-        if fieldnames is None:
-            kwargs['fieldnames'] = (lookup_col, lookup_description_col)
-        if column_names is None:
-            kwargs['column_names'] = ['temp_join_col', source_description_alias]
-
+        ids = sorted(ids.tolist())
         # Prepare kwargs for loading lookup data
         load_kwargs = kwargs.copy()
-        load_kwargs
-
+        load_kwargs.update({
+            'fieldnames': fieldnames,
+            'column_names': column_names,
+            f'{lookup_col}__in': ids
+        })
         # Load lookup data
-        lookup_instance = classname()
+        lookup_instance = classname(debug=True, verbose_debug=True)
         result = lookup_instance.load(**load_kwargs)
-
+        if len(result.index) == 0:
+            self.logger.info(f"No IDs found in the source column: {source_col}")
+            return df
         # Determine the join column on the result DataFrame
-        if 'temp_join_col' in
-            temp_join_col = 'temp_join_col'
-        else:
-            temp_join_col = lookup_col
+        temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col

         # Merge DataFrames
         df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)

         if fillna_source_description_alias and source_description_alias in df.columns:
-            df[source_description_alias]
+            df[source_description_alias]=df[source_description_alias].fillna('')

         # Drop temp_join_col if present
-
-        df = df.drop(columns='temp_join_col')
+        df = df.drop(columns='temp_join_col', errors='ignore')

         return df

-
-    def is_dataframe_empty(df):
+
+    def is_dataframe_empty(self, df):
         """
         Check if a DataFrame (Pandas or Dask) is empty.

@@ -174,14 +186,30 @@ class DataUtils:
         - bool: True if the DataFrame is empty, False otherwise.
         """
         if isinstance(df, dd.DataFrame):
-
-
-
+            try:
+                return len(df.index) == 0
+            except Exception as e:
+                self.logger.error(f"Error while processing Dask DataFrame: {e}")
+                return False
+        elif isinstance(df, pd.DataFrame):
             return df.empty
+        else:
+            self.logger.error("Input must be a pandas or dask DataFrame.")
+            return False

     @staticmethod
-    def
+    def convert_to_datetime_dask(df, date_fields):
+        """
+        Convert specified columns in a Dask DataFrame to datetime, handling errors gracefully.
+
+        Parameters:
+        - df (dask.dataframe.DataFrame): The Dask DataFrame containing the columns.
+        - date_fields (list of str): List of column names to convert to datetime.
+
+        Returns:
+        - dask.dataframe.DataFrame: Updated DataFrame with specified columns converted to datetime.
+        """
         for col in date_fields:
             if col in df.columns:
-                df[col] =
-        return df
+                df[col] = df[col].map_partitions(pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]"))
+            return df
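
Taken together, the DataUtils changes move numeric and boolean coercion from auto-detection to caller-supplied column lists and add a Dask-aware datetime converter. Below is a usage sketch against a toy Dask DataFrame, assuming the methods behave as the added code above suggests; the column names and sample data are made up.

    import dask.dataframe as dd
    import pandas as pd

    from sibi_dst.utils import DataUtils

    pdf = pd.DataFrame({
        "qty": ["1", "2", None, "x"],                          # mixed strings -> int
        "active": [1, 0, None, 1],                             # 0/1 flags -> bool
        "created": ["2024-01-01", None, "2024-02-15", "bad"],  # strings -> datetime
    })
    ddf = dd.from_pandas(pdf, npartitions=2)

    utils = DataUtils()
    ddf = utils.transform_numeric_cols(ddf, columns=["qty"], fill_value=0, dtype=int)
    ddf = DataUtils.transform_boolean_columns(ddf, columns=["active"])
    ddf = DataUtils.convert_to_datetime_dask(ddf, ["created"])

    print(ddf.compute().dtypes)  # qty: int64, active: bool, created: datetime64[ns]
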