sibi-dst 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to their respective public registries. It is provided for informational purposes only.
@@ -1,23 +1,30 @@
+ from typing import Dict
+
  import dask.dataframe as dd
- from sqlmodel import Session, select
- from typing import Any, Dict, Optional
- import logging
  import pandas as pd
+ from sqlalchemy.inspection import inspect
+ from sqlalchemy.orm import sessionmaker
+ from sqlalchemy import select
+ #from sqlmodel import Session, select
+
+ from sibi_dst.df_helper.core import ParamsConfig, QueryConfig, sqlalchemy_field_conversion_map_dask, \
+     normalize_sqlalchemy_type
  from sibi_dst.utils import Logger
- from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
- from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
+ from ._io_sqlalchemy_dask import SQLAlchemyDask
  from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
+ from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
+

  class SqlAlchemyLoadFromDb:
      df: dd.DataFrame

      def __init__(
-             self,
-             plugin_sqlalchemy: SqlAlchemyConnectionConfig,  # Expected to be an instance of SqlAlchemyConnection
-             plugin_query: QueryConfig = None,
-             plugin_params: ParamsConfig = None,
-             logger: Logger = None,
-             **kwargs,
+             self,
+             plugin_sqlalchemy: SqlAlchemyConnectionConfig,  # Expected to be an instance of SqlAlchemyConnection
+             plugin_query: QueryConfig = None,
+             plugin_params: ParamsConfig = None,
+             logger: Logger = None,
+             **kwargs,
      ):
          """
          Initialize the loader with database connection, query, and parameters.
@@ -27,7 +34,7 @@ class SqlAlchemyLoadFromDb:
          self.model = self.db_connection.model
          self.engine = self.db_connection.engine
          self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-         self.query_config = plugin_query
+         self.query_config = plugin_query
          self.params_config = plugin_params
          self.debug = kwargs.pop("debug", False)
          self.verbose_debug = kwargs.pop("verbose_debug", False)
@@ -40,44 +47,13 @@ class SqlAlchemyLoadFromDb:
          return self.df

      def _build_and_load(self) -> dd.DataFrame:
-         """
-         Query the database and load results into a Dask DataFrame.
-         """
-         with Session(self.engine) as session:
-             try:
-                 query = select(self.model)
-                 filters = self.params_config.filters
-                 if filters:
-                     n_records = 0
-                     query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(query, self.model,self.params_config.filters)
-                 else:
-                     n_records = self.query_config.n_records or 100
-
-                 if n_records:
-                     query = query.limit(n_records)
-
-                 # Debug: Log the SQL query
-                 if self.debug:
-                     self.logger.info(f"Executing query: {str(query)}")
-
-                 # Execute the query
-                 try:
-                     results = session.exec(query).fetchall()
-                     if results:
-                         records = [
-                             {key: getattr(result, key) for key in result.__table__.columns.keys()}
-                             for result in results
-                         ]
-                         df = dd.from_pandas(pd.DataFrame(records), npartitions=1)
-                 except Exception as e:
-                     self.logger.info(results)
-                     self.logger.warning("Query returned no results.")
-                     df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-             except Exception as e:
-                 print(query)
-                 self.logger.error(f"Error loading data: {e}")
-                 df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
+         try:
+             reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, verbose=self.debug)
+             df = reader.read_frame()
+             if df is None or len(df.index) == 0:
+                 self.logger.warning("Query returned no results.")
+                 return dd.from_pandas(pd.DataFrame(), npartitions=1)
              return df
-
+         except Exception as e:
+             self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
+             return dd.from_pandas(pd.DataFrame(), npartitions=1)
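
The hunk above replaces a single in-session session.exec(query).fetchall() with a dedicated SQLAlchemyDask reader that is handed the engine URL and a chunk size, so rows are read in bounded chunks rather than materialized as one list of ORM objects. SQLAlchemyDask itself ships in ._io_sqlalchemy_dask and is not part of this diff; the sketch below only illustrates the general chunked-read pattern its call site implies, using pd.read_sql and dask.delayed rather than the package's actual internals.

    # Sketch only: the chunked-read pattern implied by the new call site,
    # not the package's real SQLAlchemyDask implementation.
    import dask
    import dask.dataframe as dd
    import pandas as pd
    from sqlalchemy import create_engine

    def read_sql_chunked(sql: str, engine_url: str, chunk_size: int = 1000) -> dd.DataFrame:
        engine = create_engine(engine_url)
        # pd.read_sql with chunksize yields one pandas DataFrame per chunk
        parts = [dask.delayed(chunk) for chunk in pd.read_sql(sql, engine, chunksize=chunk_size)]
        if not parts:
            return dd.from_pandas(pd.DataFrame(), npartitions=1)
        return dd.from_delayed(parts)

Note that the list comprehension drains the chunk iterator up front, so this variant partitions the result rather than deferring the read; it is meant only to show how chunked SQL results map onto Dask partitions.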
@@ -1,7 +1,7 @@
  from __future__ import annotations
  from ._credentials import ConfigManager, ConfigLoader
  from ._log_utils import Logger
- from ._date_utils import DateUtils
+ from ._date_utils import DateUtils, BusinessDays
  from ._data_utils import DataUtils
  from ._file_utils import FileUtils
  from ._filepath_generator import FilePathGenerator
@@ -17,6 +17,7 @@ __all__=[
      "ConfigLoader",
      "Logger",
      "DateUtils",
+     "BusinessDays",
      "FileUtils",
      "DataWrapper",
      "DataUtils",
@@ -7,13 +7,35 @@ class DataUtils:
      def __init__(self, logger=None):
          self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

+     def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
+         if not columns:
+             self.logger.warning('No columns specified')
+
+         columns = [column for column in columns if column in df.columns]
+         for col in columns:
+             if isinstance(df, dd.DataFrame):
+                 # Replace NaN with 0, then convert to boolean
+                 df[col] = df[col].map_partitions(
+                     lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                     .fillna(fill_value)  # Replace NaN with 0
+                     .astype(dtype),
+                     meta=(col, dtype)
+                 )
+             else:
+                 # For Pandas DataFrame, handle mixed types and invalid values
+                 df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
+                 df[col] = df[col].fillna(fill_value).astype(dtype)
+
+         return df
+
      @staticmethod
-     def transform_numeric_columns(df, fill_value=0, transform_func=None):
+     def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
          """
          Transform numeric columns in a DataFrame (Pandas or Dask), handling missing values and applying optional transformations.

          Parameters:
          - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
+         - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
          - fill_value (int or float): The value to replace NA values with.
          - transform_func (callable, optional): The transformation function to apply.
            If None, no additional transformation is applied.
@@ -21,68 +43,64 @@ class DataUtils:
          Returns:
          - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed numeric columns.
          """
-         # Detect numeric columns
-         numeric_columns = df.select_dtypes(include=['number']).columns.tolist()
+         if columns is None:
+             # Detect numeric columns
+             columns = df.select_dtypes(include=['number']).columns.tolist()

-         if not numeric_columns:
+         if not columns:
              return df

+         columns = [column for column in columns if column in df.columns]
          # Default transformation function (identity) if none is provided
          if transform_func is None:
              transform_func = lambda x: x

-         # Apply transformations
-         for col in numeric_columns:
-             dtype = df[col].dtype
-             if pd.api.types.is_integer_dtype(dtype):
-                 meta_type = 'int64'
-             elif pd.api.types.is_float_dtype(dtype):
-                 meta_type = 'float64'
-             else:
-                 continue  # Skip non-numeric columns
+         # Batch processing for Dask
+         if isinstance(df, dd.DataFrame):
+             def transform_partition(partition):
+                 # Apply transformations for all numeric columns in a single pass
+                 partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
+                 return partition
+
+             # Apply the transformation function to all specified columns
+             df = df.map_partitions(transform_partition, meta=df)
+         else:
+             # Pandas: Vectorized operations for all specified columns
+             df[columns] = df[columns].fillna(fill_value).map(transform_func)

-             df[col] = df[col].fillna(fill_value).astype(meta_type)
-             if isinstance(df, dd.DataFrame):
-                 df[col] = df[col].map_partitions(
-                     lambda s: s.apply(transform_func), meta=(col, meta_type)
-                 )
-             else:
-                 df[col] = df[col].apply(transform_func)
          return df

      @staticmethod
-     def transform_boolean_columns(df, threshold=1):
+     def transform_boolean_columns(df, columns=None):
          """
-         Transform boolean-like columns in a DataFrame (Pandas or Dask) to actual booleans.
+         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
+         and convert them to boolean. Detection is performed using a sample.

          Parameters:
          - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-         - threshold (int or float): The value to evaluate as `True`.
+         - columns (list of str): List of columns to check and transform.
+         - sample_size (int): Number of rows to sample for detection. Ignored for Pandas DataFrames.

          Returns:
          - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed boolean columns.
          """
+         # Apply transformation to each specified column
+         for col in columns:
+             if col in df.columns:
+                 if isinstance(df, dd.DataFrame):
+                     # Replace NaN with 0, then convert to boolean
+                     df[col] = df[col].map_partitions(
+                         lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                         .fillna(0)  # Replace NaN with 0
+                         .astype(int)  # Ensure integer type
+                         .astype(bool),  # Convert to boolean
+                         meta=(col, 'bool')
+                     )
+                 else:
+                     # For Pandas DataFrame, handle mixed types and invalid values
+                     df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
+                     df[col] = df[col].fillna(0).astype(int).astype(bool)

-         def is_boolean_like(col):
-             """
-             Check if a column is boolean-like (contains only two unique values).
-             """
-             unique_values = col.dropna().unique()
-             if isinstance(col, dd.Series):
-                 unique_values = unique_values.compute()
-             return len(unique_values) <= 2 and set(unique_values).issubset({0, 1, True, False})
-
-         # Detect boolean-like columns
-         boolean_columns = [col for col in df.columns if is_boolean_like(df[col])]
-
-         # Apply transformation to each detected column
-         for col in boolean_columns:
-             if isinstance(df, dd.DataFrame):
-                 df[col] = df[col].map_partitions(
-                     lambda s: s == threshold, meta=(col, 'bool')
-                 )
-             else:
-                 df[col] = df[col] == threshold
          return df

      def merge_lookup_data(self, classname, df, **kwargs):
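
Taken together, the new transform_numeric_cols and the columns parameter on transform_boolean_columns let callers coerce known columns explicitly instead of relying on dtype or unique-value detection. A small usage sketch against the signatures above (the sample data is illustrative only):

    import pandas as pd
    from sibi_dst.utils import DataUtils

    df = pd.DataFrame({
        "qty": ["3", None, "bad"],   # mixed strings: coerced to 3, 0, 0
        "active": [1, 0, None],      # 0/1 plus NaN: coerced to True, False, False
    })

    utils = DataUtils()
    df = utils.transform_numeric_cols(df, columns=["qty"], fill_value=0, dtype=int)
    df = DataUtils.transform_boolean_columns(df, columns=["active"])
    print(df.dtypes)  # qty int64, active bool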
@@ -97,73 +115,67 @@ class DataUtils:
          Returns:
          - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
          """
-         # Check if the DataFrame is empty
+         # Return early if the DataFrame is empty
          if self.is_dataframe_empty(df):
              return df

-         # Extract required parameters with default values
-         source_col = kwargs.pop('source_col', None)
-         lookup_col = kwargs.pop('lookup_col', None)
-         lookup_description_col = kwargs.pop('lookup_description_col', None)
-         source_description_alias = kwargs.pop('source_description_alias', None)
-         fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
-         fieldnames = kwargs.get('fieldnames', None)
-         column_names = kwargs.get('column_names', None)
+         # Extract and validate required parameters
+         required_params = ['source_col', 'lookup_col', 'lookup_description_col', 'source_description_alias']
+         missing_params = [param for param in required_params if param not in kwargs]
+         if missing_params:
+             raise ValueError(f"Missing required parameters: {', '.join(missing_params)}")
+
+         source_col = kwargs.pop('source_col')
+         lookup_col = kwargs.pop('lookup_col')
+         lookup_description_col = kwargs.pop('lookup_description_col')
+         source_description_alias = kwargs.pop('source_description_alias')

-         # Validate required parameters
-         if not all([source_col, lookup_col, lookup_description_col, source_description_alias]):
-             raise ValueError(
-                 'source_col, lookup_col, lookup_description_col, and source_description_alias must be specified'
-             )
+         # Optional parameters with default values
+         fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
+         fieldnames = kwargs.pop('fieldnames', (lookup_col, lookup_description_col))
+         column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])

          if source_col not in df.columns:
-             self.logger.info(f'{source_col} not in DataFrame columns')
+             self.logger.info(f"{source_col} not in DataFrame columns")
              return df

          # Get unique IDs from source column
          ids = df[source_col].dropna().unique()
          if isinstance(ids, dd.Series):
              ids = ids.compute()
-         ids = ids.tolist()
-
-         if not ids:
-             self.logger.info(f'No IDs found in the source column: {source_col}')
+         if not len(ids):
+             self.logger.info(f"No IDs found in the source column: {source_col}")
              return df
-
-         # Set default fieldnames and column_names if not provided
-         if fieldnames is None:
-             kwargs['fieldnames'] = (lookup_col, lookup_description_col)
-         if column_names is None:
-             kwargs['column_names'] = ['temp_join_col', source_description_alias]
-
+         ids = sorted(ids.tolist())
          # Prepare kwargs for loading lookup data
          load_kwargs = kwargs.copy()
-         load_kwargs[f'{lookup_col}__in'] = ids
-
+         load_kwargs.update({
+             'fieldnames': fieldnames,
+             'column_names': column_names,
+             f'{lookup_col}__in': ids
+         })
          # Load lookup data
-         lookup_instance = classname()
+         lookup_instance = classname(debug=True, verbose_debug=True)
          result = lookup_instance.load(**load_kwargs)
-
+         if len(result.index) == 0:
+             self.logger.info(f"No IDs found in the source column: {source_col}")
+             return df
          # Determine the join column on the result DataFrame
-         if 'temp_join_col' in kwargs.get("column_names", []):
-             temp_join_col = 'temp_join_col'
-         else:
-             temp_join_col = lookup_col
+         temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col

          # Merge DataFrames
          df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)

          if fillna_source_description_alias and source_description_alias in df.columns:
-             df[source_description_alias] = df[source_description_alias].fillna('')
+             df[source_description_alias]=df[source_description_alias].fillna('')

          # Drop temp_join_col if present
-         if 'temp_join_col' in df.columns:
-             df = df.drop(columns='temp_join_col')
+         df = df.drop(columns='temp_join_col', errors='ignore')

          return df

-     @staticmethod
-     def is_dataframe_empty(df):
+
+     def is_dataframe_empty(self, df):
          """
          Check if a DataFrame (Pandas or Dask) is empty.
 
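The merge_lookup_data rework above now fails fast on missing required kwargs, applies the fieldnames/column_names defaults before calling the loader, and returns the input unchanged when the lookup comes back empty. A runnable sketch with a stub standing in for a real sibi-dst loader class (the stub and sample data are illustrative only):

    import pandas as pd
    from sibi_dst.utils import DataUtils

    class FakeLookup:
        # Stub: a real loader would query a backend using the fieldnames,
        # column_names and <lookup_col>__in kwargs that merge_lookup_data passes in.
        def __init__(self, **kwargs):
            pass

        def load(self, **kwargs):
            return pd.DataFrame({"temp_join_col": [1, 2],
                                 "customer_name": ["Ann", "Bo"]})

    df = pd.DataFrame({"customer_id": [1, 2, 3]})
    df = DataUtils().merge_lookup_data(
        FakeLookup, df,
        source_col="customer_id",
        lookup_col="id",
        lookup_description_col="name",
        source_description_alias="customer_name",
        fillna_source_description_alias=True,  # unmatched rows get '' instead of NaN
    )
    # after the left merge, the customer_id=3 row carries '' in customer_name
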
@@ -174,14 +186,30 @@ class DataUtils:
          - bool: True if the DataFrame is empty, False otherwise.
          """
          if isinstance(df, dd.DataFrame):
-             df_size = df.map_partitions(len).sum().compute()
-             return df_size == 0
-         else:
+             try:
+                 return len(df.index) == 0
+             except Exception as e:
+                 self.logger.error(f"Error while processing Dask DataFrame: {e}")
+                 return False
+         elif isinstance(df, pd.DataFrame):
              return df.empty
+         else:
+             self.logger.error("Input must be a pandas or dask DataFrame.")
+             return False

      @staticmethod
-     def convert_to_datetime(df, date_fields):
+     def convert_to_datetime_dask(df, date_fields):
+         """
+         Convert specified columns in a Dask DataFrame to datetime, handling errors gracefully.
+
+         Parameters:
+         - df (dask.dataframe.DataFrame): The Dask DataFrame containing the columns.
+         - date_fields (list of str): List of column names to convert to datetime.
+
+         Returns:
+         - dask.dataframe.DataFrame: Updated DataFrame with specified columns converted to datetime.
+         """
          for col in date_fields:
              if col in df.columns:
-                 df[col] = pd.to_datetime(df[col], errors='coerce')
-             return df
+                 df[col] = df[col].map_partitions(pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]"))
+         return df
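
convert_to_datetime has been renamed to convert_to_datetime_dask and now converts per partition instead of calling pd.to_datetime on the Dask series directly. A quick check against the new signature (the sample data is illustrative only):

    import pandas as pd
    import dask.dataframe as dd
    from sibi_dst.utils import DataUtils

    pdf = pd.DataFrame({"created": ["2024-01-05", "not a date", None]})
    ddf = dd.from_pandas(pdf, npartitions=2)
    ddf = DataUtils.convert_to_datetime_dask(ddf, ["created"])
    print(ddf["created"].compute())  # invalid and missing values become NaT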