sibi-dst 0.3.14__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
--- a/sibi_dst/df_helper/_df_helper.py
+++ b/sibi_dst/df_helper/_df_helper.py
@@ -4,13 +4,12 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
-import dask_expr
 import pandas as pd
 from pydantic import BaseModel
 
-from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
 from sibi_dst.utils import Logger
+from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from .plugins.django import *
 from .plugins.http import HttpConfig
 from .plugins.parquet import ParquetConfig, ParquetFilterHandler
@@ -19,6 +18,7 @@ from .plugins.sql_alchemy import *
 # Define a generic type variable for BaseModel subclasses
 T = TypeVar("T", bound=BaseModel)
 
+
 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
     plugin_django_connection: Optional[DjangoConnectionConfig] = None
@@ -40,13 +40,12 @@ class DfHelper:
         self.debug = kwargs.setdefault("debug", False)
         self.verbose_debug = kwargs.setdefault("verbose_debug", False)
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
-        self.dt_field=kwargs.setdefault("dt_field", None)
+        self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)
 
-
     def post_init(self, **kwargs):
         self.logger.info(f"Source used: {self.source}")
         self.plugin_query = self.__get_config(QueryConfig, kwargs)
@@ -59,7 +58,7 @@ class DfHelper:
         elif self.source == 'http':
             self.plugin_http = HttpConfig(**kwargs)
         elif self.source == 'sqlalchemy':
-            self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig,kwargs)
+            self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
 
     @staticmethod
     def __get_config(model: [T], kwargs: Dict[str, Any]) -> Union[T]:
@@ -100,7 +99,6 @@ class DfHelper:
             self.logger.info("Regular asyncio run...")
             return asyncio.run(self._load_from_http(**options))
 
-
     def _load_from_sqlalchemy(self, **options):
         try:
             options.setdefault("debug", self.debug)
@@ -139,7 +137,7 @@ class DfHelper:
             self.logger.info("Data successfully loaded from django database.")
         except Exception as e:
             self.logger.error(f"Failed to load data from django database: {e}")
-            self.df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
 
@@ -152,10 +150,9 @@ class DfHelper:
             self.df = await self.plugin_http.fetch_data(**options)
         except Exception as e:
             self.logger.error(f"Failed to load data from http plugin: {e}")
-            self.df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         return self.df
 
-
     def _post_process_df(self):
         """
         Efficiently process the DataFrame by filtering, renaming, and setting indices.
@@ -225,7 +222,7 @@ class DfHelper:
         if self.df.map_partitions(len).compute().sum() == 0:
             self.logger.info("Cannot write to clickhouse since Dataframe is empty")
             return
-        cs=ClickHouseWriter(logger=self.logger, **credentials)
+        cs = ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
         self.logger.info("Save to ClickHouse completed.")
 
@@ -299,7 +296,6 @@ class DfHelper:
             kwargs[f"{mapped_field}__date__lte"] = end
         return self.load(**kwargs)
 
-
     @staticmethod
     def parse_date(date_str: str) -> Union[datetime.datetime, datetime.date]:
         try:
--- a/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py
+++ b/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py
@@ -82,9 +82,12 @@ class SqlAlchemyModelBuilder:
             dict: Dictionary of column attributes.
         """
         columns = {}
+        reserved_names = ["metadata", "class_", "table"]
+
         for column in table.columns:
             column_name = self.normalize_column_name(column.name)
-            columns[column_name] = column
+            if column_name not in reserved_names:
+                columns[column_name] = column
         return columns
 
     def add_relationships(self, attrs, table: Table):
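
Note on the hunk above: SQLAlchemy's declarative machinery reserves certain attribute names on mapped classes, so a reflected column literally named `metadata` would collide with the `metadata` attribute of the declarative base. A minimal sketch of the failure mode the new filter avoids (the model and table names here are hypothetical):

    from sqlalchemy import Column, Integer, String
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()

    # Raises sqlalchemy.exc.InvalidRequestError: Attribute name 'metadata' is
    # reserved for the MetaData instance when using a declarative base class.
    class Document(Base):
        __tablename__ = "documents"            # hypothetical table
        id = Column(Integer, primary_key=True)
        metadata = Column(String)              # collides with Base.metadata

Skipping reserved names keeps the generated model importable; the trade-off is that such columns are silently dropped from the model.
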
--- a/sibi_dst/utils/_data_utils.py
+++ b/sibi_dst/utils/_data_utils.py
@@ -1,77 +1,32 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from sibi_dst.utils import Logger
 
+
 class DataUtils:
 
-    def __init__(self, logger=None):
+    def __init__(self, logger=None, **kwargs):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.debug = kwargs.get('debug', False)
 
     def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
         if not columns:
             self.logger.warning('No columns specified')
-
+        self.logger.debug(f'Dataframe type:{type(df)}')
         columns = [column for column in columns if column in df.columns]
         for col in columns:
-            if isinstance(df, dd.DataFrame):
-                # Replace NaN with 0, then convert to boolean
-                df[col] = df[col].map_partitions(
-                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
-                    .fillna(fill_value)  # Replace NaN with 0
-                    .astype(dtype),
-                    meta=(col, dtype)
-                )
-            else:
-                # For Pandas DataFrame, handle mixed types and invalid values
-                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
-                df[col] = df[col].fillna(fill_value).astype(dtype)
+            # Coerce to numeric, replace NaN with fill_value, then cast to dtype
+            df[col] = df[col].map_partitions(
+                lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                .fillna(fill_value)  # Replace NaN with fill_value
+                .astype(dtype),
+                meta=(col, dtype)
+            )
 
         return df
 
-    @staticmethod
-    def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
-        """
-        Transform numeric columns in a DataFrame (Pandas or Dask), handling missing values and applying optional transformations.
-
-        Parameters:
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-        - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
-        - fill_value (int or float): The value to replace NA values with.
-        - transform_func (callable, optional): The transformation function to apply.
-          If None, no additional transformation is applied.
-
-        Returns:
-        - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed numeric columns.
-        """
-        if columns is None:
-            # Detect numeric columns
-            columns = df.select_dtypes(include=['number']).columns.tolist()
-
-        if not columns:
-            return df
-
-        columns = [column for column in columns if column in df.columns]
-        # Default transformation function (identity) if none is provided
-        if transform_func is None:
-            transform_func = lambda x: x
-
-        # Batch processing for Dask
-        if isinstance(df, dd.DataFrame):
-            def transform_partition(partition):
-                # Apply transformations for all numeric columns in a single pass
-                partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
-                return partition
-
-            # Apply the transformation function to all specified columns
-            df = df.map_partitions(transform_partition, meta=df)
-        else:
-            # Pandas: Vectorized operations for all specified columns
-            df[columns] = df[columns].fillna(fill_value).map(transform_func)
-
-        return df
-
-    @staticmethod
-    def transform_boolean_columns(df, columns=None):
+    def transform_boolean_columns(self, df, columns=None):
         """
         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
         and convert them to boolean. Detection is performed using a sample.
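
Note on the rewrite above: the pandas branch was removed, so `transform_numeric_cols` (and `transform_boolean_columns` below) now call `.map_partitions(...)` unconditionally and therefore assume a Dask DataFrame; a plain pandas frame would fail with an AttributeError. A minimal usage sketch under that assumption (the `DataUtils` import path is assumed to be re-exported from `sibi_dst.utils`):

    import dask.dataframe as dd
    import pandas as pd

    from sibi_dst.utils import DataUtils  # assumed re-export

    pdf = pd.DataFrame({"qty": ["1", "2", None, "bad"]})
    ddf = dd.from_pandas(pdf, npartitions=2)   # wrap pandas input first

    utils = DataUtils(debug=True)
    ddf = utils.transform_numeric_cols(ddf, columns=["qty"])
    print(ddf.compute())                       # qty -> 1, 2, 0, 0
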
@@ -84,23 +39,20 @@ class DataUtils:
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed boolean columns.
         """
+
         # Apply transformation to each specified column
         for col in columns:
             if col in df.columns:
-                if isinstance(df, dd.DataFrame):
-                    # Replace NaN with 0, then convert to boolean
-                    df[col] = df[col].map_partitions(
-                        lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
-                        .fillna(0)  # Replace NaN with 0
-                        .astype(int)  # Ensure integer type
-                        .astype(bool),  # Convert to boolean
-                        meta=(col, 'bool')
-                    )
-                else:
-                    # For Pandas DataFrame, handle mixed types and invalid values
-                    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
-                    df[col] = df[col].fillna(0).astype(int).astype(bool)
-
+                # Replace NaN with 0, then convert to boolean
+                df[col] = df[col].map_partitions(
+                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                    .fillna(0)  # Replace NaN with 0
+                    .astype(int)  # Ensure integer type
+                    .astype(bool),  # Convert to boolean
+                    meta=(col, 'bool')
+                )
+        if self.debug:
+            self.logger.debug(f'Dataframe type:{type(df)}, boolean applied to columns: {columns}')
         return df
 
     def merge_lookup_data(self, classname, df, **kwargs):
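
The `meta=(col, 'bool')` argument in the hunk above tells Dask the name and dtype of the Series each partition will produce, so the task graph can be built without sampling the data. A standalone sketch of the same coerce-and-cast pattern:

    import dask.dataframe as dd
    import pandas as pd

    s = dd.from_pandas(pd.Series(["1", "0", None, "x"], name="flag"), npartitions=2)
    flags = s.map_partitions(
        lambda part: pd.to_numeric(part, errors="coerce")  # invalid -> NaN
        .fillna(0)
        .astype(int)
        .astype(bool),
        meta=("flag", "bool"),
    )
    print(flags.compute())  # True, False, False, False
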
@@ -141,12 +93,19 @@ class DataUtils:
 
         # Get unique IDs from source column
         ids = df[source_col].dropna().unique()
-        if isinstance(ids, dd.Series):
+        # Compute if it's a Dask Series
+        if isinstance(ids, dd.core.Series):
             ids = ids.compute()
+
+        # Check if any IDs are found
         if not len(ids):
             self.logger.info(f"No IDs found in the source column: {source_col}")
             return df
-        ids = sorted(ids.tolist())
+
+        # Convert to a list only if necessary and sort
+        if not isinstance(ids, list):
+            ids = ids.tolist()
+        ids = sorted(ids)
         # Prepare kwargs for loading lookup data
         load_kwargs = kwargs.copy()
         load_kwargs.update({
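
The hunk above materializes the distinct keys of a Dask column before they are used to filter the lookup load. A compact sketch of just that step (the column name and the `__in`-style filter key are illustrative, not the package's API):

    import dask.dataframe as dd
    import pandas as pd

    df = dd.from_pandas(pd.DataFrame({"customer_id": [3, 1, 3, None, 2]}), npartitions=2)

    ids = df["customer_id"].dropna().unique()
    if isinstance(ids, dd.Series):   # unique() on a Dask column is lazy
        ids = ids.compute()          # materialize before building the filter
    ids = sorted(ids.tolist())       # [1.0, 2.0, 3.0]

    load_kwargs = {"customer_id__in": ids}  # illustrative filter key
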
@@ -167,14 +126,13 @@ class DataUtils:
         df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)
 
         if fillna_source_description_alias and source_description_alias in df.columns:
-            df[source_description_alias]=df[source_description_alias].fillna('')
+            df[source_description_alias] = df[source_description_alias].fillna('')
 
         # Drop temp_join_col if present
         df = df.drop(columns='temp_join_col', errors='ignore')
 
         return df
 
-
     def is_dataframe_empty(self, df):
         """
         Check if a DataFrame (Pandas or Dask) is empty.
--- a/sibi_dst-0.3.14.dist-info/METADATA
+++ b/sibi_dst-0.3.15.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.14
+Version: 0.3.15
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
--- a/sibi_dst-0.3.14.dist-info/RECORD
+++ b/sibi_dst-0.3.15.dist-info/RECORD
@@ -1,6 +1,6 @@
 sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
 sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
-sibi_dst/df_helper/_df_helper.py,sha256=iYG8uL1ILrBvjtH8oiSwbPHnlDsJLlHtSghDDlt7T-w,13365
+sibi_dst/df_helper/_df_helper.py,sha256=ZWhPj9K5q_amJ7eBOrvwAvncxRnI-baveKWWQWfyND8,13354
 sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
 sibi_dst/df_helper/core/__init__.py,sha256=NSYY_evzq6XEkO06Nz6xLH5KznzRGI44cLbrnN3zHXQ,503
 sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
@@ -23,7 +23,7 @@ sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=1WQ390XBFWO
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=H8ypUjLKzYYl9BerfJjX_Uv9qBVkBR-wZiQlh3uRQXg,4669
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=balWGKun0NKIfhLZW-_DCOhKuTzTo_C2NwZoKFwuSJo,2329
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=vrTTeFcrf8sFpzqLlQfspjduKuR7Wgn7mDPUASsQs6s,4355
+sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=rzzZdcRB5TS9uJ3ZIGQiNf04e3u2akqJEsoGCuyPE3c,4467
 sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=bLD4tEcGDKkJCfSO4b13_89tzVJcpz55I6uw9D4ERnE,3751
@@ -31,7 +31,7 @@ sibi_dst/utils/__init__.py,sha256=nkX7tASNn57kw998YdqQQGY8qXv2J4LC4-g0GoQSiic,83
 sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
 sibi_dst/utils/_clickhouse_writer.py,sha256=kNBQeDn3D4Javrz5L8uU_5itf8Mrvm9l29uxcmcKlbg,8555
 sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
-sibi_dst/utils/_data_utils.py,sha256=BvmjMNSkICy671BmjW68RhvDMfN5uAXwhffSV-wEwmk,9185
+sibi_dst/utils/_data_utils.py,sha256=uw0SW9G4GrvTX4IdUd8fmsMTMEG5aXOFcWOv4Au3H5g,7016
 sibi_dst/utils/_data_wrapper.py,sha256=SmNv1UoZLq7ovRVy4wipsWLMidKJXcRTp4HtxmaCQdk,9399
 sibi_dst/utils/_date_utils.py,sha256=KYB07puKDrSG8tOm_i1HGX0TjLNUtSWjwfsCYBmW9co,10619
 sibi_dst/utils/_df_utils.py,sha256=9_dNYoZ9_ofU0t_sxMdsXALWCuh02gvqUrei-6Lhr6w,10910
@@ -40,6 +40,6 @@ sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixW
 sibi_dst/utils/_log_utils.py,sha256=AAenyubYUjk77WqiaNkjgkxws3dnAMIdaGl2Ryz_cA4,2245
 sibi_dst/utils/_parquet_saver.py,sha256=-A0o_vucyYe7wlwiby_0_yS-ZfT2GHwImyQHrCIBNwk,9051
 sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
-sibi_dst-0.3.14.dist-info/METADATA,sha256=ysmNqT8NnhY_VlPmrQ2U3FnXWFEIvfwFRi8uSGRP6g0,2090
-sibi_dst-0.3.14.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-sibi_dst-0.3.14.dist-info/RECORD,,
+sibi_dst-0.3.15.dist-info/METADATA,sha256=0XU32Bgt1RYV7Y12lmDxq_YmHaXya5d2qMYfYP8Yic0,2090
+sibi_dst-0.3.15.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sibi_dst-0.3.15.dist-info/RECORD,,