sibi-dst 0.3.14__py3-none-any.whl → 0.3.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +7 -11
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +4 -1
- sibi_dst/utils/_data_utils.py +35 -77
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.15.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.15.dist-info}/RECORD +6 -6
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.15.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED

@@ -4,13 +4,12 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
-import dask_expr
 import pandas as pd
 from pydantic import BaseModel
 
-from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
 from sibi_dst.utils import Logger
+from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from .plugins.django import *
 from .plugins.http import HttpConfig
 from .plugins.parquet import ParquetConfig, ParquetFilterHandler
@@ -19,6 +18,7 @@ from .plugins.sql_alchemy import *
 # Define a generic type variable for BaseModel subclasses
 T = TypeVar("T", bound=BaseModel)
 
+
 class DfHelper:
     df: Union[dd.DataFrame, pd.DataFrame] = None
     plugin_django_connection: Optional[DjangoConnectionConfig] = None
@@ -40,13 +40,12 @@ class DfHelper:
         self.debug = kwargs.setdefault("debug", False)
         self.verbose_debug = kwargs.setdefault("verbose_debug", False)
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
-        self.dt_field=kwargs.setdefault("dt_field", None)
+        self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)
 
-
     def post_init(self, **kwargs):
         self.logger.info(f"Source used: {self.source}")
         self.plugin_query = self.__get_config(QueryConfig, kwargs)
@@ -59,7 +58,7 @@
         elif self.source == 'http':
             self.plugin_http = HttpConfig(**kwargs)
         elif self.source == 'sqlalchemy':
-            self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig,kwargs)
+            self.plugin_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
 
     @staticmethod
     def __get_config(model: [T], kwargs: Dict[str, Any]) -> Union[T]:
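The private `__get_config` helper shares a single `**kwargs` dict across several Pydantic config models by handing each model only the keys it declares. A minimal sketch of that pattern, not the package's verified implementation (the helper name, the demo model, and the pydantic-v2 `model_fields` lookup are assumptions):

```python
from typing import Any, Dict, Type, TypeVar

from pydantic import BaseModel

T = TypeVar("T", bound=BaseModel)

def get_config(model: Type[T], kwargs: Dict[str, Any]) -> T:
    # Keep only the kwargs the model declares, so one kwargs dict
    # can be shared across several config models.
    recognized = {k: v for k, v in kwargs.items() if k in model.model_fields}
    return model(**recognized)

class DemoQueryConfig(BaseModel):  # stand-in, not the package's QueryConfig
    n_records: int = 0
    use_exclude: bool = False

print(get_config(DemoQueryConfig, {"n_records": 10, "debug": True}))
# n_records=10 use_exclude=False
```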
@@ -100,7 +99,6 @@
             self.logger.info("Regular asyncio run...")
             return asyncio.run(self._load_from_http(**options))
 
-
     def _load_from_sqlalchemy(self, **options):
         try:
             options.setdefault("debug", self.debug)
@@ -139,7 +137,7 @@
             self.logger.info("Data successfully loaded from django database.")
         except Exception as e:
             self.logger.error(f"Failed to load data from django database: {e}")
-            self.df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         return self.df
 
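The loader methods share a fallback convention, visible here and again in the HTTP loader below: log the failure and return an empty single-partition Dask DataFrame so callers always receive a `dd.DataFrame`. A runnable sketch of that convention (`load_or_empty` and `broken_loader` are illustrative names, not package APIs):

```python
import dask.dataframe as dd
import pandas as pd

def load_or_empty(loader):
    # Mirror the fallback above: swallow the error, report it,
    # and hand back an empty one-partition Dask DataFrame.
    try:
        return loader()
    except Exception as e:
        print(f"Failed to load data: {e}")
        return dd.from_pandas(pd.DataFrame(), npartitions=1)

def broken_loader():
    raise ConnectionError("database unreachable")  # made-up failure

df = load_or_empty(broken_loader)
print(len(df.compute()))  # 0 -- downstream code still gets a dd.DataFrame
```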
@@ -152,10 +150,9 @@
             self.df = await self.plugin_http.fetch_data(**options)
         except Exception as e:
             self.logger.error(f"Failed to load data from http plugin: {e}")
-            self.df=dd.from_pandas(pd.DataFrame(), npartitions=1)
+            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
         return self.df
 
-
     def _post_process_df(self):
         """
         Efficiently process the DataFrame by filtering, renaming, and setting indices.
@@ -225,7 +222,7 @@
         if self.df.map_partitions(len).compute().sum() == 0:
             self.logger.info("Cannot write to clickhouse since Dataframe is empty")
             return
-        cs=ClickHouseWriter(logger=self.logger, **credentials)
+        cs = ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
         self.logger.info("Save to ClickHouse completed.")
 
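The ClickHouse guard above counts rows without collecting the frame's contents in one piece: `map_partitions(len)` yields one row count per partition, and the summed result decides whether there is anything to write. The same idiom in isolation:

```python
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"a": []}), npartitions=2)

# One integer per partition; the sum is the total row count.
if ddf.map_partitions(len).compute().sum() == 0:
    print("Cannot write to clickhouse since Dataframe is empty")
```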
@@ -299,7 +296,6 @@
             kwargs[f"{mapped_field}__date__lte"] = end
         return self.load(**kwargs)
 
-
     @staticmethod
     def parse_date(date_str: str) -> Union[datetime.datetime, datetime.date]:
         try:
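The `__date__lte` key above is a Django-ORM-style filter lookup: the surrounding method (not fully shown in this hunk) builds start/end bounds as filter kwargs and passes them to `load()`. A sketch of the kwarg shape, with an invented field name and an assumed symmetric lower bound:

```python
# 'created_at' is an invented mapped field; the real mapping comes from dt_field.
mapped_field = "created_at"
start, end = "2024-01-01", "2024-01-31"

kwargs = {
    f"{mapped_field}__date__gte": start,  # assumed lower bound
    f"{mapped_field}__date__lte": end,    # the lookup visible in the diff
}
print(kwargs)
# {'created_at__date__gte': '2024-01-01', 'created_at__date__lte': '2024-01-31'}
```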
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py
CHANGED

@@ -82,9 +82,12 @@ class SqlAlchemyModelBuilder:
             dict: Dictionary of column attributes.
         """
         columns = {}
+        reserved_names = ["metadata", "class_", "table"]
+
         for column in table.columns:
             column_name = self.normalize_column_name(column.name)
-            columns[column_name] = column
+            if column_name not in reserved_names:
+                columns[column_name] = column
         return columns
 
     def add_relationships(self, attrs, table: Table):
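The new `reserved_names` filter guards against column names that SQLAlchemy's declarative machinery claims for itself: declaring a mapped attribute called `metadata`, for example, fails at class-creation time. A small demonstration (table and column names invented):

```python
import sqlalchemy as sa
from sqlalchemy.orm import declarative_base

Base = declarative_base()

try:
    class Broken(Base):
        __tablename__ = "broken"
        id = sa.Column(sa.Integer, primary_key=True)
        # 'metadata' already refers to Base.metadata on declarative classes.
        metadata = sa.Column(sa.String)
except sa.exc.InvalidRequestError as e:
    print(f"rejected: {e}")  # prints an "Attribute name 'metadata' is reserved" error
```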
sibi_dst/utils/_data_utils.py
CHANGED

@@ -1,77 +1,32 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from sibi_dst.utils import Logger
 
+
 class DataUtils:
 
-    def __init__(self, logger=None):
+    def __init__(self, logger=None, **kwargs):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.debug = kwargs.get('debug', False)
 
     def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
         if not columns:
             self.logger.warning('No columns specified')
-
+        self.logger.debug(f'Dataframe type:{type(df)}')
         columns = [column for column in columns if column in df.columns]
         for col in columns:
-
-
-
-
-
-
-
-                )
-            else:
-                # For Pandas DataFrame, handle mixed types and invalid values
-                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
-                df[col] = df[col].fillna(fill_value).astype(dtype)
+            # Replace NaN with 0, then convert to boolean
+            df[col] = df[col].map_partitions(
+                lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                .fillna(fill_value)  # Replace NaN with 0
+                .astype(dtype),
+                meta=(col, dtype)
+            )
 
         return df
 
-
-    def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
-        """
-        Transform numeric columns in a DataFrame (Pandas or Dask), handling missing values and applying optional transformations.
-
-        Parameters:
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-        - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
-        - fill_value (int or float): The value to replace NA values with.
-        - transform_func (callable, optional): The transformation function to apply.
-          If None, no additional transformation is applied.
-
-        Returns:
-        - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed numeric columns.
-        """
-        if columns is None:
-            # Detect numeric columns
-            columns = df.select_dtypes(include=['number']).columns.tolist()
-
-        if not columns:
-            return df
-
-        columns = [column for column in columns if column in df.columns]
-        # Default transformation function (identity) if none is provided
-        if transform_func is None:
-            transform_func = lambda x: x
-
-        # Batch processing for Dask
-        if isinstance(df, dd.DataFrame):
-            def transform_partition(partition):
-                # Apply transformations for all numeric columns in a single pass
-                partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
-                return partition
-
-            # Apply the transformation function to all specified columns
-            df = df.map_partitions(transform_partition, meta=df)
-        else:
-            # Pandas: Vectorized operations for all specified columns
-            df[columns] = df[columns].fillna(fill_value).map(transform_func)
-
-        return df
-
-    @staticmethod
-    def transform_boolean_columns(df, columns=None):
+    def transform_boolean_columns(self, df, columns=None):
         """
         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
         and convert them to boolean. Detection is performed using a sample.
@@ -84,23 +39,20 @@
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed boolean columns.
         """
+
         # Apply transformation to each specified column
         for col in columns:
             if col in df.columns:
-
-
-
-
-
-
-
-
-
-
-                # For Pandas DataFrame, handle mixed types and invalid values
-                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
-                df[col] = df[col].fillna(0).astype(int).astype(bool)
-
+                # Replace NaN with 0, then convert to boolean
+                df[col] = df[col].map_partitions(
+                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                    .fillna(0)  # Replace NaN with 0
+                    .astype(int)  # Ensure integer type
+                    .astype(bool),  # Convert to boolean
+                    meta=(col, 'bool')
+                )
+        if self.debug:
+            self.logger.debug(f'Dataframe type:{type(df)}, boolean applied to columns: {columns}')
         return df
 
     def merge_lookup_data(self, classname, df, **kwargs):
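Both transforms now assume a Dask DataFrame and coerce each column partition by partition, passing `meta` so Dask knows the output dtype without sampling. The same pattern in isolation (sample data invented):

```python
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"flag": ["1", "0", None, "x"]}), npartitions=2)

# Coerce to numeric (invalid values -> NaN), fill NaN with 0, cast to bool.
# meta declares the result dtype up front, so Dask skips inference.
ddf["flag"] = ddf["flag"].map_partitions(
    lambda s: pd.to_numeric(s, errors="coerce").fillna(0).astype(int).astype(bool),
    meta=("flag", "bool"),
)
print(ddf.compute())
#     flag
# 0   True
# 1  False
# 2  False
# 3  False
```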
@@ -141,12 +93,19 @@
 
         # Get unique IDs from source column
         ids = df[source_col].dropna().unique()
-        if
+        # Compute if it's a Dask Series
+        if isinstance(ids, dd.core.Series):
             ids = ids.compute()
+
+        # Check if any IDs are found
         if not len(ids):
             self.logger.info(f"No IDs found in the source column: {source_col}")
             return df
-
+
+        # Convert to a list only if necessary and sort
+        if not isinstance(ids, list):
+            ids = ids.tolist()
+        ids = sorted(ids)
         # Prepare kwargs for loading lookup data
         load_kwargs = kwargs.copy()
         load_kwargs.update({
@@ -167,14 +126,13 @@
         df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)
 
         if fillna_source_description_alias and source_description_alias in df.columns:
-            df[source_description_alias]=df[source_description_alias].fillna('')
+            df[source_description_alias] = df[source_description_alias].fillna('')
 
         # Drop temp_join_col if present
         df = df.drop(columns='temp_join_col', errors='ignore')
 
         return df
 
-
     def is_dataframe_empty(self, df):
         """
         Check if a DataFrame (Pandas or Dask) is empty.
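`merge_lookup_data` now normalizes the unique-ID series before querying the lookup source: materialize it if it is Dask-backed, bail out when empty, then convert to a sorted plain list. The same steps in isolation (column name and values invented; the package checks `dd.core.Series`, while the public alias `dd.Series` is used here):

```python
import dask.dataframe as dd
import pandas as pd

df = dd.from_pandas(pd.DataFrame({"customer_id": [3, 1, None, 3, 2]}), npartitions=2)

ids = df["customer_id"].dropna().unique()
if isinstance(ids, dd.Series):  # compute only when Dask-backed
    ids = ids.compute()
if not len(ids):
    print("No IDs found in the source column: customer_id")
else:
    if not isinstance(ids, list):
        ids = ids.tolist()
    print(sorted(ids))  # [1.0, 2.0, 3.0]
```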
{sibi_dst-0.3.14.dist-info → sibi_dst-0.3.15.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
 sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
-sibi_dst/df_helper/_df_helper.py,sha256=
+sibi_dst/df_helper/_df_helper.py,sha256=ZWhPj9K5q_amJ7eBOrvwAvncxRnI-baveKWWQWfyND8,13354
 sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
 sibi_dst/df_helper/core/__init__.py,sha256=NSYY_evzq6XEkO06Nz6xLH5KznzRGI44cLbrnN3zHXQ,503
 sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
@@ -23,7 +23,7 @@ sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=1WQ390XBFWO
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=H8ypUjLKzYYl9BerfJjX_Uv9qBVkBR-wZiQlh3uRQXg,4669
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=balWGKun0NKIfhLZW-_DCOhKuTzTo_C2NwZoKFwuSJo,2329
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=
+sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=rzzZdcRB5TS9uJ3ZIGQiNf04e3u2akqJEsoGCuyPE3c,4467
 sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=bLD4tEcGDKkJCfSO4b13_89tzVJcpz55I6uw9D4ERnE,3751
@@ -31,7 +31,7 @@ sibi_dst/utils/__init__.py,sha256=nkX7tASNn57kw998YdqQQGY8qXv2J4LC4-g0GoQSiic,83
 sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
 sibi_dst/utils/_clickhouse_writer.py,sha256=kNBQeDn3D4Javrz5L8uU_5itf8Mrvm9l29uxcmcKlbg,8555
 sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
-sibi_dst/utils/_data_utils.py,sha256=
+sibi_dst/utils/_data_utils.py,sha256=uw0SW9G4GrvTX4IdUd8fmsMTMEG5aXOFcWOv4Au3H5g,7016
 sibi_dst/utils/_data_wrapper.py,sha256=SmNv1UoZLq7ovRVy4wipsWLMidKJXcRTp4HtxmaCQdk,9399
 sibi_dst/utils/_date_utils.py,sha256=KYB07puKDrSG8tOm_i1HGX0TjLNUtSWjwfsCYBmW9co,10619
 sibi_dst/utils/_df_utils.py,sha256=9_dNYoZ9_ofU0t_sxMdsXALWCuh02gvqUrei-6Lhr6w,10910
@@ -40,6 +40,6 @@ sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixW
 sibi_dst/utils/_log_utils.py,sha256=AAenyubYUjk77WqiaNkjgkxws3dnAMIdaGl2Ryz_cA4,2245
 sibi_dst/utils/_parquet_saver.py,sha256=-A0o_vucyYe7wlwiby_0_yS-ZfT2GHwImyQHrCIBNwk,9051
 sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
-sibi_dst-0.3.
-sibi_dst-0.3.
-sibi_dst-0.3.
+sibi_dst-0.3.15.dist-info/METADATA,sha256=0XU32Bgt1RYV7Y12lmDxq_YmHaXya5d2qMYfYP8Yic0,2090
+sibi_dst-0.3.15.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sibi_dst-0.3.15.dist-info/RECORD,,

{sibi_dst-0.3.14.dist-info → sibi_dst-0.3.15.dist-info}/WHEEL
File without changes
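Each RECORD row is `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with padding stripped (per PEP 376/427). A sketch of how such a row is produced (the path is illustrative):

```python
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # Hash the file, encode urlsafe-base64 without '=' padding, append size.
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

# e.g. record_entry("sibi_dst/__init__.py")
# -> "sibi_dst/__init__.py,sha256=1KaC0LYT...,251"
```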