sibi-dst 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +42 -30
- sibi_dst/df_helper/core/__init__.py +6 -4
- sibi_dst/df_helper/core/_filter_handler.py +216 -0
- sibi_dst/df_helper/plugins/django/_django_load_from_db.py +32 -20
- sibi_dst/df_helper/plugins/django/_io_dask.py +0 -3
- sibi_dst/df_helper/plugins/http/_http_config.py +4 -4
- sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +9 -9
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +4 -2
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +8 -6
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +5 -2
- sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py +2 -3
- sibi_dst/utils/_clickhouse_writer.py +16 -16
- sibi_dst/utils/_data_utils.py +40 -81
- sibi_dst/utils/_data_wrapper.py +8 -4
- sibi_dst/utils/_df_utils.py +5 -5
- sibi_dst/utils/_log_utils.py +3 -0
- sibi_dst/utils/_parquet_saver.py +3 -108
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.16.dist-info}/METADATA +2 -1
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.16.dist-info}/RECORD +20 -19
- {sibi_dst-0.3.14.dist-info → sibi_dst-0.3.16.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py CHANGED
@@ -1,4 +1,5 @@
 import dask.dataframe as dd
+import dask_expr
 import pandas as pd
 
 from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
@@ -28,7 +29,6 @@ class SqlAlchemyLoadFromDb:
         self.query_config = plugin_query
         self.params_config = plugin_params
         self.debug = kwargs.pop("debug", False)
-        self.verbose_debug = kwargs.pop("verbose_debug", False)
 
     def build_and_load(self) -> dd.DataFrame:
         """
@@ -40,7 +40,6 @@ class SqlAlchemyLoadFromDb:
     def _build_and_load(self) -> dd.DataFrame:
 
         try:
-            # reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, debug=self.debug)
             self.df = SQLAlchemyDask(
                 model=self.model,
                 filters=self.params_config.filters,
@@ -49,10 +48,13 @@ class SqlAlchemyLoadFromDb:
                 chunk_size=1000,
                 debug=self.debug).read_frame()
             if self.df is None or len(self.df.head().index) == 0:
-                self.logger.
-
+                self.logger.debug("Query returned no results.")
+                dask_df=dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+                return dask_df
             return self.df
         except Exception as e:
-            self.logger.
-
+            self.logger.debug(f"Failed to load data into Dask DataFrame.{e}")
+            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+            return dask_df
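Both failure paths in the hunk above now log at debug level and hand back an empty Dask DataFrame instead of propagating the error. A minimal standalone sketch of that fallback pattern, using only dask and pandas (the read_or_empty helper is illustrative and not part of sibi_dst):

import dask.dataframe as dd
import pandas as pd

def read_or_empty(read_frame):
    # read_frame: any zero-argument callable that returns a Dask DataFrame.
    try:
        df = read_frame()
        if df is None or len(df.head().index) == 0:
            return dd.from_pandas(pd.DataFrame(), npartitions=1)  # empty fallback
        return df
    except Exception:
        # mirror the new behaviour: swallow the error and return an empty frame
        return dd.from_pandas(pd.DataFrame(), npartitions=1)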
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py CHANGED
@@ -59,7 +59,7 @@ class SqlAlchemyModelBuilder:
         attrs = {
             "__tablename__": self.table_name,
             "__table__": self.table,
-
+            "__module__": f"{apps_label}.models",
             "__mapper_args__": {"eager_defaults": True},
         }
 
@@ -82,9 +82,12 @@ class SqlAlchemyModelBuilder:
            dict: Dictionary of column attributes.
        """
        columns = {}
+       reserved_names = ["metadata", "class_", "table"]
+
        for column in table.columns:
            column_name = self.normalize_column_name(column.name)
-
+           if column_name not in reserved_names:
+               columns[column_name] = column
        return columns
 
    def add_relationships(self, attrs, table: Table):
sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py CHANGED
@@ -26,7 +26,6 @@ class SQLModelLoadFromDb:
         self.query_config = db_query or {}
         self.params_config = db_params or {}
         self.debug = kwargs.pop("debug", False)
-        self.verbose_debug = kwargs.pop("verbose_debug", False)
 
     def _default_logger(self):
         """Create a default logger."""
@@ -69,7 +68,7 @@ class SQLModelLoadFromDb:
             query = query.limit(n_records)
 
         # Debug: Log the SQL query
-        self.logger.
+        self.logger.debug(f"Executing query: {str(query)}")
 
         # Execute the query
         results = session.exec(query).fetchall()
@@ -79,7 +78,7 @@ class SQLModelLoadFromDb:
             if results:
                 df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
             else:
-                self.logger.
+                self.logger.debug("Query returned no results.")
                 df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
         except Exception as e:
sibi_dst/utils/_clickhouse_writer.py CHANGED
@@ -34,7 +34,7 @@ class ClickHouseWriter:
         self.df = df.copy()
         self.order_by = kwargs.setdefault('order_by',self.order_by)
         if len(self.df.head().index) == 0:
-            self.logger.
+            self.logger.debug("Dataframe is empty")
             return
         self._handle_missing_values()
         self._connect()
@@ -51,7 +51,7 @@ class ClickHouseWriter:
                 user=self.clickhouse_user,
                 password=self.clickhouse_password
             )
-            self.logger.
+            self.logger.debug("Connected to ClickHouse")
         except Exception as e:
             self.logger.error(e)
             raise
@@ -80,7 +80,7 @@ class ClickHouseWriter:
     def _drop_table(self):
         if self.client:
             self.client.command('DROP TABLE IF EXISTS {}'.format(self.clickhouse_table))
-            self.logger.
+            self.logger.debug(f"Dropped table {self.clickhouse_table}")
 
     def _create_table_from_dask(self, engine=None):
         if engine is None:
@@ -88,18 +88,18 @@ class ClickHouseWriter:
         dtypes = self.df.dtypes
         clickhouse_schema = self._generate_clickhouse_schema(dtypes,self.dtype_to_clickhouse)
         create_table_sql= f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
-        self.logger.
+        self.logger.debug(f"Creating table SQL:{create_table_sql}")
         if self.client:
             self.client.command(create_table_sql)
-            self.logger.
+            self.logger.debug("Created table '{}'".format(self.clickhouse_table))
 
     def _handle_missing_values(self):
         """
         Handle missing values in the Dask DataFrame before writing to ClickHouse.
         """
-        self.logger.
+        self.logger.debug("Checking for missing values...")
         missing_counts = self.df.isnull().sum().compute()
-        self.logger.
+        self.logger.debug(f"Missing values per column:\n{missing_counts}")
 
         # Replace missing values based on column types
         def replace_missing_values(df):
@@ -116,14 +116,14 @@ class ClickHouseWriter:
 
         # Apply replacement
         self.df = replace_missing_values(self.df)
-        self.logger.
+        self.logger.debug("Missing values replaced.")
 
     def _write_data(self):
         """
         Writes the Dask DataFrame to a ClickHouse table partition by partition.
         """
         if len(self.df.head().index) == 0:
-            self.logger.
+            self.logger.debug("No data found. Nothing written.")
             return
 
         for i, partition in enumerate(self.df.to_delayed()):
@@ -132,10 +132,10 @@ class ClickHouseWriter:
             df = partition.compute()
 
             if df.empty:
-                self.logger.
+                self.logger.debug(f"Partition {i} is empty. Skipping...")
                 continue
 
-            self.logger.
+            self.logger.debug(f"Writing partition {i} with {len(df)} rows to ClickHouse.")
 
             # Write the partition to the ClickHouse table
             self.client.insert_df(self.clickhouse_table, df)
@@ -148,7 +148,7 @@ class ClickHouseWriter:
         Ensures a separate client instance is used per thread to avoid session conflicts.
         """
         if len(self.df.index) == 0:
-            self.logger.
+            self.logger.debug("No data found. Nothing written.")
             return
 
         def create_client():
@@ -170,13 +170,13 @@ class ClickHouseWriter:
             Write a single partition to ClickHouse using a separate client instance.
             """
             try:
-                self.logger.
+                self.logger.debug(f"Starting to process partition {index}")
                 client = create_client()  # Create a new client for the thread
 
                 # Compute the Dask partition into a Pandas DataFrame
                 df = partition.compute()
                 if df.empty:
-                    self.logger.
+                    self.logger.debug(f"Partition {index} is empty. Skipping...")
                     return
 
                 # Convert DataFrame to list of tuples
@@ -184,7 +184,7 @@ class ClickHouseWriter:
                 columns = df.columns.tolist()
 
                 # Perform the insert
-                self.logger.
+                self.logger.debug(f"Writing partition {index} with {len(df)} rows to ClickHouse.")
                 client.execute(f"INSERT INTO {self.clickhouse_table} ({', '.join(columns)}) VALUES", data)
 
             except Exception as e:
@@ -192,7 +192,7 @@ class ClickHouseWriter:
             finally:
                 if 'client' in locals() and hasattr(client, 'close'):
                     client.close()
-                    self.logger.
+                    self.logger.debug(f"Closed client for partition {index}")
 
         try:
             # Get delayed partitions and enumerate them
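Every write path in ClickHouseWriter shown above follows the same per-partition shape: materialize one Dask partition, skip it if empty, then insert it. A generic sketch of that iteration, independent of the ClickHouse client (the print stands in for the insert call):

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=3)

for i, partition in enumerate(ddf.to_delayed()):
    pdf = partition.compute()   # materialize one partition as a pandas DataFrame
    if pdf.empty:
        continue                # skip empty partitions, as the writer does
    print(f"partition {i}: {len(pdf)} rows")  # the writer calls client.insert_df(table, pdf) here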
sibi_dst/utils/_data_utils.py CHANGED
@@ -1,77 +1,32 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from sibi_dst.utils import Logger
 
+
 class DataUtils:
 
-    def __init__(self, logger=None):
+    def __init__(self, logger=None, **kwargs):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.debug = kwargs.get('debug', False)
 
     def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
         if not columns:
             self.logger.warning('No columns specified')
-
+        self.logger.debug(f'Dataframe type:{type(df)}')
         columns = [column for column in columns if column in df.columns]
         for col in columns:
-
-
-
-
-
-
-
-            )
-            else:
-                # For Pandas DataFrame, handle mixed types and invalid values
-                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
-                df[col] = df[col].fillna(fill_value).astype(dtype)
+            # Replace NaN with 0, then convert to boolean
+            df[col] = df[col].map_partitions(
+                lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                .fillna(fill_value)  # Replace NaN with 0
+                .astype(dtype),
+                meta=(col, dtype)
+            )
 
         return df
 
-
-    def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
-        """
-        Transform numeric columns in a DataFrame (Pandas or Dask), handling missing values and applying optional transformations.
-
-        Parameters:
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-        - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
-        - fill_value (int or float): The value to replace NA values with.
-        - transform_func (callable, optional): The transformation function to apply.
-          If None, no additional transformation is applied.
-
-        Returns:
-        - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed numeric columns.
-        """
-        if columns is None:
-            # Detect numeric columns
-            columns = df.select_dtypes(include=['number']).columns.tolist()
-
-        if not columns:
-            return df
-
-        columns = [column for column in columns if column in df.columns]
-        # Default transformation function (identity) if none is provided
-        if transform_func is None:
-            transform_func = lambda x: x
-
-        # Batch processing for Dask
-        if isinstance(df, dd.DataFrame):
-            def transform_partition(partition):
-                # Apply transformations for all numeric columns in a single pass
-                partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
-                return partition
-
-            # Apply the transformation function to all specified columns
-            df = df.map_partitions(transform_partition, meta=df)
-        else:
-            # Pandas: Vectorized operations for all specified columns
-            df[columns] = df[columns].fillna(fill_value).map(transform_func)
-
-        return df
-
-    @staticmethod
-    def transform_boolean_columns(df, columns=None):
+    def transform_boolean_columns(self, df, columns=None):
         """
         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
         and convert them to boolean. Detection is performed using a sample.
@@ -84,23 +39,20 @@ class DataUtils:
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed boolean columns.
         """
+
         # Apply transformation to each specified column
         for col in columns:
             if col in df.columns:
-
-
-
-
-
-
-
-
-
-
-                # For Pandas DataFrame, handle mixed types and invalid values
-                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
-                df[col] = df[col].fillna(0).astype(int).astype(bool)
-
+                # Replace NaN with 0, then convert to boolean
+                df[col] = df[col].map_partitions(
+                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                    .fillna(0)  # Replace NaN with 0
+                    .astype(int)  # Ensure integer type
+                    .astype(bool),  # Convert to boolean
+                    meta=(col, 'bool')
+                )
+        if self.debug:
+            self.logger.debug(f'Dataframe type:{type(df)}, boolean applied to columns: {columns}')
         return df
 
     def merge_lookup_data(self, classname, df, **kwargs):
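The rewritten transform_boolean_columns above pushes the coercion into each partition with map_partitions instead of branching on the DataFrame type. A standalone sketch of the same pattern on a toy column (column name is illustrative only):

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"flag": ["1", "0", None, "1"]}), npartitions=2)

# Coerce to numeric, fill NaN with 0, then cast to bool, partition by partition.
ddf["flag"] = ddf["flag"].map_partitions(
    lambda s: pd.to_numeric(s, errors="coerce").fillna(0).astype(int).astype(bool),
    meta=("flag", "bool"),
)
print(ddf.compute())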
@@ -116,6 +68,7 @@ class DataUtils:
        - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
        """
        # Return early if the DataFrame is empty
+       debug = kwargs.setdefault("debug", False)
        if self.is_dataframe_empty(df):
            return df
 
@@ -136,17 +89,24 @@ class DataUtils:
        column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])
 
        if source_col not in df.columns:
-           self.logger.
+           self.logger.debug(f"{source_col} not in DataFrame columns")
            return df
 
        # Get unique IDs from source column
        ids = df[source_col].dropna().unique()
-       if
+       # Compute if it's a Dask Series
+       if isinstance(ids, dd.core.Series):
            ids = ids.compute()
+
+       # Check if any IDs are found
        if not len(ids):
-           self.logger.
+           self.logger.debug(f"No IDs found in the source column: {source_col}")
            return df
-
+
+       # Convert to a list only if necessary and sort
+       if not isinstance(ids, list):
+           ids = ids.tolist()
+       ids = sorted(ids)
        # Prepare kwargs for loading lookup data
        load_kwargs = kwargs.copy()
        load_kwargs.update({
@@ -155,10 +115,10 @@ class DataUtils:
            f'{lookup_col}__in': ids
        })
        # Load lookup data
-       lookup_instance = classname(debug=
+       lookup_instance = classname(debug=debug)
        result = lookup_instance.load(**load_kwargs)
        if len(result.index) == 0:
-           self.logger.
+           self.logger.debug(f"No IDs found in the source column: {source_col}")
            return df
        # Determine the join column on the result DataFrame
        temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col
@@ -167,14 +127,13 @@ class DataUtils:
        df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)
 
        if fillna_source_description_alias and source_description_alias in df.columns:
-           df[source_description_alias]=df[source_description_alias].fillna('')
+           df[source_description_alias] = df[source_description_alias].fillna('')
 
        # Drop temp_join_col if present
        df = df.drop(columns='temp_join_col', errors='ignore')
 
        return df
 
-
    def is_dataframe_empty(self, df):
        """
        Check if a DataFrame (Pandas or Dask) is empty.
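merge_lookup_data now materializes the unique IDs eagerly, converts them to a sorted list, and only then builds the '{lookup_col}__in' filter. The ID-extraction step in isolation looks roughly like this (column and filter names are illustrative):

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"customer_id": [3, 1, None, 3, 2]}), npartitions=2)

ids = ddf["customer_id"].dropna().unique()
if isinstance(ids, dd.Series):   # unique() on a Dask column is still lazy
    ids = ids.compute()
if not isinstance(ids, list):
    ids = ids.tolist()
ids = sorted(ids)
# ids would then feed a lookup filter such as {"customer_id__in": ids}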
sibi_dst/utils/_data_wrapper.py CHANGED
@@ -1,8 +1,12 @@
 import datetime
 from typing import Type, Any, Dict, Optional
+
+import dask_expr
 import fsspec
 import pandas as pd
 from IPython.display import display
+from dask.dataframe import dd
+
 from sibi_dst.utils import Logger
 from tqdm import tqdm
 from sibi_dst.utils import ParquetSaver
@@ -112,7 +116,7 @@ class DataWrapper:
         file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
 
         if self.verbose:
-            self.logger.
+            self.logger.debug(
                 f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
                 f"(threshold: {self.max_age_minutes} minutes)"
             )
@@ -129,14 +133,14 @@ class DataWrapper:
         start_time = datetime.datetime.now()
 
         if self.verbose:
-            self.logger.
+            self.logger.debug(f"Processing {full_parquet_filename}...")
 
         data_object = self.dataclass(**self.class_params)
         df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
         if len(df.index)==0:
             if self.verbose:
-                self.logger.
+                self.logger.debug("No data found for the specified date.")
             return
 
         parquet_saver = ParquetSaver(df, folder, self.logger)
@@ -146,7 +150,7 @@ class DataWrapper:
         duration_seconds = (end_time - start_time).total_seconds()
 
         if self.verbose:
-            self.logger.
+            self.logger.debug(
                 f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
             )
 
sibi_dst/utils/_df_utils.py CHANGED
@@ -85,7 +85,7 @@ class DfUtils:
         # Ensure all specified columns exist in the DataFrame
         missing_columns = [col for col, _, _ in conditions if col not in df.columns]
         if missing_columns:
-            self.logger.
+            self.logger.debug(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
             return df
 
         # Build the combined filtering condition
@@ -117,7 +117,7 @@ class DfUtils:
            DataFrame: Grouped DataFrame with counts.
        """
        if debug:
-           self.logger.
+           self.logger.debug(f"Grouping by: {group_by_expr}")
 
        df_grouped = df.groupby(by=group_by_expr).size().reset_index(name=group_expr)
        return df_grouped
@@ -141,7 +141,7 @@ class DfUtils:
 
        if debug:
            df_duplicates = df[df.duplicated(subset=duplicate_expr)]
-           self.logger.
+           self.logger.debug(f"Duplicate Rows based on columns {duplicate_expr} are:\n{df_duplicates}")
 
        if sort_field:
            if isinstance(df, dd.DataFrame):
@@ -224,9 +224,9 @@ class DfUtils:
        Returns:
            DataFrame: Resampled pivot table.
        """
-       if isinstance(df, dd.DataFrame):
+       if isinstance(df, dd.core.DataFrame):
            # Implement Dask-compatible pivot and resample
-           self.logger.
+           self.logger.debug("Performing summarization with Dask DataFrame.")
            # Ensure the index is a datetime for resampling
            if not isinstance(df.index, (pd.DatetimeIndex, dd.core.DatetimeIndex)):
                self.logger.warning("Index is not a DatetimeIndex. Converting index to datetime.")
sibi_dst/utils/_log_utils.py CHANGED
sibi_dst/utils/_parquet_saver.py CHANGED
@@ -1,18 +1,16 @@
-import datetime
 from pathlib import Path
 from typing import Optional
 
-import
+import dask_expr
 import fsspec
-import pandas as pd
 import pyarrow as pa
+
 from sibi_dst.utils import Logger
 
+
 class ParquetSaver:
     def __init__(self, df_result, parquet_storage_path, logger=None):
         # Ensure df_result is a Dask DataFrame
-        if not isinstance(df_result, dd.DataFrame):
-            df_result = dd.from_pandas(df_result, npartitions=1)
         self.df_result = df_result
         self.parquet_storage_path = parquet_storage_path
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -106,106 +104,3 @@
             str(full_path), engine="pyarrow", schema=schema, write_index=False
         )
 
-# import datetime
-# from pathlib import Path
-# from typing import Optional
-#
-# import dask.dataframe as dd
-# import fsspec
-# import pandas as pd
-# import pyarrow as pa
-# from sibi_dst.utils import Logger
-#
-# class ParquetSaver:
-#     def __init__(self, df_result, parquet_storage_path, logger):
-#         self.df_result = df_result
-#         self.parquet_storage_path = parquet_storage_path
-#         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-#
-#
-#     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
-#         full_path = self._construct_full_path(parquet_filename)
-#
-#         if len(self.df_result) == 0:
-#             self.logger.warning('No data to save')
-#             return  # Exit early if there's no data to save
-#
-#         # Ensure directory exists and clear if necessary
-#         self._ensure_directory_exists(full_path, clear_existing=True)
-#
-#         # Define schema and save DataFrame to parquet
-#         schema = self._define_schema()
-#         self._convert_dtypes(schema)
-#         self._save_dataframe_to_parquet(full_path, schema)
-#
-#     def _define_schema(self) -> pa.Schema:
-#         """Define a PyArrow schema dynamically based on df_result column types."""
-#         pandas_dtype_to_pa = {
-#             'object': pa.string(),
-#             'string': pa.string(),
-#             'Int64': pa.int64(),
-#             'int64': pa.int64(),
-#             'float64': pa.float64(),
-#             'bool': pa.bool_(),
-#             'boolean': pa.bool_(),  # pandas nullable boolean
-#             'datetime64[ns]': pa.timestamp('ns'),
-#             'timedelta[ns]': pa.duration('ns')
-#         }
-#
-#         fields = [
-#             pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
-#             for col, dtype in self.df_result.dtypes.items()
-#         ]
-#         return pa.schema(fields)
-#
-#     def _convert_dtypes(self, schema: pa.Schema):
-#         """Convert DataFrame columns to match the specified schema."""
-#         dtype_mapping = {}
-#         for field in schema:
-#             col_name = field.name
-#             if col_name in self.df_result.columns:
-#                 if pa.types.is_string(field.type):
-#                     dtype_mapping[col_name] = 'string'
-#                 elif pa.types.is_int64(field.type):
-#                     dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
-#                 elif pa.types.is_float64(field.type):
-#                     dtype_mapping[col_name] = 'float64'
-#                 elif pa.types.is_boolean(field.type):
-#                     dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
-#                 elif pa.types.is_timestamp(field.type):
-#                     dtype_mapping[col_name] = 'datetime64[ns]'
-#                 else:
-#                     dtype_mapping[col_name] = 'object'  # Fallback to object
-#         self.df_result = self.df_result.astype(dtype_mapping)
-#
-#     def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
-#         """Construct and return the full path for the parquet file."""
-#         fs, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
-#         parquet_filename = parquet_filename or "default.parquet"
-#         return Path(base_path) / parquet_filename
-#
-#     @staticmethod
-#     def _ensure_directory_exists(full_path: Path, clear_existing=False):
-#         """Ensure that the directory for the path exists, clearing it if specified."""
-#         fs, _ = fsspec.core.url_to_fs(str(full_path))
-#         directory = str(full_path.parent)
-#
-#         if fs.exists(directory):
-#             if clear_existing:
-#                 fs.rm(directory, recursive=True)
-#         else:
-#             fs.mkdirs(directory, exist_ok=True)
-#
-#     def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
-#         """Save the DataFrame to parquet with fsspec using specified schema."""
-#         fs, _ = fsspec.core.url_to_fs(str(full_path))
-#         if fs.exists(full_path):
-#             fs.rm(full_path, recursive=True)
-#         if isinstance(self.df_result, dd.DataFrame):
-#             self.df_result.to_parquet(
-#                 str(full_path), engine="pyarrow", schema=schema, write_index=False
-#             )
-#         elif isinstance(self.df_result, pd.DataFrame):
-#             dd.from_pandas(self.df_result, npartitions=1).to_parquet(
-#                 str(full_path), engine="pyarrow", schema=schema, write_index=False
-#             )
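With the pandas-to-Dask coercion removed from __init__ and the old commented-out implementation deleted, ParquetSaver now expects to receive a Dask DataFrame from the caller. A hedged usage sketch; the save_to_parquet call mirrors the commented-out copy removed above, so treat the exact method name, arguments, and path as assumptions rather than documented API:

import dask.dataframe as dd
import pandas as pd
from sibi_dst.utils import ParquetSaver

pdf = pd.DataFrame({"id": [1, 2, 3]})
ddf = dd.from_pandas(pdf, npartitions=1)           # convert up front; ParquetSaver no longer does this

saver = ParquetSaver(ddf, "/tmp/sibi_dst_output")  # storage path is illustrative
saver.save_to_parquet("example.parquet")           # assumed API, per the removed commented copy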
{sibi_dst-0.3.14.dist-info → sibi_dst-0.3.16.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.14
+Version: 0.3.16
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -13,6 +13,7 @@ Requires-Dist: chardet (>=5.2.0,<6.0.0)
 Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
+Requires-Dist: dask-expr (>=1.1.20,<2.0.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
 Requires-Dist: django (>=5.1.4,<6.0.0)
 Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)