sibi-dst 2025.1.9.tar.gz → 2025.1.11.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/PKG-INFO +1 -1
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/pyproject.toml +1 -1
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +2 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/_df_helper.py +28 -4
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +33 -8
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +3 -5
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/data_wrapper.py +36 -36
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/date_utils.py +133 -134
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +4 -3
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/README.md +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/base.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/update_planner.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/utils/log_utils.py +0 -0
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py
RENAMED
@@ -49,6 +49,7 @@ class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
         self.completion_times: Dict[str, float] = {}
         self.failed: List[str] = []
         self.original_classes: List[Type] = []
+        self.logger.info("ArtifactUpdaterMultiWrapperThreaded initialized")
 
     def get_artifact_classes(self, data_type: str) -> List[Type]:
         """Retrieve artifact classes by data type."""
@@ -270,6 +271,7 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
         self.completion_times: Dict[str, float] = {}
         self.failed: List[str] = []
         self.original_classes: List[Type] = []
+        self.logger.info("ArtifactUpdaterMultiWrapperAsync initialized")
 
     def get_artifact_classes(self, data_type: str) -> List[Type]:
         """
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/_df_helper.py
RENAMED
@@ -28,6 +28,7 @@ class BaseBackend:
         self.logger = helper.logger
         self.debug = helper.debug
         self.total_records = helper.total_records  # no records loaded yet
+        self._entered = helper._entered  # Track if the helper is used in a context manager
 
     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         """Synchronous data loading method. Must be implemented by sync backends."""
@@ -67,7 +68,10 @@ class ParquetBackend(BaseBackend):
             df = self.helper.backend_parquet.load_files()
             if options and df is not None:
                 df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
-
+
+            df = df.persist()
+
+            self.total_records = len(df) or -1  # If df is empty, set total_records to -1
             return self.total_records, df
         except Exception as e:
             self.total_records = -1  # Reset total_records on failure
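Persisting the filtered frame before counting it lets the load/filter graph run once; the materialized result then serves both the row count and the caller. A minimal sketch of the same pattern, independent of sibi_dst:

    import pandas as pd
    import dask.dataframe as dd

    # Small stand-in for the DataFrame returned by load_files().
    pdf = pd.DataFrame({"id": range(10), "value": [v * 2 for v in range(10)]})
    df = dd.from_pandas(pdf, npartitions=2)

    # Filter lazily, then persist so the graph is computed once and kept in memory.
    df = df[df["value"] > 4].persist()

    # len() on a persisted frame counts rows without re-running the filter;
    # mirroring the diff, an empty result is reported as -1.
    total_records = len(df) or -1
    print(total_records)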
@@ -105,6 +109,12 @@ class DfHelper(ManagedResource):
         'http': HttpBackend,
     }
 
+    _BACKEND_ATTR_MAP = {
+        'sqlalchemy': 'backend_db_connection',
+        'parquet': 'backend_parquet',
+        'http': 'backend_http',
+    }
+
     default_config: Dict = None
 
     def __init__(self, backend='sqlalchemy', **kwargs):
@@ -140,9 +150,15 @@ class DfHelper(ManagedResource):
         super().__exit__(exc_type, exc_value, traceback)
 
     def _cleanup(self):
-
+        attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+        if not attr_name:
+            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+            return
+        # Get the actual config object (e.g., self.backend_db_connection)
+        active_config = getattr(self, attr_name, None)
+
         if active_config and hasattr(active_config, "close"):
-            self.logger.debug(f"Closing resources for '{self.backend}' backend.")
+            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
             active_config.close()
 
     def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
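The new _BACKEND_ATTR_MAP lets _cleanup() look up which attribute holds the active backend config instead of hard-coding it. A rough sketch of that lookup-and-close pattern; the class and attribute names below are illustrative stand-ins, not the package's API:

    class _Closable:
        """Stand-in for a backend config object that owns a closable resource."""
        def close(self) -> None:
            print("resource closed")

    class HelperSketch:
        # Backend name -> attribute that stores its config object.
        _BACKEND_ATTR_MAP = {
            "sqlalchemy": "backend_db_connection",
            "parquet": "backend_parquet",
            "http": "backend_http",
        }

        def __init__(self, backend: str) -> None:
            self.backend = backend
            self.backend_db_connection = _Closable()

        def _cleanup(self) -> None:
            attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
            if not attr_name:
                return  # unknown backend: nothing to close
            active_config = getattr(self, attr_name, None)
            if active_config and hasattr(active_config, "close"):
                active_config.close()

    HelperSketch("sqlalchemy")._cleanup()  # prints "resource closed"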
@@ -156,6 +172,10 @@ class DfHelper(ManagedResource):
         self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
+        if not self._entered:
+            self.logger.warning(
+                "DfHelper instance was not used in a context manager; cleanup is being called manually.")
+            self._cleanup()
         return df.compute() if as_pandas else df
 
     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
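load() now cleans up after itself when the helper was never entered as a context manager, using the _entered flag that is also propagated to the backends. A hedged sketch of that pattern with a hypothetical class:

    class ManagedSketch:
        """Illustrative only: tracks whether the instance was entered via `with`."""
        def __init__(self) -> None:
            self._entered = False

        def __enter__(self):
            self._entered = True
            return self

        def __exit__(self, exc_type, exc, tb) -> None:
            self._cleanup()

        def _cleanup(self) -> None:
            print("cleanup")

        def load(self) -> str:
            result = "data"
            if not self._entered:
                # Caller skipped the context manager, so release resources here.
                self._cleanup()
            return result

    ManagedSketch().load()      # cleanup runs inside load()
    with ManagedSketch() as h:  # cleanup deferred to __exit__
        h.load()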
@@ -200,7 +220,11 @@ class DfHelper(ManagedResource):
             self.logger.warning("Cannot save to parquet; DataFrame is empty.")
             return
         fs = kwargs.pop('fs', self.fs)
-
+        if not fs:
+            raise ValueError("Filesystem (fs) must be provided to save to parquet.")
+        path = kwargs.pop('parquet_storage_path', None)
+        if not path:
+            raise ValueError("parquet_storage_path must be provided to save to parquet.")
         writer_config = {
             'df_result': df,
             'parquet_storage_path': path,
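The added guards make the parquet writer fail fast when no filesystem or target path is supplied instead of failing later inside the writer. A small sketch of the same checks using fsspec's local filesystem (the function name and path are illustrative, and writing requires a parquet engine such as pyarrow):

    import fsspec
    import pandas as pd

    def save_to_parquet_sketch(df: pd.DataFrame, fs=None, parquet_storage_path: str | None = None) -> None:
        if df is None or len(df.index) == 0:
            print("Cannot save to parquet; DataFrame is empty.")
            return
        if not fs:
            raise ValueError("Filesystem (fs) must be provided to save to parquet.")
        if not parquet_storage_path:
            raise ValueError("parquet_storage_path must be provided to save to parquet.")
        with fs.open(parquet_storage_path, "wb") as fh:
            df.to_parquet(fh)

    fs = fsspec.filesystem("file")
    save_to_parquet_sketch(pd.DataFrame({"a": [1, 2]}), fs=fs, parquet_storage_path="/tmp/example.parquet")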
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/parquet/_parquet_options.py
RENAMED
@@ -4,8 +4,8 @@ from typing import Optional, List
 
 import dask.dataframe as dd
 import fsspec
-
-
+import pandas as pd
+from pydantic import BaseModel, model_validator, ConfigDict
 from sibi_dst.utils import FilePathGenerator
 from sibi_dst.utils import Logger
 
@@ -93,7 +93,7 @@ class ParquetConfig(BaseModel):
         self.parquet_storage_path = self.parquet_storage_path.rstrip('/')
         if not self.fs.exists(self.parquet_storage_path):
             self.fs.mkdirs(self.parquet_storage_path, exist_ok=True)
-            #raise ValueError('Parquet storage path does not exist')
+            # raise ValueError('Parquet storage path does not exist')
         self.load_parquet = False
         if self.parquet_filename is not None:
             self.parquet_full_path = self.ensure_file_extension(
@@ -184,11 +184,36 @@ class ParquetConfig(BaseModel):
         :return: A Dask DataFrame containing loaded parquet file data.
         :rtype: dask.dataframe.DataFrame
         """
-        if self.load_parquet:
- … (4 removed lines not rendered in the source diff)
+        if not self.load_parquet:
+            self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+        paths_to_load = []
+        if self.parquet_folder_list:
+            # Filter out any None values from the list
+            paths_to_load = [p for p in self.parquet_folder_list if p is not None]
+        elif self.parquet_full_path:
+            # Treat the single path as a list with one item
+            paths_to_load = [self.parquet_full_path]
+
+        if not paths_to_load:
+            self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+        try:
+            self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
+            return dd.read_parquet(
+                paths_to_load,
+                engine="pyarrow",
+                filesystem=self.fs,
+                exclude=["_*", ".*"]
+            )
+        except Exception as e:
+            # This robust error handling is excellent.
+            self.logger.error(f"Parquet loading failed for paths {paths_to_load}: {e}", exc_info=True)
+            self.logger.warning("Returning empty DataFrame due to loading error.")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
 
     @staticmethod
     def ensure_file_extension(filepath: str, extension: str) -> str:
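The rewritten load_files() normalizes its inputs to a list of paths and always falls back to an empty single-partition Dask DataFrame rather than raising. A hedged sketch of that control flow; the function and argument names are placeholders, and a real call would pass the configured fsspec filesystem to dd.read_parquet:

    import logging
    import pandas as pd
    import dask.dataframe as dd

    logger = logging.getLogger("load_files_sketch")

    def empty_ddf() -> dd.DataFrame:
        # Same fallback shape as in the diff: an empty, single-partition frame.
        return dd.from_pandas(pd.DataFrame(), npartitions=1)

    def load_files_sketch(folder_list=None, full_path=None) -> dd.DataFrame:
        paths = [p for p in (folder_list or []) if p is not None] or ([full_path] if full_path else [])
        if not paths:
            logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
            return empty_ddf()
        try:
            return dd.read_parquet(paths, engine="pyarrow")
        except Exception:
            logger.exception("Parquet loading failed for paths %s", paths)
            return empty_ddf()

    print(len(load_files_sketch()))  # 0 rows from the empty fallback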
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py
RENAMED
@@ -15,7 +15,7 @@ from sqlalchemy.engine import url as sqlalchemy_url
 from sqlalchemy.engine import Engine
 from sqlalchemy.exc import OperationalError, SQLAlchemyError
 from sqlalchemy.orm import sessionmaker, Session
-from sqlalchemy.pool import QueuePool, NullPool, StaticPool
+from sqlalchemy.pool import QueuePool, NullPool, StaticPool, Pool
 
 # Assuming these are your project's internal modules
 from sibi_dst.utils import Logger
@@ -54,7 +54,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
     pool_timeout: int = int(os.environ.get("DB_POOL_TIMEOUT", 30))
     pool_recycle: int = int(os.environ.get("DB_POOL_RECYCLE", 1800))
     pool_pre_ping: bool = True
-    poolclass: Type[
+    poolclass: Type[Pool] = QueuePool
 
     # --- Internal & Runtime State ---
     model: Optional[Type[Any]] = None
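Completing the previously truncated annotation as poolclass: Type[Pool] = QueuePool means any SQLAlchemy Pool subclass is an acceptable value, with QueuePool as the default. A short sketch of handing such a value to create_engine (in-memory SQLite keeps it self-contained; the settings are illustrative):

    from sqlalchemy import create_engine
    from sqlalchemy.pool import Pool, QueuePool

    poolclass: type[Pool] = QueuePool  # any Pool subclass satisfies the annotation

    engine = create_engine(
        "sqlite:///:memory:",
        poolclass=poolclass,
        pool_pre_ping=True,
    )
    print(type(engine.pool).__name__)  # QueuePool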
@@ -172,7 +172,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
             return
 
         engine_wrapper['ref_count'] -= 1
-        self.logger.debug(f"Closing
+        self.logger.debug(f"Closing connection within engine wrapper. Ref count is now {engine_wrapper['ref_count']}.")
 
         if engine_wrapper['ref_count'] <= 0:
             self.logger.debug(f"Disposing engine as reference count is zero. Key: {key}")
@@ -195,7 +195,6 @@ class SqlAlchemyConnectionConfig(BaseModel):
         wrapper = self._engine_registry.get(self._engine_key_instance)
         if wrapper:
             wrapper['active_connections'] += 1
-            # self.logger.debug(f"Connection checked out. Active: {self.active_connections}")
 
     def _on_checkin(self, *args) -> None:
         """Event listener for when a connection is returned to the pool."""
|
@@ -203,7 +202,6 @@ class SqlAlchemyConnectionConfig(BaseModel):
|
|
203
202
|
wrapper = self._engine_registry.get(self._engine_key_instance)
|
204
203
|
if wrapper:
|
205
204
|
wrapper['active_connections'] = max(0, wrapper['active_connections'] - 1)
|
206
|
-
# self.logger.debug(f"Connection checked in. Active: {self.active_connections}")
|
207
205
|
|
208
206
|
@property
|
209
207
|
def active_connections(self) -> int:
|
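The two deletions above only drop leftover per-event debug comments; the wrapper still tracks active connections through SQLAlchemy's pool checkout/checkin events. A minimal, self-contained sketch of that mechanism:

    from sqlalchemy import create_engine, event, text

    engine = create_engine("sqlite:///:memory:")
    counters = {"active_connections": 0}

    @event.listens_for(engine, "checkout")
    def _on_checkout(dbapi_conn, connection_record, connection_proxy):
        counters["active_connections"] += 1

    @event.listens_for(engine, "checkin")
    def _on_checkin(dbapi_conn, connection_record):
        counters["active_connections"] = max(0, counters["active_connections"] - 1)

    with engine.connect() as conn:
        conn.execute(text("SELECT 1"))
    print(counters["active_connections"])  # back to 0 once the connection is returned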
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/data_wrapper.py
RENAMED
@@ -153,44 +153,44 @@ class DataWrapper(ManagedResource):
             # Create a copy to avoid mutating the shared instance dictionary
             local_load_params = self.load_params.copy()
             local_load_params.update(date_filter)
- … (10 removed lines not rendered in the source diff)
+            with self.dataclass(**self.class_params) as local_class_instance:
+                df = local_class_instance.load(**local_load_params)
+                load_time = time.perf_counter() - load_start
+
+                if hasattr(local_class_instance, "total_records"):
+                    self.logger.debug(
+                        f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
+                    if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
+                        if self.mmanifest:
+                            self.mmanifest.record(
                                 full_path=path
                             )
- … (26 removed lines not rendered in the source diff)
+                            self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
+                    elif int(local_class_instance.total_records) < 0:
+                        self.logger.warning(
+                            f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
+                            "This may indicate an error in the data loading process."
+                        )
+                    else:
+                        save_start = time.perf_counter()
+                        parquet_params ={
+                            "df_result": df,
+                            "parquet_storage_path": path,
+                            "fs": self.fs,
+                            "logger": self.logger,
+                            "debug": self.debug,
+                        }
+                        with ParquetSaver(**parquet_params) as ps:
+                            ps.save_to_parquet(self.parquet_filename, overwrite=True)
+                        save_time = time.perf_counter() - save_start
+
+                        total_time = time.perf_counter() - overall_start
+                        self.benchmarks[date] = {
+                            "load_duration": load_time,
+                            "save_duration": save_time,
+                            "total_duration": total_time
+                        }
+                        self._log_success(date, total_time, full_path)
         except Exception as e:
             self._log_failure(date, e)
             raise
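The DataWrapper refactor runs each date's load inside a with block so every iteration gets a fresh, self-cleaning helper, and it times the load and save phases with time.perf_counter(). A hedged sketch of that per-iteration pattern using hypothetical stand-ins rather than the package's classes:

    import time
    from contextlib import contextmanager

    @contextmanager
    def helper_sketch(**params):
        """Hypothetical stand-in for the per-date data class used by DataWrapper."""
        try:
            yield lambda **filters: list(range(3))  # pretend loader returning 3 records
        finally:
            pass  # per-date resources would be released here on exit

    benchmarks = {}
    for date in ("2025-01-01", "2025-01-02"):
        start = time.perf_counter()
        with helper_sketch() as load:   # fresh helper per date, cleaned up on exit
            records = load(date=date)
        benchmarks[date] = {"total_duration": time.perf_counter() - start,
                            "records": len(records)}
    print(benchmarks)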
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/date_utils.py
RENAMED
@@ -4,7 +4,7 @@ from typing import Union, Tuple, Callable, Dict, Optional
 import fsspec
 import numpy as np
 import pandas as pd
-
+import dask.dataframe as dd
 from .log_utils import Logger
 
 
@@ -305,154 +305,153 @@ class FileAgeChecker:
             raise ValueError(f"Unsupported modification time format for {file_path}") from e
 
 
+# --- Vectorized Helper Functions ---
+# These replace the slow, row-by-row .apply() logic. They operate
+# on entire DataFrame partitions for maximum efficiency.
+
+def _vectorized_busday_count(
+    partition: pd.DataFrame,
+    begin_col: str,
+    end_col: str,
+    holidays: list
+) -> pd.Series:
+    """Vectorized function to count business days on a DataFrame partition."""
+    if partition.empty:
+        return pd.Series([], dtype=float)
+
+    # Convert entire columns to datetime at once, coercing errors to NaT
+    start_dates = pd.to_datetime(partition[begin_col], errors='coerce').dt.date
+    end_dates = pd.to_datetime(partition[end_col], errors='coerce').dt.date
+
+    # Create a result series filled with NaN to handle rows with invalid dates
+    result = pd.Series(np.nan, index=partition.index, dtype=float)
+
+    # Create a boolean mask for valid, non-NaT date pairs
+    valid_mask = pd.notna(start_dates) & pd.notna(end_dates)
+
+    # Perform the vectorized calculation only on the valid subset of dates
+    result.loc[valid_mask] = np.busday_count(
+        start_dates[valid_mask],
+        end_dates[valid_mask],
+        holidays=holidays
+    )
+    return result
+
+
+def _vectorized_sla_end_date(
+    partition: pd.DataFrame,
+    start_col: str,
+    n_days_col: str,
+    holidays: list
+) -> pd.Series:
+    """Vectorized function to calculate the SLA end date on a DataFrame partition."""
+    if partition.empty:
+        return pd.Series([], dtype='datetime64[ns]')
+
+    start_dates = pd.to_datetime(partition[start_col], errors='coerce').dt.date
+    sla_days = partition[n_days_col]
+
+    # Create a result series filled with NaT for rows with invalid start dates
+    result = pd.Series(pd.NaT, index=partition.index, dtype='datetime64[ns]')
+
+    # Create a boolean mask for valid start dates and sla_days
+    valid_mask = pd.notna(start_dates) & pd.notna(sla_days)
+
+    # Perform the vectorized calculation only on the valid subset
+    result.loc[valid_mask] = np.busday_offset(
+        start_dates[valid_mask],
+        sla_days[valid_mask].astype(int),  # Ensure days are integers
+        roll='forward',
+        holidays=holidays
+    )
+    return result
+
+
+# --- Refactored BusinessDays Class ---
+
 class BusinessDays:
     """
-
-
-    business days, modifying dates by adding business days, and applying these
-    operations to Dask DataFrames.
-
-    :ivar logger: Logger instance for logging error, warning, and debug messages.
-    :type logger: logging.Logger
-    :ivar HOLIDAY_LIST: Dictionary mapping years to lists of holiday dates.
-    :type HOLIDAY_LIST: dict
-    :ivar bd_cal: Numpy busdaycalendar object containing holidays and week mask.
-    :type bd_cal: numpy.busdaycalendar
-    :ivar holidays: Array of holiday dates used by the business day calendar.
-    :type holidays: numpy.ndarray
-    :ivar week_mask: Boolean array indicating working days within a week.
-    :type week_mask: numpy.ndarray
+    Business days calculations with a custom holiday list.
+    Supports scalar and efficient, vectorized Dask DataFrame operations.
     """
 
-    def __init__(self, holiday_list, logger):
-        """
-        Initialize a BusinessDays object with a given holiday list.
-        """
+    def __init__(self, holiday_list: dict[str, list[str]], logger) -> None:
         self.logger = logger
         self.HOLIDAY_LIST = holiday_list
-        bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
-        self.bd_cal = np.busdaycalendar(holidays=bd_holidays, weekmask="1111100")
-        self.holidays = self.bd_cal.holidays
-        self.week_mask = self.bd_cal.weekmask
 
-
-
-
-        """
-        try:
-            begin_date = pd.to_datetime(begin_date)
-            end_date = pd.to_datetime(end_date)
-        except Exception as e:
-            raise ValueError(f"Invalid date format: {e}")
-
-        years = [str(year) for year in range(begin_date.year, end_date.year + 1)]
-        if not all(year in self.HOLIDAY_LIST for year in years):
-            raise ValueError("Not all years in date range are in the holiday list")
-
-        return np.busday_count(
-            begin_date.strftime("%Y-%m-%d"),
-            end_date.strftime("%Y-%m-%d"),
-            busdaycal=self.bd_cal,
-        )
+        # Flatten and store as tuple for determinism
+        bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
+        self.holidays = tuple(bd_holidays)
 
-    def
- … (30 removed lines not rendered in the source diff)
-        # Apply the function using map_partitions
-        df[result_col] = df.map_partitions(
-            apply_business_days,
-            holidays,
-            weekmask,
-            meta=(result_col, "int64"),
+    def get_business_days_count(
+            self,
+            begin_date: str | datetime.date | pd.Timestamp,
+            end_date: str | datetime.date | pd.Timestamp,
+    ) -> int:
+        """Scalar method to count business days between two dates."""
+        begin = pd.to_datetime(begin_date)
+        end = pd.to_datetime(end_date)
+        return int(np.busday_count(begin.date(), end.date(), holidays=list(self.holidays)))
+
+    def calc_business_days_from_df(
+            self,
+            df: dd.DataFrame,
+            begin_date_col: str,
+            end_date_col: str,
+            result_col: str = "business_days",
+    ) -> dd.DataFrame:
+        """Calculates business days between two columns in a Dask DataFrame."""
+        missing = {begin_date_col, end_date_col} - set(df.columns)
+        if missing:
+            self.logger.error(f"Missing columns: {missing}")
+            raise ValueError("Required columns are missing from DataFrame")
+
+        return df.assign(
+            **{result_col: df.map_partitions(
+                _vectorized_busday_count,
+                begin_col=begin_date_col,
+                end_col=end_date_col,
+                holidays=list(self.holidays),
+                meta=(result_col, 'f8')  # f8 is float64
+            )}
         )
 
- … (5 removed lines not rendered in the source diff)
-        """
-
-        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
-        except ValueError:
-            raise ValueError("Date should be a string in the format YYYY-MM-DD")
-
-        if str(start_date.year) not in self.HOLIDAY_LIST:
-            self.logger.warning(f"Year {start_date.year} is not in the holiday list")
-
+    def add_business_days(
+            self,
+            start_date: str | datetime.date | pd.Timestamp,
+            n_days: int,
+    ) -> np.datetime64:
+        """Scalar method to add N business days to a start date."""
+        start = pd.to_datetime(start_date)
         return np.busday_offset(
-
+            start.date(),
             n_days,
-            roll=
-
+            roll='forward',
+            holidays=list(self.holidays),
         )
 
-    def calc_sla_end_date(
- … (20 removed lines not rendered in the source diff)
-        )
-
-        # Define a wrapper for partition-wise operation
-        def apply_sla_end_date(partition, holidays, weekmask):
-            return partition.apply(
-                calculate_sla_end_date, axis=1, holidays=holidays, weekmask=weekmask
-            )
-
-        # Apply the function using map_partitions
-        df[result_col] = df.map_partitions(
-            apply_sla_end_date,
-            holidays,
-            weekmask,
-            meta=(result_col, "object"),
+    def calc_sla_end_date(
+            self,
+            df: dd.DataFrame,
+            start_date_col: str,
+            n_days_col: str,
+            result_col: str = "sla_end_date",
+    ) -> dd.DataFrame:
+        """Calculates an SLA end date column for a Dask DataFrame."""
+        missing = {start_date_col, n_days_col} - set(df.columns)
+        if missing:
+            self.logger.error(f"Missing columns: {missing}")
+            raise ValueError("Required columns are missing from DataFrame")
+
+        return df.assign(
+            **{result_col: df.map_partitions(
+                _vectorized_sla_end_date,
+                start_col=start_date_col,
+                n_days_col=n_days_col,
+                holidays=list(self.holidays),
+                meta=(result_col, 'datetime64[ns]')
+            )}
         )
-
-        return df
 # Class enhancements
 # DateUtils.register_period('next_week', lambda: (datetime.date.today() + datetime.timedelta(days=7),
 #                                                 datetime.date.today() + datetime.timedelta(days=13)))
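The vectorized helpers replace row-by-row .apply() with one np.busday_count / np.busday_offset call per partition, applied through map_partitions. A self-contained sketch of the business-day count on a small Dask DataFrame; the column names and holiday list are illustrative:

    import numpy as np
    import pandas as pd
    import dask.dataframe as dd

    holidays = ["2025-01-01"]  # illustrative holiday list

    def busday_count_partition(partition: pd.DataFrame) -> pd.Series:
        start = pd.to_datetime(partition["opened"], errors="coerce")
        end = pd.to_datetime(partition["closed"], errors="coerce")
        result = pd.Series(np.nan, index=partition.index, dtype=float)
        valid = start.notna() & end.notna()
        # One vectorized call per partition instead of one Python call per row.
        result.loc[valid] = np.busday_count(
            start[valid].values.astype("datetime64[D]"),
            end[valid].values.astype("datetime64[D]"),
            holidays=holidays,
        )
        return result

    pdf = pd.DataFrame({"opened": ["2024-12-30", "2025-01-02", None],
                        "closed": ["2025-01-03", "2025-01-06", "2025-01-07"]})
    ddf = dd.from_pandas(pdf, npartitions=2)
    ddf = ddf.assign(business_days=ddf.map_partitions(busday_count_partition,
                                                      meta=("business_days", "f8")))
    print(ddf.compute())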
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py
RENAMED
@@ -1,7 +1,8 @@
 import itertools
 import dask.dataframe as dd
 import pandas as pd
-
+
+#from sqlmodel import create_engine, Session, select
 from sibi_dst.v2.df_helper.core import FilterHandler
 from sibi_dst.v2.utils import Logger
 
@@ -116,7 +117,7 @@ class SQLModelDask:
             return dask_df
 
         except Exception as e:
-            self.logger.error(f"Error executing query: {str(e)}")
-            self.logger.error(self.query)
+            self.logger.error(f"_io_dask:Error executing query: {str(e)}")
+            self.logger.error(f"_io_dask:{self.query}")
             # In case of error, return an empty Dask DataFrame with the expected columns.
             return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)