sibi-dst 0.3.62-py3-none-any.whl → 0.3.64-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +184 -591
- sibi_dst/df_helper/_parquet_artifact.py +2 -0
- sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +161 -115
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +141 -97
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +34 -105
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +175 -162
- sibi_dst/df_helper/core/_query_config.py +2 -2
- sibi_dst/utils/data_wrapper.py +2 -2
- sibi_dst/utils/log_utils.py +15 -11
- sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +325 -50
- sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
- sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +330 -51
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +91 -0
- {sibi_dst-0.3.62.dist-info → sibi_dst-0.3.64.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.62.dist-info → sibi_dst-0.3.64.dist-info}/RECORD +20 -17
- sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
- {sibi_dst-0.3.62.dist-info → sibi_dst-0.3.64.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py

@@ -1,135 +1,179 @@
-import
+from typing import Type

+import dask
 import dask.dataframe as dd
 import pandas as pd
-from sqlalchemy import
-
-
+from sqlalchemy import (
+    inspect,
+    select,
+    func,
+)
+from sqlalchemy.engine import Engine
+from sqlalchemy.orm import declarative_base
+import time
+from sqlalchemy.exc import TimeoutError
+import sqlalchemy as sa
 from sibi_dst.df_helper.core import FilterHandler
 from sibi_dst.utils import Logger


 class SQLAlchemyDask:
-
+    """
+    Loads data from a database into a Dask DataFrame using a memory-safe,
+    non-parallel, paginated approach.
+
+    This class avoids using a numeric `index_col for parallel loading.
+    """
+
+    _SQLALCHEMY_TO_DASK_DTYPE = {
+        "INTEGER": "Int64",
+        "SMALLINT": "Int64",
+        "BIGINT": "Int64",
+        "FLOAT": "float64",
+        "NUMERIC": "float64",
+        "BOOLEAN": "bool",
+        "VARCHAR": "object",
+        "TEXT": "object",
+        "DATE": "datetime64[ns]",
+        "DATETIME": "datetime64[ns]",
+        "TIME": "object",
+        "UUID": "object",
+    }
+
+    def __init__(
+        self,
+        model: Type[declarative_base()],
+        filters: dict,
+        engine: Engine,
+        chunk_size: int = 1000,
+        logger=None,
+        debug: bool = False,
+    ):
         """
-
-
-        :
-
-
-
-
-
+        Initializes the data loader.
+
+        Args:
+            model: The SQLAlchemy ORM model for the table.
+            filters: A dictionary of filters to apply to the query.
+            engine: An SQLAlchemy Engine instance.
+            chunk_size: The number of records to fetch in each database query.
+            logger: A logger instance.
+            debug: Whether to enable detailed logging.
         """
-        self.query = None
         self.model = model
         self.filters = filters
+        self.engine = engine
         self.chunk_size = chunk_size
         self.debug = debug
-        self.engine = create_engine(engine_url)
-        self.Session = sessionmaker(bind=self.engine)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.logger.set_level(
+        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+        self.filter_handler_cls = FilterHandler

-    @
-    def
+    @classmethod
+    def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
         """
-
+        Infers a metadata dictionary for Dask based on the SQLAlchemy model.
+        This helps Dask understand the DataFrame structure without reading data.
         """
         mapper = inspect(model)
-        sqlalchemy_to_dask_dtype = {
-            'INTEGER': 'Int64',
-            'SMALLINT': 'Int64',
-            'BIGINT': 'Int64',
-            'FLOAT': 'float64',
-            'NUMERIC': 'float64',
-            'BOOLEAN': 'bool',
-            'VARCHAR': 'object',
-            'TEXT': 'object',
-            'DATE': 'datetime64[ns]',
-            'DATETIME': 'datetime64[ns]',
-            'TIME': 'object',
-            'UUID': 'object',
-        }
-
         dtypes = {}
         for column in mapper.columns:
-
+            dtype_str = str(column.type).upper().split("(")[0]
+            dtype = cls._SQLALCHEMY_TO_DASK_DTYPE.get(dtype_str, "object")
             dtypes[column.name] = dtype
-
         return dtypes

-    def read_frame(self, fillna_value=None):
+    def read_frame(self, fillna_value=None) -> dd.DataFrame:
         """
-
+        Builds and executes a query to load data into a Dask DataFrame.

-
-
+        This method works by first running a COUNT query to get the total
+        size, then creating a series of delayed tasks that each fetch a
+        chunk of data using LIMIT/OFFSET.
+
+        Args:
+            fillna_value: Value to replace NaN or NULL values with, if any.
+
+        Returns:
+            A lazy Dask DataFrame.
         """
-
+        # 1. Build the base query and apply filters
+        query = select(self.model)
+        if self.filters:
+            query = self.filter_handler_cls(
+                backend="sqlalchemy", logger=self.logger, debug=self.debug
+            ).apply_filters(query, model=self.model, filters=self.filters)
+
+        self.logger.debug(f"Base query for pagination: {query}")
+
+        # 2. Get metadata for the Dask DataFrame structure
+        ordered_columns = [column.name for column in self.model.__table__.columns]
+        meta_dtypes = self.infer_meta_from_model(self.model)
+        meta_df = pd.DataFrame(columns=ordered_columns).astype(meta_dtypes)
+
+        # 3. Get the total record count to calculate the number of chunks
+        # try:
+        #     with self.engine.connect() as connection:
+        #         count_query = select(func.count()).select_from(query.alias())
+        #         total_records = connection.execute(count_query).scalar_one()
+        # except Exception as e:
+        #     self.logger.error(f"Failed to count records for pagination: {e}", exc_info=True)
+        #     return dd.from_pandas(meta_df, npartitions=1)
+        retry_attempts = 3
+        backoff_factor = 0.5  # start with a 0.5-second delay
+
+        for attempt in range(retry_attempts):
             try:
-
-
-
-
-
-
+                with self.engine.connect() as connection:
+                    count_query = sa.select(sa.func.count()).select_from(query.alias())
+                    total_records = connection.execute(count_query).scalar_one()
+
+                # If successful, break the loop
+                break
+
+            except TimeoutError:
+                if attempt < retry_attempts - 1:
+                    self.logger.warning(
+                        f"Connection pool limit reached. Retrying in {backoff_factor} seconds..."
+                    )
+                    time.sleep(backoff_factor)
+                    backoff_factor *= 2  # Double the backoff time for the next attempt
                 else:
-
-
-
-                    # Infer dtypes
-                    dtypes = self.infer_dtypes_from_model(self.model)
-                    # Get the column order from the SQLAlchemy model
-                    ordered_columns = [column.name for column in self.model.__table__.columns]
-
-                    # Execute query and fetch results in chunks
-                    result_proxy = session.execute(self.query)
-                    results = result_proxy.scalars().all()  # Fetch all rows
-                    iterator = iter(results)
-
-                    partitions = []
-
-                    while True:
-                        chunk = list(itertools.islice(iterator, self.chunk_size))
-                        if not chunk:
-                            break
-
-                        # Convert chunk to Pandas DataFrame
-                        df = pd.DataFrame.from_records(
-                            [row._asdict() if hasattr(row, '_asdict') else row.__dict__ for row in chunk]
+                    self.logger.error(
+                        "Failed to get a connection from the pool after several retries.",
+                        exc_info=True
                     )
-
-
+                    return dd.from_pandas(meta_df, npartitions=1)
+
+            except Exception as e:
+                self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
+                return dd.from_pandas(meta_df, npartitions=1)

-
-
+        if total_records == 0:
+            self.logger.warning("Query returned 0 records.")
+            return dd.from_pandas(meta_df, npartitions=1)

-
-                        if fillna_value is not None:
-                            df = df.fillna(fillna_value)
+        self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")

-
-
-
-
+        # 4. Create a list of Dask Delayed objects, one for each chunk
+        @dask.delayed
+        def get_chunk(sql_query, chunk_offset):
+            """A Dask-delayed function to fetch one chunk of data."""
+            # LIMIT/OFFSET must be applied in the delayed function
+            paginated_query = sql_query.limit(self.chunk_size).offset(chunk_offset)
+            df = pd.read_sql(paginated_query, self.engine)

-
-
-                        # Create a Dask partition
-                        partitions.append(dd.from_pandas(df, npartitions=1))
+            if fillna_value is not None:
+                df = df.fillna(fillna_value)

-
-
-                    dask_df = dd.concat(partitions, axis=0, ignore_index=True)
-                else:
-                    dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+            # Ensure column order and types match the meta
+            return df[ordered_columns].astype(meta_dtypes)

-
+        offsets = range(0, total_records, self.chunk_size)
+        delayed_chunks = [get_chunk(query, offset) for offset in offsets]

-
+        # 5. Construct the final lazy Dask DataFrame from the delayed chunks
+        ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
+        self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")

-
-            self.logger.error(f"Error executing query: {str(e)}")
-            self.logger.error(self.query)
-            return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+        return ddf
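The rewritten read_frame above drops the old "fetch all rows, then slice with itertools" path in favour of one COUNT query followed by lazy LIMIT/OFFSET chunks built with dask.delayed. The following is a minimal, self-contained sketch of that pagination pattern against a throwaway in-memory SQLite table with made-up columns; it is not code taken from the package.

import dask
import dask.dataframe as dd
import pandas as pd
import sqlalchemy as sa
from sqlalchemy.pool import StaticPool

# Throwaway SQLite database shared across threads so the example runs as-is.
engine = sa.create_engine(
    "sqlite://", connect_args={"check_same_thread": False}, poolclass=StaticPool
)
with engine.begin() as conn:
    conn.execute(sa.text("CREATE TABLE items (id INTEGER, name TEXT)"))
    conn.execute(
        sa.text("INSERT INTO items (id, name) VALUES (:id, :name)"),
        [{"id": i, "name": f"item-{i}"} for i in range(25)],
    )

chunk_size = 10
query = sa.select(sa.table("items", sa.column("id"), sa.column("name")))

# 1. One COUNT query up front fixes the number of chunks (and partitions).
with engine.connect() as conn:
    total = conn.execute(
        sa.select(sa.func.count()).select_from(query.subquery())
    ).scalar_one()

# 2. Each chunk is a delayed LIMIT/OFFSET read; nothing touches the database yet.
@dask.delayed
def get_chunk(offset):
    return pd.read_sql(query.limit(chunk_size).offset(offset), engine)

meta = pd.DataFrame({"id": pd.Series(dtype="int64"), "name": pd.Series(dtype="object")})
ddf = dd.from_delayed(
    [get_chunk(offset) for offset in range(0, total, chunk_size)], meta=meta
)

print(ddf.npartitions)      # 3 partitions of up to 10 rows each
print(len(ddf.compute()))   # 25 -- the paginated queries run only here

Because each partition is bounded by chunk_size, memory use stays flat regardless of table size, at the cost of one extra COUNT round trip and OFFSET scans on large offsets.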
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py

@@ -3,143 +3,72 @@ import pandas as pd

 from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
 from sibi_dst.utils import Logger
-from ._io_dask import SQLAlchemyDask
 from ._db_connection import SqlAlchemyConnectionConfig
+from ._io_dask import SQLAlchemyDask


 class SqlAlchemyLoadFromDb:
     """
-
-
-    large datasets efficiently by utilizing the Dask framework for parallel
-    computations.
-
-    This class is initialized with a database connection configuration, query
-    configuration, optional parameters, and a logger. It can execute a query
-    using the specified configurations and read the results into a Dask
-    DataFrame. This is useful for processing and analyzing large-scale data.
-
-    :ivar df: Dask DataFrame to store the loaded data.
-    :type df: dd.DataFrame
-    :ivar db_connection: Database connection configuration object, containing details
-        such as the table, model, and engine to be used for the query.
-    :type db_connection: SqlAlchemyConnectionConfig
-    :ivar table_name: Name of the database table being queried.
-    :type table_name: str
-    :ivar model: SQLAlchemy model associated with the database connection.
-    :type model: sqlalchemy.ext.declarative.api.DeclarativeMeta
-    :ivar engine: SQLAlchemy engine used for executing queries.
-    :type engine: sqlalchemy.engine.base.Engine
-    :ivar logger: Logger instance for logging debug and error information.
-    :type logger: Logger
-    :ivar query_config: Query configuration, including query-related details such
-        as the SQL query or query settings.
-    :type query_config: QueryConfig
-    :ivar params_config: Parameters configuration, including filter parameters for
-        the query.
-    :type params_config: ParamsConfig
-    :ivar debug: Debug flag indicating whether debug mode is enabled.
-    :type debug: bool
-    :ivar chunk_size: Size of data chunks to process at a time.
-    :type chunk_size: int
+    Orchestrates loading data from a database using SQLAlchemy into a Dask
+    DataFrame by configuring and delegating to the SQLAlchemyDask loader.
     """
-    df: dd.DataFrame = None

     def __init__(
         self,
-        plugin_sqlalchemy: SqlAlchemyConnectionConfig,
+        plugin_sqlalchemy: SqlAlchemyConnectionConfig,
         plugin_query: QueryConfig = None,
         plugin_params: ParamsConfig = None,
         logger: Logger = None,
         **kwargs,
     ):
         """
-        Initializes
-
-
-
-
-
-
-
-            associated with the database operations.
-        :param plugin_query:
-            The query configuration object, used to define specific query
-            options or rules. Defaults to None.
-        :param plugin_params:
-            The parameters configuration object, used for any additional
-            parameterized settings or configurations. Defaults to None.
-        :param logger:
-            Optional logger instance for logging purposes. If not provided,
-            a default logger is instantiated using the standard logging system.
-        :param kwargs:
-            Optional additional keyword arguments for customization. Can
-            include optional settings like `debug` mode or `chunk_size`
-            for batch operations.
+        Initializes the loader with all necessary configurations.
+
+        Args:
+            plugin_sqlalchemy: The database connection configuration object.
+            plugin_query: The query configuration object.
+            plugin_params: The parameters and filters configuration object.
+            logger: An optional logger instance.
+            **kwargs: Must contain 'index_column' for Dask partitioning.
         """
         self.db_connection = plugin_sqlalchemy
-        self.table_name = self.db_connection.table
         self.model = self.db_connection.model
         self.engine = self.db_connection.engine
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.debug = kwargs.
-        self.chunk_size = kwargs.
+        self.debug = kwargs.get("debug", False)
+        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))

     def build_and_load(self) -> dd.DataFrame:
         """
-        Builds and
-        build and load function. This method triggers the `_build_and_load`
-        function to process and prepare the data before returning it as
-        a dask dataframe.
+        Builds and loads a Dask DataFrame from a SQLAlchemy source.

-
+        This method is stateless and returns the DataFrame directly.

-        :
-
-
-        self._build_and_load()
-        return self.df
-
-    def _build_and_load(self) -> dd.DataFrame:
-        """
-        Builds and loads a Dask DataFrame from a SQLAlchemy-compatible source.
-
-        This method initializes a SQLAlchemyDask object with the provided model,
-        filters, engine URL, logger, chunk size, and debug configuration.
-        It attempts to load the data using the ``read_frame`` method of
-        SQLAlchemyDask. If the data cannot be loaded or the query returns
-        no rows, it creates and returns an empty Dask DataFrame.
-
-        :raises Exception: On failure to load data or to create a DataFrame.
-
-        :return: A Dask DataFrame object containing the queried data or an
-            empty DataFrame if the query returns no results or fails.
-        :rtype: dask.dataframe.DataFrame
+        Returns:
+            A Dask DataFrame containing the queried data or an empty,
+            correctly structured DataFrame if the query fails or returns no results.
         """
         try:
-
+            # Instantiate and use the low-level Dask loader
+            sqlalchemy_dask_loader=SQLAlchemyDask(
                 model=self.model,
-                filters=self.params_config.filters,
-
-                logger=self.logger,
+                filters=self.params_config.filters if self.params_config else {},
+                engine=self.engine,
                 chunk_size=self.chunk_size,
+                logger=self.logger,
                 debug=self.debug
-            )
-
-
-
-            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+            )
+            # Create the lazy DataFrame
+            dask_df = sqlalchemy_dask_loader.read_frame()
+            return dask_df

-            return dask_df

-            return self.df
-        except RuntimeError as e:
-            self.logger.info(f"Runtime Error {e}:Failed to load data into Dask DataFrame.")
-            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-            return dask_df
         except Exception as e:
-            self.logger.
-
-
+            self.logger.error(f"Failed to build and load data: {e}", exc_info=True)
+            # Return an empty dataframe with the correct schema on failure
+            columns = [c.name for c in self.model.__table__.columns]
+            return dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
+
+
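The except branch in the new build_and_load returns an empty but correctly structured DataFrame instead of a bare pd.DataFrame(). A minimal sketch of that fallback, using a hypothetical declarative model (not one shipped by sibi_dst), shows why it matters: downstream code still sees the expected columns even when the load fails.

import dask.dataframe as dd
import pandas as pd
from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Item(Base):  # hypothetical model, stands in for self.model
    __tablename__ = "items"
    id = Column(Integer, primary_key=True)
    name = Column(String)

# Mirror of the fallback in build_and_load: column names come from the model's table.
columns = [c.name for c in Item.__table__.columns]
empty_ddf = dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)

print(list(empty_ddf.columns))   # ['id', 'name']
print(len(empty_ddf))            # 0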