sibi-dst 2025.1.12__py3-none-any.whl → 2025.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
- sibi_dst/df_helper/_df_helper.py +417 -117
- sibi_dst/df_helper/_parquet_artifact.py +255 -283
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/__init__.py +1 -0
- sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +203 -0
- sibi_dst/osmnx_helper/route_path_builder.py +97 -0
- sibi_dst/osmnx_helper/utils.py +2 -0
- sibi_dst/utils/base.py +302 -96
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +317 -73
- sibi_dst/utils/date_utils.py +1 -0
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +32 -28
- {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
```diff
--- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py (2025.1.12)
+++ sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py (2025.8.1)
@@ -1,33 +1,29 @@
 from __future__ import annotations
 
-
+import time
+from typing import Any, Dict, Tuple, Type
 
 import dask
 import dask.dataframe as dd
 import pandas as pd
-
-
-    select
-)
+import sqlalchemy as sa
+from sqlalchemy import select, inspect
 from sqlalchemy.engine import Engine
-from sqlalchemy.orm import declarative_base
-import time
 from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
-
+from sqlalchemy.orm import declarative_base
 
 from sibi_dst.utils import ManagedResource
 from sibi_dst.df_helper.core import FilterHandler
+from ._db_gatekeeper import DBGatekeeper
 
 
 class SQLAlchemyDask(ManagedResource):
     """
     Loads data from a database into a Dask DataFrame using a memory-safe,
-    non-parallel, paginated approach.
-
-    This class avoids using a numeric `index_col` for parallel loading.
+    non-parallel, paginated approach (LIMIT/OFFSET).
     """
 
-    _SQLALCHEMY_TO_DASK_DTYPE = {
+    _SQLALCHEMY_TO_DASK_DTYPE: Dict[str, str] = {
         "INTEGER": "Int64",
         "SMALLINT": "Int64",
         "BIGINT": "Int64",
```
```diff
@@ -38,66 +34,85 @@ class SQLAlchemyDask(ManagedResource):
         "TEXT": "object",
         "DATE": "datetime64[ns]",
         "DATETIME": "datetime64[ns]",
+        "TIMESTAMP": "datetime64[ns]",
         "TIME": "object",
         "UUID": "object",
     }
 
     def __init__(
-
-
-
-
-
-
+        self,
+        model: Type[declarative_base()],
+        filters: Dict[str, Any],
+        engine: Engine,
+        chunk_size: int = 1000,
+        **kwargs: Any,
     ):
-        """
-        Initializes the data loader.
-
-        Args:
-            model: The SQLAlchemy ORM model for the table.
-            filters: A dictionary of filters to apply to the query.
-            engine: An SQLAlchemy Engine instance.
-            chunk_size: The number of records to fetch in each database query.
-            logger: A logger instance.
-            debug: Whether to enable detailed logging.
-        """
         super().__init__(**kwargs)
         self.model = model
-        self.filters = filters
+        self.filters = filters or {}
         self.engine = engine
-        self.chunk_size = chunk_size
+        self.chunk_size = int(chunk_size)
         self.filter_handler_cls = FilterHandler
-        self.total_records = -1
+        self.total_records: int = -1  # -1 indicates failure/unknown
+        self._sem = DBGatekeeper.get(str(engine.url), max_concurrency=self._safe_cap())
 
-
-    def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
+    def _safe_cap(self) -> int:
         """
-
-
+        Calculate a safe concurrency cap for DB work based on the engine's pool.
+
+        Returns: max(1, pool_size + max_overflow - 1)
+        - Works across SQLAlchemy 1.4/2.x
+        - Tolerates pools that expose size/max_overflow as methods or attrs
+        - Allows explicit override via self.db_gatekeeper_cap (if you pass it)
         """
+        # optional explicit override
+        explicit = getattr(self, "db_gatekeeper_cap", None)
+        if isinstance(explicit, int) and explicit > 0:
+            return explicit
+
+        pool = getattr(self.engine, "pool", None)
+
+        def _to_int(val, default):
+            if val is None:
+                return default
+            if callable(val):
+                try:
+                    return int(val())  # e.g., pool.size()
+                except Exception:
+                    return default
+            try:
+                return int(val)
+            except Exception:
+                return default
+
+        # size: QueuePool.size() -> int
+        size_candidate = getattr(pool, "size", None)  # method on QueuePool
+        pool_size = _to_int(size_candidate, 5)
+
+        # max_overflow: prefer attribute; fall back to private _max_overflow; avoid 'overflow()' (method)
+        max_overflow_attr = (
+            getattr(pool, "max_overflow", None) or  # SQLAlchemy 2.x QueuePool
+            getattr(pool, "_max_overflow", None)    # private fallback
+        )
+        max_overflow = _to_int(max_overflow_attr, 10)
+
+        cap = max(1, pool_size + max_overflow - 1)
+        self.logger.debug(f"Using a Cap of {cap} from pool size of {pool_size} and max overflow of {max_overflow}.")
+        return max(1, cap)
+
+    # ---------- meta ----------
+    @classmethod
+    def infer_meta_from_model(cls, model: Type[declarative_base()]) -> Dict[str, str]:
         mapper = inspect(model)
-        dtypes = {}
+        dtypes: Dict[str, str] = {}
         for column in mapper.columns:
             dtype_str = str(column.type).upper().split("(")[0]
             dtype = cls._SQLALCHEMY_TO_DASK_DTYPE.get(dtype_str, "object")
             dtypes[column.name] = dtype
         return dtypes
 
-    def read_frame(self, fillna_value=None) ->
-
-        Builds and executes a query to load data into a Dask DataFrame.
-
-        This method works by first running a COUNT query to get the total
-        size, then creating a series of delayed tasks that each fetch a
-        chunk of data using LIMIT/OFFSET.
-
-        Args:
-            fillna_value: Value to replace NaN or NULL values with, if any.
-
-        Returns:
-            A lazy Dask DataFrame.
-        """
-        # 1. Build the base query and apply filters
+    def read_frame(self, fillna_value=None) -> Tuple[int, dd.DataFrame]:
+        # Base selectable
         query = select(self.model)
         if self.filters:
             query = self.filter_handler_cls(
```
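The `_safe_cap` helper above reduces to one formula: cap = max(1, pool_size + max_overflow - 1), i.e. one less than the pool's total capacity, so one pooled connection always stays free. Below is a minimal standalone sketch of that idea; the `BoundedSemaphore` only stands in for `DBGatekeeper`, whose real implementation lives in the new `_db_gatekeeper.py` and is not shown in this diff.

```python
import threading


def safe_cap(pool_size: int = 5, max_overflow: int = 10) -> int:
    # Same arithmetic as SQLAlchemyDask._safe_cap, with SQLAlchemy's defaults.
    return max(1, pool_size + max_overflow - 1)


cap = safe_cap(5, 10)                 # -> 14
gate = threading.BoundedSemaphore(cap)  # hypothetical stand-in for DBGatekeeper

with gate:     # each DB operation acquires the gate, as `with self._sem:` does above
    pass       # run the query here
print(cap)
```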
```diff
@@ -105,232 +120,67 @@ class SQLAlchemyDask(ManagedResource):
             ).apply_filters(query, model=self.model, filters=self.filters)
         else:
             query = query.limit(self.chunk_size)
-        if self.verbose:
-            self.logger.debug(f"Base query for pagination: {query}")
 
-        #
-        ordered_columns = [
+        # Meta dataframe (stable column order & dtypes)
+        ordered_columns = [c.name for c in self.model.__table__.columns]
         meta_dtypes = self.infer_meta_from_model(self.model)
         meta_df = pd.DataFrame(columns=ordered_columns).astype(meta_dtypes)
 
-        #
-
+        # Count with retry/backoff
         retry_attempts = 3
-
-
+        backoff = 0.5
+        total = 0
 
         for attempt in range(retry_attempts):
             try:
-                with self.
-
-
-
-
-                    break
-
+                with self._sem:
+                    with self.engine.connect() as connection:
+                        count_q = sa.select(sa.func.count()).select_from(query.alias())
+                        total = connection.execute(count_q).scalar_one()
+                break
             except SASQLTimeoutError:
                 if attempt < retry_attempts - 1:
-                    self.logger.warning(
-
-
-                    time.sleep(backoff_factor)
-                    backoff_factor *= 2  # Double the backoff time for the next attempt
+                    self.logger.warning(f"Connection pool limit reached. Retrying in {backoff} seconds...")
+                    time.sleep(backoff)
+                    backoff *= 2
                 else:
-                    self.total_records = -1
-                    self.logger.error(
-                        "Failed to get a connection from the pool after several retries.",
-                        exc_info=True
-                    )
+                    self.total_records = -1
+                    self.logger.error("Failed to get a connection from the pool after retries.", exc_info=True)
                     return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except OperationalError as oe:
-
-
-
-
-                    backoff_factor *= 2
+                if "timeout" in str(oe).lower() and attempt < retry_attempts - 1:
+                    self.logger.warning("Operational timeout, retrying…", exc_info=self.debug)
+                    time.sleep(backoff)
+                    backoff *= 2
                     continue
-
-
-
-                return self.total_records, dd.from_pandas(meta_df, npartitions=1)
+                self.total_records = -1
+                self.logger.error("OperationalError during count.", exc_info=True)
+                return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except Exception as e:
-                self.total_records = -1
-                self.logger.error(f"
+                self.total_records = -1
+                self.logger.error(f"Unexpected error during count: {e}", exc_info=True)
                 return self.total_records, dd.from_pandas(meta_df, npartitions=1)
 
-        self.total_records =
-        if
+        self.total_records = int(total)
+        if total == 0:
             self.logger.warning("Query returned 0 records.")
+            super().close()
             return self.total_records, dd.from_pandas(meta_df, npartitions=1)
 
-        self.logger.debug(f"Total records to fetch: {
+        self.logger.debug(f"Total records to fetch: {total}. Chunk size: {self.chunk_size}.")
 
-        # 4. Create a list of Dask Delayed objects, one for each chunk
         @dask.delayed
         def get_chunk(sql_query, chunk_offset):
-
-
-
-
-
-
-
-
-
-            return df[ordered_columns].astype(meta_dtypes)
-
-        offsets = range(0, total_records, self.chunk_size)
-        delayed_chunks = [get_chunk(query, offset) for offset in offsets]
-
-        # 5. Construct the final lazy Dask DataFrame from the delayed chunks
+            with self._sem:  # <<< cap concurrent DB fetches
+                paginated = sql_query.limit(self.chunk_size).offset(chunk_offset)
+                df = pd.read_sql(paginated, self.engine)
+            if fillna_value is not None:
+                df = df.fillna(fillna_value)
+            return df[ordered_columns].astype(meta_dtypes)
+
+        offsets = range(0, total, self.chunk_size)
+        delayed_chunks = [get_chunk(query, off) for off in offsets]
         ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
-        self.logger.debug(f"
-        if not self._entered:
-            super().cleanup()
+        self.logger.debug(f"Created Dask DataFrame with {ddf.npartitions} partitions.")
         return self.total_records, ddf
 
-## Dask-Only Solution to test in better hardware
-
-# from typing import Type, Dict, Any
-# import math
-# import time
-# import pandas as pd
-# import dask
-# import dask.dataframe as dd
-#
-# import sqlalchemy as sa
-# from sqlalchemy import select, func
-# from sqlalchemy.engine import Engine
-# from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
-# from sqlalchemy.orm import declarative_base
-#
-# from sibi_dst.df_helper.core import FilterHandler
-# from sibi_dst.utils import Logger
-#
-#
-# class SQLAlchemyDask:
-#     """
-#     Loads data into a Dask DataFrame. If there’s exactly one integer PK,
-#     use dask.dataframe.read_sql_table; otherwise fall back to offset‐based
-#     pagination pushed into dask.delayed to keep memory use minimal.
-#     """
-#
-#     def __init__(
-#         self,
-#         model: Type[declarative_base()],
-#         filters: Dict[str, Any],
-#         engine: Engine,
-#         chunk_size: int = 1_000,
-#         logger=None,
-#         debug: bool = False,
-#     ):
-#         self.model = model
-#         self.filters = filters or {}
-#         self.engine = engine
-#         self.chunk_size = chunk_size
-#         self.logger = logger or Logger.default_logger(self.__class__.__name__)
-#         self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
-#         self.filter_handler_cls = FilterHandler
-#         self.debug = debug
-#
-#     def read_frame(self, fillna_value=None) -> dd.DataFrame:
-#         # 1) Build base query + filters
-#         base_q = select(self.model)
-#         if self.filters:
-#             base_q = self.filter_handler_cls(
-#                 backend="sqlalchemy",
-#                 logger=self.logger,
-#                 debug=self.debug,
-#             ).apply_filters(base_q, model=self.model, filters=self.filters)
-#
-#         # 2) Zero-row meta for dtype inference
-#         meta = pd.read_sql_query(base_q.limit(0), self.engine).iloc[:0]
-#         if meta.shape[1] == 0:
-#             self.logger.warning("No columns detected; returning empty DataFrame.")
-#             return dd.from_pandas(meta, npartitions=1)
-#
-#         # 3) Single‐PK parallel path?
-#         pk_cols = list(self.model.__table__.primary_key.columns)
-#         if (
-#             len(pk_cols) == 1
-#             and pd.api.types.is_integer_dtype(meta[pk_cols[0].name])
-#         ):
-#             try:
-#                 return self._ddf_via_read_sql_table(pk_cols[0], meta, fillna_value)
-#             except Exception:
-#                 self.logger.warning(
-#                     "read_sql_table path failed, falling back to offset pagination",
-#                     exc_info=True,
-#                 )
-#
-#         # 4) Composite PK or fallback → offset pagination in delayed tasks
-#         return self._offset_paginated_ddf(base_q, meta, fillna_value)
-#
-#     def _offset_paginated_ddf(self, base_q, meta, fillna):
-#         # 1) count total rows
-#         try:
-#             with self.engine.connect() as conn:
-#                 total = conn.execute(
-#                     select(func.count()).select_from(base_q.alias())
-#                 ).scalar_one()
-#         except Exception:
-#             self.logger.error("Failed to count records; returning empty DataFrame", exc_info=True)
-#             return dd.from_pandas(meta, npartitions=1)
-#
-#         if total == 0:
-#             self.logger.warning("Query returned 0 records.")
-#             return dd.from_pandas(meta, npartitions=1)
-#         self.logger.debug(f"Total records to fetch: {total}. Chunk size: {self.chunk_size}.")
-#         # 2) create delayed tasks per offset
-#         @dask.delayed
-#         def _fetch_chunk(offset: int) -> pd.DataFrame:
-#             q = base_q.limit(self.chunk_size).offset(offset)
-#             df = pd.read_sql_query(q, self.engine)
-#             if fillna is not None:
-#                 df = df.fillna(fillna)
-#             return df[meta.columns].astype(meta.dtypes.to_dict())
-#
-#         offsets = range(0, total, self.chunk_size)
-#         parts = [_fetch_chunk(off) for off in offsets]
-#
-#         ddf = dd.from_delayed(parts, meta=meta)
-#         self.logger.debug(f"Offset‐paginated read → {len(parts)} partitions")
-#         return ddf
-#
-#     def _ddf_via_read_sql_table(self, pk_col, meta, fillna) -> dd.DataFrame:
-#         # same as before: min/max + dd.read_sql_table
-#         backoff = 0.5
-#         for attempt in range(3):
-#             try:
-#                 with self.engine.connect() as conn:
-#                     min_id, max_id = conn.execute(
-#                         select(func.min(pk_col), func.max(pk_col))
-#                         .select_from(self.model.__table__)
-#                     ).one()
-#                 break
-#             except (SASQLTimeoutError, OperationalError) as e:
-#                 if "timeout" in str(e).lower() and attempt < 2:
-#                     self.logger.warning(f"Timeout fetching PK bounds; retrying in {backoff}s")
-#                     time.sleep(backoff)
-#                     backoff *= 2
-#                 else:
-#                     raise
-#
-#         if min_id is None or max_id is None:
-#             self.logger.warning("Table empty—no PK bounds.")
-#             return dd.from_pandas(meta, npartitions=1)
-#
-#         total = max_id - min_id + 1
-#         nparts = max(1, math.ceil(total / self.chunk_size))
-#         ddf = dd.read_sql_table(
-#             table=self.model.__table__.name,
-#             uri=str(self.engine.url),
-#             index_col=pk_col.name,
-#             limits=(min_id, max_id),
-#             npartitions=nparts,
-#             columns=list(meta.columns),
-#         )
-#         if fillna is not None:
-#             ddf = ddf.fillna(fillna)
-#         self.logger.debug(f"Parallel read via dask.read_sql_table → {nparts} partitions")
-#         return ddf
```
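The new `read_frame` path boils down to one capped COUNT query followed by one `dask.delayed` task per LIMIT/OFFSET window, stitched together with `dd.from_delayed` against a typed meta frame. Below is a self-contained sketch of that pattern against an in-memory SQLite table; the `users` table, the chunk size of 2, and the hand-built meta frame are illustrative only and not part of sibi_dst.

```python
import dask
import dask.dataframe as dd
import pandas as pd
import sqlalchemy as sa
from sqlalchemy.pool import StaticPool

# One shared in-memory SQLite connection so every task sees the same data.
engine = sa.create_engine(
    "sqlite://", poolclass=StaticPool, connect_args={"check_same_thread": False}
)
with engine.begin() as conn:
    conn.execute(sa.text("CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT)"))
    conn.execute(
        sa.text("INSERT INTO users (name) VALUES (:name)"),
        [{"name": f"user{i}"} for i in range(7)],
    )

users = sa.Table("users", sa.MetaData(), autoload_with=engine)
query = sa.select(users)
chunk_size = 2

# 1) Count the matching rows up front.
with engine.connect() as conn:
    total = conn.execute(
        sa.select(sa.func.count()).select_from(query.subquery())
    ).scalar_one()

# 2) Typed, empty meta frame (stands in for infer_meta_from_model).
meta = pd.DataFrame({"id": pd.Series(dtype="Int64"), "name": pd.Series(dtype="object")})

# 3) One delayed task per LIMIT/OFFSET window.
@dask.delayed
def get_chunk(offset: int) -> pd.DataFrame:
    df = pd.read_sql(query.limit(chunk_size).offset(offset), engine)
    return df[meta.columns].astype(meta.dtypes.to_dict())

ddf = dd.from_delayed(
    [get_chunk(off) for off in range(0, total, chunk_size)], meta=meta
)
print(total, ddf.npartitions)                      # 7 rows -> 4 partitions
print(ddf.compute(scheduler="synchronous").shape)  # (7, 2); synchronous keeps SQLite happy
```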
```diff
--- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py (2025.1.12)
+++ sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py (2025.8.1)
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Any
+from typing import Any, Tuple
 
 import dask.dataframe as dd
 import pandas as pd
```
```diff
@@ -13,68 +13,116 @@ from ._io_dask import SQLAlchemyDask
 
 class SqlAlchemyLoadFromDb(ManagedResource):
     """
-    Orchestrates loading data from a database using SQLAlchemy into a Dask
-    DataFrame by configuring and delegating to the SQLAlchemyDask loader.
+    Orchestrates loading data from a database using SQLAlchemy into a Dask DataFrame.
     """
 
     def __init__(
-
-
-
-
-
+        self,
+        plugin_sqlalchemy: SqlAlchemyConnectionConfig,
+        plugin_query: QueryConfig = None,
+        plugin_params: ParamsConfig = None,
+        **kwargs,
     ):
-        """
-        Initializes the loader with all necessary configurations.
-
-        Args:
-            plugin_sqlalchemy: The database connection configuration object.
-            plugin_query: The query configuration object.
-            plugin_params: The parameters and filters configuration object.
-            logger: An optional logger instance.
-            **kwargs: Must contain 'index_column' for Dask partitioning.
-        """
         super().__init__(**kwargs)
         self.db_connection = plugin_sqlalchemy
         self.model = self.db_connection.model
        self.engine = self.db_connection.engine
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
-        self.total_records = -1
-
-    def build_and_load(self) -> tuple[int | Any, Any] | dd.DataFrame:
-        """
-        Builds and loads a Dask DataFrame from a SQLAlchemy source.
-
-        This method is stateless and returns the DataFrame directly.
+        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000) if self.params_config else 1000)
+        self.total_records = -1
 
-
-            A Dask DataFrame containing the queried data or an empty,
-            correctly structured DataFrame if the query fails or returns no results.
-        """
+    def build_and_load(self) -> Tuple[int, dd.DataFrame]:
         try:
-
-            sqlalchemy_dask_loader=SQLAlchemyDask(
+            with SQLAlchemyDask(
                 model=self.model,
                 filters=self.params_config.filters if self.params_config else {},
                 engine=self.engine,
                 chunk_size=self.chunk_size,
                 logger=self.logger,
                 verbose=self.verbose,
-                debug=self.debug
-            )
-
-
-
-            return self.total_records, dask_df
-
-
+                debug=self.debug,
+            ) as loader:
+                self.logger.debug(f"SQLAlchemyDask loader initialized for model: {self.model.__name__}")
+                self.total_records, dask_df = loader.read_frame()
+            return self.total_records, dask_df
         except Exception as e:
             self.total_records = -1
             self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
-            #
+            # empty df with correct columns
             columns = [c.name for c in self.model.__table__.columns]
             return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
 
-
+# from __future__ import annotations
+#
+# from typing import Any
+#
+# import dask.dataframe as dd
+# import pandas as pd
+#
+# from sibi_dst.utils import ManagedResource
+# from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
+# from ._db_connection import SqlAlchemyConnectionConfig
+# from ._io_dask import SQLAlchemyDask
+#
+# class SqlAlchemyLoadFromDb(ManagedResource):
+#     """
+#     Orchestrates loading data from a database using SQLAlchemy into a Dask
+#     DataFrame by configuring and delegating to the SQLAlchemyDask loader.
+#     """
+#
+#     def __init__(
+#         self,
+#         plugin_sqlalchemy: SqlAlchemyConnectionConfig,
+#         plugin_query: QueryConfig = None,
+#         plugin_params: ParamsConfig = None,
+#         **kwargs,
+#     ):
+#         """
+#         Initializes the loader with all necessary configurations.
+#
+#         Args:
+#             plugin_sqlalchemy: The database connection configuration object.
+#             plugin_query: The query configuration object.
+#             plugin_params: The parameters and filters configuration object.
+#             logger: An optional logger instance.
+#             **kwargs: Must contain 'index_column' for Dask partitioning.
+#         """
+#         super().__init__(**kwargs)
+#         self.db_connection = plugin_sqlalchemy
+#         self.model = self.db_connection.model
+#         self.engine = self.db_connection.engine
+#         self.query_config = plugin_query
+#         self.params_config = plugin_params
+#         self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
+#         self.total_records = -1  # Initialize total_records to -1 to indicate no records loaded yet
+#
+#     def build_and_load(self) -> tuple[int | Any, Any] | dd.DataFrame:
+#         """
+#         Builds and loads a Dask DataFrame from a SQLAlchemy source.
+#
+#         This method is stateless and returns the DataFrame directly.
+#
+#         Returns:
+#             A Dask DataFrame containing the queried data or an empty,
+#             correctly structured DataFrame if the query fails or returns no results.
+#         """
+#         try:
+#             # Instantiate and use the low-level Dask loader
+#             with SQLAlchemyDask(model=self.model,filters=self.params_config.filters if self.params_config else {},
+#                                 engine=self.engine,
+#                                 chunk_size=self.chunk_size,
+#                                 logger=self.logger,
+#                                 verbose=self.verbose,
+#                                 debug=self.debug) as sqlalchemy_dask_loader:
+#                 self.logger.debug(f"SQLAlchemyDask loader initialized for model: {self.model.__name__}")
+#                 # Create the lazy DataFrame and read a record count
+#                 # if total_records less than 0, it means an error occurred during the loading process
+#                 self.total_records, dask_df = sqlalchemy_dask_loader.read_frame()
+#                 return self.total_records, dask_df
+#         except Exception as e:
+#             self.total_records = -1
+#             self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
+#             # Return an empty dataframe with the correct schema on failure
+#             columns = [c.name for c in self.model.__table__.columns]
+#             return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
```