PyPI - sibi-dst - Versions diffs - 0.3.63__py3-none-any.whl → 0.3.64__py3-none-any.whl - Mend

sibi-dst 0.3.63__py3-none-any.whl → 0.3.64__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

sibi_dst/df_helper/_df_helper.py +184 -591
sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +161 -115
sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +141 -97
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +34 -105
sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +175 -162
sibi_dst/df_helper/core/_query_config.py +2 -2
sibi_dst/utils/log_utils.py +15 -11
sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +325 -50
sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +330 -51
sibi_dst/v3/__init__.py +0 -0
sibi_dst/v3/backends/__init__.py +0 -0
sibi_dst/v3/df_helper/__init__.py +0 -0
sibi_dst/v3/df_helper/_df_helper.py +91 -0
{sibi_dst-0.3.63.dist-info → sibi_dst-0.3.64.dist-info}/METADATA +1 -1
{sibi_dst-0.3.63.dist-info → sibi_dst-0.3.64.dist-info}/RECORD +18 -15
sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
{sibi_dst-0.3.63.dist-info → sibi_dst-0.3.64.dist-info}/WHEEL +0 -0

sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py CHANGED Viewed

@@ -3,143 +3,72 @@ import pandas as pd
 from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
 from sibi_dst.utils import Logger
-from ._io_dask import SQLAlchemyDask
 from ._db_connection import SqlAlchemyConnectionConfig
+from ._io_dask import SQLAlchemyDask
 class SqlAlchemyLoadFromDb:
     """
-    The SqlAlchemyLoadFromDb class provides functionality to load data from a
-    database using SQLAlchemy into a Dask DataFrame. It is capable of handling
-    large datasets efficiently by utilizing the Dask framework for parallel
-    computations.
-    This class is initialized with a database connection configuration, query
-    configuration, optional parameters, and a logger. It can execute a query
-    using the specified configurations and read the results into a Dask
-    DataFrame. This is useful for processing and analyzing large-scale data.
-    :ivar df: Dask DataFrame to store the loaded data.
-    :type df: dd.DataFrame
-    :ivar db_connection: Database connection configuration object, containing details
-        such as the table, model, and engine to be used for the query.
-    :type db_connection: SqlAlchemyConnectionConfig
-    :ivar table_name: Name of the database table being queried.
-    :type table_name: str
-    :ivar model: SQLAlchemy model associated with the database connection.
-    :type model: sqlalchemy.ext.declarative.api.DeclarativeMeta
-    :ivar engine: SQLAlchemy engine used for executing queries.
-    :type engine: sqlalchemy.engine.base.Engine
-    :ivar logger: Logger instance for logging debug and error information.
-    :type logger: Logger
-    :ivar query_config: Query configuration, including query-related details such
-        as the SQL query or query settings.
-    :type query_config: QueryConfig
-    :ivar params_config: Parameters configuration, including filter parameters for
-        the query.
-    :type params_config: ParamsConfig
-    :ivar debug: Debug flag indicating whether debug mode is enabled.
-    :type debug: bool
-    :ivar chunk_size: Size of data chunks to process at a time.
-    :type chunk_size: int
+    Orchestrates loading data from a database using SQLAlchemy into a Dask
+    DataFrame by configuring and delegating to the SQLAlchemyDask loader.
     """
-    df: dd.DataFrame = None
     def __init__(
             self,
-            plugin_sqlalchemy: SqlAlchemyConnectionConfig,  # Expected to be an instance of SqlAlchemyConnection
+            plugin_sqlalchemy: SqlAlchemyConnectionConfig,
             plugin_query: QueryConfig = None,
             plugin_params: ParamsConfig = None,
             logger: Logger = None,
             **kwargs,
     ):
         """
-        Initializes an instance of the class, setting up a database connection,
-        query configuration, parameter configuration, and other optional settings
-        like debugging and logging. The class aims to manage the integration and
-        interaction with SQLAlchemy-based database operations.
-        :param plugin_sqlalchemy:
-            The SQLAlchemy connection configuration object, which provides
-            the connection details like engine, table name, and model
-            associated with the database operations.
-        :param plugin_query:
-            The query configuration object, used to define specific query
-            options or rules. Defaults to None.
-        :param plugin_params:
-            The parameters configuration object, used for any additional
-            parameterized settings or configurations. Defaults to None.
-        :param logger:
-            Optional logger instance for logging purposes. If not provided,
-            a default logger is instantiated using the standard logging system.
-        :param kwargs:
-            Optional additional keyword arguments for customization. Can
-            include optional settings like `debug` mode or `chunk_size`
-            for batch operations.
+        Initializes the loader with all necessary configurations.
+        Args:
+            plugin_sqlalchemy: The database connection configuration object.
+            plugin_query: The query configuration object.
+            plugin_params: The parameters and filters configuration object.
+            logger: An optional logger instance.
+            **kwargs: Must contain 'index_column' for Dask partitioning.
         """
         self.db_connection = plugin_sqlalchemy
-        self.table_name = self.db_connection.table
         self.model = self.db_connection.model
         self.engine = self.db_connection.engine
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.debug = kwargs.pop("debug", False)
-        self.chunk_size = kwargs.pop("chunk_size", 1000)
+        self.debug = kwargs.get("debug", False)
+        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
     def build_and_load(self) -> dd.DataFrame:
         """
-        Builds and returns the resulting dataframe after calling the internal
-        build and load function. This method triggers the `_build_and_load`
-        function to process and prepare the data before returning it as
-        a dask dataframe.
+        Builds and loads a Dask DataFrame from a SQLAlchemy source.
-        :raises RuntimeError: If any error occurs during the build or load process.
+        This method is stateless and returns the DataFrame directly.
-        :return: The processed data in a dask dataframe.
-        :rtype: dd.DataFrame
-        """
-        self._build_and_load()
-        return self.df
-    def _build_and_load(self) -> dd.DataFrame:
-        """
-        Builds and loads a Dask DataFrame from a SQLAlchemy-compatible source.
-        This method initializes a SQLAlchemyDask object with the provided model,
-        filters, engine URL, logger, chunk size, and debug configuration.
-        It attempts to load the data using the ``read_frame`` method of
-        SQLAlchemyDask. If the data cannot be loaded or the query returns
-        no rows, it creates and returns an empty Dask DataFrame.
-        :raises Exception: On failure to load data or to create a DataFrame.
-        :return: A Dask DataFrame object containing the queried data or an
-                 empty DataFrame if the query returns no results or fails.
-        :rtype: dask.dataframe.DataFrame
+        Returns:
+            A Dask DataFrame containing the queried data or an empty,
+            correctly structured DataFrame if the query fails or returns no results.
         """
         try:
-            self.df = SQLAlchemyDask(
+            # Instantiate and use the low-level Dask loader
+            sqlalchemy_dask_loader=SQLAlchemyDask(
                 model=self.model,
-                filters=self.params_config.filters,
-                engine_url=self.engine.url,
-                logger=self.logger,
+                filters=self.params_config.filters if self.params_config else {},
+                engine=self.engine,
                 chunk_size=self.chunk_size,
+                logger=self.logger,
                 debug=self.debug
-            ).read_frame()
-            if self.df is None or len(self.df.head().index) == 0:
-                self.logger.debug("Query returned no results.")
-                dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+            )
+            # Create the lazy DataFrame
+            dask_df = sqlalchemy_dask_loader.read_frame()
+            return dask_df
-                return dask_df
-            return self.df
-        except RuntimeError as e:
-            self.logger.info(f"Runtime Error {e}:Failed to load data into Dask DataFrame.")
-            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-            return dask_df
         except Exception as e:
-            self.logger.info(f"Exception {e}:Failed to load data into Dask DataFrame.")
-            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-            return dask_df
+            self.logger.error(f"Failed to build and load data: {e}", exc_info=True)
+            # Return an empty dataframe with the correct schema on failure
+            columns = [c.name for c in self.model.__table__.columns]
+            return dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)

sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py CHANGED Viewed

@@ -1,193 +1,206 @@
 import re
+import keyword
+import threading
+from sqlalchemy import MetaData, Engine
+from sqlalchemy.orm import DeclarativeBase
-from sqlalchemy import MetaData, Table
-from sqlalchemy.orm import declarative_base, relationship
-# Base class for dynamically created models
-Base = declarative_base()
+class Base(DeclarativeBase):
+    """Shared declarative base for all ORM models."""
+    pass
-apps_label = "datacubes"
+apps_label = "datacubes.models"
 class SqlAlchemyModelBuilder:
     """
-    Provides functionality for building SQLAlchemy ORM models dynamically from
-    reflected database tables. This class is intended for use with a SQLAlchemy
-    engine and metadata to automatically generate ORM models for specified
-    database tables.
-    The primary purpose of this class is to simplify the process of creating
-    SQLAlchemy ORM models by reflecting tables from a connected database,
-    dynamically generating model classes, and handling relationships between
-    tables.
-    :ivar engine: SQLAlchemy engine connected to the database.
-    :type engine: Engine
-    :ivar table_name: Name of the table for which the model is generated.
-    :type table_name: str
-    :ivar metadata: SQLAlchemy MetaData instance for reflecting tables.
-    :type metadata: MetaData
-    :ivar table: Reflected SQLAlchemy Table object for the specified table name.
-    :type table: Optional[Table]
-    :ivar class_name: Dynamically normalized class name derived from table_name.
-    :type class_name: str
+    Builds a single SQLAlchemy ORM model from a specific database table.
+    This class is thread-safe and caches reflected table metadata to
+    improve performance across multiple instantiations.
     """
-    _model_cache = {}  # Local cache for model classes
+    _lock = threading.Lock()
+    _metadata_cache: dict[str, MetaData] = {}
-    def __init__(self, engine, table_name):
+    def __init__(self, engine: Engine, table_name: str):
         """
-        Initialize the model builder with a database engine and specific table.
+        Initializes the model builder for a specific table.
         Args:
-            engine: SQLAlchemy engine connected to the database.
-            table_name (str): Name of the table to generate the model for.
+            engine: The SQLAlchemy engine connected to the database.
+            table_name: The name of the table to generate the model for.
         """
         self.engine = engine
         self.table_name = table_name
-        self.metadata = MetaData()
-        self.table = None  # Placeholder for the specific table
-        self.class_name = self.normalize_class_name(self.table_name)
+        self.class_name = self._normalize_class_name(self.table_name)
-    def build_model(self) -> type:
-        """
-        Builds and returns a database model class corresponding to the specified table name.
-        The method checks if the model is already registered in the ORM's registry. If not,
-        it reflects the database schema of the specified table and dynamically creates the
-        model class.
-        :raises ValueError: If the specified table does not exist in the database.
-        :return: A database model class corresponding to the specified table name.
-        :rtype: type
-        """
-        # Check if the model is already registered
-        model = Base.registry._class_registry.get(self.class_name)
-        if model:
-            return model
+        engine_key = str(engine.url)
-        self.metadata.reflect(only=[self.table_name], bind=self.engine)
-        self.table = self.metadata.tables.get(self.table_name)
-        if self.table is None:
-            raise ValueError(f"Table '{self.table_name}' does not exist in the database.")
+        # ✅ REFACTOR: Acquire lock to make cache access and creation atomic,
+        # preventing a race condition between multiple threads.
+        with self._lock:
+            if engine_key not in self._metadata_cache:
+                self._metadata_cache[engine_key] = MetaData()
+            self.metadata = self._metadata_cache[engine_key]
-        model = self.create_model()
-        return model
-    def create_model(self) -> type:
+    def build_model(self) -> type:
         """
-        Generates a SQLAlchemy model class dynamically based on the specified table and
-        its columns. The method extracts column information, defines the necessary
-        attributes, and creates the model class if it doesn't already exist in the
-        SQLAlchemy base registry.
+        Builds and returns a database model class for the specified table.
+        This process is atomic and thread-safe.
-        :raises KeyError: If the table or table name does not exist in the provided
-            schema.
-        :raises Exception: If the model creation fails for any reason.
-        :return: The dynamically created or fetched model class.
-        :rtype: type
+        Raises:
+            ValueError: If the specified table does not exist in the database.
+        Returns:
+            The dynamically created ORM model class.
         """
-        # Normalize the class name from the table name
-        columns = self.get_columns(self.table)
-        # Define attributes for the model class
-        attrs = {
-            "__tablename__": self.table_name,
-            "__table__": self.table,
-            "__module__": f"{apps_label}.models",
-            "__mapper_args__": {"eager_defaults": True},
-        }
-        # Add columns and relationships to the model
-        attrs.update(columns)
-        #self.add_relationships(attrs, self.table)
-        model = Base.registry._class_registry.get(self.class_name)
-        if not model:
+        with self._lock:
+            # ✅ REFACTOR: Add a comment acknowledging the risk of using an
+            # internal API. This is a maintenance warning for future developers.
+            # NOTE: Using a private SQLAlchemy API. This is a performance
+            # optimization but may break in future versions of the library.
+            registered_model = Base.registry._class_registry.get(self.class_name)
+            if registered_model:
+                return registered_model
+            # Check if the table's schema is in our metadata cache
+            table = self.metadata.tables.get(self.table_name)
+            # If not cached, reflect it from the database
+            if table is None:
+                self.metadata.reflect(bind=self.engine, only=[self.table_name])
+                table = self.metadata.tables.get(self.table_name)
+            if table is None:
+                raise ValueError(
+                    f"Table '{self.table_name}' does not exist in the database."
+                )
+            # Create the model class dynamically.
+            attrs = {
+                "__tablename__": table.name,
+                "__table__": table,
+                "__module__": apps_label,
+            }
             model = type(self.class_name, (Base,), attrs)
-            # Add the class to Base.registry so it is registered
-            Base.registry._class_registry[self.class_name] = model
-        return model
-    def get_columns(self, table: Table):
-        """
-        Extracts and returns a dictionary of column names and their corresponding column
-        objects from a given table, excluding reserved names. Reserved names are used
-        internally and should not overlap with column names in the provided table. The
-        method ensures sanitized column names through normalization and filters out any
-        column matching reserved keywords.
-        :param table: The table object from which columns are to be extracted.
-        :type table: Table
-        :return: A dictionary containing the sanitized column names as keys and their
-            corresponding column objects as values, excluding reserved names.
-        :rtype: dict
-        """
-        columns = {}
-        reserved_names = ["metadata", "class_", "table"]
-        for column in table.columns:
-            column_name = self.normalize_column_name(column.name)
-            if column_name not in reserved_names:
-                columns[column_name] = column
-        return columns
-    def add_relationships(self, attrs, table: Table):
-        """
-        Adds relationships to the provided attributes dictionary for a given database table.
-        This method iterates through the foreign keys of the provided table, constructs
-        relationship attributes, and updates the attributes dictionary with relationships
-        that connect the current table to related tables.
-        :param attrs: Dictionary of attributes to which relationships will be added.
-                      The dictionary will be updated with new relationship mappings.
-        :type attrs: dict
-        :param table: A database table object containing foreign key relationships.
-                      The method will use this table to establish relationships.
-        :return: None
-        """
-        for fk in table.foreign_keys:
-            related_table_name = fk.column.table.name
-            related_class_name = self.normalize_class_name(related_table_name)
-            relationship_name = self.normalize_column_name(related_table_name)
-            attrs[relationship_name] = relationship(related_class_name, back_populates=None)
+            return model
     @staticmethod
-    def normalize_class_name(table_name: str) -> str:
-        """
-        Generate a normalized class name from a given table name by capitalizing
-        each word separated by underscores and concatenating them.
-        This static method takes a string representation of a table name, where
-        words are separated by underscores, and converts it into a camel case
-        class name. It processes the string by capitalizing the first letter of
-        each word and removing the underscores. The normalized class name
-        returned can be used programmatically for various purposes, such as
-        class generation or naming conventions.
-        :param table_name: The table name to normalize, with words separated by
-            underscores. E.g., 'sample_table' becomes 'SampleTable'.
-        :type table_name: str
-        :return: A normalized class name in camel case format.
-        :rtype: str
-        """
+    def _normalize_class_name(table_name: str) -> str:
+        """Converts a snake_case table_name to a CamelCase class name."""
         return "".join(word.capitalize() for word in table_name.split("_"))
     @staticmethod
-    def normalize_column_name(column_name: str) -> str:
+    def _normalize_column_name(column_name: str) -> str:
         """
-        Normalize a column name by replacing any non-word characters or leading numbers
-        with underscores, while ensuring it does not conflict with reserved keywords
-        such as 'class', 'def', 'return', etc. If the normalized name conflicts with
-        a Python reserved keyword, "_field" is appended to it.
-        :param column_name: The original name of the column to be normalized.
-        :type column_name: str
-        :return: A normalized column name that is safe and compatible for usage
-            in various contexts such as database columns or Python code.
-        :rtype: str
+        Sanitizes a column name to be a valid Python identifier.
+        (Kept for utility, though not used in the final model creation).
         """
-        column_name = re.sub(r"\W|^(?=\d)", "_", column_name)
-        if column_name in {"class", "def", "return", "yield", "global"}:
-            column_name += "_field"
-        return column_name
+        sane_name = re.sub(r"\W", "_", column_name)
+        sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
+        if keyword.iskeyword(sane_name):
+            return f"{sane_name}_field"
+        return sane_name
+# import re
+# import keyword
+# import threading
+# from sqlalchemy import MetaData, Engine
+# from sqlalchemy.orm import DeclarativeBase
+#
+#
+#
+# class Base(DeclarativeBase):
+#     """shared declarative base for all ORM models."""
+#     pass
+#
+#
+# apps_label = "datacubes.models"
+#
+#
+# class SqlAlchemyModelBuilder:
+#     """
+#     Builds a single SQLAlchemy ORM model from a specific database table.
+#     This class is thread-safe and caches reflected table metadata to
+#     improve performance across multiple instantiations.
+#     """
+#     _lock = threading.Lock()
+#     _metadata_cache: dict[str, MetaData] = {}
+#
+#     def __init__(self, engine: Engine, table_name: str):
+#         """
+#         Initializes the model builder for a specific table.
+#
+#         Args:
+#             engine: The SQLAlchemy engine connected to the database.
+#             table_name: The name of the table to generate the model for.
+#         """
+#         self.engine = engine
+#         self.table_name = table_name
+#         self.class_name = self._normalize_class_name(self.table_name)
+#
+#         # Use or create a cached MetaData object for this engine to avoid
+#         # re-reading the schema for tables that are already known.
+#         engine_key = str(engine.url)
+#         if engine_key not in self._metadata_cache:
+#             self._metadata_cache[engine_key] = MetaData()
+#         self.metadata = self._metadata_cache[engine_key]
+#
+#     def build_model(self) -> type:
+#         """
+#         Builds and returns a database model class for the specified table.
+#         This process is atomic and thread-safe.
+#
+#         Raises:
+#             ValueError: If the specified table does not exist in the database.
+#         Returns:
+#             The dynamically created ORM model class.
+#         """
+#         with self._lock:
+#             # First, check if the model class is already registered in SQLAlchemy
+#             registered_model = Base.registry._class_registry.get(self.class_name)
+#             if registered_model:
+#                 return registered_model
+#
+#             # Next, check if the table's schema is in our metadata cache
+#             table = self.metadata.tables.get(self.table_name)
+#
+#             # If not cached, reflect it from the database
+#             if table is None:
+#                 self.metadata.reflect(bind=self.engine, only=[self.table_name])
+#                 table = self.metadata.tables.get(self.table_name)
+#
+#             if table is None:
+#                 raise ValueError(
+#                     f"Table '{self.table_name}' does not exist in the database."
+#                 )
+#
+#             # Create the model class dynamically.
+#             # No need to add columns manually; __table__ handles it.
+#             attrs = {
+#                 "__tablename__": table.name,
+#                 "__table__": table,
+#                 "__module__": apps_label,
+#             }
+#             model = type(self.class_name, (Base,), attrs)
+#
+#             return model
+#
+#     @staticmethod
+#     def _normalize_class_name(table_name: str) -> str:
+#         """Converts a snake_case table_name to a CamelCase class name."""
+#         return "".join(word.capitalize() for word in table_name.split("_"))
+#
+#     @staticmethod
+#     def _normalize_column_name(column_name: str) -> str:
+#         """
+#         Sanitizes a column name to be a valid Python identifier.
+#         (Kept for utility, though not used in the final model creation).
+#         """
+#         sane_name = re.sub(r"\W", "_", column_name)
+#         sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
+#
+#         if keyword.iskeyword(sane_name):
+#             return f"{sane_name}_field"
+#         return sane_name

sibi_dst/df_helper/core/_query_config.py CHANGED Viewed

@@ -7,8 +7,8 @@ class QueryConfig(BaseModel):
     use_exclude: bool = False
     n_records: int = 100
     dt_field: Optional[str] = None
-    use_dask: bool = False
-    as_dask: bool = False
+    use_dask: bool = True
+    as_dask: bool = True
     @model_validator(mode='after')
     def check_n_records(self):

sibi_dst/utils/log_utils.py CHANGED Viewed

@@ -115,22 +115,26 @@ class Logger:
         """
         self.logger.setLevel(level)
-    def debug(self, msg: str):
+    def debug(self, msg: str, *args, **kwargs):
         """Log a debug message."""
-        self.logger.debug(msg)
+        self.logger.debug(msg, *args, **kwargs)
-    def info(self, msg: str):
+    def info(self, msg: str, *args, **kwargs):
         """Log an info message."""
-        self.logger.info(msg)
+        self.logger.info(msg, *args, **kwargs)
-    def warning(self, msg: str):
+    def warning(self, msg: str, *args, **kwargs):
         """Log a warning message."""
-        self.logger.warning(msg)
+        self.logger.warning(msg, *args, **kwargs)
-    def error(self, msg: str):
-        """Log an error message."""
-        self.logger.error(msg)
+    def error(self, msg: str, *args, **kwargs):
+        """
+        Log an error message.
+        To log exception information, use the `exc_info=True` keyword argument.
+        """
+        self.logger.error(msg, *args, **kwargs)
-    def critical(self, msg: str):
+    def critical(self, msg: str, *args, **kwargs):
         """Log a critical message."""
-        self.logger.critical(msg)
+        self.logger.critical(msg, *args, **kwargs)

sibi-dst 0.3.63__py3-none-any.whl → 0.3.64__py3-none-any.whl

sibi-dst 0.3.63__py3-none-any.whl → 0.3.64__py3-none-any.whl