sibi-dst 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -25,238 +25,237 @@ class ArtifactUpdaterMultiWrapper:
     def __init__(self, wrapped_classes=None, debug=False, **kwargs):
         self.wrapped_classes = wrapped_classes or {}
         self.debug = debug
-        self.logger = kwargs.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
+        self.logger = kwargs.setdefault(
+            'logger', Logger.default_logger(logger_name=self.__class__.__name__)
+        )
         self.logger.set_level(logging.DEBUG if debug else logging.INFO)

         today = datetime.datetime.today()
-        self.today_str = today.strftime('%Y-%m-%d')
-        self.current_year_starts_on_str = datetime.date(today.year, 1, 1).strftime('%Y-%m-%d')
-        self.parquet_start_date = kwargs.get('parquet_start_date', self.current_year_starts_on_str)
-        self.parquet_end_date = kwargs.get('parquet_end_date', self.today_str)
-
-        # track concurrency and locks
+        self.parquet_start_date = kwargs.get(
+            'parquet_start_date',
+            datetime.date(today.year, 1, 1).strftime('%Y-%m-%d')
+        )
+        self.parquet_end_date = kwargs.get(
+            'parquet_end_date',
+            today.strftime('%Y-%m-%d')
+        )
+
+        # track pending/completed/failed artifacts
+        self.pending = set()
+        self.completed = set()
+        self.failed = set()
+
+        # concurrency primitives
         self.locks = {}
+        self.locks_lock = asyncio.Lock()
         self.worker_heartbeat = defaultdict(float)
-
-        # graceful shutdown handling
-        loop = asyncio.get_event_loop()
-        self.register_signal_handlers(loop)
+        self.workers_lock = asyncio.Lock()

         # dynamic scaling config
         self.min_workers = kwargs.get('min_workers', 1)
-        self.max_workers = kwargs.get('max_workers', 8)
-        self.memory_per_worker_gb = kwargs.get('memory_per_worker_gb', 1)  # default 1GB per worker
-        self.monitor_interval = kwargs.get('monitor_interval', 10)  # default monitor interval in seconds
+        self.max_workers = kwargs.get('max_workers', 3)
+        self.memory_per_worker_gb = kwargs.get('memory_per_worker_gb', 1)
+        self.monitor_interval = kwargs.get('monitor_interval', 10)
         self.retry_attempts = kwargs.get('retry_attempts', 3)
         self.update_timeout_seconds = kwargs.get('update_timeout_seconds', 600)
         self.lock_acquire_timeout_seconds = kwargs.get('lock_acquire_timeout_seconds', 10)

-    def register_signal_handlers(self, loop):
-        for sig in (signal.SIGINT, signal.SIGTERM):
-            loop.add_signal_handler(sig, lambda: asyncio.create_task(self.shutdown()))
-
-    async def shutdown(self):
-        self.logger.info("Shutdown signal received. Cleaning up...")
-        tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
-        [task.cancel() for task in tasks]
-        await asyncio.gather(*tasks, return_exceptions=True)
-        self.logger.info("Shutdown complete.")
-
-    def get_lock_for_artifact(self, artifact):
-        artifact_key = artifact.__class__.__name__
-        if artifact_key not in self.locks:
-            self.locks[artifact_key] = asyncio.Lock()
-        return self.locks[artifact_key]
+    async def get_lock_for_artifact(self, artifact):
+        key = artifact.__class__.__name__
+        async with self.locks_lock:
+            if key not in self.locks:
+                self.locks[key] = asyncio.Lock()
+            return self.locks[key]

     def get_artifacts(self, data_type):
         if data_type not in self.wrapped_classes:
             raise ValueError(f"Unsupported data type: {data_type}")
-
-        return [
-            artifact_class(
-                parquet_start_date=self.parquet_start_date,
-                parquet_end_date=self.parquet_end_date,
-                logger=self.logger,
-                debug=self.debug
-            )
-            for artifact_class in self.wrapped_classes[data_type]
-        ]
+        artifacts = [cls(
+            parquet_start_date=self.parquet_start_date,
+            parquet_end_date=self.parquet_end_date,
+            logger=self.logger,
+            debug=self.debug
+        ) for cls in self.wrapped_classes[data_type]]
+        # seed pending set and clear others
+        self.pending = set(artifacts)
+        self.completed.clear()
+        self.failed.clear()
+        return artifacts

     def estimate_complexity(self, artifact):
         try:
-            if hasattr(artifact, 'get_size_estimate'):
-                return artifact.get_size_estimate()
-        except Exception as e:
-            self.logger.warning(f"Failed to estimate complexity for {artifact}: {e}")
-        return 1  # default
+            return artifact.get_size_estimate()
+        except Exception:
+            return 1

     def prioritize_tasks(self, artifacts):
         queue = asyncio.PriorityQueue()
-        for artifact in artifacts:
-            complexity = self.estimate_complexity(artifact)
-            # we invert the complexity to ensure higher complexity -> higher priority
-            # if you want high complexity first, store negative complexity in the priority queue
-            # or if the smaller number means earlier processing, just keep as is
-            queue.put_nowait(PrioritizedItem(complexity, artifact))
+        for art in artifacts:
+            queue.put_nowait(PrioritizedItem(self.estimate_complexity(art), art))
         return queue

     async def resource_monitor(self, queue, workers):
-        """Monitor system resources and adjust worker count while queue is not empty."""
-        while True:
-            # break if queue done
-            if queue.empty():
-                await asyncio.sleep(0.5)
-                if queue.empty():
-                    break
-
+        while not queue.empty():
             try:
-                available_memory = psutil.virtual_memory().available
-                worker_memory_bytes = self.memory_per_worker_gb * (1024 ** 3)
-                max_workers_by_memory = available_memory // worker_memory_bytes
-
-                # figure out how many workers we can sustain
-                # note: we also cap by self.max_workers
-                optimal_workers = min(psutil.cpu_count(), max_workers_by_memory, self.max_workers)
-
-                # ensure at least self.min_workers is used
-                optimal_workers = max(self.min_workers, optimal_workers)
-
-                current_worker_count = len(workers)
-
-                if optimal_workers > current_worker_count:
-                    # we can add more workers if queue is not empty
-                    diff = optimal_workers - current_worker_count
-                    for _ in range(diff):
-                        worker_id = len(workers)
-                        # create a new worker
-                        w = asyncio.create_task(self.worker(queue, worker_id))
-                        workers.append(w)
-                        self.logger.info(f"Added worker {worker_id}. Total workers: {len(workers)}")
-                elif optimal_workers < current_worker_count:
-                    # remove some workers
-                    diff = current_worker_count - optimal_workers
-                    for _ in range(diff):
-                        w = workers.pop()
-                        w.cancel()
-                        self.logger.info(f"Removed a worker. Total workers: {len(workers)}")
-
+                avail = psutil.virtual_memory().available
+                max_by_mem = avail // (self.memory_per_worker_gb * 2**30)
+                optimal = max(self.min_workers,
+                              min(psutil.cpu_count(), max_by_mem, self.max_workers))
+                async with self.workers_lock:
+                    current = len(workers)
+                    if optimal > current:
+                        for _ in range(optimal - current):
+                            wid = len(workers)
+                            workers.append(asyncio.create_task(self.worker(queue, wid)))
+                            self.logger.info(f"Added worker {wid}")
+                    elif optimal < current:
+                        for _ in range(current - optimal):
+                            w = workers.pop()
+                            w.cancel()
+                            self.logger.info("Removed a worker")
                 await asyncio.sleep(self.monitor_interval)
-
             except asyncio.CancelledError:
-                # monitor is being shut down
                 break
             except Exception as e:
-                self.logger.error(f"Error in resource_monitor: {e}")
+                self.logger.error(f"Monitor error: {e}")
                 await asyncio.sleep(self.monitor_interval)

     @asynccontextmanager
     async def artifact_lock(self, artifact):
-        lock = self.get_lock_for_artifact(artifact)
+        lock = await self.get_lock_for_artifact(artifact)
         try:
             await asyncio.wait_for(lock.acquire(), timeout=self.lock_acquire_timeout_seconds)
             yield
-        except asyncio.TimeoutError:
-            self.logger.error(f"Timeout acquiring lock for artifact: {artifact.__class__.__name__}")
-            yield  # continue but no actual lock was acquired
         finally:
             if lock.locked():
                 lock.release()

     async def async_update_artifact(self, artifact, **kwargs):
-        for attempt in range(self.retry_attempts):
+        for attempt in range(1, self.retry_attempts + 1):
+            lock = await self.get_lock_for_artifact(artifact)
             try:
-                async with self.artifact_lock(artifact):
-                    self.logger.info(
-                        f"Updating artifact: {artifact.__class__.__name__}, Attempt: {attempt + 1} of {self.retry_attempts}")
-                    start_time = time.time()
+                await asyncio.wait_for(lock.acquire(), timeout=self.lock_acquire_timeout_seconds)
+                try:
+                    self.logger.info(f"Updating {artifact.__class__.__name__} (attempt {attempt})")
                     await asyncio.wait_for(
                         asyncio.to_thread(artifact.update_parquet, **kwargs),
                         timeout=self.update_timeout_seconds
                     )
-                    elapsed_time = time.time() - start_time
+                    # mark success
+                    async with self.workers_lock:
+                        self.pending.discard(artifact)
+                        self.completed.add(artifact)
                     self.logger.info(
-                        f"Successfully updated artifact: {artifact.__class__.__name__} in {elapsed_time:.2f}s.")
+                        f" {artifact.__class__.__name__} done "
+                        f"{len(self.completed)}/{len(self.completed) + len(self.pending) + len(self.failed)} completed, "
+                        f"{len(self.failed)} failed"
+                    )
                     return
-
+                finally:
+                    if lock.locked():
+                        lock.release()
             except asyncio.TimeoutError:
-                self.logger.error(f"Timeout updating artifact {artifact.__class__.__name__}, Attempt: {attempt + 1}")
+                self.logger.warning(f"Timeout on {artifact.__class__.__name__}, attempt {attempt}")
             except Exception as e:
-                self.logger.error(
-                    f"Error updating artifact {artifact.__class__.__name__}, Attempt: {attempt + 1}: {e}")
-
-            # exponential backoff
-            await asyncio.sleep(2 ** attempt)
+                self.logger.error(f"Error on {artifact}: {e}")
+            finally:
+                if lock.locked():
+                    lock.release()
+            await asyncio.sleep(2 ** (attempt - 1))

-        self.logger.error(f"All retry attempts failed for artifact: {artifact.__class__.__name__}")
+        # all retries exhausted -> mark failure
+        async with self.workers_lock:
+            self.pending.discard(artifact)
+            self.failed.add(artifact)
+        self.logger.error(f"✖️ Permanently failed {artifact.__class__.__name__}")

     async def worker(self, queue, worker_id, **kwargs):
-        """A worker that dynamically pulls tasks from the queue."""
         while True:
             try:
-                prioritized_item = await queue.get()
-                if prioritized_item is None:
-                    break
-                artifact = prioritized_item.artifact
-                # heartbeat
+                item = await queue.get()
+                art = item.artifact
                 self.worker_heartbeat[worker_id] = time.time()
-
-                await self.async_update_artifact(artifact, **kwargs)
-
+                await self.async_update_artifact(art, **kwargs)
             except asyncio.CancelledError:
-                self.logger.info(f"Worker {worker_id} shutting down gracefully.")
+                self.logger.info(f"Worker {worker_id} stopped")
                 break
-            except Exception as e:
-                self.logger.error(f"Error in worker {worker_id}: {e}")
             finally:
                 queue.task_done()

-    async def process_tasks(self, queue, initial_workers, **kwargs):
-        """Start a set of workers and a resource monitor to dynamically adjust them."""
-        # create initial workers
-        workers = []
-        for worker_id in range(initial_workers):
-            w = asyncio.create_task(self.worker(queue, worker_id, **kwargs))
-            workers.append(w)
-
-        # start resource monitor
-        monitor_task = asyncio.create_task(self.resource_monitor(queue, workers))
-
-        # wait until queue is done
-        try:
-            await queue.join()
-        finally:
-            # cancel resource monitor
-            monitor_task.cancel()
-            # all workers done
-            for w in workers:
-                w.cancel()
-            await asyncio.gather(*workers, return_exceptions=True)
+    def calculate_initial_workers(self, count: int) -> int:
+        avail = psutil.virtual_memory().available
+        max_by_mem = avail // (self.memory_per_worker_gb * 2**30)
+        return max(self.min_workers,
+                   min(psutil.cpu_count(), max_by_mem, count, self.max_workers))

     async def update_data(self, data_type, **kwargs):
-        self.logger.info(f"Processing wrapper group: {data_type} with {kwargs}")
+        self.logger.info(f"Starting update for {data_type}")
         artifacts = self.get_artifacts(data_type)
         queue = self.prioritize_tasks(artifacts)
+        init = self.calculate_initial_workers(len(artifacts))
+        tasks = [asyncio.create_task(self.worker(queue, i, **kwargs)) for i in range(init)]
+        monitor = asyncio.create_task(self.resource_monitor(queue, tasks))
+        await queue.join()
+        monitor.cancel()
+        for t in tasks:
+            t.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
+        self.logger.info(self.format_results_table())
+        self.logger.info("All artifacts processed.")
+
+    def format_results_table(self):
+        results = self.get_update_status()
+        headers = ["Metric", "Value"]
+        rows = [
+            ["Total", results['total']],
+            ["Completed", results['completed']],
+            ["Pending", results['pending']],
+            ["Failed", results['failed']],
+            ["Pending Items", len(results['pending_items'])],
+            ["Failed Items", len(results['failed_items'])]
+        ]
+
+        # Find max lengths for alignment
+        max_metric = max(len(str(row[0])) for row in rows)
+        max_value = max(len(str(row[1])) for row in rows)

-        # compute initial worker count (this can be low if memory is low initially)
-        initial_workers = self.calculate_initial_workers(len(artifacts))
-        self.logger.info(f"Initial worker count: {initial_workers} for {len(artifacts)} artifacts")
+        format_str = "{:<%d} {:>%d}" % (max_metric, max_value)

-        total_start_time = time.time()
-        await self.process_tasks(queue, initial_workers, **kwargs)
-        total_time = time.time() - total_start_time
-        self.logger.info(f"Total processing time: {total_time:.2f} seconds.")
+        table = [
+            "\n",
+            format_str.format(*headers),
+            "-" * (max_metric + max_value + 2)
+        ]

-    def calculate_initial_workers(self, artifact_count: int) -> int:
-        """Compute the initial number of workers before resource_monitor can adjust."""
-        self.logger.info("Calculating initial worker count...")
-        available_memory = psutil.virtual_memory().available
-        self.logger.info(f"Available memory: {available_memory / (1024 ** 3):.2f} GB")
-        worker_memory_bytes = self.memory_per_worker_gb * (1024 ** 3)
-        self.logger.info(f"Memory per worker: {worker_memory_bytes / (1024 ** 3):.2f} GB")
-        max_workers_by_memory = available_memory // worker_memory_bytes
-        self.logger.info(f"Max workers by memory: {max_workers_by_memory}")
-        # also consider CPU count and artifact_count
-        initial = min(psutil.cpu_count(), max_workers_by_memory, artifact_count, self.max_workers)
-        self.logger.info(f"Optimal workers: {initial} CPU: {psutil.cpu_count()} Max Workers: {self.max_workers}")
-        return max(self.min_workers, initial)
+        for row in rows:
+            table.append(format_str.format(row[0], row[1]))
+
+        return "\n".join(table)
+
+    def get_update_status(self):
+        total = len(self.pending) + len(self.completed) + len(self.failed)
+        return {
+            "total": total,
+            "completed": len(self.completed),
+            "pending": len(self.pending),
+            "failed": len(self.failed),
+            "pending_items": [a.__class__.__name__ for a in self.pending],
+            "failed_items": [a.__class__.__name__ for a in self.failed]
+        }
+
+    # Top-level driver
+    # environment = None  # fill this in with your wrapped_classes dict
+    #
+    # async def main():
+    #     wrapper = ArtifactUpdaterMultiWrapper(
+    #         wrapped_classes=environment,
+    #         debug=True
+    #     )
+    #     loop = asyncio.get_running_loop()
+    #     for sig in (signal.SIGINT, signal.SIGTERM):
+    #         loop.add_signal_handler(sig, lambda: asyncio.create_task(wrapper.shutdown()))
+    #     await wrapper.update_data("your_data_type")
+    #
+    # if __name__ == "__main__":
+    #     asyncio.run(main())

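For context, the sizing rule now shared by `resource_monitor` and `calculate_initial_workers` caps the worker count by available memory (roughly 1 GiB per worker by default), CPU count, the number of artifacts, and `max_workers`, with `min_workers` as the floor. A minimal standalone sketch of that heuristic, assuming only that `psutil` is installed (the function name here is illustrative, not part of the package):

```python
import psutil

def plan_workers(artifact_count: int, min_workers: int = 1, max_workers: int = 3,
                 memory_per_worker_gb: int = 1) -> int:
    # how many workers the available RAM can sustain at ~1 GiB each
    max_by_memory = psutil.virtual_memory().available // (memory_per_worker_gb * 2**30)
    # stay within CPU count, pending work, and the configured ceiling; never below the floor
    return max(min_workers,
               min(psutil.cpu_count(), max_by_memory, artifact_count, max_workers))

print(plan_workers(artifact_count=10))
```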
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import asyncio
 import datetime
 import logging
@@ -6,10 +8,10 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional

 import dask.dataframe as dd
-from dask import delayed, compute
+import fsspec
 import pandas as pd
+from dask import delayed, compute
 from pydantic import BaseModel
-import fsspec

 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
 from sibi_dst.utils import Logger
@@ -45,7 +47,7 @@ class DfHelper:
     :ivar df: The DataFrame currently being processed or loaded.
     :type df: Union[dd.DataFrame, pd.DataFrame]
     :ivar backend_django: Configuration for interacting with Django database backends.
-    :type backend_django: Optional[DjangoConnectionConfig]
+    :type backend_connection: Optional[DjangoConnectionConfig]
     :ivar _backend_query: Internal configuration for query handling.
     :type _backend_query: Optional[QueryConfig]
     :ivar _backend_params: Internal parameters configuration for DataFrame handling.
@@ -54,8 +56,6 @@ class DfHelper:
     :type backend_parquet: Optional[ParquetConfig]
     :ivar backend_http: Configuration for interacting with HTTP-based backends.
     :type backend_http: Optional[HttpConfig]
-    :ivar backend_sqlalchemy: Configuration for interacting with SQLAlchemy-based databases.
-    :type backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig]
     :ivar parquet_filename: The filename for a Parquet file, if applicable.
     :type parquet_filename: str
     :ivar logger: Logger instance used for debugging and information logging.
@@ -64,12 +64,11 @@ class DfHelper:
     :type default_config: Dict
     """
     df: Union[dd.DataFrame, pd.DataFrame] = None
-    backend_django: Optional[DjangoConnectionConfig] = None
+    backend_db_connection: Optional[Union[DjangoConnectionConfig | SqlAlchemyConnectionConfig]] = None
     _backend_query: Optional[QueryConfig] = None
     _backend_params: Optional[ParamsConfig] = None
     backend_parquet: Optional[ParquetConfig] = None
     backend_http: Optional[HttpConfig] = None
-    backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig] = None
     parquet_filename: str = None
     logger: Logger
     default_config: Dict = None
@@ -91,7 +90,7 @@ class DfHelper:
         self.filesystem_options = kwargs.pop('filesystem_options', {})
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
-        self.fs =kwargs.setdefault("fs", fsspec.filesystem('file'))
+        self.fs = kwargs.setdefault("fs", fsspec.filesystem('file'))
         self.__post_init(**kwargs)

     def __str__(self):
@@ -100,6 +99,34 @@ class DfHelper:
     def __call__(self, **options):
         return self.load(**options)

+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.__cleanup()
+        return False
+
+    def __cleanup(self):
+        """
+        Clean up resources when exiting the context manager.
+        This method is called when the context manager exits.
+        """
+
+        if self.backend_db_connection:
+            if getattr(self.backend_db_connection, "dispose_idle_connections", None):
+                self.backend_db_connection.dispose_idle_connections()
+            if getattr(self.backend_db_connection, "close", None):
+                self.backend_db_connection.close()
+
+        self.backend_db_connection = None
+
+        if self.backend_parquet:
+            self.backend_parquet = None
+        if self.backend_http:
+            self.backend_http = None
+        self._backend_query = None
+        self._backend_params = None
+
     def __post_init(self, **kwargs):
         """
         Initializes backend-specific configurations based on the provided backend type and other
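The new `__enter__`/`__exit__` pair lets `DfHelper` be used as a context manager, with `__cleanup()` disposing idle connections, closing the backend connection, and clearing cached configs on exit. A hypothetical usage sketch; the import path and the connection-related keyword arguments are assumptions, not confirmed by this diff:

```python
from sibi_dst.df_helper import DfHelper  # import path assumed

# backend-specific connection kwargs omitted; pass whatever your backend expects
with DfHelper(backend='sqlalchemy', as_pandas=True, debug=False) as helper:
    df = helper.load()  # with as_pandas=True, load() returns a computed pandas DataFrame
# on exit, __exit__ -> __cleanup() releases the database connection and resets the configs
```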
@@ -111,20 +138,19 @@ class DfHelper:
         Additional parameters for specific backend types are extracted here.
         :return: None
         """
-        self.logger.debug(f"backend used: {self.backend}")
-        self.logger.debug(f"kwargs passed to backend plugins: {kwargs}")
+        # self.logger.debug(f"backend used: {self.backend}")
+        # self.logger.debug(f"kwargs passed to backend plugins: {kwargs}")
         self._backend_query = self.__get_config(QueryConfig, kwargs)
         self._backend_params = self.__get_config(ParamsConfig, kwargs)
         if self.backend == 'django_db':
-            self.backend_django = self.__get_config(DjangoConnectionConfig, kwargs)
+            self.backend_db_connection = self.__get_config(DjangoConnectionConfig, kwargs)
         elif self.backend == 'parquet':
             self.parquet_filename = kwargs.setdefault("parquet_filename", None)
             self.backend_parquet = ParquetConfig(**kwargs)
         elif self.backend == 'http':
             self.backend_http = HttpConfig(**kwargs)
         elif self.backend == 'sqlalchemy':
-            self.backend_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
-
+            self.backend_db_connection = self.__get_config(SqlAlchemyConnectionConfig, kwargs)

     def __get_config(self, model: [T], kwargs: Dict[str, Any]) -> Union[T]:
         """
@@ -134,11 +160,13 @@ class DfHelper:
         :param kwargs: The dictionary of keyword arguments.
         :return: The initialized Pydantic model instance.
         """
+        kwargs.setdefault("debug", self.debug)
+        kwargs.setdefault("logger", self.logger)
         # Extract keys that the model can accept
         recognized_keys = set(model.__annotations__.keys())
         self.logger.debug(f"recognized keys: {recognized_keys}")
         model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
-        self.logger.debug(f"model_kwargs: {model_kwargs}")
+        # self.logger.debug(f"model_kwargs: {model_kwargs}")
         return model(**model_kwargs)

     def load_parallel(self, **options):
@@ -171,10 +199,10 @@ class DfHelper:
         `as_pandas` is set to True, or kept in its native backend format otherwise.
         """
         # this will be the universal method to load data from a df irrespective of the backend
-        df = self.__load(**options)
+        self.df = self.__load(**options)
         if self.as_pandas:
-            return df.compute()
-        return df
+            return self.df.compute()
+        return self.df

     def __load(self, **options):
         """
@@ -196,7 +224,7 @@ class DfHelper:
         """
         if self.backend == 'django_db':
             self._backend_params.parse_params(options)
-            return self.__load_from_db(**options)
+            return self.__load_from_django_db(**options)
         elif self.backend == 'sqlalchemy':
             self._backend_params.parse_params(options)
             return self.__load_from_sqlalchemy(**options)
@@ -227,7 +255,7 @@ class DfHelper:
         try:
             options.setdefault("debug", self.debug)
             db_loader = SqlAlchemyLoadFromDb(
-                self.backend_sqlalchemy,
+                self.backend_db_connection,
                 self._backend_query,
                 self._backend_params,
                 self.logger,
@@ -236,6 +264,7 @@ class DfHelper:
             self.df = db_loader.build_and_load()
             self.__process_loaded_data()
             self.__post_process_df()
+            self.backend_db_connection.close()
             self.logger.debug("Data successfully loaded from sqlalchemy database.")
         except Exception as e:
             self.logger.debug(f"Failed to load data from sqlalchemy database: {e}: options: {options}")
@@ -243,7 +272,7 @@ class DfHelper:

         return self.df

-    def __load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    def __load_from_django_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """
         Loads data from a Django database using a specific backend query mechanism. Processes the loaded data
         and applies further post-processing before returning the dataframe. If the operation fails, an
@@ -258,7 +287,7 @@ class DfHelper:
         try:
             options.setdefault("debug", self.debug)
             db_loader = DjangoLoadFromDb(
-                self.backend_django,
+                self.backend_db_connection,
                 self._backend_query,
                 self._backend_params,
                 self.logger,
@@ -307,6 +336,7 @@ class DfHelper:
         :raises ValueError: If the lengths of `fieldnames` and `column_names` do not match,
             or if the specified `index_col` is not found in the DataFrame.
         """
+        self.logger.debug("Post-processing DataFrame.")
         df_params = self._backend_params.df_params
         fieldnames = df_params.get("fieldnames", None)
         index_col = df_params.get("index_col", None)
@@ -357,16 +387,16 @@ class DfHelper:

         :return: None
         """
-        self.logger.debug(f"Type of self.df: {type(self.df)}")
+        self.logger.debug(f"Processing loaded data...")
         if self.df.map_partitions(len).compute().sum() > 0:
             field_map = self._backend_params.field_map or {}
-            if isinstance(field_map, dict):
+            if isinstance(field_map, dict) and field_map != {}:
                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
                 missing_columns = [k for k in field_map.keys() if k not in self.df.columns]

                 if missing_columns:
                     self.logger.warning(
-                        f"The following columns in field_map are not in the DataFrame: {missing_columns}")
+                        f"The following columns in field_map are not in the DataFrame: {missing_columns}, field map: {field_map}")

                 def rename_columns(df, mapping):
                     return df.rename(columns=mapping)
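For reference, the `field_map` handling here builds a rename mapping from only the keys that exist as columns and, in the next hunk, applies it to every partition with `map_partitions`. A small self-contained illustration of that pattern with made-up column names:

```python
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"old_a": [1, 2], "old_b": [3, 4]})
ddf = dd.from_pandas(pdf, npartitions=1)

field_map = {"old_a": "a", "old_b": "b", "missing": "c"}
# only rename columns that are actually present, as __process_loaded_data does
rename_mapping = {k: v for k, v in field_map.items() if k in ddf.columns}

ddf = ddf.map_partitions(lambda part: part.rename(columns=rename_mapping))
print(list(ddf.compute().columns))  # ['a', 'b']
```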
@@ -376,6 +406,8 @@ class DfHelper:
                 self.df = self.df.map_partitions(rename_columns, mapping=rename_mapping)

             self.logger.debug("Processing of loaded data completed.")
+        else:
+            self.logger.debug("DataFrame is empty, skipping processing.")

     def save_to_parquet(self, parquet_filename: Optional[str] = None, **kwargs):
         """
@@ -536,14 +568,14 @@ class DfHelper:

         # Common logic for Django and SQLAlchemy
         if self.backend == 'django_db':
-            model_fields = {field.name: field for field in self.backend_django.model._meta.get_fields()}
+            model_fields = {field.name: field for field in self.backend_db_connection.model._meta.get_fields()}
             if mapped_field not in model_fields:
                 raise ValueError(f"Field '{dt_field}' does not exist in the Django model.")
             field_type = type(model_fields[mapped_field]).__name__
             is_date_field = field_type == 'DateField'
             is_datetime_field = field_type == 'DateTimeField'
         elif self.backend == 'sqlalchemy':
-            model = self.backend_sqlalchemy.model
+            model = self.backend_db_connection.model
             fields = [column.name for column in model.__table__.columns]
             if mapped_field not in fields:
                 raise ValueError(f"Field '{dt_field}' does not exist in the SQLAlchemy model.")