sibi-dst 2025.1.4-py3-none-any.whl → 2025.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +4 -1
- sibi_dst/df_helper/__init__.py +2 -2
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +355 -163
- sibi_dst/df_helper/_df_helper.py +47 -30
- sibi_dst/df_helper/_parquet_artifact.py +41 -53
- sibi_dst/df_helper/_parquet_reader.py +11 -16
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +2 -1
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +15 -11
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +23 -16
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -11
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +1 -103
- sibi_dst/utils/__init__.py +3 -2
- sibi_dst/utils/base.py +117 -0
- sibi_dst/utils/clickhouse_writer.py +7 -5
- sibi_dst/utils/data_wrapper.py +64 -89
- sibi_dst/utils/date_utils.py +2 -1
- sibi_dst/utils/log_utils.py +309 -77
- sibi_dst/utils/manifest_manager.py +94 -373
- sibi_dst/utils/parquet_saver.py +98 -173
- sibi_dst/utils/storage_config.py +6 -0
- sibi_dst/utils/storage_manager.py +2 -1
- sibi_dst/utils/update_planner.py +75 -25
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.6.dist-info}/METADATA +4 -1
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.6.dist-info}/RECORD +25 -28
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +0 -91
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.6.dist-info}/WHEEL +0 -0
@@ -1,13 +1,17 @@
+from __future__ import annotations
+
+from typing import Any
+
 import dask.dataframe as dd
 import pandas as pd

+from sibi_dst.utils import ManagedResource
 from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
-from sibi_dst.utils import Logger
 from ._db_connection import SqlAlchemyConnectionConfig
 from ._io_dask import SQLAlchemyDask


-class SqlAlchemyLoadFromDb:
+class SqlAlchemyLoadFromDb(ManagedResource):
     """
     Orchestrates loading data from a database using SQLAlchemy into a Dask
     DataFrame by configuring and delegating to the SQLAlchemyDask loader.
@@ -18,7 +22,6 @@ class SqlAlchemyLoadFromDb:
         plugin_sqlalchemy: SqlAlchemyConnectionConfig,
         plugin_query: QueryConfig = None,
         plugin_params: ParamsConfig = None,
-        logger: Logger = None,
         **kwargs,
     ):
         """
@@ -31,16 +34,16 @@ class SqlAlchemyLoadFromDb:
             logger: An optional logger instance.
             **kwargs: Must contain 'index_column' for Dask partitioning.
         """
+        super().__init__(**kwargs)
         self.db_connection = plugin_sqlalchemy
         self.model = self.db_connection.model
         self.engine = self.db_connection.engine
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.debug = kwargs.get("debug", False)
         self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
+        self.total_records = -1  # Initialize total_records to -1 to indicate no records loaded yet

-    def build_and_load(self) -> dd.DataFrame:
+    def build_and_load(self) -> tuple[int | Any, Any] | dd.DataFrame:
         """
         Builds and loads a Dask DataFrame from a SQLAlchemy source.

@@ -58,17 +61,20 @@ class SqlAlchemyLoadFromDb:
                 engine=self.engine,
                 chunk_size=self.chunk_size,
                 logger=self.logger,
+                verbose=self.verbose,
                 debug=self.debug
             )
-            # Create the lazy DataFrame
-
-
+            # Create the lazy DataFrame and read a record count
+            # if total_records less than 0, it means an error occurred during the loading process
+            self.total_records, dask_df = sqlalchemy_dask_loader.read_frame()
+            return self.total_records, dask_df


         except Exception as e:
-            self.
+            self.total_records = -1
+            self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
             # Return an empty dataframe with the correct schema on failure
             columns = [c.name for c in self.model.__table__.columns]
-            return dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
+            return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)


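Callers migrating from 2025.1.4 should note that build_and_load() now returns a (total_records, dataframe) tuple instead of a bare Dask DataFrame, and that a negative count signals a load failure. A minimal, hypothetical usage sketch (connection_config, query_config and params_config are placeholder objects, not names taken from this diff):

    loader = SqlAlchemyLoadFromDb(
        plugin_sqlalchemy=connection_config,   # a prepared SqlAlchemyConnectionConfig
        plugin_query=query_config,
        plugin_params=params_config,
        index_column="id",                     # required for Dask partitioning per the docstring
        debug=True,
    )
    total_records, df = loader.build_and_load()
    if total_records < 0:
        # An exception was caught; df is an empty frame carrying the model's column schema.
        print("Load failed; received a schema-only empty frame")
    else:
        print(f"Loaded {total_records} records into {df.npartitions} partitions")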
sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py
CHANGED
@@ -54,8 +54,6 @@ class SqlAlchemyModelBuilder:
             The dynamically created ORM model class.
         """
         with self._lock:
-            # ✅ REFACTOR: Add a comment acknowledging the risk of using an
-            # internal API. This is a maintenance warning for future developers.
             # NOTE: Using a private SQLAlchemy API. This is a performance
             # optimization but may break in future versions of the library.
             registered_model = Base.registry._class_registry.get(self.class_name)
@@ -103,104 +101,4 @@ class SqlAlchemyModelBuilder:
             return f"{sane_name}_field"
         return sane_name

-
-# import keyword
-# import threading
-# from sqlalchemy import MetaData, Engine
-# from sqlalchemy.orm import DeclarativeBase
-#
-#
-#
-# class Base(DeclarativeBase):
-#     """shared declarative base for all ORM models."""
-#     pass
-#
-#
-# apps_label = "datacubes.models"
-#
-#
-# class SqlAlchemyModelBuilder:
-#     """
-#     Builds a single SQLAlchemy ORM model from a specific database table.
-#     This class is thread-safe and caches reflected table metadata to
-#     improve performance across multiple instantiations.
-#     """
-#     _lock = threading.Lock()
-#     _metadata_cache: dict[str, MetaData] = {}
-#
-#     def __init__(self, engine: Engine, table_name: str):
-#         """
-#         Initializes the model builder for a specific table.
-#
-#         Args:
-#             engine: The SQLAlchemy engine connected to the database.
-#             table_name: The name of the table to generate the model for.
-#         """
-#         self.engine = engine
-#         self.table_name = table_name
-#         self.class_name = self._normalize_class_name(self.table_name)
-#
-#         # Use or create a cached MetaData object for this engine to avoid
-#         # re-reading the schema for tables that are already known.
-#         engine_key = str(engine.url)
-#         if engine_key not in self._metadata_cache:
-#             self._metadata_cache[engine_key] = MetaData()
-#         self.metadata = self._metadata_cache[engine_key]
-#
-#     def build_model(self) -> type:
-#         """
-#         Builds and returns a database model class for the specified table.
-#         This process is atomic and thread-safe.
-#
-#         Raises:
-#             ValueError: If the specified table does not exist in the database.
-#         Returns:
-#             The dynamically created ORM model class.
-#         """
-#         with self._lock:
-#             # First, check if the model class is already registered in SQLAlchemy
-#             registered_model = Base.registry._class_registry.get(self.class_name)
-#             if registered_model:
-#                 return registered_model
-#
-#             # Next, check if the table's schema is in our metadata cache
-#             table = self.metadata.tables.get(self.table_name)
-#
-#             # If not cached, reflect it from the database
-#             if table is None:
-#                 self.metadata.reflect(bind=self.engine, only=[self.table_name])
-#                 table = self.metadata.tables.get(self.table_name)
-#
-#             if table is None:
-#                 raise ValueError(
-#                     f"Table '{self.table_name}' does not exist in the database."
-#                 )
-#
-#             # Create the model class dynamically.
-#             # No need to add columns manually; __table__ handles it.
-#             attrs = {
-#                 "__tablename__": table.name,
-#                 "__table__": table,
-#                 "__module__": apps_label,
-#             }
-#             model = type(self.class_name, (Base,), attrs)
-#
-#             return model
-#
-#     @staticmethod
-#     def _normalize_class_name(table_name: str) -> str:
-#         """Converts a snake_case table_name to a CamelCase class name."""
-#         return "".join(word.capitalize() for word in table_name.split("_"))
-#
-#     @staticmethod
-#     def _normalize_column_name(column_name: str) -> str:
-#         """
-#         Sanitizes a column name to be a valid Python identifier.
-#         (Kept for utility, though not used in the final model creation).
-#         """
-#         sane_name = re.sub(r"\W", "_", column_name)
-#         sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
-#
-#         if keyword.iskeyword(sane_name):
-#             return f"{sane_name}_field"
-#         return sane_name
+
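The large block removed above was a commented-out reference copy of SqlAlchemyModelBuilder; its docstrings still describe how the live builder is meant to be used. A hypothetical sketch based on that removed reference (the engine URL and table name are placeholders, and the live class may differ slightly from the commented-out copy):

    from sqlalchemy import create_engine

    engine = create_engine("sqlite:///example.db")      # placeholder DSN
    builder = SqlAlchemyModelBuilder(engine, "orders")   # placeholder table name
    OrdersModel = builder.build_model()                  # dynamically created ORM class, cached per class name
    print(OrdersModel.__tablename__)                     # -> "orders"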
sibi_dst/utils/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from .log_utils import Logger
+from .base import ManagedResource
 from .date_utils import *
 from .data_utils import DataUtils
 from .file_utils import FileUtils
@@ -20,6 +21,7 @@ from .manifest_manager import MissingManifestManager

 __all__ = [
     "Logger",
+    "ManagedResource",
     "ConfigManager",
     "ConfigLoader",
     "DateUtils",
@@ -38,6 +40,5 @@ __all__ = [
     "FsRegistry",
     "DataFromHttpSource",
     "WebDAVClient",
-    "MissingManifestManager"
+    "MissingManifestManager"
 ]
-
sibi_dst/utils/base.py
ADDED
@@ -0,0 +1,117 @@
+import asyncio
+from .log_utils import Logger
+
+class ManagedResource:
+    """
+    A base class providing context management for resources like loggers and filesystems.
+
+    It handles the creation and cleanup of these resources, ensuring they are only
+    closed if they were created by the instance itself.
+    """
+
+    def __init__(self, **kwargs):
+        self.debug = kwargs.get("debug", False)
+        self.verbose = kwargs.get("verbose", False)
+
+        # --- Logger Management (Refactored) ---
+        logger = kwargs.get("logger")
+        if logger:
+            # An existing logger instance was provided by the user
+            self.logger = logger
+            self._own_logger = False
+            self.logger.debug(f"'{self.__class__.__name__}' is tapping into an existing logger.")
+        else:
+            # No pre-configured logger, so we will create and "own" a new one.
+            self._own_logger = True
+            logger_config = kwargs.get("logger_config", {})
+
+            # Set default logger_name if not specified in the config
+            logger_config.setdefault("logger_name", self.__class__.__name__)
+
+            # Set log_level based on debug flag, but respect user-provided level
+            default_level = Logger.DEBUG if self.debug else Logger.INFO
+            logger_config.setdefault("log_level", default_level)
+
+            # Create the logger using the provided or default configuration
+            self.logger = Logger.default_logger(**logger_config)
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is starting its own logger.")
+
+        fs = kwargs.get("fs")
+        self._own_fs = fs is None
+        self.fs = fs or None  # we want to allow None as a valid fs to trigger a failure if needed
+
+        self._entered = False
+
+    def __enter__(self):
+        """Enter the runtime context."""
+        self._entered = True
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Exit the runtime context and trigger cleanup."""
+        self.cleanup()
+        return False  # Propagate exceptions
+
+    # --- Asynchronous Context Management ---
+
+    async def __aenter__(self):
+        """Enter the runtime context for 'async with' statements."""
+        self._entered = True
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Exit the runtime context and trigger cleanup for 'async with' statements."""
+        await self.acleanup()
+        return False  # Propagate exceptions
+
+    def __repr__(self) -> str:
+        """Return an unambiguous string representation of the ManagedResource."""
+        # Dynamically get the name of the class or subclass
+        class_name = self.__class__.__name__
+
+        # Determine the status of the logger and filesystem
+        logger_status = "own" if self._own_logger else "external"
+        fs_status = "own" if self._own_fs else "external"
+
+        return (
+            f"<{class_name} debug={self.debug}, "
+            f"logger='{logger_status}', fs='{fs_status}'>"
+        )
+
+    def cleanup(self):
+        """
+        Cleanup resources managed by this instance.
+        """
+        if self._own_fs and hasattr(self.fs, "clear_instance_cache"):
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is clearing its own filesystem cache.")
+            self.fs.clear_instance_cache()
+
+        if self._own_logger and hasattr(self.logger, "shutdown"):
+            # Ensure the logger exists before trying to use or shut it down
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is shutting down its own logger.")
+            self.logger.shutdown()
+            self.logger = None  # Set to None after shutdown
+
+        self._entered = False
+
+    async def acleanup(self):
+        """
+        Async Cleanup resources managed by this instance.
+        """
+        if self._own_fs and hasattr(self.fs, "clear_instance_cache"):
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is clearing its own filesystem cache.")
+            self.fs.clear_instance_cache()
+
+        if self._own_logger and hasattr(self.logger, "shutdown"):
+            # Ensure the logger exists before trying to use or shut it down
+            if self.logger:
+                self.logger.debug(f"'{self.__class__.__name__}' is shutting down its own logger.")
+            self.logger.shutdown()
+            self.logger = None  # Set to None after shutdown
+
+        self._entered = False
+
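The new ManagedResource base class centralizes logger and filesystem ownership for the classes refactored in this release (SqlAlchemyLoadFromDb, ClickHouseWriter, DataWrapper, among others). A minimal, hypothetical subclass showing the intended pattern: an externally supplied fs is left alone on cleanup, while an internally created logger is shut down (when it exposes a shutdown() method) as the context exits. LocalArtifact is illustrative and not part of the package.

    import fsspec
    from sibi_dst.utils import ManagedResource

    class LocalArtifact(ManagedResource):
        """Illustrative subclass; inherits logger/fs wiring from ManagedResource."""
        def touch(self, path: str):
            self.logger.info(f"Touching {path}")   # logger created or reused by the base class
            self.fs.touch(path)                    # fs supplied by the caller

    fs = fsspec.filesystem("file")
    with LocalArtifact(fs=fs, debug=True) as artifact:   # external fs -> not cleaned up
        artifact.touch("/tmp/example.txt")
    print(repr(artifact))   # e.g. <LocalArtifact debug=True, logger='own', fs='external'>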
sibi_dst/utils/clickhouse_writer.py
CHANGED
@@ -1,14 +1,15 @@
 from concurrent.futures import ThreadPoolExecutor
+from typing import ClassVar, Dict

 import clickhouse_connect
 import pandas as pd
 from clickhouse_driver import Client
 import dask.dataframe as dd

-from .
+from . import ManagedResource


-class ClickHouseWriter:
+class ClickHouseWriter(ManagedResource):
     """
     Provides functionality to write a Dask DataFrame to a ClickHouse database using
     a specified schema. This class handles the creation of tables, schema generation,
@@ -36,7 +37,7 @@ class ClickHouseWriter:
     :ivar order_by: Field or column name to use for table ordering.
     :type order_by: str
     """
-    dtype_to_clickhouse = {
+    dtype_to_clickhouse: ClassVar[Dict[str, str]] = {
         'int64': 'Int64',
         'int32': 'Int32',
         'float64': 'Float64',
@@ -48,7 +49,8 @@ class ClickHouseWriter:
     }
     df: dd.DataFrame

-    def __init__(self,
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
         self.clickhouse_host = kwargs.setdefault('host', "localhost")
         self.clickhouse_port = kwargs.setdefault('port', 8123)
         self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
@@ -56,7 +58,7 @@ class ClickHouseWriter:
         self.clickhouse_password = kwargs.setdefault('password', '')
         self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')

-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        #self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.client = None
         self.order_by = kwargs.setdefault('order_by', 'id')

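Since ClickHouseWriter now takes its entire configuration through **kwargs and inherits context management from ManagedResource, construction can look like the hypothetical sketch below. The connection values are placeholders based on the defaults shown above, and the actual write/save method is not shown in this hunk.

    with ClickHouseWriter(
        host="clickhouse.local",   # defaults to "localhost"
        port=8123,
        database="sibi_data",
        table="events",
        order_by="id",
        debug=True,                # consumed by ManagedResource for the log level
    ) as writer:
        ...                        # invoke the writer's save/write entry point here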
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -3,26 +3,27 @@ import logging
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Type, Any, Dict, Optional, Union, List
+from typing import Type, Any, Dict, Optional, Union, List, ClassVar

 import fsspec
 import pandas as pd
 from tqdm import tqdm

+from . import ManagedResource
 from .log_utils import Logger
 from .parquet_saver import ParquetSaver


-class DataWrapper:
-    DEFAULT_PRIORITY_MAP = {
+class DataWrapper(ManagedResource):
+    DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
         "overwrite": 1,
         "missing_in_history": 2,
         "existing_but_stale": 3,
         "missing_outside_history": 4,
         "file_is_recent": 0
     }
-    DEFAULT_MAX_AGE_MINUTES = 1440
-    DEFAULT_HISTORY_DAYS_THRESHOLD = 30
+    DEFAULT_MAX_AGE_MINUTES: int = 1440
+    DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30

     def __init__(
         self,
@@ -30,26 +31,20 @@ class DataWrapper:
         date_field: str,
         data_path: str,
         parquet_filename: str,
-        fs: Optional[fsspec.AbstractFileSystem] = None,
-        debug: bool = False,
-        verbose: bool = False,
         class_params: Optional[Dict] = None,
         load_params: Optional[Dict] = None,
-        logger: Logger = None,
         show_progress: bool = False,
         timeout: float = 30,
         max_threads: int = 3,
         **kwargs: Any,
     ):
+        super().__init__(**kwargs)
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self._ensure_forward_slash(data_path)
         self.parquet_filename = parquet_filename
-        self.fs
-
-        self.verbose = verbose
-        self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
-        self.logger.set_level(logging.DEBUG if debug else logging.INFO)
+        if self.fs is None:
+            raise ValueError("Datawrapper requires a File system (fs) to be provided .")
         self.show_progress = show_progress
         self.timeout = timeout
         self.max_threads = max_threads
@@ -66,25 +61,15 @@ class DataWrapper:
         self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
         self.mmanifest = kwargs.get("mmanifest", None)
         self.update_planner=kwargs.get("update_planner", None)
-        self.datacls = self.dataclass(**self.class_params)

-    def __enter__(self):
-        """Context manager entry"""
-        return self

     def __exit__(self, exc_type, exc_val, exc_tb):
         """Context manager exit"""
-        if self.mmanifest
+        if self.mmanifest:
             self.mmanifest.save()
-
-        if exc_type is not None:
-            self.logger.error(f"Exception occurred: {exc_val}")
+        super().__exit__(exc_type, exc_val, exc_tb)
         return False

-    def _init_filesystem(self) -> fsspec.AbstractFileSystem:
-        with self._lock:
-            return fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-
     @staticmethod
     def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
         if isinstance(date, datetime.date):
@@ -101,78 +86,68 @@ class DataWrapper:
     def process(self, max_retries: int = 3):
         """Process updates with priority-based execution, retries, benchmarking and progress updates"""
         overall_start = time.perf_counter()
-
-
-
-        if plan_count == 0:
-            self.logger.info("No updates required")
+        tasks = list(self.update_planner.get_tasks_by_priority())
+        if not tasks:
+            self.logger.info("No updates required based on the current plan.")
             return
-        self.logger.info(f"Update plan for {self.dataclass.__name__} includes {plan_count} items for update")

-        if self.
+        if self.update_planner.show_progress:
             self.update_planner.show_update_plan()

-        for priority in
-            self.
+        for priority, dates in tasks:
+            self._execute_task_batch(priority, dates, max_retries)

         total_time = time.perf_counter() - overall_start
-
-
-        self.logger.info(
-
-            f"(avg {total_time / processed:.1f}s per date)"
-        )
-        if self.show_progress or self.verbose:
+        if self.processed_dates:
+            count = len(self.processed_dates)
+            self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
+        if self.update_planner.show_progress:
             self.show_benchmark_summary()

-
-
-
-        priority: int,
-        max_retries: int
-    ):
-        """Process a single priority group with parallel execution and timing"""
-        dates = plan[plan["update_priority"] == priority]["date"].tolist()
-        if not dates:
-            return
+
+    def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
+        """Executes a single batch of tasks (dates) using a thread pool."""
         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
-        self.logger.debug(f"Starting {desc.lower()}")
-        group_start = time.perf_counter()
         max_thr = min(len(dates), self.max_threads)
-        self.logger.
+        self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
+
         with ThreadPoolExecutor(max_workers=max_thr) as executor:
             futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
-
-
+            iterator = as_completed(futures)
+            if self.show_progress:
+                iterator = tqdm(iterator, total=len(futures), desc=desc)
+
+            for future in iterator:
                 try:
                     future.result(timeout=self.timeout)
                 except Exception as e:
-                    self.logger.error(f"Permanent failure
-        group_time = time.perf_counter() - group_start
-        self.logger.info(f"Priority {priority} group processed {len(dates)} dates in {group_time:.1f}s")
+                    self.logger.error(f"Permanent failure for {futures[future]}: {e}")

     def _process_date_with_retry(self, date: datetime.date, max_retries: int):
-
+        """Wrapper to apply retry logic to single date processing."""
+        for attempt in range(max_retries):
             try:
                 self._process_single_date(date)
                 return
             except Exception as e:
-                if attempt < max_retries:
-                    self.logger.warning(f"Retry {attempt}/{max_retries} for {date}: {e}")
+                if attempt < max_retries - 1:
+                    self.logger.warning(f"Retry {attempt + 1}/{max_retries} for {date}: {e}")
+                    time.sleep(2 ** attempt)  # Exponential backoff
                 else:
-
+                    self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
+                    #raise

     def _process_single_date(self, date: datetime.date):
         """Core date processing logic with load/save timing and thread reporting"""
         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
         self.logger.debug(f"Processing date {date.isoformat()} for {path}")
         if path in self.update_planner.skipped and self.update_planner.ignore_missing:
-            self.logger.
+            self.logger.debug(f"Skipping {date} as it exists in the skipped list")
             return
         full_path = f"{path}{self.parquet_filename}"

-        thread_name = threading.current_thread().name
-        self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
+        #thread_name = threading.current_thread().name
+        #self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")

         overall_start = time.perf_counter()
         try:
@@ -180,30 +155,30 @@ class DataWrapper:
             date_filter = {f"{self.date_field}__date": {date.isoformat()}}
             self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
             # Load data using the dataclass with the provided date filter
-
-
+            # Create a copy to avoid mutating the shared instance dictionary
+            local_load_params = self.load_params.copy()
+            local_load_params.update(date_filter)
+            local_class_instance = self.dataclass(**self.class_params)
+            df=local_class_instance.load(**local_load_params)
             load_time = time.perf_counter() - load_start
-            if df.head(1, compute=True).empty:
-                if self.mmanifest:
-                    schema = df._meta.dtypes.astype(str).to_dict()
-                    self.mmanifest.record(
-                        full_path=path
-                    )
-                self.logger.info(f"No data found for {date}. Logged to missing manifest.")
-                return
-            # Dask-compatible empty check
-            # if len(df.index) == 0:
-            #     self.logger.warning(f"No data found for {date}")
-            #     return

+            if hasattr(local_class_instance, "total_records"):
+                self.logger.debug(f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
+                if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
+                    if self.mmanifest:
+                        self.mmanifest.record(
+                            full_path=path
+                        )
+                    self.logger.info(f"No data found for {date}. Logged to missing manifest.")
+                    return
             save_start = time.perf_counter()
-            with
-
-
-
-
-
+            with ParquetSaver(
+                df_result=df,
+                parquet_storage_path=path,
+                fs=self.fs,
+                logger=self.logger
+            ) as ps:
+                ps.save_to_parquet(self.parquet_filename, overwrite=True)
             save_time = time.perf_counter() - save_start

             total_time = time.perf_counter() - overall_start
@@ -233,4 +208,4 @@ class DataWrapper:
             return
         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
-        self.logger.info("Benchmark Summary:\n" + df_bench.to_string())
+        self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
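DataWrapper now requires an fsspec filesystem, takes debug, verbose, logger and fs through the ManagedResource kwargs, and reads mmanifest and update_planner directly from **kwargs. A hypothetical usage sketch (MyDatasetHelper and planner are placeholders, not names from this diff):

    import fsspec

    fs = fsspec.filesystem("file")
    with DataWrapper(
        dataclass=MyDatasetHelper,          # helper class exposing .load() and .total_records
        date_field="created_at",
        data_path="/data/my_dataset/",
        parquet_filename="my_dataset.parquet",
        fs=fs,                              # omitting this now raises ValueError
        class_params={},
        load_params={},
        show_progress=True,
        update_planner=planner,             # must provide get_tasks_by_priority()
        debug=True,
    ) as wrapper:
        wrapper.process(max_retries=3)      # per-date retries with exponential backoff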
sibi_dst/utils/date_utils.py
CHANGED
@@ -29,8 +29,9 @@ class DateUtils:
     """
     _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]] = {}

-    def __init__(self, logger=None):
+    def __init__(self, logger=None, debug=False):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.debug = debug

     @classmethod
     def _ensure_date(cls, value: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date: