sibi-dst 2025.8.1__py3-none-any.whl → 2025.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,10 @@
 import datetime
+import random
 import threading
 import time
-import random
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Type, Any, Dict, Optional, Union, List, ClassVar
 
-import dask.dataframe as dd
 import pandas as pd
 from tqdm import tqdm
 
@@ -61,12 +60,21 @@ class DataWrapper(ManagedResource):
         self.mmanifest = kwargs.get("mmanifest", None)
         self.update_planner = kwargs.get("update_planner", None)
 
+        # --- NEW: stop gate tripped during cleanup/interrupt to block further scheduling/retries
+        self._stop_event = threading.Event()
+        self.extra_logger = {"action_module_name": "data_wrapper", "dataclass": self.dataclass.__name__}
+
+    # ensure manifest is saved on context exit
     def __exit__(self, exc_type, exc_val, exc_tb):
         if self.mmanifest:
             self.mmanifest.save()
         super().__exit__(exc_type, exc_val, exc_tb)
         return False
 
+    # --- NEW: trip stop gate during class-specific cleanup (close/aclose/finalizer path)
+    def _cleanup(self) -> None:
+        self._stop_event.set()
+
     @staticmethod
     def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
         if isinstance(date, datetime.date):
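
The `_stop_event` added above is a cooperative-cancellation gate: a single shared `threading.Event` that cleanup and interrupt paths set once, and that every scheduling/retry loop polls before doing more work. A minimal standalone sketch of the pattern (the `Worker` class below is illustrative, not sibi-dst API):

```python
import threading
import time

class Worker:
    """Illustrative stop-gate pattern; not part of sibi-dst."""

    def __init__(self):
        # Set once by any cleanup/interrupt path; polled by worker loops.
        self._stop_event = threading.Event()

    def _cleanup(self) -> None:
        # Event.set() is idempotent and thread-safe, so this is safe to call
        # from __exit__, finalizers, or signal handlers.
        self._stop_event.set()

    def run(self, items) -> int:
        done = 0
        for _ in items:
            if self._stop_event.is_set():  # bail out between units of work
                break
            time.sleep(0.01)  # stand-in for real per-item work
            done += 1
        return done

worker = Worker()
t = threading.Thread(target=worker.run, args=(range(1000),))
t.start()
time.sleep(0.05)
worker._cleanup()  # trips the gate; run() exits at its next poll
t.join()
```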
@@ -89,12 +97,7 @@ class DataWrapper(ManagedResource):
     ):
         """
         Execute the update plan with concurrency, retries and exponential backoff.
-
-        Args:
-            max_retries: attempts per date.
-            backoff_base: base for exponential backoff (delay = base**attempt).
-            backoff_jitter: multiplicative jitter factor in [0, backoff_jitter].
-            backoff_max: maximum backoff seconds per attempt (before jitter).
+        Stops scheduling immediately if closed or interrupted (Ctrl-C).
         """
         overall_start = time.perf_counter()
         tasks = list(self.update_planner.get_tasks_by_priority())
@@ -105,15 +108,22 @@ class DataWrapper(ManagedResource):
         if self.update_planner.show_progress:
             self.update_planner.show_update_plan()
 
-        for priority, dates in tasks:
-            self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
-
-        total_time = time.perf_counter() - overall_start
-        if self.processed_dates:
-            count = len(self.processed_dates)
-            self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
-            if self.update_planner.show_progress:
-                self.show_benchmark_summary()
+        try:
+            for priority, dates in tasks:
+                if self._stop_event.is_set():
+                    break
+                self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
+        except KeyboardInterrupt:
+            self.logger.warning("KeyboardInterrupt received; stopping scheduling and shutting down.")
+            self._stop_event.set()
+            raise
+        finally:
+            total_time = time.perf_counter() - overall_start
+            if self.processed_dates:
+                count = len(self.processed_dates)
+                self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
+                if self.update_planner.show_progress:
+                    self.show_benchmark_summary()
 
     def _execute_task_batch(
         self,
@@ -126,15 +136,27 @@ class DataWrapper(ManagedResource):
     ):
         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
         max_thr = min(len(dates), self.max_threads)
-        self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
-
-        with ThreadPoolExecutor(max_workers=max_thr) as executor:
-            futures = {
-                executor.submit(
-                    self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
-                ): date
-                for date in dates
-            }
+        self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.", extra=self.extra_logger)
+
+        # Use explicit try/finally so we can request cancel of queued tasks on teardown
+        executor = ThreadPoolExecutor(max_workers=max_thr, thread_name_prefix="datawrapper")
+        try:
+            futures = {}
+            for date in dates:
+                if self._stop_event.is_set():
+                    break
+                try:
+                    fut = executor.submit(
+                        self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
+                    )
+                    futures[fut] = date
+                except RuntimeError as e:
+                    # tolerate race: executor shutting down
+                    if "cannot schedule new futures after shutdown" in str(e).lower():
+                        self.logger.warning("Executor is shutting down; halting new submissions for this batch.")
+                        break
+                    raise
+
             iterator = as_completed(futures)
             if self.show_progress:
                 iterator = tqdm(iterator, total=len(futures), desc=desc)
@@ -143,7 +165,10 @@ class DataWrapper(ManagedResource):
                 try:
                     future.result(timeout=self.timeout)
                 except Exception as e:
-                    self.logger.error(f"Permanent failure for {futures[future]}: {e}")
+                    self.logger.error(f"Permanent failure for {futures[future]}: {e}", extra=self.extra_logger)
+        finally:
+            # Python 3.9+: cancel_futures prevents queued tasks from starting
+            executor.shutdown(wait=True, cancel_futures=True)
 
     def _process_date_with_retry(
         self,
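
Two `concurrent.futures` details carry the hunks above: `submit()` on an executor that has begun shutting down raises `RuntimeError("cannot schedule new futures after shutdown")`, which is exactly the race the inner `except RuntimeError` tolerates, and `shutdown(wait=True, cancel_futures=True)` (Python 3.9+) cancels queued-but-unstarted futures instead of draining them. A small standalone sketch of that teardown behavior:

```python
import time
from concurrent.futures import ThreadPoolExecutor

def slow(n: int) -> int:
    time.sleep(0.2)  # stand-in for real work
    return n

executor = ThreadPoolExecutor(max_workers=1)
futures = [executor.submit(slow, i) for i in range(10)]
try:
    print(futures[0].result())  # the running task completes normally
finally:
    # Python 3.9+: queued tasks are cancelled rather than executed.
    executor.shutdown(wait=True, cancel_futures=True)

print(sum(f.cancelled() for f in futures))  # most queued futures report cancelled
try:
    executor.submit(slow, 99)
except RuntimeError as e:
    print(e)  # "cannot schedule new futures after shutdown"
```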
@@ -154,11 +179,15 @@ class DataWrapper(ManagedResource):
         backoff_max: float,
     ):
         for attempt in range(max_retries):
+            # --- NEW: bail out quickly if shutdown/interrupt began
+            if self._stop_event.is_set():
+                raise RuntimeError("shutting_down")
+
             try:
                 self._process_single_date(date)
                 return
             except Exception as e:
-                if attempt < max_retries - 1:
+                if attempt < max_retries - 1 and not self._stop_event.is_set():
                     base_delay = min(backoff_base ** attempt, backoff_max)
                     delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
                     self.logger.warning(
@@ -166,7 +195,8 @@ class DataWrapper(ManagedResource):
                     )
                     time.sleep(delay)
                 else:
-                    self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
+                    self.logger.error(f"Failed processing {date} after {max_retries} attempts.", extra=self.extra_logger)
+                    raise
 
     def _process_single_date(self, date: datetime.date):
         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
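
For reference, the retry delay used above is `min(backoff_base ** attempt, backoff_max) * (1 + U)` with `U` drawn uniformly from `[0, backoff_jitter]`; with the defaults elsewhere in this diff (base 2.0, jitter 0.1, max 60.0) the pre-jitter sequence is 1s, 2s, 4s, ..., capped at 60s. The same computation in isolation:

```python
import random

def backoff_delay(attempt: int, backoff_base: float = 2.0,
                  backoff_jitter: float = 0.1, backoff_max: float = 60.0) -> float:
    # Exponential growth, capped *before* jitter is applied.
    base_delay = min(backoff_base ** attempt, backoff_max)
    # Multiplicative jitter in [1, 1 + jitter] de-synchronizes concurrent retries.
    return base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))

for attempt in range(8):
    print(attempt, f"{backoff_delay(attempt):.2f}s")  # 0 -> ~1s ... 7 -> <= 66s
```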
@@ -228,26 +258,28 @@ class DataWrapper(ManagedResource):
             raise
 
     def _log_success(self, date: datetime.date, duration: float, path: str):
-        self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}")
+        self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}", extra=self.extra_logger)
         self.processed_dates.append(date)
 
     def _log_failure(self, date: datetime.date, error: Exception):
-        self.logger.error(f"Failed processing {date}: {error}")
+        self.logger.error(f"Failed processing {date}: {error}", extra=self.extra_logger)
 
     def show_benchmark_summary(self):
         if not self.benchmarks:
-            self.logger.info("No benchmarking data to show")
+            self.logger.info("No benchmarking data to show", extra=self.extra_logger)
             return
         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
-        self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
+        self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string(), extra=self.extra_logger)
 
 # import datetime
 # import threading
 # import time
+# import random
 # from concurrent.futures import ThreadPoolExecutor, as_completed
 # from typing import Type, Any, Dict, Optional, Union, List, ClassVar
 #
+# import dask.dataframe as dd
 # import pandas as pd
 # from tqdm import tqdm
 #
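
The `extra=self.extra_logger` argument added throughout this hunk threads structured context into each record via the stdlib `logging` `extra` mechanism: the dict keys become attributes on the emitted `LogRecord`, available to formatters and filters. A minimal sketch with a plain stdlib logger (the key names mirror this diff; the formatter is an assumption):

```python
import logging

logging.basicConfig(
    # Any attribute injected via `extra` can appear in the format string.
    format="%(levelname)s %(action_module_name)s/%(dataclass)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("data_wrapper")

extra_logger = {"action_module_name": "data_wrapper", "dataclass": "MyDataClass"}
logger.info("Completed 2025-08-01 in 1.2s", extra=extra_logger)
# -> INFO data_wrapper/MyDataClass: Completed 2025-08-01 in 1.2s
```

Note that a formatter referencing these keys requires every record passing through it to supply them, which is why the diff adds `extra=` at each call site rather than to a single logger.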
@@ -261,23 +293,23 @@ class DataWrapper(ManagedResource):
 #         "missing_in_history": 2,
 #         "existing_but_stale": 3,
 #         "missing_outside_history": 4,
-#         "file_is_recent": 0
+#         "file_is_recent": 0,
 #     }
 #     DEFAULT_MAX_AGE_MINUTES: int = 1440
 #     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
 #
 #     def __init__(
-#             self,
-#             dataclass: Type,
-#             date_field: str,
-#             data_path: str,
-#             parquet_filename: str,
-#             class_params: Optional[Dict] = None,
-#             load_params: Optional[Dict] = None,
-#             show_progress: bool = False,
-#             timeout: float = 30,
-#             max_threads: int = 3,
-#             **kwargs: Any,
+#         self,
+#         dataclass: Type,
+#         date_field: str,
+#         data_path: str,
+#         parquet_filename: str,
+#         class_params: Optional[Dict] = None,
+#         load_params: Optional[Dict] = None,
+#         show_progress: bool = False,
+#         timeout: float = 30,
+#         max_threads: int = 3,
+#         **kwargs: Any,
 #     ):
 #         super().__init__(**kwargs)
 #         self.dataclass = dataclass
@@ -285,15 +317,15 @@ class DataWrapper(ManagedResource):
 #         self.data_path = self._ensure_forward_slash(data_path)
 #         self.parquet_filename = parquet_filename
 #         if self.fs is None:
-#             raise ValueError("Datawrapper requires a File system (fs) to be provided .")
+#             raise ValueError("DataWrapper requires a File system (fs) to be provided.")
 #         self.show_progress = show_progress
 #         self.timeout = timeout
 #         self.max_threads = max_threads
 #         self.class_params = class_params or {
-#             'debug': self.debug,
-#             'logger': self.logger,
-#             'fs': self.fs,
-#             'verbose': self.verbose,
+#             "debug": self.debug,
+#             "logger": self.logger,
+#             "fs": self.fs,
+#             "verbose": self.verbose,
 #         }
 #         self.load_params = load_params or {}
 #
@@ -304,7 +336,6 @@ class DataWrapper(ManagedResource):
 #         self.update_planner = kwargs.get("update_planner", None)
 #
 #     def __exit__(self, exc_type, exc_val, exc_tb):
-#         """Context manager exit"""
 #         if self.mmanifest:
 #             self.mmanifest.save()
 #         super().__exit__(exc_type, exc_val, exc_tb)
@@ -321,10 +352,24 @@ class DataWrapper(ManagedResource):
 #
 #     @staticmethod
 #     def _ensure_forward_slash(path: str) -> str:
-#         return path.rstrip('/') + '/'
+#         return path.rstrip("/") + "/"
 #
-#     def process(self, max_retries: int = 3):
-#         """Process updates with priority-based execution, retries, benchmarking and progress updates"""
+#     def process(
+#         self,
+#         max_retries: int = 3,
+#         backoff_base: float = 2.0,
+#         backoff_jitter: float = 0.1,
+#         backoff_max: float = 60.0,
+#     ):
+#         """
+#         Execute the update plan with concurrency, retries and exponential backoff.
+#
+#         Args:
+#             max_retries: attempts per date.
+#             backoff_base: base for exponential backoff (delay = base**attempt).
+#             backoff_jitter: multiplicative jitter factor in [0, backoff_jitter].
+#             backoff_max: maximum backoff seconds per attempt (before jitter).
+#         """
 #         overall_start = time.perf_counter()
 #         tasks = list(self.update_planner.get_tasks_by_priority())
 #         if not tasks:
@@ -335,7 +380,7 @@ class DataWrapper(ManagedResource):
 #             self.update_planner.show_update_plan()
 #
 #         for priority, dates in tasks:
-#             self._execute_task_batch(priority, dates, max_retries)
+#             self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
 #
 #         total_time = time.perf_counter() - overall_start
 #         if self.processed_dates:
@@ -344,14 +389,26 @@ class DataWrapper(ManagedResource):
 #             if self.update_planner.show_progress:
 #                 self.show_benchmark_summary()
 #
-#     def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
-#         """Executes a single batch of tasks (dates) using a thread pool."""
+#     def _execute_task_batch(
+#         self,
+#         priority: int,
+#         dates: List[datetime.date],
+#         max_retries: int,
+#         backoff_base: float,
+#         backoff_jitter: float,
+#         backoff_max: float,
+#     ):
 #         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
 #         max_thr = min(len(dates), self.max_threads)
 #         self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
 #
 #         with ThreadPoolExecutor(max_workers=max_thr) as executor:
-#             futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
+#             futures = {
+#                 executor.submit(
+#                     self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
+#                 ): date
+#                 for date in dates
+#             }
 #             iterator = as_completed(futures)
 #             if self.show_progress:
 #                 iterator = tqdm(iterator, total=len(futures), desc=desc)
@@ -362,22 +419,30 @@ class DataWrapper(ManagedResource):
 #                 except Exception as e:
 #                     self.logger.error(f"Permanent failure for {futures[future]}: {e}")
 #
-#     def _process_date_with_retry(self, date: datetime.date, max_retries: int):
-#         """Wrapper to apply retry logic to single date processing."""
+#     def _process_date_with_retry(
+#         self,
+#         date: datetime.date,
+#         max_retries: int,
+#         backoff_base: float,
+#         backoff_jitter: float,
+#         backoff_max: float,
+#     ):
 #         for attempt in range(max_retries):
 #             try:
 #                 self._process_single_date(date)
 #                 return
 #             except Exception as e:
 #                 if attempt < max_retries - 1:
-#                     self.logger.warning(f"Retry {attempt + 1}/{max_retries} for {date}: {e}")
-#                     time.sleep(2 ** attempt)  # Exponential backoff
+#                     base_delay = min(backoff_base ** attempt, backoff_max)
+#                     delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
+#                     self.logger.warning(
+#                         f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)"
+#                     )
+#                     time.sleep(delay)
 #                 else:
 #                     self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
-#                     # raise
 #
 #     def _process_single_date(self, date: datetime.date):
-#         """Core date processing logic with load/save timing and thread reporting"""
 #         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
 #         self.logger.debug(f"Processing date {date.isoformat()} for {path}")
 #         if path in self.update_planner.skipped and self.update_planner.ignore_missing:
@@ -385,74 +450,69 @@ class DataWrapper(ManagedResource):
 #             return
 #         full_path = f"{path}{self.parquet_filename}"
 #
-#         # thread_name = threading.current_thread().name
-#         # self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
-#
 #         overall_start = time.perf_counter()
 #         try:
 #             load_start = time.perf_counter()
 #             date_filter = {f"{self.date_field}__date": {date.isoformat()}}
 #             self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
-#             # Load data using the dataclass with the provided date filter
-#             # Create a copy to avoid mutating the shared instance dictionary
+#
 #             local_load_params = self.load_params.copy()
 #             local_load_params.update(date_filter)
+#
 #             with self.dataclass(**self.class_params) as local_class_instance:
-#                 df = local_class_instance.load(**local_load_params)
+#                 df = local_class_instance.load(**local_load_params)  # expected to be Dask
 #                 load_time = time.perf_counter() - load_start
 #
 #                 if hasattr(local_class_instance, "total_records"):
-#                     self.logger.debug(
-#                         f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
-#                     if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
+#                     total_records = int(local_class_instance.total_records)
+#                     self.logger.debug(f"Total records loaded: {total_records}")
+#
+#                     if total_records == 0:
 #                         if self.mmanifest:
-#                             self.mmanifest.record(
-#                                 full_path=path
-#                             )
+#                             self.mmanifest.record(full_path=path)
 #                         self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
-#                     elif int(local_class_instance.total_records) < 0:
-#                         self.logger.warning(
-#                             f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
-#                             "This may indicate an error in the data loading process."
-#                         )
-#                     else:
-#                         save_start = time.perf_counter()
-#                         parquet_params ={
-#                             "df_result": df,
-#                             "parquet_storage_path": path,
-#                             "fs": self.fs,
-#                             "logger": self.logger,
-#                             "debug": self.debug,
-#                         }
-#                         with ParquetSaver(**parquet_params) as ps:
-#                             ps.save_to_parquet(self.parquet_filename, overwrite=True)
-#                         save_time = time.perf_counter() - save_start
-#
-#                         total_time = time.perf_counter() - overall_start
-#                         self.benchmarks[date] = {
-#                             "load_duration": load_time,
-#                             "save_duration": save_time,
-#                             "total_duration": total_time
-#                         }
-#                         self._log_success(date, total_time, full_path)
+#                         return
+#
+#                     if total_records < 0:
+#                         self.logger.warning(f"Negative record count ({total_records}) for {full_path}.")
+#                         return
+#
+#                 save_start = time.perf_counter()
+#                 parquet_params = {
+#                     "df_result": df,
+#                     "parquet_storage_path": path,
+#                     "fs": self.fs,
+#                     "logger": self.logger,
+#                     "debug": self.debug,
+#                 }
+#                 with ParquetSaver(**parquet_params) as ps:
+#                     ps.save_to_parquet(self.parquet_filename, overwrite=True)
+#                 save_time = time.perf_counter() - save_start
+#
+#                 total_time = time.perf_counter() - overall_start
+#                 self.benchmarks[date] = {
+#                     "load_duration": load_time,
+#                     "save_duration": save_time,
+#                     "total_duration": total_time,
+#                 }
+#                 self._log_success(date, total_time, full_path)
+#
 #         except Exception as e:
 #             self._log_failure(date, e)
 #             raise
 #
 #     def _log_success(self, date: datetime.date, duration: float, path: str):
-#         msg = f"Completed {date} in {duration:.1f}s | Saved to {path}"
-#         self.logger.info(msg)
+#         self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}")
 #         self.processed_dates.append(date)
 #
 #     def _log_failure(self, date: datetime.date, error: Exception):
-#         msg = f"Failed processing {date}: {error}"
-#         self.logger.error(msg)
+#         self.logger.error(f"Failed processing {date}: {error}")
 #
 #     def show_benchmark_summary(self):
-#         """Display a summary of load/save timings per date"""
 #         if not self.benchmarks:
 #             self.logger.info("No benchmarking data to show")
 #             return
 #         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
 #         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
 #         self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
+#