sibi-dst 2025.8.7-py3-none-any.whl → 2025.8.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +105 -89
- sibi_dst/df_helper/_parquet_artifact.py +11 -10
- sibi_dst/df_helper/_parquet_reader.py +4 -0
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +504 -214
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +11 -10
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +9 -8
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +4 -76
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -104
- sibi_dst/utils/boilerplate/__init__.py +6 -0
- sibi_dst/utils/boilerplate/base_data_artifact.py +110 -0
- sibi_dst/utils/boilerplate/base_data_cube.py +79 -0
- sibi_dst/utils/data_wrapper.py +22 -263
- sibi_dst/utils/iceberg_saver.py +126 -0
- sibi_dst/utils/log_utils.py +108 -529
- sibi_dst/utils/parquet_saver.py +110 -9
- sibi_dst/utils/progress/__init__.py +5 -0
- sibi_dst/utils/progress/jobs.py +82 -0
- sibi_dst/utils/progress/sse_runner.py +82 -0
- sibi_dst/utils/storage_hive.py +38 -1
- sibi_dst/utils/update_planner.py +617 -116
- {sibi_dst-2025.8.7.dist-info → sibi_dst-2025.8.9.dist-info}/METADATA +3 -2
- {sibi_dst-2025.8.7.dist-info → sibi_dst-2025.8.9.dist-info}/RECORD +23 -16
- {sibi_dst-2025.8.7.dist-info → sibi_dst-2025.8.9.dist-info}/WHEEL +0 -0
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -23,6 +23,8 @@ class DataWrapper(ManagedResource):
     DEFAULT_MAX_AGE_MINUTES: int = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30

+    logger_extra = {"sibi_dst_component": __name__}
+
     def __init__(
         self,
         dataclass: Type,
@@ -62,7 +64,7 @@ class DataWrapper(ManagedResource):

         # --- NEW: stop gate tripped during cleanup/interrupt to block further scheduling/retries
         self._stop_event = threading.Event()
-        self.
+        self.logger_extra.update({"action_module_name": "data_wrapper", "dataclass": self.dataclass.__name__})

     # ensure manifest is saved on context exit
     def __exit__(self, exc_type, exc_val, exc_tb):
@@ -114,14 +116,14 @@ class DataWrapper(ManagedResource):
                     break
                 self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
         except KeyboardInterrupt:
-            self.logger.warning("KeyboardInterrupt received — stopping scheduling and shutting down.")
+            self.logger.warning("KeyboardInterrupt received — stopping scheduling and shutting down.", extra=self.logger_extra)
             self._stop_event.set()
             raise
         finally:
             total_time = time.perf_counter() - overall_start
             if self.processed_dates:
                 count = len(self.processed_dates)
-                self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
+                self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)", extra=self.logger_extra)
                 if self.update_planner.show_progress:
                     self.show_benchmark_summary()

@@ -136,7 +138,7 @@ class DataWrapper(ManagedResource):
     ):
         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
         max_thr = min(len(dates), self.max_threads)
-        self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.", extra=self.
+        self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.", extra=self.logger_extra)

         # Use explicit try/finally so we can request cancel of queued tasks on teardown
         executor = ThreadPoolExecutor(max_workers=max_thr, thread_name_prefix="datawrapper")
@@ -153,7 +155,7 @@ class DataWrapper(ManagedResource):
             except RuntimeError as e:
                 # tolerate race: executor shutting down
                 if "cannot schedule new futures after shutdown" in str(e).lower():
-                    self.logger.warning("Executor is shutting down; halting new submissions for this batch.")
+                    self.logger.warning("Executor is shutting down; halting new submissions for this batch.", extra=self.logger_extra)
                     break
                 raise
@@ -165,7 +167,7 @@ class DataWrapper(ManagedResource):
                 try:
                     future.result(timeout=self.timeout)
                 except Exception as e:
-                    self.logger.error(f"Permanent failure for {futures[future]}: {e}", extra=self.
+                    self.logger.error(f"Permanent failure for {futures[future]}: {e}", extra=self.logger_extra)
         finally:
             # Python 3.9+: cancel_futures prevents queued tasks from starting
             executor.shutdown(wait=True, cancel_futures=True)
@@ -191,18 +193,19 @@ class DataWrapper(ManagedResource):
                     base_delay = min(backoff_base ** attempt, backoff_max)
                     delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
                     self.logger.warning(
-                        f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)"
+                        f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)",
+                        extra=self.logger_extra
                     )
                     time.sleep(delay)
                 else:
-                    self.logger.error(f"Failed processing {date} after {max_retries} attempts.", extra=self.
+                    self.logger.error(f"Failed processing {date} after {max_retries} attempts.", extra=self.logger_extra)
                     raise

     def _process_single_date(self, date: datetime.date):
         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
-        self.logger.debug(f"Processing date {date.isoformat()} for {path}")
+        self.logger.debug(f"Processing date {date.isoformat()} for {path}", extra=self.logger_extra)
         if path in self.update_planner.skipped and self.update_planner.ignore_missing:
-            self.logger.debug(f"Skipping {date} as it exists in the skipped list")
+            self.logger.debug(f"Skipping {date} as it exists in the skipped list", extra=self.logger_extra)
             return
         full_path = f"{path}{self.parquet_filename}"

@@ -210,7 +213,7 @@ class DataWrapper(ManagedResource):
         try:
             load_start = time.perf_counter()
             date_filter = {f"{self.date_field}__date": {date.isoformat()}}
-            self.logger.debug(f"
+            self.logger.debug(f"{self.dataclass.__name__} is loading data for {date} with filter: {date_filter}", extra=self.logger_extra)

             local_load_params = self.load_params.copy()
             local_load_params.update(date_filter)
@@ -221,16 +224,16 @@ class DataWrapper(ManagedResource):

                 if hasattr(local_class_instance, "total_records"):
                     total_records = int(local_class_instance.total_records)
-                    self.logger.debug(f"Total records loaded: {total_records}")
+                    self.logger.debug(f"Total records loaded: {total_records}", extra=self.logger_extra)

                     if total_records == 0:
                         if self.mmanifest:
                             self.mmanifest.record(full_path=path)
-                        self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
+                        self.logger.info(f"No data found for {full_path}. Logged to missing manifest.", extra=self.logger_extra)
                         return

                     if total_records < 0:
-                        self.logger.warning(f"Negative record count ({total_records}) for {full_path}.")
+                        self.logger.warning(f"Negative record count ({total_records}) for {full_path}.", extra=self.logger_extra)
                         return

                     save_start = time.perf_counter()
@@ -258,261 +261,17 @@ class DataWrapper(ManagedResource):
             raise

     def _log_success(self, date: datetime.date, duration: float, path: str):
-        self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}", extra=self.
+        self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}", extra=self.logger_extra)
         self.processed_dates.append(date)

     def _log_failure(self, date: datetime.date, error: Exception):
-        self.logger.error(f"Failed processing {date}: {error}", extra=self.
+        self.logger.error(f"Failed processing {date}: {error}", extra=self.logger_extra)

     def show_benchmark_summary(self):
         if not self.benchmarks:
-            self.logger.info("No benchmarking data to show", extra=self.
+            self.logger.info("No benchmarking data to show", extra=self.logger_extra)
             return
         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
-        self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string(), extra=self.
-
-# import datetime
-# import threading
-# import time
-# import random
-# from concurrent.futures import ThreadPoolExecutor, as_completed
-# from typing import Type, Any, Dict, Optional, Union, List, ClassVar
-#
-# import dask.dataframe as dd
-# import pandas as pd
-# from tqdm import tqdm
-#
-# from . import ManagedResource
-# from .parquet_saver import ParquetSaver
-#
-#
-# class DataWrapper(ManagedResource):
-#     DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
-#         "overwrite": 1,
-#         "missing_in_history": 2,
-#         "existing_but_stale": 3,
-#         "missing_outside_history": 4,
-#         "file_is_recent": 0,
-#     }
-#     DEFAULT_MAX_AGE_MINUTES: int = 1440
-#     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
-#
-#     def __init__(
-#         self,
-#         dataclass: Type,
-#         date_field: str,
-#         data_path: str,
-#         parquet_filename: str,
-#         class_params: Optional[Dict] = None,
-#         load_params: Optional[Dict] = None,
-#         show_progress: bool = False,
-#         timeout: float = 30,
-#         max_threads: int = 3,
-#         **kwargs: Any,
-#     ):
-#         super().__init__(**kwargs)
-#         self.dataclass = dataclass
-#         self.date_field = date_field
-#         self.data_path = self._ensure_forward_slash(data_path)
-#         self.parquet_filename = parquet_filename
-#         if self.fs is None:
-#             raise ValueError("DataWrapper requires a File system (fs) to be provided.")
-#         self.show_progress = show_progress
-#         self.timeout = timeout
-#         self.max_threads = max_threads
-#         self.class_params = class_params or {
-#             "debug": self.debug,
-#             "logger": self.logger,
-#             "fs": self.fs,
-#             "verbose": self.verbose,
-#         }
-#         self.load_params = load_params or {}
-#
-#         self._lock = threading.Lock()
-#         self.processed_dates: List[datetime.date] = []
-#         self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
-#         self.mmanifest = kwargs.get("mmanifest", None)
-#         self.update_planner = kwargs.get("update_planner", None)
-#
-#     def __exit__(self, exc_type, exc_val, exc_tb):
-#         if self.mmanifest:
-#             self.mmanifest.save()
-#         super().__exit__(exc_type, exc_val, exc_tb)
-#         return False
-#
-#     @staticmethod
-#     def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
-#         if isinstance(date, datetime.date):
-#             return date
-#         try:
-#             return pd.to_datetime(date).date()
-#         except ValueError as e:
-#             raise ValueError(f"Error converting {date} to datetime: {e}")
-#
-#     @staticmethod
-#     def _ensure_forward_slash(path: str) -> str:
-#         return path.rstrip("/") + "/"
-#
-#     def process(
-#         self,
-#         max_retries: int = 3,
-#         backoff_base: float = 2.0,
-#         backoff_jitter: float = 0.1,
-#         backoff_max: float = 60.0,
-#     ):
-#         """
-#         Execute the update plan with concurrency, retries and exponential backoff.
-#
-#         Args:
-#             max_retries: attempts per date.
-#             backoff_base: base for exponential backoff (delay = base**attempt).
-#             backoff_jitter: multiplicative jitter factor in [0, backoff_jitter].
-#             backoff_max: maximum backoff seconds per attempt (before jitter).
-#         """
-#         overall_start = time.perf_counter()
-#         tasks = list(self.update_planner.get_tasks_by_priority())
-#         if not tasks:
-#             self.logger.info("No updates required based on the current plan.")
-#             return
-#
-#         if self.update_planner.show_progress:
-#             self.update_planner.show_update_plan()
-#
-#         for priority, dates in tasks:
-#             self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
-#
-#         total_time = time.perf_counter() - overall_start
-#         if self.processed_dates:
-#             count = len(self.processed_dates)
-#             self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
-#             if self.update_planner.show_progress:
-#                 self.show_benchmark_summary()
-#
-#     def _execute_task_batch(
-#         self,
-#         priority: int,
-#         dates: List[datetime.date],
-#         max_retries: int,
-#         backoff_base: float,
-#         backoff_jitter: float,
-#         backoff_max: float,
-#     ):
-#         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
-#         max_thr = min(len(dates), self.max_threads)
-#         self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
-#
-#         with ThreadPoolExecutor(max_workers=max_thr) as executor:
-#             futures = {
-#                 executor.submit(
-#                     self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
-#                 ): date
-#                 for date in dates
-#             }
-#             iterator = as_completed(futures)
-#             if self.show_progress:
-#                 iterator = tqdm(iterator, total=len(futures), desc=desc)
-#
-#             for future in iterator:
-#                 try:
-#                     future.result(timeout=self.timeout)
-#                 except Exception as e:
-#                     self.logger.error(f"Permanent failure for {futures[future]}: {e}")
-#
-#     def _process_date_with_retry(
-#         self,
-#         date: datetime.date,
-#         max_retries: int,
-#         backoff_base: float,
-#         backoff_jitter: float,
-#         backoff_max: float,
-#     ):
-#         for attempt in range(max_retries):
-#             try:
-#                 self._process_single_date(date)
-#                 return
-#             except Exception as e:
-#                 if attempt < max_retries - 1:
-#                     base_delay = min(backoff_base ** attempt, backoff_max)
-#                     delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
-#                     self.logger.warning(
-#                         f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)"
-#                     )
-#                     time.sleep(delay)
-#                 else:
-#                     self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
-#
-#     def _process_single_date(self, date: datetime.date):
-#         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
-#         self.logger.debug(f"Processing date {date.isoformat()} for {path}")
-#         if path in self.update_planner.skipped and self.update_planner.ignore_missing:
-#             self.logger.debug(f"Skipping {date} as it exists in the skipped list")
-#             return
-#         full_path = f"{path}{self.parquet_filename}"
-#
-#         overall_start = time.perf_counter()
-#         try:
-#             load_start = time.perf_counter()
-#             date_filter = {f"{self.date_field}__date": {date.isoformat()}}
-#             self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
-#
-#             local_load_params = self.load_params.copy()
-#             local_load_params.update(date_filter)
-#
-#             with self.dataclass(**self.class_params) as local_class_instance:
-#                 df = local_class_instance.load(**local_load_params)  # expected to be Dask
-#                 load_time = time.perf_counter() - load_start
-#
-#                 if hasattr(local_class_instance, "total_records"):
-#                     total_records = int(local_class_instance.total_records)
-#                     self.logger.debug(f"Total records loaded: {total_records}")
-#
-#                     if total_records == 0:
-#                         if self.mmanifest:
-#                             self.mmanifest.record(full_path=path)
-#                         self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
-#                         return
-#
-#                     if total_records < 0:
-#                         self.logger.warning(f"Negative record count ({total_records}) for {full_path}.")
-#                         return
-#
-#                     save_start = time.perf_counter()
-#                     parquet_params = {
-#                         "df_result": df,
-#                         "parquet_storage_path": path,
-#                         "fs": self.fs,
-#                         "logger": self.logger,
-#                         "debug": self.debug,
-#                     }
-#                     with ParquetSaver(**parquet_params) as ps:
-#                         ps.save_to_parquet(self.parquet_filename, overwrite=True)
-#                     save_time = time.perf_counter() - save_start
-#
-#                     total_time = time.perf_counter() - overall_start
-#                     self.benchmarks[date] = {
-#                         "load_duration": load_time,
-#                         "save_duration": save_time,
-#                         "total_duration": total_time,
-#                     }
-#                     self._log_success(date, total_time, full_path)
-#
-#         except Exception as e:
-#             self._log_failure(date, e)
-#             raise
-#
-#     def _log_success(self, date: datetime.date, duration: float, path: str):
-#         self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}")
-#         self.processed_dates.append(date)
-#
-#     def _log_failure(self, date: datetime.date, error: Exception):
-#         self.logger.error(f"Failed processing {date}: {error}")
-#
-#     def show_benchmark_summary(self):
-#         if not self.benchmarks:
-#             self.logger.info("No benchmarking data to show")
-#             return
-#         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
-#         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
-#         self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
-#
+        self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string(), extra=self.logger_extra)
+
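Taken together, the data_wrapper.py changes route a per-class context dict (logger_extra, carrying sibi_dst_component plus the action_module_name and dataclass values set in __init__) through the extra= argument of every log call. Below is a minimal sketch of how a consumer of the package might surface those fields with the standard logging module; the filter, handler wiring, and placeholder values are illustrative assumptions, not part of sibi_dst.

import logging

class DefaultExtraFilter(logging.Filter):
    """Backfill the structured fields so records logged without extra= still format."""
    FIELDS = ("sibi_dst_component", "action_module_name", "dataclass")

    def filter(self, record: logging.LogRecord) -> bool:
        for field in self.FIELDS:
            if not hasattr(record, field):
                setattr(record, field, "-")
        return True

handler = logging.StreamHandler()
handler.addFilter(DefaultExtraFilter())
handler.setFormatter(logging.Formatter(
    "%(asctime)s %(levelname)s [%(sibi_dst_component)s/%(action_module_name)s/%(dataclass)s] %(message)s"
))

logger = logging.getLogger("sibi_dst")
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Mirrors the shape of the calls added in this diff:
logger.info(
    "Processed 3 dates in 12.4s",
    extra={"sibi_dst_component": "sibi_dst.utils.data_wrapper",
           "action_module_name": "data_wrapper",
           "dataclass": "MyArtifact"},
)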
sibi_dst/utils/iceberg_saver.py
ADDED
@@ -0,0 +1,126 @@
+import warnings
+import dask.dataframe as dd
+import pandas as pd
+import pyarrow as pa
+from pyiceberg.catalog import load_catalog
+from typing import Optional, Dict, Any
+from . import ManagedResource
+
+warnings.filterwarnings("ignore", message="Passing 'overwrite=True' to to_parquet is deprecated")
+
+class IcebergSaver(ManagedResource):
+    """
+    Saves a Dask DataFrame into an Apache Iceberg table using PyIceberg.
+    - Uses Arrow conversion per Dask partition.
+    - One Iceberg commit per partition (append mode), or a staged overwrite
+      (coalesce to N partitions, commit them in place of the old snapshot).
+    """
+
+    def __init__(
+        self,
+        df_result: dd.DataFrame,
+        catalog_name: str,
+        table_name: str,
+        *,
+        persist: bool = True,
+        npartitions: Optional[int] = 8,
+        arrow_schema: Optional[pa.Schema] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.df_result = df_result
+        self.catalog_name = catalog_name
+        self.table_name = table_name
+        self.persist = persist
+        self.npartitions = npartitions
+        self.arrow_schema = arrow_schema  # optional: enforce column order/types
+
+        # Iceberg writes don’t need self.fs; catalog handles IO.
+        # But we keep self.fs available in case you presign or stage files.
+
+        # Load table once
+        self.catalog = load_catalog(self.catalog_name)
+        self.table = self.catalog.load_table(self.table_name)
+
+    def save(self, *, mode: str = "append"):
+        """
+        mode:
+          - "append": append rows as new data files (one commit per partition)
+          - "overwrite": replace table data atomically (single staged commit)
+            (requires coalescing to limit number of files)
+        """
+        if mode not in ("append", "overwrite"):
+            raise ValueError("mode must be 'append' or 'overwrite'")
+
+        # Optional persist to avoid recomputation across multiple consumers
+        ddf = self.df_result.persist() if self.persist else self.df_result
+
+        if self.npartitions:
+            ddf = ddf.repartition(npartitions=self.npartitions)
+
+        if mode == "append":
+            self._append_partitions(ddf)
+        else:
+            self._overwrite_atomic(ddf)
+
+    # ---------- internals ----------
+
+    def _to_arrow_table(self, pdf: pd.DataFrame) -> pa.Table:
+        if self.arrow_schema is None:
+            return pa.Table.from_pandas(pdf, preserve_index=False)
+        # Enforce schema (column order & target types) when provided
+        at = pa.Table.from_pandas(pdf, preserve_index=False, schema=self.arrow_schema)
+        # Some Arrow versions require select to exact order if pandas added cols
+        return at.select(self.arrow_schema.names)
+
+    def _append_partitions(self, ddf: dd.DataFrame):
+        """
+        Simple path: commit each partition as a separate append.
+        Good for moderate rates; for very high throughput, consider staging or
+        increasing npartitions to get larger files.
+        """
+        def _commit(pdf: pd.DataFrame):
+            if len(pdf) == 0:
+                return pdf.iloc[0:0]
+            at = self._to_arrow_table(pdf)
+            self.table.append(at)  # one atomic Iceberg commit
+            return pdf.iloc[0:0]
+
+        ddf.map_partitions(_commit, meta=ddf._meta).compute()
+        self.logger.info(f"Appended data to Iceberg table {self.table_name} (catalog={self.catalog_name}).")
+
+    def _overwrite_atomic(self, ddf: dd.DataFrame):
+        """
+        Safer full refresh: stage N Arrow batches and replace existing snapshot.
+        Strategy:
+          1) Build a single overwrite transaction.
+          2) Add files produced from each partition to the same transaction.
+          3) Commit once (atomic snapshot replacement).
+        """
+        from pyiceberg.table.ops import RewriteFiles  # operation helper
+
+        # Materialize partitions one by one and add to a rewrite op
+        # Note: PyIceberg API offers two patterns:
+        #   - table.overwrite(at) for “overwrite by filter” in one call (simple)
+        #   - lower-level staged ops (demonstrated conceptually below)
+
+        # Easiest “full-table” overwrite via filter(True) – clears table then writes new data:
+        # If you only want to replace certain partitions, use a filter expr.
+        def _collect_partitions(pdf: pd.DataFrame):
+            if len(pdf) == 0:
+                return None
+            return self._to_arrow_table(pdf)
+
+        batches = [b for b in ddf.map_partitions(_collect_partitions, meta=object).compute() if b is not None]
+        if not batches:
+            self.logger.warning("Overwrite requested but no rows in DataFrame; leaving table unchanged.")
+            return
+
+        # Commit as a single overwrite
+        self.table.overwrite(batches[0])
+        for at in batches[1:]:
+            self.table.append(at)  # append subsequent batches into the same snapshot lineage
+
+        # If you require truly single-snapshot replacement in one call, you can
+        # also union the batches into fewer (bigger) Arrow Tables before calling overwrite.
+        self.logger.info(f"Overwrote Iceberg table {self.table_name} with {len(batches)} batch(es).")