sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
  3. sibi_dst/df_helper/_df_helper.py +417 -117
  4. sibi_dst/df_helper/_parquet_artifact.py +255 -283
  5. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  6. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  7. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  8. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  9. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  10. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  12. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  13. sibi_dst/utils/base.py +302 -96
  14. sibi_dst/utils/clickhouse_writer.py +472 -206
  15. sibi_dst/utils/data_utils.py +139 -186
  16. sibi_dst/utils/data_wrapper.py +317 -73
  17. sibi_dst/utils/date_utils.py +1 -0
  18. sibi_dst/utils/df_utils.py +193 -213
  19. sibi_dst/utils/file_utils.py +3 -2
  20. sibi_dst/utils/filepath_generator.py +314 -152
  21. sibi_dst/utils/log_utils.py +581 -242
  22. sibi_dst/utils/manifest_manager.py +60 -76
  23. sibi_dst/utils/parquet_saver.py +33 -27
  24. sibi_dst/utils/phone_formatter.py +88 -95
  25. sibi_dst/utils/update_planner.py +180 -178
  26. sibi_dst/utils/webdav_client.py +116 -166
  27. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
  28. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +29 -27
  29. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/utils/update_planner.py
@@ -1,61 +1,21 @@
-import datetime
+import datetime as dt
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar
+
 import pandas as pd
-from .date_utils import FileAgeChecker
-from pydantic import BaseModel, Field
-from rich.console import Console
-from rich.table import Table
 
 from sibi_dst.utils import ManagedResource
+from .date_utils import FileAgeChecker
 
 
-class UpdateConfig(BaseModel):
-    """
-    A unified Pydantic model for the data update process configuration.
-    Acts as a single source of truth for all settings.
-    """
-    overwrite: bool = False
-    reverse_order: bool = True
-    ignore_missing: bool = False
-    history_days_threshold: int = 30
-    max_age_minutes: int = 1440  # 24 hours
-    show_progress: bool = False
-    verbose: bool = False
-    debug: bool = False
-    start_date: datetime.date
-    end_date: datetime.date
-    custom_priority_map: Optional[Dict[str, int]] = None
-    max_threads: int = 3
-    timeout: float = 30.0
-
-    class Config:
-        arbitrary_types_allowed = True
-
 class UpdatePlanner(ManagedResource):
     """
-    A utility class to scan a date-partitioned filesystem and
-    generate an update plan indicating which dates need processing.
-
-    Attributes:
-        data_path: Base path (always ends with '/').
-        filename: Filename inside each date folder.
-        fs: fsspec filesystem instance.
-        age_checker: FileAgeChecker for computing file ages.
-        reference_date: The "today" date used for history windows (date or ISO string).
-        history_days_threshold: Number of days considered "in history".
-        max_age_minutes: File staleness threshold in minutes.
-        overwrite: If True, forces updates for all dates.
-        ignore_missing: If True, skips missing files outside history.
-        reverse_order: If True, sorts dates descending in output.
-        show_progress: If True, displays a tqdm progress bar.
-        logger: Logger for informational messages.
-
-    Note:
-        generate_plan() will overwrite self.plan and self.df_req, and returns a DataFrame of required updates.
+    Scans date-partitioned storage and builds an 'update plan' for dates that need processing.
+    Produces a Pandas DataFrame plan; it does *not* load data frames, so Dask-vs-Pandas
+    concerns do not apply here.
     """
 
-    DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]]={
+    DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
         "file_is_recent": 0,
         "missing_ignored": 0,
         "overwrite_forced": 1,
@@ -68,183 +28,221 @@ class UpdatePlanner(ManagedResource):
     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
 
     def __init__(
-        self,
-        data_path: str,
-        filename: str,
-        description: str = "Update Planner",
-        reference_date: Union[str, datetime.date] = None,
-        history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
-        max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
-        overwrite: bool = False,
-        ignore_missing: bool = False,
-        custom_priority_map: Optional[Dict[str, int]] = None,
-        reverse_order: bool = False,
-        show_progress: bool = False,
-        skipped: Optional[List[str]] = None,
-        **kwargs
+            self,
+            parquet_storage_path: str,
+            parquet_filename: str,
+            description: str = "Update Planner",
+            reference_date: Union[str, dt.date, None] = None,
+            history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
+            max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
+            overwrite: bool = False,
+            ignore_missing: bool = False,
+            custom_priority_map: Optional[Dict[str, int]] = None,
+            reverse_order: bool = False,
+            show_progress: bool = False,
+            skipped: Optional[List[str]] = None,
+            **kwargs,
     ):
-        # Initialize state
         super().__init__(**kwargs)
-        self.plan: pd.DataFrame = pd.DataFrame()
-        self.df_req: pd.DataFrame = pd.DataFrame()
+
+        # Public-ish attributes
         self.description = description
-        self.data_path = self._ensure_trailing_slash(data_path)
-        self.filename = filename
+        self.data_path = self._ensure_trailing_slash(parquet_storage_path)
+        self.filename = parquet_filename
         self.reverse_order = reverse_order
         self.show_progress = show_progress
-        self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
+        self.overwrite = overwrite
+        self.ignore_missing = ignore_missing
+        self.history_days_threshold = history_days_threshold
+        self.max_age_minutes = max_age_minutes
+        self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
+        self.skipped = set(skipped or [])
 
-        # Normalize reference date
+        # Execution knobs from kwargs (fed by upstream config)
+        self.max_threads: int = int(kwargs.get("max_threads", 3))
+        self.timeout: float = float(kwargs.get("timeout", 30.0))
+
+        # Date window
+        self.start_date = kwargs.get("parquet_start_date")
+        self.end_date = kwargs.get("parquet_end_date")
+
+        # Reference "today"
         if reference_date is None:
-            self.reference_date = datetime.date.today()
+            self.reference_date = dt.date.today()
         else:
             self.reference_date = pd.to_datetime(reference_date).date()
 
-        # Thresholds and flags
-        self.history_days_threshold = history_days_threshold
-        self.max_age_minutes = max_age_minutes
-        self.overwrite = overwrite
-        self.ignore_missing = ignore_missing
-        self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
-        self.skipped = skipped or []
-
-    @staticmethod
-    def _ensure_trailing_slash(path: str) -> str:
-        """Ensure that the provided path ends with a single '/'."""
-        return path.rstrip('/') + '/'
-
-    def _generate_plan(
-        self,
-        start: datetime.date,
-        end: datetime.date,
-        freq: str = "D"
-    ) -> None:
-        """
-        Internal: populates self.plan with all dates, and self.df_req with only those needing update.
-        """
-        dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
-        history_start = self.reference_date - datetime.timedelta(days=self.history_days_threshold)
-        rows: List[Dict] = []
+        # Helpers & state
+        self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
+        self.plan: pd.DataFrame = pd.DataFrame()
+        self.df_req: pd.DataFrame = pd.DataFrame()
 
-        # Parallel file status checks
-        with ThreadPoolExecutor() as executor:
-            futures = {executor.submit(self._get_file_status, d): d for d in dates}
-            iterator = as_completed(futures)
-            if self.show_progress:
-                from tqdm import tqdm
-                iterator = tqdm(
-                    iterator,
-                    total=len(futures),
-                    desc=f"Scanning dates for {self.description}",
-                    unit="date",
-                    leave=False
-                )
-            for future in iterator:
-                d = futures[future]
-                try:
-                    exists, age = future.result()
-                    rows.append(self._make_row(d, history_start, exists, age))
-                except Exception as exc:
-                    self.logger.error(f"Error processing date {d}: {exc}")
-                    rows.append(self._make_row(d, history_start, False, None))
+        # internal run flag to print once per run if caller reuses instance
+        self._printed_this_run: bool = False
 
-        df = pd.DataFrame(rows)
-        df = df.sort_values(
-            by=["update_priority", "date"],
-            ascending=[True, not self.reverse_order]
-        ).reset_index(drop=True)
+    # --------------------- public helpers ---------------------
+    def has_plan(self) -> bool:
+        """Safe truthiness for plan existence."""
+        return isinstance(self.plan, pd.DataFrame) and not self.plan.empty
 
-        self.plan = df
-        self.df_req = df[df.update_required].copy()
+    def required_count(self) -> int:
+        return 0 if not isinstance(self.df_req, pd.DataFrame) else len(self.df_req)
 
+    # --------------------- core API ---------------------
     def generate_plan(
-        self,
-        start: Union[str, datetime.date],
-        end: Union[str, datetime.date]
+            self,
+            start: Union[str, dt.date, None] = None,
+            end: Union[str, dt.date, None] = None,
+            freq: str = "D",
     ) -> pd.DataFrame:
         """
-        Generate and return a DataFrame of dates requiring updates between start and end,
-        sorted by update_priority and date (descending if reverse_order=True).
+        Build a plan for [start, end]. Returns rows that require update (df_req).
         """
+        start = start or self.start_date
+        end = end or self.end_date
         sd = pd.to_datetime(start).date()
         ed = pd.to_datetime(end).date()
         if sd > ed:
             raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")
 
         self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}")
-        self._generate_plan(sd, ed)
+        self._generate_plan(sd, ed, freq=freq)
         self.logger.info(
             f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
            f"{len(self.df_req)} require update"
         )
-
         return self.df_req
 
     def show_update_plan(self) -> None:
-        """
-        Display the full update plan as a styled DataFrame.
-        """
-        if self.plan.empty:
-            self.logger.warning("No update plan available. Call generate_plan() first.")
+        """Pretty-print the current plan once per run."""
+        if not self.has_plan():
+            self.logger.info("No update plan to show.")
+            return
+        if self._printed_this_run:
             return
 
-        console = Console(record=True)
+        try:
+            from rich.console import Console
+            from rich.table import Table
+        except Exception:
+            # Fallback: plain text
+            self.logger.info(f"Update Plan (plain list):\n{self.plan.to_string(index=False)}")
+            self._printed_this_run = True
+            return
 
-        table = Table(title=f"Update Plan for {self.data_path}", show_header=True, header_style="bold magenta")
+        table = Table(
+            title=f"Update Plan for {self.data_path}",
+            show_header=True,
+            header_style="bold magenta",
+        )
         for column in self.plan.columns:
             table.add_column(column, justify="left")
+
         for _, row in self.plan.iterrows():
-            table.add_row(*(str(item) for item in row))
+            table.add_row(*(str(row[col]) for col in self.plan.columns))
 
         console = Console()
         with console.capture() as capture:
             console.print(table)
-        plan_string = capture.get()
-
-        self.logger.info(f"Full Update Plan:\n{plan_string.strip()}")
+        self.logger.info(f"Full Update Plan:\n{capture.get().strip()}")
+        self._printed_this_run = True
 
-    def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[datetime.date]]]:
-        """Yields batches of dates to be processed, grouped and sorted by priority."""
-        if self.plan.empty:
+    def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[dt.date]]]:
+        """
+        Yield (priority, [dates...]) batches, smallest priority first.
+        """
+        if not self.has_plan():
             return
-
-        required_updates = self.plan[self.plan['update_required']].copy()
-        if required_updates.empty:
+        req = self.plan[self.plan["update_required"]]
+        if req.empty:
             return
-
-        for priority in sorted(required_updates["update_priority"].unique()):
-            dates_df = required_updates[required_updates["update_priority"] == priority]
-            # Sort dates within the priority group
-            sorted_dates = dates_df.sort_values(by=["date"], ascending=not self.reverse_order)
-            dates = sorted_dates["date"].tolist()
+        for priority in sorted(req["update_priority"].unique()):
+            dates_df = req[req["update_priority"] == priority]
+            # sort within group
+            dates_df = dates_df.sort_values(by="date", ascending=not self.reverse_order)
+            dates = dates_df["date"].tolist()
             if dates:
-                yield priority, dates
+                yield int(priority), dates
+
+    # --------------------- internals ---------------------
+    @staticmethod
+    def _ensure_trailing_slash(path: str) -> str:
+        return path.rstrip("/") + "/"
 
-    def _get_file_status(
-        self,
-        date: datetime.date
-    ) -> Tuple[bool, Optional[float]]:
+    def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
+        """
+        Populate self.plan with all dates and self.df_req with the subset to update.
+        """
+        dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
+        history_start = self.reference_date - dt.timedelta(days=self.history_days_threshold)
+        rows: List[Dict] = []
+
+        # bound threads
+        max_workers = max(1, int(self.max_threads))
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = {executor.submit(self._get_file_status, d): d for d in dates}
+            iterator = as_completed(futures)
+            if self.show_progress:
+                try:
+                    from tqdm import tqdm
+                    iterator = tqdm(
+                        iterator, total=len(futures),
+                        desc=f"Scanning dates for {self.description}",
+                        unit="date", leave=False
+                    )
+                except Exception:
+                    pass  # no tqdm → proceed without progress bar
+
+            for future in iterator:
+                d = futures[future]
+                try:
+                    exists, age = future.result(timeout=self.timeout)
+                    rows.append(self._make_row(d, history_start, exists, age))
+                except Exception as exc:
+                    self.logger.error(f"Error processing date {d}: {exc}")
+                    rows.append(self._make_row(d, history_start, False, None))
+
+        df = pd.DataFrame(rows)
+        # consistent types
+        if not df.empty:
+            df["date"] = pd.to_datetime(df["date"]).dt.date
+            df["update_priority"] = df["update_priority"].astype(int)
+
+        df = df.sort_values(
+            by=["update_priority", "date"],
+            ascending=[True, not self.reverse_order],
+            kind="mergesort",  # stable
+        ).reset_index(drop=True)
+
+        self.plan = df
+        self.df_req = df[df["update_required"]].copy()
+        self._printed_this_run = False
+
+    def _get_file_status(self, date: dt.date) -> Tuple[bool, Optional[float]]:
         """
         Check file existence and age for the given date.
         """
         just_path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
         if just_path in self.skipped:
-            self.logger.debug(f"Update plan is skipping date {date} as it is in the skipped list.")
+            self.logger.debug(f"Skipping {date}: path in skipped list.")
             return False, None
+
         path = f"{just_path}{self.filename}"
         try:
             exists = self.fs.exists(path)
             age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
-            return exists, age
-        except Exception:
+            return bool(exists), age
+        except Exception as e:
+            self.logger.warning(f"exists/age check failed for {path}: {e}")
             return False, None
 
     def _make_row(
-        self,
-        date: datetime.date,
-        history_start: datetime.date,
-        file_exists: bool,
-        file_age: Optional[float]
+            self,
+            date: dt.date,
+            history_start: dt.date,
+            file_exists: bool,
+            file_age: Optional[float],
     ) -> Dict:
         """
         Build a single plan row based on flags and thresholds.
@@ -252,11 +250,11 @@ class UpdatePlanner(ManagedResource):
         within_history = history_start <= date <= self.reference_date
         update_required = False
 
-        # 1. Overwrite mode forces update
+        # 1) Overwrite forces update
         if self.overwrite:
             category = "overwrite_forced"
             update_required = True
-        # 2. Within history window: missing or stale
+        # 2) Inside history window
         elif within_history:
             if not file_exists:
                 category = "missing_in_history"
@@ -266,33 +264,37 @@ class UpdatePlanner(ManagedResource):
                 update_required = True
             else:
                 category = "file_is_recent"
-        # 3. Outside history, missing file
+        # 3) Outside history, missing file (and not ignoring)
         elif not file_exists and not self.ignore_missing:
             category = "create_missing"
             update_required = True
-        # 4. Everything else (existing files outside history, or ignored missing)
+        # 4) Everything else
         else:
             category = "missing_ignored" if not file_exists else "file_is_recent"
 
         return {
             "date": date,
-            "file_exists": file_exists,
+            "file_exists": bool(file_exists),
             "file_age_minutes": file_age,
             "update_category": category,
             "update_priority": self.priority_map.get(category, 99),
-            "update_required": update_required,
+            "update_required": bool(update_required),
             "description": self.description,
         }
 
-    def exclude_dates(self, dates: Set[datetime.date]) -> None:
+    def exclude_dates(self, dates: Set[dt.date]) -> None:
         """
         Exclude specific dates from the update plan.
         """
         if not isinstance(dates, set):
-            raise ValueError("dates must be a set of datetime.date objects.")
-        if self.plan.empty:
-            self.logger.warning("No update plan available. Call generate_plan() first.")
+            raise ValueError("dates must be a set[date].")
+        if not self.has_plan():
+            self.logger.info("No update plan to modify. Call generate_plan() first.")
             return
-        self.plan = self.plan[~self.plan['date'].isin(dates)]
-        self.df_req = self.plan[self.plan["update_required"]]
-        self.logger.info(f"Excluded {len(dates)} dates from the update plan.")
+
+        before = len(self.plan)
+        self.plan = self.plan[~self.plan["date"].isin(dates)]
+        self.df_req = self.plan[self.plan["update_required"]].copy()
+        self.logger.info(
+            f"Excluded {len(dates)} dates from the update plan (from {before} to {len(self.plan)} rows)."
+        )
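
Read end to end, the new workflow is: generate the plan (the date window defaults to the `parquet_*` kwargs), optionally print it, then drain priority batches. A short sketch continuing the assumptions of the earlier example (`planner` is the instance built there; `process` is a hypothetical downstream consumer, not part of the package):

    import datetime as dt

    df_req = planner.generate_plan()   # returns only rows with update_required == True
    planner.show_update_plan()         # rich table if rich is importable, else plain text; once per run

    # Batches arrive smallest priority first, e.g. "overwrite_forced" (priority 1);
    # the priority-0 categories in DEFAULT_PRIORITY_MAP never set update_required.
    for priority, dates in planner.get_tasks_by_priority():
        process(priority, dates)       # hypothetical consumer of each date batch

    # Individual dates can be dropped from the plan between runs:
    planner.exclude_dates({dt.date(2025, 7, 15)})
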