PyPI - sibi-dst - Versions diffs - 2025.1.12__py3-none-any.whl → 2025.8.1__py3-none-any.whl - Mend

sibi-dst 2025.1.12__py3-none-any.whl → 2025.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

sibi_dst/__init__.py +7 -1
sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
sibi_dst/df_helper/_df_helper.py +417 -117
sibi_dst/df_helper/_parquet_artifact.py +255 -283
sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
sibi_dst/osmnx_helper/__init__.py +1 -0
sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +203 -0
sibi_dst/osmnx_helper/route_path_builder.py +97 -0
sibi_dst/osmnx_helper/utils.py +2 -0
sibi_dst/utils/base.py +302 -96
sibi_dst/utils/clickhouse_writer.py +472 -206
sibi_dst/utils/data_utils.py +139 -186
sibi_dst/utils/data_wrapper.py +317 -73
sibi_dst/utils/date_utils.py +1 -0
sibi_dst/utils/df_utils.py +193 -213
sibi_dst/utils/file_utils.py +3 -2
sibi_dst/utils/filepath_generator.py +314 -152
sibi_dst/utils/log_utils.py +581 -242
sibi_dst/utils/manifest_manager.py +60 -76
sibi_dst/utils/parquet_saver.py +33 -27
sibi_dst/utils/phone_formatter.py +88 -95
sibi_dst/utils/update_planner.py +180 -178
sibi_dst/utils/webdav_client.py +116 -166
{sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
{sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +32 -28
{sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0

sibi_dst/utils/manifest_manager.py CHANGED Viewed

@@ -3,50 +3,45 @@ import fsspec
 import threading
 import uuid
 from typing import List, Optional, Set, Dict, Any
-import json, base64, hashlib
 from sibi_dst.utils import Logger
 class MissingManifestManager:
     """
-    A thread-safe manager for a Parquet file manifest.
-    This class handles creating, reading, and appending to a Parquet manifest file
-    that tracks a list of paths. It is designed to be resilient, using atomic
-    file operations to prevent data corruption during writes, and can clean up
-    orphaned temporary files from previous runs.
-    Attributes:
-        fs (fsspec.AbstractFileSystem): The filesystem object to interact with.
-        manifest_path (str): The full path to the manifest file.
-        clear_existing (bool): If True, any existing manifest will be overwritten
-            on the first save operation of this instance's lifecycle.
-        logger (Logger): A logger instance for logging messages.
+    Thread-safe manager for a Parquet file manifest of missing partitions.
+    - Atomic writes via temp → copy → remove
+    - Cleans up orphan temp files (best-effort)
+    - Stores a simple table with a single column: 'path'
     """
     def __init__(
-            self,
-            fs: fsspec.AbstractFileSystem,
-            manifest_path: str,
-            clear_existing: bool = False,
-            **kwargs: Any,
+        self,
+        fs: fsspec.AbstractFileSystem,
+        manifest_path: str,
+        clear_existing: bool = False,
+        **kwargs: Any,
     ):
         self.fs: fsspec.AbstractFileSystem = fs
         self.manifest_path: str = manifest_path.rstrip("/")
         self.clear_existing: bool = clear_existing
+        self.clear_existing: bool = clear_existing
+        self.ignore_missing: bool = kwargs.get("ignore_missing", False)
+        if self.clear_existing:
+            self.ignore_missing = False
         self.debug: bool = kwargs.get("debug", False)
         self.logger: Logger = kwargs.get(
             "logger",
-            Logger.default_logger(logger_name="missing_manifest_manager")
+            Logger.default_logger(logger_name="missing_manifest_manager"),
         )
         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
         self._new_records: List[Dict[str, str]] = []
         self._loaded_paths: Optional[Set[str]] = None
-        self._lock = threading.Lock()  # A standard Lock is sufficient
+        self._lock = threading.Lock()
-        # Clean up any orphaned temp files from previous failed runs
+        # Clean up any orphaned temp files from previous failed runs (best-effort)
         self._cleanup_orphaned_files()
     def _safe_exists(self, path: str) -> bool:
@@ -59,13 +54,8 @@ class MissingManifestManager:
     def load_existing(self) -> Set[str]:
         """
-        Loads the set of paths from the existing manifest file.
-        The result is cached in memory. If the manifest does not exist or fails
-        to load, an empty set is returned. This operation is thread-safe.
-        Returns:
-            A set of strings, where each string is a path from the manifest.
+        Loads the set of paths from the existing manifest file into memory.
+        Returns an empty set if not found or unreadable.
         """
         with self._lock:
             if self._loaded_paths is not None:
@@ -77,7 +67,6 @@ class MissingManifestManager:
             try:
                 df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
-                # Robustly extract non-empty, non-null paths
                 paths = (
                     df.get("path", pd.Series(dtype=str))
                     .dropna().astype(str)
@@ -96,9 +85,6 @@ class MissingManifestManager:
     def record(self, full_path: str) -> None:
         """
         Records a new path to be added to the manifest upon the next save.
-        Args:
-            full_path: The path to record.
         """
         if not full_path or not isinstance(full_path, str):
             return
@@ -107,12 +93,7 @@ class MissingManifestManager:
     def save(self) -> None:
         """
-        Saves all new records to the manifest file.
-        This method merges new records with existing ones (unless `clear_existing`
-        is True), removes duplicates, and writes the result back to the manifest.
-        The write operation is performed atomically by writing to a temporary file
-        first, then renaming or copying it to the final destination.
+        Saves all new records to the manifest file atomically.
         """
         with self._lock:
             if not self._new_records and not self.clear_existing:
@@ -143,65 +124,68 @@ class MissingManifestManager:
             # Ensure parent directory exists
             parent = self.manifest_path.rsplit("/", 1)[0]
-            self.fs.makedirs(parent, exist_ok=True)
+            try:
+                self.fs.makedirs(parent, exist_ok=True)
+            except TypeError:
+                try:
+                    self.fs.makedirs(parent)
+                except FileExistsError:
+                    pass
             # Perform an atomic write using a temporary file
             temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
             try:
                 out_df.to_parquet(temp_path, filesystem=self.fs, index=False)
-                self.fs.copy(temp_path, self.manifest_path)
-                self.fs.rm_file(temp_path)
-                self.logger.info(f"Copied manifest to {self.manifest_path} (temp: {temp_path})")
+                # some fs lack atomic rename; copy then remove
+                if hasattr(self.fs, "rename"):
+                    try:
+                        self.fs.rename(temp_path, self.manifest_path)
+                    except Exception:
+                        self.fs.copy(temp_path, self.manifest_path)
+                        self.fs.rm_file(temp_path)
+                else:
+                    self.fs.copy(temp_path, self.manifest_path)
+                    self.fs.rm_file(temp_path)
+                self.logger.info(f"Wrote manifest to {self.manifest_path}")
             except Exception as e:
                 self.logger.error(f"Failed to write or move manifest: {e}")
-                # Re-raise so the caller knows the save operation failed
-                #raise
+                # not re-raising to avoid breaking the ETL run
             finally:
-                # CRITICAL: Always clean up the temporary file
-                if self._safe_exists(temp_path):
-                    try:
-                        self._cleanup_orphaned_files()
-                    except Exception as e:
-                        self.logger.error(f"Failed to remove temporary file '{temp_path}': {e}")
+                # Always try to clean temp leftovers
+                try:
+                    if self._safe_exists(temp_path):
+                        if hasattr(self.fs, "rm_file"):
+                            self.fs.rm_file(temp_path)
+                        else:
+                            self.fs.rm(temp_path, recursive=False)
+                except Exception:
+                    pass
             # Reset internal state
             self._new_records.clear()
-            self._loaded_paths = set(out_df["path"].tolist())
-            # After the first successful save, disable clear_existing behavior
+            try:
+                self._loaded_paths = set(out_df["path"].tolist())
+            except Exception:
+                self._loaded_paths = None
             self.clear_existing = False
     def _cleanup_orphaned_files(self) -> None:
-        """Finds and removes any orphaned temporary manifest files from prior runs."""
-        self.logger.debug("Checking for orphaned temporary files...")
-        if not hasattr(self.fs, "s3"):
-            self.logger.info("Filesystem is not s3fs; skipping temp cleanup.")
-            return
+        """Best-effort removal of leftover temporary manifest files."""
         try:
-            # Use glob to find all files matching the temp pattern in a filesystem-agnostic way
             temp_file_pattern = f"{self.manifest_path}.tmp-*"
             orphaned_files = self.fs.glob(temp_file_pattern)
             if not orphaned_files:
-                self.logger.debug("No orphaned files found.")
                 return
-            self.logger.info(f"Found {orphaned_files} orphaned temp manifest(s). Cleaning up...")
             for f_path in orphaned_files:
                 try:
-                    self.fs.rm_file(f_path)
+                    if hasattr(self.fs, "rm_file"):
+                        self.fs.rm_file(f_path)
+                    else:
+                        self.fs.rm(f_path, recursive=False)
                     self.logger.info(f"Deleted orphaned file: {f_path}")
                 except Exception as e:
                     self.logger.warning(f"Failed to delete orphaned temp file '{f_path}': {e}")
         except Exception as e:
-            # This is a non-critical operation, so we just log the error
-            self.logger.error(f"An unexpected error occurred during temp file cleanup: {e}")
-    @staticmethod
-    def _parse_s3_path(s3_path: str):
-        if not s3_path.startswith("s3://"):
-            raise ValueError("Invalid S3 path. Must start with 's3://'.")
-        path_parts = s3_path[5:].split("/", 1)
-        bucket_name = path_parts[0]
-        prefix = path_parts[1] if len(path_parts) > 1 else ""
-        return bucket_name, prefix
+            # Non-critical
+            self.logger.debug(f"Temp cleanup skipped: {e}")

sibi_dst/utils/parquet_saver.py CHANGED Viewed

@@ -1,10 +1,7 @@
-import logging
 import warnings
-from typing import Optional
 import dask.dataframe as dd
 import pyarrow as pa
-from fsspec import AbstractFileSystem
 from . import ManagedResource
@@ -14,19 +11,20 @@ warnings.filterwarnings("ignore", message="Passing 'overwrite=True' to to_parque
 class ParquetSaver(ManagedResource):
     """
     Saves Dask DataFrames to Parquet, with a workaround for S3-compatible
-    storage that fails on batch delete operations.
+    storage providers that misbehave on batch delete operations.
+    Assumes `df_result` is a Dask DataFrame.
     """
     def __init__(
-            self,
-            df_result: dd.DataFrame,
-            parquet_storage_path: str,
-            **kwargs,
+        self,
+        df_result: dd.DataFrame,
+        parquet_storage_path: str,
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.df_result = df_result
         self.parquet_storage_path = parquet_storage_path.rstrip("/")
-        # Determine protocol for special handling (e.g., 's3')
         if not self.fs:
             raise ValueError("File system (fs) must be provided to ParquetSaver.")
@@ -36,7 +34,7 @@ class ParquetSaver(ManagedResource):
     def save_to_parquet(self, output_directory_name: str = "default_output", overwrite: bool = True):
         """
-        Saves the DataFrame to a Parquet dataset.
+        Saves the Dask DataFrame to a Parquet dataset.
         If overwrite is True, it manually clears the destination directory before
         writing to avoid issues with certain S3-compatible storage providers.
@@ -52,18 +50,18 @@ class ParquetSaver(ManagedResource):
         schema = self._define_schema()
         self.logger.info(f"Saving DataFrame to Parquet dataset at: {full_path}")
-        self.df_result = self.df_result.persist()
+        # persist then write (lets the graph be shared if the caller reuses it)
+        ddf = self.df_result.persist()
         try:
-            # We call to_parquet with overwrite=False because we have already
-            # handled the directory clearing manually.
-            self.df_result.to_parquet(
+            ddf.to_parquet(
                 path=full_path,
                 engine="pyarrow",
                 schema=schema,
-                overwrite=False,
+                overwrite=False,         # we've handled deletion already
                 filesystem=self.fs,
                 write_index=False,
-                compute=True,  # Use compute=True over persisted ddf for immediate execution.
             )
             self.logger.info(f"Successfully saved Parquet dataset to: {full_path}")
         except Exception as e:
@@ -73,8 +71,8 @@ class ParquetSaver(ManagedResource):
     def _clear_directory_safely(self, directory: str):
         """
         Clears the contents of a directory robustly.
-        - For S3, it deletes files one-by-one to bypass the 'MissingContentMD5' error.
-        - For other filesystems, it uses the standard recursive remove.
+        - For S3, deletes files one-by-one to bypass brittle multi-delete.
+        - For other filesystems, uses the standard recursive remove.
         """
         if self.protocol == "s3":
             self.logger.warning(
@@ -82,15 +80,23 @@ class ParquetSaver(ManagedResource):
                 "This may be slow for directories with many files."
             )
             # Glob all contents (files and subdirs) and delete them individually.
-            # Calling fs.rm() on a single file path should trigger a single
-            # DeleteObject call, avoiding the faulty batch operation.
-            # We sort by length descending to delete contents of subdirectories first.
             all_paths = self.fs.glob(f"{directory}/**")
-            paths_to_delete = sorted([p for p in all_paths if p != directory], key=len, reverse=True)
-            for path in paths_to_delete:
+            # delete contents (deepest first)
+            for path in sorted([p for p in all_paths if p != directory], key=len, reverse=True):
                 self.logger.debug(f"Deleting: {path}")
-                self.fs.rm_file(path)
+                try:
+                    # prefer rm_file if available (minio, s3fs expose it)
+                    if hasattr(self.fs, "rm_file"):
+                        self.fs.rm_file(path)
+                    else:
+                        self.fs.rm(path, recursive=False)
+                except Exception as e:
+                    self.logger.warning(f"Failed to delete '{path}': {e}")
+            # remove the (now empty) directory if present
+            try:
+                self.fs.rm(directory, recursive=False)
+            except Exception:
+                pass
         else:
             # Standard, fast deletion for other filesystems (local, etc.)
             self.fs.rm(directory, recursive=True)
@@ -98,6 +104,7 @@ class ParquetSaver(ManagedResource):
     def _define_schema(self) -> pa.Schema:
         """
         Defines a PyArrow schema dynamically based on DataFrame's column types.
+        Works for Dask by using known dtypes on the collection.
         """
         pandas_dtype_to_pa = {
             "object": pa.string(), "string": pa.string(),
@@ -113,5 +120,4 @@ class ParquetSaver(ManagedResource):
             pa.field(c, pandas_dtype_to_pa.get(str(d), pa.string()))
             for c, d in self.df_result.dtypes.items()
         ]
-        return pa.schema(fields)
+        return pa.schema(fields)

sibi_dst/utils/phone_formatter.py CHANGED Viewed

@@ -1,127 +1,120 @@
 import re
 from enum import Enum
-from typing import Optional, Union, Callable
+from typing import Optional, Union, Callable, Tuple, Iterable
-class CountryCode(Enum):
-    """Enum for supported country codes, including phone number length and formatting rules."""
-    USA = ("1", 10, lambda number: f"({number[:3]}) {number[3:6]}-{number[6:]}")
-    UK = ("44", 10, lambda number: f"{number[:2]} {number[2:6]} {number[6:]}")
-    FRANCE = ("33", 9, lambda number: f"{number[:1]} {number[1:3]} {number[3:5]} {number[5:]}")
-    SPAIN = ("34", 9, lambda number: f"{number[:2]} {number[2:5]} {number[5:]}")
-    DEFAULT = ("506", 8, lambda number: f"{number[:4]}-{number[4:]}")
+def _only_digits(s: str) -> str:
+    return re.sub(r"\D", "", s)
-    def __init__(self, code: str, length: int, formatter: Callable[[str], str]):
-        """
-        Initialize a CountryCode enum member.
-        :param code: The country code.
-        :type code: str
-        :param length: The expected length of the phone number (excluding the country code).
-        :type length: int
-        :param formatter: A function to format the phone number.
-        :type formatter: Callable[[str], str]
-        """
-        self.code = code
-        self.length = length
-        self.formatter = formatter
-    @property
-    def value(self) -> str:
-        """
-        Get the country code value.
+def _normalize_raw_input(phone: Union[str, int, float]) -> str:
+    """
+    Normalize raw input to just digits, preserving leading zeros for strings.
+    Reject floats because they lose leading zeros and can be formatted (e.g., 1e10).
+    """
+    if isinstance(phone, float):
+        # Floats are unsafe for phone numbers; caller should pass string or int
+        raise ValueError("Phone numbers as float are ambiguous; pass a string or int.")
+    if isinstance(phone, int):
+        # int loses leading zeros by definition, but this matches your original behavior
+        return str(phone)
+    if not isinstance(phone, str):
+        raise TypeError("phone_number must be str|int")
+    phone = phone.strip()
+    # Allow leading '+' or '00' international format; we'll strip them before digit normalization
+    if phone.startswith("+"):
+        phone = phone[1:]
+    elif phone.startswith("00"):
+        phone = phone[2:]
+    return _only_digits(phone)
-        :return: The country code.
-        :rtype: str
-        """
-        return self.code
-    def validate_length(self, number: str) -> bool:
-        """
-        Validate the length of the phone number for this country.
+class CountryCode(Enum):
+    """
+    Supported countries with:
+      - dial_code: country calling code
+      - nsn_length: expected National Significant Number length (no country code)
+      - formatter: formats the national number
+      - trunk_prefix: '0' for countries that commonly include a trunk code domestically (strip if present)
+    """
-        :param number: The phone number part to validate.
-        :type number: str
-        :return: True if the number length is valid, False otherwise.
-        :rtype: bool
-        """
-        return len(number) == self.length
+    USA   = ("1",  10, lambda n: f"({n[:3]}) {n[3:6]}-{n[6:]}",  "")
+    UK    = ("44", 10, lambda n: f"{n[:2]} {n[2:6]} {n[6:]}",   "0")
+    FRANCE= ("33", 9,  lambda n: f"{n[:1]} {n[1:3]} {n[3:5]} {n[5:]}", "0")
+    SPAIN = ("34", 9,  lambda n: f"{n[:2]} {n[2:5]} {n[5:]}",   "")
+    # Default to Costa Rica in your original code
+    DEFAULT = ("506", 8, lambda n: f"{n[:4]}-{n[4:]}", "")
-    def format_number(self, number: str) -> str:
-        """
-        Format the phone number according to this country's rules.
+    def __init__(self, dial_code: str, nsn_length: int, formatter: Callable[[str], str], trunk_prefix: str):
+        self.dial_code = dial_code
+        self.nsn_length = nsn_length
+        self.formatter = formatter
+        self.trunk_prefix = trunk_prefix
+    def validate_length(self, nsn: str) -> bool:
+        return len(nsn) == self.nsn_length
+    def strip_trunk(self, nsn: str) -> str:
+        if self.trunk_prefix and nsn.startswith(self.trunk_prefix) and len(nsn) > self.nsn_length:
+            # If someone passed trunk + nsn (e.g., '0' + 10 digits for UK),
+            # remove only a single leading trunk.
+            return nsn[1:]
+        return nsn
+    def format_number(self, nsn: str) -> str:
+        return self.formatter(nsn)
-        :param number: The phone number part to format.
-        :type number: str
-        :return: The formatted number.
-        :rtype: str
-        """
-        return self.formatter(number)
 class PhoneNumberFormatter:
     """
-    A utility class for validating and formatting phone numbers based on country-specific rules.
-    The class supports phone numbers for the UK, USA, France, and Spain. It detects the country code
-    from the input or uses a default country code if missing. Phone numbers are formatted according
-    to country-specific rules.
+    Validate and format a phone number into E.164-like string with country-specific formatting of the NSN.
+    Keeps backward compatibility with your previous API.
     """
     def __init__(self, default_country_code: CountryCode = CountryCode.DEFAULT):
-        """
-        Initialize the PhoneNumberFormatter with a default country code.
-        :param default_country_code: The default country code to use if missing.
-        :type default_country_code: CountryCode
-        """
         self.default_country_code = default_country_code
     def format_phone_number(self, phone_number: Union[str, int, float]) -> Optional[str]:
         """
-        Validate and format a phone number according to country-specific rules.
-        If the input is numeric (e.g., an integer or float), it will be converted to a string.
-        If the country code is missing, the default country code will be used. The phone number
-        will be formatted according to the detected country's rules.
-        :param phone_number: The phone number to validate and format. Can be a string, integer, or float.
-        :type phone_number: Union[str, int, float]
-        :return: The formatted phone number, or None if the input is invalid.
-        :rtype: Optional[str]
+        Returns: "+<country_code> <pretty national format>" or None if invalid.
         """
-        # Convert numeric input to string
-        if isinstance(phone_number, (int, float)):
-            phone_number = str(int(phone_number))  # Convert to integer first to remove decimal points
-        # Remove all non-digit characters
-        digits = re.sub(r"\D", "", phone_number)
+        try:
+            digits = _normalize_raw_input(phone_number)
+        except (TypeError, ValueError):
+            return None
-        # Validate the length of the phone number
-        if not digits or len(digits) < 7:  # Minimum length for a valid phone number
+        if not digits or len(digits) < 7:  # minimal sanity check
             return None
-        # Detect the country code
-        country_code, number = self._detect_country_code(digits)
+        country, nsn = self._detect_country_code(digits)
-        # Validate the number length for the detected country
-        if not country_code.validate_length(number):
-            return None
+        # Strip a single trunk prefix if present (e.g., UK/FR leading '0' before the NSN)
+        nsn = country.strip_trunk(nsn)
-        # Format the phone number based on the country code
-        formatted_number = country_code.format_number(number)
+        if not country.validate_length(nsn):
+            return None
-        return f"+{country_code.value} {formatted_number}"
+        pretty = country.format_number(nsn)
+        return f"+{country.dial_code} {pretty}"
-    def _detect_country_code(self, digits: str) -> tuple[CountryCode, str]:
+    def _detect_country_code(self, digits: str) -> Tuple[CountryCode, str]:
         """
-        Detect the country code from the input digits.
-        :param digits: The phone number digits (without non-digit characters).
-        :type digits: str
-        :return: A tuple containing the detected country code and the remaining number.
-        :rtype: tuple[CountryCode, str]
+        Detect the country by trying the longest dial codes first to avoid prefix collisions.
+        Falls back to default if none matches.
         """
-        for country_code in CountryCode:
-            if digits.startswith(country_code.value):
-                return country_code, digits[len(country_code.value):]
+        # Iterate members excluding DEFAULT for detection, sorted by dial_code length desc
+        candidates: Iterable[CountryCode] = (
+            c for c in sorted(
+                (m for m in CountryCode if m is not CountryCode.DEFAULT),
+                key=lambda m: len(m.dial_code),
+                reverse=True,
+            )
+        )
+        for country in candidates:
+            if digits.startswith(country.dial_code):
+                return country, digits[len(country.dial_code):]
+        # No match → assume default country; entire string is NSN
         return self.default_country_code, digits

sibi-dst 2025.1.12__py3-none-any.whl → 2025.8.1__py3-none-any.whl

sibi-dst 2025.1.12__py3-none-any.whl → 2025.8.1__py3-none-any.whl