PyPI - sibi-dst - Versions diffs - 2025.1.13__py3-none-any.whl → 2025.8.1__py3-none-any.whl - Mend

sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

sibi_dst/__init__.py +7 -1
sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
sibi_dst/df_helper/_df_helper.py +417 -117
sibi_dst/df_helper/_parquet_artifact.py +255 -283
sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
sibi_dst/osmnx_helper/route_path_builder.py +45 -46
sibi_dst/utils/base.py +302 -96
sibi_dst/utils/clickhouse_writer.py +472 -206
sibi_dst/utils/data_utils.py +139 -186
sibi_dst/utils/data_wrapper.py +317 -73
sibi_dst/utils/date_utils.py +1 -0
sibi_dst/utils/df_utils.py +193 -213
sibi_dst/utils/file_utils.py +3 -2
sibi_dst/utils/filepath_generator.py +314 -152
sibi_dst/utils/log_utils.py +581 -242
sibi_dst/utils/manifest_manager.py +60 -76
sibi_dst/utils/parquet_saver.py +33 -27
sibi_dst/utils/phone_formatter.py +88 -95
sibi_dst/utils/update_planner.py +180 -178
sibi_dst/utils/webdav_client.py +116 -166
{sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
{sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +29 -27
{sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0

sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py ADDED

@@ -0,0 +1,192 @@
+from __future__ import annotations
+import hashlib
+import threading
+from typing import Dict, Optional, Tuple
+from sqlalchemy import MetaData, Table
+from sqlalchemy.engine import Engine
+from sqlalchemy.orm import DeclarativeBase
+class Base(DeclarativeBase):
+    """Shared declarative base for all ORM models."""
+    pass
+# Backward-compatible default module label for generated classes
+apps_label = "datacubes.models"
+class ModelRegistry:
+    """
+    Thread-safe registry that reflects tables once per (engine, schema) and
+    returns a single mapped class per (engine, schema, table).
+    """
+    def __init__(self) -> None:
+        self._metadata_cache: Dict[Tuple[str, Optional[str]], MetaData] = {}
+        self._model_cache: Dict[Tuple[str, Optional[str], str], type] = {}
+        self._lock = threading.RLock()
+        self._md_locks: Dict[Tuple[str, Optional[str]], threading.Lock] = {}
+    # ---------- key helpers ----------
+    @staticmethod
+    def _engine_key(engine: Engine) -> str:
+        return str(engine.url)
+    @staticmethod
+    def _qualified_key(schema: Optional[str], table: str) -> str:
+        return f"{schema}.{table}" if schema else table
+    @staticmethod
+    def _split_schema_and_table(name: str) -> Tuple[Optional[str], str]:
+        if "." in name:
+            s, t = name.split(".", 1)
+            return (s or None), t
+        return None, name
+    # ---------- class name helpers ----------
+    @staticmethod
+    def _normalize_class_name(table_name: str) -> str:
+        return "".join(part.capitalize() for part in table_name.split("_"))
+    @staticmethod
+    def _short_hash(*parts: str, length: int = 8) -> str:
+        h = hashlib.sha1("|".join(parts).encode("utf-8")).hexdigest()
+        return h[:length]
+    def _is_class_name_taken(self, class_name: str, module_label: str) -> bool:
+        # Avoid SA private registries; inspect mappers instead (public)
+        for mapper in list(Base.registry.mappers):
+            try:
+                cls = mapper.class_
+                if getattr(cls, "__name__", None) == class_name and getattr(cls, "__module__", None) == module_label:
+                    return True
+            except Exception:
+                continue
+        return False
+    def _find_existing_model_for_table(self, tbl: Table) -> Optional[type]:
+        for mapper in list(Base.registry.mappers):
+            try:
+                mapped_cls = mapper.class_
+                mapped_tbl = getattr(mapped_cls, "__table__", None)
+                if mapped_tbl is tbl:
+                    return mapped_cls
+                if isinstance(mapped_tbl, Table):
+                    if (mapped_tbl.schema == tbl.schema) and (mapped_tbl.name == tbl.name):
+                        return mapped_cls
+            except Exception:
+                continue
+        return None
+    # ---------- metadata helpers ----------
+    def _get_or_create_metadata(self, ekey: str, schema: Optional[str]) -> MetaData:
+        md_key = (ekey, schema)
+        with self._lock:
+            md = self._metadata_cache.get(md_key)
+            if md is None:
+                md = MetaData(schema=schema)
+                self._metadata_cache[md_key] = md
+            return md
+    def _get_or_create_md_lock(self, md_key: Tuple[str, Optional[str]]) -> threading.Lock:
+        with self._lock:
+            lock = self._md_locks.get(md_key)
+            if lock is None:
+                lock = threading.Lock()
+                self._md_locks[md_key] = lock
+            return lock
+    # ---------- public API ----------
+    def get_model(
+        self,
+        engine: Engine,
+        table_name: str,
+        *,
+        refresh: bool = False,
+        schema: Optional[str] = None,
+        module_label: Optional[str] = None,
+        prefer_stable_names: bool = True,
+    ) -> type:
+        s2, tname = self._split_schema_and_table(table_name)
+        schema = schema if schema is not None else s2
+        ekey = self._engine_key(engine)
+        model_key = (ekey, schema, tname)
+        md_key = (ekey, schema)
+        module_label = module_label or apps_label
+        if refresh:
+            with self._lock:
+                self._model_cache.pop(model_key, None)
+                self._metadata_cache.pop(md_key, None)
+                self._md_locks.pop(md_key, None)
+        # fast path: already cached model
+        with self._lock:
+            m = self._model_cache.get(model_key)
+            if m is not None:
+                return m
+        # ensure metadata and reflection are serialized per (engine, schema)
+        md = self._get_or_create_metadata(ekey, schema)
+        md_lock = self._get_or_create_md_lock(md_key)
+        qname = self._qualified_key(schema, tname)
+        tbl = md.tables.get(qname)
+        if tbl is None:
+            with md_lock:
+                # double-checked reflection
+                tbl = md.tables.get(qname)
+                if tbl is None:
+                    md.reflect(bind=engine, only=[qname])
+                tbl = md.tables.get(qname)
+        if tbl is None:
+            raise ValueError(f"Table '{qname}' does not exist in the database.")
+        # If a mapped model for this Table already exists (anywhere), reuse it
+        reused = self._find_existing_model_for_table(tbl)
+        if reused is not None:
+            with self._lock:
+                self._model_cache[model_key] = reused
+            return reused
+        # pick class name
+        base_name = self._normalize_class_name(tname)
+        final_name = base_name
+        if self._is_class_name_taken(base_name, module_label):
+            # optionally keep stable names by suffixing with a short hash
+            if prefer_stable_names:
+                suffix = self._short_hash(ekey, schema or "", tname)
+                final_name = f"{base_name}_{suffix}"
+            else:
+                # let SQLAlchemy registry replacement occur (not recommended)
+                suffix = self._short_hash(ekey, schema or "", tname)
+                final_name = f"{base_name}_{suffix}"
+        # build the model
+        attrs = {
+            "__tablename__": tbl.name,
+            "__table__": tbl,
+            "__module__": module_label,
+        }
+        model_cls = type(final_name, (Base,), attrs)
+        with self._lock:
+            self._model_cache[model_key] = model_cls
+        return model_cls
+    def clear(self) -> None:
+        with self._lock:
+            self._metadata_cache.clear()
+            self._model_cache.clear()
+            self._md_locks.clear()
+# Process-wide registry & helper
+_global_registry = ModelRegistry()
+def get_global_registry() -> ModelRegistry:
+    return _global_registry

sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py CHANGED

@@ -1,104 +1,154 @@
-import re
 import keyword
+import re
 import threading
-from sqlalchemy import MetaData, Engine
-from sqlalchemy.orm import DeclarativeBase
+from sqlalchemy.engine import Engine
-class Base(DeclarativeBase):
-    """Shared declarative base for all ORM models."""
-    pass
+from ._model_registry import ModelRegistry, apps_label
-apps_label = "datacubes.models"
+# Global process-wide registry for backward compatibility
+_global_model_registry = ModelRegistry()
 class SqlAlchemyModelBuilder:
     """
     Builds a single SQLAlchemy ORM model from a specific database table.
-    This class is thread-safe and caches reflected table metadata to
-    improve performance across multiple instantiations.
+    Thread-safe and uses a process-wide registry for reuse.
+    Backward compatibility:
+      - Keeps CamelCase(table) as preferred class name
+      - Publishes classes under `apps_label` unless overridden
+      - Public API unchanged
     """
     _lock = threading.Lock()
-    _metadata_cache: dict[str, MetaData] = {}
     def __init__(self, engine: Engine, table_name: str):
-        """
-        Initializes the model builder for a specific table.
-        Args:
-            engine: The SQLAlchemy engine connected to the database.
-            table_name: The name of the table to generate the model for.
-        """
         self.engine = engine
         self.table_name = table_name
-        self.class_name = self._normalize_class_name(self.table_name)
-        engine_key = str(engine.url)
-        # ✅ REFACTOR: Acquire lock to make cache access and creation atomic,
-        # preventing a race condition between multiple threads.
-        with self._lock:
-            if engine_key not in self._metadata_cache:
-                self._metadata_cache[engine_key] = MetaData()
-            self.metadata = self._metadata_cache[engine_key]
     def build_model(self) -> type:
-        """
-        Builds and returns a database model class for the specified table.
-        This process is atomic and thread-safe.
-        Raises:
-            ValueError: If the specified table does not exist in the database.
-        Returns:
-            The dynamically created ORM model class.
-        """
         with self._lock:
-            # NOTE: Using a private SQLAlchemy API. This is a performance
-            # optimization but may break in future versions of the library.
-            registered_model = Base.registry._class_registry.get(self.class_name)
-            if registered_model:
-                return registered_model
-            # Check if the table's schema is in our metadata cache
-            table = self.metadata.tables.get(self.table_name)
-            # If not cached, reflect it from the database
-            if table is None:
-                self.metadata.reflect(bind=self.engine, only=[self.table_name])
-                table = self.metadata.tables.get(self.table_name)
-            if table is None:
-                raise ValueError(
-                    f"Table '{self.table_name}' does not exist in the database."
-                )
-            # Create the model class dynamically.
-            attrs = {
-                "__tablename__": table.name,
-                "__table__": table,
-                "__module__": apps_label,
-            }
-            model = type(self.class_name, (Base,), attrs)
-            return model
+            return _global_model_registry.get_model(
+                engine=self.engine,
+                table_name=self.table_name,
+                module_label=apps_label,
+                prefer_stable_names=True,
+            )
     @staticmethod
     def _normalize_class_name(table_name: str) -> str:
-        """Converts a snake_case table_name to a CamelCase class name."""
         return "".join(word.capitalize() for word in table_name.split("_"))
     @staticmethod
     def _normalize_column_name(column_name: str) -> str:
-        """
-        Sanitizes a column name to be a valid Python identifier.
-        (Kept for utility, though not used in the final model creation).
-        """
         sane_name = re.sub(r"\W", "_", column_name)
         sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
         if keyword.iskeyword(sane_name):
             return f"{sane_name}_field"
         return sane_name
+# import re
+# import keyword
+# import threading
+# from sqlalchemy import MetaData, Engine
+# from sqlalchemy.orm import DeclarativeBase
+#
+#
+# class Base(DeclarativeBase):
+#     """Shared declarative base for all ORM models."""
+#     pass
+#
+#
+# apps_label = "datacubes.models"
+#
+#
+# class SqlAlchemyModelBuilder:
+#     """
+#     Builds a single SQLAlchemy ORM model from a specific database table.
+#     This class is thread-safe and caches reflected table metadata to
+#     improve performance across multiple instantiations.
+#     """
+#     _lock = threading.Lock()
+#     _metadata_cache: dict[str, MetaData] = {}
+#
+#     def __init__(self, engine: Engine, table_name: str):
+#         """
+#         Initializes the model builder for a specific table.
+#
+#         Args:
+#             engine: The SQLAlchemy engine connected to the database.
+#             table_name: The name of the table to generate the model for.
+#         """
+#         self.engine = engine
+#         self.table_name = table_name
+#         self.class_name = self._normalize_class_name(self.table_name)
+#
+#         engine_key = str(engine.url)
+#
+#         # ✅ REFACTOR: Acquire lock to make cache access and creation atomic,
+#         # preventing a race condition between multiple threads.
+#         with self._lock:
+#             if engine_key not in self._metadata_cache:
+#                 self._metadata_cache[engine_key] = MetaData()
+#             self.metadata = self._metadata_cache[engine_key]
+#
+#     def build_model(self) -> type:
+#         """
+#         Builds and returns a database model class for the specified table.
+#         This process is atomic and thread-safe.
+#
+#         Raises:
+#             ValueError: If the specified table does not exist in the database.
+#         Returns:
+#             The dynamically created ORM model class.
+#         """
+#         with self._lock:
+#             # NOTE: Using a private SQLAlchemy API. This is a performance
+#             # optimization but may break in future versions of the library.
+#             registered_model = Base.registry._class_registry.get(self.class_name)
+#             if registered_model:
+#                 return registered_model
+#
+#             # Check if the table's schema is in our metadata cache
+#             table = self.metadata.tables.get(self.table_name)
+#
+#             # If not cached, reflect it from the database
+#             if table is None:
+#                 self.metadata.reflect(bind=self.engine, only=[self.table_name])
+#                 table = self.metadata.tables.get(self.table_name)
+#
+#             if table is None:
+#                 raise ValueError(
+#                     f"Table '{self.table_name}' does not exist in the database."
+#                 )
+#
+#             # Create the model class dynamically.
+#             attrs = {
+#                 "__tablename__": table.name,
+#                 "__table__": table,
+#                 "__module__": apps_label,
+#             }
+#             model = type(self.class_name, (Base,), attrs)
+#
+#             return model
+#
+#     @staticmethod
+#     def _normalize_class_name(table_name: str) -> str:
+#         """Converts a snake_case table_name to a CamelCase class name."""
+#         return "".join(word.capitalize() for word in table_name.split("_"))
+#
+#     @staticmethod
+#     def _normalize_column_name(column_name: str) -> str:
+#         """
+#         Sanitizes a column name to be a valid Python identifier.
+#         (Kept for utility, though not used in the final model creation).
+#         """
+#         sane_name = re.sub(r"\W", "_", column_name)
+#         sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
+#
+#         if keyword.iskeyword(sane_name):
+#             return f"{sane_name}_field"
+#         return sane_name
+#
+#

sibi_dst/osmnx_helper/route_path_builder.py CHANGED

@@ -2,48 +2,44 @@ import numpy as np
 import pandas as pd
 import networkx as nx
 import osmnx as ox
-from typing import List
+from typing import List, Optional
+from pydantic import BaseModel
+class RoutePathBuilderConfig(BaseModel):
+    """
+    A Pydantic model to validate the configuration for the RoutePathBuilder.
+    """
+    graph: nx.MultiDiGraph
+    sort_key: List[str]  # Made mandatory
+    grouping_col: Optional[str] = None
+    lat_col: str = "latitude"
+    lon_col: str = "longitude"
+    class Config:
+        arbitrary_types_allowed = True
 class RoutePathBuilder:
     """
-    Builds shortest paths for consecutive GPS points (origins & destinations) within each associate's track.
+    Builds shortest paths (Dijkstra Algorithm) for consecutive GPS points.
+    This version requires an explicit sort_key for correctness.
     """
-    def __init__(
-        self,
-        graph: nx.MultiDiGraph,
-        lat_col: str = "latitude",
-        lon_col: str = "longitude",
-        grouping_col: str = "associate_id",
-        sort_key=None  # Default sort key for DataFrame
-    ):
+    def __init__(self, config: RoutePathBuilderConfig):
         """
-        :param graph: The OSMnx MultiDiGraph.
-        :param lat_col: Column name for latitude.
-        :param lon_col: Column name for longitude.
-        :param associate_col: Column name for associate/grouping key.
+        Initializes the builder with a validated configuration object.
         """
-        if sort_key is None:
-            sort_key = ["associate_id", "date_time"]
-        self.graph = graph
-        self.lat_col = lat_col
-        self.lon_col = lon_col
-        self.grouping_col = grouping_col
-        self.sort_key = sort_key
-        if self.sort_key is None:
-            self.sort_key = [self.grouping_col, "date_time"]
+        self.config = config
+    # Static methods _get_shortest_path and _path_length_from_nodes remain unchanged...
     @staticmethod
     def _get_shortest_path(u: int, v: int, graph: nx.MultiDiGraph) -> List[int]:
-        """Return the node sequence for the shortest path from u to v, or [] if none."""
         try:
-            return nx.shortest_path(graph, u, v, weight="length")
+            return nx.shortest_path(graph, u, v, weight="length", method="dijkstra")
         except nx.NetworkXNoPath:
             return []
     @staticmethod
     def _path_length_from_nodes(node_list: List[int], graph: nx.MultiDiGraph) -> float:
-        """Sum up the 'length' attribute along consecutive node pairs."""
         if len(node_list) < 2:
             return np.nan
         total = 0.0
@@ -53,46 +49,49 @@ class RoutePathBuilder:
             total += min(lengths) if lengths else 0
         return total
     def build_routes(self, df: pd.DataFrame) -> pd.DataFrame:
         """
-        Generate destination coordinates, snap to graph nodes, and compute shortest paths.
-        :param df: Input DataFrame containing grouping_col, latitude, and longitude columns.
-        :return: DataFrame with added columns:
-            ['dest_lat', 'dest_lon', 'origin_node', 'dest_node', 'path_nodes', 'path_coords', 'distance_m']
+        Generates routes from a DataFrame of GPS points.
         """
-        # 1) Build destination coordinates by shifting per grouping column
         df = df.copy()
-        df["dest_lat"] = df.groupby(self.grouping_col)[self.lat_col].shift(-1)
-        df["dest_lon"] = df.groupby(self.grouping_col)[self.lon_col].shift(-1)
-        # Drop tail rows without next point
+        df = df.sort_values(by=self.config.sort_key).reset_index(drop=True)
+        # 2. Create destination columns by shifting within each group or across the df
+        if self.config.grouping_col:
+            df["dest_lat"] = df.groupby(by=self.config.grouping_col)[self.config.lat_col].shift(-1)
+            df["dest_lon"] = df.groupby(by=self.config.grouping_col)[self.config.lon_col].shift(-1)
+        else:
+            df["dest_lat"] = df[self.config.lat_col].shift(-1)
+            df["dest_lon"] = df[self.config.lon_col].shift(-1)
         df = df.dropna(subset=["dest_lat", "dest_lon"]).reset_index(drop=True)
-        # 2) Snap origin & destination points to graph nodes
+        # 3. Snap origin & destination coordinates to the nearest graph nodes
         df["origin_node"] = ox.nearest_nodes(
-            self.graph, X=df[self.lon_col].values, Y=df[self.lat_col].values
+            self.config.graph, X=df[self.config.lon_col].values, Y=df[self.config.lat_col].values
         )
         df["dest_node"] = ox.nearest_nodes(
-            self.graph, X=df["dest_lon"].values, Y=df["dest_lat"].values
+            self.config.graph, X=df["dest_lon"].values, Y=df["dest_lat"].values
         )
-        # 3) Compute paths, coordinates, and distances
+        # 4. Calculate paths, coordinates, and distances
         df["path_nodes"] = [
-            self._get_shortest_path(u, v, self.graph)
+            self._get_shortest_path(u, v, self.config.graph)
             for u, v in zip(df["origin_node"], df["dest_node"])
         ]
+        df = df[df["path_nodes"].str.len() > 0].reset_index(drop=True)
         df["path_coords"] = df["path_nodes"].apply(
-            lambda nl: [(self.graph.nodes[n]["y"], self.graph.nodes[n]["x"]) for n in nl]
+            lambda nl: [(self.config.graph.nodes[n]["y"], self.config.graph.nodes[n]["x"]) for n in nl]
         )
         df["distance_m"] = df["path_nodes"].apply(
-            lambda nl: self._path_length_from_nodes(nl, self.graph)
+            lambda nl: self._path_length_from_nodes(nl, self.config.graph)
         )
-        # Ensure NaN distances become 0
         df["distance_m"] = df["distance_m"].fillna(0)
-        # Remove any legs with no path
-        df = df[df["path_nodes"].str.len() > 0].reset_index(drop=True)
-        return df.sort_values(self.sort_key).reset_index(drop=True)
+        # The final sort is no longer needed, as it was done at the beginning
+        return df

sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.1__py3-none-any.whl

sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.1__py3-none-any.whl