sibi-dst 2025.9.1.tar.gz → 2025.9.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/PKG-INFO +1 -1
  2. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/pyproject.toml +1 -1
  3. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_df_helper.py +0 -354
  4. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +1 -1
  5. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/utils.py +82 -214
  6. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/clickhouse_writer.py +24 -0
  7. sibi_dst-2025.9.2/sibi_dst/utils/dask_utils.py +61 -0
  8. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/README.md +0 -0
  9. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/__init__.py +0 -0
  10. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/__init__.py +0 -0
  11. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
  12. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
  13. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  14. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  15. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/__init__.py +0 -0
  16. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  17. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  18. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  19. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  20. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  21. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  22. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
  23. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  24. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
  25. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  26. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/__init__.py +0 -0
  27. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_defaults.py +0 -0
  28. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  29. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_params_config.py +0 -0
  30. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_query_config.py +0 -0
  31. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/data_cleaner.py +0 -0
  32. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/__init__.py +0 -0
  33. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  34. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/utils.py +0 -0
  35. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/__init__.py +0 -0
  36. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  37. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  38. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  39. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
  40. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  41. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
  42. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/tests/__init__.py +0 -0
  43. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  44. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/__init__.py +0 -0
  45. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/async_utils.py +0 -0
  46. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/base.py +0 -0
  47. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/__init__.py +0 -0
  48. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_attacher.py +0 -0
  49. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
  50. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_parquet_artifact.py +0 -0
  51. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_parquet_reader.py +0 -0
  52. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/business_days.py +0 -0
  53. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/credentials.py +0 -0
  54. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/data_from_http_source.py +0 -0
  55. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/data_utils.py +0 -0
  56. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/data_wrapper.py +0 -0
  57. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/date_utils.py +0 -0
  58. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/df_utils.py +0 -0
  59. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/file_age_checker.py +0 -0
  60. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/file_utils.py +0 -0
  61. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/filepath_generator.py +0 -0
  62. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/iceberg_saver.py +0 -0
  63. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/log_utils.py +0 -0
  64. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/manifest_manager.py +0 -0
  65. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/parquet_saver.py +0 -0
  66. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/periods.py +0 -0
  67. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/phone_formatter.py +0 -0
  68. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/__init__.py +0 -0
  69. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/jobs.py +0 -0
  70. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/sse_runner.py +0 -0
  71. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_config.py +0 -0
  72. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_hive.py +0 -0
  73. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_manager.py +0 -0
  74. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/update_planner.py +0 -0
  75. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/webdav_client.py +0 -0
  76. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/__init__.py +0 -0
  77. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/__init__.py +0 -0
  78. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  79. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  80. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  81. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  82. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  83. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  84. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  85. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  86. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  87. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  88. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  89. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  90. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  91. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  92. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  93. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  94. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/utils/__init__.py +0 -0
  95. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/utils/log_utils.py +0 -0
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.9.1
+Version: 2025.9.2
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "2025.9.1"
+version = "2025.9.2"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_df_helper.py
@@ -372,357 +372,3 @@ class DfHelper(ManagedResource):
             return bool(ddf.head(1, npartitions=-1).shape[0])
         except Exception:
             return False
-
-
-
-# BEFORE SSE Handling
-# from __future__ import annotations
-#
-# import asyncio
-# from typing import Any, Dict, Optional, TypeVar, Union
-#
-# import dask.dataframe as dd
-# import pandas as pd
-# from fsspec import AbstractFileSystem
-# from pydantic import BaseModel
-#
-# from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
-# from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
-# from .backends.http import HttpConfig
-# from .backends.parquet import ParquetConfig
-# from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
-#
-# T = TypeVar("T", bound=BaseModel)
-#
-# def _is_dask_df(x) -> bool:
-# return isinstance(x, dd.DataFrame)
-#
-# def _maybe_persist(df, persist: bool):
-# return df.persist() if persist and _is_dask_df(df) else df
-#
-# def _maybe_compute(df, as_pandas: bool):
-# return df.compute() if as_pandas and _is_dask_df(df) else df
-#
-#
-# # ---- Backend Strategy Pattern ----
-# class BaseBackend:
-# def __init__(self, helper: "DfHelper"):
-# self.helper = helper
-# self.logger = helper.logger
-# self.debug = helper.debug
-# self.total_records = -1
-#
-# def load(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
-# raise NotImplementedError
-#
-# async def aload(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
-# return await asyncio.to_thread(self.load,**options)
-#
-#
-# class SqlAlchemyBackend(BaseBackend):
-# def load(self, **options):
-# try:
-# if options and hasattr(self.helper._backend_params, "parse_params"):
-# self.helper._backend_params.parse_params(options)
-#
-# with SqlAlchemyLoadFromDb(
-# plugin_sqlalchemy=self.helper.backend_db_connection,
-# plugin_query=self.helper._backend_query,
-# plugin_params=self.helper._backend_params,
-# logger=self.logger,
-# debug=self.debug,
-# ) as db_loader:
-# self.total_records, result = db_loader.build_and_load()
-# return self.total_records, result
-# except Exception as e:
-# self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
-# return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#
-# class ParquetBackend(BaseBackend):
-# def load(self, **options):
-# try:
-# df = self.helper.backend_parquet.load_files(**options)
-# if not self.helper._has_any_rows(df):
-# self.total_records = 0
-# return 0, self._empty_like(df)
-#
-# # Let DfHelper decide about persist
-# self.total_records = -1 # unknown without full count
-# return self.total_records, df
-#
-# except Exception as e:
-# self.total_records = -1 # Reset total_records on failure
-# self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
-# return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-# @staticmethod
-# def _empty_like(ddf):
-# empty_pdf = ddf._meta.iloc[0:0]
-# return dd.from_pandas(empty_pdf, npartitions=1)
-#
-#
-# class HttpBackend(BaseBackend):
-# def load(self, **options):
-# # Avoid event-loop problems in sync code paths.
-# # If someone calls .load() on an async backend, make it explicit.
-# raise RuntimeError(
-# "HttpBackend.load() is sync but this backend is async-only. "
-# "Call `await helper.aload(...)` or `await helper.load_async(prefer_native=True, ...)`."
-# )
-#
-# async def aload(self, **options):
-# if not self.helper.backend_http:
-# self.logger.warning("HTTP plugin not configured properly.", extra=self.helper.logger_extra)
-# self.total_records = -1
-# return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-# result = await self.helper.backend_http.fetch_data(**options)
-#
-# # Normalize to DataFrame if the plugin returns list/dict
-# if isinstance(result, (list, dict)):
-# pdf = pd.DataFrame(result)
-# ddf = dd.from_pandas(pdf, npartitions=max(1, min(32, len(pdf) // 50_000 or 1)))
-# self.total_records = len(pdf)
-# return self.total_records, ddf
-#
-# if isinstance(result, pd.DataFrame):
-# self.total_records = len(result)
-# ddf = dd.from_pandas(result, npartitions=max(1, min(32, len(result) // 50_000 or 1)))
-# return self.total_records, ddf
-#
-# # Fallback
-# self.total_records = -1
-# return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#
-# class DfHelper(ManagedResource):
-# _BACKEND_STRATEGIES = {
-# "sqlalchemy": SqlAlchemyBackend,
-# "parquet": ParquetBackend,
-# "http": HttpBackend,
-# }
-#
-# _BACKEND_ATTR_MAP = {
-# "sqlalchemy": "backend_db_connection",
-# "parquet": "backend_parquet",
-# "http": "backend_http",
-# }
-#
-# default_config: Dict[str, Any] = None
-# logger_extra: Dict[str, Any] = {"sibi_dst_component": __name__}
-#
-# def __init__(self, backend="sqlalchemy", **kwargs):
-# self.default_config = self.default_config or {}
-# kwargs = {**self.default_config.copy(), **kwargs}
-# super().__init__(**kwargs)
-# self.backend = backend
-#
-# # Ensure defaults flow to plugin configs
-# kwargs.setdefault("debug", self.debug)
-# kwargs.setdefault("fs", self.fs)
-# kwargs.setdefault("logger", self.logger)
-#
-# self.total_records = -1
-# self._backend_query = self._get_config(QueryConfig, kwargs)
-# self._backend_params = self._get_config(ParamsConfig, kwargs)
-#
-# self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
-# self.backend_parquet: Optional[ParquetConfig] = None
-# self.backend_http: Optional[HttpConfig] = None
-#
-# if self.backend == "sqlalchemy":
-# self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
-# elif self.backend == "parquet":
-# self.backend_parquet = self._get_config(ParquetConfig, kwargs)
-# elif self.backend == "http":
-# self.backend_http = self._get_config(HttpConfig, kwargs)
-#
-# strategy_cls = self._BACKEND_STRATEGIES.get(self.backend)
-# if not strategy_cls:
-# raise ValueError(f"Unsupported backend: {self.backend}")
-# self.backend_strategy = strategy_cls(self)
-#
-# # ---------- ManagedResource hooks ----------
-# def _cleanup(self):
-# attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
-# if not attr_name:
-# self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
-# return
-# active_config = getattr(self, attr_name, None)
-# if active_config and hasattr(active_config, "close"):
-# self.logger.debug(f"{self.__class__.__name__} is closing resources for backend '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
-# active_config.close()
-#
-# async def _acleanup(self):
-# self.logger.warning(
-# "DfHelper instance was not used in an async context manager; cleanup is being called manually.",
-# extra=self.logger_extra,
-# )
-# attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
-# if not attr_name:
-# self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
-# return
-# active_config = getattr(self, attr_name, None)
-# if active_config and hasattr(active_config, "aclose"):
-# self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
-# await active_config.aclose()
-#
-# # ---------- config helpers ----------
-# def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
-# recognized = set(model.model_fields.keys())
-# model_kwargs = {k: kwargs[k] for k in recognized if k in kwargs}
-# return model(**model_kwargs)
-#
-# # ---------- load/aload ----------
-# def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-# self.logger.debug(f"Loading data from {self.backend} backend with options: {options}", extra=self.logger_extra)
-# self.total_records, df = self.backend_strategy.load(**options)
-# df = self._process_loaded_data(df)
-# df = self._post_process_df(df)
-# df = _maybe_persist(df, persist)
-# return _maybe_compute(df, as_pandas)
-#
-# async def aload(
-# self,
-# *,
-# persist: bool = False,
-# as_pandas: bool = False,
-# timeout: Optional[float] = None,
-# **options
-# ) -> Union[pd.DataFrame, dd.DataFrame]:
-# # 1) Async load if available, else run sync load in a thread.
-# if hasattr(self.backend_strategy, "aload"):
-# load_awaitable = self.backend_strategy.aload(**options)
-# else:
-# # Run ONLY the backend load step in a thread to avoid event-loop blocking.
-# load_awaitable = asyncio.to_thread(self.backend_strategy.load, **options)
-#
-# total, df = await (asyncio.wait_for(load_awaitable, timeout) if timeout else load_awaitable)
-# self.total_records = total
-#
-# # 2) Post-processing steps are sync; offload to threads.
-# df = await asyncio.to_thread(self._process_loaded_data, df)
-# df = await asyncio.to_thread(self._post_process_df, df)
-#
-# # 3) Persist and compute can block; offload when needed.
-# if persist and _is_dask_df(df):
-# df = await asyncio.to_thread(df.persist)
-#
-# if as_pandas and _is_dask_df(df):
-# # Allow separate timeout for compute if desired; reuse same timeout here.
-# compute_awaitable = asyncio.to_thread(df.compute)
-# return await (asyncio.wait_for(compute_awaitable, timeout) if timeout else compute_awaitable)
-#
-# return df
-#
-# # ---------- dataframe post-processing ----------
-# def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
-# self.logger.debug(f"{self.__class__.__name__} is post-processing resulting dataframe with {len(df)} records.", extra=self.logger_extra)
-# df_params = self._backend_params.df_params
-# if not df_params:
-# return df
-# fieldnames = df_params.get("fieldnames")
-# column_names = df_params.get("column_names")
-# index_col = df_params.get("index_col")
-#
-# if fieldnames:
-# valid = [f for f in fieldnames if f in df.columns]
-# if len(valid) < len(fieldnames):
-# self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}", extra=self.logger_extra)
-# df = df[valid]
-# if column_names:
-# if len(df.columns) != len(column_names):
-# raise ValueError(
-# f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided."
-# )
-# df = df.rename(columns=dict(zip(df.columns, column_names)))
-# if index_col:
-# if index_col not in df.columns:
-# raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
-# df = df.set_index(index_col)
-#
-# self.logger.debug("Post-processing complete.", extra=self.logger_extra)
-# return df
-#
-# def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
-# field_map = self._backend_params.field_map or {}
-# if not isinstance(field_map, dict) or not field_map:
-# return df
-# if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
-# return df
-# self.logger.debug(f"{self.__class__.__name__} is applying rename mapping if/when necessary.", extra=self.logger_extra)
-# rename_map = {k: v for k, v in field_map.items() if k in df.columns}
-# if rename_map:
-# df = df.rename(columns=rename_map)
-# return df
-#
-# # ---------- sinks ----------
-# def save_to_parquet(self, df: dd.DataFrame, **kwargs):
-# fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
-# path: str = kwargs.pop("parquet_storage_path", self.backend_parquet.parquet_storage_path if self.backend_parquet else None)
-# parquet_filename = kwargs.pop("parquet_filename", self.backend_parquet.parquet_filename if self.backend_parquet else None)
-# if not parquet_filename:
-# raise ValueError("A 'parquet_filename' keyword argument must be provided.")
-# if not fs:
-# raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
-# if not path:
-# raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
-# if not self._has_any_rows(df):
-# self.logger.warning("Skipping save: The provided DataFrame is empty.", extra=self.logger_extra)
-# return
-#
-# with ParquetSaver(
-# df_result=df,
-# parquet_storage_path=path,
-# fs=fs,
-# debug=self.debug,
-# logger=self.logger,
-# verbose=self.verbose,
-# **kwargs,
-# ) as saver:
-# saver.save_to_parquet(parquet_filename)
-#
-# self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.", extra=self.logger_extra)
-#
-# def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
-# if not self._has_any_rows(df):
-# self.logger.warning("Skipping save to ClickHouse: The provided DataFrame is empty.", extra=self.logger_extra)
-# return
-# with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
-# writer.save_to_clickhouse(df)
-# self.logger.debug("Save to ClickHouse completed.", extra=self.logger_extra)
-#
-# # ---------- period loaders ----------
-# def load_period(self, dt_field: str, start: str, end: str, **kwargs):
-# final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
-# return self.load(**final_kwargs)
-#
-# async def aload_period(self, dt_field: str, start: str, end: str, **kwargs):
-# final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
-# return await self.aload(**final_kwargs)
-#
-# def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
-# start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
-# if start_date > end_date:
-# raise ValueError("'start' date cannot be later than 'end' date.")
-# field_map = self._backend_params.field_map or {}
-# reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
-# if len(reverse_map) != len(field_map):
-# self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.", extra=self.logger_extra)
-# mapped_field = reverse_map.get(dt_field, dt_field)
-# if start_date == end_date:
-# kwargs[f"{mapped_field}__date"] = start_date
-# else:
-# kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
-# self.logger.debug(f"Period load generated filters: {kwargs}", extra=self.logger_extra)
-# return kwargs
-#
-# @staticmethod
-# def _has_any_rows(ddf: dd.DataFrame) -> bool:
-# try:
-# return bool(ddf.head(1, npartitions=-1).shape[0])
-# except Exception:
-# return False
-#
-#
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py
@@ -30,7 +30,7 @@ class SqlAlchemyLoadFromDb(ManagedResource):
         self.engine = self.db_connection.engine
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000) if self.params_config else 1000)
+        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 10000) if self.params_config else 10000)
         self.total_records = -1

     def build_and_load(self) -> Tuple[int, dd.DataFrame]:
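The only functional change here is the default chunk size for SQLAlchemy-backed loads, which rises from 1,000 to 10,000 rows. A minimal sketch of how a caller could still pin a smaller value, assuming configuration objects built elsewhere (the variable names below are illustrative, not from the package):

    # Hedged sketch: constructor keywords follow the usage visible in this diff and in the
    # commented-out DfHelper code removed above; connection_config/query_config/params_config
    # are hypothetical, pre-built config instances.
    with SqlAlchemyLoadFromDb(
        plugin_sqlalchemy=connection_config,
        plugin_query=query_config,
        plugin_params=params_config,
        chunk_size=2_000,  # an explicit value still overrides the new 10_000 default
    ) as loader:
        total_records, ddf = loader.build_and_load()  # returns (row count, dask.dataframe.DataFrame)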
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/utils.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import math
 import os
 import pickle
@@ -9,236 +11,102 @@ import numpy as np
 import osmnx as ox
 from geopy.distance import geodesic

-
-#
-# options = {
-# 'ox_files_save_path': ox_files_save_path,
-# 'network_type': 'drive',
-# 'place': 'Costa Rica',
-# 'files_prefix': 'costa-rica-',
-# }
-# Usage example
-# handler = PBFHandler(**options)
-# handler.load()
-
+from typing import Optional
+from fsspec.core import url_to_fs

 class PBFHandler:
     """
-    Handles the creation, management, and visualization of graph data derived
-    from .pbf (Protocolbuffer Binary Format) files. This class enables the
-    loading, processing, saving, and reutilization of graph, node, and edge
-    data for geographical regions, supporting verbose mode for detailed outputs.
-
-    :ivar graph: The generated graph object representing the spatial network; can be None if not yet loaded or processed.
-    :type graph: Optional[NetworkX.Graph]
-    :ivar nodes: GeoDataFrame representing the nodes of the graph; can be None if not yet loaded or processed.
-    :type nodes: Optional[geopandas.GeoDataFrame]
-    :ivar edges: GeoDataFrame representing the edges of the graph; can be None if not yet loaded or processed.
-    :type edges: Optional[geopandas.GeoDataFrame]
-    :ivar rebuild: Indicates whether to rebuild the graph data, ignoring any existing cached files. Default is ``False``.
-    :type rebuild: bool
-    :ivar verbose: Enables verbose mode to provide detailed status messages during operations. Default is ``False``.
-    :type verbose: bool
-    :ivar place: The name of the geographical region to process with OpenStreetMap. Default is ``Costa Rica``.
-    :type place: str
-    :ivar filepath: The path to the directory where the graph, nodes, and edges pickle files are saved. Default is ``gis_data/``.
-    :type filepath: str
-    :ivar file_prefix: The prefix for the filenames of the saved graph, node, and edge pickle files. Default is ``costa-rica-``.
-    :type file_prefix: str
-    :ivar network_type: The type of network to extract from OpenStreetMap, such as "all" or other specific network types. Default is ``all``.
-    :type network_type: str
-    :ivar graph_file: Full path of the file to save or load the graph data as a pickle file.
-    :type graph_file: str
-    :ivar node_file: Full path of the file to save or load the graph's node data as a pickle file.
-    :type node_file: str
-    :ivar edge_file: Full path of the file to save or load the graph's edge data as a pickle file.
-    :type edge_file: str
+    Build/load OSMnx graph + nodes/edges; persist as pickle via fsspec.
     """
+
     def __init__(self, **kwargs):
         self.graph = None
-        self.nodes = None
-        self.edges = None
-        self.rebuild = kwargs.setdefault("rebuild", False)
-        self.verbose = kwargs.setdefault("verbose", False)
-        self.place = kwargs.setdefault('place', 'Costa Rica')
-        self.filepath = kwargs.setdefault('ox_files_save_path', "gis_data/")
-        self.file_prefix = kwargs.setdefault('file_prefix', 'costa-rica-')
-        self.network_type = kwargs.setdefault('network_type', 'all')
-        self.graph_file = f"{self.filepath}{self.file_prefix}graph.pkl"
-        self.node_file = f"{self.filepath}{self.file_prefix}nodes.pkl"
-        self.edge_file = f"{self.filepath}{self.file_prefix}edges.pkl"
-
-    def load(self):
-        """
-        Loads the required data files for processing. If the files do not exist or
-        if the `rebuild` flag is set to True, it will process and recreate the
-        necessary data from the source. Otherwise, it will load the data from
-        existing pickle files. This function ensures the target directory exists,
-        and processes files conditionally based on their presence.
-
-        :param verbose: Flag to control the verbosity of the function's output.
-        :param rebuild: Indicates whether the data should be rebuilt from the raw
-            source files.
-        :param graph_file: Path to the graph file to be loaded or rebuilt.
-        :param node_file: Path to the node file to be loaded or rebuilt.
-        :param edge_file: Path to the edge file to be loaded or rebuilt.
-        :param filepath: Path to the directory where files are processed and saved.
-
-        :return: None
-        """
+        self.nodes: Optional[gpd.GeoDataFrame] = None
+        self.edges: Optional[gpd.GeoDataFrame] = None
+
+        self.rebuild: bool = kwargs.setdefault("rebuild", False)
+        self.verbose: bool = kwargs.setdefault("verbose", False)
+        self.place: str = kwargs.setdefault("place", "Costa Rica")
+        self.network_type: str = kwargs.setdefault("network_type", "all")
+        base_url: str = kwargs.setdefault("data_path", "osmnx_data/pbf_files")
+        prefix: str = kwargs.setdefault("files_prefix", "costa-rica-").rstrip("-") + "-"
+
+        # Allow passing an fsspec instance directly
+        fs = kwargs.get("fs")
+        if fs is not None:
+            self.fs = fs
+            self.base = base_url.rstrip("/")
+        else:
+            self.fs, self.base = url_to_fs(base_url)
+
+        self.fs.mkdirs(self.base, exist_ok=True)
+
+        self.graph_file = f"{self.base.rstrip('/')}/{prefix}graph.pkl"
+        self.node_file = f"{self.base.rstrip('/')}/{prefix}nodes.pkl"
+        self.edge_file = f"{self.base.rstrip('/')}/{prefix}edges.pkl"
+
         if self.verbose:
-            print("Loading data...")
+            print(f"[PBFHandler] base={self.base}")
+            print(f" graph={self.graph_file}")
+            print(f" nodes={self.node_file}")
+            print(f" edges={self.edge_file}")

-        files_to_check = [self.graph_file, self.node_file, self.edge_file]
+    # ---------- public API ----------
+    def load(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] load()")

         if self.rebuild:
-            for file in files_to_check:
-                if os.path.exists(file):
-                    os.remove(file)
-            if not os.path.exists(self.filepath):
-                os.makedirs(self.filepath, exist_ok=True)
-            # self.process_pbf()
-            # self.save_to_pickle()
-        if not all(os.path.exists(f) for f in files_to_check):
+            self._delete_artifacts()
+
+        if not self._artifacts_exist():
             self.process_pbf()
             self.save_to_pickle()
         else:
             self.load_from_pickle()

+    def process_pbf(self) -> None:
+        if self.verbose:
+            print(f"[PBFHandler] processing: {self.place}")
+        self.graph = ox.graph_from_place(self.place, network_type=self.network_type)
+        self.nodes, self.edges = ox.graph_to_gdfs(self.graph)
+
+    def save_to_pickle(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] saving via fsspec")
+        for path, obj in {
+            self.graph_file: self.graph,
+            self.node_file: self.nodes,
+            self.edge_file: self.edges,
+        }.items():
+            if obj is not None:
+                with self.fs.open(path, "wb") as f:
+                    pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+    def load_from_pickle(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] loading via fsspec")
+        self.graph = self._load_pickle(self.graph_file)
+        self.nodes = self._load_pickle(self.node_file)
+        self.edges = self._load_pickle(self.edge_file)
+
+    # ---------- helpers ----------
+    def _artifacts_exist(self) -> bool:
+        return all(self.fs.exists(p) for p in (self.graph_file, self.node_file, self.edge_file))
+
+    def _delete_artifacts(self) -> None:
         if self.verbose:
-            print("Data loaded successfully.")
-
-    def process_pbf(self):
-        """
-        Processes the Protocolbuffer Binary Format (PBF) data specified for a given place by
-        utilizing the OSMnx library to create a graph representation and extracts nodes and
-        edges into GeoDataFrames. The function provides verbose output if enabled.
-
-        :param self: Refers to the current instance of the class containing this method.
-
-        :param self.verbose: bool
-            A flag to control verbose output. If True, detailed processing status messages are
-            logged to the console.
-
-        :param self.place: str
-            The name or description of the geographic place for which PBF data is processed. It
-            is used to construct a graph representation of the place.
-
-        :param self.network_type: str
-            The type of network graph to be created, typically one of 'all', 'walk', 'drive',
-            etc., reflecting the type of paths or streets included in the graph.
-
-        :return: None
-            This function does not return a value, but updates class attributes ``graph``,
-            ``nodes``, and ``edges``.
-
-        :raises Exception:
-            Raises a general exception when there is an error in processing the PBF data. Error
-            details are printed when verbose output is enabled.
-        """
-        try:
-            if self.verbose:
-                print(f"Processing PBF for {self.place}...")
-
-            self.graph = ox.graph_from_place(self.place, network_type=self.network_type)
-            self.nodes, self.edges = ox.graph_to_gdfs(self.graph)
-
-            if self.verbose:
-                print("PBF processed successfully.")
-        except Exception as e:
-            print(f"Error processing PBF: {e}")
-            raise
-
-    def save_to_pickle(self):
-        """
-        Saves data, including graph, nodes, and edges, to pickle files. Each data object is
-        saved to its corresponding file if available. If verbose mode is enabled, prints
-        messages indicating the saving progress and success.
-
-        :param self:
-            Represents the instance of the class that contains attributes `graph_file`,
-            `graph`, `node_file`, `nodes`, `edge_file`, `edges`, and `verbose`. These
-            attributes determine the files to save to and the data to save.
-
-        :raises Exception:
-            Raises an exception if an error occurs during the saving process.
-
-        :return:
-            None
-        """
-        try:
-            if self.verbose:
-                print("Saving data to pickle files...")
-
-            data_to_save = {
-                self.graph_file: self.graph,
-                self.node_file: self.nodes,
-                self.edge_file: self.edges
-            }
-
-            for file, data in data_to_save.items():
-                if data is not None:
-                    with open(file, 'wb') as f:
-                        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
-
-            if self.verbose:
-                print("Data saved to pickle files successfully.")
-        except Exception as e:
-            print(f"Error saving to pickle: {e}")
-            raise
-
-    def load_from_pickle(self):
-        """
-        Loads data from pickle files specified by the attributes `graph_file`, `node_file`,
-        and `edge_file` and assigns them to the corresponding attributes `graph`,
-        `nodes`, and `edges`, respectively. Displays verbose messages during the load
-        process if the `verbose` attribute is set to True.
-
-        :raises Exception: If an error occurs during reading or deserialization of the
-            pickle files.
-        """
-        try:
-            if self.verbose:
-                print("Loading data from pickle files...")
-
-            files_to_load = {
-                self.graph_file: 'graph',
-                self.node_file: 'nodes',
-                self.edge_file: 'edges'
-            }
-
-            for file, attr in files_to_load.items():
-                with open(file, 'rb') as f:
-                    setattr(self, attr, pickle.load(f))
-
-            if self.verbose:
-                print("Data loaded from pickle files successfully.")
-        except Exception as e:
-            print(f"Error loading from pickle: {e}")
-            raise
-
-    def plot_graph(self):
-        """
-        Plots the loaded graph using the OSMnx library.
-
-        This method checks if a graph is loaded and, if available, plots it. Outputs
-        verbose messages during the process if verbosity is enabled.
-
-        :raises Exception: Raises if an error occurs during the plotting process.
-        :return: None
-        """
-        try:
-            if self.graph is not None:
-                if self.verbose:
-                    print("Plotting the graph...")
-                ox.plot_graph(self.graph)
-                if self.verbose:
-                    print("Graph plotted successfully.")
-            else:
-                print("Graph is not loaded. Please load a PBF file first.")
-        except Exception as e:
-            print(f"Error plotting the graph: {e}")
-            raise
+            print("[PBFHandler] deleting artifacts (rebuild=True)")
+        for p in (self.graph_file, self.node_file, self.edge_file):
+            if self.fs.exists(p):
+                try:
+                    self.fs.rm_file(p)
+                except Exception:
+                    self.fs.rm(p)
+
+    def _load_pickle(self, path: str):
+        with self.fs.open(path, "rb") as f:
+            return pickle.load(f)


 def get_bounding_box_from_points(gps_points, margin=0.001):
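PBFHandler now resolves its storage location through fsspec (url_to_fs) instead of writing pickles to a local directory with os/open, and the plot_graph helper is gone. A rough usage sketch under the new keyword names shown above; the local path is illustrative, and any fsspec-compatible URL should behave the same way:

    # Hedged sketch; module path taken from the file listing, argument names from the diff.
    from sibi_dst.osmnx_helper.utils import PBFHandler

    handler = PBFHandler(
        place="Costa Rica",
        network_type="drive",
        data_path="osmnx_data/pbf_files",  # fsspec URL or local path
        files_prefix="costa-rica-",
        rebuild=False,   # True deletes the cached pickles and rebuilds from OSMnx
        verbose=True,
    )
    handler.load()  # builds graph/nodes/edges on first run, then reuses the cached pickles
    graph, nodes, edges = handler.graph, handler.nodes, handler.edges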
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/clickhouse_writer.py
@@ -10,6 +10,14 @@ import clickhouse_connect

 from . import ManagedResource

+def _to_bool(val: Any) -> bool:
+    if isinstance(val, bool):
+        return val
+    if isinstance(val, (int, float)):
+        return bool(val)
+    if isinstance(val, str):
+        return val.strip().lower() in ("1", "true", "yes", "on")
+    return False


 class ClickHouseWriter(ManagedResource):
     """
@@ -47,6 +55,11 @@ class ClickHouseWriter(ManagedResource):
         database: str = "sibi_data",
         user: str = "default",
         password: str = "",
+        secure: bool = False,
+        verify: bool = False,
+        ca_cert: str = "",
+        client_cert: str = "",
+        compression: str = "",
         table: str = "test_sibi_table",
         order_by: str = "id",
         engine: Optional[str] = None,  # e.g. "ENGINE MergeTree ORDER BY (`id`)"
@@ -61,6 +74,11 @@ class ClickHouseWriter(ManagedResource):
         self.database = database
         self.user = user
         self.password = password
+        self.secure = _to_bool(secure)
+        self.verify = _to_bool(verify)
+        self.ca_cert = ca_cert
+        self.client_cert = client_cert
+        self.compression = compression  # e.g. 'lz4', 'zstd',
         self.table = table
         self.order_by = order_by
         self.engine = engine  # if None → default MergeTree ORDER BY
@@ -224,6 +242,7 @@ class ClickHouseWriter(ManagedResource):
     # ------------- low-level helpers -------------

     def _get_client(self):
+        print(self.secure, " ", self.verify)
         cli = getattr(self._tlocal, "client", None)
         if cli is not None:
             return cli
@@ -233,6 +252,11 @@ class ClickHouseWriter(ManagedResource):
             database=self.database,
             username=self.user,  # clickhouse-connect uses 'username'
             password=self.password,
+            secure=self.secure,
+            verify=self.verify,
+            ca_cert=self.ca_cert or None,
+            client_cert=self.client_cert or None,
+            compression=self.compression or None,
         )
         self._tlocal.client = cli
         return cli
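ClickHouseWriter gains TLS and compression pass-through options (secure, verify, ca_cert, client_cert, compression) that are coerced with _to_bool and forwarded to clickhouse_connect.get_client. A hedged sketch of how they might be supplied; the host keyword and all connection values are placeholders, not defaults confirmed by this diff:

    # Hedged sketch: only database/user/password/table/order_by and the new TLS options are
    # visible in the diff; host is an assumed keyword, and the usage as a context manager
    # follows the removed DfHelper.save_to_clickhouse code shown earlier.
    with ClickHouseWriter(
        host="clickhouse.example.com",
        database="sibi_data",
        user="default",
        password="***",
        secure=True,          # accepts bools or strings like "true"/"1" via _to_bool
        verify=True,          # verify the server certificate
        ca_cert="/etc/ssl/certs/ca.pem",
        compression="lz4",    # or "zstd"; an empty string leaves compression unset
        table="events",
        order_by="id",
    ) as writer:
        writer.save_to_clickhouse(ddf)  # ddf: a Dask DataFrame produced elsewhere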
sibi_dst-2025.9.2/sibi_dst/utils/dask_utils.py (new file)
@@ -0,0 +1,61 @@
+import asyncio
+from typing import List, Any, Dict
+
+import dask
+import dask.dataframe as dd
+
+def _to_int_safe(x) -> int:
+    """
+    Convert scalar-like to int safely.
+    Handles numpy scalars, pandas Series/DataFrame outputs.
+    """
+    if hasattr(x, "item"):  # numpy scalar, pandas scalar
+        return int(x.item())
+    if hasattr(x, "iloc"):  # Series-like
+        return int(x.iloc[0])
+    return int(x)
+
+def dask_is_probably_empty(ddf: dd.DataFrame) -> bool:
+    return getattr(ddf, "npartitions", 0) == 0 or len(ddf._meta.columns) == 0
+
+
+def dask_is_empty_truthful(ddf: dd.DataFrame) -> bool:
+    n = ddf.map_partitions(len).sum().compute()
+    return int(n) == 0
+
+
+def dask_is_empty(ddf: dd.DataFrame, *, sample: int = 4) -> bool:
+    if dask_is_probably_empty(ddf):
+        return True
+
+    k = min(max(sample, 1), ddf.npartitions)
+    probes = dask.compute(*[
+        ddf.get_partition(i).map_partitions(len) for i in range(k)
+    ])
+
+    if any(_to_int_safe(n) > 0 for n in probes):
+        return False
+    if k == ddf.npartitions and all(_to_int_safe(n) == 0 for n in probes):
+        return True
+
+    return dask_is_empty_truthful(ddf)
+
+class UniqueValuesExtractor:
+    @staticmethod
+    def _compute_to_list_sync(series) -> List[Any]:
+        """Run in a worker thread when Dask-backed."""
+        if hasattr(series, "compute"):
+            return series.compute().tolist()
+        return series.tolist()
+
+    async def compute_to_list(self, series) -> List[Any]:
+        # Offload potential Dask .compute() to a thread to avoid blocking the event loop
+        return await asyncio.to_thread(self._compute_to_list_sync, series)
+
+    async def extract_unique_values(self, df, *columns: str) -> Dict[str, List[Any]]:
+        async def one(col: str):
+            ser = df[col].dropna().unique()
+            return col, await self.compute_to_list(ser)
+
+        pairs = await asyncio.gather(*(one(c) for c in columns))
+        return dict(pairs)
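The new dask_utils module adds cheap emptiness probes for Dask DataFrames (a metadata check, a partition-sampling check, and a full count as the fallback) plus an async helper for extracting unique column values without blocking the event loop. A small usage sketch; the module path is taken from the file listing, and the ordering of the returned unique values is not guaranteed:

    import asyncio

    import dask.dataframe as dd
    import pandas as pd

    from sibi_dst.utils.dask_utils import UniqueValuesExtractor, dask_is_empty

    pdf = pd.DataFrame({"status": ["a", "b", "a", None], "id": [1, 2, 3, 4]})
    ddf = dd.from_pandas(pdf, npartitions=2)

    print(dask_is_empty(ddf))  # False; probes a few partitions before falling back to a full count

    extractor = UniqueValuesExtractor()
    uniques = asyncio.run(extractor.extract_unique_values(ddf, "status", "id"))
    print(uniques)  # e.g. {'status': ['a', 'b'], 'id': [1, 2, 3, 4]}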