sibi-dst 2025.9.1.tar.gz → 2025.9.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/PKG-INFO +1 -1
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/pyproject.toml +1 -1
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_df_helper.py +0 -354
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +1 -1
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/utils.py +82 -214
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/clickhouse_writer.py +24 -0
- sibi_dst-2025.9.2/sibi_dst/utils/dask_utils.py +61 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/README.md +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/async_utils.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/base.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_attacher.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_parquet_artifact.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_parquet_reader.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/business_days.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/data_wrapper.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/file_age_checker.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/iceberg_saver.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/periods.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/jobs.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/sse_runner.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_hive.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/update_planner.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/utils/log_utils.py +0 -0
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_df_helper.py
RENAMED
@@ -372,357 +372,3 @@ class DfHelper(ManagedResource):
             return bool(ddf.head(1, npartitions=-1).shape[0])
         except Exception:
             return False
-
-
-
-# BEFORE SSE Handling
-# from __future__ import annotations
-#
-# import asyncio
-# from typing import Any, Dict, Optional, TypeVar, Union
-#
-# import dask.dataframe as dd
-# import pandas as pd
-# from fsspec import AbstractFileSystem
-# from pydantic import BaseModel
-#
-# from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
-# from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
-# from .backends.http import HttpConfig
-# from .backends.parquet import ParquetConfig
-# from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
-#
-# T = TypeVar("T", bound=BaseModel)
-#
-# def _is_dask_df(x) -> bool:
-#     return isinstance(x, dd.DataFrame)
-#
-# def _maybe_persist(df, persist: bool):
-#     return df.persist() if persist and _is_dask_df(df) else df
-#
-# def _maybe_compute(df, as_pandas: bool):
-#     return df.compute() if as_pandas and _is_dask_df(df) else df
-#
-#
-# # ---- Backend Strategy Pattern ----
-# class BaseBackend:
-#     def __init__(self, helper: "DfHelper"):
-#         self.helper = helper
-#         self.logger = helper.logger
-#         self.debug = helper.debug
-#         self.total_records = -1
-#
-#     def load(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
-#         raise NotImplementedError
-#
-#     async def aload(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
-#         return await asyncio.to_thread(self.load,**options)
-#
-#
-# class SqlAlchemyBackend(BaseBackend):
-#     def load(self, **options):
-#         try:
-#             if options and hasattr(self.helper._backend_params, "parse_params"):
-#                 self.helper._backend_params.parse_params(options)
-#
-#             with SqlAlchemyLoadFromDb(
-#                 plugin_sqlalchemy=self.helper.backend_db_connection,
-#                 plugin_query=self.helper._backend_query,
-#                 plugin_params=self.helper._backend_params,
-#                 logger=self.logger,
-#                 debug=self.debug,
-#             ) as db_loader:
-#                 self.total_records, result = db_loader.build_and_load()
-#                 return self.total_records, result
-#         except Exception as e:
-#             self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
-#             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#
-# class ParquetBackend(BaseBackend):
-#     def load(self, **options):
-#         try:
-#             df = self.helper.backend_parquet.load_files(**options)
-#             if not self.helper._has_any_rows(df):
-#                 self.total_records = 0
-#                 return 0, self._empty_like(df)
-#
-#             # Let DfHelper decide about persist
-#             self.total_records = -1  # unknown without full count
-#             return self.total_records, df
-#
-#         except Exception as e:
-#             self.total_records = -1  # Reset total_records on failure
-#             self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
-#             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#     @staticmethod
-#     def _empty_like(ddf):
-#         empty_pdf = ddf._meta.iloc[0:0]
-#         return dd.from_pandas(empty_pdf, npartitions=1)
-#
-#
-# class HttpBackend(BaseBackend):
-#     def load(self, **options):
-#         # Avoid event-loop problems in sync code paths.
-#         # If someone calls .load() on an async backend, make it explicit.
-#         raise RuntimeError(
-#             "HttpBackend.load() is sync but this backend is async-only. "
-#             "Call `await helper.aload(...)` or `await helper.load_async(prefer_native=True, ...)`."
-#         )
-#
-#     async def aload(self, **options):
-#         if not self.helper.backend_http:
-#             self.logger.warning("HTTP plugin not configured properly.", extra=self.helper.logger_extra)
-#             self.total_records = -1
-#             return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#         result = await self.helper.backend_http.fetch_data(**options)
-#
-#         # Normalize to DataFrame if the plugin returns list/dict
-#         if isinstance(result, (list, dict)):
-#             pdf = pd.DataFrame(result)
-#             ddf = dd.from_pandas(pdf, npartitions=max(1, min(32, len(pdf) // 50_000 or 1)))
-#             self.total_records = len(pdf)
-#             return self.total_records, ddf
-#
-#         if isinstance(result, pd.DataFrame):
-#             self.total_records = len(result)
-#             ddf = dd.from_pandas(result, npartitions=max(1, min(32, len(result) // 50_000 or 1)))
-#             return self.total_records, ddf
-#
-#         # Fallback
-#         self.total_records = -1
-#         return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#
-# class DfHelper(ManagedResource):
-#     _BACKEND_STRATEGIES = {
-#         "sqlalchemy": SqlAlchemyBackend,
-#         "parquet": ParquetBackend,
-#         "http": HttpBackend,
-#     }
-#
-#     _BACKEND_ATTR_MAP = {
-#         "sqlalchemy": "backend_db_connection",
-#         "parquet": "backend_parquet",
-#         "http": "backend_http",
-#     }
-#
-#     default_config: Dict[str, Any] = None
-#     logger_extra: Dict[str, Any] = {"sibi_dst_component": __name__}
-#
-#     def __init__(self, backend="sqlalchemy", **kwargs):
-#         self.default_config = self.default_config or {}
-#         kwargs = {**self.default_config.copy(), **kwargs}
-#         super().__init__(**kwargs)
-#         self.backend = backend
-#
-#         # Ensure defaults flow to plugin configs
-#         kwargs.setdefault("debug", self.debug)
-#         kwargs.setdefault("fs", self.fs)
-#         kwargs.setdefault("logger", self.logger)
-#
-#         self.total_records = -1
-#         self._backend_query = self._get_config(QueryConfig, kwargs)
-#         self._backend_params = self._get_config(ParamsConfig, kwargs)
-#
-#         self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
-#         self.backend_parquet: Optional[ParquetConfig] = None
-#         self.backend_http: Optional[HttpConfig] = None
-#
-#         if self.backend == "sqlalchemy":
-#             self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
-#         elif self.backend == "parquet":
-#             self.backend_parquet = self._get_config(ParquetConfig, kwargs)
-#         elif self.backend == "http":
-#             self.backend_http = self._get_config(HttpConfig, kwargs)
-#
-#         strategy_cls = self._BACKEND_STRATEGIES.get(self.backend)
-#         if not strategy_cls:
-#             raise ValueError(f"Unsupported backend: {self.backend}")
-#         self.backend_strategy = strategy_cls(self)
-#
-#     # ---------- ManagedResource hooks ----------
-#     def _cleanup(self):
-#         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
-#         if not attr_name:
-#             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
-#             return
-#         active_config = getattr(self, attr_name, None)
-#         if active_config and hasattr(active_config, "close"):
-#             self.logger.debug(f"{self.__class__.__name__} is closing resources for backend '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
-#             active_config.close()
-#
-#     async def _acleanup(self):
-#         self.logger.warning(
-#             "DfHelper instance was not used in an async context manager; cleanup is being called manually.",
-#             extra=self.logger_extra,
-#         )
-#         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
-#         if not attr_name:
-#             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
-#             return
-#         active_config = getattr(self, attr_name, None)
-#         if active_config and hasattr(active_config, "aclose"):
-#             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
-#             await active_config.aclose()
-#
-#     # ---------- config helpers ----------
-#     def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
-#         recognized = set(model.model_fields.keys())
-#         model_kwargs = {k: kwargs[k] for k in recognized if k in kwargs}
-#         return model(**model_kwargs)
-#
-#     # ---------- load/aload ----------
-#     def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-#         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}", extra=self.logger_extra)
-#         self.total_records, df = self.backend_strategy.load(**options)
-#         df = self._process_loaded_data(df)
-#         df = self._post_process_df(df)
-#         df = _maybe_persist(df, persist)
-#         return _maybe_compute(df, as_pandas)
-#
-#     async def aload(
-#         self,
-#         *,
-#         persist: bool = False,
-#         as_pandas: bool = False,
-#         timeout: Optional[float] = None,
-#         **options
-#     ) -> Union[pd.DataFrame, dd.DataFrame]:
-#         # 1) Async load if available, else run sync load in a thread.
-#         if hasattr(self.backend_strategy, "aload"):
-#             load_awaitable = self.backend_strategy.aload(**options)
-#         else:
-#             # Run ONLY the backend load step in a thread to avoid event-loop blocking.
-#             load_awaitable = asyncio.to_thread(self.backend_strategy.load, **options)
-#
-#         total, df = await (asyncio.wait_for(load_awaitable, timeout) if timeout else load_awaitable)
-#         self.total_records = total
-#
-#         # 2) Post-processing steps are sync; offload to threads.
-#         df = await asyncio.to_thread(self._process_loaded_data, df)
-#         df = await asyncio.to_thread(self._post_process_df, df)
-#
-#         # 3) Persist and compute can block; offload when needed.
-#         if persist and _is_dask_df(df):
-#             df = await asyncio.to_thread(df.persist)
-#
-#         if as_pandas and _is_dask_df(df):
-#             # Allow separate timeout for compute if desired; reuse same timeout here.
-#             compute_awaitable = asyncio.to_thread(df.compute)
-#             return await (asyncio.wait_for(compute_awaitable, timeout) if timeout else compute_awaitable)
-#
-#         return df
-#
-#     # ---------- dataframe post-processing ----------
-#     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
-#         self.logger.debug(f"{self.__class__.__name__} is post-processing resulting dataframe with {len(df)} records.", extra=self.logger_extra)
-#         df_params = self._backend_params.df_params
-#         if not df_params:
-#             return df
-#         fieldnames = df_params.get("fieldnames")
-#         column_names = df_params.get("column_names")
-#         index_col = df_params.get("index_col")
-#
-#         if fieldnames:
-#             valid = [f for f in fieldnames if f in df.columns]
-#             if len(valid) < len(fieldnames):
-#                 self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}", extra=self.logger_extra)
-#             df = df[valid]
-#         if column_names:
-#             if len(df.columns) != len(column_names):
-#                 raise ValueError(
-#                     f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided."
-#                 )
-#             df = df.rename(columns=dict(zip(df.columns, column_names)))
-#         if index_col:
-#             if index_col not in df.columns:
-#                 raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
-#             df = df.set_index(index_col)
-#
-#         self.logger.debug("Post-processing complete.", extra=self.logger_extra)
-#         return df
-#
-#     def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
-#         field_map = self._backend_params.field_map or {}
-#         if not isinstance(field_map, dict) or not field_map:
-#             return df
-#         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
-#             return df
-#         self.logger.debug(f"{self.__class__.__name__} is applying rename mapping if/when necessary.", extra=self.logger_extra)
-#         rename_map = {k: v for k, v in field_map.items() if k in df.columns}
-#         if rename_map:
-#             df = df.rename(columns=rename_map)
-#         return df
-#
-#     # ---------- sinks ----------
-#     def save_to_parquet(self, df: dd.DataFrame, **kwargs):
-#         fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
-#         path: str = kwargs.pop("parquet_storage_path", self.backend_parquet.parquet_storage_path if self.backend_parquet else None)
-#         parquet_filename = kwargs.pop("parquet_filename", self.backend_parquet.parquet_filename if self.backend_parquet else None)
-#         if not parquet_filename:
-#             raise ValueError("A 'parquet_filename' keyword argument must be provided.")
-#         if not fs:
-#             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
-#         if not path:
-#             raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
-#         if not self._has_any_rows(df):
-#             self.logger.warning("Skipping save: The provided DataFrame is empty.", extra=self.logger_extra)
-#             return
-#
-#         with ParquetSaver(
-#             df_result=df,
-#             parquet_storage_path=path,
-#             fs=fs,
-#             debug=self.debug,
-#             logger=self.logger,
-#             verbose=self.verbose,
-#             **kwargs,
-#         ) as saver:
-#             saver.save_to_parquet(parquet_filename)
-#
-#         self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.", extra=self.logger_extra)
-#
-#     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
-#         if not self._has_any_rows(df):
-#             self.logger.warning("Skipping save to ClickHouse: The provided DataFrame is empty.", extra=self.logger_extra)
-#             return
-#         with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
-#             writer.save_to_clickhouse(df)
-#         self.logger.debug("Save to ClickHouse completed.", extra=self.logger_extra)
-#
-#     # ---------- period loaders ----------
-#     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
-#         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
-#         return self.load(**final_kwargs)
-#
-#     async def aload_period(self, dt_field: str, start: str, end: str, **kwargs):
-#         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
-#         return await self.aload(**final_kwargs)
-#
-#     def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
-#         start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
-#         if start_date > end_date:
-#             raise ValueError("'start' date cannot be later than 'end' date.")
-#         field_map = self._backend_params.field_map or {}
-#         reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
-#         if len(reverse_map) != len(field_map):
-#             self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.", extra=self.logger_extra)
-#         mapped_field = reverse_map.get(dt_field, dt_field)
-#         if start_date == end_date:
-#             kwargs[f"{mapped_field}__date"] = start_date
-#         else:
-#             kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
-#         self.logger.debug(f"Period load generated filters: {kwargs}", extra=self.logger_extra)
-#         return kwargs
-#
-#     @staticmethod
-#     def _has_any_rows(ddf: dd.DataFrame) -> bool:
-#         try:
-#             return bool(ddf.head(1, npartitions=-1).shape[0])
-#         except Exception:
-#             return False
-#
-#
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py
RENAMED
@@ -30,7 +30,7 @@ class SqlAlchemyLoadFromDb(ManagedResource):
         self.engine = self.db_connection.engine
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size",
+        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 10000) if self.params_config else 10000)
         self.total_records = -1

     def build_and_load(self) -> Tuple[int, dd.DataFrame]:
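The single changed line gives chunk_size an explicit 10000 fallback and tolerates a missing params_config. A small standalone sketch of the implied resolution order (values are illustrative):

# Resolution order implied by the new line:
# explicit kwarg > params_config.df_params["chunk_size"] > 10000 default.
def resolve_chunk_size(kwargs: dict, params_config) -> int:
    fallback = params_config.df_params.get("chunk_size", 10000) if params_config else 10000
    return kwargs.get("chunk_size", fallback)

assert resolve_chunk_size({"chunk_size": 500}, None) == 500
assert resolve_chunk_size({}, None) == 10000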
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/utils.py
RENAMED
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import math
 import os
 import pickle
@@ -9,236 +11,102 @@ import numpy as np
 import osmnx as ox
 from geopy.distance import geodesic
 
-
-
-# options = {
-#     'ox_files_save_path': ox_files_save_path,
-#     'network_type': 'drive',
-#     'place': 'Costa Rica',
-#     'files_prefix': 'costa-rica-',
-# }
-# Usage example
-# handler = PBFHandler(**options)
-# handler.load()
-
+from typing import Optional
+from fsspec.core import url_to_fs
 
 class PBFHandler:
     """
-
-    from .pbf (Protocolbuffer Binary Format) files. This class enables the
-    loading, processing, saving, and reutilization of graph, node, and edge
-    data for geographical regions, supporting verbose mode for detailed outputs.
-
-    :ivar graph: The generated graph object representing the spatial network; can be None if not yet loaded or processed.
-    :type graph: Optional[NetworkX.Graph]
-    :ivar nodes: GeoDataFrame representing the nodes of the graph; can be None if not yet loaded or processed.
-    :type nodes: Optional[geopandas.GeoDataFrame]
-    :ivar edges: GeoDataFrame representing the edges of the graph; can be None if not yet loaded or processed.
-    :type edges: Optional[geopandas.GeoDataFrame]
-    :ivar rebuild: Indicates whether to rebuild the graph data, ignoring any existing cached files. Default is ``False``.
-    :type rebuild: bool
-    :ivar verbose: Enables verbose mode to provide detailed status messages during operations. Default is ``False``.
-    :type verbose: bool
-    :ivar place: The name of the geographical region to process with OpenStreetMap. Default is ``Costa Rica``.
-    :type place: str
-    :ivar filepath: The path to the directory where the graph, nodes, and edges pickle files are saved. Default is ``gis_data/``.
-    :type filepath: str
-    :ivar file_prefix: The prefix for the filenames of the saved graph, node, and edge pickle files. Default is ``costa-rica-``.
-    :type file_prefix: str
-    :ivar network_type: The type of network to extract from OpenStreetMap, such as "all" or other specific network types. Default is ``all``.
-    :type network_type: str
-    :ivar graph_file: Full path of the file to save or load the graph data as a pickle file.
-    :type graph_file: str
-    :ivar node_file: Full path of the file to save or load the graph's node data as a pickle file.
-    :type node_file: str
-    :ivar edge_file: Full path of the file to save or load the graph's edge data as a pickle file.
-    :type edge_file: str
+    Build/load OSMnx graph + nodes/edges; persist as pickle via fsspec.
     """
+
     def __init__(self, **kwargs):
         self.graph = None
-        self.nodes = None
-        self.edges = None
-
-        self.
-        self.
-        self.
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        :param node_file: Path to the node file to be loaded or rebuilt.
-        :param edge_file: Path to the edge file to be loaded or rebuilt.
-        :param filepath: Path to the directory where files are processed and saved.
-
-        :return: None
-        """
+        self.nodes: Optional[gpd.GeoDataFrame] = None
+        self.edges: Optional[gpd.GeoDataFrame] = None
+
+        self.rebuild: bool = kwargs.setdefault("rebuild", False)
+        self.verbose: bool = kwargs.setdefault("verbose", False)
+        self.place: str = kwargs.setdefault("place", "Costa Rica")
+        self.network_type: str = kwargs.setdefault("network_type", "all")
+        base_url: str = kwargs.setdefault("data_path", "osmnx_data/pbf_files")
+        prefix: str = kwargs.setdefault("files_prefix", "costa-rica-").rstrip("-") + "-"
+
+        # Allow passing an fsspec instance directly
+        fs = kwargs.get("fs")
+        if fs is not None:
+            self.fs = fs
+            self.base = base_url.rstrip("/")
+        else:
+            self.fs, self.base = url_to_fs(base_url)
+
+        self.fs.mkdirs(self.base, exist_ok=True)
+
+        self.graph_file = f"{self.base.rstrip('/')}/{prefix}graph.pkl"
+        self.node_file = f"{self.base.rstrip('/')}/{prefix}nodes.pkl"
+        self.edge_file = f"{self.base.rstrip('/')}/{prefix}edges.pkl"
+
         if self.verbose:
-            print("
+            print(f"[PBFHandler] base={self.base}")
+            print(f" graph={self.graph_file}")
+            print(f" nodes={self.node_file}")
+            print(f" edges={self.edge_file}")
 
-
+    # ---------- public API ----------
+    def load(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] load()")
 
         if self.rebuild:
-
-
-
-            if not os.path.exists(self.filepath):
-                os.makedirs(self.filepath, exist_ok=True)
-            # self.process_pbf()
-            # self.save_to_pickle()
-        if not all(os.path.exists(f) for f in files_to_check):
+            self._delete_artifacts()
+
+        if not self._artifacts_exist():
             self.process_pbf()
             self.save_to_pickle()
         else:
             self.load_from_pickle()
 
+    def process_pbf(self) -> None:
+        if self.verbose:
+            print(f"[PBFHandler] processing: {self.place}")
+        self.graph = ox.graph_from_place(self.place, network_type=self.network_type)
+        self.nodes, self.edges = ox.graph_to_gdfs(self.graph)
+
+    def save_to_pickle(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] saving via fsspec")
+        for path, obj in {
+            self.graph_file: self.graph,
+            self.node_file: self.nodes,
+            self.edge_file: self.edges,
+        }.items():
+            if obj is not None:
+                with self.fs.open(path, "wb") as f:
+                    pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+    def load_from_pickle(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] loading via fsspec")
+        self.graph = self._load_pickle(self.graph_file)
+        self.nodes = self._load_pickle(self.node_file)
+        self.edges = self._load_pickle(self.edge_file)
+
+    # ---------- helpers ----------
+    def _artifacts_exist(self) -> bool:
+        return all(self.fs.exists(p) for p in (self.graph_file, self.node_file, self.edge_file))
+
+    def _delete_artifacts(self) -> None:
         if self.verbose:
-            print("
-
-
-
-
-
-
-
-
-
-
-            A flag to control verbose output. If True, detailed processing status messages are
-            logged to the console.
-
-        :param self.place: str
-            The name or description of the geographic place for which PBF data is processed. It
-            is used to construct a graph representation of the place.
-
-        :param self.network_type: str
-            The type of network graph to be created, typically one of 'all', 'walk', 'drive',
-            etc., reflecting the type of paths or streets included in the graph.
-
-        :return: None
-            This function does not return a value, but updates class attributes ``graph``,
-            ``nodes``, and ``edges``.
-
-        :raises Exception:
-            Raises a general exception when there is an error in processing the PBF data. Error
-            details are printed when verbose output is enabled.
-        """
-        try:
-            if self.verbose:
-                print(f"Processing PBF for {self.place}...")
-
-            self.graph = ox.graph_from_place(self.place, network_type=self.network_type)
-            self.nodes, self.edges = ox.graph_to_gdfs(self.graph)
-
-            if self.verbose:
-                print("PBF processed successfully.")
-        except Exception as e:
-            print(f"Error processing PBF: {e}")
-            raise
-
-    def save_to_pickle(self):
-        """
-        Saves data, including graph, nodes, and edges, to pickle files. Each data object is
-        saved to its corresponding file if available. If verbose mode is enabled, prints
-        messages indicating the saving progress and success.
-
-        :param self:
-            Represents the instance of the class that contains attributes `graph_file`,
-            `graph`, `node_file`, `nodes`, `edge_file`, `edges`, and `verbose`. These
-            attributes determine the files to save to and the data to save.
-
-        :raises Exception:
-            Raises an exception if an error occurs during the saving process.
-
-        :return:
-            None
-        """
-        try:
-            if self.verbose:
-                print("Saving data to pickle files...")
-
-            data_to_save = {
-                self.graph_file: self.graph,
-                self.node_file: self.nodes,
-                self.edge_file: self.edges
-            }
-
-            for file, data in data_to_save.items():
-                if data is not None:
-                    with open(file, 'wb') as f:
-                        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
-
-            if self.verbose:
-                print("Data saved to pickle files successfully.")
-        except Exception as e:
-            print(f"Error saving to pickle: {e}")
-            raise
-
-    def load_from_pickle(self):
-        """
-        Loads data from pickle files specified by the attributes `graph_file`, `node_file`,
-        and `edge_file` and assigns them to the corresponding attributes `graph`,
-        `nodes`, and `edges`, respectively. Displays verbose messages during the load
-        process if the `verbose` attribute is set to True.
-
-        :raises Exception: If an error occurs during reading or deserialization of the
-            pickle files.
-        """
-        try:
-            if self.verbose:
-                print("Loading data from pickle files...")
-
-            files_to_load = {
-                self.graph_file: 'graph',
-                self.node_file: 'nodes',
-                self.edge_file: 'edges'
-            }
-
-            for file, attr in files_to_load.items():
-                with open(file, 'rb') as f:
-                    setattr(self, attr, pickle.load(f))
-
-            if self.verbose:
-                print("Data loaded from pickle files successfully.")
-        except Exception as e:
-            print(f"Error loading from pickle: {e}")
-            raise
-
-    def plot_graph(self):
-        """
-        Plots the loaded graph using the OSMnx library.
-
-        This method checks if a graph is loaded and, if available, plots it. Outputs
-        verbose messages during the process if verbosity is enabled.
-
-        :raises Exception: Raises if an error occurs during the plotting process.
-        :return: None
-        """
-        try:
-            if self.graph is not None:
-                if self.verbose:
-                    print("Plotting the graph...")
-                ox.plot_graph(self.graph)
-                if self.verbose:
-                    print("Graph plotted successfully.")
-            else:
-                print("Graph is not loaded. Please load a PBF file first.")
-        except Exception as e:
-            print(f"Error plotting the graph: {e}")
-            raise
+            print("[PBFHandler] deleting artifacts (rebuild=True)")
+        for p in (self.graph_file, self.node_file, self.edge_file):
+            if self.fs.exists(p):
+                try:
+                    self.fs.rm_file(p)
+                except Exception:
+                    self.fs.rm(p)
+
+    def _load_pickle(self, path: str):
+        with self.fs.open(path, "rb") as f:
+            return pickle.load(f)
 
 
 def get_bounding_box_from_points(gps_points, margin=0.001):
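PBFHandler now resolves its cache location through fsspec's url_to_fs, so the pickled graph/nodes/edges can live on any fsspec-backed filesystem. A usage sketch under the keyword defaults shown in the new __init__; the import path is inferred from the file layout and the data_path value is illustrative:

# Sketch only: import path assumed from sibi_dst/osmnx_helper/utils.py.
from sibi_dst.osmnx_helper.utils import PBFHandler

handler = PBFHandler(
    place="Costa Rica",
    network_type="drive",
    data_path="osmnx_data/pbf_files",  # any fsspec URL should work here, e.g. "s3://bucket/osmnx"
    files_prefix="costa-rica-",
    rebuild=False,
    verbose=True,
)
handler.load()  # builds via OSMnx on the first run, then reuses the cached pickles
graph, nodes, edges = handler.graph, handler.nodes, handler.edges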
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/clickhouse_writer.py
RENAMED
@@ -10,6 +10,14 @@ import clickhouse_connect
 
 from . import ManagedResource
 
+def _to_bool(val: Any) -> bool:
+    if isinstance(val, bool):
+        return val
+    if isinstance(val, (int, float)):
+        return bool(val)
+    if isinstance(val, str):
+        return val.strip().lower() in ("1", "true", "yes", "on")
+    return False
 
 class ClickHouseWriter(ManagedResource):
     """
@@ -47,6 +55,11 @@ class ClickHouseWriter(ManagedResource):
         database: str = "sibi_data",
         user: str = "default",
         password: str = "",
+        secure: bool = False,
+        verify: bool = False,
+        ca_cert: str = "",
+        client_cert: str = "",
+        compression: str = "",
         table: str = "test_sibi_table",
         order_by: str = "id",
         engine: Optional[str] = None,  # e.g. "ENGINE MergeTree ORDER BY (`id`)"
@@ -61,6 +74,11 @@ class ClickHouseWriter(ManagedResource):
         self.database = database
         self.user = user
         self.password = password
+        self.secure = _to_bool(secure)
+        self.verify = _to_bool(verify)
+        self.ca_cert = ca_cert
+        self.client_cert = client_cert
+        self.compression = compression  # e.g. 'lz4', 'zstd',
         self.table = table
         self.order_by = order_by
         self.engine = engine  # if None → default MergeTree ORDER BY
@@ -224,6 +242,7 @@ class ClickHouseWriter(ManagedResource):
     # ------------- low-level helpers -------------
 
     def _get_client(self):
+        print(self.secure, " ", self.verify)
         cli = getattr(self._tlocal, "client", None)
         if cli is not None:
             return cli
@@ -233,6 +252,11 @@ class ClickHouseWriter(ManagedResource):
             database=self.database,
             username=self.user,  # clickhouse-connect uses 'username'
             password=self.password,
+            secure=self.secure,
+            verify=self.verify,
+            ca_cert=self.ca_cert or None,
+            client_cert=self.client_cert or None,
+            compression=self.compression or None,
         )
         self._tlocal.client = cli
         return cli
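ClickHouseWriter now accepts TLS and compression settings (secure, verify, ca_cert, client_cert, compression) and forwards them to the clickhouse_connect client. A hedged sketch of a secure write using the context-manager pattern seen elsewhere in the package; host, port and credentials below are placeholders and the host/port keyword names are an assumption, since this hunk only shows part of the constructor signature:

from sibi_dst.utils import ClickHouseWriter  # export confirmed by the removed DfHelper code above

writer_kwargs = dict(
    host="clickhouse.example.com",    # hypothetical
    port=8443,                        # hypothetical
    database="sibi_data",
    user="default",
    password="secret",                # hypothetical
    secure=True,                      # bools or strings like "true"/"1" are accepted via _to_bool
    verify=True,
    ca_cert="/etc/ssl/certs/ca.pem",  # hypothetical path
    compression="lz4",
    table="test_sibi_table",
    order_by="id",
)
with ClickHouseWriter(**writer_kwargs) as writer:
    writer.save_to_clickhouse(ddf)  # ddf: a Dask DataFrame prepared elsewhere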
sibi_dst-2025.9.2/sibi_dst/utils/dask_utils.py
@@ -0,0 +1,61 @@
+import asyncio
+from typing import List, Any, Dict
+
+import dask
+import dask.dataframe as dd
+
+def _to_int_safe(x) -> int:
+    """
+    Convert scalar-like to int safely.
+    Handles numpy scalars, pandas Series/DataFrame outputs.
+    """
+    if hasattr(x, "item"):  # numpy scalar, pandas scalar
+        return int(x.item())
+    if hasattr(x, "iloc"):  # Series-like
+        return int(x.iloc[0])
+    return int(x)
+
+def dask_is_probably_empty(ddf: dd.DataFrame) -> bool:
+    return getattr(ddf, "npartitions", 0) == 0 or len(ddf._meta.columns) == 0
+
+
+def dask_is_empty_truthful(ddf: dd.DataFrame) -> bool:
+    n = ddf.map_partitions(len).sum().compute()
+    return int(n) == 0
+
+
+def dask_is_empty(ddf: dd.DataFrame, *, sample: int = 4) -> bool:
+    if dask_is_probably_empty(ddf):
+        return True
+
+    k = min(max(sample, 1), ddf.npartitions)
+    probes = dask.compute(*[
+        ddf.get_partition(i).map_partitions(len) for i in range(k)
+    ])
+
+    if any(_to_int_safe(n) > 0 for n in probes):
+        return False
+    if k == ddf.npartitions and all(_to_int_safe(n) == 0 for n in probes):
+        return True
+
+    return dask_is_empty_truthful(ddf)
+
+class UniqueValuesExtractor:
+    @staticmethod
+    def _compute_to_list_sync(series) -> List[Any]:
+        """Run in a worker thread when Dask-backed."""
+        if hasattr(series, "compute"):
+            return series.compute().tolist()
+        return series.tolist()
+
+    async def compute_to_list(self, series) -> List[Any]:
+        # Offload potential Dask .compute() to a thread to avoid blocking the event loop
+        return await asyncio.to_thread(self._compute_to_list_sync, series)
+
+    async def extract_unique_values(self, df, *columns: str) -> Dict[str, List[Any]]:
+        async def one(col: str):
+            ser = df[col].dropna().unique()
+            return col, await self.compute_to_list(ser)
+
+        pairs = await asyncio.gather(*(one(c) for c in columns))
+        return dict(pairs)