sibi-dst 2025.9.1.tar.gz → 2025.9.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/PKG-INFO +1 -1
  2. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/pyproject.toml +1 -1
  3. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_df_helper.py +0 -354
  4. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +1 -1
  5. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/utils.py +82 -214
  6. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/clickhouse_writer.py +24 -0
  7. sibi_dst-2025.9.2/sibi_dst/utils/dask_utils.py +61 -0
  8. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/README.md +0 -0
  9. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/__init__.py +0 -0
  10. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/__init__.py +0 -0
  11. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
  12. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
  13. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  14. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  15. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/__init__.py +0 -0
  16. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  17. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  18. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  19. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  20. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  21. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  22. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
  23. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  24. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
  25. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  26. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/__init__.py +0 -0
  27. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_defaults.py +0 -0
  28. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  29. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_params_config.py +0 -0
  30. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_query_config.py +0 -0
  31. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/data_cleaner.py +0 -0
  32. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/__init__.py +0 -0
  33. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  34. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/utils.py +0 -0
  35. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/__init__.py +0 -0
  36. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  37. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  38. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  39. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
  40. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  41. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
  42. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/tests/__init__.py +0 -0
  43. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  44. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/__init__.py +0 -0
  45. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/async_utils.py +0 -0
  46. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/base.py +0 -0
  47. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/__init__.py +0 -0
  48. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_attacher.py +0 -0
  49. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
  50. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_parquet_artifact.py +0 -0
  51. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_parquet_reader.py +0 -0
  52. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/business_days.py +0 -0
  53. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/credentials.py +0 -0
  54. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/data_from_http_source.py +0 -0
  55. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/data_utils.py +0 -0
  56. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/data_wrapper.py +0 -0
  57. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/date_utils.py +0 -0
  58. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/df_utils.py +0 -0
  59. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/file_age_checker.py +0 -0
  60. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/file_utils.py +0 -0
  61. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/filepath_generator.py +0 -0
  62. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/iceberg_saver.py +0 -0
  63. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/log_utils.py +0 -0
  64. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/manifest_manager.py +0 -0
  65. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/parquet_saver.py +0 -0
  66. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/periods.py +0 -0
  67. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/phone_formatter.py +0 -0
  68. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/__init__.py +0 -0
  69. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/jobs.py +0 -0
  70. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/sse_runner.py +0 -0
  71. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_config.py +0 -0
  72. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_hive.py +0 -0
  73. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_manager.py +0 -0
  74. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/update_planner.py +0 -0
  75. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/webdav_client.py +0 -0
  76. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/__init__.py +0 -0
  77. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/__init__.py +0 -0
  78. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  79. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  80. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  81. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  82. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  83. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  84. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  85. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  86. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  87. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  88. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  89. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  90. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  91. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  92. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  93. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  94. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/utils/__init__.py +0 -0
  95. {sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/v2/utils/log_utils.py +0 -0
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.9.1
+Version: 2025.9.2
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "2025.9.1"
+version = "2025.9.2"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_df_helper.py
@@ -372,357 +372,3 @@ class DfHelper(ManagedResource):
             return bool(ddf.head(1, npartitions=-1).shape[0])
         except Exception:
             return False
-
-
-
-# BEFORE SSE Handling
-# from __future__ import annotations
-#
-# import asyncio
-# from typing import Any, Dict, Optional, TypeVar, Union
-#
-# import dask.dataframe as dd
-# import pandas as pd
-# from fsspec import AbstractFileSystem
-# from pydantic import BaseModel
-#
-# from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
-# from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
-# from .backends.http import HttpConfig
-# from .backends.parquet import ParquetConfig
-# from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
-#
-# T = TypeVar("T", bound=BaseModel)
-#
-# def _is_dask_df(x) -> bool:
-# return isinstance(x, dd.DataFrame)
-#
-# def _maybe_persist(df, persist: bool):
-# return df.persist() if persist and _is_dask_df(df) else df
-#
-# def _maybe_compute(df, as_pandas: bool):
-# return df.compute() if as_pandas and _is_dask_df(df) else df
-#
-#
-# # ---- Backend Strategy Pattern ----
-# class BaseBackend:
-# def __init__(self, helper: "DfHelper"):
-# self.helper = helper
-# self.logger = helper.logger
-# self.debug = helper.debug
-# self.total_records = -1
-#
-# def load(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
-# raise NotImplementedError
-#
-# async def aload(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
-# return await asyncio.to_thread(self.load,**options)
-#
-#
-# class SqlAlchemyBackend(BaseBackend):
-# def load(self, **options):
-# try:
-# if options and hasattr(self.helper._backend_params, "parse_params"):
-# self.helper._backend_params.parse_params(options)
-#
-# with SqlAlchemyLoadFromDb(
-# plugin_sqlalchemy=self.helper.backend_db_connection,
-# plugin_query=self.helper._backend_query,
-# plugin_params=self.helper._backend_params,
-# logger=self.logger,
-# debug=self.debug,
-# ) as db_loader:
-# self.total_records, result = db_loader.build_and_load()
-# return self.total_records, result
-# except Exception as e:
-# self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
-# return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#
-# class ParquetBackend(BaseBackend):
-# def load(self, **options):
-# try:
-# df = self.helper.backend_parquet.load_files(**options)
-# if not self.helper._has_any_rows(df):
-# self.total_records = 0
-# return 0, self._empty_like(df)
-#
-# # Let DfHelper decide about persist
-# self.total_records = -1 # unknown without full count
-# return self.total_records, df
-#
-# except Exception as e:
-# self.total_records = -1 # Reset total_records on failure
-# self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
-# return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-# @staticmethod
-# def _empty_like(ddf):
-# empty_pdf = ddf._meta.iloc[0:0]
-# return dd.from_pandas(empty_pdf, npartitions=1)
-#
-#
-# class HttpBackend(BaseBackend):
-# def load(self, **options):
-# # Avoid event-loop problems in sync code paths.
-# # If someone calls .load() on an async backend, make it explicit.
-# raise RuntimeError(
-# "HttpBackend.load() is sync but this backend is async-only. "
-# "Call `await helper.aload(...)` or `await helper.load_async(prefer_native=True, ...)`."
-# )
-#
-# async def aload(self, **options):
-# if not self.helper.backend_http:
-# self.logger.warning("HTTP plugin not configured properly.", extra=self.helper.logger_extra)
-# self.total_records = -1
-# return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-# result = await self.helper.backend_http.fetch_data(**options)
-#
-# # Normalize to DataFrame if the plugin returns list/dict
-# if isinstance(result, (list, dict)):
-# pdf = pd.DataFrame(result)
-# ddf = dd.from_pandas(pdf, npartitions=max(1, min(32, len(pdf) // 50_000 or 1)))
-# self.total_records = len(pdf)
-# return self.total_records, ddf
-#
-# if isinstance(result, pd.DataFrame):
-# self.total_records = len(result)
-# ddf = dd.from_pandas(result, npartitions=max(1, min(32, len(result) // 50_000 or 1)))
-# return self.total_records, ddf
-#
-# # Fallback
-# self.total_records = -1
-# return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#
-# class DfHelper(ManagedResource):
-# _BACKEND_STRATEGIES = {
-# "sqlalchemy": SqlAlchemyBackend,
-# "parquet": ParquetBackend,
-# "http": HttpBackend,
-# }
-#
-# _BACKEND_ATTR_MAP = {
-# "sqlalchemy": "backend_db_connection",
-# "parquet": "backend_parquet",
-# "http": "backend_http",
-# }
-#
-# default_config: Dict[str, Any] = None
-# logger_extra: Dict[str, Any] = {"sibi_dst_component": __name__}
-#
-# def __init__(self, backend="sqlalchemy", **kwargs):
-# self.default_config = self.default_config or {}
-# kwargs = {**self.default_config.copy(), **kwargs}
-# super().__init__(**kwargs)
-# self.backend = backend
-#
-# # Ensure defaults flow to plugin configs
-# kwargs.setdefault("debug", self.debug)
-# kwargs.setdefault("fs", self.fs)
-# kwargs.setdefault("logger", self.logger)
-#
-# self.total_records = -1
-# self._backend_query = self._get_config(QueryConfig, kwargs)
-# self._backend_params = self._get_config(ParamsConfig, kwargs)
-#
-# self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
-# self.backend_parquet: Optional[ParquetConfig] = None
-# self.backend_http: Optional[HttpConfig] = None
-#
-# if self.backend == "sqlalchemy":
-# self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
-# elif self.backend == "parquet":
-# self.backend_parquet = self._get_config(ParquetConfig, kwargs)
-# elif self.backend == "http":
-# self.backend_http = self._get_config(HttpConfig, kwargs)
-#
-# strategy_cls = self._BACKEND_STRATEGIES.get(self.backend)
-# if not strategy_cls:
-# raise ValueError(f"Unsupported backend: {self.backend}")
-# self.backend_strategy = strategy_cls(self)
-#
-# # ---------- ManagedResource hooks ----------
-# def _cleanup(self):
-# attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
-# if not attr_name:
-# self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
-# return
-# active_config = getattr(self, attr_name, None)
-# if active_config and hasattr(active_config, "close"):
-# self.logger.debug(f"{self.__class__.__name__} is closing resources for backend '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
-# active_config.close()
-#
-# async def _acleanup(self):
-# self.logger.warning(
-# "DfHelper instance was not used in an async context manager; cleanup is being called manually.",
-# extra=self.logger_extra,
-# )
-# attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
-# if not attr_name:
-# self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
-# return
-# active_config = getattr(self, attr_name, None)
-# if active_config and hasattr(active_config, "aclose"):
-# self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
-# await active_config.aclose()
-#
-# # ---------- config helpers ----------
-# def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
-# recognized = set(model.model_fields.keys())
-# model_kwargs = {k: kwargs[k] for k in recognized if k in kwargs}
-# return model(**model_kwargs)
-#
-# # ---------- load/aload ----------
-# def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-# self.logger.debug(f"Loading data from {self.backend} backend with options: {options}", extra=self.logger_extra)
-# self.total_records, df = self.backend_strategy.load(**options)
-# df = self._process_loaded_data(df)
-# df = self._post_process_df(df)
-# df = _maybe_persist(df, persist)
-# return _maybe_compute(df, as_pandas)
-#
-# async def aload(
-# self,
-# *,
-# persist: bool = False,
-# as_pandas: bool = False,
-# timeout: Optional[float] = None,
-# **options
-# ) -> Union[pd.DataFrame, dd.DataFrame]:
-# # 1) Async load if available, else run sync load in a thread.
-# if hasattr(self.backend_strategy, "aload"):
-# load_awaitable = self.backend_strategy.aload(**options)
-# else:
-# # Run ONLY the backend load step in a thread to avoid event-loop blocking.
-# load_awaitable = asyncio.to_thread(self.backend_strategy.load, **options)
-#
-# total, df = await (asyncio.wait_for(load_awaitable, timeout) if timeout else load_awaitable)
-# self.total_records = total
-#
-# # 2) Post-processing steps are sync; offload to threads.
-# df = await asyncio.to_thread(self._process_loaded_data, df)
-# df = await asyncio.to_thread(self._post_process_df, df)
-#
-# # 3) Persist and compute can block; offload when needed.
-# if persist and _is_dask_df(df):
-# df = await asyncio.to_thread(df.persist)
-#
-# if as_pandas and _is_dask_df(df):
-# # Allow separate timeout for compute if desired; reuse same timeout here.
-# compute_awaitable = asyncio.to_thread(df.compute)
-# return await (asyncio.wait_for(compute_awaitable, timeout) if timeout else compute_awaitable)
-#
-# return df
-#
-# # ---------- dataframe post-processing ----------
-# def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
-# self.logger.debug(f"{self.__class__.__name__} is post-processing resulting dataframe with {len(df)} records.", extra=self.logger_extra)
-# df_params = self._backend_params.df_params
-# if not df_params:
-# return df
-# fieldnames = df_params.get("fieldnames")
-# column_names = df_params.get("column_names")
-# index_col = df_params.get("index_col")
-#
-# if fieldnames:
-# valid = [f for f in fieldnames if f in df.columns]
-# if len(valid) < len(fieldnames):
-# self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}", extra=self.logger_extra)
-# df = df[valid]
-# if column_names:
-# if len(df.columns) != len(column_names):
-# raise ValueError(
-# f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided."
-# )
-# df = df.rename(columns=dict(zip(df.columns, column_names)))
-# if index_col:
-# if index_col not in df.columns:
-# raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
-# df = df.set_index(index_col)
-#
-# self.logger.debug("Post-processing complete.", extra=self.logger_extra)
-# return df
-#
-# def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
-# field_map = self._backend_params.field_map or {}
-# if not isinstance(field_map, dict) or not field_map:
-# return df
-# if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
-# return df
-# self.logger.debug(f"{self.__class__.__name__} is applying rename mapping if/when necessary.", extra=self.logger_extra)
-# rename_map = {k: v for k, v in field_map.items() if k in df.columns}
-# if rename_map:
-# df = df.rename(columns=rename_map)
-# return df
-#
-# # ---------- sinks ----------
-# def save_to_parquet(self, df: dd.DataFrame, **kwargs):
-# fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
-# path: str = kwargs.pop("parquet_storage_path", self.backend_parquet.parquet_storage_path if self.backend_parquet else None)
-# parquet_filename = kwargs.pop("parquet_filename", self.backend_parquet.parquet_filename if self.backend_parquet else None)
-# if not parquet_filename:
-# raise ValueError("A 'parquet_filename' keyword argument must be provided.")
-# if not fs:
-# raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
-# if not path:
-# raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
-# if not self._has_any_rows(df):
-# self.logger.warning("Skipping save: The provided DataFrame is empty.", extra=self.logger_extra)
-# return
-#
-# with ParquetSaver(
-# df_result=df,
-# parquet_storage_path=path,
-# fs=fs,
-# debug=self.debug,
-# logger=self.logger,
-# verbose=self.verbose,
-# **kwargs,
-# ) as saver:
-# saver.save_to_parquet(parquet_filename)
-#
-# self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.", extra=self.logger_extra)
-#
-# def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
-# if not self._has_any_rows(df):
-# self.logger.warning("Skipping save to ClickHouse: The provided DataFrame is empty.", extra=self.logger_extra)
-# return
-# with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
-# writer.save_to_clickhouse(df)
-# self.logger.debug("Save to ClickHouse completed.", extra=self.logger_extra)
-#
-# # ---------- period loaders ----------
-# def load_period(self, dt_field: str, start: str, end: str, **kwargs):
-# final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
-# return self.load(**final_kwargs)
-#
-# async def aload_period(self, dt_field: str, start: str, end: str, **kwargs):
-# final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
-# return await self.aload(**final_kwargs)
-#
-# def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
-# start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
-# if start_date > end_date:
-# raise ValueError("'start' date cannot be later than 'end' date.")
-# field_map = self._backend_params.field_map or {}
-# reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
-# if len(reverse_map) != len(field_map):
-# self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.", extra=self.logger_extra)
-# mapped_field = reverse_map.get(dt_field, dt_field)
-# if start_date == end_date:
-# kwargs[f"{mapped_field}__date"] = start_date
-# else:
-# kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
-# self.logger.debug(f"Period load generated filters: {kwargs}", extra=self.logger_extra)
-# return kwargs
-#
-# @staticmethod
-# def _has_any_rows(ddf: dd.DataFrame) -> bool:
-# try:
-# return bool(ddf.head(1, npartitions=-1).shape[0])
-# except Exception:
-# return False
-#
-#
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py
@@ -30,7 +30,7 @@ class SqlAlchemyLoadFromDb(ManagedResource):
         self.engine = self.db_connection.engine
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000) if self.params_config else 1000)
+        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 10000) if self.params_config else 10000)
         self.total_records = -1

     def build_and_load(self) -> Tuple[int, dd.DataFrame]:
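The only functional change here is the default chunk size for SQLAlchemy-backed loads, which rises from 1,000 to 10,000 rows. A minimal sketch of how a caller could still pin a smaller value, assuming configuration objects built elsewhere (the variable names below are illustrative, not from the package):

    # Hedged sketch: constructor keywords follow the usage visible in this diff and in the
    # commented-out DfHelper code removed above; connection_config/query_config/params_config
    # are hypothetical, pre-built config instances.
    with SqlAlchemyLoadFromDb(
        plugin_sqlalchemy=connection_config,
        plugin_query=query_config,
        plugin_params=params_config,
        chunk_size=2_000,  # an explicit value still overrides the new 10_000 default
    ) as loader:
        total_records, ddf = loader.build_and_load()  # returns (row count, dask.dataframe.DataFrame)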
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/utils.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import math
 import os
 import pickle
@@ -9,236 +11,102 @@ import numpy as np
 import osmnx as ox
 from geopy.distance import geodesic

-
-#
-# options = {
-# 'ox_files_save_path': ox_files_save_path,
-# 'network_type': 'drive',
-# 'place': 'Costa Rica',
-# 'files_prefix': 'costa-rica-',
-# }
-# Usage example
-# handler = PBFHandler(**options)
-# handler.load()
-
+from typing import Optional
+from fsspec.core import url_to_fs

 class PBFHandler:
     """
-    Handles the creation, management, and visualization of graph data derived
-    from .pbf (Protocolbuffer Binary Format) files. This class enables the
-    loading, processing, saving, and reutilization of graph, node, and edge
-    data for geographical regions, supporting verbose mode for detailed outputs.
-
-    :ivar graph: The generated graph object representing the spatial network; can be None if not yet loaded or processed.
-    :type graph: Optional[NetworkX.Graph]
-    :ivar nodes: GeoDataFrame representing the nodes of the graph; can be None if not yet loaded or processed.
-    :type nodes: Optional[geopandas.GeoDataFrame]
-    :ivar edges: GeoDataFrame representing the edges of the graph; can be None if not yet loaded or processed.
-    :type edges: Optional[geopandas.GeoDataFrame]
-    :ivar rebuild: Indicates whether to rebuild the graph data, ignoring any existing cached files. Default is ``False``.
-    :type rebuild: bool
-    :ivar verbose: Enables verbose mode to provide detailed status messages during operations. Default is ``False``.
-    :type verbose: bool
-    :ivar place: The name of the geographical region to process with OpenStreetMap. Default is ``Costa Rica``.
-    :type place: str
-    :ivar filepath: The path to the directory where the graph, nodes, and edges pickle files are saved. Default is ``gis_data/``.
-    :type filepath: str
-    :ivar file_prefix: The prefix for the filenames of the saved graph, node, and edge pickle files. Default is ``costa-rica-``.
-    :type file_prefix: str
-    :ivar network_type: The type of network to extract from OpenStreetMap, such as "all" or other specific network types. Default is ``all``.
-    :type network_type: str
-    :ivar graph_file: Full path of the file to save or load the graph data as a pickle file.
-    :type graph_file: str
-    :ivar node_file: Full path of the file to save or load the graph's node data as a pickle file.
-    :type node_file: str
-    :ivar edge_file: Full path of the file to save or load the graph's edge data as a pickle file.
-    :type edge_file: str
+    Build/load OSMnx graph + nodes/edges; persist as pickle via fsspec.
     """
+
     def __init__(self, **kwargs):
         self.graph = None
-        self.nodes = None
-        self.edges = None
-        self.rebuild = kwargs.setdefault("rebuild", False)
-        self.verbose = kwargs.setdefault("verbose", False)
-        self.place = kwargs.setdefault('place', 'Costa Rica')
-        self.filepath = kwargs.setdefault('ox_files_save_path', "gis_data/")
-        self.file_prefix = kwargs.setdefault('file_prefix', 'costa-rica-')
-        self.network_type = kwargs.setdefault('network_type', 'all')
-        self.graph_file = f"{self.filepath}{self.file_prefix}graph.pkl"
-        self.node_file = f"{self.filepath}{self.file_prefix}nodes.pkl"
-        self.edge_file = f"{self.filepath}{self.file_prefix}edges.pkl"
-
-    def load(self):
-        """
-        Loads the required data files for processing. If the files do not exist or
-        if the `rebuild` flag is set to True, it will process and recreate the
-        necessary data from the source. Otherwise, it will load the data from
-        existing pickle files. This function ensures the target directory exists,
-        and processes files conditionally based on their presence.
-
-        :param verbose: Flag to control the verbosity of the function's output.
-        :param rebuild: Indicates whether the data should be rebuilt from the raw
-            source files.
-        :param graph_file: Path to the graph file to be loaded or rebuilt.
-        :param node_file: Path to the node file to be loaded or rebuilt.
-        :param edge_file: Path to the edge file to be loaded or rebuilt.
-        :param filepath: Path to the directory where files are processed and saved.
-
-        :return: None
-        """
+        self.nodes: Optional[gpd.GeoDataFrame] = None
+        self.edges: Optional[gpd.GeoDataFrame] = None
+
+        self.rebuild: bool = kwargs.setdefault("rebuild", False)
+        self.verbose: bool = kwargs.setdefault("verbose", False)
+        self.place: str = kwargs.setdefault("place", "Costa Rica")
+        self.network_type: str = kwargs.setdefault("network_type", "all")
+        base_url: str = kwargs.setdefault("data_path", "osmnx_data/pbf_files")
+        prefix: str = kwargs.setdefault("files_prefix", "costa-rica-").rstrip("-") + "-"
+
+        # Allow passing an fsspec instance directly
+        fs = kwargs.get("fs")
+        if fs is not None:
+            self.fs = fs
+            self.base = base_url.rstrip("/")
+        else:
+            self.fs, self.base = url_to_fs(base_url)
+
+        self.fs.mkdirs(self.base, exist_ok=True)
+
+        self.graph_file = f"{self.base.rstrip('/')}/{prefix}graph.pkl"
+        self.node_file = f"{self.base.rstrip('/')}/{prefix}nodes.pkl"
+        self.edge_file = f"{self.base.rstrip('/')}/{prefix}edges.pkl"
+
         if self.verbose:
-            print("Loading data...")
+            print(f"[PBFHandler] base={self.base}")
+            print(f" graph={self.graph_file}")
+            print(f" nodes={self.node_file}")
+            print(f" edges={self.edge_file}")

-        files_to_check = [self.graph_file, self.node_file, self.edge_file]
+    # ---------- public API ----------
+    def load(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] load()")

         if self.rebuild:
-            for file in files_to_check:
-                if os.path.exists(file):
-                    os.remove(file)
-            if not os.path.exists(self.filepath):
-                os.makedirs(self.filepath, exist_ok=True)
-            # self.process_pbf()
-            # self.save_to_pickle()
-        if not all(os.path.exists(f) for f in files_to_check):
+            self._delete_artifacts()
+
+        if not self._artifacts_exist():
             self.process_pbf()
             self.save_to_pickle()
         else:
             self.load_from_pickle()

+    def process_pbf(self) -> None:
+        if self.verbose:
+            print(f"[PBFHandler] processing: {self.place}")
+        self.graph = ox.graph_from_place(self.place, network_type=self.network_type)
+        self.nodes, self.edges = ox.graph_to_gdfs(self.graph)
+
+    def save_to_pickle(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] saving via fsspec")
+        for path, obj in {
+            self.graph_file: self.graph,
+            self.node_file: self.nodes,
+            self.edge_file: self.edges,
+        }.items():
+            if obj is not None:
+                with self.fs.open(path, "wb") as f:
+                    pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+    def load_from_pickle(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] loading via fsspec")
+        self.graph = self._load_pickle(self.graph_file)
+        self.nodes = self._load_pickle(self.node_file)
+        self.edges = self._load_pickle(self.edge_file)
+
+    # ---------- helpers ----------
+    def _artifacts_exist(self) -> bool:
+        return all(self.fs.exists(p) for p in (self.graph_file, self.node_file, self.edge_file))
+
+    def _delete_artifacts(self) -> None:
         if self.verbose:
-            print("Data loaded successfully.")
-
-    def process_pbf(self):
-        """
-        Processes the Protocolbuffer Binary Format (PBF) data specified for a given place by
-        utilizing the OSMnx library to create a graph representation and extracts nodes and
-        edges into GeoDataFrames. The function provides verbose output if enabled.
-
-        :param self: Refers to the current instance of the class containing this method.
-
-        :param self.verbose: bool
-            A flag to control verbose output. If True, detailed processing status messages are
-            logged to the console.
-
-        :param self.place: str
-            The name or description of the geographic place for which PBF data is processed. It
-            is used to construct a graph representation of the place.
-
-        :param self.network_type: str
-            The type of network graph to be created, typically one of 'all', 'walk', 'drive',
-            etc., reflecting the type of paths or streets included in the graph.
-
-        :return: None
-            This function does not return a value, but updates class attributes ``graph``,
-            ``nodes``, and ``edges``.
-
-        :raises Exception:
-            Raises a general exception when there is an error in processing the PBF data. Error
-            details are printed when verbose output is enabled.
-        """
-        try:
-            if self.verbose:
-                print(f"Processing PBF for {self.place}...")
-
-            self.graph = ox.graph_from_place(self.place, network_type=self.network_type)
-            self.nodes, self.edges = ox.graph_to_gdfs(self.graph)
-
-            if self.verbose:
-                print("PBF processed successfully.")
-        except Exception as e:
-            print(f"Error processing PBF: {e}")
-            raise
-
-    def save_to_pickle(self):
-        """
-        Saves data, including graph, nodes, and edges, to pickle files. Each data object is
-        saved to its corresponding file if available. If verbose mode is enabled, prints
-        messages indicating the saving progress and success.
-
-        :param self:
-            Represents the instance of the class that contains attributes `graph_file`,
-            `graph`, `node_file`, `nodes`, `edge_file`, `edges`, and `verbose`. These
-            attributes determine the files to save to and the data to save.
-
-        :raises Exception:
-            Raises an exception if an error occurs during the saving process.
-
-        :return:
-            None
-        """
-        try:
-            if self.verbose:
-                print("Saving data to pickle files...")
-
-            data_to_save = {
-                self.graph_file: self.graph,
-                self.node_file: self.nodes,
-                self.edge_file: self.edges
-            }
-
-            for file, data in data_to_save.items():
-                if data is not None:
-                    with open(file, 'wb') as f:
-                        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
-
-            if self.verbose:
-                print("Data saved to pickle files successfully.")
-        except Exception as e:
-            print(f"Error saving to pickle: {e}")
-            raise
-
-    def load_from_pickle(self):
-        """
-        Loads data from pickle files specified by the attributes `graph_file`, `node_file`,
-        and `edge_file` and assigns them to the corresponding attributes `graph`,
-        `nodes`, and `edges`, respectively. Displays verbose messages during the load
-        process if the `verbose` attribute is set to True.
-
-        :raises Exception: If an error occurs during reading or deserialization of the
-            pickle files.
-        """
-        try:
-            if self.verbose:
-                print("Loading data from pickle files...")
-
-            files_to_load = {
-                self.graph_file: 'graph',
-                self.node_file: 'nodes',
-                self.edge_file: 'edges'
-            }
-
-            for file, attr in files_to_load.items():
-                with open(file, 'rb') as f:
-                    setattr(self, attr, pickle.load(f))
-
-            if self.verbose:
-                print("Data loaded from pickle files successfully.")
-        except Exception as e:
-            print(f"Error loading from pickle: {e}")
-            raise
-
-    def plot_graph(self):
-        """
-        Plots the loaded graph using the OSMnx library.
-
-        This method checks if a graph is loaded and, if available, plots it. Outputs
-        verbose messages during the process if verbosity is enabled.
-
-        :raises Exception: Raises if an error occurs during the plotting process.
-        :return: None
-        """
-        try:
-            if self.graph is not None:
-                if self.verbose:
-                    print("Plotting the graph...")
-                ox.plot_graph(self.graph)
-                if self.verbose:
-                    print("Graph plotted successfully.")
-            else:
-                print("Graph is not loaded. Please load a PBF file first.")
-        except Exception as e:
-            print(f"Error plotting the graph: {e}")
-            raise
+            print("[PBFHandler] deleting artifacts (rebuild=True)")
+        for p in (self.graph_file, self.node_file, self.edge_file):
+            if self.fs.exists(p):
+                try:
+                    self.fs.rm_file(p)
+                except Exception:
+                    self.fs.rm(p)
+
+    def _load_pickle(self, path: str):
+        with self.fs.open(path, "rb") as f:
+            return pickle.load(f)


 def get_bounding_box_from_points(gps_points, margin=0.001):
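PBFHandler now resolves its storage location through fsspec (url_to_fs) instead of writing pickles to a local directory with os/open, and the plot_graph helper is gone. A rough usage sketch under the new keyword names shown above; the local path is illustrative, and any fsspec-compatible URL should behave the same way:

    # Hedged sketch; module path taken from the file listing, argument names from the diff.
    from sibi_dst.osmnx_helper.utils import PBFHandler

    handler = PBFHandler(
        place="Costa Rica",
        network_type="drive",
        data_path="osmnx_data/pbf_files",  # fsspec URL or local path
        files_prefix="costa-rica-",
        rebuild=False,   # True deletes the cached pickles and rebuilds from OSMnx
        verbose=True,
    )
    handler.load()  # builds graph/nodes/edges on first run, then reuses the cached pickles
    graph, nodes, edges = handler.graph, handler.nodes, handler.edges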
{sibi_dst-2025.9.1 → sibi_dst-2025.9.2}/sibi_dst/utils/clickhouse_writer.py
@@ -10,6 +10,14 @@ import clickhouse_connect

 from . import ManagedResource

+def _to_bool(val: Any) -> bool:
+    if isinstance(val, bool):
+        return val
+    if isinstance(val, (int, float)):
+        return bool(val)
+    if isinstance(val, str):
+        return val.strip().lower() in ("1", "true", "yes", "on")
+    return False


 class ClickHouseWriter(ManagedResource):
     """
@@ -47,6 +55,11 @@ class ClickHouseWriter(ManagedResource):
         database: str = "sibi_data",
         user: str = "default",
         password: str = "",
+        secure: bool = False,
+        verify: bool = False,
+        ca_cert: str = "",
+        client_cert: str = "",
+        compression: str = "",
         table: str = "test_sibi_table",
         order_by: str = "id",
         engine: Optional[str] = None,  # e.g. "ENGINE MergeTree ORDER BY (`id`)"
@@ -61,6 +74,11 @@ class ClickHouseWriter(ManagedResource):
         self.database = database
         self.user = user
         self.password = password
+        self.secure = _to_bool(secure)
+        self.verify = _to_bool(verify)
+        self.ca_cert = ca_cert
+        self.client_cert = client_cert
+        self.compression = compression  # e.g. 'lz4', 'zstd',
         self.table = table
         self.order_by = order_by
         self.engine = engine  # if None → default MergeTree ORDER BY
@@ -224,6 +242,7 @@ class ClickHouseWriter(ManagedResource):
     # ------------- low-level helpers -------------

     def _get_client(self):
+        print(self.secure, " ", self.verify)
         cli = getattr(self._tlocal, "client", None)
         if cli is not None:
             return cli
@@ -233,6 +252,11 @@ class ClickHouseWriter(ManagedResource):
             database=self.database,
             username=self.user,  # clickhouse-connect uses 'username'
             password=self.password,
+            secure=self.secure,
+            verify=self.verify,
+            ca_cert=self.ca_cert or None,
+            client_cert=self.client_cert or None,
+            compression=self.compression or None,
         )
         self._tlocal.client = cli
         return cli
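ClickHouseWriter gains TLS and compression pass-through options (secure, verify, ca_cert, client_cert, compression) that are coerced with _to_bool and forwarded to clickhouse_connect.get_client. A hedged sketch of how they might be supplied; the host keyword and all connection values are placeholders, not defaults confirmed by this diff:

    # Hedged sketch: only database/user/password/table/order_by and the new TLS options are
    # visible in the diff; host is an assumed keyword, and the usage as a context manager
    # follows the removed DfHelper.save_to_clickhouse code shown earlier.
    with ClickHouseWriter(
        host="clickhouse.example.com",
        database="sibi_data",
        user="default",
        password="***",
        secure=True,          # accepts bools or strings like "true"/"1" via _to_bool
        verify=True,          # verify the server certificate
        ca_cert="/etc/ssl/certs/ca.pem",
        compression="lz4",    # or "zstd"; an empty string leaves compression unset
        table="events",
        order_by="id",
    ) as writer:
        writer.save_to_clickhouse(ddf)  # ddf: a Dask DataFrame produced elsewhere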
sibi_dst-2025.9.2/sibi_dst/utils/dask_utils.py (new file)
@@ -0,0 +1,61 @@
+import asyncio
+from typing import List, Any, Dict
+
+import dask
+import dask.dataframe as dd
+
+def _to_int_safe(x) -> int:
+    """
+    Convert scalar-like to int safely.
+    Handles numpy scalars, pandas Series/DataFrame outputs.
+    """
+    if hasattr(x, "item"):  # numpy scalar, pandas scalar
+        return int(x.item())
+    if hasattr(x, "iloc"):  # Series-like
+        return int(x.iloc[0])
+    return int(x)
+
+def dask_is_probably_empty(ddf: dd.DataFrame) -> bool:
+    return getattr(ddf, "npartitions", 0) == 0 or len(ddf._meta.columns) == 0
+
+
+def dask_is_empty_truthful(ddf: dd.DataFrame) -> bool:
+    n = ddf.map_partitions(len).sum().compute()
+    return int(n) == 0
+
+
+def dask_is_empty(ddf: dd.DataFrame, *, sample: int = 4) -> bool:
+    if dask_is_probably_empty(ddf):
+        return True
+
+    k = min(max(sample, 1), ddf.npartitions)
+    probes = dask.compute(*[
+        ddf.get_partition(i).map_partitions(len) for i in range(k)
+    ])
+
+    if any(_to_int_safe(n) > 0 for n in probes):
+        return False
+    if k == ddf.npartitions and all(_to_int_safe(n) == 0 for n in probes):
+        return True
+
+    return dask_is_empty_truthful(ddf)
+
+class UniqueValuesExtractor:
+    @staticmethod
+    def _compute_to_list_sync(series) -> List[Any]:
+        """Run in a worker thread when Dask-backed."""
+        if hasattr(series, "compute"):
+            return series.compute().tolist()
+        return series.tolist()
+
+    async def compute_to_list(self, series) -> List[Any]:
+        # Offload potential Dask .compute() to a thread to avoid blocking the event loop
+        return await asyncio.to_thread(self._compute_to_list_sync, series)
+
+    async def extract_unique_values(self, df, *columns: str) -> Dict[str, List[Any]]:
+        async def one(col: str):
+            ser = df[col].dropna().unique()
+            return col, await self.compute_to_list(ser)
+
+        pairs = await asyncio.gather(*(one(c) for c in columns))
+        return dict(pairs)
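The new dask_utils module adds cheap emptiness probes for Dask DataFrames (a metadata check, a partition-sampling check, and a full count as the fallback) plus an async helper for extracting unique column values without blocking the event loop. A small usage sketch; the module path is taken from the file listing, and the ordering of the returned unique values is not guaranteed:

    import asyncio

    import dask.dataframe as dd
    import pandas as pd

    from sibi_dst.utils.dask_utils import UniqueValuesExtractor, dask_is_empty

    pdf = pd.DataFrame({"status": ["a", "b", "a", None], "id": [1, 2, 3, 4]})
    ddf = dd.from_pandas(pdf, npartitions=2)

    print(dask_is_empty(ddf))  # False; probes a few partitions before falling back to a full count

    extractor = UniqueValuesExtractor()
    uniques = asyncio.run(extractor.extract_unique_values(ddf, "status", "id"))
    print(uniques)  # e.g. {'status': ['a', 'b'], 'id': [1, 2, 3, 4]}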