sibi-dst 2025.8.9.tar.gz → 2025.9.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/PKG-INFO +2 -1
  2. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/pyproject.toml +2 -1
  3. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_df_helper.py +27 -3
  4. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +2 -0
  5. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +1 -1
  6. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/utils.py +82 -214
  7. sibi_dst-2025.9.2/sibi_dst/utils/base.py +697 -0
  8. sibi_dst-2025.9.2/sibi_dst/utils/boilerplate/__init__.py +11 -0
  9. sibi_dst-2025.9.2/sibi_dst/utils/boilerplate/base_attacher.py +25 -0
  10. sibi_dst-2025.8.9/sibi_dst/utils/boilerplate/base_data_artifact.py → sibi_dst-2025.9.2/sibi_dst/utils/boilerplate/base_parquet_artifact.py +1 -1
  11. sibi_dst-2025.9.2/sibi_dst/utils/boilerplate/base_parquet_reader.py +21 -0
  12. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/clickhouse_writer.py +24 -0
  13. sibi_dst-2025.9.2/sibi_dst/utils/dask_utils.py +61 -0
  14. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/sse_runner.py +2 -0
  15. sibi_dst-2025.8.9/sibi_dst/utils/base.py +0 -252
  16. sibi_dst-2025.8.9/sibi_dst/utils/boilerplate/__init__.py +0 -6
  17. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/README.md +0 -0
  18. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/__init__.py +0 -0
  19. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/__init__.py +0 -0
  20. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
  21. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
  22. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  23. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  24. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/__init__.py +0 -0
  25. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  26. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  27. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  28. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  29. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  30. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
  31. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  32. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
  33. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  34. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/__init__.py +0 -0
  35. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_defaults.py +0 -0
  36. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  37. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_params_config.py +0 -0
  38. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_query_config.py +0 -0
  39. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/data_cleaner.py +0 -0
  40. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/__init__.py +0 -0
  41. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  42. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/utils.py +0 -0
  43. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/__init__.py +0 -0
  44. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  45. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  46. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  47. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
  48. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  49. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
  50. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/tests/__init__.py +0 -0
  51. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  52. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/__init__.py +0 -0
  53. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/async_utils.py +0 -0
  54. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
  55. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/business_days.py +0 -0
  56. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/credentials.py +0 -0
  57. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/data_from_http_source.py +0 -0
  58. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/data_utils.py +0 -0
  59. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/data_wrapper.py +0 -0
  60. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/date_utils.py +0 -0
  61. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/df_utils.py +0 -0
  62. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/file_age_checker.py +0 -0
  63. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/file_utils.py +0 -0
  64. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/filepath_generator.py +0 -0
  65. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/iceberg_saver.py +0 -0
  66. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/log_utils.py +0 -0
  67. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/manifest_manager.py +0 -0
  68. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/parquet_saver.py +0 -0
  69. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/periods.py +0 -0
  70. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/phone_formatter.py +0 -0
  71. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/__init__.py +0 -0
  72. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/jobs.py +0 -0
  73. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_config.py +0 -0
  74. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_hive.py +0 -0
  75. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_manager.py +0 -0
  76. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/update_planner.py +0 -0
  77. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/webdav_client.py +0 -0
  78. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/__init__.py +0 -0
  79. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/__init__.py +0 -0
  80. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  81. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  82. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  83. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  84. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  85. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  86. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  87. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  88. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  89. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  90. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  91. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  92. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  93. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  94. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  95. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  96. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/utils/__init__.py +0 -0
  97. {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/utils/log_utils.py +0 -0
--- sibi_dst-2025.8.9/PKG-INFO
+++ sibi_dst-2025.9.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.8.9
+Version: 2025.9.2
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -21,6 +21,7 @@ Requires-Dist: pyarrow (>=20.0.0,<21.0.0)
 Requires-Dist: pydantic (>=2.11.7,<3.0.0)
 Requires-Dist: pyiceberg[hive,s3fs] (>=0.9.1,<0.10.0)
 Requires-Dist: pymysql (>=1.1.1,<2.0.0)
+Requires-Dist: pyrosm (>=0.6.2,<0.7.0)
 Requires-Dist: s3fs (>=2025.5.1,<2026.0.0)
 Requires-Dist: sqlalchemy (>=2.0.41,<3.0.0)
 Requires-Dist: sse-starlette (>=3.0.2,<4.0.0)
--- sibi_dst-2025.8.9/pyproject.toml
+++ sibi_dst-2025.9.2/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "2025.8.9"
+version = "2025.9.2"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -25,6 +25,7 @@ opentelemetry-exporter-otlp = "^1.35.0"
 opentelemetry-sdk = "^1.35.0"
 pyiceberg = {extras = ["hive", "s3fs"], version = "^0.9.1"}
 sse-starlette = "^3.0.2"
+pyrosm = "^0.6.2"
 
 [tool.poetry.group.dev]
 optional = true
--- sibi_dst-2025.8.9/sibi_dst/df_helper/_df_helper.py
+++ sibi_dst-2025.9.2/sibi_dst/df_helper/_df_helper.py
@@ -137,6 +137,7 @@ class DfHelper(ManagedResource):
     def __init__(self, backend="sqlalchemy", **kwargs):
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
+        kwargs.setdefault("auto_sse", True)
         super().__init__(**kwargs)
         self.backend = backend
 
@@ -166,6 +167,18 @@ class DfHelper(ManagedResource):
         self.backend_strategy = strategy_cls(self)
 
     # ---------- ManagedResource hooks ----------
+    def get_sse(self):
+        return self._ensure_sse()
+
+    def _emit_bg(self, event: str, **data: Any) -> None:
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            # no running loop: run to completion
+            asyncio.run(self.emit(event, **data))
+        else:
+            loop.create_task(self.emit(event, **data))
+
     def _cleanup(self):
         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
         if not attr_name:
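The new `_emit_bg` helper is a fire-and-forget dispatcher that works from both sync and async call sites. A minimal, self-contained sketch of the same pattern, with a hypothetical `Emitter.emit` standing in for `ManagedResource.emit`:

```python
import asyncio

class Emitter:
    """Hypothetical stand-in for ManagedResource's async emit()."""

    async def emit(self, event: str, **data) -> None:
        print(f"emit: {event} {data}")

    def emit_bg(self, event: str, **data) -> None:
        # If a loop is already running, schedule the emit as a task and
        # return immediately; otherwise run it to completion inline.
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            asyncio.run(self.emit(event, **data))
        else:
            # Tasks created this way should normally be kept referenced
            # to avoid premature garbage collection.
            loop.create_task(self.emit(event, **data))

Emitter().emit_bg("load:start", src="sync caller")  # runs inline

async def main() -> None:
    Emitter().emit_bg("load:progress", src="async caller")  # scheduled
    await asyncio.sleep(0)  # yield so the scheduled task can run

asyncio.run(main())
```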
@@ -213,6 +226,7 @@ class DfHelper(ManagedResource):
         timeout: Optional[float] = None,
         **options
     ) -> Union[pd.DataFrame, dd.DataFrame]:
+        await self.emit(f"{self.__class__.__name__} load:start", message=f"Pulling data from {self.backend} backend")
         # 1) Async load if available, else run sync load in a thread.
         if hasattr(self.backend_strategy, "aload"):
             load_awaitable = self.backend_strategy.aload(**options)
@@ -224,18 +238,20 @@ class DfHelper(ManagedResource):
         self.total_records = total
 
         # 2) Post-processing steps are sync; offload to threads.
+        await self.emit(event=f"{self.__class__.__name__} load:progress", message=f"Post-processing {len(df)} records")
         df = await asyncio.to_thread(self._process_loaded_data, df)
         df = await asyncio.to_thread(self._post_process_df, df)
 
         # 3) Persist and compute can block; offload when needed.
         if persist and _is_dask_df(df):
             df = await asyncio.to_thread(df.persist)
-
         if as_pandas and _is_dask_df(df):
             # Allow separate timeout for compute if desired; reuse same timeout here.
             compute_awaitable = asyncio.to_thread(df.compute)
             return await (asyncio.wait_for(compute_awaitable, timeout) if timeout else compute_awaitable)
 
+        await self.emit(event=f"{self.__class__.__name__} load:progress", message=f"Returning {len(df)} records")
+
         return df
 
     # ---------- dataframe post-processing ----------
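The reworked `aload` now emits SSE events around loading and post-processing. A hedged usage sketch (assumes `DfHelper` is importable as shown and that backend configuration comes from caller kwargs, which this diff does not show):

```python
import asyncio
from sibi_dst.df_helper import DfHelper  # assumed export path

async def main() -> None:
    # Backend configuration kwargs are omitted; they are
    # deployment-specific and not part of this diff.
    helper = DfHelper(backend="sqlalchemy")
    # as_pandas=True computes the Dask frame; the same timeout bounds
    # both the load and the final compute.
    df = await helper.aload(as_pandas=True, timeout=300)
    print(f"loaded {len(df)} rows")

asyncio.run(main())
```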
@@ -307,6 +323,11 @@ class DfHelper(ManagedResource):
 
         self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.", extra=self.logger_extra)
 
+    async def asave_to_parquet(self, df: dd.DataFrame, **kwargs):
+        await self.emit(event=f"{self.__class__.__name__} save:start", message=f"Saving {len(df)} records to parquet")
+        await asyncio.to_thread(self.save_to_parquet, df, **kwargs)
+        await self.emit(event=f"{self.__class__.__name__} save:end", message=f"Saved {len(df)} records to parquet")
+
     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
         if not self._has_any_rows(df):
             self.logger.warning("Skipping save to ClickHouse: The provided DataFrame is empty.", extra=self.logger_extra)
@@ -315,6 +336,11 @@ class DfHelper(ManagedResource):
         writer.save_to_clickhouse(df)
         self.logger.debug("Save to ClickHouse completed.", extra=self.logger_extra)
 
+    async def asave_to_clickhouse(self, df: dd.DataFrame, **credentials):
+        await self.emit(event=f"{self.__class__.__name__} save:start", message=f"Saving {len(df)} records to ClickHouse")
+        await asyncio.to_thread(self.save_to_clickhouse, df, **credentials)
+        await self.emit(event=f"{self.__class__.__name__} save:end", message=f"Saved {len(df)} records to ClickHouse")
+
     # ---------- period loaders ----------
     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
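Both async save methods follow the same wrapper shape: emit `save:start`, offload the blocking writer with `asyncio.to_thread`, emit `save:end`. A runnable sketch of that shape, with `print` standing in for `emit` and a toy writer in place of the Parquet/ClickHouse writers:

```python
import asyncio
import dask.dataframe as dd
import pandas as pd

def blocking_save(df: dd.DataFrame) -> None:
    # Toy stand-in for save_to_parquet / save_to_clickhouse.
    df.compute().to_csv("out.csv", index=False)

async def asave(df: dd.DataFrame) -> None:
    print(f"save:start ({len(df)} records)")    # emit(save:start)
    await asyncio.to_thread(blocking_save, df)  # keep the event loop free
    print("save:end")                           # emit(save:end)

ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=2)
asyncio.run(asave(ddf))
```

Note that `len(df)` on a Dask DataFrame triggers a row count, so the start event itself does real work on large frames.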
@@ -346,5 +372,3 @@ class DfHelper(ManagedResource):
             return bool(ddf.head(1, npartitions=-1).shape[0])
         except Exception:
             return False
-
-
--- sibi_dst-2025.8.9/sibi_dst/df_helper/backends/parquet/_parquet_options.py
+++ sibi_dst-2025.9.2/sibi_dst/df_helper/backends/parquet/_parquet_options.py
@@ -205,6 +205,8 @@ class ParquetConfig(BaseModel):
             filesystem=self.fs,
             filters=pq_filters,
             # Toggle based on file count; False is safer for many tiny files.
+            aggregate_files=True,
+            split_row_groups=True,
             gather_statistics=False,
             ignore_metadata_file=True,
         )
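For context, `aggregate_files=True` lets Dask combine many small files into a single partition, and `split_row_groups=True` maps partitions to row groups rather than whole files. A standalone sketch of the resulting read call (the dataset path is hypothetical; exact keyword support varies by Dask version):

```python
import dask.dataframe as dd

ddf = dd.read_parquet(
    "data/events/*.parquet",   # hypothetical dataset path
    engine="pyarrow",
    aggregate_files=True,      # pack small files together
    split_row_groups=True,     # partition by row group
    ignore_metadata_file=True,
)
print(ddf.npartitions)
```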
--- sibi_dst-2025.8.9/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py
+++ sibi_dst-2025.9.2/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py
@@ -30,7 +30,7 @@ class SqlAlchemyLoadFromDb(ManagedResource):
         self.engine = self.db_connection.engine
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000) if self.params_config else 1000)
+        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 10000) if self.params_config else 10000)
         self.total_records = -1
 
     def build_and_load(self) -> Tuple[int, dd.DataFrame]:
--- sibi_dst-2025.8.9/sibi_dst/osmnx_helper/utils.py
+++ sibi_dst-2025.9.2/sibi_dst/osmnx_helper/utils.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import math
 import os
 import pickle
@@ -9,236 +11,102 @@ import numpy as np
 import osmnx as ox
 from geopy.distance import geodesic
 
-
-#
-# options = {
-#     'ox_files_save_path': ox_files_save_path,
-#     'network_type': 'drive',
-#     'place': 'Costa Rica',
-#     'files_prefix': 'costa-rica-',
-# }
-# Usage example
-# handler = PBFHandler(**options)
-# handler.load()
-
+from typing import Optional
+from fsspec.core import url_to_fs
 
 class PBFHandler:
     """
-    Handles the creation, management, and visualization of graph data derived
-    from .pbf (Protocolbuffer Binary Format) files. This class enables the
-    loading, processing, saving, and reutilization of graph, node, and edge
-    data for geographical regions, supporting verbose mode for detailed outputs.
-
-    :ivar graph: The generated graph object representing the spatial network; can be None if not yet loaded or processed.
-    :type graph: Optional[NetworkX.Graph]
-    :ivar nodes: GeoDataFrame representing the nodes of the graph; can be None if not yet loaded or processed.
-    :type nodes: Optional[geopandas.GeoDataFrame]
-    :ivar edges: GeoDataFrame representing the edges of the graph; can be None if not yet loaded or processed.
-    :type edges: Optional[geopandas.GeoDataFrame]
-    :ivar rebuild: Indicates whether to rebuild the graph data, ignoring any existing cached files. Default is ``False``.
-    :type rebuild: bool
-    :ivar verbose: Enables verbose mode to provide detailed status messages during operations. Default is ``False``.
-    :type verbose: bool
-    :ivar place: The name of the geographical region to process with OpenStreetMap. Default is ``Costa Rica``.
-    :type place: str
-    :ivar filepath: The path to the directory where the graph, nodes, and edges pickle files are saved. Default is ``gis_data/``.
-    :type filepath: str
-    :ivar file_prefix: The prefix for the filenames of the saved graph, node, and edge pickle files. Default is ``costa-rica-``.
-    :type file_prefix: str
-    :ivar network_type: The type of network to extract from OpenStreetMap, such as "all" or other specific network types. Default is ``all``.
-    :type network_type: str
-    :ivar graph_file: Full path of the file to save or load the graph data as a pickle file.
-    :type graph_file: str
-    :ivar node_file: Full path of the file to save or load the graph's node data as a pickle file.
-    :type node_file: str
-    :ivar edge_file: Full path of the file to save or load the graph's edge data as a pickle file.
-    :type edge_file: str
+    Build/load OSMnx graph + nodes/edges; persist as pickle via fsspec.
     """
+
     def __init__(self, **kwargs):
         self.graph = None
-        self.nodes = None
-        self.edges = None
-        self.rebuild = kwargs.setdefault("rebuild", False)
-        self.verbose = kwargs.setdefault("verbose", False)
-        self.place = kwargs.setdefault('place', 'Costa Rica')
-        self.filepath = kwargs.setdefault('ox_files_save_path', "gis_data/")
-        self.file_prefix = kwargs.setdefault('file_prefix', 'costa-rica-')
-        self.network_type = kwargs.setdefault('network_type', 'all')
-        self.graph_file = f"{self.filepath}{self.file_prefix}graph.pkl"
-        self.node_file = f"{self.filepath}{self.file_prefix}nodes.pkl"
-        self.edge_file = f"{self.filepath}{self.file_prefix}edges.pkl"
-
-    def load(self):
-        """
-        Loads the required data files for processing. If the files do not exist or
-        if the `rebuild` flag is set to True, it will process and recreate the
-        necessary data from the source. Otherwise, it will load the data from
-        existing pickle files. This function ensures the target directory exists,
-        and processes files conditionally based on their presence.
-
-        :param verbose: Flag to control the verbosity of the function's output.
-        :param rebuild: Indicates whether the data should be rebuilt from the raw
-            source files.
-        :param graph_file: Path to the graph file to be loaded or rebuilt.
-        :param node_file: Path to the node file to be loaded or rebuilt.
-        :param edge_file: Path to the edge file to be loaded or rebuilt.
-        :param filepath: Path to the directory where files are processed and saved.
-
-        :return: None
-        """
+        self.nodes: Optional[gpd.GeoDataFrame] = None
+        self.edges: Optional[gpd.GeoDataFrame] = None
+
+        self.rebuild: bool = kwargs.setdefault("rebuild", False)
+        self.verbose: bool = kwargs.setdefault("verbose", False)
+        self.place: str = kwargs.setdefault("place", "Costa Rica")
+        self.network_type: str = kwargs.setdefault("network_type", "all")
+        base_url: str = kwargs.setdefault("data_path", "osmnx_data/pbf_files")
+        prefix: str = kwargs.setdefault("files_prefix", "costa-rica-").rstrip("-") + "-"
+
+        # Allow passing an fsspec instance directly
+        fs = kwargs.get("fs")
+        if fs is not None:
+            self.fs = fs
+            self.base = base_url.rstrip("/")
+        else:
+            self.fs, self.base = url_to_fs(base_url)
+
+        self.fs.mkdirs(self.base, exist_ok=True)
+
+        self.graph_file = f"{self.base.rstrip('/')}/{prefix}graph.pkl"
+        self.node_file = f"{self.base.rstrip('/')}/{prefix}nodes.pkl"
+        self.edge_file = f"{self.base.rstrip('/')}/{prefix}edges.pkl"
+
         if self.verbose:
-            print("Loading data...")
+            print(f"[PBFHandler] base={self.base}")
+            print(f"  graph={self.graph_file}")
+            print(f"  nodes={self.node_file}")
+            print(f"  edges={self.edge_file}")
 
-        files_to_check = [self.graph_file, self.node_file, self.edge_file]
+    # ---------- public API ----------
+    def load(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] load()")
 
         if self.rebuild:
-            for file in files_to_check:
-                if os.path.exists(file):
-                    os.remove(file)
-            if not os.path.exists(self.filepath):
-                os.makedirs(self.filepath, exist_ok=True)
-            # self.process_pbf()
-            # self.save_to_pickle()
-        if not all(os.path.exists(f) for f in files_to_check):
+            self._delete_artifacts()
+
+        if not self._artifacts_exist():
             self.process_pbf()
             self.save_to_pickle()
         else:
             self.load_from_pickle()
 
+    def process_pbf(self) -> None:
+        if self.verbose:
+            print(f"[PBFHandler] processing: {self.place}")
+        self.graph = ox.graph_from_place(self.place, network_type=self.network_type)
+        self.nodes, self.edges = ox.graph_to_gdfs(self.graph)
+
+    def save_to_pickle(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] saving via fsspec")
+        for path, obj in {
+            self.graph_file: self.graph,
+            self.node_file: self.nodes,
+            self.edge_file: self.edges,
+        }.items():
+            if obj is not None:
+                with self.fs.open(path, "wb") as f:
+                    pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+    def load_from_pickle(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] loading via fsspec")
+        self.graph = self._load_pickle(self.graph_file)
+        self.nodes = self._load_pickle(self.node_file)
+        self.edges = self._load_pickle(self.edge_file)
+
+    # ---------- helpers ----------
+    def _artifacts_exist(self) -> bool:
+        return all(self.fs.exists(p) for p in (self.graph_file, self.node_file, self.edge_file))
+
+    def _delete_artifacts(self) -> None:
         if self.verbose:
-            print("Data loaded successfully.")
-
-    def process_pbf(self):
-        """
-        Processes the Protocolbuffer Binary Format (PBF) data specified for a given place by
-        utilizing the OSMnx library to create a graph representation and extracts nodes and
-        edges into GeoDataFrames. The function provides verbose output if enabled.
-
-        :param self: Refers to the current instance of the class containing this method.
-
-        :param self.verbose: bool
-            A flag to control verbose output. If True, detailed processing status messages are
-            logged to the console.
-
-        :param self.place: str
-            The name or description of the geographic place for which PBF data is processed. It
-            is used to construct a graph representation of the place.
-
-        :param self.network_type: str
-            The type of network graph to be created, typically one of 'all', 'walk', 'drive',
-            etc., reflecting the type of paths or streets included in the graph.
-
-        :return: None
-            This function does not return a value, but updates class attributes ``graph``,
-            ``nodes``, and ``edges``.
-
-        :raises Exception:
-            Raises a general exception when there is an error in processing the PBF data. Error
-            details are printed when verbose output is enabled.
-        """
-        try:
-            if self.verbose:
-                print(f"Processing PBF for {self.place}...")
-
-            self.graph = ox.graph_from_place(self.place, network_type=self.network_type)
-            self.nodes, self.edges = ox.graph_to_gdfs(self.graph)
-
-            if self.verbose:
-                print("PBF processed successfully.")
-        except Exception as e:
-            print(f"Error processing PBF: {e}")
-            raise
-
-    def save_to_pickle(self):
-        """
-        Saves data, including graph, nodes, and edges, to pickle files. Each data object is
-        saved to its corresponding file if available. If verbose mode is enabled, prints
-        messages indicating the saving progress and success.
-
-        :param self:
-            Represents the instance of the class that contains attributes `graph_file`,
-            `graph`, `node_file`, `nodes`, `edge_file`, `edges`, and `verbose`. These
-            attributes determine the files to save to and the data to save.
-
-        :raises Exception:
-            Raises an exception if an error occurs during the saving process.
-
-        :return:
-            None
-        """
-        try:
-            if self.verbose:
-                print("Saving data to pickle files...")
-
-            data_to_save = {
-                self.graph_file: self.graph,
-                self.node_file: self.nodes,
-                self.edge_file: self.edges
-            }
-
-            for file, data in data_to_save.items():
-                if data is not None:
-                    with open(file, 'wb') as f:
-                        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
-
-            if self.verbose:
-                print("Data saved to pickle files successfully.")
-        except Exception as e:
-            print(f"Error saving to pickle: {e}")
-            raise
-
-    def load_from_pickle(self):
-        """
-        Loads data from pickle files specified by the attributes `graph_file`, `node_file`,
-        and `edge_file` and assigns them to the corresponding attributes `graph`,
-        `nodes`, and `edges`, respectively. Displays verbose messages during the load
-        process if the `verbose` attribute is set to True.
-
-        :raises Exception: If an error occurs during reading or deserialization of the
-            pickle files.
-        """
-        try:
-            if self.verbose:
-                print("Loading data from pickle files...")
-
-            files_to_load = {
-                self.graph_file: 'graph',
-                self.node_file: 'nodes',
-                self.edge_file: 'edges'
-            }
-
-            for file, attr in files_to_load.items():
-                with open(file, 'rb') as f:
-                    setattr(self, attr, pickle.load(f))
-
-            if self.verbose:
-                print("Data loaded from pickle files successfully.")
-        except Exception as e:
-            print(f"Error loading from pickle: {e}")
-            raise
-
-    def plot_graph(self):
-        """
-        Plots the loaded graph using the OSMnx library.
-
-        This method checks if a graph is loaded and, if available, plots it. Outputs
-        verbose messages during the process if verbosity is enabled.
-
-        :raises Exception: Raises if an error occurs during the plotting process.
-        :return: None
-        """
-        try:
-            if self.graph is not None:
-                if self.verbose:
-                    print("Plotting the graph...")
-                ox.plot_graph(self.graph)
-                if self.verbose:
-                    print("Graph plotted successfully.")
-            else:
-                print("Graph is not loaded. Please load a PBF file first.")
-        except Exception as e:
-            print(f"Error plotting the graph: {e}")
-            raise
+            print("[PBFHandler] deleting artifacts (rebuild=True)")
+        for p in (self.graph_file, self.node_file, self.edge_file):
+            if self.fs.exists(p):
+                try:
+                    self.fs.rm_file(p)
+                except Exception:
+                    self.fs.rm(p)
+
+    def _load_pickle(self, path: str):
+        with self.fs.open(path, "rb") as f:
+            return pickle.load(f)
 
 
 def get_bounding_box_from_points(gps_points, margin=0.001):
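The usage comment deleted at the top of this file still applies in spirit; with the fsspec rewrite it would look roughly like the sketch below (`data_path` accepts any fsspec URL, so an `s3://` prefix works as well as a local directory; the import path assumes the module is used directly):

```python
from sibi_dst.osmnx_helper.utils import PBFHandler

handler = PBFHandler(
    place="Costa Rica",
    network_type="drive",
    data_path="osmnx_data/pbf_files",  # or e.g. "s3://gis-cache/pbf"
    files_prefix="costa-rica-",
    rebuild=False,
    verbose=True,
)
handler.load()  # builds via OSMnx on first run, then reuses the pickles
graph, nodes, edges = handler.graph, handler.nodes, handler.edges
```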