sibi-dst 2025.8.9.tar.gz → 2025.9.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/PKG-INFO +2 -1
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/pyproject.toml +2 -1
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_df_helper.py +27 -3
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +2 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +1 -1
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/utils.py +82 -214
- sibi_dst-2025.9.2/sibi_dst/utils/base.py +697 -0
- sibi_dst-2025.9.2/sibi_dst/utils/boilerplate/__init__.py +11 -0
- sibi_dst-2025.9.2/sibi_dst/utils/boilerplate/base_attacher.py +25 -0
- sibi_dst-2025.8.9/sibi_dst/utils/boilerplate/base_data_artifact.py → sibi_dst-2025.9.2/sibi_dst/utils/boilerplate/base_parquet_artifact.py +1 -1
- sibi_dst-2025.9.2/sibi_dst/utils/boilerplate/base_parquet_reader.py +21 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/clickhouse_writer.py +24 -0
- sibi_dst-2025.9.2/sibi_dst/utils/dask_utils.py +61 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/sse_runner.py +2 -0
- sibi_dst-2025.8.9/sibi_dst/utils/base.py +0 -252
- sibi_dst-2025.8.9/sibi_dst/utils/boilerplate/__init__.py +0 -6
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/README.md +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/async_utils.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/business_days.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/data_wrapper.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/file_age_checker.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/iceberg_saver.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/periods.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/progress/jobs.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_hive.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/update_planner.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/v2/utils/log_utils.py +0 -0
{sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/PKG-INFO
RENAMED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.8.9
+Version: 2025.9.2
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -21,6 +21,7 @@ Requires-Dist: pyarrow (>=20.0.0,<21.0.0)
 Requires-Dist: pydantic (>=2.11.7,<3.0.0)
 Requires-Dist: pyiceberg[hive,s3fs] (>=0.9.1,<0.10.0)
 Requires-Dist: pymysql (>=1.1.1,<2.0.0)
+Requires-Dist: pyrosm (>=0.6.2,<0.7.0)
 Requires-Dist: s3fs (>=2025.5.1,<2026.0.0)
 Requires-Dist: sqlalchemy (>=2.0.41,<3.0.0)
 Requires-Dist: sse-starlette (>=3.0.2,<4.0.0)
```
{sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/pyproject.toml
RENAMED
```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "2025.8.9"
+version = "2025.9.2"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -25,6 +25,7 @@ opentelemetry-exporter-otlp = "^1.35.0"
 opentelemetry-sdk = "^1.35.0"
 pyiceberg = {extras = ["hive", "s3fs"], version = "^0.9.1"}
 sse-starlette = "^3.0.2"
+pyrosm = "^0.6.2"
 
 [tool.poetry.group.dev]
 optional = true
```
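The only dependency change is the addition of `pyrosm`, a library for reading OpenStreetMap .pbf extracts into GeoDataFrames. A minimal sketch of its core API, for orientation only (the .pbf filename is illustrative, not something this package ships):

```python
# Minimal pyrosm sketch; "costa-rica-latest.osm.pbf" is an illustrative
# local file, not part of sibi-dst.
from pyrosm import OSM

osm = OSM("costa-rica-latest.osm.pbf")
# Drivable street network as (nodes, edges) GeoDataFrames
nodes, edges = osm.get_network(nodes=True, network_type="driving")
print(len(nodes), len(edges))
```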
{sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/_df_helper.py
RENAMED
```diff
@@ -137,6 +137,7 @@ class DfHelper(ManagedResource):
     def __init__(self, backend="sqlalchemy", **kwargs):
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
+        kwargs.setdefault("auto_sse", True)
         super().__init__(**kwargs)
         self.backend = backend
 
```
```diff
@@ -166,6 +167,18 @@ class DfHelper(ManagedResource):
         self.backend_strategy = strategy_cls(self)
 
     # ---------- ManagedResource hooks ----------
+    def get_sse(self):
+        return self._ensure_sse()
+
+    def _emit_bg(self, event: str, **data: Any) -> None:
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            # no running loop: run to completion
+            asyncio.run(self.emit(event, **data))
+        else:
+            loop.create_task(self.emit(event, **data))
+
     def _cleanup(self):
         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
         if not attr_name:
```
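The new `_emit_bg` hook is the standard pattern for firing a coroutine from code that may or may not be running inside an event loop: schedule it as a task when a loop is running, otherwise block briefly with `asyncio.run`. A self-contained sketch of the same idea (the `emit` coroutine here is a stand-in, not sibi-dst's implementation):

```python
# Standalone sketch of the _emit_bg pattern shown above; `emit` is a
# stand-in coroutine, not sibi-dst's.
import asyncio

async def emit(event: str, **data) -> None:
    print(f"{event}: {data}")

def emit_bg(event: str, **data) -> None:
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        asyncio.run(emit(event, **data))       # sync caller: run to completion
    else:
        # async caller: fire-and-forget (keep a reference if the result matters)
        loop.create_task(emit(event, **data))

emit_bg("load:start", backend="sqlalchemy")    # works without a running loop
```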
```diff
@@ -213,6 +226,7 @@ class DfHelper(ManagedResource):
         timeout: Optional[float] = None,
         **options
     ) -> Union[pd.DataFrame, dd.DataFrame]:
+        await self.emit(f"{self.__class__.__name__} load:start", message=f"Pulling data from {self.backend} backend")
         # 1) Async load if available, else run sync load in a thread.
         if hasattr(self.backend_strategy, "aload"):
             load_awaitable = self.backend_strategy.aload(**options)
```
```diff
@@ -224,18 +238,20 @@ class DfHelper(ManagedResource):
         self.total_records = total
 
         # 2) Post-processing steps are sync; offload to threads.
+        await self.emit(event=f"{self.__class__.__name__} load:progress", message=f"Post-processing {len(df)} records")
         df = await asyncio.to_thread(self._process_loaded_data, df)
         df = await asyncio.to_thread(self._post_process_df, df)
 
         # 3) Persist and compute can block; offload when needed.
         if persist and _is_dask_df(df):
             df = await asyncio.to_thread(df.persist)
-
         if as_pandas and _is_dask_df(df):
             # Allow separate timeout for compute if desired; reuse same timeout here.
             compute_awaitable = asyncio.to_thread(df.compute)
             return await (asyncio.wait_for(compute_awaitable, timeout) if timeout else compute_awaitable)
 
+        await self.emit(event=f"{self.__class__.__name__} load:progress", message=f"Returning {len(df)} records")
+
         return df
 
     # ---------- dataframe post-processing ----------
```
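The pattern in step 3 above is worth isolating: blocking Dask calls (`persist`, `compute`) are pushed onto a worker thread with `asyncio.to_thread`, and the await is optionally bounded with `asyncio.wait_for`. A minimal sketch, with a sleep standing in for `df.compute`:

```python
# Minimal sketch of the offload-with-optional-timeout pattern above;
# blocking_compute() stands in for df.compute().
import asyncio
import time
from typing import Optional

def blocking_compute() -> int:
    time.sleep(0.1)
    return 42

async def compute_df(timeout: Optional[float] = None) -> int:
    aw = asyncio.to_thread(blocking_compute)
    return await (asyncio.wait_for(aw, timeout) if timeout else aw)

print(asyncio.run(compute_df(timeout=5.0)))  # -> 42
```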
```diff
@@ -307,6 +323,11 @@ class DfHelper(ManagedResource):
 
         self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.", extra=self.logger_extra)
 
+    async def asave_to_parquet(self, df: dd.DataFrame, **kwargs):
+        await self.emit(event=f"{self.__class__.__name__} save:start", message=f"Saving {len(df)} records to parquet")
+        await asyncio.to_thread(self.save_to_parquet, df, **kwargs)
+        await self.emit(event=f"{self.__class__.__name__} save:end", message=f"Saved {len(df)} records to parquet")
+
     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
         if not self._has_any_rows(df):
             self.logger.warning("Skipping save to ClickHouse: The provided DataFrame is empty.", extra=self.logger_extra)
```
```diff
@@ -315,6 +336,11 @@ class DfHelper(ManagedResource):
         writer.save_to_clickhouse(df)
         self.logger.debug("Save to ClickHouse completed.", extra=self.logger_extra)
 
+    async def asave_to_clickhouse(self, df: dd.DataFrame, **credentials):
+        await self.emit(event=f"{self.__class__.__name__} save:start", message=f"Saving {len(df)} records to ClickHouse")
+        await asyncio.to_thread(self.save_to_clickhouse, df, **credentials)
+        await self.emit(event=f"{self.__class__.__name__} save:end", message=f"Saved {len(df)} records to ClickHouse")
+
     # ---------- period loaders ----------
     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
```
```diff
@@ -346,5 +372,3 @@ class DfHelper(ManagedResource):
             return bool(ddf.head(1, npartitions=-1).shape[0])
         except Exception:
             return False
-
-
```
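Both new `asave_to_*` wrappers follow one convention: emit `save:start`, run the existing blocking method in a worker thread via `asyncio.to_thread`, then emit `save:end`. A hedged caller-side sketch; only the method names come from this diff, while the helper object and keyword arguments are illustrative:

```python
import asyncio

async def export(helper, ddf):
    # Method names are from this diff; the keyword arguments below are
    # illustrative placeholders, not documented sibi-dst parameters.
    await helper.asave_to_parquet(ddf, parquet_filename="events.parquet")
    await helper.asave_to_clickhouse(ddf, host="localhost", port=9000)

# asyncio.run(export(my_helper, my_ddf))
```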
{sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/parquet/_parquet_options.py
RENAMED
```diff
@@ -205,6 +205,8 @@ class ParquetConfig(BaseModel):
             filesystem=self.fs,
             filters=pq_filters,
             # Toggle based on file count; False is safer for many tiny files.
+            aggregate_files=True,
+            split_row_groups=True,
             gather_statistics=False,
             ignore_metadata_file=True,
         )
```
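For context, `aggregate_files` and `split_row_groups` are `dask.dataframe.read_parquet` options: the first lets Dask coalesce many small files into a single partition, the second lets it split large files at row-group boundaries (recent Dask releases may deprecate both in favor of `blocksize`-based partitioning). A minimal sketch with an illustrative path:

```python
# Minimal sketch of the two flags enabled above; "data/*.parquet" is an
# illustrative path, and behavior depends on the installed Dask version.
import dask.dataframe as dd

ddf = dd.read_parquet(
    "data/*.parquet",
    aggregate_files=True,    # pack many small files into fewer partitions
    split_row_groups=True,   # split big files at row-group boundaries
)
print(ddf.npartitions)
```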
{sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py
RENAMED
```diff
@@ -30,7 +30,7 @@ class SqlAlchemyLoadFromDb(ManagedResource):
         self.engine = self.db_connection.engine
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size",
+        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 10000) if self.params_config else 10000)
         self.total_records = -1
 
     def build_and_load(self) -> Tuple[int, dd.DataFrame]:
```
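The one-line fix closes a `None` dereference: the old line read `self.params_config.df_params` unconditionally, while the new one resolves `chunk_size` as explicit kwarg, then optional config value, then a hard default of 10000. The same fallback chain in isolation:

```python
# Standalone sketch of the guarded fallback chain used in the fix above;
# Params is a stand-in for sibi-dst's params-config object.
from dataclasses import dataclass, field
from typing import Optional

DEFAULT_CHUNK = 10000

@dataclass
class Params:
    df_params: dict = field(default_factory=dict)

def resolve_chunk_size(kwargs: dict, params: Optional[Params]) -> int:
    # explicit kwarg > optional config value > hard default; never touch
    # params.df_params when params is None (the bug being fixed)
    return kwargs.get(
        "chunk_size",
        params.df_params.get("chunk_size", DEFAULT_CHUNK) if params else DEFAULT_CHUNK,
    )

assert resolve_chunk_size({}, None) == 10000
assert resolve_chunk_size({"chunk_size": 500}, None) == 500
assert resolve_chunk_size({}, Params({"chunk_size": 2000})) == 2000
```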
{sibi_dst-2025.8.9 → sibi_dst-2025.9.2}/sibi_dst/osmnx_helper/utils.py
RENAMED
```diff
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import math
 import os
 import pickle
```
```diff
@@ -9,236 +11,102 @@ import numpy as np
 import osmnx as ox
 from geopy.distance import geodesic
 
-
-
-# options = {
-#     'ox_files_save_path': ox_files_save_path,
-#     'network_type': 'drive',
-#     'place': 'Costa Rica',
-#     'files_prefix': 'costa-rica-',
-# }
-# Usage example
-# handler = PBFHandler(**options)
-# handler.load()
-
+from typing import Optional
+from fsspec.core import url_to_fs
 
 class PBFHandler:
     """
-
-    from .pbf (Protocolbuffer Binary Format) files. This class enables the
-    loading, processing, saving, and reutilization of graph, node, and edge
-    data for geographical regions, supporting verbose mode for detailed outputs.
-
-    :ivar graph: The generated graph object representing the spatial network; can be None if not yet loaded or processed.
-    :type graph: Optional[NetworkX.Graph]
-    :ivar nodes: GeoDataFrame representing the nodes of the graph; can be None if not yet loaded or processed.
-    :type nodes: Optional[geopandas.GeoDataFrame]
-    :ivar edges: GeoDataFrame representing the edges of the graph; can be None if not yet loaded or processed.
-    :type edges: Optional[geopandas.GeoDataFrame]
-    :ivar rebuild: Indicates whether to rebuild the graph data, ignoring any existing cached files. Default is ``False``.
-    :type rebuild: bool
-    :ivar verbose: Enables verbose mode to provide detailed status messages during operations. Default is ``False``.
-    :type verbose: bool
-    :ivar place: The name of the geographical region to process with OpenStreetMap. Default is ``Costa Rica``.
-    :type place: str
-    :ivar filepath: The path to the directory where the graph, nodes, and edges pickle files are saved. Default is ``gis_data/``.
-    :type filepath: str
-    :ivar file_prefix: The prefix for the filenames of the saved graph, node, and edge pickle files. Default is ``costa-rica-``.
-    :type file_prefix: str
-    :ivar network_type: The type of network to extract from OpenStreetMap, such as "all" or other specific network types. Default is ``all``.
-    :type network_type: str
-    :ivar graph_file: Full path of the file to save or load the graph data as a pickle file.
-    :type graph_file: str
-    :ivar node_file: Full path of the file to save or load the graph's node data as a pickle file.
-    :type node_file: str
-    :ivar edge_file: Full path of the file to save or load the graph's edge data as a pickle file.
-    :type edge_file: str
+    Build/load OSMnx graph + nodes/edges; persist as pickle via fsspec.
     """
+
     def __init__(self, **kwargs):
         self.graph = None
-        self.nodes = None
-        self.edges = None
-
-        self.
-        self.
-        self.
-        self.
-        [… 17 lines illegible in this diff view …]
-        :param node_file: Path to the node file to be loaded or rebuilt.
-        :param edge_file: Path to the edge file to be loaded or rebuilt.
-        :param filepath: Path to the directory where files are processed and saved.
-
-        :return: None
-        """
+        self.nodes: Optional[gpd.GeoDataFrame] = None
+        self.edges: Optional[gpd.GeoDataFrame] = None
+
+        self.rebuild: bool = kwargs.setdefault("rebuild", False)
+        self.verbose: bool = kwargs.setdefault("verbose", False)
+        self.place: str = kwargs.setdefault("place", "Costa Rica")
+        self.network_type: str = kwargs.setdefault("network_type", "all")
+        base_url: str = kwargs.setdefault("data_path", "osmnx_data/pbf_files")
+        prefix: str = kwargs.setdefault("files_prefix", "costa-rica-").rstrip("-") + "-"
+
+        # Allow passing an fsspec instance directly
+        fs = kwargs.get("fs")
+        if fs is not None:
+            self.fs = fs
+            self.base = base_url.rstrip("/")
+        else:
+            self.fs, self.base = url_to_fs(base_url)
+
+        self.fs.mkdirs(self.base, exist_ok=True)
+
+        self.graph_file = f"{self.base.rstrip('/')}/{prefix}graph.pkl"
+        self.node_file = f"{self.base.rstrip('/')}/{prefix}nodes.pkl"
+        self.edge_file = f"{self.base.rstrip('/')}/{prefix}edges.pkl"
+
         if self.verbose:
-            print("
+            print(f"[PBFHandler] base={self.base}")
+            print(f"  graph={self.graph_file}")
+            print(f"  nodes={self.node_file}")
+            print(f"  edges={self.edge_file}")
 
-
+    # ---------- public API ----------
+    def load(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] load()")
 
         if self.rebuild:
-            [… 3 lines illegible in this diff view …]
-            if not os.path.exists(self.filepath):
-                os.makedirs(self.filepath, exist_ok=True)
-        # self.process_pbf()
-        # self.save_to_pickle()
-        if not all(os.path.exists(f) for f in files_to_check):
+            self._delete_artifacts()
+
+        if not self._artifacts_exist():
             self.process_pbf()
             self.save_to_pickle()
         else:
             self.load_from_pickle()
 
+    def process_pbf(self) -> None:
+        if self.verbose:
+            print(f"[PBFHandler] processing: {self.place}")
+        self.graph = ox.graph_from_place(self.place, network_type=self.network_type)
+        self.nodes, self.edges = ox.graph_to_gdfs(self.graph)
+
+    def save_to_pickle(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] saving via fsspec")
+        for path, obj in {
+            self.graph_file: self.graph,
+            self.node_file: self.nodes,
+            self.edge_file: self.edges,
+        }.items():
+            if obj is not None:
+                with self.fs.open(path, "wb") as f:
+                    pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+    def load_from_pickle(self) -> None:
+        if self.verbose:
+            print("[PBFHandler] loading via fsspec")
+        self.graph = self._load_pickle(self.graph_file)
+        self.nodes = self._load_pickle(self.node_file)
+        self.edges = self._load_pickle(self.edge_file)
+
+    # ---------- helpers ----------
+    def _artifacts_exist(self) -> bool:
+        return all(self.fs.exists(p) for p in (self.graph_file, self.node_file, self.edge_file))
+
+    def _delete_artifacts(self) -> None:
         if self.verbose:
-            print("
-            [… 10 lines illegible in this diff view …]
-            A flag to control verbose output. If True, detailed processing status messages are
-            logged to the console.
-
-        :param self.place: str
-            The name or description of the geographic place for which PBF data is processed. It
-            is used to construct a graph representation of the place.
-
-        :param self.network_type: str
-            The type of network graph to be created, typically one of 'all', 'walk', 'drive',
-            etc., reflecting the type of paths or streets included in the graph.
-
-        :return: None
-            This function does not return a value, but updates class attributes ``graph``,
-            ``nodes``, and ``edges``.
-
-        :raises Exception:
-            Raises a general exception when there is an error in processing the PBF data. Error
-            details are printed when verbose output is enabled.
-        """
-        try:
-            if self.verbose:
-                print(f"Processing PBF for {self.place}...")
-
-            self.graph = ox.graph_from_place(self.place, network_type=self.network_type)
-            self.nodes, self.edges = ox.graph_to_gdfs(self.graph)
-
-            if self.verbose:
-                print("PBF processed successfully.")
-        except Exception as e:
-            print(f"Error processing PBF: {e}")
-            raise
-
-    def save_to_pickle(self):
-        """
-        Saves data, including graph, nodes, and edges, to pickle files. Each data object is
-        saved to its corresponding file if available. If verbose mode is enabled, prints
-        messages indicating the saving progress and success.
-
-        :param self:
-            Represents the instance of the class that contains attributes `graph_file`,
-            `graph`, `node_file`, `nodes`, `edge_file`, `edges`, and `verbose`. These
-            attributes determine the files to save to and the data to save.
-
-        :raises Exception:
-            Raises an exception if an error occurs during the saving process.
-
-        :return:
-            None
-        """
-        try:
-            if self.verbose:
-                print("Saving data to pickle files...")
-
-            data_to_save = {
-                self.graph_file: self.graph,
-                self.node_file: self.nodes,
-                self.edge_file: self.edges
-            }
-
-            for file, data in data_to_save.items():
-                if data is not None:
-                    with open(file, 'wb') as f:
-                        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
-
-            if self.verbose:
-                print("Data saved to pickle files successfully.")
-        except Exception as e:
-            print(f"Error saving to pickle: {e}")
-            raise
-
-    def load_from_pickle(self):
-        """
-        Loads data from pickle files specified by the attributes `graph_file`, `node_file`,
-        and `edge_file` and assigns them to the corresponding attributes `graph`,
-        `nodes`, and `edges`, respectively. Displays verbose messages during the load
-        process if the `verbose` attribute is set to True.
-
-        :raises Exception: If an error occurs during reading or deserialization of the
-            pickle files.
-        """
-        try:
-            if self.verbose:
-                print("Loading data from pickle files...")
-
-            files_to_load = {
-                self.graph_file: 'graph',
-                self.node_file: 'nodes',
-                self.edge_file: 'edges'
-            }
-
-            for file, attr in files_to_load.items():
-                with open(file, 'rb') as f:
-                    setattr(self, attr, pickle.load(f))
-
-            if self.verbose:
-                print("Data loaded from pickle files successfully.")
-        except Exception as e:
-            print(f"Error loading from pickle: {e}")
-            raise
-
-    def plot_graph(self):
-        """
-        Plots the loaded graph using the OSMnx library.
-
-        This method checks if a graph is loaded and, if available, plots it. Outputs
-        verbose messages during the process if verbosity is enabled.
-
-        :raises Exception: Raises if an error occurs during the plotting process.
-        :return: None
-        """
-        try:
-            if self.graph is not None:
-                if self.verbose:
-                    print("Plotting the graph...")
-                ox.plot_graph(self.graph)
-                if self.verbose:
-                    print("Graph plotted successfully.")
-            else:
-                print("Graph is not loaded. Please load a PBF file first.")
-        except Exception as e:
-            print(f"Error plotting the graph: {e}")
-            raise
+            print("[PBFHandler] deleting artifacts (rebuild=True)")
+        for p in (self.graph_file, self.node_file, self.edge_file):
+            if self.fs.exists(p):
+                try:
+                    self.fs.rm_file(p)
+                except Exception:
+                    self.fs.rm(p)
+
+    def _load_pickle(self, path: str):
+        with self.fs.open(path, "rb") as f:
+            return pickle.load(f)
 
 
 def get_bounding_box_from_points(gps_points, margin=0.001):
```
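The rewrite swaps direct `os`/`open` calls for an fsspec filesystem resolved with `url_to_fs`, so the cached graph pickles can live on local disk or any fsspec-supported store. A usage sketch based only on the constructor kwargs visible in this diff (the `data_path` URL is illustrative; the default is `osmnx_data/pbf_files`):

```python
# Usage sketch for the rewritten PBFHandler; kwargs are those visible in
# the diff above, and "s3://my-bucket/osmnx" is an illustrative URL.
from sibi_dst.osmnx_helper.utils import PBFHandler

handler = PBFHandler(
    place="Costa Rica",
    network_type="drive",
    data_path="s3://my-bucket/osmnx",
    files_prefix="costa-rica-",
    rebuild=False,      # True deletes cached pickles and rebuilds from OSM
    verbose=True,
)
handler.load()          # builds via OSMnx on first run, loads pickles after
graph, nodes, edges = handler.graph, handler.nodes, handler.edges
```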
|