sibi-dst 2025.9.10.tar.gz → 2025.9.12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/PKG-INFO +26 -30
  2. sibi_dst-2025.9.12/pyproject.toml +58 -0
  3. sibi_dst-2025.9.12/setup.cfg +4 -0
  4. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/__init__.py +11 -6
  5. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/__init__.py +0 -1
  6. sibi_dst-2025.9.12/sibi_dst/df_helper/_artifact_updater_async.py +316 -0
  7. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -3
  8. sibi_dst-2025.9.12/sibi_dst/osmnx_helper/__init__.py +9 -0
  9. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/__init__.py +2 -1
  10. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_pipeline.py +1 -2
  11. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/business_days.py +19 -51
  12. sibi_dst-2025.9.12/sibi_dst/utils/dask_utils.py +184 -0
  13. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/data_wrapper.py +0 -11
  14. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/filepath_generator.py +1 -154
  15. sibi_dst-2025.9.12/sibi_dst.egg-info/PKG-INFO +59 -0
  16. sibi_dst-2025.9.12/sibi_dst.egg-info/SOURCES.txt +103 -0
  17. sibi_dst-2025.9.12/sibi_dst.egg-info/dependency_links.txt +1 -0
  18. sibi_dst-2025.9.12/sibi_dst.egg-info/requires.txt +22 -0
  19. sibi_dst-2025.9.12/sibi_dst.egg-info/top_level.txt +1 -0
  20. sibi_dst-2025.9.10/pyproject.toml +0 -65
  21. sibi_dst-2025.9.10/sibi_dst/df_helper/_artifact_updater_async.py +0 -292
  22. sibi_dst-2025.9.10/sibi_dst/df_helper/data_cleaner.py +0 -132
  23. sibi_dst-2025.9.10/sibi_dst/osmnx_helper/__init__.py +0 -7
  24. sibi_dst-2025.9.10/sibi_dst/utils/dask_utils.py +0 -61
  25. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/README.md +0 -0
  26. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
  27. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/_df_helper.py +0 -0
  28. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  29. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  30. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/__init__.py +0 -0
  31. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  32. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  33. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  34. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  35. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  36. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
  37. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  38. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  39. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
  40. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  41. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/core/__init__.py +0 -0
  42. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/core/_defaults.py +0 -0
  43. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  44. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/core/_params_config.py +0 -0
  45. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/df_helper/core/_query_config.py +0 -0
  46. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/geopy_helper/__init__.py +0 -0
  47. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  48. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/geopy_helper/utils.py +0 -0
  49. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  50. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  51. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  52. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
  53. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  54. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
  55. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/utils.py +0 -0
  56. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/tests/__init__.py +0 -0
  57. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/tests/test_baseclass.py +0 -0
  58. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  59. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/async_utils.py +0 -0
  60. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/base.py +0 -0
  61. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/__init__.py +0 -0
  62. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_attacher.py +0 -0
  63. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
  64. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_parquet_artifact.py +0 -0
  65. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_parquet_reader.py +0 -0
  66. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_pipeline_template.py +0 -0
  67. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/hybrid_data_loader.py +0 -0
  68. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/clickhouse_writer.py +0 -0
  69. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/credentials.py +0 -0
  70. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/data_from_http_source.py +0 -0
  71. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/data_utils.py +0 -0
  72. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/date_utils.py +0 -0
  73. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/df_utils.py +0 -0
  74. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/file_age_checker.py +0 -0
  75. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/file_utils.py +0 -0
  76. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/iceberg_saver.py +0 -0
  77. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/log_utils.py +0 -0
  78. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/manifest_manager.py +0 -0
  79. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/parquet_saver.py +0 -0
  80. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/periods.py +0 -0
  81. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/phone_formatter.py +0 -0
  82. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/progress/__init__.py +0 -0
  83. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/progress/jobs.py +0 -0
  84. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/progress/sse_runner.py +0 -0
  85. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/storage_config.py +0 -0
  86. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/storage_hive.py +0 -0
  87. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/storage_manager.py +0 -0
  88. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/update_planner.py +0 -0
  89. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/webdav_client.py +0 -0
  90. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/utils/write_gatekeeper.py +0 -0
  91. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/__init__.py +0 -0
  92. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/__init__.py +0 -0
  93. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  94. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  95. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  96. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  97. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  98. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  99. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  100. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  101. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  102. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  103. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  104. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  105. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  106. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  107. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  108. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  109. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/utils/__init__.py +0 -0
  110. {sibi_dst-2025.9.10 → sibi_dst-2025.9.12}/sibi_dst/v2/utils/log_utils.py +0 -0
@@ -1,34 +1,31 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: sibi-dst
- Version: 2025.9.10
- Summary: Data Science Toolkit
- Author: Luis Valverde
- Author-email: lvalverdeb@gmail.com
- Requires-Python: >=3.11,<4.0
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Requires-Dist: clickhouse-connect (>=0.8.18,<0.9.0)
- Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
- Requires-Dist: dask[complete] (>=2025.9.0,<2026.0.0)
- Requires-Dist: distributed (>=2025.9.1,<2026.0.0)
- Requires-Dist: mysqlclient (>=2.2.7,<3.0.0)
- Requires-Dist: opentelemetry-exporter-otlp (>=1.35.0,<2.0.0)
- Requires-Dist: opentelemetry-sdk (>=1.35.0,<2.0.0)
- Requires-Dist: pandas (>=2.3.1,<3.0.0)
- Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
- Requires-Dist: pyarrow (>=20.0.0,<21.0.0)
- Requires-Dist: pydantic (>=2.11.7,<3.0.0)
- Requires-Dist: pyiceberg[hive,s3fs] (>=0.9.1,<0.10.0)
- Requires-Dist: pymysql (>=1.1.1,<2.0.0)
- Requires-Dist: pyrosm (>=0.6.2,<0.7.0)
- Requires-Dist: s3fs (>=2025.5.1,<2026.0.0)
- Requires-Dist: sqlalchemy (>=2.0.41,<3.0.0)
- Requires-Dist: sse-starlette (>=3.0.2,<4.0.0)
- Requires-Dist: tqdm (>=4.67.1,<5.0.0)
- Requires-Dist: webdav4 (>=0.10.0,<0.11.0)
+ Version: 2025.9.12
+ Summary: A data science toolkit for scalable data processing and analysis.
+ Requires-Python: >=3.11
  Description-Content-Type: text/markdown
+ Requires-Dist: clickhouse-connect>=0.9.2
+ Requires-Dist: clickhouse-driver>=0.2.9
+ Requires-Dist: dask>=2025.9.1
+ Requires-Dist: distributed>=2025.9.1
+ Requires-Dist: fastapi>=0.118.0
+ Requires-Dist: folium>=0.20.0
+ Requires-Dist: mysqlclient>=2.2.7
+ Requires-Dist: opentelemetry-api>=1.37.0
+ Requires-Dist: opentelemetry-exporter-otlp>=1.37.0
+ Requires-Dist: opentelemetry-sdk>=1.37.0
+ Requires-Dist: pandas>=2.3.3
+ Requires-Dist: psycopg2>=2.9.10
+ Requires-Dist: pyarrow>=21.0.0
+ Requires-Dist: pydantic>=2.11.10
+ Requires-Dist: pymysql>=1.1.2
+ Requires-Dist: redis>=6.4.0
+ Requires-Dist: s3fs>=2025.9.0
+ Requires-Dist: sqlalchemy>=2.0.43
+ Requires-Dist: tqdm>=4.67.1
+ Requires-Dist: uvicorn>=0.37.0
+ Requires-Dist: webdav4>=0.10.0
+ Requires-Dist: wheel>=0.45.1

  ### SIBI-DST

@@ -60,4 +57,3 @@ pip install sibi-dst[dev,test,geospatial] # Install all optional dependencies


  ```
-
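
The metadata switch above (Metadata-Version 2.1 → 2.4, Poetry-style pinned ranges replaced by minimum-only constraints, Author fields dropped) can be verified against an installed copy. A minimal sketch using only the standard library, assuming sibi-dst is installed in the current environment:

```python
# Inspect the installed distribution's metadata with importlib.metadata.
from importlib.metadata import metadata, requires

md = metadata("sibi-dst")
print(md["Metadata-Version"])   # 2.4 for builds from the new packaging setup
print(md["Version"])            # 2025.9.12
print(md["Requires-Python"])    # >=3.11

# The Requires-Dist entries listed in the hunk above.
for req in requires("sibi-dst") or []:
    print(req)
```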
@@ -0,0 +1,58 @@
+ [project]
+ name = "sibi-dst"
+ version = "2025.9.12"
+ description = "A data science toolkit for scalable data processing and analysis."
+ readme = "README.md"
+ requires-python = ">=3.11"
+ dependencies = [
+     "clickhouse-connect>=0.9.2",
+     "clickhouse-driver>=0.2.9",
+     "dask>=2025.9.1",
+     "distributed>=2025.9.1",
+     "fastapi>=0.118.0",
+     "folium>=0.20.0",
+     "mysqlclient>=2.2.7",
+     "opentelemetry-api>=1.37.0",
+     "opentelemetry-exporter-otlp>=1.37.0",
+     "opentelemetry-sdk>=1.37.0",
+     "pandas>=2.3.3",
+     "psycopg2>=2.9.10",
+     "pyarrow>=21.0.0",
+     "pydantic>=2.11.10",
+     "pymysql>=1.1.2",
+     "redis>=6.4.0",
+     "s3fs>=2025.9.0",
+     "sqlalchemy>=2.0.43",
+     "tqdm>=4.67.1",
+     "uvicorn>=0.37.0",
+     "webdav4>=0.10.0",
+     "wheel>=0.45.1",
+ ]
+
+
+ [dependency-groups]
+ dev = [
+     "black>=25.9.0",
+     "bokeh>=3.8.0",
+     "graphviz>=0.21",
+     "jupyter>=1.1.1",
+     "pytest>=8.4.2",
+     "python-dotenv>=1.1.1",
+     "wheel>=0.45.1",
+ ]
+ geospatial = [
+     "folium>=0.20.0",
+     "geopandas>=1.1.1",
+     "geopy>=2.4.1",
+     "networkx>=3.5",
+     "osmnx>=2.0.6",
+     "scikit-learn>=1.7.2",
+ ]
+
+ [build-system]
+ requires = ["setuptools>=65", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [tool.setuptools.packages.find]
+ where = ["."]
+ include = ["sibi_dst*"]
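
The project thereby moves from Poetry metadata to a PEP 621 `[project]` table on the setuptools backend, with optional features expressed as PEP 735 `[dependency-groups]` (development-time groups, not pip extras, so the README's `pip install sibi-dst[dev,test,geospatial]` line appears not to match the new layout). A minimal sketch that reads the new file with the standard library's `tomllib` (Python 3.11+, the same floor as `requires-python`):

```python
# Parse the pyproject.toml shown above; tomllib requires binary mode.
import tomllib

with open("pyproject.toml", "rb") as f:
    cfg = tomllib.load(f)

project = cfg["project"]
print(project["name"], project["version"])        # sibi-dst 2025.9.12
print(len(project["dependencies"]), "runtime dependencies")
print(sorted(cfg.get("dependency-groups", {})))   # ['dev', 'geospatial']
```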
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -10,12 +10,17 @@ try:
  except version_reader.PackageNotFoundError:
      __version__ = "unknown"

- __all__ = [
-     "__version__",
- ]
-
- import sibi_dst.df_helper as df_helper
+ from sibi_dst.df_helper import *
  from sibi_dst.osmnx_helper import *
  from sibi_dst.geopy_helper import *
- from sibi_dst.utils import *
+ from sibi_dst import utils as sibiutils

+
+ __all__ = [
+     "__version__",
+     "DfHelper",
+     "ParquetArtifact",
+     "ParquetReader",
+     "ArtifactUpdaterMultiWrapperAsync",
+     "sibiutils"
+ ]
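
With the reworked exports, the main helpers are importable from the package root, and the utils module is re-exported under the `sibiutils` alias. A quick sketch of the resulting surface, assuming 2025.9.12 is installed:

```python
# Top-level names exposed by the new sibi_dst/__init__.py.
import sibi_dst

print(sibi_dst.__version__)  # "2025.9.12", or "unknown" outside an installed dist

from sibi_dst import DfHelper, ParquetArtifact, ParquetReader
from sibi_dst import ArtifactUpdaterMultiWrapperAsync
from sibi_dst import sibiutils  # the utils module, aliased as above
```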
@@ -3,7 +3,6 @@ from __future__ import annotations
  from ._df_helper import DfHelper
  from ._parquet_artifact import ParquetArtifact
  from ._parquet_reader import ParquetReader
- #from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded, ArtifactUpdaterMultiWrapperAsync
  from ._artifact_updater_async import ArtifactUpdaterMultiWrapperAsync
  from ._artifact_updater_threaded import ArtifactUpdaterMultiWrapperThreaded

@@ -0,0 +1,316 @@
+ from __future__ import annotations
+
+ import asyncio
+ import datetime
+ import random
+ import time
+ import pickle
+ from contextlib import ExitStack, suppress
+ from dataclasses import dataclass
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Type
+
+ from sibi_dst.utils import ManagedResource, Logger
+ from sibi_dst.utils.dask_utils import DaskClientMixin
+
+
+ @dataclass(slots=True)
+ class _RetryCfg:
+     """Retry and backoff configuration."""
+     attempts: int = 3
+     backoff_base: float = 2.0
+     backoff_max: float = 60.0
+     jitter: float = 0.15
+
+
+ def run_artifact_update(
+     cls: Type,
+     artifact_class_kwargs: Dict[str, Any],
+     retry: _RetryCfg,
+     period: str,
+     artifact_kwargs: Dict[str, Any],
+ ) -> Dict[str, Any]:
+     """
+     Executed inside a Dask worker.
+     Instantiates the artifact and runs update_parquet() with retry logic.
+     Reconstructs the logger and filesystem if not provided (safe under worker isolation).
+     """
+     import logging
+     import fsspec
+     from sibi_dst.utils import Logger
+
+     # ---- Reinitialize a lightweight logger for the worker
+     worker_logger = (
+         Logger.default_logger(logger_name=cls.__name__)
+         if hasattr(Logger, "default_logger")
+         else logging.getLogger(cls.__name__)
+     )
+     worker_logger.set_level(logging.INFO)
+
+     # ---- Ensure fs is recreated if missing
+     fs = artifact_class_kwargs.get("fs")
+     if fs is None or isinstance(fs, str):
+         try:
+             fs_protocol = fs if isinstance(fs, str) else "file"
+             fs = fsspec.filesystem(fs_protocol)
+         except Exception:
+             fs = fsspec.filesystem("file")
+
+     # ---- Merge reconstructed environment into kwargs
+     artifact_kwargs_final = {
+         **artifact_class_kwargs,
+         "logger": worker_logger,
+         "fs": fs,
+     }
+
+     start_time = datetime.datetime.now()
+     success, error_msg, attempts = False, None, 0
+
+     for attempt in range(1, retry.attempts + 1):
+         attempts = attempt
+         try:
+             with ExitStack() as stack:
+                 inst = cls(**artifact_kwargs_final)
+                 inst = stack.enter_context(inst)
+                 inst.update_parquet(period=period, **artifact_kwargs)
+             success = True
+             break
+         except Exception as e:
+             error_msg = str(e)
+             if attempt < retry.attempts:
+                 delay = min(retry.backoff_base ** (attempt - 1), retry.backoff_max)
+                 delay *= 1 + random.uniform(0, retry.jitter)
+                 time.sleep(delay)
+
+     duration = (datetime.datetime.now() - start_time).total_seconds()
+     status = "😀" if success else "😩"
+     worker_logger.info(
+         f"{status} {cls.__name__} [{period}] finished in {duration:.2f}s ({attempts} attempt(s))"
+     )
+
+     return {
+         "artifact": cls.__name__,
+         "period": period,
+         "success": success,
+         "error": error_msg,
+         "attempts": attempts,
+         "duration_seconds": duration,
+         "started_at": start_time.isoformat(),
+         "ended_at": datetime.datetime.now().isoformat(),
+     }
+
+
+ # ---------------- Async Orchestrator ----------------
+ class ArtifactUpdaterMultiWrapperAsync(DaskClientMixin, ManagedResource):
+     """
+     Async orchestrator for concurrent artifact updates.
+
+     • Uses a Dask client (via DaskClientMixin) or local threads.
+     • Automatically sanitizes non-picklable arguments (e.g., loggers, fs).
+     • Provides structured retries, async orchestration, and safe cleanup.
+     """
+
+     def __init__(
+         self,
+         wrapped_classes: Dict[str, Sequence[Type]],
+         *,
+         logger: Logger,
+         fs,
+         max_workers: int = 3,
+         retry_attempts: int = 3,
+         update_timeout_seconds: int = 600,
+         backoff_base: float = 2.0,
+         backoff_max: float = 60.0,
+         backoff_jitter: float = 0.15,
+         priority_fn: Optional[Callable[[Type], int]] = None,
+         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+         use_dask: bool = True,
+         dask_client: Optional[Any] = None,
+         debug: bool = False,
+         verbose: bool = False,
+         **kwargs: Any,
+     ) -> None:
+         super().__init__(logger=logger, fs=fs, debug=debug, verbose=verbose)
+
+         # ---- Client lifecycle management
+         self.own_dask_client = dask_client is None
+         self._init_dask_client(dask_client, logger=logger)
+         self.use_dask = use_dask
+
+         # ---- Core configuration
+         self.wrapped_classes = wrapped_classes
+         self.max_workers = max_workers
+         self.priority_fn = priority_fn
+         self.update_timeout_seconds = update_timeout_seconds
+
+         # ---- Retry configuration
+         self._retry = _RetryCfg(
+             attempts=retry_attempts,
+             backoff_base=backoff_base,
+             backoff_max=backoff_max,
+             jitter=backoff_jitter,
+         )
+
+         # ---- Artifact instantiation arguments
+         self.artifact_class_kwargs = {
+             "logger": logger,
+             "fs": fs,
+             "debug": debug,
+             "verbose": verbose,
+             **(artifact_class_kwargs or {}),
+         }
+
+         # ---- Runtime tracking
+         self.completion_secs: Dict[str, float] = {}
+         self.failed: List[str] = []
+         self._stop_event = asyncio.Event()
+
+         self.logger_extra = {"sibi_dst_component": self.__class__.__name__}
+
+         if self.use_dask:
+             self.logger.debug(f"Initialized with Dask client: {self.dask_client}")
+         else:
+             self.logger.debug("Running in local thread-based mode.")
+
+     async def update_data(self, period: str, **kwargs: Any) -> List[Dict[str, Any]]:
+         """Runs updates for all artifacts in a given period."""
+         self.completion_secs.clear()
+         self.failed.clear()
+         classes = self._classes_for(period)
+
+         self.logger.info(
+             f"Starting artifact updates for period '{period}' ({len(classes)} artifacts).",
+             extra=self.logger_extra,
+         )
+
+         try:
+             if self.use_dask:
+                 futures = [self._submit_one_dask(cls, period, kwargs) for cls in classes]
+                 results = await asyncio.to_thread(lambda: self.dask_client.gather(futures))
+             else:
+                 sem = asyncio.Semaphore(self.max_workers)
+                 tasks = [self._run_one_async(cls, period, sem, kwargs) for cls in classes]
+                 results = await asyncio.gather(*tasks)
+
+             self.logger.info(
+                 f"Completed {len(results)} artifact updates for period '{period}'.",
+                 extra=self.logger_extra,
+             )
+             return results
+
+         finally:
+             # Always clean up if we own the client
+             if getattr(self, "own_dask_client", False):
+                 self._close_dask_client()
+
+     def _sanitize_kwargs_for_dask(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Removes non-picklable runtime objects (e.g., loggers, fs) before sending to Dask.
+         """
+         clean: Dict[str, Any] = {}
+         for k, v in kwargs.items():
+             try:
+                 pickle.dumps(v)
+                 clean[k] = v
+             except Exception:
+                 self.logger.debug(f"Skipping non-picklable key '{k}' for Dask worker.")
+         return clean
+
+     def _submit_one_dask(self, cls: Type, period: str, artifact_kwargs: Dict[str, Any]):
+         """Submit one artifact job to Dask."""
+         safe_kwargs = self._sanitize_kwargs_for_dask(self.artifact_class_kwargs)
+         return self.dask_client.submit(
+             run_artifact_update,
+             cls,
+             safe_kwargs,
+             self._retry,
+             period,
+             artifact_kwargs,
+             pure=False,
+         )
+
+     def _classes_for(self, period: str) -> List[Type]:
+         """Selects artifact classes for the given period."""
+         try:
+             classes = list(self.wrapped_classes[period])
+         except KeyError:
+             raise ValueError(f"No artifacts configured for period '{period}'.")
+         if not classes:
+             raise ValueError(f"No artifact classes found for '{period}'.")
+
+         if self.priority_fn:
+             with suppress(Exception):
+                 classes.sort(key=self.priority_fn)
+         return classes
+
+     async def _run_one_async(
+         self,
+         cls: Type,
+         period: str,
+         sem: asyncio.Semaphore,
+         artifact_kwargs: Dict[str, Any],
+     ) -> Dict[str, Any]:
+         """Fallback local async execution (no Dask)."""
+         name = cls.__name__
+         start_time = datetime.datetime.now()
+
+         async with sem:
+             for attempt in range(1, self._retry.attempts + 1):
+                 try:
+                     def _sync_block():
+                         with ExitStack() as stack:
+                             inst = cls(**self.artifact_class_kwargs)
+                             inst = stack.enter_context(inst)
+                             inst.update_parquet(period=period, **artifact_kwargs)
+
+                     await asyncio.wait_for(
+                         asyncio.to_thread(_sync_block),
+                         timeout=self.update_timeout_seconds,
+                     )
+                     duration = (datetime.datetime.now() - start_time).total_seconds()
+                     self.completion_secs[name] = duration
+                     self.logger.info(f"✅ {name} completed in {duration:.2f}s")
+                     return {
+                         "artifact": name,
+                         "period": period,
+                         "success": True,
+                         "attempts": attempt,
+                         "duration_seconds": duration,
+                     }
+
+                 except Exception as e:
+                     if attempt < self._retry.attempts:
+                         delay = min(self._retry.backoff_base ** attempt, self._retry.backoff_max)
+                         delay *= 1 + random.uniform(0, self._retry.jitter)
+                         self.logger.warning(f"Retry {attempt}/{self._retry.attempts} for {name}: {e}")
+                         await asyncio.sleep(delay)
+                     else:
+                         duration = (datetime.datetime.now() - start_time).total_seconds()
+                         self.failed.append(name)
+                         self.logger.error(f"❌ {name} failed after {attempt} attempts: {e}")
+                         return {
+                             "artifact": name,
+                             "period": period,
+                             "success": False,
+                             "attempts": attempt,
+                             "error": str(e),
+                             "duration_seconds": duration,
+                         }
+
+     def get_update_status(self) -> Dict[str, Any]:
+         """Returns a summary of completed, failed, and pending artifacts."""
+         done = set(self.completion_secs)
+         fail = set(self.failed)
+         all_names = {cls.__name__ for v in self.wrapped_classes.values() for cls in v}
+         return {
+             "total": len(all_names),
+             "completed": sorted(done),
+             "failed": sorted(fail),
+             "pending": sorted(all_names - done - fail),
+             "completion_times": self.completion_secs,
+         }
+
+     def _cleanup(self) -> None:
+         """Ensures safe resource closure."""
+         with suppress(Exception):
+             if getattr(self, "own_dask_client", False):
+                 self._close_dask_client()
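
Taken together, the module expects artifact classes that work as context managers and expose update_parquet(); the orchestrator fans them out per period with retries. A usage sketch based on the signatures above — `MyDailyArtifact` is a hypothetical stand-in, and the local-thread path (`use_dask=False`) keeps the example self-contained:

```python
import asyncio

import fsspec

from sibi_dst.df_helper import ArtifactUpdaterMultiWrapperAsync
from sibi_dst.utils import Logger


class MyDailyArtifact:
    """Hypothetical artifact: a context manager exposing update_parquet()."""

    def __init__(self, **kwargs):
        pass

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        return False

    def update_parquet(self, period, **kwargs):
        print(f"updating artifact for period={period}")


logger = Logger.default_logger(logger_name="updater")  # same helper the worker uses above
fs = fsspec.filesystem("file")

wrapper = ArtifactUpdaterMultiWrapperAsync(
    wrapped_classes={"daily": [MyDailyArtifact]},
    logger=logger,
    fs=fs,
    use_dask=False,          # True would submit run_artifact_update to a Dask client
    max_workers=3,
    retry_attempts=3,
    update_timeout_seconds=600,
)

results = asyncio.run(wrapper.update_data("daily"))
print(wrapper.get_update_status())  # totals plus completed/failed/pending and timings
```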
@@ -231,7 +231,7 @@ class ParquetConfig(BaseModel):
          Builds a list of path patterns for dask.read_parquet.
          Respects partition_on + start/end date if given.
          """
-         print(f"_resolve_paths_for_read: {self.partition_on}")
+         self.logger.debug(f"_resolve_paths_for_read: {self.partition_on}")
          # Partitioned dataset by column
          if self.partition_on and self.parquet_start_date and self.parquet_end_date:
              if not isinstance(self.partition_on, (list, tuple)):
@@ -244,12 +244,10 @@ class ParquetConfig(BaseModel):
              days = pd.date_range(start=start, end=end, freq="D").date

              base = self.parquet_storage_path.rstrip("/")
-             print("base:",base)
              result = [
                  f"{base}/{parts[0]}={d.isoformat()}/*.parquet"
                  for d in days
              ]
-             print("result:",result)
              return result

          # Date-ranged folders (non-partitioned, using FilePathGenerator)
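
The surviving branch expands a partitioned read into one glob pattern per day between the start and end dates. A standalone sketch of the patterns it produces; the base path and partition column are illustrative stand-ins for ParquetConfig's actual values:

```python
# Recreate the per-day globs built by _resolve_paths_for_read above.
import pandas as pd

base = "s3://bucket/dataset"        # hypothetical parquet_storage_path
partition_col = "partition_date"    # hypothetical partition_on[0]
days = pd.date_range(start="2025-09-10", end="2025-09-12", freq="D").date

paths = [f"{base}/{partition_col}={d.isoformat()}/*.parquet" for d in days]
# ['s3://bucket/dataset/partition_date=2025-09-10/*.parquet',
#  's3://bucket/dataset/partition_date=2025-09-11/*.parquet',
#  's3://bucket/dataset/partition_date=2025-09-12/*.parquet']
```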
@@ -0,0 +1,9 @@
+ from .base_osm_map import BaseOsmMap
+ from .utils import PBFHandler
+ from .route_path_builder import RoutePathBuilder, RoutePathBuilderConfig
+ __all__ = [
+     "BaseOsmMap",
+     "RoutePathBuilder",
+     "RoutePathBuilderConfig",
+     "PBFHandler",
+ ]
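
The subpackage now spells out its public names; a quick import check (osmnx and friends come from the geospatial dependency group above):

```python
# Names exported by the new sibi_dst/osmnx_helper/__init__.py.
from sibi_dst.osmnx_helper import (
    BaseOsmMap,
    PBFHandler,
    RoutePathBuilder,
    RoutePathBuilderConfig,
)
```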
@@ -24,6 +24,7 @@ from .manifest_manager import MissingManifestManager
  __all__ = [
      "Logger",
      "ManagedResource",
+
      "ConfigManager",
      "ConfigLoader",
      "DateUtils",
@@ -42,5 +43,5 @@ __all__ = [
      "FsRegistry",
      "DataFromHttpSource",
      "WebDAVClient",
-     "MissingManifestManager"
+     "MissingManifestManager",
  ]
@@ -93,7 +93,7 @@ class BasePipeline(ManagedResource):
          df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
          df["partition_date"] = df[self.date_field].dt.date.astype(str)

-         out_path = self.storage_path.rstrip("/")+"/"+self._get_output_filename(fmt="parquet")
+         out_path = self.storage_path.rstrip("/")
          self.logger.info("Saving dataset to %s", out_path)
          ps = ParquetSaver(
              df_result=df,
@@ -111,7 +111,6 @@ class BasePipeline(ManagedResource):
              parquet_start_date=self.start_date,
              parquet_end_date=self.end_date,
              parquet_storage_path=self.storage_path,
-             parquet_filename=self._get_output_filename(),
              fs=self.fs,
              debug=self.debug,
              logger=self.logger,