sibi-dst 2025.9.11.tar.gz → 2025.9.12.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/PKG-INFO +26 -30
- sibi_dst-2025.9.12/pyproject.toml +58 -0
- sibi_dst-2025.9.12/setup.cfg +4 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/__init__.py +11 -6
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/__init__.py +0 -1
- sibi_dst-2025.9.12/sibi_dst/df_helper/_artifact_updater_async.py +316 -0
- sibi_dst-2025.9.12/sibi_dst/osmnx_helper/__init__.py +9 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/__init__.py +2 -1
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_pipeline.py +1 -2
- sibi_dst-2025.9.12/sibi_dst/utils/dask_utils.py +184 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/data_wrapper.py +0 -11
- sibi_dst-2025.9.12/sibi_dst.egg-info/PKG-INFO +59 -0
- sibi_dst-2025.9.12/sibi_dst.egg-info/SOURCES.txt +103 -0
- sibi_dst-2025.9.12/sibi_dst.egg-info/dependency_links.txt +1 -0
- sibi_dst-2025.9.12/sibi_dst.egg-info/requires.txt +22 -0
- sibi_dst-2025.9.12/sibi_dst.egg-info/top_level.txt +1 -0
- sibi_dst-2025.9.11/pyproject.toml +0 -65
- sibi_dst-2025.9.11/sibi_dst/df_helper/_artifact_updater_async.py +0 -292
- sibi_dst-2025.9.11/sibi_dst/df_helper/data_cleaner.py +0 -132
- sibi_dst-2025.9.11/sibi_dst/osmnx_helper/__init__.py +0 -7
- sibi_dst-2025.9.11/sibi_dst/utils/dask_utils.py +0 -61
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/README.md +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/tests/test_baseclass.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/async_utils.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/base.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_attacher.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_parquet_artifact.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_parquet_reader.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_pipeline_template.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/hybrid_data_loader.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/business_days.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/file_age_checker.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/iceberg_saver.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/periods.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/progress/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/progress/jobs.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/progress/sse_runner.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/storage_hive.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/update_planner.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/write_gatekeeper.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/v2/utils/log_utils.py +0 -0
{sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/PKG-INFO

````diff
@@ -1,34 +1,31 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: sibi-dst
-Version: 2025.9.11
-Summary:
-
-Author-email: lvalverdeb@gmail.com
-Requires-Python: >=3.11,<4.0
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Requires-Dist: clickhouse-connect (>=0.8.18,<0.9.0)
-Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
-Requires-Dist: dask[complete] (>=2025.9.0,<2026.0.0)
-Requires-Dist: distributed (>=2025.9.1,<2026.0.0)
-Requires-Dist: mysqlclient (>=2.2.7,<3.0.0)
-Requires-Dist: opentelemetry-exporter-otlp (>=1.35.0,<2.0.0)
-Requires-Dist: opentelemetry-sdk (>=1.35.0,<2.0.0)
-Requires-Dist: pandas (>=2.3.1,<3.0.0)
-Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
-Requires-Dist: pyarrow (>=20.0.0,<21.0.0)
-Requires-Dist: pydantic (>=2.11.7,<3.0.0)
-Requires-Dist: pyiceberg[hive,s3fs] (>=0.9.1,<0.10.0)
-Requires-Dist: pymysql (>=1.1.1,<2.0.0)
-Requires-Dist: pyrosm (>=0.6.2,<0.7.0)
-Requires-Dist: s3fs (>=2025.5.1,<2026.0.0)
-Requires-Dist: sqlalchemy (>=2.0.41,<3.0.0)
-Requires-Dist: sse-starlette (>=3.0.2,<4.0.0)
-Requires-Dist: tqdm (>=4.67.1,<5.0.0)
-Requires-Dist: webdav4 (>=0.10.0,<0.11.0)
+Version: 2025.9.12
+Summary: A data science toolkit for scalable data processing and analysis.
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
+Requires-Dist: clickhouse-connect>=0.9.2
+Requires-Dist: clickhouse-driver>=0.2.9
+Requires-Dist: dask>=2025.9.1
+Requires-Dist: distributed>=2025.9.1
+Requires-Dist: fastapi>=0.118.0
+Requires-Dist: folium>=0.20.0
+Requires-Dist: mysqlclient>=2.2.7
+Requires-Dist: opentelemetry-api>=1.37.0
+Requires-Dist: opentelemetry-exporter-otlp>=1.37.0
+Requires-Dist: opentelemetry-sdk>=1.37.0
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: psycopg2>=2.9.10
+Requires-Dist: pyarrow>=21.0.0
+Requires-Dist: pydantic>=2.11.10
+Requires-Dist: pymysql>=1.1.2
+Requires-Dist: redis>=6.4.0
+Requires-Dist: s3fs>=2025.9.0
+Requires-Dist: sqlalchemy>=2.0.43
+Requires-Dist: tqdm>=4.67.1
+Requires-Dist: uvicorn>=0.37.0
+Requires-Dist: webdav4>=0.10.0
+Requires-Dist: wheel>=0.45.1
 
 ### SIBI-DST
 
@@ -60,4 +57,3 @@ pip install sibi-dst[dev,test,geospatial] # Install all optional dependencies
 
 
 ```
-
````
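The old Poetry-generated metadata (Metadata-Version 2.1, parenthesized version ranges with upper bounds) gives way to setuptools-generated Metadata-Version 2.4 with bare `>=` minimums. To see what an installed copy actually declares, the standard library is enough; this snippet is illustrative and not part of the package:

```python
from importlib.metadata import metadata, requires

meta = metadata("sibi-dst")
print(meta["Metadata-Version"])  # expected: 2.4 for the 2025.9.12 build
print(meta["Requires-Python"])   # expected: >=3.11

# Each entry mirrors one Requires-Dist header shown in the diff above.
for req in requires("sibi-dst") or []:
    print(req)
```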
sibi_dst-2025.9.12/pyproject.toml

```diff
@@ -0,0 +1,58 @@
+[project]
+name = "sibi-dst"
+version = "2025.9.12"
+description = "A data science toolkit for scalable data processing and analysis."
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "clickhouse-connect>=0.9.2",
+    "clickhouse-driver>=0.2.9",
+    "dask>=2025.9.1",
+    "distributed>=2025.9.1",
+    "fastapi>=0.118.0",
+    "folium>=0.20.0",
+    "mysqlclient>=2.2.7",
+    "opentelemetry-api>=1.37.0",
+    "opentelemetry-exporter-otlp>=1.37.0",
+    "opentelemetry-sdk>=1.37.0",
+    "pandas>=2.3.3",
+    "psycopg2>=2.9.10",
+    "pyarrow>=21.0.0",
+    "pydantic>=2.11.10",
+    "pymysql>=1.1.2",
+    "redis>=6.4.0",
+    "s3fs>=2025.9.0",
+    "sqlalchemy>=2.0.43",
+    "tqdm>=4.67.1",
+    "uvicorn>=0.37.0",
+    "webdav4>=0.10.0",
+    "wheel>=0.45.1",
+]
+
+
+[dependency-groups]
+dev = [
+    "black>=25.9.0",
+    "bokeh>=3.8.0",
+    "graphviz>=0.21",
+    "jupyter>=1.1.1",
+    "pytest>=8.4.2",
+    "python-dotenv>=1.1.1",
+    "wheel>=0.45.1",
+]
+geospatial = [
+    "folium>=0.20.0",
+    "geopandas>=1.1.1",
+    "geopy>=2.4.1",
+    "networkx>=3.5",
+    "osmnx>=2.0.6",
+    "scikit-learn>=1.7.2",
+]
+
+[build-system]
+requires = ["setuptools>=65", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["sibi_dst*"]
```
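The replacement `pyproject.toml` is PEP 621 metadata with PEP 735 `[dependency-groups]` on a setuptools backend, replacing Poetry's tool-specific tables. Note that dependency groups are not extras, so the `pip install sibi-dst[dev,test,geospatial]` line still visible in the README context above no longer resolves them. Since the package requires Python ≥ 3.11, the stdlib `tomllib` can introspect the file; a minimal sketch, where the file path is an assumption about where the sdist was unpacked:

```python
import tomllib
from pathlib import Path

# Hypothetical location of the unpacked sdist.
text = Path("sibi_dst-2025.9.12/pyproject.toml").read_text()
data = tomllib.loads(text)

print(data["project"]["version"])            # 2025.9.12
print(len(data["project"]["dependencies"]))  # 22 runtime requirements

# PEP 735 groups (dev, geospatial) are opt-in; recent pip (25.1+)
# can install them with `pip install --group dev`, as can uv.
for group, deps in data.get("dependency-groups", {}).items():
    print(group, len(deps))
```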
{sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/__init__.py

```diff
@@ -10,12 +10,17 @@ try:
 except version_reader.PackageNotFoundError:
     __version__ = "unknown"
 
-__all__ = [
-    "__version__",
-]
-
-import sibi_dst.df_helper as df_helper
+from sibi_dst.df_helper import *
 from sibi_dst.osmnx_helper import *
 from sibi_dst.geopy_helper import *
-from sibi_dst
+from sibi_dst import utils as sibiutils
 
+
+__all__ = [
+    "__version__",
+    "DfHelper",
+    "ParquetArtifact",
+    "ParquetReader",
+    "ArtifactUpdaterMultiWrapperAsync",
+    "sibiutils"
+]
```
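The package root now re-exports its main entry points instead of only `__version__`. A minimal sketch of the resulting surface (attribute access only; constructor signatures are unchanged elsewhere in this diff):

```python
import sibi_dst

print(sibi_dst.__version__)

# Promoted to the top level by the new __all__:
print(sibi_dst.DfHelper)
print(sibi_dst.ParquetArtifact)
print(sibi_dst.ParquetReader)
print(sibi_dst.ArtifactUpdaterMultiWrapperAsync)

# utils is exposed under the new alias:
print(sibi_dst.sibiutils.Logger)
```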
{sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/df_helper/__init__.py

```diff
@@ -3,7 +3,6 @@ from __future__ import annotations
 from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
 from ._parquet_reader import ParquetReader
-#from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded, ArtifactUpdaterMultiWrapperAsync
 from ._artifact_updater_async import ArtifactUpdaterMultiWrapperAsync
 from ._artifact_updater_threaded import ArtifactUpdaterMultiWrapperThreaded
 
```
sibi_dst-2025.9.12/sibi_dst/df_helper/_artifact_updater_async.py

```diff
@@ -0,0 +1,316 @@
+from __future__ import annotations
+
+import asyncio
+import datetime
+import random
+import time
+import pickle
+from contextlib import ExitStack, suppress
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Sequence, Type
+
+from sibi_dst.utils import ManagedResource, Logger
+from sibi_dst.utils.dask_utils import DaskClientMixin
+
+
+@dataclass(slots=True)
+class _RetryCfg:
+    """Retry and backoff configuration."""
+    attempts: int = 3
+    backoff_base: float = 2.0
+    backoff_max: float = 60.0
+    jitter: float = 0.15
+
+
+def run_artifact_update(
+    cls: Type,
+    artifact_class_kwargs: Dict[str, Any],
+    retry: _RetryCfg,
+    period: str,
+    artifact_kwargs: Dict[str, Any],
+) -> Dict[str, Any]:
+    """
+    Executed inside Dask worker.
+    Instantiates artifact and runs update_parquet() with retry logic.
+    Reconstructs logger and filesystem if not provided (worker isolation safe).
+    """
+    import logging
+    import fsspec
+    from sibi_dst.utils import Logger
+
+    # ---- Reinitialize a lightweight logger for the worker
+    worker_logger = Logger.default_logger(logger_name=cls.__name__) if hasattr(Logger, "default_logger") else logging.getLogger(cls.__name__)
+    worker_logger.set_level(logging.INFO)
+
+    # ---- Ensure fs is recreated if missing
+    fs = artifact_class_kwargs.get("fs")
+    if fs is None or isinstance(fs, str):
+        try:
+            fs_protocol = fs if isinstance(fs, str) else "file"
+            fs = fsspec.filesystem(fs_protocol)
+        except Exception:
+            fs = fsspec.filesystem("file")
+
+    # ---- Merge reconstructed environment into kwargs
+    artifact_kwargs_final = {
+        **artifact_class_kwargs,
+        "logger": worker_logger,
+        "fs": fs,
+    }
+
+    start_time = datetime.datetime.now()
+    success, error_msg, attempts = False, None, 0
+
+    for attempt in range(1, retry.attempts + 1):
+        attempts = attempt
+        try:
+            with ExitStack() as stack:
+                inst = cls(**artifact_kwargs_final)
+                inst = stack.enter_context(inst)
+                inst.update_parquet(period=period, **artifact_kwargs)
+            success = True
+            break
+        except Exception as e:
+            error_msg = str(e)
+            if attempt < retry.attempts:
+                delay = min(retry.backoff_base ** (attempt - 1), retry.backoff_max)
+                delay *= 1 + random.uniform(0, retry.jitter)
+                time.sleep(delay)
+
+    duration = (datetime.datetime.now() - start_time).total_seconds()
+    status = "😀" if success else "😩"
+    worker_logger.info(
+        f"{status} {cls.__name__} [{period}] finished in {duration:.2f}s ({attempts} attempt(s))"
+    )
+
+    return {
+        "artifact": cls.__name__,
+        "period": period,
+        "success": success,
+        "error": error_msg,
+        "attempts": attempts,
+        "duration_seconds": duration,
+        "started_at": start_time.isoformat(),
+        "ended_at": datetime.datetime.now().isoformat(),
+    }
+
+
+# ---------------- Async Orchestrator ----------------
+class ArtifactUpdaterMultiWrapperAsync(DaskClientMixin, ManagedResource):
+    """
+    Async orchestrator for concurrent artifact updates.
+
+    • Uses Dask client (via DaskClientMixin) or local threads.
+    • Automatically sanitizes non-picklable arguments (e.g., loggers, fs).
+    • Provides structured retries, async orchestration, and safe cleanup.
+    """
+
+    def __init__(
+        self,
+        wrapped_classes: Dict[str, Sequence[Type]],
+        *,
+        logger: Logger,
+        fs,
+        max_workers: int = 3,
+        retry_attempts: int = 3,
+        update_timeout_seconds: int = 600,
+        backoff_base: float = 2.0,
+        backoff_max: float = 60.0,
+        backoff_jitter: float = 0.15,
+        priority_fn: Optional[Callable[[Type], int]] = None,
+        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+        use_dask: bool = True,
+        dask_client: Optional[Any] = None,
+        debug: bool = False,
+        verbose: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(logger=logger, fs=fs, debug=debug, verbose=verbose)
+
+        # ---- Client lifecycle management
+
+        self.own_dask_client = dask_client is None
+        self._init_dask_client(dask_client, logger=logger)
+        self.use_dask = use_dask
+
+        # ---- Core configuration
+        self.wrapped_classes = wrapped_classes
+        self.max_workers = max_workers
+        self.priority_fn = priority_fn
+        self.update_timeout_seconds = update_timeout_seconds
+
+        # ---- Retry configuration
+        self._retry = _RetryCfg(
+            attempts=retry_attempts,
+            backoff_base=backoff_base,
+            backoff_max=backoff_max,
+            jitter=backoff_jitter,
+        )
+
+        # ---- Artifact instantiation arguments
+        self.artifact_class_kwargs = {
+            "logger": logger,
+            "fs": fs,
+            "debug": debug,
+            "verbose": verbose,
+            **(artifact_class_kwargs or {}),
+        }
+
+        # ---- Runtime tracking
+        self.completion_secs: Dict[str, float] = {}
+        self.failed: List[str] = []
+        self._stop_event = asyncio.Event()
+
+        self.logger_extra = {"sibi_dst_component": self.__class__.__name__}
+
+        if self.use_dask:
+            self.logger.debug(f"Initialized with Dask client: {self.dask_client}")
+        else:
+            self.logger.debug(f"Running in local thread-based mode.")
+
+    async def update_data(self, period: str, **kwargs: Any) -> List[Dict[str, Any]]:
+        """Runs updates for all artifacts in a given period."""
+        self.completion_secs.clear()
+        self.failed.clear()
+        classes = self._classes_for(period)
+
+        self.logger.info(
+            f"Starting artifact updates for period '{period}' ({len(classes)} artifacts).",
+            extra=self.logger_extra,
+        )
+
+        try:
+            if self.use_dask:
+                futures = [self._submit_one_dask(cls, period, kwargs) for cls in classes]
+                results = await asyncio.to_thread(lambda: self.dask_client.gather(futures))
+            else:
+                sem = asyncio.Semaphore(self.max_workers)
+                tasks = [self._run_one_async(cls, period, sem, kwargs) for cls in classes]
+                results = await asyncio.gather(*tasks)
+
+            self.logger.info(
+                f"Completed {len(results)} artifact updates for period '{period}'.",
+                extra=self.logger_extra,
+            )
+            return results
+
+        finally:
+            # Always cleanup if we own the client
+            if getattr(self, "own_dask_client", False):
+                self._close_dask_client()
+
+
+    def _sanitize_kwargs_for_dask(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Removes non-picklable runtime objects (e.g., loggers, fs) before sending to Dask.
+        """
+        clean: Dict[str, Any] = {}
+        for k, v in kwargs.items():
+            try:
+                pickle.dumps(v)
+                clean[k] = v
+            except Exception:
+                self.logger.debug(f"Skipping non-picklable key '{k}' for Dask worker.")
+        return clean
+
+    def _submit_one_dask(self, cls: Type, period: str, artifact_kwargs: Dict[str, Any]):
+        """Submit one artifact job to Dask."""
+        safe_kwargs = self._sanitize_kwargs_for_dask(self.artifact_class_kwargs)
+        return self.dask_client.submit(
+            run_artifact_update,
+            cls,
+            safe_kwargs,
+            self._retry,
+            period,
+            artifact_kwargs,
+            pure=False,
+        )
+
+    def _classes_for(self, period: str) -> List[Type]:
+        """Selects artifact classes for the given period."""
+        try:
+            classes = list(self.wrapped_classes[period])
+        except KeyError:
+            raise ValueError(f"No artifacts configured for period '{period}'.")
+        if not classes:
+            raise ValueError(f"No artifact classes found for '{period}'.")
+
+        if self.priority_fn:
+            with suppress(Exception):
+                classes.sort(key=self.priority_fn)
+        return classes
+
+    async def _run_one_async(
+        self,
+        cls: Type,
+        period: str,
+        sem: asyncio.Semaphore,
+        artifact_kwargs: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """Fallback local async execution (no Dask)."""
+        name = cls.__name__
+        start_time = datetime.datetime.now()
+
+        async with sem:
+            for attempt in range(1, self._retry.attempts + 1):
+                try:
+                    def _sync_block():
+                        with ExitStack() as stack:
+                            inst = cls(**self.artifact_class_kwargs)
+                            inst = stack.enter_context(inst)
+                            inst.update_parquet(period=period, **artifact_kwargs)
+
+                    await asyncio.wait_for(
+                        asyncio.to_thread(_sync_block),
+                        timeout=self.update_timeout_seconds,
+                    )
+                    duration = (datetime.datetime.now() - start_time).total_seconds()
+                    self.completion_secs[name] = duration
+                    self.logger.info(f"✅ {name} completed in {duration:.2f}s")
+                    return {
+                        "artifact": name,
+                        "period": period,
+                        "success": True,
+                        "attempts": attempt,
+                        "duration_seconds": duration,
+                    }
+
+                except Exception as e:
+                    if attempt < self._retry.attempts:
+                        delay = min(self._retry.backoff_base ** attempt, self._retry.backoff_max)
+                        delay *= 1 + random.uniform(0, self._retry.jitter)
+                        self.logger.warning(f"Retry {attempt}/{self._retry.attempts} for {name}: {e}")
+                        await asyncio.sleep(delay)
+                    else:
+                        duration = (datetime.datetime.now() - start_time).total_seconds()
+                        self.failed.append(name)
+                        self.logger.error(f"❌ {name} failed after {attempt} attempts: {e}")
+                        return {
+                            "artifact": name,
+                            "period": period,
+                            "success": False,
+                            "attempts": attempt,
+                            "error": str(e),
+                            "duration_seconds": duration,
+                        }
+
+
+    def get_update_status(self) -> Dict[str, Any]:
+        """Returns summary of completed, failed, and pending artifacts."""
+        done = set(self.completion_secs)
+        fail = set(self.failed)
+        all_names = {cls.__name__ for v in self.wrapped_classes.values() for cls in v}
+        return {
+            "total": len(all_names),
+            "completed": sorted(done),
+            "failed": sorted(fail),
+            "pending": sorted(all_names - done - fail),
+            "completion_times": self.completion_secs,
+        }
+
+    def _cleanup(self) -> None:
+        """Ensures safe resource closure."""
+        with suppress(Exception):
+            if getattr(self, "own_dask_client", False):
+                self._close_dask_client()
+
```
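For orientation, a hedged usage sketch of the new orchestrator. It assumes an artifact class honoring the contract visible above (constructed with the shared kwargs, usable as a context manager, exposing `update_parquet(period=..., **kwargs)`); `MyDailyArtifact` is hypothetical, and `Logger.default_logger` is assumed to exist, as the worker path implies:

```python
import asyncio

import fsspec

from sibi_dst.df_helper import ArtifactUpdaterMultiWrapperAsync
from sibi_dst.utils import Logger


class MyDailyArtifact:
    """Hypothetical artifact matching the wrapper's expectations."""

    def __init__(self, **kwargs):
        self.kwargs = kwargs  # receives logger, fs, debug, verbose, ...

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        return False

    def update_parquet(self, period: str, **kwargs) -> None:
        print(f"updating for period={period!r} with {kwargs}")


async def main() -> None:
    wrapper = ArtifactUpdaterMultiWrapperAsync(
        wrapped_classes={"daily": [MyDailyArtifact]},
        logger=Logger.default_logger(logger_name="updater"),
        fs=fsspec.filesystem("file"),
        use_dask=False,    # take the semaphore-bounded local thread path
        max_workers=2,
        retry_attempts=3,  # local retries sleep min(2.0 ** attempt, 60) * (1 + U(0, 0.15)) s
    )
    results = await wrapper.update_data("daily")
    print(results)
    print(wrapper.get_update_status())


asyncio.run(main())
```

Note that `_init_dask_client` still runs in `__init__` even with `use_dask=False`; whether that spins up a local client depends on `DaskClientMixin` in the new `dask_utils.py`, which this diff lists but whose body is not shown in this section.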
{sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/__init__.py

```diff
@@ -24,6 +24,7 @@ from .manifest_manager import MissingManifestManager
 __all__ = [
     "Logger",
     "ManagedResource",
+
     "ConfigManager",
     "ConfigLoader",
     "DateUtils",
@@ -42,5 +43,5 @@ __all__ = [
     "FsRegistry",
     "DataFromHttpSource",
     "WebDAVClient",
-    "MissingManifestManager"
+    "MissingManifestManager",
 ]
```
{sibi_dst-2025.9.11 → sibi_dst-2025.9.12}/sibi_dst/utils/boilerplate/base_pipeline.py

```diff
@@ -93,7 +93,7 @@ class BasePipeline(ManagedResource):
         df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
         df["partition_date"] = df[self.date_field].dt.date.astype(str)
 
-        out_path = self.storage_path.rstrip("/")
+        out_path = self.storage_path.rstrip("/")
         self.logger.info("Saving dataset to %s", out_path)
         ps = ParquetSaver(
             df_result=df,
@@ -111,7 +111,6 @@ class BasePipeline(ManagedResource):
             parquet_start_date=self.start_date,
             parquet_end_date=self.end_date,
             parquet_storage_path=self.storage_path,
-            parquet_filename=self._get_output_filename(),
             fs=self.fs,
             debug=self.debug,
             logger=self.logger,
```