sibi-dst 2025.8.7.tar.gz → 2025.8.8.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/PKG-INFO +3 -2
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/pyproject.toml +3 -2
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_df_helper.py +105 -89
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_parquet_artifact.py +11 -10
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_parquet_reader.py +4 -0
- sibi_dst-2025.8.8/sibi_dst/df_helper/backends/parquet/_parquet_options.py +565 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +11 -10
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +9 -8
- sibi_dst-2025.8.8/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +56 -0
- sibi_dst-2025.8.8/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +50 -0
- sibi_dst-2025.8.8/sibi_dst/utils/boilerplate/__init__.py +6 -0
- sibi_dst-2025.8.8/sibi_dst/utils/boilerplate/base_data_artifact.py +110 -0
- sibi_dst-2025.8.8/sibi_dst/utils/boilerplate/base_data_cube.py +79 -0
- sibi_dst-2025.8.8/sibi_dst/utils/data_wrapper.py +277 -0
- sibi_dst-2025.8.8/sibi_dst/utils/iceberg_saver.py +126 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/log_utils.py +0 -346
- sibi_dst-2025.8.8/sibi_dst/utils/parquet_saver.py +224 -0
- sibi_dst-2025.8.8/sibi_dst/utils/progress/__init__.py +5 -0
- sibi_dst-2025.8.8/sibi_dst/utils/progress/jobs.py +82 -0
- sibi_dst-2025.8.8/sibi_dst/utils/progress/sse_runner.py +82 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/storage_hive.py +38 -1
- sibi_dst-2025.8.8/sibi_dst/utils/update_planner.py +801 -0
- sibi_dst-2025.8.7/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -275
- sibi_dst-2025.8.7/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -128
- sibi_dst-2025.8.7/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -154
- sibi_dst-2025.8.7/sibi_dst/utils/data_wrapper.py +0 -518
- sibi_dst-2025.8.7/sibi_dst/utils/parquet_saver.py +0 -123
- sibi_dst-2025.8.7/sibi_dst/utils/update_planner.py +0 -300
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/README.md +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/async_utils.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/base.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/business_days.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/file_age_checker.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/periods.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/v2/utils/log_utils.py +0 -0
{sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.8.7
+Version: 2025.8.8
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -19,10 +19,11 @@ Requires-Dist: pandas (>=2.3.1,<3.0.0)
 Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
 Requires-Dist: pyarrow (>=20.0.0,<21.0.0)
 Requires-Dist: pydantic (>=2.11.7,<3.0.0)
+Requires-Dist: pyiceberg[hive,s3fs] (>=0.9.1,<0.10.0)
 Requires-Dist: pymysql (>=1.1.1,<2.0.0)
-Requires-Dist: rich (>=14.0.0,<15.0.0)
 Requires-Dist: s3fs (>=2025.5.1,<2026.0.0)
 Requires-Dist: sqlalchemy (>=2.0.41,<3.0.0)
+Requires-Dist: sse-starlette (>=3.0.2,<4.0.0)
 Requires-Dist: tqdm (>=4.67.1,<5.0.0)
 Requires-Dist: webdav4 (>=0.10.0,<0.11.0)
 Description-Content-Type: text/markdown
{sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "2025.8.7"
+version = "2025.8.8"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -21,9 +21,10 @@ pydantic = "^2.11.7"
 sqlalchemy = "^2.0.41"
 pymysql = "^1.1.1"
 pyarrow = "^20.0.0"
-rich = "^14.0.0"
 opentelemetry-exporter-otlp = "^1.35.0"
 opentelemetry-sdk = "^1.35.0"
+pyiceberg = {extras = ["hive", "s3fs"], version = "^0.9.1"}
+sse-starlette = "^3.0.2"

 [tool.poetry.group.dev]
 optional = true
{sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_df_helper.py

@@ -16,6 +16,15 @@ from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromD

 T = TypeVar("T", bound=BaseModel)

+def _is_dask_df(x) -> bool:
+    return isinstance(x, dd.DataFrame)
+
+def _maybe_persist(df, persist: bool):
+    return df.persist() if persist and _is_dask_df(df) else df
+
+def _maybe_compute(df, as_pandas: bool):
+    return df.compute() if as_pandas and _is_dask_df(df) else df
+

 # ---- Backend Strategy Pattern ----
 class BaseBackend:
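The new module-level helpers gate `persist()`/`compute()` on whether the object is actually a Dask DataFrame, so pandas frames pass through untouched. A self-contained sketch of that gating pattern (it mirrors the helpers in the hunk above; the demo frame is made up):

```python
import dask.dataframe as dd
import pandas as pd

def _is_dask_df(x) -> bool:
    return isinstance(x, dd.DataFrame)

def _maybe_persist(df, persist: bool):
    # persist() only makes sense for lazy Dask frames; pandas passes through
    return df.persist() if persist and _is_dask_df(df) else df

def _maybe_compute(df, as_pandas: bool):
    # compute() materializes a Dask frame into pandas; pandas passes through
    return df.compute() if as_pandas and _is_dask_df(df) else df

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2)
print(isinstance(_maybe_compute(ddf, as_pandas=True), pd.DataFrame))   # True
print(isinstance(_maybe_compute(ddf, as_pandas=False), dd.DataFrame))  # True (still lazy)
```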
@@ -23,13 +32,13 @@ class BaseBackend:
         self.helper = helper
         self.logger = helper.logger
         self.debug = helper.debug
-        self.total_records =
+        self.total_records = -1

     def load(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
         raise NotImplementedError

     async def aload(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
-        return self.load
+        return await asyncio.to_thread(self.load,**options)


 class SqlAlchemyBackend(BaseBackend):
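`BaseBackend.aload()` now defaults to running the synchronous `load()` in a worker thread via `asyncio.to_thread`, so every backend gets an async entry point for free. A minimal standalone sketch of the pattern (the backend class and its return value are illustrative, not from the package):

```python
import asyncio
import time

class BlockingBackend:
    def load(self, **options):
        time.sleep(0.1)  # stand-in for a blocking DB/parquet read
        return 3, {"options": options}

    async def aload(self, **options):
        # Same idea as BaseBackend.aload: offload the sync load to a thread
        # so the event loop stays responsive while the read blocks.
        return await asyncio.to_thread(self.load, **options)

async def main():
    total, payload = await BlockingBackend().aload(limit=10)
    print(total, payload)

asyncio.run(main())
```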
@@ -48,7 +57,7 @@ class SqlAlchemyBackend(BaseBackend):
             self.total_records, result = db_loader.build_and_load()
             return self.total_records, result
         except Exception as e:
-            self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
+            self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)


@@ -56,53 +65,57 @@ class ParquetBackend(BaseBackend):
     def load(self, **options):
         try:
             df = self.helper.backend_parquet.load_files(**options)
-            if self.
-
-            nrows = self._row_count(df)
-            if nrows == 0:
-                self.logger.debug("No records after filters; returning empty DataFrame.")
+            if not self.helper._has_any_rows(df):
+                self.total_records = 0
                 return 0, self._empty_like(df)

-
-            self.total_records =
+            # Let DfHelper decide about persist
+            self.total_records = -1  # unknown without full count
             return self.total_records, df

         except Exception as e:
             self.total_records = -1  # Reset total_records on failure
-            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
+            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)

-
-
-        try:
-            # head with npartitions=-1 walks partitions until it gets n rows
-            return ddf.head(1, npartitions=-1).shape[0] == 0
-        except Exception:
-            return True
-
-    def _row_count(self, ddf) -> int:
-        """Reliable row count for Dask DataFrame."""
-        return int(ddf.map_partitions(len).sum().compute())
-
-    def _empty_like(self, ddf):
-        """Return an empty Dask DF with the SAME columns/dtypes."""
+    @staticmethod
+    def _empty_like(ddf):
         empty_pdf = ddf._meta.iloc[0:0]
         return dd.from_pandas(empty_pdf, npartitions=1)


 class HttpBackend(BaseBackend):
     def load(self, **options):
-        #
-
+        # Avoid event-loop problems in sync code paths.
+        # If someone calls .load() on an async backend, make it explicit.
+        raise RuntimeError(
+            "HttpBackend.load() is sync but this backend is async-only. "
+            "Call `await helper.aload(...)` or `await helper.load_async(prefer_native=True, ...)`."
+        )

     async def aload(self, **options):
         if not self.helper.backend_http:
-            self.logger.warning("HTTP plugin not configured properly.")
+            self.logger.warning("HTTP plugin not configured properly.", extra=self.helper.logger_extra)
             self.total_records = -1
             return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+
         result = await self.helper.backend_http.fetch_data(**options)
-
-
+
+        # Normalize to DataFrame if the plugin returns list/dict
+        if isinstance(result, (list, dict)):
+            pdf = pd.DataFrame(result)
+            ddf = dd.from_pandas(pdf, npartitions=max(1, min(32, len(pdf) // 50_000 or 1)))
+            self.total_records = len(pdf)
+            return self.total_records, ddf
+
+        if isinstance(result, pd.DataFrame):
+            self.total_records = len(result)
+            ddf = dd.from_pandas(result, npartitions=max(1, min(32, len(result) // 50_000 or 1)))
+            return self.total_records, ddf
+
+        # Fallback
+        self.total_records = -1
+        return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)


 class DfHelper(ManagedResource):
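The new `HttpBackend.aload()` normalizes list/dict payloads (and pandas frames) into Dask DataFrames, choosing the partition count with `max(1, min(32, len(x) // 50_000 or 1))`: roughly one partition per 50k rows, clamped between 1 and 32. A quick standalone check of that heuristic (row counts are arbitrary):

```python
def partitions_for(n_rows: int) -> int:
    # one partition per ~50k rows, at least 1, at most 32
    return max(1, min(32, n_rows // 50_000 or 1))

for n in (0, 120, 75_000, 400_000, 5_000_000):
    print(f"{n:>9} rows -> {partitions_for(n)} partition(s)")
# 0 -> 1, 120 -> 1, 75000 -> 1, 400000 -> 8, 5000000 -> 32
```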
@@ -119,6 +132,7 @@ class DfHelper(ManagedResource):
     }

     default_config: Dict[str, Any] = None
+    logger_extra: Dict[str, Any] = {"sibi_dst_component": __name__}

     def __init__(self, backend="sqlalchemy", **kwargs):
         self.default_config = self.default_config or {}
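This hunk introduces the class-level `logger_extra` dict that the rest of the release threads through nearly every log call as `extra=...`, tagging each record with the originating component. A minimal, standalone illustration of how such an `extra` dict surfaces through the standard `logging` module (the format string is an assumption, not part of sibi-dst):

```python
import logging

logging.basicConfig(format="%(levelname)s [%(sibi_dst_component)s] %(message)s")
log = logging.getLogger("demo")

logger_extra = {"sibi_dst_component": "sibi_dst.df_helper._df_helper"}
# Keys passed via `extra` become attributes on the LogRecord, so the
# formatter above can reference %(sibi_dst_component)s directly.
log.warning("No attribute mapping found for backend 'http'. Cleanup skipped.", extra=logger_extra)
# WARNING [sibi_dst.df_helper._df_helper] No attribute mapping found for backend 'http'. Cleanup skipped.
```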
@@ -155,24 +169,25 @@
     def _cleanup(self):
         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
         if not attr_name:
-            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
             return
         active_config = getattr(self, attr_name, None)
         if active_config and hasattr(active_config, "close"):
-            self.logger.debug(f"
+            self.logger.debug(f"{self.__class__.__name__} is closing resources for backend '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
             active_config.close()

     async def _acleanup(self):
         self.logger.warning(
-            "DfHelper instance was not used in an async context manager; cleanup is being called manually."
+            "DfHelper instance was not used in an async context manager; cleanup is being called manually.",
+            extra=self.logger_extra,
         )
         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
         if not attr_name:
-            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
             return
         active_config = getattr(self, attr_name, None)
         if active_config and hasattr(active_config, "aclose"):
-            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
             await active_config.aclose()

     # ---------- config helpers ----------
@@ -183,55 +198,49 @@

     # ---------- load/aload ----------
     def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
+        self.logger.debug(f"Loading data from {self.backend} backend with options: {options}", extra=self.logger_extra)
         self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
-
-        df
-
+        df = _maybe_persist(df, persist)
+        return _maybe_compute(df, as_pandas)
+
+    async def aload(
+        self,
+        *,
+        persist: bool = False,
+        as_pandas: bool = False,
+        timeout: Optional[float] = None,
+        **options
+    ) -> Union[pd.DataFrame, dd.DataFrame]:
+        # 1) Async load if available, else run sync load in a thread.
+        if hasattr(self.backend_strategy, "aload"):
+            load_awaitable = self.backend_strategy.aload(**options)
+        else:
+            # Run ONLY the backend load step in a thread to avoid event-loop blocking.
+            load_awaitable = asyncio.to_thread(self.backend_strategy.load, **options)

-
-        self.total_records
-
-
-        df =
-
-
-
-
-
-
-
-
-
-
-
-
-        otherwise runs the sync `load()` in a worker thread via asyncio.to_thread.
-
-        Args:
-            persist: same as `load`
-            as_pandas: same as `load`
-            prefer_native: if True and the backend overrides `aload`, use it.
-                otherwise force thread offload of `load()`.
-            **options: forwarded to `load` / `aload`
-        """
-        # If the backend provided an override for `aload`, use it
-        if prefer_native and type(self.backend_strategy).aload is not BaseBackend.aload:
-            return await self.aload(persist=persist, as_pandas=as_pandas, **options)
-
-        # Fall back to offloading the sync path to a thread
-        return await asyncio.to_thread(
-            self.load,
-            persist=persist,
-            as_pandas=as_pandas,
-            **options,
-        )
+        total, df = await (asyncio.wait_for(load_awaitable, timeout) if timeout else load_awaitable)
+        self.total_records = total
+
+        # 2) Post-processing steps are sync; offload to threads.
+        df = await asyncio.to_thread(self._process_loaded_data, df)
+        df = await asyncio.to_thread(self._post_process_df, df)
+
+        # 3) Persist and compute can block; offload when needed.
+        if persist and _is_dask_df(df):
+            df = await asyncio.to_thread(df.persist)
+
+        if as_pandas and _is_dask_df(df):
+            # Allow separate timeout for compute if desired; reuse same timeout here.
+            compute_awaitable = asyncio.to_thread(df.compute)
+            return await (asyncio.wait_for(compute_awaitable, timeout) if timeout else compute_awaitable)
+
+        return df

     # ---------- dataframe post-processing ----------
     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
-        self.logger.debug("
+        self.logger.debug(f"{self.__class__.__name__} is post-processing resulting dataframe with {len(df)} records.", extra=self.logger_extra)
         df_params = self._backend_params.df_params
         if not df_params:
             return df
@@ -242,7 +251,7 @@
         if fieldnames:
             valid = [f for f in fieldnames if f in df.columns]
             if len(valid) < len(fieldnames):
-                self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}")
+                self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}", extra=self.logger_extra)
             df = df[valid]
         if column_names:
             if len(df.columns) != len(column_names):
@@ -255,7 +264,7 @@
                 raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
             df = df.set_index(index_col)

-        self.logger.debug("Post-processing complete.")
+        self.logger.debug("Post-processing complete.", extra=self.logger_extra)
         return df

     def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
@@ -264,7 +273,7 @@
             return df
         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
             return df
-        self.logger.debug("
+        self.logger.debug(f"{self.__class__.__name__} is applying rename mapping if/when necessary.", extra=self.logger_extra)
         rename_map = {k: v for k, v in field_map.items() if k in df.columns}
         if rename_map:
             df = df.rename(columns=rename_map)
@@ -274,15 +283,15 @@
     def save_to_parquet(self, df: dd.DataFrame, **kwargs):
         fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
         path: str = kwargs.pop("parquet_storage_path", self.backend_parquet.parquet_storage_path if self.backend_parquet else None)
-        parquet_filename = kwargs.pop("parquet_filename"
+        parquet_filename = kwargs.pop("parquet_filename", self.backend_parquet.parquet_filename if self.backend_parquet else None)
         if not parquet_filename:
             raise ValueError("A 'parquet_filename' keyword argument must be provided.")
         if not fs:
             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
         if not path:
             raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
-        if
-            self.logger.warning("Skipping save: The provided DataFrame is empty.")
+        if not self._has_any_rows(df):
+            self.logger.warning("Skipping save: The provided DataFrame is empty.", extra=self.logger_extra)
             return

         with ParquetSaver(
@@ -296,15 +305,15 @@
         ) as saver:
             saver.save_to_parquet(parquet_filename)

-        self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")
+        self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.", extra=self.logger_extra)

     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
-        if
-            self.logger.warning("
+        if not self._has_any_rows(df):
+            self.logger.warning("Skipping save to ClickHouse: The provided DataFrame is empty.", extra=self.logger_extra)
             return
         with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
             writer.save_to_clickhouse(df)
-        self.logger.debug("Save to ClickHouse completed.")
+        self.logger.debug("Save to ClickHouse completed.", extra=self.logger_extra)

     # ---------- period loaders ----------
     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
@@ -322,13 +331,20 @@
         field_map = self._backend_params.field_map or {}
         reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
         if len(reverse_map) != len(field_map):
-            self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
+            self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.", extra=self.logger_extra)
         mapped_field = reverse_map.get(dt_field, dt_field)
         if start_date == end_date:
             kwargs[f"{mapped_field}__date"] = start_date
         else:
             kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
-        self.logger.debug(f"Period load generated filters: {kwargs}")
+        self.logger.debug(f"Period load generated filters: {kwargs}", extra=self.logger_extra)
         return kwargs

+    @staticmethod
+    def _has_any_rows(ddf: dd.DataFrame) -> bool:
+        try:
+            return bool(ddf.head(1, npartitions=-1).shape[0])
+        except Exception:
+            return False
+

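`load_period()` turns a date window into the library's double-underscore filter keywords after reverse-mapping any renamed fields back to their source names. The branch shown in the hunk reduces to this small, runnable illustration (`order_date` is a made-up field; parsing of `start`/`end` happens earlier in the real method):

```python
def period_filters(mapped_field: str, start_date, end_date) -> dict:
    # Mirrors the single-day vs. range branch in DfHelper.load_period()
    kwargs = {}
    if start_date == end_date:
        kwargs[f"{mapped_field}__date"] = start_date
    else:
        kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
    return kwargs

print(period_filters("order_date", "2025-01-15", "2025-01-15"))
# {'order_date__date': '2025-01-15'}
print(period_filters("order_date", "2025-01-01", "2025-01-31"))
# {'order_date__date__range': ['2025-01-01', '2025-01-31']}
```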
{sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_parquet_artifact.py

@@ -23,6 +23,7 @@ class ParquetArtifact(ManagedResource):

     _global_lock = threading.RLock()
     _active_runs: set[tuple[str, str]] = set()
+    logger_extra = {"sibi_dst_component": __name__}

     def __init__(self, **kwargs: Any):
         # Merge defaults from ManagedResource and caller kwargs
@@ -49,7 +50,7 @@
     # ---------- lazy members ----------
     @cached_property
     def mmanifest(self) -> MissingManifestManager:
-        self.logger.info("Initializing MissingManifestManager...")
+        self.logger.info("Initializing MissingManifestManager...", extra=self.logger_extra)
         manifest_path = self._build_manifest_path()

         # ensure manifest directory exists
@@ -66,16 +67,16 @@
         )

         if not mgr._safe_exists(mgr.manifest_path):
-            self.logger.info(f"Creating new manifest at {mgr.manifest_path}")
+            self.logger.info(f"Creating new manifest at {mgr.manifest_path}", extra=self.logger_extra)
             mgr.save()
         else:
-            self.logger.info(f"Manifest already exists at {mgr.manifest_path}")
+            self.logger.info(f"Manifest already exists at {mgr.manifest_path}", extra=self.logger_extra)

         return mgr

     @cached_property
     def update_planner(self) -> UpdatePlanner:
-        self.logger.info("Initializing UpdatePlanner...")
+        self.logger.info("Initializing UpdatePlanner...", extra=self.logger_extra)
         skipped_files = self.mmanifest.load_existing() or []

         cfg = {
@@ -91,7 +92,7 @@

     @cached_property
     def data_wrapper(self) -> DataWrapper:
-        self.logger.info("Initializing DataWrapper...")
+        self.logger.info("Initializing DataWrapper...", extra=self.logger_extra)

         # Ensure the planner has a plan
         if getattr(self.update_planner, "plan", None) is None:
@@ -170,7 +171,7 @@
         with ParquetArtifact._global_lock:
             if key in ParquetArtifact._active_runs:
                 self.logger.info(
-                    f"Run already in progress for {key}; skipping this invocation."
+                    f"Run already in progress for {key}; skipping this invocation.", extra=self.logger_extra
                 )
                 return
             ParquetArtifact._active_runs.add(key)
@@ -182,7 +183,7 @@
             plan = getattr(self.update_planner, "plan", None)
             if plan is None or (hasattr(plan, "empty") and plan.empty):
                 # Planning uses Pandas; this is safe to check.
-                self.logger.info("No updates needed. Skipping Parquet generation.")
+                self.logger.info("No updates needed. Skipping Parquet generation.", extra=self.logger_extra)
                 return

             # Print plan once per run
@@ -286,7 +287,7 @@

         final_kwargs.update(period_params)
         self.logger.debug(
-            f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}"
+            f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}", extra=self.logger_extra
         )

         # Delegate to generator (handles cache invalidation + forwarding knobs)
@@ -297,7 +298,7 @@
         """Ensure the directory exists across fsspec backends."""
         with self._lock:
             if not self.fs.exists(path):
-                self.logger.info(f"Creating directory: {path}")
+                self.logger.info(f"Creating directory: {path}", extra=self.logger_extra)
                 try:
                     self.fs.makedirs(path, exist_ok=True)
                 except TypeError:
@@ -317,4 +318,4 @@
             if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
                 self.data_wrapper.close()
         except Exception as e:
-            self.logger.warning(f"Error during resource cleanup: {e}")
+            self.logger.warning(f"Error during resource cleanup: {e}", extra=self.logger_extra)
{sibi_dst-2025.8.7 → sibi_dst-2025.8.8}/sibi_dst/df_helper/_parquet_reader.py

@@ -70,6 +70,10 @@ class ParquetReader(DfHelper):
         self.df = super().load(**kwargs)
         return self.df

+    async def aload(self, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
+        self.df = await super().aload(**kwargs)
+        return self.df
+
     def directory_exists(self):
         try:
             info = self.fs.info(self.parquet_storage_path)