sibi-dst 2025.8.6__py3-none-any.whl → 2025.8.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +111 -61
- sibi_dst/df_helper/_parquet_artifact.py +11 -10
- sibi_dst/df_helper/_parquet_reader.py +4 -0
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +504 -214
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +11 -10
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +9 -8
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +4 -76
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -104
- sibi_dst/utils/async_utils.py +12 -0
- sibi_dst/utils/boilerplate/__init__.py +6 -0
- sibi_dst/utils/boilerplate/base_data_artifact.py +110 -0
- sibi_dst/utils/boilerplate/base_data_cube.py +79 -0
- sibi_dst/utils/data_wrapper.py +22 -263
- sibi_dst/utils/iceberg_saver.py +126 -0
- sibi_dst/utils/log_utils.py +0 -346
- sibi_dst/utils/parquet_saver.py +110 -9
- sibi_dst/utils/progress/__init__.py +5 -0
- sibi_dst/utils/progress/jobs.py +82 -0
- sibi_dst/utils/progress/sse_runner.py +82 -0
- sibi_dst/utils/storage_hive.py +232 -0
- sibi_dst/utils/update_planner.py +617 -116
- {sibi_dst-2025.8.6.dist-info → sibi_dst-2025.8.8.dist-info}/METADATA +3 -2
- {sibi_dst-2025.8.6.dist-info → sibi_dst-2025.8.8.dist-info}/RECORD +24 -15
- {sibi_dst-2025.8.6.dist-info → sibi_dst-2025.8.8.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from typing import Any, Dict, Optional, TypeVar, Union

 import dask.dataframe as dd
@@ -15,6 +16,15 @@ from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromD

 T = TypeVar("T", bound=BaseModel)

+def _is_dask_df(x) -> bool:
+    return isinstance(x, dd.DataFrame)
+
+def _maybe_persist(df, persist: bool):
+    return df.persist() if persist and _is_dask_df(df) else df
+
+def _maybe_compute(df, as_pandas: bool):
+    return df.compute() if as_pandas and _is_dask_df(df) else df
+

 # ---- Backend Strategy Pattern ----
 class BaseBackend:
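The three new module-level helpers factor the persist/compute decision out of the loaders so load() and aload() can share it. A minimal, self-contained sketch of how they compose (the sample dataframe is an assumption, not from the diff):

# Illustrative composition of the new helpers (not part of the diff).
import pandas as pd
import dask.dataframe as dd

def _is_dask_df(x) -> bool:
    return isinstance(x, dd.DataFrame)

def _maybe_persist(df, persist: bool):
    # Persist applies only to Dask frames; pandas frames pass through untouched.
    return df.persist() if persist and _is_dask_df(df) else df

def _maybe_compute(df, as_pandas: bool):
    # Compute materializes a lazy Dask frame to pandas; pandas input is returned as-is.
    return df.compute() if as_pandas and _is_dask_df(df) else df

ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=2)
result = _maybe_compute(_maybe_persist(ddf, persist=True), as_pandas=True)
print(type(result))  # <class 'pandas.core.frame.DataFrame'>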
@@ -22,13 +32,13 @@ class BaseBackend:
         self.helper = helper
         self.logger = helper.logger
         self.debug = helper.debug
-        self.total_records =
+        self.total_records = -1

     def load(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
         raise NotImplementedError

     async def aload(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
-        return self.load
+        return await asyncio.to_thread(self.load, **options)


 class SqlAlchemyBackend(BaseBackend):
@@ -47,7 +57,7 @@ class SqlAlchemyBackend(BaseBackend):
             self.total_records, result = db_loader.build_and_load()
             return self.total_records, result
         except Exception as e:
-            self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
+            self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)

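BaseBackend.aload() now defaults to running the synchronous load() in a worker thread via asyncio.to_thread, so backends without a native async path stay usable from async code without blocking the event loop. A standalone sketch of that fallback pattern (the class and timing below are illustrative assumptions):

# Sketch of the asyncio.to_thread fallback used by BaseBackend.aload (illustrative).
import asyncio
import time

class SyncOnlyBackend:
    def load(self, **options):
        time.sleep(0.1)  # stand-in for blocking I/O (database or parquet read)
        return 3, [1, 2, 3]

    async def aload(self, **options):
        # Offload the blocking call so the event loop keeps servicing other tasks.
        return await asyncio.to_thread(self.load, **options)

async def main():
    total, rows = await SyncOnlyBackend().aload()
    print(total, rows)

asyncio.run(main())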
@@ -55,56 +65,59 @@ class ParquetBackend(BaseBackend):
     def load(self, **options):
         try:
             df = self.helper.backend_parquet.load_files(**options)
-            if self.
-
-            nrows = self._row_count(df)
-            if nrows == 0:
-                self.logger.debug("No records after filters; returning empty DataFrame.")
+            if not self.helper._has_any_rows(df):
+                self.total_records = 0
                 return 0, self._empty_like(df)

-
-            self.total_records =
+            # Let DfHelper decide about persist
+            self.total_records = -1 # unknown without full count
             return self.total_records, df

         except Exception as e:
             self.total_records = -1 # Reset total_records on failure
-            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
+            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)

-
-
-        try:
-            # head with npartitions=-1 walks partitions until it gets n rows
-            return ddf.head(1, npartitions=-1).shape[0] == 0
-        except Exception:
-            return True
-
-    def _row_count(self, ddf) -> int:
-        """Reliable row count for Dask DataFrame."""
-        return int(ddf.map_partitions(len).sum().compute())
-
-    def _empty_like(self, ddf):
-        """Return an empty Dask DF with the SAME columns/dtypes."""
+    @staticmethod
+    def _empty_like(ddf):
         empty_pdf = ddf._meta.iloc[0:0]
         return dd.from_pandas(empty_pdf, npartitions=1)


 class HttpBackend(BaseBackend):
     def load(self, **options):
-        #
-
+        # Avoid event-loop problems in sync code paths.
+        # If someone calls .load() on an async backend, make it explicit.
+        raise RuntimeError(
+            "HttpBackend.load() is sync but this backend is async-only. "
+            "Call `await helper.aload(...)` or `await helper.load_async(prefer_native=True, ...)`."
+        )

     async def aload(self, **options):
         if not self.helper.backend_http:
-            self.logger.warning("HTTP plugin not configured properly.")
+            self.logger.warning("HTTP plugin not configured properly.", extra=self.helper.logger_extra)
             self.total_records = -1
             return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+
         result = await self.helper.backend_http.fetch_data(**options)
-
-
+
+        # Normalize to DataFrame if the plugin returns list/dict
+        if isinstance(result, (list, dict)):
+            pdf = pd.DataFrame(result)
+            ddf = dd.from_pandas(pdf, npartitions=max(1, min(32, len(pdf) // 50_000 or 1)))
+            self.total_records = len(pdf)
+            return self.total_records, ddf
+
+        if isinstance(result, pd.DataFrame):
+            self.total_records = len(result)
+            ddf = dd.from_pandas(result, npartitions=max(1, min(32, len(result) // 50_000 or 1)))
+            return self.total_records, ddf
+
+        # Fallback
+        self.total_records = -1
+        return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)


-# ---- Main DfHelper ----
 class DfHelper(ManagedResource):
     _BACKEND_STRATEGIES = {
         "sqlalchemy": SqlAlchemyBackend,
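HttpBackend.aload() now normalizes list, dict, or pandas results from the HTTP plugin into a Dask DataFrame, sizing partitions at roughly 50,000 rows each with a cap of 32. A self-contained sketch of that sizing heuristic in isolation (the sample payload is assumed):

# Sketch of the partition-sizing heuristic used when normalizing HTTP results (illustrative).
import pandas as pd
import dask.dataframe as dd

def to_dask(records):
    """Normalize a list/dict payload into a Dask DataFrame, ~50k rows per partition, max 32."""
    pdf = pd.DataFrame(records)
    npartitions = max(1, min(32, len(pdf) // 50_000 or 1))
    return len(pdf), dd.from_pandas(pdf, npartitions=npartitions)

total, ddf = to_dask([{"id": i, "value": i * 2} for i in range(120_000)])
print(total, ddf.npartitions)  # 120000 2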
@@ -119,6 +132,7 @@ class DfHelper(ManagedResource):
     }

     default_config: Dict[str, Any] = None
+    logger_extra: Dict[str, Any] = {"sibi_dst_component": __name__}

     def __init__(self, backend="sqlalchemy", **kwargs):
         self.default_config = self.default_config or {}
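All log calls in this module now pass extra=self.logger_extra, tagging records with a sibi_dst_component field. A hedged sketch of how a consumer could surface that field with the standard logging module (the formatter configuration is an assumption, not part of the package):

# Illustrative only: exposing the `sibi_dst_component` extra in standard logging output.
import logging

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(levelname)s %(sibi_dst_component)s %(message)s"))

logger = logging.getLogger("sibi_dst_demo")
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

logger_extra = {"sibi_dst_component": "sibi_dst.df_helper._df_helper"}
logger.debug("Loading data from sqlalchemy backend", extra=logger_extra)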
@@ -155,24 +169,25 @@
     def _cleanup(self):
         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
         if not attr_name:
-            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
             return
         active_config = getattr(self, attr_name, None)
         if active_config and hasattr(active_config, "close"):
-            self.logger.debug(f"
+            self.logger.debug(f"{self.__class__.__name__} is closing resources for backend '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
             active_config.close()

     async def _acleanup(self):
         self.logger.warning(
-            "DfHelper instance was not used in an async context manager; cleanup is being called manually."
+            "DfHelper instance was not used in an async context manager; cleanup is being called manually.",
+            extra=self.logger_extra,
         )
         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
         if not attr_name:
-            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
             return
         active_config = getattr(self, attr_name, None)
         if active_config and hasattr(active_config, "aclose"):
-            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
             await active_config.aclose()

     # ---------- config helpers ----------
@@ -183,24 +198,49 @@ class DfHelper(ManagedResource):

     # ---------- load/aload ----------
     def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
+        self.logger.debug(f"Loading data from {self.backend} backend with options: {options}", extra=self.logger_extra)
         self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
-
-        df
-
+        df = _maybe_persist(df, persist)
+        return _maybe_compute(df, as_pandas)
+
+    async def aload(
+        self,
+        *,
+        persist: bool = False,
+        as_pandas: bool = False,
+        timeout: Optional[float] = None,
+        **options
+    ) -> Union[pd.DataFrame, dd.DataFrame]:
+        # 1) Async load if available, else run sync load in a thread.
+        if hasattr(self.backend_strategy, "aload"):
+            load_awaitable = self.backend_strategy.aload(**options)
+        else:
+            # Run ONLY the backend load step in a thread to avoid event-loop blocking.
+            load_awaitable = asyncio.to_thread(self.backend_strategy.load, **options)

-
-        self.total_records
-
-
-        df =
-
+        total, df = await (asyncio.wait_for(load_awaitable, timeout) if timeout else load_awaitable)
+        self.total_records = total
+
+        # 2) Post-processing steps are sync; offload to threads.
+        df = await asyncio.to_thread(self._process_loaded_data, df)
+        df = await asyncio.to_thread(self._post_process_df, df)
+
+        # 3) Persist and compute can block; offload when needed.
+        if persist and _is_dask_df(df):
+            df = await asyncio.to_thread(df.persist)
+
+        if as_pandas and _is_dask_df(df):
+            # Allow separate timeout for compute if desired; reuse same timeout here.
+            compute_awaitable = asyncio.to_thread(df.compute)
+            return await (asyncio.wait_for(compute_awaitable, timeout) if timeout else compute_awaitable)
+
+        return df

     # ---------- dataframe post-processing ----------
     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
-        self.logger.debug("
+        self.logger.debug(f"{self.__class__.__name__} is post-processing resulting dataframe with {len(df)} records.", extra=self.logger_extra)
         df_params = self._backend_params.df_params
         if not df_params:
             return df
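The new DfHelper.aload() keeps the event loop responsive by pushing the backend load, post-processing, persist, and compute into worker threads, with an optional timeout applied to the load step and reused for compute. A hedged usage sketch; helper construction is omitted, and the filter kwarg and timeout value are placeholders rather than documented API:

# Illustrative call pattern for the new async loader (helper construction assumed elsewhere).
import asyncio

async def load_frame(helper):
    # Backend load, post-processing, persist and compute all run off the event loop.
    return await helper.aload(
        persist=True,        # persist the Dask graph before further use
        as_pandas=True,      # compute to pandas in a worker thread
        timeout=60.0,        # applied to the load step (and reused for compute)
        status="shipped",    # example filter kwarg; names depend on your backend
    )

# pdf = asyncio.run(load_frame(my_helper))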
@@ -211,7 +251,7 @@ class DfHelper(ManagedResource):
         if fieldnames:
             valid = [f for f in fieldnames if f in df.columns]
             if len(valid) < len(fieldnames):
-                self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}")
+                self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}", extra=self.logger_extra)
             df = df[valid]
         if column_names:
             if len(df.columns) != len(column_names):
@@ -224,7 +264,7 @@ class DfHelper(ManagedResource):
                 raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
             df = df.set_index(index_col)

-        self.logger.debug("Post-processing complete.")
+        self.logger.debug("Post-processing complete.", extra=self.logger_extra)
         return df

     def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
@@ -233,22 +273,25 @@ class DfHelper(ManagedResource):
             return df
         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
             return df
-        self.logger.debug("
+        self.logger.debug(f"{self.__class__.__name__} is applying rename mapping if/when necessary.", extra=self.logger_extra)
         rename_map = {k: v for k, v in field_map.items() if k in df.columns}
         if rename_map:
             df = df.rename(columns=rename_map)
         return df

     # ---------- sinks ----------
-    def save_to_parquet(self, df: dd.DataFrame,
+    def save_to_parquet(self, df: dd.DataFrame, **kwargs):
         fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
-        path: str = kwargs.pop("parquet_storage_path")
+        path: str = kwargs.pop("parquet_storage_path", self.backend_parquet.parquet_storage_path if self.backend_parquet else None)
+        parquet_filename = kwargs.pop("parquet_filename", self.backend_parquet.parquet_filename if self.backend_parquet else None)
+        if not parquet_filename:
+            raise ValueError("A 'parquet_filename' keyword argument must be provided.")
         if not fs:
             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
         if not path:
             raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
-        if
-            self.logger.warning("Skipping save: The provided DataFrame is empty.")
+        if not self._has_any_rows(df):
+            self.logger.warning("Skipping save: The provided DataFrame is empty.", extra=self.logger_extra)
             return

         with ParquetSaver(
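save_to_parquet() now takes **kwargs and falls back to the parquet backend's configured storage path and filename when they are not passed explicitly; a missing filename raises early. A hedged wrapper sketch showing the calling convention (all argument values are placeholders):

# Illustrative wrapper around the new signature (names and paths are placeholders).
import dask.dataframe as dd

def save_orders(helper, ddf: dd.DataFrame) -> None:
    """Explicit arguments win; omit them to fall back to the parquet backend's defaults."""
    helper.save_to_parquet(
        ddf,
        parquet_storage_path="s3://bucket/warehouse/orders",  # placeholder path
        parquet_filename="orders.parquet",
    )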
@@ -262,17 +305,17 @@ class DfHelper(ManagedResource):
         ) as saver:
             saver.save_to_parquet(parquet_filename)

-        self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")
+        self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.", extra=self.logger_extra)

     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
-        if
-            self.logger.warning("
+        if not self._has_any_rows(df):
+            self.logger.warning("Skipping save to ClickHouse: The provided DataFrame is empty.", extra=self.logger_extra)
             return
         with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
             writer.save_to_clickhouse(df)
-        self.logger.debug("Save to ClickHouse completed.")
+        self.logger.debug("Save to ClickHouse completed.", extra=self.logger_extra)

-    # ----------
+    # ---------- period loaders ----------
     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
         return self.load(**final_kwargs)
@@ -288,13 +331,20 @@ class DfHelper(ManagedResource):
         field_map = self._backend_params.field_map or {}
         reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
         if len(reverse_map) != len(field_map):
-            self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
+            self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.", extra=self.logger_extra)
         mapped_field = reverse_map.get(dt_field, dt_field)
         if start_date == end_date:
             kwargs[f"{mapped_field}__date"] = start_date
         else:
             kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
-        self.logger.debug(f"Period load generated filters: {kwargs}")
+        self.logger.debug(f"Period load generated filters: {kwargs}", extra=self.logger_extra)
         return kwargs

+    @staticmethod
+    def _has_any_rows(ddf: dd.DataFrame) -> bool:
+        try:
+            return bool(ddf.head(1, npartitions=-1).shape[0])
+        except Exception:
+            return False
+

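The new _has_any_rows() helper replaces the old full row count: head(1, npartitions=-1) walks partitions only until it finds a single row, so emptiness checks no longer force a whole-graph compute. The same probe as a standalone function:

# Standalone version of the cheap emptiness probe (illustrative).
import pandas as pd
import dask.dataframe as dd

def has_any_rows(ddf: dd.DataFrame) -> bool:
    try:
        # head(1, npartitions=-1) walks partitions until it finds one row,
        # which is far cheaper than counting every partition.
        return bool(ddf.head(1, npartitions=-1).shape[0])
    except Exception:
        return False

empty = dd.from_pandas(pd.DataFrame({"a": []}), npartitions=1)
full = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2)
print(has_any_rows(empty), has_any_rows(full))  # False True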
sibi_dst/df_helper/_parquet_artifact.py
CHANGED
@@ -23,6 +23,7 @@ class ParquetArtifact(ManagedResource):

     _global_lock = threading.RLock()
     _active_runs: set[tuple[str, str]] = set()
+    logger_extra = {"sibi_dst_component": __name__}

     def __init__(self, **kwargs: Any):
         # Merge defaults from ManagedResource and caller kwargs
@@ -49,7 +50,7 @@ class ParquetArtifact(ManagedResource):
     # ---------- lazy members ----------
     @cached_property
     def mmanifest(self) -> MissingManifestManager:
-        self.logger.info("Initializing MissingManifestManager...")
+        self.logger.info("Initializing MissingManifestManager...", extra=self.logger_extra)
         manifest_path = self._build_manifest_path()

         # ensure manifest directory exists
@@ -66,16 +67,16 @@ class ParquetArtifact(ManagedResource):
         )

         if not mgr._safe_exists(mgr.manifest_path):
-            self.logger.info(f"Creating new manifest at {mgr.manifest_path}")
+            self.logger.info(f"Creating new manifest at {mgr.manifest_path}", extra=self.logger_extra)
             mgr.save()
         else:
-            self.logger.info(f"Manifest already exists at {mgr.manifest_path}")
+            self.logger.info(f"Manifest already exists at {mgr.manifest_path}", extra=self.logger_extra)

         return mgr

     @cached_property
     def update_planner(self) -> UpdatePlanner:
-        self.logger.info("Initializing UpdatePlanner...")
+        self.logger.info("Initializing UpdatePlanner...", extra=self.logger_extra)
         skipped_files = self.mmanifest.load_existing() or []

         cfg = {
@@ -91,7 +92,7 @@ class ParquetArtifact(ManagedResource):

     @cached_property
     def data_wrapper(self) -> DataWrapper:
-        self.logger.info("Initializing DataWrapper...")
+        self.logger.info("Initializing DataWrapper...", extra=self.logger_extra)

         # Ensure the planner has a plan
         if getattr(self.update_planner, "plan", None) is None:
@@ -170,7 +171,7 @@ class ParquetArtifact(ManagedResource):
         with ParquetArtifact._global_lock:
             if key in ParquetArtifact._active_runs:
                 self.logger.info(
-                    f"Run already in progress for {key}; skipping this invocation."
+                    f"Run already in progress for {key}; skipping this invocation.", extra=self.logger_extra
                 )
                 return
             ParquetArtifact._active_runs.add(key)
@@ -182,7 +183,7 @@ class ParquetArtifact(ManagedResource):
         plan = getattr(self.update_planner, "plan", None)
         if plan is None or (hasattr(plan, "empty") and plan.empty):
             # Planning uses Pandas; this is safe to check.
-            self.logger.info("No updates needed. Skipping Parquet generation.")
+            self.logger.info("No updates needed. Skipping Parquet generation.", extra=self.logger_extra)
             return

         # Print plan once per run
@@ -286,7 +287,7 @@ class ParquetArtifact(ManagedResource):

         final_kwargs.update(period_params)
         self.logger.debug(
-            f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}"
+            f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}", extra=self.logger_extra
         )

         # Delegate to generator (handles cache invalidation + forwarding knobs)
@@ -297,7 +298,7 @@ class ParquetArtifact(ManagedResource):
         """Ensure the directory exists across fsspec backends."""
         with self._lock:
             if not self.fs.exists(path):
-                self.logger.info(f"Creating directory: {path}")
+                self.logger.info(f"Creating directory: {path}", extra=self.logger_extra)
                 try:
                     self.fs.makedirs(path, exist_ok=True)
                 except TypeError:
@@ -317,4 +318,4 @@ class ParquetArtifact(ManagedResource):
             if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
                 self.data_wrapper.close()
         except Exception as e:
-            self.logger.warning(f"Error during resource cleanup: {e}")
+            self.logger.warning(f"Error during resource cleanup: {e}", extra=self.logger_extra)
sibi_dst/df_helper/_parquet_reader.py
CHANGED
@@ -70,6 +70,10 @@ class ParquetReader(DfHelper):
         self.df = super().load(**kwargs)
         return self.df

+    async def aload(self, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
+        self.df = await super().aload(**kwargs)
+        return self.df
+
     def directory_exists(self):
         try:
             info = self.fs.info(self.parquet_storage_path)
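ParquetReader.aload() simply delegates to DfHelper.aload() and caches the result on self.df. A minimal standalone illustration of that delegate-and-cache pattern (the classes below are stand-ins, not the package API):

# Minimal illustration of the delegation-plus-cache pattern ParquetReader uses (illustrative).
import asyncio

class BaseLoader:
    async def aload(self, **kwargs):
        await asyncio.sleep(0)  # stand-in for the real async load
        return {"rows": kwargs.get("limit", 0)}

class CachingReader(BaseLoader):
    def __init__(self):
        self.df = None

    async def aload(self, **kwargs):
        # Same shape as ParquetReader.aload: delegate, cache, return.
        self.df = await super().aload(**kwargs)
        return self.df

print(asyncio.run(CachingReader().aload(limit=5)))  # {'rows': 5}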