sibi-dst 2025.8.8__py3-none-any.whl → 2025.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +379 -1
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +2 -0
- sibi_dst/utils/base.py +567 -122
- sibi_dst/utils/boilerplate/__init__.py +9 -4
- sibi_dst/utils/boilerplate/base_attacher.py +25 -0
- sibi_dst/utils/boilerplate/{base_data_artifact.py → base_parquet_artifact.py} +1 -1
- sibi_dst/utils/boilerplate/base_parquet_reader.py +21 -0
- sibi_dst/utils/log_utils.py +108 -183
- sibi_dst/utils/progress/sse_runner.py +2 -0
- {sibi_dst-2025.8.8.dist-info → sibi_dst-2025.9.1.dist-info}/METADATA +2 -1
- {sibi_dst-2025.8.8.dist-info → sibi_dst-2025.9.1.dist-info}/RECORD +12 -10
- {sibi_dst-2025.8.8.dist-info → sibi_dst-2025.9.1.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -137,6 +137,7 @@ class DfHelper(ManagedResource):
     def __init__(self, backend="sqlalchemy", **kwargs):
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
+        kwargs.setdefault("auto_sse", True)
         super().__init__(**kwargs)
         self.backend = backend

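Note: because the new default is applied with kwargs.setdefault, a caller-supplied value still wins. A minimal usage sketch (the import path and the auto_sse semantics are assumptions, not confirmed by this diff):

    from sibi_dst.df_helper import DfHelper  # assumed public import path

    helper = DfHelper(backend="sqlalchemy", auto_sse=False)  # explicit value overrides the new default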
@@ -166,6 +167,18 @@ class DfHelper(ManagedResource):
         self.backend_strategy = strategy_cls(self)

     # ---------- ManagedResource hooks ----------
+    def get_sse(self):
+        return self._ensure_sse()
+
+    def _emit_bg(self, event: str, **data: Any) -> None:
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            # no running loop: run to completion
+            asyncio.run(self.emit(event, **data))
+        else:
+            loop.create_task(self.emit(event, **data))
+
     def _cleanup(self):
         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
         if not attr_name:
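Note: _emit_bg is the usual fire-and-forget bridge from synchronous code into asyncio: schedule the coroutine on the running loop when one exists, otherwise run it to completion. A self-contained sketch of the same pattern, independent of sibi_dst:

    import asyncio

    async def emit(event: str, **data) -> None:
        print(event, data)  # stand-in for the real event emitter

    def emit_bg(event: str, **data) -> None:
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            asyncio.run(emit(event, **data))       # no running loop: block until the emit completes
        else:
            loop.create_task(emit(event, **data))  # loop running: schedule the task and return immediately

    emit_bg("load:start", message="called from sync code")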
@@ -213,6 +226,7 @@ class DfHelper(ManagedResource):
         timeout: Optional[float] = None,
         **options
     ) -> Union[pd.DataFrame, dd.DataFrame]:
+        await self.emit(f"{self.__class__.__name__} load:start", message=f"Pulling data from {self.backend} backend")
         # 1) Async load if available, else run sync load in a thread.
         if hasattr(self.backend_strategy, "aload"):
             load_awaitable = self.backend_strategy.aload(**options)
@@ -224,18 +238,20 @@ class DfHelper(ManagedResource):
         self.total_records = total

         # 2) Post-processing steps are sync; offload to threads.
+        await self.emit(event=f"{self.__class__.__name__} load:progress", message=f"Post-processing {len(df)} records")
         df = await asyncio.to_thread(self._process_loaded_data, df)
         df = await asyncio.to_thread(self._post_process_df, df)

         # 3) Persist and compute can block; offload when needed.
         if persist and _is_dask_df(df):
             df = await asyncio.to_thread(df.persist)
-
         if as_pandas and _is_dask_df(df):
             # Allow separate timeout for compute if desired; reuse same timeout here.
             compute_awaitable = asyncio.to_thread(df.compute)
             return await (asyncio.wait_for(compute_awaitable, timeout) if timeout else compute_awaitable)

+        await self.emit(event=f"{self.__class__.__name__} load:progress", message=f"Returning {len(df)} records")
+
         return df

     # ---------- dataframe post-processing ----------
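Note: with these emits awaited inline, aload() now publishes a load:start event before the backend load and load:progress events around post-processing. A hedged usage sketch (the async-context-manager support is provided by ManagedResource and is assumed here; constructor arguments beyond backend are placeholders):

    import asyncio

    async def main():
        async with DfHelper(backend="sqlalchemy") as helper:
            df = await helper.aload(as_pandas=True, timeout=120)
            print(len(df))

    asyncio.run(main())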
@@ -307,6 +323,11 @@ class DfHelper(ManagedResource):

         self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.", extra=self.logger_extra)

+    async def asave_to_parquet(self, df: dd.DataFrame, **kwargs):
+        await self.emit(event=f"{self.__class__.__name__} save:start", message=f"Saving {len(df)} records to parquet")
+        await asyncio.to_thread(self.save_to_parquet, df, **kwargs)
+        await self.emit(event=f"{self.__class__.__name__} save:end", message=f"Saved {len(df)} records to parquet")
+
     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
         if not self._has_any_rows(df):
             self.logger.warning("Skipping save to ClickHouse: The provided DataFrame is empty.", extra=self.logger_extra)
@@ -315,6 +336,11 @@ class DfHelper(ManagedResource):
             writer.save_to_clickhouse(df)
         self.logger.debug("Save to ClickHouse completed.", extra=self.logger_extra)

+    async def asave_to_clickhouse(self, df: dd.DataFrame, **credentials):
+        await self.emit(event=f"{self.__class__.__name__} save:start", message=f"Saving {len(df)} records to ClickHouse")
+        await asyncio.to_thread(self.save_to_clickhouse, df, **credentials)
+        await self.emit(event=f"{self.__class__.__name__} save:end", message=f"Saved {len(df)} records to ClickHouse")
+
     # ---------- period loaders ----------
     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
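Note: both async save helpers follow the same wrapper shape: emit save:start, offload the existing synchronous sink with asyncio.to_thread, then emit save:end. A usage sketch (the filename and ClickHouse credentials below are placeholders, not values from this package):

    async def export(helper, df):
        await helper.asave_to_parquet(df, parquet_filename="example.parquet")
        await helper.asave_to_clickhouse(df, host="localhost", database="analytics")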
@@ -348,3 +374,355 @@ class DfHelper(ManagedResource):
             return False


+
+# BEFORE SSE Handling
+# from __future__ import annotations
+#
+# import asyncio
+# from typing import Any, Dict, Optional, TypeVar, Union
+#
+# import dask.dataframe as dd
+# import pandas as pd
+# from fsspec import AbstractFileSystem
+# from pydantic import BaseModel
+#
+# from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
+# from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
+# from .backends.http import HttpConfig
+# from .backends.parquet import ParquetConfig
+# from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
+#
+# T = TypeVar("T", bound=BaseModel)
+#
+# def _is_dask_df(x) -> bool:
+#     return isinstance(x, dd.DataFrame)
+#
+# def _maybe_persist(df, persist: bool):
+#     return df.persist() if persist and _is_dask_df(df) else df
+#
+# def _maybe_compute(df, as_pandas: bool):
+#     return df.compute() if as_pandas and _is_dask_df(df) else df
+#
+#
+# # ---- Backend Strategy Pattern ----
+# class BaseBackend:
+#     def __init__(self, helper: "DfHelper"):
+#         self.helper = helper
+#         self.logger = helper.logger
+#         self.debug = helper.debug
+#         self.total_records = -1
+#
+#     def load(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
+#         raise NotImplementedError
+#
+#     async def aload(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
+#         return await asyncio.to_thread(self.load, **options)
+#
+#
+# class SqlAlchemyBackend(BaseBackend):
+#     def load(self, **options):
+#         try:
+#             if options and hasattr(self.helper._backend_params, "parse_params"):
+#                 self.helper._backend_params.parse_params(options)
+#
+#             with SqlAlchemyLoadFromDb(
+#                 plugin_sqlalchemy=self.helper.backend_db_connection,
+#                 plugin_query=self.helper._backend_query,
+#                 plugin_params=self.helper._backend_params,
+#                 logger=self.logger,
+#                 debug=self.debug,
+#             ) as db_loader:
+#                 self.total_records, result = db_loader.build_and_load()
+#                 return self.total_records, result
+#         except Exception as e:
+#             self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
+#             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#
+#
+# class ParquetBackend(BaseBackend):
+#     def load(self, **options):
+#         try:
+#             df = self.helper.backend_parquet.load_files(**options)
+#             if not self.helper._has_any_rows(df):
+#                 self.total_records = 0
+#                 return 0, self._empty_like(df)
+#
+#             # Let DfHelper decide about persist
+#             self.total_records = -1  # unknown without full count
+#             return self.total_records, df
+#
+#         except Exception as e:
+#             self.total_records = -1  # Reset total_records on failure
+#             self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
+#             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#
+#     @staticmethod
+#     def _empty_like(ddf):
+#         empty_pdf = ddf._meta.iloc[0:0]
+#         return dd.from_pandas(empty_pdf, npartitions=1)
+#
+#
+# class HttpBackend(BaseBackend):
+#     def load(self, **options):
+#         # Avoid event-loop problems in sync code paths.
+#         # If someone calls .load() on an async backend, make it explicit.
+#         raise RuntimeError(
+#             "HttpBackend.load() is sync but this backend is async-only. "
+#             "Call `await helper.aload(...)` or `await helper.load_async(prefer_native=True, ...)`."
+#         )
+#
+#     async def aload(self, **options):
+#         if not self.helper.backend_http:
+#             self.logger.warning("HTTP plugin not configured properly.", extra=self.helper.logger_extra)
+#             self.total_records = -1
+#             return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#
+#         result = await self.helper.backend_http.fetch_data(**options)
+#
+#         # Normalize to DataFrame if the plugin returns list/dict
+#         if isinstance(result, (list, dict)):
+#             pdf = pd.DataFrame(result)
+#             ddf = dd.from_pandas(pdf, npartitions=max(1, min(32, len(pdf) // 50_000 or 1)))
+#             self.total_records = len(pdf)
+#             return self.total_records, ddf
+#
+#         if isinstance(result, pd.DataFrame):
+#             self.total_records = len(result)
+#             ddf = dd.from_pandas(result, npartitions=max(1, min(32, len(result) // 50_000 or 1)))
+#             return self.total_records, ddf
+#
+#         # Fallback
+#         self.total_records = -1
+#         return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#
+#
+# class DfHelper(ManagedResource):
+#     _BACKEND_STRATEGIES = {
+#         "sqlalchemy": SqlAlchemyBackend,
+#         "parquet": ParquetBackend,
+#         "http": HttpBackend,
+#     }
+#
+#     _BACKEND_ATTR_MAP = {
+#         "sqlalchemy": "backend_db_connection",
+#         "parquet": "backend_parquet",
+#         "http": "backend_http",
+#     }
+#
+#     default_config: Dict[str, Any] = None
+#     logger_extra: Dict[str, Any] = {"sibi_dst_component": __name__}
+#
+#     def __init__(self, backend="sqlalchemy", **kwargs):
+#         self.default_config = self.default_config or {}
+#         kwargs = {**self.default_config.copy(), **kwargs}
+#         super().__init__(**kwargs)
+#         self.backend = backend
+#
+#         # Ensure defaults flow to plugin configs
+#         kwargs.setdefault("debug", self.debug)
+#         kwargs.setdefault("fs", self.fs)
+#         kwargs.setdefault("logger", self.logger)
+#
+#         self.total_records = -1
+#         self._backend_query = self._get_config(QueryConfig, kwargs)
+#         self._backend_params = self._get_config(ParamsConfig, kwargs)
+#
+#         self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
+#         self.backend_parquet: Optional[ParquetConfig] = None
+#         self.backend_http: Optional[HttpConfig] = None
+#
+#         if self.backend == "sqlalchemy":
+#             self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
+#         elif self.backend == "parquet":
+#             self.backend_parquet = self._get_config(ParquetConfig, kwargs)
+#         elif self.backend == "http":
+#             self.backend_http = self._get_config(HttpConfig, kwargs)
+#
+#         strategy_cls = self._BACKEND_STRATEGIES.get(self.backend)
+#         if not strategy_cls:
+#             raise ValueError(f"Unsupported backend: {self.backend}")
+#         self.backend_strategy = strategy_cls(self)
+#
+#     # ---------- ManagedResource hooks ----------
+#     def _cleanup(self):
+#         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+#         if not attr_name:
+#             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
+#             return
+#         active_config = getattr(self, attr_name, None)
+#         if active_config and hasattr(active_config, "close"):
+#             self.logger.debug(f"{self.__class__.__name__} is closing resources for backend '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
+#             active_config.close()
+#
+#     async def _acleanup(self):
+#         self.logger.warning(
+#             "DfHelper instance was not used in an async context manager; cleanup is being called manually.",
+#             extra=self.logger_extra,
+#         )
+#         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+#         if not attr_name:
+#             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
+#             return
+#         active_config = getattr(self, attr_name, None)
+#         if active_config and hasattr(active_config, "aclose"):
+#             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
+#             await active_config.aclose()
+#
+#     # ---------- config helpers ----------
+#     def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
+#         recognized = set(model.model_fields.keys())
+#         model_kwargs = {k: kwargs[k] for k in recognized if k in kwargs}
+#         return model(**model_kwargs)
+#
+#     # ---------- load/aload ----------
+#     def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+#         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}", extra=self.logger_extra)
+#         self.total_records, df = self.backend_strategy.load(**options)
+#         df = self._process_loaded_data(df)
+#         df = self._post_process_df(df)
+#         df = _maybe_persist(df, persist)
+#         return _maybe_compute(df, as_pandas)
+#
+#     async def aload(
+#         self,
+#         *,
+#         persist: bool = False,
+#         as_pandas: bool = False,
+#         timeout: Optional[float] = None,
+#         **options
+#     ) -> Union[pd.DataFrame, dd.DataFrame]:
+#         # 1) Async load if available, else run sync load in a thread.
+#         if hasattr(self.backend_strategy, "aload"):
+#             load_awaitable = self.backend_strategy.aload(**options)
+#         else:
+#             # Run ONLY the backend load step in a thread to avoid event-loop blocking.
+#             load_awaitable = asyncio.to_thread(self.backend_strategy.load, **options)
+#
+#         total, df = await (asyncio.wait_for(load_awaitable, timeout) if timeout else load_awaitable)
+#         self.total_records = total
+#
+#         # 2) Post-processing steps are sync; offload to threads.
+#         df = await asyncio.to_thread(self._process_loaded_data, df)
+#         df = await asyncio.to_thread(self._post_process_df, df)
+#
+#         # 3) Persist and compute can block; offload when needed.
+#         if persist and _is_dask_df(df):
+#             df = await asyncio.to_thread(df.persist)
+#
+#         if as_pandas and _is_dask_df(df):
+#             # Allow separate timeout for compute if desired; reuse same timeout here.
+#             compute_awaitable = asyncio.to_thread(df.compute)
+#             return await (asyncio.wait_for(compute_awaitable, timeout) if timeout else compute_awaitable)
+#
+#         return df
+#
+#     # ---------- dataframe post-processing ----------
+#     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
+#         self.logger.debug(f"{self.__class__.__name__} is post-processing resulting dataframe with {len(df)} records.", extra=self.logger_extra)
+#         df_params = self._backend_params.df_params
+#         if not df_params:
+#             return df
+#         fieldnames = df_params.get("fieldnames")
+#         column_names = df_params.get("column_names")
+#         index_col = df_params.get("index_col")
+#
+#         if fieldnames:
+#             valid = [f for f in fieldnames if f in df.columns]
+#             if len(valid) < len(fieldnames):
+#                 self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}", extra=self.logger_extra)
+#             df = df[valid]
+#         if column_names:
+#             if len(df.columns) != len(column_names):
+#                 raise ValueError(
+#                     f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided."
+#                 )
+#             df = df.rename(columns=dict(zip(df.columns, column_names)))
+#         if index_col:
+#             if index_col not in df.columns:
+#                 raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
+#             df = df.set_index(index_col)
+#
+#         self.logger.debug("Post-processing complete.", extra=self.logger_extra)
+#         return df
+#
+#     def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
+#         field_map = self._backend_params.field_map or {}
+#         if not isinstance(field_map, dict) or not field_map:
+#             return df
+#         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
+#             return df
+#         self.logger.debug(f"{self.__class__.__name__} is applying rename mapping if/when necessary.", extra=self.logger_extra)
+#         rename_map = {k: v for k, v in field_map.items() if k in df.columns}
+#         if rename_map:
+#             df = df.rename(columns=rename_map)
+#         return df
+#
+#     # ---------- sinks ----------
+#     def save_to_parquet(self, df: dd.DataFrame, **kwargs):
+#         fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
+#         path: str = kwargs.pop("parquet_storage_path", self.backend_parquet.parquet_storage_path if self.backend_parquet else None)
+#         parquet_filename = kwargs.pop("parquet_filename", self.backend_parquet.parquet_filename if self.backend_parquet else None)
+#         if not parquet_filename:
+#             raise ValueError("A 'parquet_filename' keyword argument must be provided.")
+#         if not fs:
+#             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
+#         if not path:
+#             raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
+#         if not self._has_any_rows(df):
+#             self.logger.warning("Skipping save: The provided DataFrame is empty.", extra=self.logger_extra)
+#             return
+#
+#         with ParquetSaver(
+#             df_result=df,
+#             parquet_storage_path=path,
+#             fs=fs,
+#             debug=self.debug,
+#             logger=self.logger,
+#             verbose=self.verbose,
+#             **kwargs,
+#         ) as saver:
+#             saver.save_to_parquet(parquet_filename)
+#
+#         self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.", extra=self.logger_extra)
+#
+#     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
+#         if not self._has_any_rows(df):
+#             self.logger.warning("Skipping save to ClickHouse: The provided DataFrame is empty.", extra=self.logger_extra)
+#             return
+#         with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
+#             writer.save_to_clickhouse(df)
+#         self.logger.debug("Save to ClickHouse completed.", extra=self.logger_extra)
+#
+#     # ---------- period loaders ----------
+#     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
+#         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+#         return self.load(**final_kwargs)
+#
+#     async def aload_period(self, dt_field: str, start: str, end: str, **kwargs):
+#         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+#         return await self.aload(**final_kwargs)
+#
+#     def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
+#         start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
+#         if start_date > end_date:
+#             raise ValueError("'start' date cannot be later than 'end' date.")
+#         field_map = self._backend_params.field_map or {}
+#         reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
+#         if len(reverse_map) != len(field_map):
+#             self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.", extra=self.logger_extra)
+#         mapped_field = reverse_map.get(dt_field, dt_field)
+#         if start_date == end_date:
+#             kwargs[f"{mapped_field}__date"] = start_date
+#         else:
+#             kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
+#         self.logger.debug(f"Period load generated filters: {kwargs}", extra=self.logger_extra)
+#         return kwargs
+#
+#     @staticmethod
+#     def _has_any_rows(ddf: dd.DataFrame) -> bool:
+#         try:
+#             return bool(ddf.head(1, npartitions=-1).shape[0])
+#         except Exception:
+#             return False
+#
+#

sibi_dst/df_helper/backends/parquet/_parquet_options.py
CHANGED
@@ -205,6 +205,8 @@ class ParquetConfig(BaseModel):
             filesystem=self.fs,
             filters=pq_filters,
             # Toggle based on file count; False is safer for many tiny files.
+            aggregate_files=True,
+            split_row_groups=True,
             gather_statistics=False,
             ignore_metadata_file=True,
         )
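Note: both new flags are forwarded to Dask's parquet reader; aggregate_files=True lets Dask combine many small files into fewer partitions, while split_row_groups=True lets it split large files at row-group boundaries. A standalone sketch of the same reader options (the path is a placeholder; newer Dask releases may deprecate some of these flags):

    import dask.dataframe as dd

    ddf = dd.read_parquet(
        "s3://bucket/dataset/",   # placeholder path
        aggregate_files=True,     # combine small files into larger partitions
        split_row_groups=True,    # split big files at row-group boundaries
        ignore_metadata_file=True,
    )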