sibi-dst 2025.8.8__py3-none-any.whl → 2025.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -137,6 +137,7 @@ class DfHelper(ManagedResource):
     def __init__(self, backend="sqlalchemy", **kwargs):
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
+        kwargs.setdefault("auto_sse", True)
         super().__init__(**kwargs)
         self.backend = backend
 
@@ -166,6 +167,18 @@ class DfHelper(ManagedResource):
         self.backend_strategy = strategy_cls(self)
 
     # ---------- ManagedResource hooks ----------
+    def get_sse(self):
+        return self._ensure_sse()
+
+    def _emit_bg(self, event: str, **data: Any) -> None:
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            # no running loop: run to completion
+            asyncio.run(self.emit(event, **data))
+        else:
+            loop.create_task(self.emit(event, **data))
+
     def _cleanup(self):
         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
         if not attr_name:
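
The new _emit_bg helper follows the common "schedule if a loop is already running, otherwise run to completion" pattern for fire-and-forget emission. A minimal standalone sketch of that pattern, with an illustrative notify coroutine standing in for DfHelper.emit (the names in this sketch are not part of the package):

    import asyncio

    async def notify(event: str, **data) -> None:
        # Stand-in for an SSE emitter such as DfHelper.emit().
        print(event, data)

    def emit_in_background(event: str, **data) -> None:
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No loop in this thread: block until the coroutine finishes.
            asyncio.run(notify(event, **data))
        else:
            # Already inside a loop: schedule without awaiting the result
            # (like _emit_bg, the task reference is not retained).
            loop.create_task(notify(event, **data))
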
@@ -213,6 +226,7 @@ class DfHelper(ManagedResource):
         timeout: Optional[float] = None,
         **options
     ) -> Union[pd.DataFrame, dd.DataFrame]:
+        await self.emit(f"{self.__class__.__name__} load:start", message=f"Pulling data from {self.backend} backend")
         # 1) Async load if available, else run sync load in a thread.
         if hasattr(self.backend_strategy, "aload"):
             load_awaitable = self.backend_strategy.aload(**options)
@@ -224,18 +238,20 @@ class DfHelper(ManagedResource):
         self.total_records = total
 
         # 2) Post-processing steps are sync; offload to threads.
+        await self.emit(event=f"{self.__class__.__name__} load:progress", message=f"Post-processing {len(df)} records")
         df = await asyncio.to_thread(self._process_loaded_data, df)
         df = await asyncio.to_thread(self._post_process_df, df)
 
         # 3) Persist and compute can block; offload when needed.
         if persist and _is_dask_df(df):
             df = await asyncio.to_thread(df.persist)
-
         if as_pandas and _is_dask_df(df):
             # Allow separate timeout for compute if desired; reuse same timeout here.
             compute_awaitable = asyncio.to_thread(df.compute)
             return await (asyncio.wait_for(compute_awaitable, timeout) if timeout else compute_awaitable)
 
+        await self.emit(event=f"{self.__class__.__name__} load:progress", message=f"Returning {len(df)} records")
+
         return df
 
     # ---------- dataframe post-processing ----------
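
Taken together, aload() now emits a "load:start" event before dispatching to the backend and "load:progress" events around post-processing and before returning. A rough usage sketch, assuming DfHelper is importable from sibi_dst.df_helper and that backend-specific configuration kwargs (omitted here) would be supplied in real use:

    import asyncio
    from sibi_dst.df_helper import DfHelper  # assumed import path

    async def main() -> None:
        helper = DfHelper(backend="parquet")  # real usage passes backend config kwargs
        # Emits "<ClassName> load:start" / "load:progress" SSE events while running.
        df = await helper.aload(persist=True, as_pandas=True, timeout=120)
        print(helper.total_records, len(df))

    asyncio.run(main())
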
@@ -307,6 +323,11 @@ class DfHelper(ManagedResource):
 
         self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.", extra=self.logger_extra)
 
+    async def asave_to_parquet(self, df: dd.DataFrame, **kwargs):
+        await self.emit(event=f"{self.__class__.__name__} save:start", message=f"Saving {len(df)} records to parquet")
+        await asyncio.to_thread(self.save_to_parquet, df, **kwargs)
+        await self.emit(event=f"{self.__class__.__name__} save:end", message=f"Saved {len(df)} records to parquet")
+
     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
         if not self._has_any_rows(df):
             self.logger.warning("Skipping save to ClickHouse: The provided DataFrame is empty.", extra=self.logger_extra)
@@ -315,6 +336,11 @@ class DfHelper(ManagedResource):
             writer.save_to_clickhouse(df)
         self.logger.debug("Save to ClickHouse completed.", extra=self.logger_extra)
 
+    async def asave_to_clickhouse(self, df: dd.DataFrame, **credentials):
+        await self.emit(event=f"{self.__class__.__name__} save:start", message=f"Saving {len(df)} records to ClickHouse")
+        await asyncio.to_thread(self.save_to_clickhouse, df, **credentials)
+        await self.emit(event=f"{self.__class__.__name__} save:end", message=f"Saved {len(df)} records to ClickHouse")
+
     # ---------- period loaders ----------
     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
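
A companion sketch for the new async sinks: each wrapper offloads the existing blocking save to a worker thread via asyncio.to_thread and brackets it with "save:start"/"save:end" events. The parquet path, filename, and ClickHouse credential keys below are placeholders, not values taken from this package:

    async def export(helper, df) -> None:
        await helper.asave_to_parquet(
            df,
            parquet_storage_path="s3://bucket/exports",  # placeholder path
            parquet_filename="snapshot.parquet",         # placeholder name
        )
        await helper.asave_to_clickhouse(
            df,
            host="localhost",      # placeholder; forwarded via **credentials
            database="analytics",  # placeholder
        )
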
@@ -348,3 +374,355 @@ class DfHelper(ManagedResource):
             return False
 
 
+
+# BEFORE SSE Handling
+# from __future__ import annotations
+#
+# import asyncio
+# from typing import Any, Dict, Optional, TypeVar, Union
+#
+# import dask.dataframe as dd
+# import pandas as pd
+# from fsspec import AbstractFileSystem
+# from pydantic import BaseModel
+#
+# from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
+# from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
+# from .backends.http import HttpConfig
+# from .backends.parquet import ParquetConfig
+# from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
+#
+# T = TypeVar("T", bound=BaseModel)
+#
+# def _is_dask_df(x) -> bool:
+#     return isinstance(x, dd.DataFrame)
+#
+# def _maybe_persist(df, persist: bool):
+#     return df.persist() if persist and _is_dask_df(df) else df
+#
+# def _maybe_compute(df, as_pandas: bool):
+#     return df.compute() if as_pandas and _is_dask_df(df) else df
+#
+#
+# # ---- Backend Strategy Pattern ----
+# class BaseBackend:
+#     def __init__(self, helper: "DfHelper"):
+#         self.helper = helper
+#         self.logger = helper.logger
+#         self.debug = helper.debug
+#         self.total_records = -1
+#
+#     def load(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
+#         raise NotImplementedError
+#
+#     async def aload(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
+#         return await asyncio.to_thread(self.load,**options)
+#
+#
+# class SqlAlchemyBackend(BaseBackend):
+#     def load(self, **options):
+#         try:
+#             if options and hasattr(self.helper._backend_params, "parse_params"):
+#                 self.helper._backend_params.parse_params(options)
+#
+#             with SqlAlchemyLoadFromDb(
+#                 plugin_sqlalchemy=self.helper.backend_db_connection,
+#                 plugin_query=self.helper._backend_query,
+#                 plugin_params=self.helper._backend_params,
+#                 logger=self.logger,
+#                 debug=self.debug,
+#             ) as db_loader:
+#                 self.total_records, result = db_loader.build_and_load()
+#                 return self.total_records, result
+#         except Exception as e:
+#             self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
+#             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#
+#
+# class ParquetBackend(BaseBackend):
+#     def load(self, **options):
+#         try:
+#             df = self.helper.backend_parquet.load_files(**options)
+#             if not self.helper._has_any_rows(df):
+#                 self.total_records = 0
+#                 return 0, self._empty_like(df)
+#
+#             # Let DfHelper decide about persist
+#             self.total_records = -1 # unknown without full count
+#             return self.total_records, df
+#
+#         except Exception as e:
+#             self.total_records = -1 # Reset total_records on failure
+#             self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug, extra=self.helper.logger_extra)
+#             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#
+#     @staticmethod
+#     def _empty_like(ddf):
+#         empty_pdf = ddf._meta.iloc[0:0]
+#         return dd.from_pandas(empty_pdf, npartitions=1)
+#
+#
+# class HttpBackend(BaseBackend):
+#     def load(self, **options):
+#         # Avoid event-loop problems in sync code paths.
+#         # If someone calls .load() on an async backend, make it explicit.
+#         raise RuntimeError(
+#             "HttpBackend.load() is sync but this backend is async-only. "
+#             "Call `await helper.aload(...)` or `await helper.load_async(prefer_native=True, ...)`."
+#         )
+#
+#     async def aload(self, **options):
+#         if not self.helper.backend_http:
+#             self.logger.warning("HTTP plugin not configured properly.", extra=self.helper.logger_extra)
+#             self.total_records = -1
+#             return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#
+#         result = await self.helper.backend_http.fetch_data(**options)
+#
+#         # Normalize to DataFrame if the plugin returns list/dict
+#         if isinstance(result, (list, dict)):
+#             pdf = pd.DataFrame(result)
+#             ddf = dd.from_pandas(pdf, npartitions=max(1, min(32, len(pdf) // 50_000 or 1)))
+#             self.total_records = len(pdf)
+#             return self.total_records, ddf
+#
+#         if isinstance(result, pd.DataFrame):
+#             self.total_records = len(result)
+#             ddf = dd.from_pandas(result, npartitions=max(1, min(32, len(result) // 50_000 or 1)))
+#             return self.total_records, ddf
+#
+#         # Fallback
+#         self.total_records = -1
+#         return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#
+#
+# class DfHelper(ManagedResource):
+#     _BACKEND_STRATEGIES = {
+#         "sqlalchemy": SqlAlchemyBackend,
+#         "parquet": ParquetBackend,
+#         "http": HttpBackend,
+#     }
+#
+#     _BACKEND_ATTR_MAP = {
+#         "sqlalchemy": "backend_db_connection",
+#         "parquet": "backend_parquet",
+#         "http": "backend_http",
+#     }
+#
+#     default_config: Dict[str, Any] = None
+#     logger_extra: Dict[str, Any] = {"sibi_dst_component": __name__}
+#
+#     def __init__(self, backend="sqlalchemy", **kwargs):
+#         self.default_config = self.default_config or {}
+#         kwargs = {**self.default_config.copy(), **kwargs}
+#         super().__init__(**kwargs)
+#         self.backend = backend
+#
+#         # Ensure defaults flow to plugin configs
+#         kwargs.setdefault("debug", self.debug)
+#         kwargs.setdefault("fs", self.fs)
+#         kwargs.setdefault("logger", self.logger)
+#
+#         self.total_records = -1
+#         self._backend_query = self._get_config(QueryConfig, kwargs)
+#         self._backend_params = self._get_config(ParamsConfig, kwargs)
+#
+#         self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
+#         self.backend_parquet: Optional[ParquetConfig] = None
+#         self.backend_http: Optional[HttpConfig] = None
+#
+#         if self.backend == "sqlalchemy":
+#             self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
+#         elif self.backend == "parquet":
+#             self.backend_parquet = self._get_config(ParquetConfig, kwargs)
+#         elif self.backend == "http":
+#             self.backend_http = self._get_config(HttpConfig, kwargs)
+#
+#         strategy_cls = self._BACKEND_STRATEGIES.get(self.backend)
+#         if not strategy_cls:
+#             raise ValueError(f"Unsupported backend: {self.backend}")
+#         self.backend_strategy = strategy_cls(self)
+#
+#     # ---------- ManagedResource hooks ----------
+#     def _cleanup(self):
+#         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+#         if not attr_name:
+#             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
+#             return
+#         active_config = getattr(self, attr_name, None)
+#         if active_config and hasattr(active_config, "close"):
+#             self.logger.debug(f"{self.__class__.__name__} is closing resources for backend '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
+#             active_config.close()
+#
+#     async def _acleanup(self):
+#         self.logger.warning(
+#             "DfHelper instance was not used in an async context manager; cleanup is being called manually.",
+#             extra=self.logger_extra,
+#         )
+#         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+#         if not attr_name:
+#             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.", extra=self.logger_extra)
+#             return
+#         active_config = getattr(self, attr_name, None)
+#         if active_config and hasattr(active_config, "aclose"):
+#             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.", extra=self.logger_extra)
+#             await active_config.aclose()
+#
+#     # ---------- config helpers ----------
+#     def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
+#         recognized = set(model.model_fields.keys())
+#         model_kwargs = {k: kwargs[k] for k in recognized if k in kwargs}
+#         return model(**model_kwargs)
+#
+#     # ---------- load/aload ----------
+#     def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+#         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}", extra=self.logger_extra)
+#         self.total_records, df = self.backend_strategy.load(**options)
+#         df = self._process_loaded_data(df)
+#         df = self._post_process_df(df)
+#         df = _maybe_persist(df, persist)
+#         return _maybe_compute(df, as_pandas)
+#
+#     async def aload(
+#         self,
+#         *,
+#         persist: bool = False,
+#         as_pandas: bool = False,
+#         timeout: Optional[float] = None,
+#         **options
+#     ) -> Union[pd.DataFrame, dd.DataFrame]:
+#         # 1) Async load if available, else run sync load in a thread.
+#         if hasattr(self.backend_strategy, "aload"):
+#             load_awaitable = self.backend_strategy.aload(**options)
+#         else:
+#             # Run ONLY the backend load step in a thread to avoid event-loop blocking.
+#             load_awaitable = asyncio.to_thread(self.backend_strategy.load, **options)
+#
+#         total, df = await (asyncio.wait_for(load_awaitable, timeout) if timeout else load_awaitable)
+#         self.total_records = total
+#
+#         # 2) Post-processing steps are sync; offload to threads.
+#         df = await asyncio.to_thread(self._process_loaded_data, df)
+#         df = await asyncio.to_thread(self._post_process_df, df)
+#
+#         # 3) Persist and compute can block; offload when needed.
+#         if persist and _is_dask_df(df):
+#             df = await asyncio.to_thread(df.persist)
+#
+#         if as_pandas and _is_dask_df(df):
+#             # Allow separate timeout for compute if desired; reuse same timeout here.
+#             compute_awaitable = asyncio.to_thread(df.compute)
+#             return await (asyncio.wait_for(compute_awaitable, timeout) if timeout else compute_awaitable)
+#
+#         return df
+#
+#     # ---------- dataframe post-processing ----------
+#     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
+#         self.logger.debug(f"{self.__class__.__name__} is post-processing resulting dataframe with {len(df)} records.", extra=self.logger_extra)
+#         df_params = self._backend_params.df_params
+#         if not df_params:
+#             return df
+#         fieldnames = df_params.get("fieldnames")
+#         column_names = df_params.get("column_names")
+#         index_col = df_params.get("index_col")
+#
+#         if fieldnames:
+#             valid = [f for f in fieldnames if f in df.columns]
+#             if len(valid) < len(fieldnames):
+#                 self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}", extra=self.logger_extra)
+#             df = df[valid]
+#         if column_names:
+#             if len(df.columns) != len(column_names):
+#                 raise ValueError(
+#                     f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided."
+#                 )
+#             df = df.rename(columns=dict(zip(df.columns, column_names)))
+#         if index_col:
+#             if index_col not in df.columns:
+#                 raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
+#             df = df.set_index(index_col)
+#
+#         self.logger.debug("Post-processing complete.", extra=self.logger_extra)
+#         return df
+#
+#     def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
+#         field_map = self._backend_params.field_map or {}
+#         if not isinstance(field_map, dict) or not field_map:
+#             return df
+#         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
+#             return df
+#         self.logger.debug(f"{self.__class__.__name__} is applying rename mapping if/when necessary.", extra=self.logger_extra)
+#         rename_map = {k: v for k, v in field_map.items() if k in df.columns}
+#         if rename_map:
+#             df = df.rename(columns=rename_map)
+#         return df
+#
+#     # ---------- sinks ----------
+#     def save_to_parquet(self, df: dd.DataFrame, **kwargs):
+#         fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
+#         path: str = kwargs.pop("parquet_storage_path", self.backend_parquet.parquet_storage_path if self.backend_parquet else None)
+#         parquet_filename = kwargs.pop("parquet_filename", self.backend_parquet.parquet_filename if self.backend_parquet else None)
+#         if not parquet_filename:
+#             raise ValueError("A 'parquet_filename' keyword argument must be provided.")
+#         if not fs:
+#             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
+#         if not path:
+#             raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
+#         if not self._has_any_rows(df):
+#             self.logger.warning("Skipping save: The provided DataFrame is empty.", extra=self.logger_extra)
+#             return
+#
+#         with ParquetSaver(
+#             df_result=df,
+#             parquet_storage_path=path,
+#             fs=fs,
+#             debug=self.debug,
+#             logger=self.logger,
+#             verbose=self.verbose,
+#             **kwargs,
+#         ) as saver:
+#             saver.save_to_parquet(parquet_filename)
+#
+#         self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.", extra=self.logger_extra)
+#
+#     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
+#         if not self._has_any_rows(df):
+#             self.logger.warning("Skipping save to ClickHouse: The provided DataFrame is empty.", extra=self.logger_extra)
+#             return
+#         with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
+#             writer.save_to_clickhouse(df)
+#         self.logger.debug("Save to ClickHouse completed.", extra=self.logger_extra)
+#
+#     # ---------- period loaders ----------
+#     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
+#         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+#         return self.load(**final_kwargs)
+#
+#     async def aload_period(self, dt_field: str, start: str, end: str, **kwargs):
+#         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+#         return await self.aload(**final_kwargs)
+#
+#     def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
+#         start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
+#         if start_date > end_date:
+#             raise ValueError("'start' date cannot be later than 'end' date.")
+#         field_map = self._backend_params.field_map or {}
+#         reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
+#         if len(reverse_map) != len(field_map):
+#             self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.", extra=self.logger_extra)
+#         mapped_field = reverse_map.get(dt_field, dt_field)
+#         if start_date == end_date:
+#             kwargs[f"{mapped_field}__date"] = start_date
+#         else:
+#             kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
+#         self.logger.debug(f"Period load generated filters: {kwargs}", extra=self.logger_extra)
+#         return kwargs
+#
+#     @staticmethod
+#     def _has_any_rows(ddf: dd.DataFrame) -> bool:
+#         try:
+#             return bool(ddf.head(1, npartitions=-1).shape[0])
+#         except Exception:
+#             return False
+#
+#
@@ -205,6 +205,8 @@ class ParquetConfig(BaseModel):
             filesystem=self.fs,
             filters=pq_filters,
             # Toggle based on file count; False is safer for many tiny files.
+            aggregate_files=True,
+            split_row_groups=True,
             gather_statistics=False,
             ignore_metadata_file=True,
         )
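
For context, aggregate_files and split_row_groups are pass-through options to dask.dataframe.read_parquet. A standalone sketch of an equivalent call, with a placeholder path and filter (the surrounding ParquetConfig plumbing is omitted):

    import dask.dataframe as dd

    # Placeholder path and filter; ParquetConfig normally supplies these values.
    ddf = dd.read_parquet(
        "s3://bucket/dataset/",
        filters=[("partition_date", ">=", "2025-01-01")],
        aggregate_files=True,      # coalesce many small files into fewer partitions
        split_row_groups=True,     # allow splitting large row groups across partitions
        gather_statistics=False,
        ignore_metadata_file=True,
    )
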