sibi_dst-2025.8.3-py3-none-any.whl → sibi_dst-2025.8.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +12 -304
- sibi_dst/df_helper/backends/parquet/__init__.py +0 -2
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +57 -16
- sibi_dst/df_helper/core/_filter_handler.py +212 -79
- {sibi_dst-2025.8.3.dist-info → sibi_dst-2025.8.5.dist-info}/METADATA +1 -1
- {sibi_dst-2025.8.3.dist-info → sibi_dst-2025.8.5.dist-info}/RECORD +7 -8
- sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -126
- {sibi_dst-2025.8.3.dist-info → sibi_dst-2025.8.5.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
 from fsspec import AbstractFileSystem
 from pydantic import BaseModel
 
-from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
+from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
 from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
 from .backends.http import HttpConfig
 from .backends.parquet import ParquetConfig
@@ -54,16 +54,13 @@ class SqlAlchemyBackend(BaseBackend):
 class ParquetBackend(BaseBackend):
     def load(self, **options):
         try:
-            df = self.helper.backend_parquet.load_files()
+            df = self.helper.backend_parquet.load_files(**options)
             if self._is_empty(df):
                 return -1, self._empty_like(df)
-
-            if
-
-
-            if nrows == 0:
-                self.logger.debug("No records after filters; returning empty DataFrame.")
-                return 0, self._empty_like(df)
+            nrows = self._row_count(df)
+            if nrows == 0:
+                self.logger.debug("No records after filters; returning empty DataFrame.")
+                return 0, self._empty_like(df)
 
             df = df.persist()
             self.total_records = self._row_count(df) or -1
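
The practical effect of forwarding **options is that the Django-style filter keywords accepted by DfHelper.load() now reach the parquet layer instead of being ignored. A minimal sketch under that assumption (the field name is hypothetical, not taken from the package docs):

    # Hypothetical call: the kwarg travels DfHelper.load -> ParquetBackend.load
    # -> ParquetConfig.load_files(**options), where it can become a pyarrow
    # predicate or a residual Dask mask.
    total_records, ddf = helper.backend_strategy.load(order_date__date="2025-08-01")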
@@ -185,18 +182,20 @@ class DfHelper(ManagedResource):
         return model(**model_kwargs)
 
     # ---------- load/aload ----------
-    def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
         self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
-        self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
+        #self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
+        df = df.persist() if persist else df
         return df.compute() if as_pandas else df
 
-    async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    async def aload(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.total_records, df = await self.backend_strategy.aload(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
+        df = df.persist() if persist else df
         return df.compute() if as_pandas else df
 
     # ---------- dataframe post-processing ----------
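
A short usage sketch of the new keyword-only parameters; the constructor arguments and filter fields below are illustrative assumptions, not taken from the package documentation:

    # Sketch: DfHelper with the parquet backend (constructor kwargs assumed).
    helper = DfHelper(backend="parquet", parquet_storage_path="s3://bucket/data", fs=fs)

    # Lazy Dask DataFrame, optionally persisted in cluster memory.
    ddf = helper.load(persist=True, order_date__date__range=["2025-01-01", "2025-01-31"])

    # Same call, materialized as pandas.
    pdf = helper.load(as_pandas=True, order_date__date__range=["2025-01-01", "2025-01-31"])

    # The async variant mirrors the signature.
    ddf = await helper.aload(persist=True, status__in=["open", "pending"])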
@@ -298,295 +297,4 @@ class DfHelper(ManagedResource):
         self.logger.debug(f"Period load generated filters: {kwargs}")
         return kwargs
 
-
-#
-# from typing import Any, Dict, Optional, Union, TypeVar
-#
-# import dask.dataframe as dd
-# import pandas as pd
-# from fsspec import AbstractFileSystem
-# from pydantic import BaseModel
-#
-# from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
-# from sibi_dst.utils import ManagedResource
-# from sibi_dst.utils import ParquetSaver, ClickHouseWriter
-# from .backends.http import HttpConfig
-# from .backends.parquet import ParquetConfig
-# from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
-#
-# T = TypeVar("T", bound=BaseModel)
-#
-#
-# # --- Backend Strategy Pattern Implementation ---
-#
-# class BaseBackend:
-#     """Abstract base class defining clear sync and async loading interfaces."""
-#
-#     def __init__(self, helper: DfHelper):
-#         self.helper = helper
-#         self.logger = helper.logger
-#         self.debug = helper.debug
-#         self.total_records = helper.total_records  # no records loaded yet
-#
-#     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-#         """Synchronous data loading method. Must be implemented by sync backends."""
-#         raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
-#
-#     async def aload(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-#         """Asynchronous data loading method. By default, it calls the sync version."""
-#         return self.load(**options)
-#
-#
-# class SqlAlchemyBackend(BaseBackend):
-#     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-#         try:
-#             # Process incoming filter options into the ParamsConfig object
-#             if options and hasattr(self.helper._backend_params, 'parse_params'):
-#                 self.helper._backend_params.parse_params(options)
-#
-#             with SqlAlchemyLoadFromDb(
-#                 plugin_sqlalchemy=self.helper.backend_db_connection,
-#                 plugin_query=self.helper._backend_query,
-#                 plugin_params=self.helper._backend_params,
-#                 logger=self.logger,
-#                 debug=self.debug
-#             ) as db_loader:
-#                 self.total_records, result = db_loader.build_and_load()
-#                 return self.total_records, result
-#         except Exception as e:
-#             self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
-#             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#
-# class ParquetBackend(BaseBackend):
-#     """This backend is also purely synchronous."""
-#
-#     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-#         try:
-#             df = self.helper.backend_parquet.load_files()
-#             if len(df.head(1)) == 0:
-#                 return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#             if options and df is not None:
-#                 df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
-#                 if len(df.head(1)) == 0:
-#                     self.logger.debug("No records found after applying filters; returning empty DataFrame.")
-#                     return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#             df = df.persist()
-#
-#             self.total_records = len(df) or -1  # If df is empty, set total_records to -1
-#             return self.total_records, df
-#         except Exception as e:
-#             self.total_records = -1  # Reset total_records on failure
-#             self.logger.error(f"Failed to load data from parquet: {e}")
-#             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#
-# class HttpBackend(BaseBackend):
-#     """This backend is purely asynchronous."""
-#
-#     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-#         # This will correctly fail by raising NotImplementedError from the base class.
-#         return self.helper.backend_http.fetch_data(**options)
-#
-#     async def aload(self, **options) -> tuple[Any, Any] | Union[pd.DataFrame, dd.DataFrame]:
-#         if not self.helper.backend_http:
-#             self.logger.warning("HTTP plugin not configured properly.")
-#             self.total_records = -1
-#             return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#         result = await self.helper.backend_http.fetch_data(**options)
-#         self.total_records = len(result)
-#         return self.total_records, result
-#
-#
-# # --- Main DfHelper Facade Class ---
-#
-# class DfHelper(ManagedResource):
-#     """
-#     A reusable utility for loading data. It provides both sync (`load`) and
-#     async (`aload`) methods to accommodate different backends.
-#     """
-#     _BACKEND_STRATEGIES = {
-#         'sqlalchemy': SqlAlchemyBackend,
-#         'parquet': ParquetBackend,
-#         'http': HttpBackend,
-#     }
-#
-#     _BACKEND_ATTR_MAP = {
-#         'sqlalchemy': 'backend_db_connection',
-#         'parquet': 'backend_parquet',
-#         'http': 'backend_http',
-#     }
-#
-#     default_config: Dict = None
-#
-#     def __init__(self, backend='sqlalchemy', **kwargs):
-#         self.default_config = self.default_config or {}
-#         kwargs = {**self.default_config.copy(), **kwargs}
-#         super().__init__(**kwargs)
-#         self.backend = backend
-#
-#         # Need to set default values for backend-specific configurations
-#         kwargs.setdefault("debug", self.debug)
-#         kwargs.setdefault("fs", self.fs)
-#         kwargs.setdefault("logger", self.logger)
-#         self.total_records = -1  # Initialize total_records to -1 to indicate no records loaded yet
-#         self._backend_query = self._get_config(QueryConfig, kwargs)
-#         self._backend_params = self._get_config(ParamsConfig, kwargs)
-#         self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
-#         self.backend_parquet: Optional[ParquetConfig] = None
-#         self.backend_http: Optional[HttpConfig] = None
-#
-#         if self.backend == 'sqlalchemy':
-#             self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
-#         elif self.backend == 'parquet':
-#             self.backend_parquet = self._get_config(ParquetConfig, kwargs)
-#         elif self.backend == 'http':
-#             self.backend_http = self._get_config(HttpConfig, kwargs)
-#
-#         strategy_class = self._BACKEND_STRATEGIES.get(self.backend)
-#         if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
-#         self.backend_strategy = strategy_class(self)
-#
-#     def _cleanup(self):
-#         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
-#         if not attr_name:
-#             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
-#             return
-#         # Get the actual config object (e.g., self.backend_db_connection)
-#         active_config = getattr(self, attr_name, None)
-#
-#         if active_config and hasattr(active_config, "close"):
-#             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
-#             active_config.close()
-#
-#     async def _acleanup(self):
-#         self.logger.warning("DfHelper instance was not used in an async context manager; cleanup is being called manually.")
-#         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
-#         if not attr_name:
-#             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
-#             return
-#         # Get the actual config object (e.g., self.backend_db_connection)
-#         active_config = getattr(self, attr_name, None)
-#         if active_config and hasattr(active_config, "aclose"):
-#             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
-#             await active_config.aclose()
-#
-#     def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
-#         recognized_keys = set(model.model_fields.keys())
-#         model_kwargs = {k: kwargs[k] for k in recognized_keys if k in kwargs}
-#         return model(**model_kwargs)
-#
-#     def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-#         """Loads data synchronously. Fails if backend is async-only."""
-#         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
-#         self.total_records, df = self.backend_strategy.load(**options)
-#         df = self._process_loaded_data(df)
-#         df = self._post_process_df(df)
-#         self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
-#         return df.compute() if as_pandas else df
-#
-#     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-#         """Loads data asynchronously from any backend."""
-#         self.total_records, df = await self.backend_strategy.aload(**options)
-#         df = self._process_loaded_data(df)
-#         df = self._post_process_df(df)
-#         return df.compute() if as_pandas else df
-#
-#     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
-#         self.logger.debug("Post-processing DataFrame.")
-#         df_params = self._backend_params.df_params
-#         if not df_params: return df
-#         fieldnames, column_names, index_col = (df_params.get("fieldnames"), df_params.get("column_names"),
-#                                                df_params.get("index_col"))
-#         if not any([fieldnames, column_names, index_col]): return df
-#
-#         if fieldnames:
-#             valid_fieldnames = [f for f in fieldnames if f in df.columns]
-#             if len(valid_fieldnames) < len(fieldnames): self.logger.warning(
-#                 f"Missing columns for filtering: {set(fieldnames) - set(valid_fieldnames)}")
-#             df = df[valid_fieldnames]
-#         if column_names:
-#             if len(df.columns) != len(column_names): raise ValueError(
-#                 f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided.")
-#             df = df.rename(columns=dict(zip(df.columns, column_names)))
-#         if index_col:
-#             if index_col not in df.columns: raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
-#             df = df.set_index(index_col)
-#         self.logger.debug("Post-processing complete.")
-#         return df
-#
-#     def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
-#         field_map = self._backend_params.field_map or {}
-#         if not isinstance(field_map, dict) or not field_map: return df
-#         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)): return df
-#         self.logger.debug("Processing loaded data...applying rename mapping if necessary.")
-#         rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
-#         if rename_mapping: df = df.rename(columns=rename_mapping)
-#         self.logger.debug("Rename mapping complete...")
-#         return df
-#
-#     def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
-#         """Saves a Dask DataFrame to a Parquet file with validation."""
-#
-#         # Use .get() for cleaner access to optional arguments.
-#         fs: AbstractFileSystem = kwargs.get('fs', self.fs)
-#         path: str = kwargs.get('parquet_storage_path')
-#
-#         # Guard clauses to fail fast with clear errors.
-#         if not fs:
-#             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
-#         if not path:
-#             raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
-#
-#         # An efficient, idiomatic way to check if a Dask DataFrame is empty.
-#         if len(df.head(1)) == 0:
-#             self.logger.warning("Skipping save: The provided DataFrame is empty.")
-#             return
-#
-#         with ParquetSaver(
-#             df_result=df,
-#             parquet_storage_path=path,
-#             fs=fs,
-#             debug=self.debug,
-#             logger=self.logger,
-#             verbose=self.verbose,
-#             **kwargs
-#         ) as saver:
-#             saver.save_to_parquet(parquet_filename)
-#
-#         self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")
-#
-#     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
-#         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
-#             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
-#             return
-#
-#         with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
-#             writer.save_to_clickhouse(df)
-#         self.logger.debug("Save to ClickHouse completed.")
-#
-#     def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
-#         """Synchronous convenience method for loading a date range."""
-#         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
-#         return self.load(**final_kwargs)
-#
-#     async def aload_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
-#         """Asynchronous convenience method for loading a date range."""
-#         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
-#         return await self.aload(**final_kwargs)
-#
-#     def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
-#         start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
-#         if start_date > end_date:
-#             raise ValueError("'start' date cannot be later than 'end' date.")
-#         field_map = self._backend_params.field_map or {}
-#         reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
-#         if len(reverse_map) != len(field_map):
-#             self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
-#         mapped_field = reverse_map.get(dt_field, dt_field)
-#         if start_date == end_date:
-#             kwargs[f"{mapped_field}__date"] = start_date
-#         else:
-#             kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
-#         self.logger.debug(f"Period load generated filters: {kwargs}")
-#         return kwargs
+
sibi_dst/df_helper/backends/parquet/_parquet_options.py
CHANGED
@@ -6,6 +6,8 @@ import dask.dataframe as dd
 import fsspec
 import pandas as pd
 from pydantic import BaseModel, model_validator, ConfigDict
+
+from sibi_dst.df_helper.core import FilterHandler
 from sibi_dst.utils import FilePathGenerator
 from sibi_dst.utils import Logger
 
@@ -175,40 +177,79 @@ class ParquetConfig(BaseModel):
             total_size += self.fs.size(path)
         return total_size
 
-    def load_files(self):
+    def load_files(self, **filters):
         """
-        Loads parquet files into a Dask DataFrame based on the specified conditions.
-
-        parquet folder paths or a single specified parquet path.
-
-        :return: A Dask DataFrame containing loaded parquet file data.
-        :rtype: dask.dataframe.DataFrame
+        Loads parquet files into a Dask DataFrame based on the specified conditions.
+        Supports Parquet predicate pushdown (pyarrow) + residual Dask mask.
         """
         if not self.load_parquet:
             self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+        # Resolve paths
         paths_to_load = []
         if self.parquet_folder_list:
-
-            paths_to_load = [p for p in self.parquet_folder_list if p is not None]
+            paths_to_load = [p for p in self.parquet_folder_list if p]
         elif self.parquet_full_path:
-            # Treat the single path as a list with one item
             paths_to_load = [self.parquet_full_path]
 
         if not paths_to_load:
             self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+        # Prepare filters
+        fh = None
+        expr = None
+        pq_filters = None
+        residual_filters = None
+        if filters:
+            fh = FilterHandler(backend="dask", debug=self.debug, logger=self.logger)
+
+            # Use the compiler + pushdown split so we don't double-apply
+            try:
+                # If you added split_pushdown_and_residual earlier:
+                pq_filters, residual_filters = fh.split_pushdown_and_residual(filters)
+                expr = fh.compile_filters(residual_filters) if residual_filters else None
+            except AttributeError:
+                # Fallback if you didn't add split_*: push everything down and also mask (redundant but correct)
+                expr = fh.compile_filters(filters)
+                pq_filters = expr.to_parquet_filters()
+
         try:
             self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
-
-
-
-
-
-
+
+            # Optional: prune columns. Keep it simple unless you want to compute from filters.
+            columns = None  # or a concrete list if you know it
+
+            if fh and pq_filters:
+                self.logger.debug(f"Applying Parquet filters: {pq_filters}")
+                dd_result = dd.read_parquet(
+                    paths_to_load,
+                    engine="pyarrow",
+                    filesystem=self.fs,  # your fsspec filesystem (e.g., s3fs)
+                    filters=pq_filters,
+                    columns=columns,
+                    gather_statistics=False,  # uncomment if you have *many* files and don't need global stats
+                )
+                # Apply only residual mask (if any)
+                if expr is not None:
+                    dd_result = dd_result[expr.mask(dd_result)]
+            else:
+                dd_result = dd.read_parquet(
+                    paths_to_load,
+                    engine="pyarrow",
+                    filesystem=self.fs,
+                    columns=columns,
+                    gather_statistics=False,
+                )
+                # If we didn't push down, but have filters, apply them here
+                if expr is None and fh and filters:
+                    expr = fh.compile_filters(filters)
+                if expr is not None:
+                    dd_result = dd_result[expr.mask(dd_result)]
+
             return dd_result
+
         except FileNotFoundError as e:
             self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
             self.logger.debug("Returning empty DataFrame due to missing parquet files.")
sibi_dst/df_helper/core/_filter_handler.py
CHANGED
@@ -1,4 +1,6 @@
 import datetime
+from dataclasses import dataclass
+from typing import Optional, Dict, Any, List, Union, Tuple, Callable
 
 import dask.dataframe as dd
 import pandas as pd
@@ -8,69 +10,231 @@ from sqlalchemy.sql.sqltypes import Date, Time
 from sibi_dst.utils import Logger
 
 
+# -------------------- Deferred filter expression AST --------------------
+class Expr:
+    def mask(self, df: dd.DataFrame) -> dd.Series:
+        raise NotImplementedError
+
+    def to_parquet_filters(self) -> List[Union[Tuple[str, str, Any], List[Tuple[str, str, Any]]]]:
+        # By default, nothing to push down
+        return []
+
+    def __and__(self, other: "Expr") -> "Expr": return And(self, other)
+    def __or__(self, other: "Expr") -> "Expr": return Or(self, other)
+    def __invert__(self) -> "Expr": return Not(self)
+
+
+@dataclass(frozen=True)
+class TrueExpr(Expr):
+    """Matches all rows; useful as a neutral starting point."""
+    def mask(self, df: dd.DataFrame) -> dd.Series:
+        return df.map_partitions(lambda p: pd.Series(True, index=p.index),
+                                 meta=pd.Series(dtype=bool))
+
+
+@dataclass(frozen=True)
+class ColOp(Expr):
+    field: str
+    casting: Optional[str]
+    op: str
+    value: Any
+    handler: "FilterHandler"  # reuse your parsing + Dask ops
+
+    def mask(self, df: dd.DataFrame) -> dd.Series:
+        col = self.handler._get_dask_column(df, self.field, self.casting)
+        val = self.handler._parse_filter_value(self.casting, self.value)
+        return self.handler._apply_operation_dask(col, self.op, val)
+
+    def to_parquet_filters(self):
+        # Only basic comparisons can be pushed down
+        if self.op not in {"exact", "gt", "gte", "lt", "lte", "in", "range"}:
+            return []
+        val = self.handler._parse_filter_value(self.casting, self.value)
+        if self.casting == "date":
+            if self.op == "range" and isinstance(val, (list, tuple)) and len(val) == 2:
+                lo, hi = pd.Timestamp(val[0]), pd.Timestamp(val[1])
+                return [(self.field, ">=", lo), (self.field, "<=", hi)]
+            if isinstance(val, list):
+                val = [pd.Timestamp(v) for v in val]
+            else:
+                val = pd.Timestamp(val)
+        if self.op == "exact": return [(self.field, "=", val)]
+        if self.op in {"gt","gte","lt","lte"}:
+            sym = {"gt": ">", "gte": ">=", "lt": "<", "lte": "<="}[self.op]
+            return [(self.field, sym, val)]
+        if self.op == "in": return [(self.field, "in", list(val) if not isinstance(val, list) else val)]
+        if self.op == "range":
+            lo, hi = val
+            return [(self.field, ">=", lo), (self.field, "<=", hi)]
+        return []
+
+
+@dataclass(frozen=True)
+class And(Expr):
+    left: Expr; right: Expr
+    def mask(self, df: dd.DataFrame) -> dd.Series: return self.left.mask(df) & self.right.mask(df)
+    def to_parquet_filters(self):
+        # AND = concatenate both sides' AND-terms
+        return [*self.left.to_parquet_filters(), *self.right.to_parquet_filters()]
+
+
+@dataclass(frozen=True)
+class Or(Expr):
+    left: Expr; right: Expr
+    def mask(self, df: dd.DataFrame) -> dd.Series: return self.left.mask(df) | self.right.mask(df)
+    def to_parquet_filters(self):
+        # OR must be returned as list-of-lists; if either side has non-pushdown, defer to mask
+        lf, rf = self.left.to_parquet_filters(), self.right.to_parquet_filters()
+        if not lf or not rf:
+            return []
+        return [lf, rf]
+
+
+@dataclass(frozen=True)
+class Not(Expr):
+    inner: Expr
+    def mask(self, df: dd.DataFrame) -> dd.Series: return ~self.inner.mask(df)
+    def to_parquet_filters(self): return []
+
+
+# -------------------- Filter handler --------------------
 class FilterHandler:
     """
-    Handles the application of filters to
-
-
-    SQLAlchemy queries and Dask DataFrames. It supports multiple filtering operations, including
-    exact matches, comparisons, and string-related operations such as contains and regex. The handler
-    automatically determines and applies backend-specific processing, enabling seamless integration with
-    different data models or backends.
-
-    :ivar backend: The backend in use ('sqlalchemy' or 'dask').
-    :type backend: str
-    :ivar logger: An optional logger instance for debugging and logging purposes.
-    :type logger: Logger
-    :ivar backend_methods: A dictionary mapping backend-specific methods for column retrieval and operation application.
-    :type backend_methods: dict
+    Handles the application of filters to SQLAlchemy and Dask backends.
+    Also compiles dicts into deferred expressions (Expr) and can split
+    pushdown-friendly predicates from residual ones.
     """
     def __init__(self, backend, logger=None, debug=False):
-        """
-        Initialize the FilterHandler.
-
-        Args:
-            backend: The backend to use ('sqlalchemy' or 'dask').
-            logger: Optional logger for debugging purposes.
-        """
         self.backend = backend
-        self.logger = logger or Logger.default_logger(
-            logger_name=self.__class__.__name__)  # No-op logger if none provided
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
         self.backend_methods = self._get_backend_methods(backend)
 
-
-
-
+    # --------- NEW: pushdown helpers ---------
+    def _pushdown_ops(self) -> set[str]:
+        """Ops that can be translated to PyArrow parquet filters."""
+        return {"exact", "gt", "gte", "lt", "lte", "in", "range"}
 
-
-
-
-
+    def to_parquet_filters(self, filters: Optional[Dict[str, Any]] = None
+                           ) -> List[Tuple[str, str, Any]]:
+        """
+        Convert a subset of filters into PyArrow parquet filters (AND semantics).
+        Unsupported ops are skipped here and should be applied later as a Dask mask.
+        """
+        filters = filters or {}
+        out: List[Tuple[str, str, Any]] = []
 
-
-
+        for key, value in filters.items():
+            field, casting, op = self._parse_filter_key(key)
+            if op not in self._pushdown_ops():
+                continue
+
+            val = self._parse_filter_value(casting, value)
+
+            # Normalize dates to Timestamp for Arrow
+            if casting == "date":
+                if op == "range" and isinstance(val, (list, tuple)) and len(val) == 2:
+                    lo, hi = pd.Timestamp(val[0]), pd.Timestamp(val[1])
+                    out.extend([(field, ">=", lo), (field, "<=", hi)])
+                    continue
+                if isinstance(val, list):
+                    val = [pd.Timestamp(v) for v in val]
+                else:
+                    val = pd.Timestamp(val)
+
+            if op == "exact":
+                out.append((field, "=", val))
+            elif op in {"gt", "gte", "lt", "lte"}:
+                sym = {"gt": ">", "gte": ">=", "lt": "<", "lte": "<="}[op]
+                out.append((field, sym, val))
+            elif op == "in":
+                out.append((field, "in", list(val) if not isinstance(val, list) else val))
+            elif op == "range":
+                lo, hi = val
+                out.extend([(field, ">=", lo), (field, "<=", hi)])
+
+        return out
+
+    def split_pushdown_and_residual(self, filters: Dict[str, Any]
+                                    ) -> Tuple[List[Tuple[str, str, Any]], Dict[str, Any]]:
+        """
+        Split input filter dict into:
+          - parquet_filters: list of (col, op, val) tuples for dd.read_parquet(..., filters=...)
+          - residual_filters: dict to be applied later via a Dask boolean mask
+        """
+        push_keys = set()
+        for key in filters.keys():
+            _, casting, op = self._parse_filter_key(key)
+            if op in self._pushdown_ops():
+                push_keys.add(key)
+
+        pushdown_subset = {k: filters[k] for k in push_keys}
+        parquet_filters = self.to_parquet_filters(pushdown_subset)
+        residual_filters = {k: v for k, v in filters.items() if k not in push_keys}
+        return parquet_filters, residual_filters
+
+    # --------- Expression compiler / mask builder ---------
+    def compile_filters(self, filters: Optional[Dict[str, Any]] = None) -> Expr:
+        """
+        Compile a dict into a deferred expression tree (no df required).
+        Supports boolean forms: {"$and": [...]}, {"$or": [...]}, {"$not": {...}}.
+        Default combination for plain dicts: AND of all terms.
         """
+        filters = filters or {}
+        if not filters:
+            return TrueExpr()
+
+        # boolean forms
+        if "$and" in filters:
+            expr = TrueExpr()
+            for sub in filters["$and"]:
+                expr = expr & self.compile_filters(sub)
+            return expr
+
+        if "$or" in filters:
+            subs = [self.compile_filters(sub) for sub in filters["$or"]]
+            if not subs: return TrueExpr()
+            expr = subs[0]
+            for s in subs[1:]:
+                expr = expr | s
+            return expr
+
+        if "$not" in filters:
+            return ~self.compile_filters(filters["$not"])
+
+        # plain dict => AND across keys
+        expr: Expr = TrueExpr()
+        for key, value in filters.items():
+            field, casting, op = self._parse_filter_key(key)
+            expr = expr & ColOp(field=field, casting=casting, op=op, value=value, handler=self)
+        return expr
+
+    def build_mask_fn(self, filters: Optional[Dict[str, Any]] = None) -> Callable[[dd.DataFrame], dd.Series]:
+        """Return a callable (df -> boolean mask) without touching df now."""
+        expr = self.compile_filters(filters)
+        def _fn(df: dd.DataFrame) -> dd.Series:
+            return expr.mask(df)
+        return _fn
+
+    # --------- Existing “apply now” API (kept as-is) ---------
+    def apply_filters(self, query_or_df, model=None, filters=None):
         filters = filters or {}
         for key, value in filters.items():
             field_name, casting, operation = self._parse_filter_key(key)
             parsed_value = self._parse_filter_value(casting, value)
-            # print(field_name, casting, operation, parsed_value)
-            # Get the column and apply backend-specific transformations
             if self.backend == "sqlalchemy":
                 column = self.backend_methods["get_column"](field_name, model, casting)
                 condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
                 query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
-
             elif self.backend == "dask":
                 column = self.backend_methods["get_column"](query_or_df, field_name, casting)
                 condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
                 query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
             else:
                 raise ValueError(f"Unsupported backend: {self.backend}")
-
         return query_or_df
 
+    # --------- Parsing & backend plumbing (unchanged) ---------
     @staticmethod
     def _parse_filter_key(key):
         parts = key.split("__")
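
A small sketch of the new deferred-expression API, assuming a Dask DataFrame ddf that actually has the (hypothetical) columns used below:

    fh = FilterHandler(backend="dask")

    # Boolean forms compile to an Expr tree without touching any DataFrame.
    expr = fh.compile_filters({"$or": [{"status": "open"}, {"priority__gte": 3}]})
    mask_fn = fh.build_mask_fn({"$not": {"archived": True}})

    # Masks are only evaluated when applied to a concrete frame.
    filtered = ddf[expr.mask(ddf) & mask_fn(ddf)]

    # The same expression viewed as pyarrow filters: OR of two AND-groups.
    expr.to_parquet_filters()
    # -> [[("status", "=", "open")], [("priority", ">=", 3)]]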
@@ -120,29 +284,15 @@ class FilterHandler:
 
     @staticmethod
    def _get_sqlalchemy_column(field_name, model, casting):
-        """
-        Retrieve and cast a column for SQLAlchemy based on the field name and casting.
-
-        Args:
-            field_name: The name of the field/column in the model.
-            model: The SQLAlchemy model.
-            casting: The casting type ('date', 'time', etc.).
-
-        Returns:
-            The SQLAlchemy column object, optionally cast or transformed.
-        """
         column = getattr(model, field_name, None)
         if not column:
             raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
 
         if casting == "date":
-            # Cast the column to Date for whole-date comparisons
             column = cast(column, Date)
         elif casting == "time":
-            # Cast the column to Time for time-specific comparisons
             column = cast(column, Time)
         elif casting in FilterHandler._date_operators():
-            # Extract date part (e.g., year, month) using SQLAlchemy functions
             column = func.extract(casting, column)
 
         return column
@@ -196,13 +346,13 @@ class FilterHandler:
             "isnull": lambda col, val: col.is_(None) if val else col.isnot(None),
             "not_exact": lambda col, val: col != val,
             "not_contains": lambda col, val: ~col.like(f"%{val}%"),
-            "not_in": lambda col, val: ~col.in_(val),
-            "regex": lambda col, val: col.op("~")(val),
-            "icontains": lambda col, val: col.ilike(f"%{val}%"),
-            "istartswith": lambda col, val: col.ilike(f"{val}%"),
-            "iendswith": lambda col, val: col.ilike(f"%{val}"),
-            "iexact": lambda col, val: col.ilike(val),
-            "iregex": lambda col, val: col.op("~*")(val),
+            "not_in": lambda col, val: ~col.in_(val),
+            "regex": lambda col, val: col.op("~")(val),
+            "icontains": lambda col, val: col.ilike(f"%{val}%"),
+            "istartswith": lambda col, val: col.ilike(f"{val}%"),
+            "iendswith": lambda col, val: col.ilike(f"%{val}"),
+            "iexact": lambda col, val: col.ilike(val),
+            "iregex": lambda col, val: col.op("~*")(val),
         }
 
     @staticmethod
@@ -214,7 +364,7 @@ class FilterHandler:
             "lt": lambda col, val: col < val,
             "lte": lambda col, val: col <= val,
 
-            #
+            # type-safe "in" and "not_in"
             "in": lambda col, val: FilterHandler._align_in_types(col, val)[0].isin(
                 FilterHandler._align_in_types(col, val)[1]),
             "not_in": lambda col, val: ~FilterHandler._align_in_types(col, val)[0].isin(
@@ -228,12 +378,9 @@ class FilterHandler:
             "endswith": lambda col, val: FilterHandler._as_str(col).str.endswith(val, na=False),
             "not_contains": lambda col, val: ~FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
             "regex": lambda col, val: FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
-            "icontains": lambda col, val: FilterHandler._as_str(col).str.contains(val, case=False, regex=True,
-                                                                                  na=False),
-            "istartswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.startswith(str(val).lower(),
-                                                                                                  na=False),
-            "iendswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.endswith(str(val).lower(),
-                                                                                              na=False),
+            "icontains": lambda col, val: FilterHandler._as_str(col).str.contains(val, case=False, regex=True, na=False),
+            "istartswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.startswith(str(val).lower(), na=False),
+            "iendswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.endswith(str(val).lower(), na=False),
             "iexact": lambda col, val: FilterHandler._as_str(col).str.lower() == str(val).lower(),
             "iregex": lambda col, val: FilterHandler._as_str(col).str.contains(val, case=False, regex=True, na=False),
 
@@ -243,12 +390,10 @@ class FilterHandler:
 
     @staticmethod
     def _as_str(col):
-        # Force a reliable string view (works with object, categorical, etc.)
         return col.astype("string").fillna("")
 
     @staticmethod
     def _strip_tz(col):
-        # Make tz-aware datetimes naive so they compare to tz-naive filter values
         import pandas as pd
         def _part(s: pd.Series) -> pd.Series:
             try:
@@ -258,12 +403,10 @@ class FilterHandler:
                 return s.dt.tz_localize(None)
             except Exception:
                 return s
-
         return col.map_partitions(_part, meta=col._meta)
 
     @staticmethod
     def _time_to_seconds(t):
-        # t can be datetime.time or a "HH:MM[:SS]" str
         if isinstance(t, str):
             t = datetime.time.fromisoformat(t)
         return t.hour * 3600 + t.minute * 60 + t.second
@@ -288,10 +431,6 @@ class FilterHandler:
 
     @staticmethod
     def _align_in_types(col, val):
-        """
-        Return (coerced_col, coerced_values) with compatible dtypes
-        so that .isin(...) behaves as expected across partitions.
-        """
         # normalize val to a list
         if isinstance(val, (set, tuple)):
             vals = list(val)
@@ -300,7 +439,6 @@ class FilterHandler:
         else:
             vals = [val]
 
-        # try numeric alignment first if column is numeric-like
         kind = getattr(getattr(col, "dtype", None), "kind", None)
         if kind in ("i", "u"):  # integer
             def to_ints(xs):
@@ -309,13 +447,10 @@ class FilterHandler:
                     try:
                         out.append(int(x))
                     except Exception:
-                        # if any value can't be int, fall back to strings below
                        return None
                return out
-
            ints = to_ints(vals)
            if ints is not None:
-                # nullable Int64 handles missing values
                return col.astype("Int64"), ints
 
        if kind in ("f",):  # float
@@ -327,10 +462,8 @@ class FilterHandler:
            except Exception:
                return None
            return out
-
            flts = to_floats(vals)
            if flts is not None:
                return col.astype("float64"), flts
 
-
-        return FilterHandler._as_str(col), [str(x) for x in vals]
+        return FilterHandler._as_str(col), [str(x) for x in vals]
{sibi_dst-2025.8.3.dist-info → sibi_dst-2025.8.5.dist-info}/RECORD
CHANGED
@@ -2,15 +2,14 @@ sibi_dst/__init__.py,sha256=D01Z2Ds4zES8uz5Zp7qOWD0EcfCllWgew7AWt2X1SQg,445
 sibi_dst/df_helper/__init__.py,sha256=CyDXtFhRnMrycktxNO8jGGkP0938QiScl56kMZS1Sf8,578
 sibi_dst/df_helper/_artifact_updater_async.py,sha256=0lUwel-IkmKewRnmMv9GtuT-P6SivkIKtgOHvKchHlc,8462
 sibi_dst/df_helper/_artifact_updater_threaded.py,sha256=M5GNZismOqMmBrcyfolP1DPv87VILQf_P18is_epn50,7238
-sibi_dst/df_helper/_df_helper.py,sha256=
+sibi_dst/df_helper/_df_helper.py,sha256=g1ftfSMO40l60EJWRLE0DDZvbIowrqvG1GMf2zXqYGw,12957
 sibi_dst/df_helper/_parquet_artifact.py,sha256=tqYOjwxHV1MsADmn-RNFuVI_RrEvvmCJHZieRcsVXuc,12334
 sibi_dst/df_helper/_parquet_reader.py,sha256=tFq0OQVczozbKZou93vscokp2R6O2DIJ1zHbZqVjagc,3069
 sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJShqqTBQQGU_S6OOo,105
 sibi_dst/df_helper/backends/http/_http_config.py,sha256=eGPFdqZ5M3Tscqx2P93B6XoBEEzlmdt7yNg7PXUQnNQ,4726
-sibi_dst/df_helper/backends/parquet/__init__.py,sha256=
-sibi_dst/df_helper/backends/parquet/
-sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=FusVcLVysitLoc8Ui_zU4JMhdHW1MMn4i0vnMbl2K84,12017
+sibi_dst/df_helper/backends/parquet/__init__.py,sha256=0A6BGHZLwiLBmuBBaUvEHfeWTcInvy2NbymlrI_nuXE,104
+sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=V6y1Vco3_uY4UBF79_JPd1CFK5DpNsnGYHCc5PDPGZo,13798
 sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=LjWm9B7CweTvlvFOgB90XjSe0lVLILAIYMWKPkFXFm8,265
 sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=R3_WY_lsQrfQwD6yAzH66MqvsgZdMd0HKcVChDQcbpM,8401
 sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py,sha256=GQwDy2JwPUx37vpwxPM5hg4ZydilPIP824y5C_clsl0,383
@@ -20,7 +19,7 @@ sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py,sha256=MHk64f5WDOKHQ_L
 sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=RiCaVPME5wzgZ9xUGY0JOs_c2C0KcDIbTeMGpPupIa0,5242
 sibi_dst/df_helper/core/__init__.py,sha256=LfmTqFh6GUZup-g95bcXgAxX7J5Hkve7ftLE_CJg_AE,409
 sibi_dst/df_helper/core/_defaults.py,sha256=9UMEMu2wXznO5UzEhnQ82f_ZazZ20JRyRXIi3HP3gDw,4043
-sibi_dst/df_helper/core/_filter_handler.py,sha256=
+sibi_dst/df_helper/core/_filter_handler.py,sha256=9C30zrT8wSGy1X8ryiTWc0XfnbpeoHndHgoOcHKOPOo,19309
 sibi_dst/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxRrQKE5FQRxcEWsac,6736
 sibi_dst/df_helper/core/_query_config.py,sha256=1ApqmuSGXTC3CdF-xMsSbCa3V2Z5hOP3Wq5huhzZwqY,439
 sibi_dst/df_helper/data_cleaner.py,sha256=lkxQoXLvGzXCicFUimnA5nen5qkrO1oxgl_p2Be2o8w,5183
@@ -79,6 +78,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.8.
-sibi_dst-2025.8.
-sibi_dst-2025.8.
+sibi_dst-2025.8.5.dist-info/METADATA,sha256=ADWrf_9UI4NiTWslrJ0LgfmHTTdxSSCIc0AaP-mqSQg,2610
+sibi_dst-2025.8.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.8.5.dist-info/RECORD,,
sibi_dst/df_helper/backends/parquet/_filter_handler.py
DELETED
@@ -1,126 +0,0 @@
-import dask.dataframe as dd
-import pandas as pd
-
-from sibi_dst.utils import Logger
-
-
-class ParquetFilterHandler(object):
-    """
-    Handles parquet filtering operations using dask dataframes.
-
-    This class is designed to apply complex filtering logic on dask dataframes
-    based on specified filter criteria. It includes support for operations such
-    as exact matches, ranges, string pattern matches, and null checks. Additionally,
-    it handles datetime-related field filtering including precise truncations and
-    specific date/time attributes.
-
-    :ivar logger: Logger object to handle logging within the class. Defaults to the class-level logger.
-    :type logger: Logger
-    """
-    def __init__(self, logger=None, debug=False):
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
-
-    @staticmethod
-    def apply_filters_dask(df, filters):
-        """
-        Applies a set of filters to a Dask DataFrame, enabling complex filtering operations
-        such as comparisons, ranges, string match operations, and more. Handles special
-        cases for datetime operations, including casting and extracting specific datetime
-        components for filtering.
-
-        :param df: Dask DataFrame to which the filters will be applied.
-        :type df: dask.dataframe.DataFrame
-        :param filters: Dictionary defining the filtering logic, where the keys specify
-            the column name and filter operation, and the values specify the corresponding
-            filter values to apply.
-        :type filters: dict
-        :return: A filtered Dask DataFrame based on the defined logic in the filters.
-        :rtype: dask.dataframe.DataFrame
-        :raises ValueError: If an unsupported operation is encountered in the filters.
-        """
-        dt_operators = ['date', 'time']
-        date_operators = ['year', 'month', 'day', 'hour', 'minute', 'second', 'week_day']
-        comparison_operators = [
-            'gte',
-            'lte',
-            'gt',
-            'lt',
-            'exact',
-            'in',
-            'range',
-            'contains',
-            'icontains',
-            'startswith',
-            'endswith',
-            'isnull'
-        ]
-
-        operation_map = {
-            'exact': lambda col, val: col == val,
-            'gt': lambda col, val: col > val,
-            'gte': lambda col, val: col >= val,
-            'lt': lambda col, val: col < val,
-            'lte': lambda col, val: col <= val,
-            'in': lambda col, val: col.isin(val),
-            'range': lambda col, val: (col >= val[0]) & (col <= val[1]),
-            'contains': lambda col, val: col.str.contains(val, regex=True),
-            'icontains': lambda col, val: col.str.contains(val, case=False),
-            'startswith': lambda col, val: col.str.startswith(val),
-            'endswith': lambda col, val: col.str.endswith(val),
-            'isnull': lambda col, val: col.isnull() if val else col.notnull(),
-        }
-
-        def parse_filter_value(casting, value):
-            """
-            Convert filter value to appropriate type based on the casting (e.g., date).
-            """
-            if casting == 'date':
-                if isinstance(value, str):
-                    return pd.Timestamp(value)  # Convert to datetime64[ns]
-                if isinstance(value, list):
-                    return [pd.Timestamp(v) for v in value]  # Convert list elements
-            return value
-
-        def get_temp_col(dask_df, field_name, casting):
-            """
-            Handle datetime conversion and field retrieval.
-            """
-            temp_col = dd.to_datetime(dask_df[field_name], errors='coerce') if casting in dt_operators else dask_df[
-                field_name]
-            if casting == 'date':
-                temp_col = temp_col.dt.floor('D')  # Keep it as datetime64[ns] truncated to the day level
-            elif casting in date_operators:
-                temp_col = getattr(temp_col.dt, casting)
-            return temp_col
-
-        for key, value in filters.items():
-            parts = key.split('__')
-            field_name = parts[0]
-            casting = None
-            operation = 'exact'
-
-            if len(parts) == 3:
-                # Adjust logic based on the parts
-                _, casting, operation = parts
-            elif len(parts) == 2:
-                # Could be either a casting or an operation
-                if parts[1] in comparison_operators:
-                    operation = parts[1]
-                elif parts[1] in dt_operators + date_operators:
-                    casting = parts[1]
-
-            # Convert the filter value to the correct type
-            parsed_value = parse_filter_value(casting, value)
-
-            # Get the column to filter
-            temp_col = get_temp_col(df, field_name, casting)
-
-            if operation in operation_map:
-                # Apply the filter operation
-                condition = operation_map[operation](temp_col, parsed_value)
-                df = df[condition]
-            else:
-                raise ValueError(f"Unsupported operation: {operation}")
-
-        return df
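
The dask-side filtering this deleted ParquetFilterHandler provided is now covered by the core FilterHandler shown above (sibi_dst/df_helper/core/_filter_handler.py). A rough equivalent of the removed apply_filters_dask call, with hypothetical column names:

    from sibi_dst.df_helper.core import FilterHandler

    fh = FilterHandler(backend="dask")
    ddf = fh.apply_filters(ddf, filters={"created_at__date": "2025-08-01",
                                         "status__in": ["open", "closed"]})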
{sibi_dst-2025.8.3.dist-info → sibi_dst-2025.8.5.dist-info}/WHEEL
File without changes