sibi-dst 2025.8.2__py3-none-any.whl → 2025.8.4__py3-none-any.whl
- sibi_dst/df_helper/_df_helper.py +29 -304
- sibi_dst/df_helper/backends/parquet/__init__.py +0 -2
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -1
- sibi_dst/df_helper/core/_filter_handler.py +306 -94
- {sibi_dst-2025.8.2.dist-info → sibi_dst-2025.8.4.dist-info}/METADATA +1 -1
- {sibi_dst-2025.8.2.dist-info → sibi_dst-2025.8.4.dist-info}/RECORD +7 -8
- sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -126
- {sibi_dst-2025.8.2.dist-info → sibi_dst-2025.8.4.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
 from fsspec import AbstractFileSystem
 from pydantic import BaseModel
 
-from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
+from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
 from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
 from .backends.http import HttpConfig
 from .backends.parquet import ParquetConfig
@@ -54,24 +54,40 @@ class SqlAlchemyBackend(BaseBackend):
 class ParquetBackend(BaseBackend):
     def load(self, **options):
         try:
-            df = self.helper.backend_parquet.load_files()
-            if
-                return -1,
-
-            if
-
-
-                self.logger.debug("No records after filters; returning empty DataFrame.")
-                return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+            df = self.helper.backend_parquet.load_files(**options)
+            if self._is_empty(df):
+                return -1, self._empty_like(df)
+            nrows = self._row_count(df)
+            if nrows == 0:
+                self.logger.debug("No records after filters; returning empty DataFrame.")
+                return 0, self._empty_like(df)
 
             df = df.persist()
-            self.total_records =
+            self.total_records = self._row_count(df) or -1
             return self.total_records, df
+
         except Exception as e:
-            self.total_records = -1
+            self.total_records = -1  # Reset total_records on failure
             self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+    def _is_empty(self, ddf) -> bool:
+        """True if no rows across all partitions."""
+        try:
+            # head with npartitions=-1 walks partitions until it gets n rows
+            return ddf.head(1, npartitions=-1).shape[0] == 0
+        except Exception:
+            return True
+
+    def _row_count(self, ddf) -> int:
+        """Reliable row count for Dask DataFrame."""
+        return int(ddf.map_partitions(len).sum().compute())
+
+    def _empty_like(self, ddf):
+        """Return an empty Dask DF with the SAME columns/dtypes."""
+        empty_pdf = ddf._meta.iloc[0:0]
+        return dd.from_pandas(empty_pdf, npartitions=1)
+
 
 class HttpBackend(BaseBackend):
     def load(self, **options):
@@ -279,295 +295,4 @@ class DfHelper(ManagedResource):
         self.logger.debug(f"Period load generated filters: {kwargs}")
         return kwargs
 
-
-#
-# from typing import Any, Dict, Optional, Union, TypeVar
-#
-# import dask.dataframe as dd
-# import pandas as pd
-# from fsspec import AbstractFileSystem
-# from pydantic import BaseModel
-#
-# from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
-# from sibi_dst.utils import ManagedResource
-# from sibi_dst.utils import ParquetSaver, ClickHouseWriter
-# from .backends.http import HttpConfig
-# from .backends.parquet import ParquetConfig
-# from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
-#
-# T = TypeVar("T", bound=BaseModel)
-#
-#
-# # --- Backend Strategy Pattern Implementation ---
-#
-# class BaseBackend:
-#     """Abstract base class defining clear sync and async loading interfaces."""
-#
-#     def __init__(self, helper: DfHelper):
-#         self.helper = helper
-#         self.logger = helper.logger
-#         self.debug = helper.debug
-#         self.total_records = helper.total_records  # no records loaded yet
-#
-#     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-#         """Synchronous data loading method. Must be implemented by sync backends."""
-#         raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
-#
-#     async def aload(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-#         """Asynchronous data loading method. By default, it calls the sync version."""
-#         return self.load(**options)
-#
-#
-# class SqlAlchemyBackend(BaseBackend):
-#     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-#         try:
-#             # Process incoming filter options into the ParamsConfig object
-#             if options and hasattr(self.helper._backend_params, 'parse_params'):
-#                 self.helper._backend_params.parse_params(options)
-#
-#             with SqlAlchemyLoadFromDb(
-#                 plugin_sqlalchemy=self.helper.backend_db_connection,
-#                 plugin_query=self.helper._backend_query,
-#                 plugin_params=self.helper._backend_params,
-#                 logger=self.logger,
-#                 debug=self.debug
-#             ) as db_loader:
-#                 self.total_records, result = db_loader.build_and_load()
-#                 return self.total_records, result
-#         except Exception as e:
-#             self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
-#             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#
-# class ParquetBackend(BaseBackend):
-#     """This backend is also purely synchronous."""
-#
-#     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-#         try:
-#             df = self.helper.backend_parquet.load_files()
-#             if len(df.head(1)) == 0:
-#                 return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#             if options and df is not None:
-#                 df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
-#                 if len(df.head(1)) == 0:
-#                     self.logger.debug("No records found after applying filters; returning empty DataFrame.")
-#                     return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#             df = df.persist()
-#
-#             self.total_records = len(df) or -1  # If df is empty, set total_records to -1
-#             return self.total_records, df
-#         except Exception as e:
-#             self.total_records = -1  # Reset total_records on failure
-#             self.logger.error(f"Failed to load data from parquet: {e}")
-#             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#
-# class HttpBackend(BaseBackend):
-#     """This backend is purely asynchronous."""
-#
-#     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-#         # This will correctly fail by raising NotImplementedError from the base class.
-#         return self.helper.backend_http.fetch_data(**options)
-#
-#     async def aload(self, **options) -> tuple[Any, Any] | Union[pd.DataFrame, dd.DataFrame]:
-#         if not self.helper.backend_http:
-#             self.logger.warning("HTTP plugin not configured properly.")
-#             self.total_records = -1
-#             return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
-#         result = await self.helper.backend_http.fetch_data(**options)
-#         self.total_records = len(result)
-#         return self.total_records, result
-#
-#
-# # --- Main DfHelper Facade Class ---
-#
-# class DfHelper(ManagedResource):
-#     """
-#     A reusable utility for loading data. It provides both sync (`load`) and
-#     async (`aload`) methods to accommodate different backends.
-#     """
-#     _BACKEND_STRATEGIES = {
-#         'sqlalchemy': SqlAlchemyBackend,
-#         'parquet': ParquetBackend,
-#         'http': HttpBackend,
-#     }
-#
-#     _BACKEND_ATTR_MAP = {
-#         'sqlalchemy': 'backend_db_connection',
-#         'parquet': 'backend_parquet',
-#         'http': 'backend_http',
-#     }
-#
-#     default_config: Dict = None
-#
-#     def __init__(self, backend='sqlalchemy', **kwargs):
-#         self.default_config = self.default_config or {}
-#         kwargs = {**self.default_config.copy(), **kwargs}
-#         super().__init__(**kwargs)
-#         self.backend = backend
-#
-#         # Need to set default values for backend-specific configurations
-#         kwargs.setdefault("debug", self.debug)
-#         kwargs.setdefault("fs", self.fs)
-#         kwargs.setdefault("logger", self.logger)
-#         self.total_records = -1  # Initialize total_records to -1 to indicate no records loaded yet
-#         self._backend_query = self._get_config(QueryConfig, kwargs)
-#         self._backend_params = self._get_config(ParamsConfig, kwargs)
-#         self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
-#         self.backend_parquet: Optional[ParquetConfig] = None
-#         self.backend_http: Optional[HttpConfig] = None
-#
-#         if self.backend == 'sqlalchemy':
-#             self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
-#         elif self.backend == 'parquet':
-#             self.backend_parquet = self._get_config(ParquetConfig, kwargs)
-#         elif self.backend == 'http':
-#             self.backend_http = self._get_config(HttpConfig, kwargs)
-#
-#         strategy_class = self._BACKEND_STRATEGIES.get(self.backend)
-#         if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
-#         self.backend_strategy = strategy_class(self)
-#
-#     def _cleanup(self):
-#         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
-#         if not attr_name:
-#             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
-#             return
-#         # Get the actual config object (e.g., self.backend_db_connection)
-#         active_config = getattr(self, attr_name, None)
-#
-#         if active_config and hasattr(active_config, "close"):
-#             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
-#             active_config.close()
-#
-#     async def _acleanup(self):
-#         self.logger.warning("DfHelper instance was not used in an async context manager; cleanup is being called manually.")
-#         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
-#         if not attr_name:
-#             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
-#             return
-#         # Get the actual config object (e.g., self.backend_db_connection)
-#         active_config = getattr(self, attr_name, None)
-#         if active_config and hasattr(active_config, "aclose"):
-#             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
-#             await active_config.aclose()
-#
-#     def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
-#         recognized_keys = set(model.model_fields.keys())
-#         model_kwargs = {k: kwargs[k] for k in recognized_keys if k in kwargs}
-#         return model(**model_kwargs)
-#
-#     def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-#         """Loads data synchronously. Fails if backend is async-only."""
-#         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
-#         self.total_records, df = self.backend_strategy.load(**options)
-#         df = self._process_loaded_data(df)
-#         df = self._post_process_df(df)
-#         self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
-#         return df.compute() if as_pandas else df
-#
-#     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-#         """Loads data asynchronously from any backend."""
-#         self.total_records, df = await self.backend_strategy.aload(**options)
-#         df = self._process_loaded_data(df)
-#         df = self._post_process_df(df)
-#         return df.compute() if as_pandas else df
-#
-#     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
-#         self.logger.debug("Post-processing DataFrame.")
-#         df_params = self._backend_params.df_params
-#         if not df_params: return df
-#         fieldnames, column_names, index_col = (df_params.get("fieldnames"), df_params.get("column_names"),
-#                                                df_params.get("index_col"))
-#         if not any([fieldnames, column_names, index_col]): return df
-#
-#         if fieldnames:
-#             valid_fieldnames = [f for f in fieldnames if f in df.columns]
-#             if len(valid_fieldnames) < len(fieldnames): self.logger.warning(
-#                 f"Missing columns for filtering: {set(fieldnames) - set(valid_fieldnames)}")
-#             df = df[valid_fieldnames]
-#         if column_names:
-#             if len(df.columns) != len(column_names): raise ValueError(
-#                 f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided.")
-#             df = df.rename(columns=dict(zip(df.columns, column_names)))
-#         if index_col:
-#             if index_col not in df.columns: raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
-#             df = df.set_index(index_col)
-#         self.logger.debug("Post-processing complete.")
-#         return df
-#
-#     def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
-#         field_map = self._backend_params.field_map or {}
-#         if not isinstance(field_map, dict) or not field_map: return df
-#         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)): return df
-#         self.logger.debug("Processing loaded data...applying rename mapping if necessary.")
-#         rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
-#         if rename_mapping: df = df.rename(columns=rename_mapping)
-#         self.logger.debug("Rename mapping complete...")
-#         return df
-#
-#     def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
-#         """Saves a Dask DataFrame to a Parquet file with validation."""
-#
-#         # Use .get() for cleaner access to optional arguments.
-#         fs: AbstractFileSystem = kwargs.get('fs', self.fs)
-#         path: str = kwargs.get('parquet_storage_path')
-#
-#         # Guard clauses to fail fast with clear errors.
-#         if not fs:
-#             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
-#         if not path:
-#             raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
-#
-#         # An efficient, idiomatic way to check if a Dask DataFrame is empty.
-#         if len(df.head(1)) == 0:
-#             self.logger.warning("Skipping save: The provided DataFrame is empty.")
-#             return
-#
-#         with ParquetSaver(
-#             df_result=df,
-#             parquet_storage_path=path,
-#             fs=fs,
-#             debug=self.debug,
-#             logger=self.logger,
-#             verbose=self.verbose,
-#             **kwargs
-#         ) as saver:
-#             saver.save_to_parquet(parquet_filename)
-#
-#         self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")
-#
-#     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
-#         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
-#             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
-#             return
-#
-#         with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
-#             writer.save_to_clickhouse(df)
-#         self.logger.debug("Save to ClickHouse completed.")
-#
-#     def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
-#         """Synchronous convenience method for loading a date range."""
-#         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
-#         return self.load(**final_kwargs)
-#
-#     async def aload_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
-#         """Asynchronous convenience method for loading a date range."""
-#         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
-#         return await self.aload(**final_kwargs)
-#
-#     def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
-#         start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
-#         if start_date > end_date:
-#             raise ValueError("'start' date cannot be later than 'end' date.")
-#         field_map = self._backend_params.field_map or {}
-#         reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
-#         if len(reverse_map) != len(field_map):
-#             self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
-#         mapped_field = reverse_map.get(dt_field, dt_field)
-#         if start_date == end_date:
-#             kwargs[f"{mapped_field}__date"] = start_date
-#         else:
-#             kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
-#         self.logger.debug(f"Period load generated filters: {kwargs}")
-#         return kwargs
+
sibi_dst/df_helper/backends/parquet/_parquet_options.py
CHANGED
@@ -175,7 +175,7 @@ class ParquetConfig(BaseModel):
             total_size += self.fs.size(path)
         return total_size
 
-    def load_files(self):
+    def load_files(self, **filters):
         """
         Loads parquet files into a Dask DataFrame based on the specified conditions. This
         method checks if parquet file loading is enabled and loads either from a list of
sibi_dst/df_helper/core/_filter_handler.py
CHANGED
@@ -1,4 +1,6 @@
 import datetime
+from dataclasses import dataclass
+from typing import Optional, Dict, Any, List, Union, Tuple, Callable
 
 import dask.dataframe as dd
 import pandas as pd
@@ -8,69 +10,231 @@ from sqlalchemy.sql.sqltypes import Date, Time
 from sibi_dst.utils import Logger
 
 
+# -------------------- Deferred filter expression AST --------------------
+class Expr:
+    def mask(self, df: dd.DataFrame) -> dd.Series:
+        raise NotImplementedError
+
+    def to_parquet_filters(self) -> List[Union[Tuple[str, str, Any], List[Tuple[str, str, Any]]]]:
+        # By default, nothing to push down
+        return []
+
+    def __and__(self, other: "Expr") -> "Expr": return And(self, other)
+    def __or__(self, other: "Expr") -> "Expr": return Or(self, other)
+    def __invert__(self) -> "Expr": return Not(self)
+
+
+@dataclass(frozen=True)
+class TrueExpr(Expr):
+    """Matches all rows; useful as a neutral starting point."""
+    def mask(self, df: dd.DataFrame) -> dd.Series:
+        return df.map_partitions(lambda p: pd.Series(True, index=p.index),
+                                 meta=pd.Series(dtype=bool))
+
+
+@dataclass(frozen=True)
+class ColOp(Expr):
+    field: str
+    casting: Optional[str]
+    op: str
+    value: Any
+    handler: "FilterHandler"  # reuse your parsing + Dask ops
+
+    def mask(self, df: dd.DataFrame) -> dd.Series:
+        col = self.handler._get_dask_column(df, self.field, self.casting)
+        val = self.handler._parse_filter_value(self.casting, self.value)
+        return self.handler._apply_operation_dask(col, self.op, val)
+
+    def to_parquet_filters(self):
+        # Only basic comparisons can be pushed down
+        if self.op not in {"exact", "gt", "gte", "lt", "lte", "in", "range"}:
+            return []
+        val = self.handler._parse_filter_value(self.casting, self.value)
+        if self.casting == "date":
+            if self.op == "range" and isinstance(val, (list, tuple)) and len(val) == 2:
+                lo, hi = pd.Timestamp(val[0]), pd.Timestamp(val[1])
+                return [(self.field, ">=", lo), (self.field, "<=", hi)]
+            if isinstance(val, list):
+                val = [pd.Timestamp(v) for v in val]
+            else:
+                val = pd.Timestamp(val)
+        if self.op == "exact": return [(self.field, "=", val)]
+        if self.op in {"gt", "gte", "lt", "lte"}:
+            sym = {"gt": ">", "gte": ">=", "lt": "<", "lte": "<="}[self.op]
+            return [(self.field, sym, val)]
+        if self.op == "in": return [(self.field, "in", list(val) if not isinstance(val, list) else val)]
+        if self.op == "range":
+            lo, hi = val
+            return [(self.field, ">=", lo), (self.field, "<=", hi)]
+        return []
+
+
+@dataclass(frozen=True)
+class And(Expr):
+    left: Expr; right: Expr
+    def mask(self, df: dd.DataFrame) -> dd.Series: return self.left.mask(df) & self.right.mask(df)
+    def to_parquet_filters(self):
+        # AND = concatenate both sides' AND-terms
+        return [*self.left.to_parquet_filters(), *self.right.to_parquet_filters()]
+
+
+@dataclass(frozen=True)
+class Or(Expr):
+    left: Expr; right: Expr
+    def mask(self, df: dd.DataFrame) -> dd.Series: return self.left.mask(df) | self.right.mask(df)
+    def to_parquet_filters(self):
+        # OR must be returned as list-of-lists; if either side has non-pushdown, defer to mask
+        lf, rf = self.left.to_parquet_filters(), self.right.to_parquet_filters()
+        if not lf or not rf:
+            return []
+        return [lf, rf]
+
+
+@dataclass(frozen=True)
+class Not(Expr):
+    inner: Expr
+    def mask(self, df: dd.DataFrame) -> dd.Series: return ~self.inner.mask(df)
+    def to_parquet_filters(self): return []
+
+
+# -------------------- Filter handler --------------------
 class FilterHandler:
     """
-    Handles the application of filters to
-
-
-    SQLAlchemy queries and Dask DataFrames. It supports multiple filtering operations, including
-    exact matches, comparisons, and string-related operations such as contains and regex. The handler
-    automatically determines and applies backend-specific processing, enabling seamless integration with
-    different data models or backends.
-
-    :ivar backend: The backend in use ('sqlalchemy' or 'dask').
-    :type backend: str
-    :ivar logger: An optional logger instance for debugging and logging purposes.
-    :type logger: Logger
-    :ivar backend_methods: A dictionary mapping backend-specific methods for column retrieval and operation application.
-    :type backend_methods: dict
+    Handles the application of filters to SQLAlchemy and Dask backends.
+    Also compiles dicts into deferred expressions (Expr) and can split
+    pushdown-friendly predicates from residual ones.
     """
     def __init__(self, backend, logger=None, debug=False):
-        """
-        Initialize the FilterHandler.
-
-        Args:
-            backend: The backend to use ('sqlalchemy' or 'dask').
-            logger: Optional logger for debugging purposes.
-        """
         self.backend = backend
-        self.logger = logger or Logger.default_logger(
-            logger_name=self.__class__.__name__)  # No-op logger if none provided
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
         self.backend_methods = self._get_backend_methods(backend)
 
-
-
-
+    # --------- NEW: pushdown helpers ---------
+    def _pushdown_ops(self) -> set[str]:
+        """Ops that can be translated to PyArrow parquet filters."""
+        return {"exact", "gt", "gte", "lt", "lte", "in", "range"}
 
-
-
-
-
+    def to_parquet_filters(self, filters: Optional[Dict[str, Any]] = None
+                           ) -> List[Tuple[str, str, Any]]:
+        """
+        Convert a subset of filters into PyArrow parquet filters (AND semantics).
+        Unsupported ops are skipped here and should be applied later as a Dask mask.
+        """
+        filters = filters or {}
+        out: List[Tuple[str, str, Any]] = []
 
-
-
+        for key, value in filters.items():
+            field, casting, op = self._parse_filter_key(key)
+            if op not in self._pushdown_ops():
+                continue
+
+            val = self._parse_filter_value(casting, value)
+
+            # Normalize dates to Timestamp for Arrow
+            if casting == "date":
+                if op == "range" and isinstance(val, (list, tuple)) and len(val) == 2:
+                    lo, hi = pd.Timestamp(val[0]), pd.Timestamp(val[1])
+                    out.extend([(field, ">=", lo), (field, "<=", hi)])
+                    continue
+                if isinstance(val, list):
+                    val = [pd.Timestamp(v) for v in val]
+                else:
+                    val = pd.Timestamp(val)
+
+            if op == "exact":
+                out.append((field, "=", val))
+            elif op in {"gt", "gte", "lt", "lte"}:
+                sym = {"gt": ">", "gte": ">=", "lt": "<", "lte": "<="}[op]
+                out.append((field, sym, val))
+            elif op == "in":
+                out.append((field, "in", list(val) if not isinstance(val, list) else val))
+            elif op == "range":
+                lo, hi = val
+                out.extend([(field, ">=", lo), (field, "<=", hi)])
+
+        return out
+
+    def split_pushdown_and_residual(self, filters: Dict[str, Any]
+                                    ) -> Tuple[List[Tuple[str, str, Any]], Dict[str, Any]]:
+        """
+        Split input filter dict into:
+          - parquet_filters: list of (col, op, val) tuples for dd.read_parquet(..., filters=...)
+          - residual_filters: dict to be applied later via a Dask boolean mask
         """
+        push_keys = set()
+        for key in filters.keys():
+            _, casting, op = self._parse_filter_key(key)
+            if op in self._pushdown_ops():
+                push_keys.add(key)
+
+        pushdown_subset = {k: filters[k] for k in push_keys}
+        parquet_filters = self.to_parquet_filters(pushdown_subset)
+        residual_filters = {k: v for k, v in filters.items() if k not in push_keys}
+        return parquet_filters, residual_filters
+
+    # --------- Expression compiler / mask builder ---------
+    def compile_filters(self, filters: Optional[Dict[str, Any]] = None) -> Expr:
+        """
+        Compile a dict into a deferred expression tree (no df required).
+        Supports boolean forms: {"$and": [...]}, {"$or": [...]}, {"$not": {...}}.
+        Default combination for plain dicts: AND of all terms.
+        """
+        filters = filters or {}
+        if not filters:
+            return TrueExpr()
+
+        # boolean forms
+        if "$and" in filters:
+            expr = TrueExpr()
+            for sub in filters["$and"]:
+                expr = expr & self.compile_filters(sub)
+            return expr
+
+        if "$or" in filters:
+            subs = [self.compile_filters(sub) for sub in filters["$or"]]
+            if not subs: return TrueExpr()
+            expr = subs[0]
+            for s in subs[1:]:
+                expr = expr | s
+            return expr
+
+        if "$not" in filters:
+            return ~self.compile_filters(filters["$not"])
+
+        # plain dict => AND across keys
+        expr: Expr = TrueExpr()
+        for key, value in filters.items():
+            field, casting, op = self._parse_filter_key(key)
+            expr = expr & ColOp(field=field, casting=casting, op=op, value=value, handler=self)
+        return expr
+
+    def build_mask_fn(self, filters: Optional[Dict[str, Any]] = None) -> Callable[[dd.DataFrame], dd.Series]:
+        """Return a callable (df -> boolean mask) without touching df now."""
+        expr = self.compile_filters(filters)
+        def _fn(df: dd.DataFrame) -> dd.Series:
+            return expr.mask(df)
+        return _fn
+
+    # --------- Existing “apply now” API (kept as-is) ---------
+    def apply_filters(self, query_or_df, model=None, filters=None):
         filters = filters or {}
         for key, value in filters.items():
             field_name, casting, operation = self._parse_filter_key(key)
             parsed_value = self._parse_filter_value(casting, value)
-            # print(field_name, casting, operation, parsed_value)
-            # Get the column and apply backend-specific transformations
             if self.backend == "sqlalchemy":
                 column = self.backend_methods["get_column"](field_name, model, casting)
                 condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
                 query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
-
             elif self.backend == "dask":
                 column = self.backend_methods["get_column"](query_or_df, field_name, casting)
                 condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
                 query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
             else:
                 raise ValueError(f"Unsupported backend: {self.backend}")
-
         return query_or_df
 
+    # --------- Parsing & backend plumbing (unchanged) ---------
     @staticmethod
     def _parse_filter_key(key):
         parts = key.split("__")
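To make the intent of the new Expr/pushdown machinery concrete, here is a small usage sketch. The column names and filter values are invented for illustration; only the FilterHandler methods shown in the hunk above are assumed. `build_mask_fn` produces a reusable df -> mask callable, and `split_pushdown_and_residual` separates predicates that PyArrow can prune at read time from those that must be evaluated as a Dask mask:

    import dask.dataframe as dd
    import pandas as pd
    from sibi_dst.df_helper.core import FilterHandler

    fh = FilterHandler("dask")

    ddf = dd.from_pandas(
        pd.DataFrame({
            "status": ["open", "closed", "open"],
            "created_at": pd.to_datetime(["2025-08-01", "2025-08-02", "2025-08-03"]),
        }),
        npartitions=1,
    )

    # Boolean composition via the $and / $or / $not forms supported by compile_filters.
    filters = {"$or": [{"status__exact": "open"},
                       {"created_at__date__gte": "2025-08-02"}]}
    mask_fn = fh.build_mask_fn(filters)      # no DataFrame is touched yet
    result = ddf[mask_fn(ddf)].compute()     # mask is evaluated lazily per partition

    # Plain dicts are AND-combined and can be split for parquet pushdown.
    pushdown, residual = fh.split_pushdown_and_residual(
        {"created_at__date__range": ["2025-08-01", "2025-08-02"],
         "status__icontains": "open"}
    )
    # pushdown -> two AND-ed (col, op, Timestamp) tuples on created_at, usable as
    #             dd.read_parquet(..., filters=pushdown)
    # residual -> {"status__icontains": "open"}, applied later as a Dask mask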
@@ -89,20 +253,16 @@ class FilterHandler:
         return field_name, casting, operation
 
     def _parse_filter_value(self, casting, value):
-        """
-        Convert filter value to appropriate type based on the casting (e.g., date).
-        """
         if casting == "date":
             if isinstance(value, str):
-
-
+                return pd.Timestamp(value)
+            if isinstance(value, list):
+                return [pd.Timestamp(v) for v in value]
+        elif casting == "time":
+            # convert to seconds since midnight
             if isinstance(value, list):
-
-
-        elif casting == "time" and isinstance(value, str):
-            parsed = datetime.time.fromisoformat(value)
-            self.logger.debug(f"Parsed value (time): {parsed}")
-            return parsed
+                return [self._time_to_seconds(v) for v in value]
+            return self._time_to_seconds(value)
         return value
 
     @staticmethod
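The `time` casting now normalizes the filter value to "seconds since midnight" via `_time_to_seconds` (defined further down), and `_get_dask_column` converts the column the same way, so time-of-day comparisons no longer rely on `datetime.time` objects surviving a Dask comparison. A tiny standalone illustration of the conversion:

    import datetime

    def time_to_seconds(t):
        # mirrors FilterHandler._time_to_seconds
        if isinstance(t, str):
            t = datetime.time.fromisoformat(t)
        return t.hour * 3600 + t.minute * 60 + t.second

    print(time_to_seconds("08:30:00"))   # 30600
    print(time_to_seconds("23:59:59"))   # 86399
    # A filter such as {"created_at__time__gte": "08:30:00"} therefore compares the
    # column's (hour*3600 + minute*60 + second) value against 30600.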
@@ -124,53 +284,35 @@ class FilterHandler:
 
     @staticmethod
     def _get_sqlalchemy_column(field_name, model, casting):
-        """
-        Retrieve and cast a column for SQLAlchemy based on the field name and casting.
-
-        Args:
-            field_name: The name of the field/column in the model.
-            model: The SQLAlchemy model.
-            casting: The casting type ('date', 'time', etc.).
-
-        Returns:
-            The SQLAlchemy column object, optionally cast or transformed.
-        """
         column = getattr(model, field_name, None)
         if not column:
             raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
 
         if casting == "date":
-            # Cast the column to Date for whole-date comparisons
             column = cast(column, Date)
         elif casting == "time":
-            # Cast the column to Time for time-specific comparisons
             column = cast(column, Time)
         elif casting in FilterHandler._date_operators():
-            # Extract date part (e.g., year, month) using SQLAlchemy functions
             column = func.extract(casting, column)
 
         return column
 
     @staticmethod
     def _get_dask_column(df, field_name, casting):
-
-
+        needs_dt = casting in (FilterHandler._dt_operators() + FilterHandler._date_operators())
+        column = dd.to_datetime(df[field_name], errors="coerce") if needs_dt else df[field_name]
 
-
-
-        field_name: The name of the field/column in the DataFrame.
-        casting: The casting type ('date', 'time', etc.).
-
-        Returns:
-            The Dask Series object, optionally cast or transformed.
-        """
-        column = dd.to_datetime(df[field_name], errors="coerce") if casting in FilterHandler._dt_operators() else df[
-            field_name]
+        if needs_dt:
+            column = FilterHandler._strip_tz(column)
 
         if casting == "date":
-            column = column.dt.floor("D")
+            column = column.dt.floor("D")
+        elif casting == "time":
+            # compare as "seconds since midnight"
+            column = (column.dt.hour * 3600 + column.dt.minute * 60 + column.dt.second)
         elif casting in FilterHandler._date_operators():
-
+            attr = "weekday" if casting == "week_day" else casting
+            column = getattr(column.dt, attr)
 
         return column
 
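Two behavioural details of the reworked `_get_dask_column` are worth noting: timezone-aware datetimes are normalized to naive UTC before comparison (via `_strip_tz`), and the `week_day` casting is mapped onto pandas' `.dt.weekday` accessor, so week-day filters compare against the Monday=0 convention. A short pandas check of that mapping, purely illustrative:

    import pandas as pd

    s = pd.Series(pd.to_datetime([
        "2025-08-04 10:00:00+00:00",   # a Monday, timezone-aware
        "2025-08-09 23:30:00+00:00",   # a Saturday
    ]))

    naive = s.dt.tz_convert("UTC").dt.tz_localize(None)   # what _strip_tz does per partition
    print(naive.dt.weekday.tolist())    # [0, 5] -> week_day filters use Monday=0
    print(naive.dt.floor("D").tolist()) # whole-day values used by the "date" casting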
@@ -204,13 +346,13 @@ class FilterHandler:
             "isnull": lambda col, val: col.is_(None) if val else col.isnot(None),
             "not_exact": lambda col, val: col != val,
             "not_contains": lambda col, val: ~col.like(f"%{val}%"),
-            "not_in": lambda col, val: ~col.in_(val),
-            "regex": lambda col, val: col.op("~")(val),
-            "icontains": lambda col, val: col.ilike(f"%{val}%"),
-            "istartswith": lambda col, val: col.ilike(f"{val}%"),
-            "iendswith": lambda col, val: col.ilike(f"%{val}"),
-            "iexact": lambda col, val: col.ilike(val),
-            "iregex": lambda col, val: col.op("~*")(val),
+            "not_in": lambda col, val: ~col.in_(val),
+            "regex": lambda col, val: col.op("~")(val),
+            "icontains": lambda col, val: col.ilike(f"%{val}%"),
+            "istartswith": lambda col, val: col.ilike(f"{val}%"),
+            "iendswith": lambda col, val: col.ilike(f"%{val}"),
+            "iexact": lambda col, val: col.ilike(val),
+            "iregex": lambda col, val: col.op("~*")(val),
         }
 
     @staticmethod
@@ -221,23 +363,54 @@ class FilterHandler:
             "gte": lambda col, val: col >= val,
             "lt": lambda col, val: col < val,
             "lte": lambda col, val: col <= val,
-
+
+            # type-safe "in" and "not_in"
+            "in": lambda col, val: FilterHandler._align_in_types(col, val)[0].isin(
+                FilterHandler._align_in_types(col, val)[1]),
+            "not_in": lambda col, val: ~FilterHandler._align_in_types(col, val)[0].isin(
+                FilterHandler._align_in_types(col, val)[1]),
+
             "range": lambda col, val: (col >= val[0]) & (col <= val[1]),
-
-
-            "
+
+            # robust string ops (dtype-agnostic)
+            "contains": lambda col, val: FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
+            "startswith": lambda col, val: FilterHandler._as_str(col).str.startswith(val, na=False),
+            "endswith": lambda col, val: FilterHandler._as_str(col).str.endswith(val, na=False),
+            "not_contains": lambda col, val: ~FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
+            "regex": lambda col, val: FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
+            "icontains": lambda col, val: FilterHandler._as_str(col).str.contains(val, case=False, regex=True, na=False),
+            "istartswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.startswith(str(val).lower(), na=False),
+            "iendswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.endswith(str(val).lower(), na=False),
+            "iexact": lambda col, val: FilterHandler._as_str(col).str.lower() == str(val).lower(),
+            "iregex": lambda col, val: FilterHandler._as_str(col).str.contains(val, case=False, regex=True, na=False),
+
             "isnull": lambda col, val: col.isnull() if val else col.notnull(),
             "not_exact": lambda col, val: col != val,
-            "not_contains": lambda col, val: ~col.str.contains(val, regex=True),
-            "not_in": lambda col, val: ~col.isin(val),  # Custom operation
-            "regex": lambda col, val: col.str.contains(val, regex=True),  # Custom operation
-            "icontains": lambda col, val: col.str.contains(val, case=False, regex=True),  # Custom operation
-            "istartswith": lambda col, val: col.str.startswith(val, case=False),  # Custom operation
-            "iendswith": lambda col, val: col.str.endswith(val, case=False),  # Custom operation
-            "iexact": lambda col, val: col.str.contains(f"^{val}$", case=False, regex=True),  # Added iexact
-            "iregex": lambda col, val: col.str.contains(val, case=False, regex=True),  # Added iregex
         }
 
+    @staticmethod
+    def _as_str(col):
+        return col.astype("string").fillna("")
+
+    @staticmethod
+    def _strip_tz(col):
+        import pandas as pd
+        def _part(s: pd.Series) -> pd.Series:
+            try:
+                return s.dt.tz_convert("UTC").dt.tz_localize(None)
+            except Exception:
+                try:
+                    return s.dt.tz_localize(None)
+                except Exception:
+                    return s
+        return col.map_partitions(_part, meta=col._meta)
+
+    @staticmethod
+    def _time_to_seconds(t):
+        if isinstance(t, str):
+            t = datetime.time.fromisoformat(t)
+        return t.hour * 3600 + t.minute * 60 + t.second
+
     @staticmethod
     def _dt_operators():
         return ["date", "time"]
@@ -255,3 +428,42 @@ class FilterHandler:
             "regex", "icontains", "istartswith", "iendswith",
             "iexact", "iregex"
         ]
+
+    @staticmethod
+    def _align_in_types(col, val):
+        # normalize val to a list
+        if isinstance(val, (set, tuple)):
+            vals = list(val)
+        elif isinstance(val, list):
+            vals = val
+        else:
+            vals = [val]
+
+        kind = getattr(getattr(col, "dtype", None), "kind", None)
+        if kind in ("i", "u"):  # integer
+            def to_ints(xs):
+                out = []
+                for x in xs:
+                    try:
+                        out.append(int(x))
+                    except Exception:
+                        return None
+                return out
+            ints = to_ints(vals)
+            if ints is not None:
+                return col.astype("Int64"), ints
+
+        if kind in ("f",):  # float
+            def to_floats(xs):
+                out = []
+                for x in xs:
+                    try:
+                        out.append(float(x))
+                    except Exception:
+                        return None
+                return out
+            flts = to_floats(vals)
+            if flts is not None:
+                return col.astype("float64"), flts
+
+        return FilterHandler._as_str(col), [str(x) for x in vals]
{sibi_dst-2025.8.2.dist-info → sibi_dst-2025.8.4.dist-info}/RECORD
CHANGED
@@ -2,15 +2,14 @@ sibi_dst/__init__.py,sha256=D01Z2Ds4zES8uz5Zp7qOWD0EcfCllWgew7AWt2X1SQg,445
 sibi_dst/df_helper/__init__.py,sha256=CyDXtFhRnMrycktxNO8jGGkP0938QiScl56kMZS1Sf8,578
 sibi_dst/df_helper/_artifact_updater_async.py,sha256=0lUwel-IkmKewRnmMv9GtuT-P6SivkIKtgOHvKchHlc,8462
 sibi_dst/df_helper/_artifact_updater_threaded.py,sha256=M5GNZismOqMmBrcyfolP1DPv87VILQf_P18is_epn50,7238
-sibi_dst/df_helper/_df_helper.py,sha256=
+sibi_dst/df_helper/_df_helper.py,sha256=nG5iITvwyRsdnPgTOql6-w47LEOsZUXYF7-tIM2yGBE,12798
 sibi_dst/df_helper/_parquet_artifact.py,sha256=tqYOjwxHV1MsADmn-RNFuVI_RrEvvmCJHZieRcsVXuc,12334
 sibi_dst/df_helper/_parquet_reader.py,sha256=tFq0OQVczozbKZou93vscokp2R6O2DIJ1zHbZqVjagc,3069
 sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJShqqTBQQGU_S6OOo,105
 sibi_dst/df_helper/backends/http/_http_config.py,sha256=eGPFdqZ5M3Tscqx2P93B6XoBEEzlmdt7yNg7PXUQnNQ,4726
-sibi_dst/df_helper/backends/parquet/__init__.py,sha256=
-sibi_dst/df_helper/backends/parquet/
-sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=FusVcLVysitLoc8Ui_zU4JMhdHW1MMn4i0vnMbl2K84,12017
+sibi_dst/df_helper/backends/parquet/__init__.py,sha256=0A6BGHZLwiLBmuBBaUvEHfeWTcInvy2NbymlrI_nuXE,104
+sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=yQ5pZuF2Tf7eM_krOPkxhPkDFtEKzV7BKjUerTqX0tg,12028
 sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=LjWm9B7CweTvlvFOgB90XjSe0lVLILAIYMWKPkFXFm8,265
 sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=R3_WY_lsQrfQwD6yAzH66MqvsgZdMd0HKcVChDQcbpM,8401
 sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py,sha256=GQwDy2JwPUx37vpwxPM5hg4ZydilPIP824y5C_clsl0,383
@@ -20,7 +19,7 @@ sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py,sha256=MHk64f5WDOKHQ_L
 sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=RiCaVPME5wzgZ9xUGY0JOs_c2C0KcDIbTeMGpPupIa0,5242
 sibi_dst/df_helper/core/__init__.py,sha256=LfmTqFh6GUZup-g95bcXgAxX7J5Hkve7ftLE_CJg_AE,409
 sibi_dst/df_helper/core/_defaults.py,sha256=9UMEMu2wXznO5UzEhnQ82f_ZazZ20JRyRXIi3HP3gDw,4043
-sibi_dst/df_helper/core/_filter_handler.py,sha256=
+sibi_dst/df_helper/core/_filter_handler.py,sha256=9C30zrT8wSGy1X8ryiTWc0XfnbpeoHndHgoOcHKOPOo,19309
 sibi_dst/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxRrQKE5FQRxcEWsac,6736
 sibi_dst/df_helper/core/_query_config.py,sha256=1ApqmuSGXTC3CdF-xMsSbCa3V2Z5hOP3Wq5huhzZwqY,439
 sibi_dst/df_helper/data_cleaner.py,sha256=lkxQoXLvGzXCicFUimnA5nen5qkrO1oxgl_p2Be2o8w,5183
@@ -79,6 +78,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.8.
-sibi_dst-2025.8.
-sibi_dst-2025.8.
+sibi_dst-2025.8.4.dist-info/METADATA,sha256=LFL_mbMveA_TrO5zelvtZ1rBiEuMWtvhjrAs42DnOd0,2610
+sibi_dst-2025.8.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.8.4.dist-info/RECORD,,
sibi_dst/df_helper/backends/parquet/_filter_handler.py
DELETED
@@ -1,126 +0,0 @@
-import dask.dataframe as dd
-import pandas as pd
-
-from sibi_dst.utils import Logger
-
-
-class ParquetFilterHandler(object):
-    """
-    Handles parquet filtering operations using dask dataframes.
-
-    This class is designed to apply complex filtering logic on dask dataframes
-    based on specified filter criteria. It includes support for operations such
-    as exact matches, ranges, string pattern matches, and null checks. Additionally,
-    it handles datetime-related field filtering including precise truncations and
-    specific date/time attributes.
-
-    :ivar logger: Logger object to handle logging within the class. Defaults to the class-level logger.
-    :type logger: Logger
-    """
-    def __init__(self, logger=None, debug=False):
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
-
-    @staticmethod
-    def apply_filters_dask(df, filters):
-        """
-        Applies a set of filters to a Dask DataFrame, enabling complex filtering operations
-        such as comparisons, ranges, string match operations, and more. Handles special
-        cases for datetime operations, including casting and extracting specific datetime
-        components for filtering.
-
-        :param df: Dask DataFrame to which the filters will be applied.
-        :type df: dask.dataframe.DataFrame
-        :param filters: Dictionary defining the filtering logic, where the keys specify
-            the column name and filter operation, and the values specify the corresponding
-            filter values to apply.
-        :type filters: dict
-        :return: A filtered Dask DataFrame based on the defined logic in the filters.
-        :rtype: dask.dataframe.DataFrame
-        :raises ValueError: If an unsupported operation is encountered in the filters.
-        """
-        dt_operators = ['date', 'time']
-        date_operators = ['year', 'month', 'day', 'hour', 'minute', 'second', 'week_day']
-        comparison_operators = [
-            'gte',
-            'lte',
-            'gt',
-            'lt',
-            'exact',
-            'in',
-            'range',
-            'contains',
-            'icontains',
-            'startswith',
-            'endswith',
-            'isnull'
-        ]
-
-        operation_map = {
-            'exact': lambda col, val: col == val,
-            'gt': lambda col, val: col > val,
-            'gte': lambda col, val: col >= val,
-            'lt': lambda col, val: col < val,
-            'lte': lambda col, val: col <= val,
-            'in': lambda col, val: col.isin(val),
-            'range': lambda col, val: (col >= val[0]) & (col <= val[1]),
-            'contains': lambda col, val: col.str.contains(val, regex=True),
-            'icontains': lambda col, val: col.str.contains(val, case=False),
-            'startswith': lambda col, val: col.str.startswith(val),
-            'endswith': lambda col, val: col.str.endswith(val),
-            'isnull': lambda col, val: col.isnull() if val else col.notnull(),
-        }
-
-        def parse_filter_value(casting, value):
-            """
-            Convert filter value to appropriate type based on the casting (e.g., date).
-            """
-            if casting == 'date':
-                if isinstance(value, str):
-                    return pd.Timestamp(value)  # Convert to datetime64[ns]
-                if isinstance(value, list):
-                    return [pd.Timestamp(v) for v in value]  # Convert list elements
-            return value
-
-        def get_temp_col(dask_df, field_name, casting):
-            """
-            Handle datetime conversion and field retrieval.
-            """
-            temp_col = dd.to_datetime(dask_df[field_name], errors='coerce') if casting in dt_operators else dask_df[
-                field_name]
-            if casting == 'date':
-                temp_col = temp_col.dt.floor('D')  # Keep it as datetime64[ns] truncated to the day level
-            elif casting in date_operators:
-                temp_col = getattr(temp_col.dt, casting)
-            return temp_col
-
-        for key, value in filters.items():
-            parts = key.split('__')
-            field_name = parts[0]
-            casting = None
-            operation = 'exact'
-
-            if len(parts) == 3:
-                # Adjust logic based on the parts
-                _, casting, operation = parts
-            elif len(parts) == 2:
-                # Could be either a casting or an operation
-                if parts[1] in comparison_operators:
-                    operation = parts[1]
-                elif parts[1] in dt_operators + date_operators:
-                    casting = parts[1]
-
-            # Convert the filter value to the correct type
-            parsed_value = parse_filter_value(casting, value)
-
-            # Get the column to filter
-            temp_col = get_temp_col(df, field_name, casting)
-
-            if operation in operation_map:
-                # Apply the filter operation
-                condition = operation_map[operation](temp_col, parsed_value)
-                df = df[condition]
-            else:
-                raise ValueError(f"Unsupported operation: {operation}")
-
-        return df
{sibi_dst-2025.8.2.dist-info → sibi_dst-2025.8.4.dist-info}/WHEEL
File without changes