sibi-dst 2025.8.2__py3-none-any.whl → 2025.8.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -7,7 +7,7 @@ import pandas as pd
  from fsspec import AbstractFileSystem
  from pydantic import BaseModel

- from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
+ from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
  from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
  from .backends.http import HttpConfig
  from .backends.parquet import ParquetConfig
@@ -54,24 +54,40 @@ class SqlAlchemyBackend(BaseBackend):
  class ParquetBackend(BaseBackend):
  def load(self, **options):
  try:
- df = self.helper.backend_parquet.load_files()
- if len(df.head(1)) == 0:
- return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
-
- if options and df is not None:
- df = FilterHandler("dask", logger=self.logger, debug=False).apply_filters(df, filters=options)
- if len(df.head(1)) == 0:
- self.logger.debug("No records after filters; returning empty DataFrame.")
- return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+ df = self.helper.backend_parquet.load_files(**options)
+ if self._is_empty(df):
+ return -1, self._empty_like(df)
+ nrows = self._row_count(df)
+ if nrows == 0:
+ self.logger.debug("No records after filters; returning empty DataFrame.")
+ return 0, self._empty_like(df)

  df = df.persist()
- self.total_records = len(df) or -1
+ self.total_records = self._row_count(df) or -1
  return self.total_records, df
+
  except Exception as e:
- self.total_records = -1
+ self.total_records = -1 # Reset total_records on failure
  self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
  return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)

+ def _is_empty(self, ddf) -> bool:
+ """True if no rows across all partitions."""
+ try:
+ # head with npartitions=-1 walks partitions until it gets n rows
+ return ddf.head(1, npartitions=-1).shape[0] == 0
+ except Exception:
+ return True
+
+ def _row_count(self, ddf) -> int:
+ """Reliable row count for Dask DataFrame."""
+ return int(ddf.map_partitions(len).sum().compute())
+
+ def _empty_like(self, ddf):
+ """Return an empty Dask DF with the SAME columns/dtypes."""
+ empty_pdf = ddf._meta.iloc[0:0]
+ return dd.from_pandas(empty_pdf, npartitions=1)
+

  class HttpBackend(BaseBackend):
  def load(self, **options):
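Editor's note: the sketch below is not part of the package; it only illustrates the Dask idioms behind the new `_is_empty`, `_row_count`, and `_empty_like` helpers, using made-up column names.

```python
import dask.dataframe as dd
import pandas as pd

# Illustrative frame; the filter leaves no matching rows.
pdf = pd.DataFrame({"id": [1, 2, 3], "status": ["A", "B", "A"]})
ddf = dd.from_pandas(pdf, npartitions=2)
filtered = ddf[ddf["status"] == "C"]

# head(1, npartitions=-1) keeps scanning partitions until it finds a row,
# so it is a safe emptiness probe even when the first partition is empty.
is_empty = filtered.head(1, npartitions=-1).shape[0] == 0

# map_partitions(len).sum() counts rows without relying on len(df).
nrows = int(filtered.map_partitions(len).sum().compute())

# _meta.iloc[0:0] keeps the original columns/dtypes in the empty result.
empty_like = dd.from_pandas(filtered._meta.iloc[0:0], npartitions=1)
print(is_empty, nrows, list(empty_like.columns))  # True 0 ['id', 'status']
```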
@@ -279,295 +295,4 @@ class DfHelper(ManagedResource):
  self.logger.debug(f"Period load generated filters: {kwargs}")
  return kwargs

- # from __future__ import annotations
- #
- # from typing import Any, Dict, Optional, Union, TypeVar
- #
- # import dask.dataframe as dd
- # import pandas as pd
- # from fsspec import AbstractFileSystem
- # from pydantic import BaseModel
- #
- # from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
- # from sibi_dst.utils import ManagedResource
- # from sibi_dst.utils import ParquetSaver, ClickHouseWriter
- # from .backends.http import HttpConfig
- # from .backends.parquet import ParquetConfig
- # from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
- #
- # T = TypeVar("T", bound=BaseModel)
- #
- #
- # # --- Backend Strategy Pattern Implementation ---
- #
- # class BaseBackend:
- # """Abstract base class defining clear sync and async loading interfaces."""
- #
- # def __init__(self, helper: DfHelper):
- # self.helper = helper
- # self.logger = helper.logger
- # self.debug = helper.debug
- # self.total_records = helper.total_records # no records loaded yet
- #
- # def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
- # """Synchronous data loading method. Must be implemented by sync backends."""
- # raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
- #
- # async def aload(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
- # """Asynchronous data loading method. By default, it calls the sync version."""
- # return self.load(**options)
- #
- #
- # class SqlAlchemyBackend(BaseBackend):
- # def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
- # try:
- # # Process incoming filter options into the ParamsConfig object
- # if options and hasattr(self.helper._backend_params, 'parse_params'):
- # self.helper._backend_params.parse_params(options)
- #
- # with SqlAlchemyLoadFromDb(
- # plugin_sqlalchemy=self.helper.backend_db_connection,
- # plugin_query=self.helper._backend_query,
- # plugin_params=self.helper._backend_params,
- # logger=self.logger,
- # debug=self.debug
- # ) as db_loader:
- # self.total_records, result = db_loader.build_and_load()
- # return self.total_records, result
- # except Exception as e:
- # self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
- # return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
- #
- #
- # class ParquetBackend(BaseBackend):
- # """This backend is also purely synchronous."""
- #
- # def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
- # try:
- # df = self.helper.backend_parquet.load_files()
- # if len(df.head(1)) == 0:
- # return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
- # if options and df is not None:
- # df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
- # if len(df.head(1)) == 0:
- # self.logger.debug("No records found after applying filters; returning empty DataFrame.")
- # return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
- # df = df.persist()
- #
- # self.total_records = len(df) or -1 # If df is empty, set total_records to -1
- # return self.total_records, df
- # except Exception as e:
- # self.total_records = -1 # Reset total_records on failure
- # self.logger.error(f"Failed to load data from parquet: {e}")
- # return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
- #
- #
- # class HttpBackend(BaseBackend):
- # """This backend is purely asynchronous."""
- #
- # def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
- # # This will correctly fail by raising NotImplementedError from the base class.
- # return self.helper.backend_http.fetch_data(**options)
- #
- # async def aload(self, **options) -> tuple[Any, Any] | Union[pd.DataFrame, dd.DataFrame]:
- # if not self.helper.backend_http:
- # self.logger.warning("HTTP plugin not configured properly.")
- # self.total_records = -1
- # return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
- # result = await self.helper.backend_http.fetch_data(**options)
- # self.total_records = len(result)
- # return self.total_records, result
- #
- #
- # # --- Main DfHelper Facade Class ---
- #
- # class DfHelper(ManagedResource):
- # """
- # A reusable utility for loading data. It provides both sync (`load`) and
- # async (`aload`) methods to accommodate different backends.
- # """
- # _BACKEND_STRATEGIES = {
- # 'sqlalchemy': SqlAlchemyBackend,
- # 'parquet': ParquetBackend,
- # 'http': HttpBackend,
- # }
- #
- # _BACKEND_ATTR_MAP = {
- # 'sqlalchemy': 'backend_db_connection',
- # 'parquet': 'backend_parquet',
- # 'http': 'backend_http',
- # }
- #
- # default_config: Dict = None
- #
- # def __init__(self, backend='sqlalchemy', **kwargs):
- # self.default_config = self.default_config or {}
- # kwargs = {**self.default_config.copy(), **kwargs}
- # super().__init__(**kwargs)
- # self.backend = backend
- #
- # # Need to set default values for backend-specific configurations
- # kwargs.setdefault("debug", self.debug)
- # kwargs.setdefault("fs", self.fs)
- # kwargs.setdefault("logger", self.logger)
- # self.total_records = -1 # Initialize total_records to -1 to indicate no records loaded yet
- # self._backend_query = self._get_config(QueryConfig, kwargs)
- # self._backend_params = self._get_config(ParamsConfig, kwargs)
- # self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
- # self.backend_parquet: Optional[ParquetConfig] = None
- # self.backend_http: Optional[HttpConfig] = None
- #
- # if self.backend == 'sqlalchemy':
- # self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
- # elif self.backend == 'parquet':
- # self.backend_parquet = self._get_config(ParquetConfig, kwargs)
- # elif self.backend == 'http':
- # self.backend_http = self._get_config(HttpConfig, kwargs)
- #
- # strategy_class = self._BACKEND_STRATEGIES.get(self.backend)
- # if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
- # self.backend_strategy = strategy_class(self)
- #
- # def _cleanup(self):
- # attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
- # if not attr_name:
- # self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
- # return
- # # Get the actual config object (e.g., self.backend_db_connection)
- # active_config = getattr(self, attr_name, None)
- #
- # if active_config and hasattr(active_config, "close"):
- # self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
- # active_config.close()
- #
- # async def _acleanup(self):
- # self.logger.warning("DfHelper instance was not used in an async context manager; cleanup is being called manually.")
- # attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
- # if not attr_name:
- # self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
- # return
- # # Get the actual config object (e.g., self.backend_db_connection)
- # active_config = getattr(self, attr_name, None)
- # if active_config and hasattr(active_config, "aclose"):
- # self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
- # await active_config.aclose()
- #
- # def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
- # recognized_keys = set(model.model_fields.keys())
- # model_kwargs = {k: kwargs[k] for k in recognized_keys if k in kwargs}
- # return model(**model_kwargs)
- #
- # def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
- # """Loads data synchronously. Fails if backend is async-only."""
- # self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
- # self.total_records, df = self.backend_strategy.load(**options)
- # df = self._process_loaded_data(df)
- # df = self._post_process_df(df)
- # self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
- # return df.compute() if as_pandas else df
- #
- # async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
- # """Loads data asynchronously from any backend."""
- # self.total_records, df = await self.backend_strategy.aload(**options)
- # df = self._process_loaded_data(df)
- # df = self._post_process_df(df)
- # return df.compute() if as_pandas else df
- #
- # def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
- # self.logger.debug("Post-processing DataFrame.")
- # df_params = self._backend_params.df_params
- # if not df_params: return df
- # fieldnames, column_names, index_col = (df_params.get("fieldnames"), df_params.get("column_names"),
- # df_params.get("index_col"))
- # if not any([fieldnames, column_names, index_col]): return df
- #
- # if fieldnames:
- # valid_fieldnames = [f for f in fieldnames if f in df.columns]
- # if len(valid_fieldnames) < len(fieldnames): self.logger.warning(
- # f"Missing columns for filtering: {set(fieldnames) - set(valid_fieldnames)}")
- # df = df[valid_fieldnames]
- # if column_names:
- # if len(df.columns) != len(column_names): raise ValueError(
- # f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided.")
- # df = df.rename(columns=dict(zip(df.columns, column_names)))
- # if index_col:
- # if index_col not in df.columns: raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
- # df = df.set_index(index_col)
- # self.logger.debug("Post-processing complete.")
- # return df
- #
- # def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
- # field_map = self._backend_params.field_map or {}
- # if not isinstance(field_map, dict) or not field_map: return df
- # if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)): return df
- # self.logger.debug("Processing loaded data...applying rename mapping if necessary.")
- # rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
- # if rename_mapping: df = df.rename(columns=rename_mapping)
- # self.logger.debug("Rename mapping complete...")
- # return df
- #
- # def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
- # """Saves a Dask DataFrame to a Parquet file with validation."""
- #
- # # Use .get() for cleaner access to optional arguments.
- # fs: AbstractFileSystem = kwargs.get('fs', self.fs)
- # path: str = kwargs.get('parquet_storage_path')
- #
- # # Guard clauses to fail fast with clear errors.
- # if not fs:
- # raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
- # if not path:
- # raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
- #
- # # An efficient, idiomatic way to check if a Dask DataFrame is empty.
- # if len(df.head(1)) == 0:
- # self.logger.warning("Skipping save: The provided DataFrame is empty.")
- # return
- #
- # with ParquetSaver(
- # df_result=df,
- # parquet_storage_path=path,
- # fs=fs,
- # debug=self.debug,
- # logger=self.logger,
- # verbose=self.verbose,
- # **kwargs
- # ) as saver:
- # saver.save_to_parquet(parquet_filename)
- #
- # self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")
- #
- # def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
- # if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
- # self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
- # return
- #
- # with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
- # writer.save_to_clickhouse(df)
- # self.logger.debug("Save to ClickHouse completed.")
- #
- # def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
- # """Synchronous convenience method for loading a date range."""
- # final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
- # return self.load(**final_kwargs)
- #
- # async def aload_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
- # """Asynchronous convenience method for loading a date range."""
- # final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
- # return await self.aload(**final_kwargs)
- #
- # def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
- # start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
- # if start_date > end_date:
- # raise ValueError("'start' date cannot be later than 'end' date.")
- # field_map = self._backend_params.field_map or {}
- # reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
- # if len(reverse_map) != len(field_map):
- # self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
- # mapped_field = reverse_map.get(dt_field, dt_field)
- # if start_date == end_date:
- # kwargs[f"{mapped_field}__date"] = start_date
- # else:
- # kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
- # self.logger.debug(f"Period load generated filters: {kwargs}")
- # return kwargs
+
@@ -1,9 +1,7 @@
  from __future__ import annotations

- from ._filter_handler import ParquetFilterHandler
  from ._parquet_options import *

  __all__ = [
  "ParquetConfig",
- "ParquetFilterHandler",
  ]
@@ -175,7 +175,7 @@ class ParquetConfig(BaseModel):
  total_size += self.fs.size(path)
  return total_size

- def load_files(self):
+ def load_files(self, **filters):
  """
  Loads parquet files into a Dask DataFrame based on the specified conditions. This
  method checks if parquet file loading is enabled and loads either from a list of
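Editor's note: since `load_files` now accepts keyword filters, the double-underscore keys parsed by `FilterHandler._parse_filter_key` (`field[__casting][__operation]`) can be passed straight through the loading call. The sketch below is illustrative only; the field names and configuration keyword arguments are assumptions, not taken from the package:

```python
# Hypothetical configuration; the filter-key syntax is the point here.
helper = DfHelper(
    backend="parquet",
    parquet_storage_path="s3://bucket/dataset/",  # illustrative path
)

# Django-style filter kwargs flow from DfHelper.load() through the
# parquet backend into ParquetConfig.load_files(**filters).
df = helper.load(
    created_at__date__range=["2025-01-01", "2025-01-31"],
    status__in=["active", "pending"],
    name__icontains="acme",
)
```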
@@ -1,4 +1,6 @@
  import datetime
+ from dataclasses import dataclass
+ from typing import Optional, Dict, Any, List, Union, Tuple, Callable

  import dask.dataframe as dd
  import pandas as pd
@@ -8,69 +10,231 @@ from sqlalchemy.sql.sqltypes import Date, Time
  from sibi_dst.utils import Logger


+ # -------------------- Deferred filter expression AST --------------------
+ class Expr:
+ def mask(self, df: dd.DataFrame) -> dd.Series:
+ raise NotImplementedError
+
+ def to_parquet_filters(self) -> List[Union[Tuple[str, str, Any], List[Tuple[str, str, Any]]]]:
+ # By default, nothing to push down
+ return []
+
+ def __and__(self, other: "Expr") -> "Expr": return And(self, other)
+ def __or__(self, other: "Expr") -> "Expr": return Or(self, other)
+ def __invert__(self) -> "Expr": return Not(self)
+
+
+ @dataclass(frozen=True)
+ class TrueExpr(Expr):
+ """Matches all rows; useful as a neutral starting point."""
+ def mask(self, df: dd.DataFrame) -> dd.Series:
+ return df.map_partitions(lambda p: pd.Series(True, index=p.index),
+ meta=pd.Series(dtype=bool))
+
+
+ @dataclass(frozen=True)
+ class ColOp(Expr):
+ field: str
+ casting: Optional[str]
+ op: str
+ value: Any
+ handler: "FilterHandler" # reuse your parsing + Dask ops
+
+ def mask(self, df: dd.DataFrame) -> dd.Series:
+ col = self.handler._get_dask_column(df, self.field, self.casting)
+ val = self.handler._parse_filter_value(self.casting, self.value)
+ return self.handler._apply_operation_dask(col, self.op, val)
+
+ def to_parquet_filters(self):
+ # Only basic comparisons can be pushed down
+ if self.op not in {"exact", "gt", "gte", "lt", "lte", "in", "range"}:
+ return []
+ val = self.handler._parse_filter_value(self.casting, self.value)
+ if self.casting == "date":
+ if self.op == "range" and isinstance(val, (list, tuple)) and len(val) == 2:
+ lo, hi = pd.Timestamp(val[0]), pd.Timestamp(val[1])
+ return [(self.field, ">=", lo), (self.field, "<=", hi)]
+ if isinstance(val, list):
+ val = [pd.Timestamp(v) for v in val]
+ else:
+ val = pd.Timestamp(val)
+ if self.op == "exact": return [(self.field, "=", val)]
+ if self.op in {"gt","gte","lt","lte"}:
+ sym = {"gt": ">", "gte": ">=", "lt": "<", "lte": "<="}[self.op]
+ return [(self.field, sym, val)]
+ if self.op == "in": return [(self.field, "in", list(val) if not isinstance(val, list) else val)]
+ if self.op == "range":
+ lo, hi = val
+ return [(self.field, ">=", lo), (self.field, "<=", hi)]
+ return []
+
+
+ @dataclass(frozen=True)
+ class And(Expr):
+ left: Expr; right: Expr
+ def mask(self, df: dd.DataFrame) -> dd.Series: return self.left.mask(df) & self.right.mask(df)
+ def to_parquet_filters(self):
+ # AND = concatenate both sides' AND-terms
+ return [*self.left.to_parquet_filters(), *self.right.to_parquet_filters()]
+
+
+ @dataclass(frozen=True)
+ class Or(Expr):
+ left: Expr; right: Expr
+ def mask(self, df: dd.DataFrame) -> dd.Series: return self.left.mask(df) | self.right.mask(df)
+ def to_parquet_filters(self):
+ # OR must be returned as list-of-lists; if either side has non-pushdown, defer to mask
+ lf, rf = self.left.to_parquet_filters(), self.right.to_parquet_filters()
+ if not lf or not rf:
+ return []
+ return [lf, rf]
+
+
+ @dataclass(frozen=True)
+ class Not(Expr):
+ inner: Expr
+ def mask(self, df: dd.DataFrame) -> dd.Series: return ~self.inner.mask(df)
+ def to_parquet_filters(self): return []
+
+
+ # -------------------- Filter handler --------------------
  class FilterHandler:
  """
- Handles the application of filters to data sources with support for SQLAlchemy and Dask backends.
-
- The FilterHandler class abstracts the process of applying filters to various backends, specifically
- SQLAlchemy queries and Dask DataFrames. It supports multiple filtering operations, including
- exact matches, comparisons, and string-related operations such as contains and regex. The handler
- automatically determines and applies backend-specific processing, enabling seamless integration with
- different data models or backends.
-
- :ivar backend: The backend in use ('sqlalchemy' or 'dask').
- :type backend: str
- :ivar logger: An optional logger instance for debugging and logging purposes.
- :type logger: Logger
- :ivar backend_methods: A dictionary mapping backend-specific methods for column retrieval and operation application.
- :type backend_methods: dict
+ Handles the application of filters to SQLAlchemy and Dask backends.
+ Also compiles dicts into deferred expressions (Expr) and can split
+ pushdown-friendly predicates from residual ones.
  """
  def __init__(self, backend, logger=None, debug=False):
- """
- Initialize the FilterHandler.
-
- Args:
- backend: The backend to use ('sqlalchemy' or 'dask').
- logger: Optional logger for debugging purposes.
- """
  self.backend = backend
- self.logger = logger or Logger.default_logger(
- logger_name=self.__class__.__name__) # No-op logger if none provided
+ self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
  self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
  self.backend_methods = self._get_backend_methods(backend)

- def apply_filters(self, query_or_df, model=None, filters=None):
- """
- Apply filters to the data source based on the backend.
+ # --------- NEW: pushdown helpers ---------
+ def _pushdown_ops(self) -> set[str]:
+ """Ops that can be translated to PyArrow parquet filters."""
+ return {"exact", "gt", "gte", "lt", "lte", "in", "range"}

- Args:
- query_or_df: SQLAlchemy query or Dask DataFrame.
- model: SQLAlchemy model (required for SQLAlchemy backend).
- filters: Dictionary of filters.
+ def to_parquet_filters(self, filters: Optional[Dict[str, Any]] = None
+ ) -> List[Tuple[str, str, Any]]:
+ """
+ Convert a subset of filters into PyArrow parquet filters (AND semantics).
+ Unsupported ops are skipped here and should be applied later as a Dask mask.
+ """
+ filters = filters or {}
+ out: List[Tuple[str, str, Any]] = []

- Returns:
- Filtered query or DataFrame.
+ for key, value in filters.items():
+ field, casting, op = self._parse_filter_key(key)
+ if op not in self._pushdown_ops():
+ continue
+
+ val = self._parse_filter_value(casting, value)
+
+ # Normalize dates to Timestamp for Arrow
+ if casting == "date":
+ if op == "range" and isinstance(val, (list, tuple)) and len(val) == 2:
+ lo, hi = pd.Timestamp(val[0]), pd.Timestamp(val[1])
+ out.extend([(field, ">=", lo), (field, "<=", hi)])
+ continue
+ if isinstance(val, list):
+ val = [pd.Timestamp(v) for v in val]
+ else:
+ val = pd.Timestamp(val)
+
+ if op == "exact":
+ out.append((field, "=", val))
+ elif op in {"gt", "gte", "lt", "lte"}:
+ sym = {"gt": ">", "gte": ">=", "lt": "<", "lte": "<="}[op]
+ out.append((field, sym, val))
+ elif op == "in":
+ out.append((field, "in", list(val) if not isinstance(val, list) else val))
+ elif op == "range":
+ lo, hi = val
+ out.extend([(field, ">=", lo), (field, "<=", hi)])
+
+ return out
+
+ def split_pushdown_and_residual(self, filters: Dict[str, Any]
+ ) -> Tuple[List[Tuple[str, str, Any]], Dict[str, Any]]:
+ """
+ Split input filter dict into:
+ - parquet_filters: list of (col, op, val) tuples for dd.read_parquet(..., filters=...)
+ - residual_filters: dict to be applied later via a Dask boolean mask
  """
+ push_keys = set()
+ for key in filters.keys():
+ _, casting, op = self._parse_filter_key(key)
+ if op in self._pushdown_ops():
+ push_keys.add(key)
+
+ pushdown_subset = {k: filters[k] for k in push_keys}
+ parquet_filters = self.to_parquet_filters(pushdown_subset)
+ residual_filters = {k: v for k, v in filters.items() if k not in push_keys}
+ return parquet_filters, residual_filters
+
+ # --------- Expression compiler / mask builder ---------
+ def compile_filters(self, filters: Optional[Dict[str, Any]] = None) -> Expr:
+ """
+ Compile a dict into a deferred expression tree (no df required).
+ Supports boolean forms: {"$and": [...]}, {"$or": [...]}, {"$not": {...}}.
+ Default combination for plain dicts: AND of all terms.
+ """
+ filters = filters or {}
+ if not filters:
+ return TrueExpr()
+
+ # boolean forms
+ if "$and" in filters:
+ expr = TrueExpr()
+ for sub in filters["$and"]:
+ expr = expr & self.compile_filters(sub)
+ return expr
+
+ if "$or" in filters:
+ subs = [self.compile_filters(sub) for sub in filters["$or"]]
+ if not subs: return TrueExpr()
+ expr = subs[0]
+ for s in subs[1:]:
+ expr = expr | s
+ return expr
+
+ if "$not" in filters:
+ return ~self.compile_filters(filters["$not"])
+
+ # plain dict => AND across keys
+ expr: Expr = TrueExpr()
+ for key, value in filters.items():
+ field, casting, op = self._parse_filter_key(key)
+ expr = expr & ColOp(field=field, casting=casting, op=op, value=value, handler=self)
+ return expr
+
+ def build_mask_fn(self, filters: Optional[Dict[str, Any]] = None) -> Callable[[dd.DataFrame], dd.Series]:
+ """Return a callable (df -> boolean mask) without touching df now."""
+ expr = self.compile_filters(filters)
+ def _fn(df: dd.DataFrame) -> dd.Series:
+ return expr.mask(df)
+ return _fn
+
+ # --------- Existing “apply now” API (kept as-is) ---------
+ def apply_filters(self, query_or_df, model=None, filters=None):
  filters = filters or {}
  for key, value in filters.items():
  field_name, casting, operation = self._parse_filter_key(key)
  parsed_value = self._parse_filter_value(casting, value)
- # print(field_name, casting, operation, parsed_value)
- # Get the column and apply backend-specific transformations
  if self.backend == "sqlalchemy":
  column = self.backend_methods["get_column"](field_name, model, casting)
  condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
  query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
-
  elif self.backend == "dask":
  column = self.backend_methods["get_column"](query_or_df, field_name, casting)
  condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
  query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
  else:
  raise ValueError(f"Unsupported backend: {self.backend}")
-
  return query_or_df

+ # --------- Parsing & backend plumbing (unchanged) ---------
  @staticmethod
  def _parse_filter_key(key):
  parts = key.split("__")
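Editor's note: the following usage sketch is not part of the package; it illustrates how the new pushdown split and deferred-mask APIs are intended to combine, with an illustrative dataset path and field names.

```python
import dask.dataframe as dd

fh = FilterHandler("dask")

filters = {
    "created_at__date__range": ["2025-01-01", "2025-01-31"],  # pushdown-friendly
    "name__icontains": "acme",                                 # residual string op
}

# Split into PyArrow-style tuples and the remainder to apply as a mask.
parquet_filters, residual = fh.split_pushdown_and_residual(filters)
# parquet_filters -> [("created_at", ">=", Timestamp(...)), ("created_at", "<=", Timestamp(...))]
# residual        -> {"name__icontains": "acme"}

# Hypothetical read path: push the simple predicates into the parquet scan,
# then apply the residual filters through a deferred boolean mask.
ddf = dd.read_parquet("s3://bucket/dataset/", filters=parquet_filters)  # illustrative path
mask_fn = fh.build_mask_fn(residual)
ddf = ddf[mask_fn(ddf)]
```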
@@ -89,20 +253,16 @@ class FilterHandler:
  return field_name, casting, operation

  def _parse_filter_value(self, casting, value):
- """
- Convert filter value to appropriate type based on the casting (e.g., date).
- """
  if casting == "date":
  if isinstance(value, str):
- parsed = pd.Timestamp(value) # Convert to datetime64[ns]
- return parsed
+ return pd.Timestamp(value)
+ if isinstance(value, list):
+ return [pd.Timestamp(v) for v in value]
+ elif casting == "time":
+ # convert to seconds since midnight
  if isinstance(value, list):
- parsed = [pd.Timestamp(v) for v in value]
- return parsed
- elif casting == "time" and isinstance(value, str):
- parsed = datetime.time.fromisoformat(value)
- self.logger.debug(f"Parsed value (time): {parsed}")
- return parsed
+ return [self._time_to_seconds(v) for v in value]
+ return self._time_to_seconds(value)
  return value

  @staticmethod
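Editor's note: with this change, `time` values are compared as seconds since midnight on both sides of the filter (the value via `_time_to_seconds`, the column via the `hour*3600 + minute*60 + second` expression added in `_get_dask_column`). A minimal illustration, calling the (private) parser directly:

```python
fh = FilterHandler("dask")

# "14:30:00" -> 14*3600 + 30*60 = 52200 seconds since midnight.
assert fh._parse_filter_value("time", "14:30:00") == 52200
assert fh._parse_filter_value("time", ["08:00:00", "17:00:00"]) == [28800, 61200]
```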
@@ -124,53 +284,35 @@ class FilterHandler:

  @staticmethod
  def _get_sqlalchemy_column(field_name, model, casting):
- """
- Retrieve and cast a column for SQLAlchemy based on the field name and casting.
-
- Args:
- field_name: The name of the field/column in the model.
- model: The SQLAlchemy model.
- casting: The casting type ('date', 'time', etc.).
-
- Returns:
- The SQLAlchemy column object, optionally cast or transformed.
- """
  column = getattr(model, field_name, None)
  if not column:
  raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")

  if casting == "date":
- # Cast the column to Date for whole-date comparisons
  column = cast(column, Date)
  elif casting == "time":
- # Cast the column to Time for time-specific comparisons
  column = cast(column, Time)
  elif casting in FilterHandler._date_operators():
- # Extract date part (e.g., year, month) using SQLAlchemy functions
  column = func.extract(casting, column)

  return column

  @staticmethod
  def _get_dask_column(df, field_name, casting):
- """
- Retrieve and optionally cast a column for Dask based on the field name and casting.
+ needs_dt = casting in (FilterHandler._dt_operators() + FilterHandler._date_operators())
+ column = dd.to_datetime(df[field_name], errors="coerce") if needs_dt else df[field_name]

- Args:
- df: The Dask DataFrame.
- field_name: The name of the field/column in the DataFrame.
- casting: The casting type ('date', 'time', etc.).
-
- Returns:
- The Dask Series object, optionally cast or transformed.
- """
- column = dd.to_datetime(df[field_name], errors="coerce") if casting in FilterHandler._dt_operators() else df[
- field_name]
+ if needs_dt:
+ column = FilterHandler._strip_tz(column)

  if casting == "date":
- column = column.dt.floor("D") # Ensure truncation to the date level
+ column = column.dt.floor("D")
+ elif casting == "time":
+ # compare as "seconds since midnight"
+ column = (column.dt.hour * 3600 + column.dt.minute * 60 + column.dt.second)
  elif casting in FilterHandler._date_operators():
- column = getattr(column.dt, casting)
+ attr = "weekday" if casting == "week_day" else casting
+ column = getattr(column.dt, attr)

  return column

@@ -204,13 +346,13 @@ class FilterHandler:
  "isnull": lambda col, val: col.is_(None) if val else col.isnot(None),
  "not_exact": lambda col, val: col != val,
  "not_contains": lambda col, val: ~col.like(f"%{val}%"),
- "not_in": lambda col, val: ~col.in_(val), # Custom operation
- "regex": lambda col, val: col.op("~")(val), # Custom operation
- "icontains": lambda col, val: col.ilike(f"%{val}%"), # Custom operation
- "istartswith": lambda col, val: col.ilike(f"{val}%"), # Custom operation
- "iendswith": lambda col, val: col.ilike(f"%{val}"), # Custom operation
- "iexact": lambda col, val: col.ilike(val), # Added iexact
- "iregex": lambda col, val: col.op("~*")(val), # Added iregex
+ "not_in": lambda col, val: ~col.in_(val),
+ "regex": lambda col, val: col.op("~")(val),
+ "icontains": lambda col, val: col.ilike(f"%{val}%"),
+ "istartswith": lambda col, val: col.ilike(f"{val}%"),
+ "iendswith": lambda col, val: col.ilike(f"%{val}"),
+ "iexact": lambda col, val: col.ilike(val),
+ "iregex": lambda col, val: col.op("~*")(val),
  }

  @staticmethod
@@ -221,23 +363,54 @@ class FilterHandler:
  "gte": lambda col, val: col >= val,
  "lt": lambda col, val: col < val,
  "lte": lambda col, val: col <= val,
- "in": lambda col, val: col.isin(val),
+
+ # type-safe "in" and "not_in"
+ "in": lambda col, val: FilterHandler._align_in_types(col, val)[0].isin(
+ FilterHandler._align_in_types(col, val)[1]),
+ "not_in": lambda col, val: ~FilterHandler._align_in_types(col, val)[0].isin(
+ FilterHandler._align_in_types(col, val)[1]),
+
  "range": lambda col, val: (col >= val[0]) & (col <= val[1]),
- "contains": lambda col, val: col.str.contains(val, regex=True),
- "startswith": lambda col, val: col.str.startswith(val),
- "endswith": lambda col, val: col.str.endswith(val),
+
+ # robust string ops (dtype-agnostic)
+ "contains": lambda col, val: FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
+ "startswith": lambda col, val: FilterHandler._as_str(col).str.startswith(val, na=False),
+ "endswith": lambda col, val: FilterHandler._as_str(col).str.endswith(val, na=False),
+ "not_contains": lambda col, val: ~FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
+ "regex": lambda col, val: FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
+ "icontains": lambda col, val: FilterHandler._as_str(col).str.contains(val, case=False, regex=True, na=False),
+ "istartswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.startswith(str(val).lower(), na=False),
+ "iendswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.endswith(str(val).lower(), na=False),
+ "iexact": lambda col, val: FilterHandler._as_str(col).str.lower() == str(val).lower(),
+ "iregex": lambda col, val: FilterHandler._as_str(col).str.contains(val, case=False, regex=True, na=False),
+
  "isnull": lambda col, val: col.isnull() if val else col.notnull(),
  "not_exact": lambda col, val: col != val,
- "not_contains": lambda col, val: ~col.str.contains(val, regex=True),
- "not_in": lambda col, val: ~col.isin(val), # Custom operation
- "regex": lambda col, val: col.str.contains(val, regex=True), # Custom operation
- "icontains": lambda col, val: col.str.contains(val, case=False, regex=True), # Custom operation
- "istartswith": lambda col, val: col.str.startswith(val, case=False), # Custom operation
- "iendswith": lambda col, val: col.str.endswith(val, case=False), # Custom operation
- "iexact": lambda col, val: col.str.contains(f"^{val}$", case=False, regex=True), # Added iexact
- "iregex": lambda col, val: col.str.contains(val, case=False, regex=True), # Added iregex
  }

+ @staticmethod
+ def _as_str(col):
+ return col.astype("string").fillna("")
+
+ @staticmethod
+ def _strip_tz(col):
+ import pandas as pd
+ def _part(s: pd.Series) -> pd.Series:
+ try:
+ return s.dt.tz_convert("UTC").dt.tz_localize(None)
+ except Exception:
+ try:
+ return s.dt.tz_localize(None)
+ except Exception:
+ return s
+ return col.map_partitions(_part, meta=col._meta)
+
+ @staticmethod
+ def _time_to_seconds(t):
+ if isinstance(t, str):
+ t = datetime.time.fromisoformat(t)
+ return t.hour * 3600 + t.minute * 60 + t.second
+
  @staticmethod
  def _dt_operators():
  return ["date", "time"]
@@ -255,3 +428,42 @@ class FilterHandler:
  "regex", "icontains", "istartswith", "iendswith",
  "iexact", "iregex"
  ]
+
+ @staticmethod
+ def _align_in_types(col, val):
+ # normalize val to a list
+ if isinstance(val, (set, tuple)):
+ vals = list(val)
+ elif isinstance(val, list):
+ vals = val
+ else:
+ vals = [val]
+
+ kind = getattr(getattr(col, "dtype", None), "kind", None)
+ if kind in ("i", "u"): # integer
+ def to_ints(xs):
+ out = []
+ for x in xs:
+ try:
+ out.append(int(x))
+ except Exception:
+ return None
+ return out
+ ints = to_ints(vals)
+ if ints is not None:
+ return col.astype("Int64"), ints
+
+ if kind in ("f",): # float
+ def to_floats(xs):
+ out = []
+ for x in xs:
+ try:
+ out.append(float(x))
+ except Exception:
+ return None
+ return out
+ flts = to_floats(vals)
+ if flts is not None:
+ return col.astype("float64"), flts
+
+ return FilterHandler._as_str(col), [str(x) for x in vals]
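Editor's note: `_align_in_types` coerces the filter values to the column's kind (or falls back to comparing both sides as strings), so `in`/`not_in` still match when the caller passes, say, string IDs against an integer column. An illustrative sketch, not from the package:

```python
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"id": [1, 2, 3, 4]}), npartitions=1)

fh = FilterHandler("dask")
# "2" and "4" are coerced to ints to match the integer column,
# so this selects rows 2 and 4 instead of matching nothing.
out = fh.apply_filters(ddf, filters={"id__in": ["2", "4"]}).compute()
print(out["id"].tolist())  # [2, 4]
```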
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 2025.8.2
+ Version: 2025.8.4
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
@@ -2,15 +2,14 @@ sibi_dst/__init__.py,sha256=D01Z2Ds4zES8uz5Zp7qOWD0EcfCllWgew7AWt2X1SQg,445
  sibi_dst/df_helper/__init__.py,sha256=CyDXtFhRnMrycktxNO8jGGkP0938QiScl56kMZS1Sf8,578
  sibi_dst/df_helper/_artifact_updater_async.py,sha256=0lUwel-IkmKewRnmMv9GtuT-P6SivkIKtgOHvKchHlc,8462
  sibi_dst/df_helper/_artifact_updater_threaded.py,sha256=M5GNZismOqMmBrcyfolP1DPv87VILQf_P18is_epn50,7238
- sibi_dst/df_helper/_df_helper.py,sha256=wZtsFinZZ7gbPP5MLMyVCRG0bcj_eL-fZ-2ZirGD2WI,26880
+ sibi_dst/df_helper/_df_helper.py,sha256=nG5iITvwyRsdnPgTOql6-w47LEOsZUXYF7-tIM2yGBE,12798
  sibi_dst/df_helper/_parquet_artifact.py,sha256=tqYOjwxHV1MsADmn-RNFuVI_RrEvvmCJHZieRcsVXuc,12334
  sibi_dst/df_helper/_parquet_reader.py,sha256=tFq0OQVczozbKZou93vscokp2R6O2DIJ1zHbZqVjagc,3069
  sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJShqqTBQQGU_S6OOo,105
  sibi_dst/df_helper/backends/http/_http_config.py,sha256=eGPFdqZ5M3Tscqx2P93B6XoBEEzlmdt7yNg7PXUQnNQ,4726
- sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1uqJzvdaPNTYRb5qXTlQ,182
- sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=TvDf0RXta7mwJv11GNQttYJsXgFf2XDj4oLIjt4xTzA,5219
- sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=FusVcLVysitLoc8Ui_zU4JMhdHW1MMn4i0vnMbl2K84,12017
+ sibi_dst/df_helper/backends/parquet/__init__.py,sha256=0A6BGHZLwiLBmuBBaUvEHfeWTcInvy2NbymlrI_nuXE,104
+ sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=yQ5pZuF2Tf7eM_krOPkxhPkDFtEKzV7BKjUerTqX0tg,12028
  sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=LjWm9B7CweTvlvFOgB90XjSe0lVLILAIYMWKPkFXFm8,265
  sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=R3_WY_lsQrfQwD6yAzH66MqvsgZdMd0HKcVChDQcbpM,8401
  sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py,sha256=GQwDy2JwPUx37vpwxPM5hg4ZydilPIP824y5C_clsl0,383
@@ -20,7 +19,7 @@ sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py,sha256=MHk64f5WDOKHQ_L
  sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=RiCaVPME5wzgZ9xUGY0JOs_c2C0KcDIbTeMGpPupIa0,5242
  sibi_dst/df_helper/core/__init__.py,sha256=LfmTqFh6GUZup-g95bcXgAxX7J5Hkve7ftLE_CJg_AE,409
  sibi_dst/df_helper/core/_defaults.py,sha256=9UMEMu2wXznO5UzEhnQ82f_ZazZ20JRyRXIi3HP3gDw,4043
- sibi_dst/df_helper/core/_filter_handler.py,sha256=Pmbzygry2mpkNPVS7DBMulHpAb1yYZNFqUU0bJTWJF0,11214
+ sibi_dst/df_helper/core/_filter_handler.py,sha256=9C30zrT8wSGy1X8ryiTWc0XfnbpeoHndHgoOcHKOPOo,19309
  sibi_dst/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxRrQKE5FQRxcEWsac,6736
  sibi_dst/df_helper/core/_query_config.py,sha256=1ApqmuSGXTC3CdF-xMsSbCa3V2Z5hOP3Wq5huhzZwqY,439
  sibi_dst/df_helper/data_cleaner.py,sha256=lkxQoXLvGzXCicFUimnA5nen5qkrO1oxgl_p2Be2o8w,5183
@@ -79,6 +78,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
- sibi_dst-2025.8.2.dist-info/METADATA,sha256=SivBygwgks3A7cD__dfdnhqpBgKtG5fmnP_DeNf78gE,2610
- sibi_dst-2025.8.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- sibi_dst-2025.8.2.dist-info/RECORD,,
+ sibi_dst-2025.8.4.dist-info/METADATA,sha256=LFL_mbMveA_TrO5zelvtZ1rBiEuMWtvhjrAs42DnOd0,2610
+ sibi_dst-2025.8.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ sibi_dst-2025.8.4.dist-info/RECORD,,
@@ -1,126 +0,0 @@
1
- import dask.dataframe as dd
2
- import pandas as pd
3
-
4
- from sibi_dst.utils import Logger
5
-
6
-
7
- class ParquetFilterHandler(object):
8
- """
9
- Handles parquet filtering operations using dask dataframes.
10
-
11
- This class is designed to apply complex filtering logic on dask dataframes
12
- based on specified filter criteria. It includes support for operations such
13
- as exact matches, ranges, string pattern matches, and null checks. Additionally,
14
- it handles datetime-related field filtering including precise truncations and
15
- specific date/time attributes.
16
-
17
- :ivar logger: Logger object to handle logging within the class. Defaults to the class-level logger.
18
- :type logger: Logger
19
- """
20
- def __init__(self, logger=None, debug=False):
21
- self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
22
- self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
23
-
24
- @staticmethod
25
- def apply_filters_dask(df, filters):
26
- """
27
- Applies a set of filters to a Dask DataFrame, enabling complex filtering operations
28
- such as comparisons, ranges, string match operations, and more. Handles special
29
- cases for datetime operations, including casting and extracting specific datetime
30
- components for filtering.
31
-
32
- :param df: Dask DataFrame to which the filters will be applied.
33
- :type df: dask.dataframe.DataFrame
34
- :param filters: Dictionary defining the filtering logic, where the keys specify
35
- the column name and filter operation, and the values specify the corresponding
36
- filter values to apply.
37
- :type filters: dict
38
- :return: A filtered Dask DataFrame based on the defined logic in the filters.
39
- :rtype: dask.dataframe.DataFrame
40
- :raises ValueError: If an unsupported operation is encountered in the filters.
41
- """
42
- dt_operators = ['date', 'time']
43
- date_operators = ['year', 'month', 'day', 'hour', 'minute', 'second', 'week_day']
44
- comparison_operators = [
45
- 'gte',
46
- 'lte',
47
- 'gt',
48
- 'lt',
49
- 'exact',
50
- 'in',
51
- 'range',
52
- 'contains',
53
- 'icontains',
54
- 'startswith',
55
- 'endswith',
56
- 'isnull'
57
- ]
58
-
59
- operation_map = {
60
- 'exact': lambda col, val: col == val,
61
- 'gt': lambda col, val: col > val,
62
- 'gte': lambda col, val: col >= val,
63
- 'lt': lambda col, val: col < val,
64
- 'lte': lambda col, val: col <= val,
65
- 'in': lambda col, val: col.isin(val),
66
- 'range': lambda col, val: (col >= val[0]) & (col <= val[1]),
67
- 'contains': lambda col, val: col.str.contains(val, regex=True),
68
- 'icontains': lambda col, val: col.str.contains(val, case=False),
69
- 'startswith': lambda col, val: col.str.startswith(val),
70
- 'endswith': lambda col, val: col.str.endswith(val),
71
- 'isnull': lambda col, val: col.isnull() if val else col.notnull(),
72
- }
73
-
74
- def parse_filter_value(casting, value):
75
- """
76
- Convert filter value to appropriate type based on the casting (e.g., date).
77
- """
78
- if casting == 'date':
79
- if isinstance(value, str):
80
- return pd.Timestamp(value) # Convert to datetime64[ns]
81
- if isinstance(value, list):
82
- return [pd.Timestamp(v) for v in value] # Convert list elements
83
- return value
84
-
85
- def get_temp_col(dask_df, field_name, casting):
86
- """
87
- Handle datetime conversion and field retrieval.
88
- """
89
- temp_col = dd.to_datetime(dask_df[field_name], errors='coerce') if casting in dt_operators else dask_df[
90
- field_name]
91
- if casting == 'date':
92
- temp_col = temp_col.dt.floor('D') # Keep it as datetime64[ns] truncated to the day level
93
- elif casting in date_operators:
94
- temp_col = getattr(temp_col.dt, casting)
95
- return temp_col
96
-
97
- for key, value in filters.items():
98
- parts = key.split('__')
99
- field_name = parts[0]
100
- casting = None
101
- operation = 'exact'
102
-
103
- if len(parts) == 3:
104
- # Adjust logic based on the parts
105
- _, casting, operation = parts
106
- elif len(parts) == 2:
107
- # Could be either a casting or an operation
108
- if parts[1] in comparison_operators:
109
- operation = parts[1]
110
- elif parts[1] in dt_operators + date_operators:
111
- casting = parts[1]
112
-
113
- # Convert the filter value to the correct type
114
- parsed_value = parse_filter_value(casting, value)
115
-
116
- # Get the column to filter
117
- temp_col = get_temp_col(df, field_name, casting)
118
-
119
- if operation in operation_map:
120
- # Apply the filter operation
121
- condition = operation_map[operation](temp_col, parsed_value)
122
- df = df[condition]
123
- else:
124
- raise ValueError(f"Unsupported operation: {operation}")
125
-
126
- return df