sibi-dst 2025.8.3-py3-none-any.whl → 2025.8.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@ import pandas as pd
  from fsspec import AbstractFileSystem
  from pydantic import BaseModel
 
- from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
+ from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
  from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
  from .backends.http import HttpConfig
  from .backends.parquet import ParquetConfig
@@ -54,16 +54,13 @@ class SqlAlchemyBackend(BaseBackend):
  class ParquetBackend(BaseBackend):
      def load(self, **options):
          try:
-             df = self.helper.backend_parquet.load_files()
+             df = self.helper.backend_parquet.load_files(**options)
              if self._is_empty(df):
                  return -1, self._empty_like(df)
-
-             if options and df is not None:
-                 df = FilterHandler("dask", logger=self.logger, debug=False).apply_filters(df, filters=options)
-                 nrows = self._row_count(df)
-                 if nrows == 0:
-                     self.logger.debug("No records after filters; returning empty DataFrame.")
-                     return 0, self._empty_like(df)
+             nrows = self._row_count(df)
+             if nrows == 0:
+                 self.logger.debug("No records after filters; returning empty DataFrame.")
+                 return 0, self._empty_like(df)
 
              df = df.persist()
              self.total_records = self._row_count(df) or -1
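ParquetBackend.load() now forwards the raw filter kwargs straight into load_files(**options) and keeps only the empty-check and row-count bookkeeping; the post-load FilterHandler pass is gone. A minimal caller-side sketch of the (total_records, df) convention visible above; the backend variable and the handler functions are illustrative assumptions, not published API:

# Hypothetical sketch of how a caller might interpret ParquetBackend.load().
total, df = backend.load(status="active")   # `backend` assumed to be a ParquetBackend instance

if total == -1:
    handle_empty_source()   # source missing/empty or load failed; df is an empty Dask DataFrame
elif total == 0:
    handle_no_matches()     # data existed, but the filters matched no rows
else:
    process(df)             # df has been persisted by the backend; total is its row count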
@@ -185,18 +182,20 @@ class DfHelper(ManagedResource):
          return model(**model_kwargs)
 
      # ---------- load/aload ----------
-     def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+     def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
          self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
          self.total_records, df = self.backend_strategy.load(**options)
          df = self._process_loaded_data(df)
          df = self._post_process_df(df)
-         self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
+         #self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
+         df = df.persist() if persist else df
          return df.compute() if as_pandas else df
 
-     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+     async def aload(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
          self.total_records, df = await self.backend_strategy.aload(**options)
          df = self._process_loaded_data(df)
          df = self._post_process_df(df)
+         df = df.persist() if persist else df
          return df.compute() if as_pandas else df
 
      # ---------- dataframe post-processing ----------
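load() and aload() now take keyword-only persist and as_pandas flags and persist the Dask graph before returning when asked. A hedged usage sketch; the import path and constructor arguments are assumptions for illustration, not the exact DfHelper signature:

from sibi_dst.df_helper import DfHelper   # import path assumed from the package layout

helper = DfHelper(backend="parquet")      # backend-specific kwargs omitted

# Lazy Dask result, with partitions cached in memory after loading:
ddf = helper.load(persist=True, status="active")

# Materialize directly to pandas instead:
pdf = helper.load(as_pandas=True, created_at__date="2025-08-01")

# The async variant mirrors the same keyword-only signature:
# pdf = await helper.aload(persist=True, as_pandas=True, **filters)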
@@ -298,295 +297,4 @@ class DfHelper(ManagedResource):
298
297
  self.logger.debug(f"Period load generated filters: {kwargs}")
299
298
  return kwargs
300
299
 
301
- # from __future__ import annotations
302
- #
303
- # from typing import Any, Dict, Optional, Union, TypeVar
304
- #
305
- # import dask.dataframe as dd
306
- # import pandas as pd
307
- # from fsspec import AbstractFileSystem
308
- # from pydantic import BaseModel
309
- #
310
- # from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
311
- # from sibi_dst.utils import ManagedResource
312
- # from sibi_dst.utils import ParquetSaver, ClickHouseWriter
313
- # from .backends.http import HttpConfig
314
- # from .backends.parquet import ParquetConfig
315
- # from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
316
- #
317
- # T = TypeVar("T", bound=BaseModel)
318
- #
319
- #
320
- # # --- Backend Strategy Pattern Implementation ---
321
- #
322
- # class BaseBackend:
323
- # """Abstract base class defining clear sync and async loading interfaces."""
324
- #
325
- # def __init__(self, helper: DfHelper):
326
- # self.helper = helper
327
- # self.logger = helper.logger
328
- # self.debug = helper.debug
329
- # self.total_records = helper.total_records # no records loaded yet
330
- #
331
- # def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
332
- # """Synchronous data loading method. Must be implemented by sync backends."""
333
- # raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
334
- #
335
- # async def aload(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
336
- # """Asynchronous data loading method. By default, it calls the sync version."""
337
- # return self.load(**options)
338
- #
339
- #
340
- # class SqlAlchemyBackend(BaseBackend):
341
- # def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
342
- # try:
343
- # # Process incoming filter options into the ParamsConfig object
344
- # if options and hasattr(self.helper._backend_params, 'parse_params'):
345
- # self.helper._backend_params.parse_params(options)
346
- #
347
- # with SqlAlchemyLoadFromDb(
348
- # plugin_sqlalchemy=self.helper.backend_db_connection,
349
- # plugin_query=self.helper._backend_query,
350
- # plugin_params=self.helper._backend_params,
351
- # logger=self.logger,
352
- # debug=self.debug
353
- # ) as db_loader:
354
- # self.total_records, result = db_loader.build_and_load()
355
- # return self.total_records, result
356
- # except Exception as e:
357
- # self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
358
- # return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
359
- #
360
- #
361
- # class ParquetBackend(BaseBackend):
362
- # """This backend is also purely synchronous."""
363
- #
364
- # def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
365
- # try:
366
- # df = self.helper.backend_parquet.load_files()
367
- # if len(df.head(1)) == 0:
368
- # return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
369
- # if options and df is not None:
370
- # df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
371
- # if len(df.head(1)) == 0:
372
- # self.logger.debug("No records found after applying filters; returning empty DataFrame.")
373
- # return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
374
- # df = df.persist()
375
- #
376
- # self.total_records = len(df) or -1 # If df is empty, set total_records to -1
377
- # return self.total_records, df
378
- # except Exception as e:
379
- # self.total_records = -1 # Reset total_records on failure
380
- # self.logger.error(f"Failed to load data from parquet: {e}")
381
- # return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
382
- #
383
- #
384
- # class HttpBackend(BaseBackend):
385
- # """This backend is purely asynchronous."""
386
- #
387
- # def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
388
- # # This will correctly fail by raising NotImplementedError from the base class.
389
- # return self.helper.backend_http.fetch_data(**options)
390
- #
391
- # async def aload(self, **options) -> tuple[Any, Any] | Union[pd.DataFrame, dd.DataFrame]:
392
- # if not self.helper.backend_http:
393
- # self.logger.warning("HTTP plugin not configured properly.")
394
- # self.total_records = -1
395
- # return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
396
- # result = await self.helper.backend_http.fetch_data(**options)
397
- # self.total_records = len(result)
398
- # return self.total_records, result
399
- #
400
- #
401
- # # --- Main DfHelper Facade Class ---
402
- #
403
- # class DfHelper(ManagedResource):
404
- # """
405
- # A reusable utility for loading data. It provides both sync (`load`) and
406
- # async (`aload`) methods to accommodate different backends.
407
- # """
408
- # _BACKEND_STRATEGIES = {
409
- # 'sqlalchemy': SqlAlchemyBackend,
410
- # 'parquet': ParquetBackend,
411
- # 'http': HttpBackend,
412
- # }
413
- #
414
- # _BACKEND_ATTR_MAP = {
415
- # 'sqlalchemy': 'backend_db_connection',
416
- # 'parquet': 'backend_parquet',
417
- # 'http': 'backend_http',
418
- # }
419
- #
420
- # default_config: Dict = None
421
- #
422
- # def __init__(self, backend='sqlalchemy', **kwargs):
423
- # self.default_config = self.default_config or {}
424
- # kwargs = {**self.default_config.copy(), **kwargs}
425
- # super().__init__(**kwargs)
426
- # self.backend = backend
427
- #
428
- # # Need to set default values for backend-specific configurations
429
- # kwargs.setdefault("debug", self.debug)
430
- # kwargs.setdefault("fs", self.fs)
431
- # kwargs.setdefault("logger", self.logger)
432
- # self.total_records = -1 # Initialize total_records to -1 to indicate no records loaded yet
433
- # self._backend_query = self._get_config(QueryConfig, kwargs)
434
- # self._backend_params = self._get_config(ParamsConfig, kwargs)
435
- # self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
436
- # self.backend_parquet: Optional[ParquetConfig] = None
437
- # self.backend_http: Optional[HttpConfig] = None
438
- #
439
- # if self.backend == 'sqlalchemy':
440
- # self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
441
- # elif self.backend == 'parquet':
442
- # self.backend_parquet = self._get_config(ParquetConfig, kwargs)
443
- # elif self.backend == 'http':
444
- # self.backend_http = self._get_config(HttpConfig, kwargs)
445
- #
446
- # strategy_class = self._BACKEND_STRATEGIES.get(self.backend)
447
- # if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
448
- # self.backend_strategy = strategy_class(self)
449
- #
450
- # def _cleanup(self):
451
- # attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
452
- # if not attr_name:
453
- # self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
454
- # return
455
- # # Get the actual config object (e.g., self.backend_db_connection)
456
- # active_config = getattr(self, attr_name, None)
457
- #
458
- # if active_config and hasattr(active_config, "close"):
459
- # self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
460
- # active_config.close()
461
- #
462
- # async def _acleanup(self):
463
- # self.logger.warning("DfHelper instance was not used in an async context manager; cleanup is being called manually.")
464
- # attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
465
- # if not attr_name:
466
- # self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
467
- # return
468
- # # Get the actual config object (e.g., self.backend_db_connection)
469
- # active_config = getattr(self, attr_name, None)
470
- # if active_config and hasattr(active_config, "aclose"):
471
- # self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
472
- # await active_config.aclose()
473
- #
474
- # def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
475
- # recognized_keys = set(model.model_fields.keys())
476
- # model_kwargs = {k: kwargs[k] for k in recognized_keys if k in kwargs}
477
- # return model(**model_kwargs)
478
- #
479
- # def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
480
- # """Loads data synchronously. Fails if backend is async-only."""
481
- # self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
482
- # self.total_records, df = self.backend_strategy.load(**options)
483
- # df = self._process_loaded_data(df)
484
- # df = self._post_process_df(df)
485
- # self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
486
- # return df.compute() if as_pandas else df
487
- #
488
- # async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
489
- # """Loads data asynchronously from any backend."""
490
- # self.total_records, df = await self.backend_strategy.aload(**options)
491
- # df = self._process_loaded_data(df)
492
- # df = self._post_process_df(df)
493
- # return df.compute() if as_pandas else df
494
- #
495
- # def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
496
- # self.logger.debug("Post-processing DataFrame.")
497
- # df_params = self._backend_params.df_params
498
- # if not df_params: return df
499
- # fieldnames, column_names, index_col = (df_params.get("fieldnames"), df_params.get("column_names"),
500
- # df_params.get("index_col"))
501
- # if not any([fieldnames, column_names, index_col]): return df
502
- #
503
- # if fieldnames:
504
- # valid_fieldnames = [f for f in fieldnames if f in df.columns]
505
- # if len(valid_fieldnames) < len(fieldnames): self.logger.warning(
506
- # f"Missing columns for filtering: {set(fieldnames) - set(valid_fieldnames)}")
507
- # df = df[valid_fieldnames]
508
- # if column_names:
509
- # if len(df.columns) != len(column_names): raise ValueError(
510
- # f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided.")
511
- # df = df.rename(columns=dict(zip(df.columns, column_names)))
512
- # if index_col:
513
- # if index_col not in df.columns: raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
514
- # df = df.set_index(index_col)
515
- # self.logger.debug("Post-processing complete.")
516
- # return df
517
- #
518
- # def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
519
- # field_map = self._backend_params.field_map or {}
520
- # if not isinstance(field_map, dict) or not field_map: return df
521
- # if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)): return df
522
- # self.logger.debug("Processing loaded data...applying rename mapping if necessary.")
523
- # rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
524
- # if rename_mapping: df = df.rename(columns=rename_mapping)
525
- # self.logger.debug("Rename mapping complete...")
526
- # return df
527
- #
528
- # def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
529
- # """Saves a Dask DataFrame to a Parquet file with validation."""
530
- #
531
- # # Use .get() for cleaner access to optional arguments.
532
- # fs: AbstractFileSystem = kwargs.get('fs', self.fs)
533
- # path: str = kwargs.get('parquet_storage_path')
534
- #
535
- # # Guard clauses to fail fast with clear errors.
536
- # if not fs:
537
- # raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
538
- # if not path:
539
- # raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
540
- #
541
- # # An efficient, idiomatic way to check if a Dask DataFrame is empty.
542
- # if len(df.head(1)) == 0:
543
- # self.logger.warning("Skipping save: The provided DataFrame is empty.")
544
- # return
545
- #
546
- # with ParquetSaver(
547
- # df_result=df,
548
- # parquet_storage_path=path,
549
- # fs=fs,
550
- # debug=self.debug,
551
- # logger=self.logger,
552
- # verbose=self.verbose,
553
- # **kwargs
554
- # ) as saver:
555
- # saver.save_to_parquet(parquet_filename)
556
- #
557
- # self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")
558
- #
559
- # def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
560
- # if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
561
- # self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
562
- # return
563
- #
564
- # with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
565
- # writer.save_to_clickhouse(df)
566
- # self.logger.debug("Save to ClickHouse completed.")
567
- #
568
- # def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
569
- # """Synchronous convenience method for loading a date range."""
570
- # final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
571
- # return self.load(**final_kwargs)
572
- #
573
- # async def aload_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
574
- # """Asynchronous convenience method for loading a date range."""
575
- # final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
576
- # return await self.aload(**final_kwargs)
577
- #
578
- # def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
579
- # start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
580
- # if start_date > end_date:
581
- # raise ValueError("'start' date cannot be later than 'end' date.")
582
- # field_map = self._backend_params.field_map or {}
583
- # reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
584
- # if len(reverse_map) != len(field_map):
585
- # self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
586
- # mapped_field = reverse_map.get(dt_field, dt_field)
587
- # if start_date == end_date:
588
- # kwargs[f"{mapped_field}__date"] = start_date
589
- # else:
590
- # kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
591
- # self.logger.debug(f"Period load generated filters: {kwargs}")
592
- # return kwargs
300
+
@@ -1,9 +1,7 @@
  from __future__ import annotations
 
- from ._filter_handler import ParquetFilterHandler
  from ._parquet_options import *
 
  __all__ = [
      "ParquetConfig",
-     "ParquetFilterHandler",
  ]
@@ -6,6 +6,8 @@ import dask.dataframe as dd
  import fsspec
  import pandas as pd
  from pydantic import BaseModel, model_validator, ConfigDict
+
+ from sibi_dst.df_helper.core import FilterHandler
  from sibi_dst.utils import FilePathGenerator
  from sibi_dst.utils import Logger
 
@@ -175,40 +177,79 @@ class ParquetConfig(BaseModel):
              total_size += self.fs.size(path)
          return total_size
 
-     def load_files(self):
+     def load_files(self, **filters):
          """
-         Loads parquet files into a Dask DataFrame based on the specified conditions. This
-         method checks if parquet file loading is enabled and loads either from a list of
-         parquet folder paths or a single specified parquet path.
-
-         :return: A Dask DataFrame containing loaded parquet file data.
-         :rtype: dask.dataframe.DataFrame
+         Loads parquet files into a Dask DataFrame based on the specified conditions.
+         Supports Parquet predicate pushdown (pyarrow) + residual Dask mask.
          """
          if not self.load_parquet:
              self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
              return dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+         # Resolve paths
          paths_to_load = []
          if self.parquet_folder_list:
-             # Filter out any None values from the list
-             paths_to_load = [p for p in self.parquet_folder_list if p is not None]
+             paths_to_load = [p for p in self.parquet_folder_list if p]
          elif self.parquet_full_path:
-             # Treat the single path as a list with one item
              paths_to_load = [self.parquet_full_path]
 
          if not paths_to_load:
              self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+         # Prepare filters
+         fh = None
+         expr = None
+         pq_filters = None
+         residual_filters = None
+         if filters:
+             fh = FilterHandler(backend="dask", debug=self.debug, logger=self.logger)
+
+             # Use the compiler + pushdown split so we don't double-apply
+             try:
+                 # If you added split_pushdown_and_residual earlier:
+                 pq_filters, residual_filters = fh.split_pushdown_and_residual(filters)
+                 expr = fh.compile_filters(residual_filters) if residual_filters else None
+             except AttributeError:
+                 # Fallback if you didn't add split_*: push everything down and also mask (redundant but correct)
+                 expr = fh.compile_filters(filters)
+                 pq_filters = expr.to_parquet_filters()
+
          try:
              self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
-             dd_result=dd.read_parquet(
-                 paths_to_load,
-                 engine="pyarrow",
-                 filesystem=self.fs,
-                 exclude=["_*", ".*"]
-             )
+
+             # Optional: prune columns. Keep it simple unless you want to compute from filters.
+             columns = None  # or a concrete list if you know it
+
+             if fh and pq_filters:
+                 self.logger.debug(f"Applying Parquet filters: {pq_filters}")
+                 dd_result = dd.read_parquet(
+                     paths_to_load,
+                     engine="pyarrow",
+                     filesystem=self.fs,  # your fsspec filesystem (e.g., s3fs)
+                     filters=pq_filters,
+                     columns=columns,
+                     gather_statistics=False,  # skip gathering global statistics; helpful with *many* files
+                 )
+                 # Apply only residual mask (if any)
+                 if expr is not None:
+                     dd_result = dd_result[expr.mask(dd_result)]
+             else:
+                 dd_result = dd.read_parquet(
+                     paths_to_load,
+                     engine="pyarrow",
+                     filesystem=self.fs,
+                     columns=columns,
+                     gather_statistics=False,
+                 )
+                 # If we didn't push down, but have filters, apply them here
+                 if expr is None and fh and filters:
+                     expr = fh.compile_filters(filters)
+                 if expr is not None:
+                     dd_result = dd_result[expr.mask(dd_result)]
+
              return dd_result
+
          except FileNotFoundError as e:
              self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
              self.logger.debug("Returning empty DataFrame due to missing parquet files.")
@@ -1,4 +1,6 @@
  import datetime
+ from dataclasses import dataclass
+ from typing import Optional, Dict, Any, List, Union, Tuple, Callable
 
  import dask.dataframe as dd
  import pandas as pd
@@ -8,69 +10,231 @@ from sqlalchemy.sql.sqltypes import Date, Time
8
10
  from sibi_dst.utils import Logger
9
11
 
10
12
 
13
+ # -------------------- Deferred filter expression AST --------------------
14
+ class Expr:
15
+ def mask(self, df: dd.DataFrame) -> dd.Series:
16
+ raise NotImplementedError
17
+
18
+ def to_parquet_filters(self) -> List[Union[Tuple[str, str, Any], List[Tuple[str, str, Any]]]]:
19
+ # By default, nothing to push down
20
+ return []
21
+
22
+ def __and__(self, other: "Expr") -> "Expr": return And(self, other)
23
+ def __or__(self, other: "Expr") -> "Expr": return Or(self, other)
24
+ def __invert__(self) -> "Expr": return Not(self)
25
+
26
+
27
+ @dataclass(frozen=True)
28
+ class TrueExpr(Expr):
29
+ """Matches all rows; useful as a neutral starting point."""
30
+ def mask(self, df: dd.DataFrame) -> dd.Series:
31
+ return df.map_partitions(lambda p: pd.Series(True, index=p.index),
32
+ meta=pd.Series(dtype=bool))
33
+
34
+
35
+ @dataclass(frozen=True)
36
+ class ColOp(Expr):
37
+ field: str
38
+ casting: Optional[str]
39
+ op: str
40
+ value: Any
41
+ handler: "FilterHandler" # reuse your parsing + Dask ops
42
+
43
+ def mask(self, df: dd.DataFrame) -> dd.Series:
44
+ col = self.handler._get_dask_column(df, self.field, self.casting)
45
+ val = self.handler._parse_filter_value(self.casting, self.value)
46
+ return self.handler._apply_operation_dask(col, self.op, val)
47
+
48
+ def to_parquet_filters(self):
49
+ # Only basic comparisons can be pushed down
50
+ if self.op not in {"exact", "gt", "gte", "lt", "lte", "in", "range"}:
51
+ return []
52
+ val = self.handler._parse_filter_value(self.casting, self.value)
53
+ if self.casting == "date":
54
+ if self.op == "range" and isinstance(val, (list, tuple)) and len(val) == 2:
55
+ lo, hi = pd.Timestamp(val[0]), pd.Timestamp(val[1])
56
+ return [(self.field, ">=", lo), (self.field, "<=", hi)]
57
+ if isinstance(val, list):
58
+ val = [pd.Timestamp(v) for v in val]
59
+ else:
60
+ val = pd.Timestamp(val)
61
+ if self.op == "exact": return [(self.field, "=", val)]
62
+ if self.op in {"gt","gte","lt","lte"}:
63
+ sym = {"gt": ">", "gte": ">=", "lt": "<", "lte": "<="}[self.op]
64
+ return [(self.field, sym, val)]
65
+ if self.op == "in": return [(self.field, "in", list(val) if not isinstance(val, list) else val)]
66
+ if self.op == "range":
67
+ lo, hi = val
68
+ return [(self.field, ">=", lo), (self.field, "<=", hi)]
69
+ return []
70
+
71
+
72
+ @dataclass(frozen=True)
73
+ class And(Expr):
74
+ left: Expr; right: Expr
75
+ def mask(self, df: dd.DataFrame) -> dd.Series: return self.left.mask(df) & self.right.mask(df)
76
+ def to_parquet_filters(self):
77
+ # AND = concatenate both sides' AND-terms
78
+ return [*self.left.to_parquet_filters(), *self.right.to_parquet_filters()]
79
+
80
+
81
+ @dataclass(frozen=True)
82
+ class Or(Expr):
83
+ left: Expr; right: Expr
84
+ def mask(self, df: dd.DataFrame) -> dd.Series: return self.left.mask(df) | self.right.mask(df)
85
+ def to_parquet_filters(self):
86
+ # OR must be returned as list-of-lists; if either side has non-pushdown, defer to mask
87
+ lf, rf = self.left.to_parquet_filters(), self.right.to_parquet_filters()
88
+ if not lf or not rf:
89
+ return []
90
+ return [lf, rf]
91
+
92
+
93
+ @dataclass(frozen=True)
94
+ class Not(Expr):
95
+ inner: Expr
96
+ def mask(self, df: dd.DataFrame) -> dd.Series: return ~self.inner.mask(df)
97
+ def to_parquet_filters(self): return []
98
+
99
+
100
+ # -------------------- Filter handler --------------------
11
101
  class FilterHandler:
12
102
  """
13
- Handles the application of filters to data sources with support for SQLAlchemy and Dask backends.
14
-
15
- The FilterHandler class abstracts the process of applying filters to various backends, specifically
16
- SQLAlchemy queries and Dask DataFrames. It supports multiple filtering operations, including
17
- exact matches, comparisons, and string-related operations such as contains and regex. The handler
18
- automatically determines and applies backend-specific processing, enabling seamless integration with
19
- different data models or backends.
20
-
21
- :ivar backend: The backend in use ('sqlalchemy' or 'dask').
22
- :type backend: str
23
- :ivar logger: An optional logger instance for debugging and logging purposes.
24
- :type logger: Logger
25
- :ivar backend_methods: A dictionary mapping backend-specific methods for column retrieval and operation application.
26
- :type backend_methods: dict
103
+ Handles the application of filters to SQLAlchemy and Dask backends.
104
+ Also compiles dicts into deferred expressions (Expr) and can split
105
+ pushdown-friendly predicates from residual ones.
27
106
  """
28
107
  def __init__(self, backend, logger=None, debug=False):
29
- """
30
- Initialize the FilterHandler.
31
-
32
- Args:
33
- backend: The backend to use ('sqlalchemy' or 'dask').
34
- logger: Optional logger for debugging purposes.
35
- """
36
108
  self.backend = backend
37
- self.logger = logger or Logger.default_logger(
38
- logger_name=self.__class__.__name__) # No-op logger if none provided
109
+ self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
39
110
  self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
40
111
  self.backend_methods = self._get_backend_methods(backend)
41
112
 
42
- def apply_filters(self, query_or_df, model=None, filters=None):
43
- """
44
- Apply filters to the data source based on the backend.
113
+ # --------- NEW: pushdown helpers ---------
114
+ def _pushdown_ops(self) -> set[str]:
115
+ """Ops that can be translated to PyArrow parquet filters."""
116
+ return {"exact", "gt", "gte", "lt", "lte", "in", "range"}
45
117
 
46
- Args:
47
- query_or_df: SQLAlchemy query or Dask DataFrame.
48
- model: SQLAlchemy model (required for SQLAlchemy backend).
49
- filters: Dictionary of filters.
118
+ def to_parquet_filters(self, filters: Optional[Dict[str, Any]] = None
119
+ ) -> List[Tuple[str, str, Any]]:
120
+ """
121
+ Convert a subset of filters into PyArrow parquet filters (AND semantics).
122
+ Unsupported ops are skipped here and should be applied later as a Dask mask.
123
+ """
124
+ filters = filters or {}
125
+ out: List[Tuple[str, str, Any]] = []
50
126
 
51
- Returns:
52
- Filtered query or DataFrame.
127
+ for key, value in filters.items():
128
+ field, casting, op = self._parse_filter_key(key)
129
+ if op not in self._pushdown_ops():
130
+ continue
131
+
132
+ val = self._parse_filter_value(casting, value)
133
+
134
+ # Normalize dates to Timestamp for Arrow
135
+ if casting == "date":
136
+ if op == "range" and isinstance(val, (list, tuple)) and len(val) == 2:
137
+ lo, hi = pd.Timestamp(val[0]), pd.Timestamp(val[1])
138
+ out.extend([(field, ">=", lo), (field, "<=", hi)])
139
+ continue
140
+ if isinstance(val, list):
141
+ val = [pd.Timestamp(v) for v in val]
142
+ else:
143
+ val = pd.Timestamp(val)
144
+
145
+ if op == "exact":
146
+ out.append((field, "=", val))
147
+ elif op in {"gt", "gte", "lt", "lte"}:
148
+ sym = {"gt": ">", "gte": ">=", "lt": "<", "lte": "<="}[op]
149
+ out.append((field, sym, val))
150
+ elif op == "in":
151
+ out.append((field, "in", list(val) if not isinstance(val, list) else val))
152
+ elif op == "range":
153
+ lo, hi = val
154
+ out.extend([(field, ">=", lo), (field, "<=", hi)])
155
+
156
+ return out
157
+
158
+ def split_pushdown_and_residual(self, filters: Dict[str, Any]
159
+ ) -> Tuple[List[Tuple[str, str, Any]], Dict[str, Any]]:
160
+ """
161
+ Split input filter dict into:
162
+ - parquet_filters: list of (col, op, val) tuples for dd.read_parquet(..., filters=...)
163
+ - residual_filters: dict to be applied later via a Dask boolean mask
164
+ """
165
+ push_keys = set()
166
+ for key in filters.keys():
167
+ _, casting, op = self._parse_filter_key(key)
168
+ if op in self._pushdown_ops():
169
+ push_keys.add(key)
170
+
171
+ pushdown_subset = {k: filters[k] for k in push_keys}
172
+ parquet_filters = self.to_parquet_filters(pushdown_subset)
173
+ residual_filters = {k: v for k, v in filters.items() if k not in push_keys}
174
+ return parquet_filters, residual_filters
175
+
176
+ # --------- Expression compiler / mask builder ---------
177
+ def compile_filters(self, filters: Optional[Dict[str, Any]] = None) -> Expr:
178
+ """
179
+ Compile a dict into a deferred expression tree (no df required).
180
+ Supports boolean forms: {"$and": [...]}, {"$or": [...]}, {"$not": {...}}.
181
+ Default combination for plain dicts: AND of all terms.
53
182
  """
183
+ filters = filters or {}
184
+ if not filters:
185
+ return TrueExpr()
186
+
187
+ # boolean forms
188
+ if "$and" in filters:
189
+ expr = TrueExpr()
190
+ for sub in filters["$and"]:
191
+ expr = expr & self.compile_filters(sub)
192
+ return expr
193
+
194
+ if "$or" in filters:
195
+ subs = [self.compile_filters(sub) for sub in filters["$or"]]
196
+ if not subs: return TrueExpr()
197
+ expr = subs[0]
198
+ for s in subs[1:]:
199
+ expr = expr | s
200
+ return expr
201
+
202
+ if "$not" in filters:
203
+ return ~self.compile_filters(filters["$not"])
204
+
205
+ # plain dict => AND across keys
206
+ expr: Expr = TrueExpr()
207
+ for key, value in filters.items():
208
+ field, casting, op = self._parse_filter_key(key)
209
+ expr = expr & ColOp(field=field, casting=casting, op=op, value=value, handler=self)
210
+ return expr
211
+
212
+ def build_mask_fn(self, filters: Optional[Dict[str, Any]] = None) -> Callable[[dd.DataFrame], dd.Series]:
213
+ """Return a callable (df -> boolean mask) without touching df now."""
214
+ expr = self.compile_filters(filters)
215
+ def _fn(df: dd.DataFrame) -> dd.Series:
216
+ return expr.mask(df)
217
+ return _fn
218
+
219
+ # --------- Existing “apply now” API (kept as-is) ---------
220
+ def apply_filters(self, query_or_df, model=None, filters=None):
54
221
  filters = filters or {}
55
222
  for key, value in filters.items():
56
223
  field_name, casting, operation = self._parse_filter_key(key)
57
224
  parsed_value = self._parse_filter_value(casting, value)
58
- # print(field_name, casting, operation, parsed_value)
59
- # Get the column and apply backend-specific transformations
60
225
  if self.backend == "sqlalchemy":
61
226
  column = self.backend_methods["get_column"](field_name, model, casting)
62
227
  condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
63
228
  query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
64
-
65
229
  elif self.backend == "dask":
66
230
  column = self.backend_methods["get_column"](query_or_df, field_name, casting)
67
231
  condition = self.backend_methods["apply_operation"](column, operation, parsed_value)
68
232
  query_or_df = self.backend_methods["apply_condition"](query_or_df, condition)
69
233
  else:
70
234
  raise ValueError(f"Unsupported backend: {self.backend}")
71
-
72
235
  return query_or_df
73
236
 
237
+ # --------- Parsing & backend plumbing (unchanged) ---------
74
238
  @staticmethod
75
239
  def _parse_filter_key(key):
76
240
  parts = key.split("__")
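The hunk above adds a deferred expression tree (Expr, TrueExpr, ColOp, And, Or, Not) plus compile_filters, split_pushdown_and_residual and build_mask_fn on FilterHandler. A hedged sketch of the boolean forms and the OR pushdown rule; `ddf` is assumed to be an existing Dask DataFrame:

from sibi_dst.df_helper.core import FilterHandler

fh = FilterHandler(backend="dask")

# Plain dicts AND their terms; $and / $or / $not nest arbitrarily.
expr = fh.compile_filters({
    "$or": [
        {"status": "active"},
        {"$and": [{"status": "pending"}, {"priority__gte": 3}]},
    ]
})

# Nothing touches data until the mask is evaluated against a DataFrame:
filtered = ddf[expr.mask(ddf)]

# Or keep the filter logic as a reusable callable:
mask_fn = fh.build_mask_fn({"status": "active"})
also_filtered = ddf[mask_fn(ddf)]

# Both OR branches here are pushdown-friendly, so this yields a list-of-lists;
# if either branch were not pushable, it would return [] and only the mask would apply.
print(expr.to_parquet_filters())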
@@ -120,29 +284,15 @@ class FilterHandler:
120
284
 
121
285
  @staticmethod
122
286
  def _get_sqlalchemy_column(field_name, model, casting):
123
- """
124
- Retrieve and cast a column for SQLAlchemy based on the field name and casting.
125
-
126
- Args:
127
- field_name: The name of the field/column in the model.
128
- model: The SQLAlchemy model.
129
- casting: The casting type ('date', 'time', etc.).
130
-
131
- Returns:
132
- The SQLAlchemy column object, optionally cast or transformed.
133
- """
134
287
  column = getattr(model, field_name, None)
135
288
  if not column:
136
289
  raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
137
290
 
138
291
  if casting == "date":
139
- # Cast the column to Date for whole-date comparisons
140
292
  column = cast(column, Date)
141
293
  elif casting == "time":
142
- # Cast the column to Time for time-specific comparisons
143
294
  column = cast(column, Time)
144
295
  elif casting in FilterHandler._date_operators():
145
- # Extract date part (e.g., year, month) using SQLAlchemy functions
146
296
  column = func.extract(casting, column)
147
297
 
148
298
  return column
@@ -196,13 +346,13 @@ class FilterHandler:
196
346
  "isnull": lambda col, val: col.is_(None) if val else col.isnot(None),
197
347
  "not_exact": lambda col, val: col != val,
198
348
  "not_contains": lambda col, val: ~col.like(f"%{val}%"),
199
- "not_in": lambda col, val: ~col.in_(val), # Custom operation
200
- "regex": lambda col, val: col.op("~")(val), # Custom operation
201
- "icontains": lambda col, val: col.ilike(f"%{val}%"), # Custom operation
202
- "istartswith": lambda col, val: col.ilike(f"{val}%"), # Custom operation
203
- "iendswith": lambda col, val: col.ilike(f"%{val}"), # Custom operation
204
- "iexact": lambda col, val: col.ilike(val), # Added iexact
205
- "iregex": lambda col, val: col.op("~*")(val), # Added iregex
349
+ "not_in": lambda col, val: ~col.in_(val),
350
+ "regex": lambda col, val: col.op("~")(val),
351
+ "icontains": lambda col, val: col.ilike(f"%{val}%"),
352
+ "istartswith": lambda col, val: col.ilike(f"{val}%"),
353
+ "iendswith": lambda col, val: col.ilike(f"%{val}"),
354
+ "iexact": lambda col, val: col.ilike(val),
355
+ "iregex": lambda col, val: col.op("~*")(val),
206
356
  }
207
357
 
208
358
  @staticmethod
@@ -214,7 +364,7 @@ class FilterHandler:
214
364
  "lt": lambda col, val: col < val,
215
365
  "lte": lambda col, val: col <= val,
216
366
 
217
- # <-- type-safe "in" and "not_in"
367
+ # type-safe "in" and "not_in"
218
368
  "in": lambda col, val: FilterHandler._align_in_types(col, val)[0].isin(
219
369
  FilterHandler._align_in_types(col, val)[1]),
220
370
  "not_in": lambda col, val: ~FilterHandler._align_in_types(col, val)[0].isin(
@@ -228,12 +378,9 @@ class FilterHandler:
228
378
  "endswith": lambda col, val: FilterHandler._as_str(col).str.endswith(val, na=False),
229
379
  "not_contains": lambda col, val: ~FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
230
380
  "regex": lambda col, val: FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
231
- "icontains": lambda col, val: FilterHandler._as_str(col).str.contains(val, case=False, regex=True,
232
- na=False),
233
- "istartswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.startswith(str(val).lower(),
234
- na=False),
235
- "iendswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.endswith(str(val).lower(),
236
- na=False),
381
+ "icontains": lambda col, val: FilterHandler._as_str(col).str.contains(val, case=False, regex=True, na=False),
382
+ "istartswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.startswith(str(val).lower(), na=False),
383
+ "iendswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.endswith(str(val).lower(), na=False),
237
384
  "iexact": lambda col, val: FilterHandler._as_str(col).str.lower() == str(val).lower(),
238
385
  "iregex": lambda col, val: FilterHandler._as_str(col).str.contains(val, case=False, regex=True, na=False),
239
386
 
@@ -243,12 +390,10 @@ class FilterHandler:
243
390
 
244
391
  @staticmethod
245
392
  def _as_str(col):
246
- # Force a reliable string view (works with object, categorical, etc.)
247
393
  return col.astype("string").fillna("")
248
394
 
249
395
  @staticmethod
250
396
  def _strip_tz(col):
251
- # Make tz-aware datetimes naive so they compare to tz-naive filter values
252
397
  import pandas as pd
253
398
  def _part(s: pd.Series) -> pd.Series:
254
399
  try:
@@ -258,12 +403,10 @@ class FilterHandler:
258
403
  return s.dt.tz_localize(None)
259
404
  except Exception:
260
405
  return s
261
-
262
406
  return col.map_partitions(_part, meta=col._meta)
263
407
 
264
408
  @staticmethod
265
409
  def _time_to_seconds(t):
266
- # t can be datetime.time or a "HH:MM[:SS]" str
267
410
  if isinstance(t, str):
268
411
  t = datetime.time.fromisoformat(t)
269
412
  return t.hour * 3600 + t.minute * 60 + t.second
@@ -288,10 +431,6 @@ class FilterHandler:
288
431
 
289
432
  @staticmethod
290
433
  def _align_in_types(col, val):
291
- """
292
- Return (coerced_col, coerced_values) with compatible dtypes
293
- so that .isin(...) behaves as expected across partitions.
294
- """
295
434
  # normalize val to a list
296
435
  if isinstance(val, (set, tuple)):
297
436
  vals = list(val)
@@ -300,7 +439,6 @@ class FilterHandler:
300
439
  else:
301
440
  vals = [val]
302
441
 
303
- # try numeric alignment first if column is numeric-like
304
442
  kind = getattr(getattr(col, "dtype", None), "kind", None)
305
443
  if kind in ("i", "u"): # integer
306
444
  def to_ints(xs):
@@ -309,13 +447,10 @@ class FilterHandler:
309
447
  try:
310
448
  out.append(int(x))
311
449
  except Exception:
312
- # if any value can't be int, fall back to strings below
313
450
  return None
314
451
  return out
315
-
316
452
  ints = to_ints(vals)
317
453
  if ints is not None:
318
- # nullable Int64 handles missing values
319
454
  return col.astype("Int64"), ints
320
455
 
321
456
  if kind in ("f",): # float
@@ -327,10 +462,8 @@ class FilterHandler:
327
462
  except Exception:
328
463
  return None
329
464
  return out
330
-
331
465
  flts = to_floats(vals)
332
466
  if flts is not None:
333
467
  return col.astype("float64"), flts
334
468
 
335
- # fallback: compare as strings (robust across object/categorical/mixed)
336
- return FilterHandler._as_str(col), [str(x) for x in vals]
469
+ return FilterHandler._as_str(col), [str(x) for x in vals]
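The eager API is unchanged in spirit: apply_filters still takes Django-style field__op keys and resolves them per backend. A minimal, self-contained sketch of the Dask path, including the type-safe `in` coercion performed by _align_in_types:

import dask.dataframe as dd
import pandas as pd

from sibi_dst.df_helper.core import FilterHandler

pdf = pd.DataFrame({"id": [1, 2, 3], "name": ["Acme", "Beta", "acme inc"]})
ddf = dd.from_pandas(pdf, npartitions=1)

fh = FilterHandler(backend="dask")

# String values against an integer column are coerced by _align_in_types,
# and icontains performs a case-insensitive regex match.
out = fh.apply_filters(ddf, filters={"id__in": ["1", "3"], "name__icontains": "acme"})
print(out.compute())  # rows with id 1 and 3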
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 2025.8.3
+ Version: 2025.8.5
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
@@ -2,15 +2,14 @@ sibi_dst/__init__.py,sha256=D01Z2Ds4zES8uz5Zp7qOWD0EcfCllWgew7AWt2X1SQg,445
2
2
  sibi_dst/df_helper/__init__.py,sha256=CyDXtFhRnMrycktxNO8jGGkP0938QiScl56kMZS1Sf8,578
3
3
  sibi_dst/df_helper/_artifact_updater_async.py,sha256=0lUwel-IkmKewRnmMv9GtuT-P6SivkIKtgOHvKchHlc,8462
4
4
  sibi_dst/df_helper/_artifact_updater_threaded.py,sha256=M5GNZismOqMmBrcyfolP1DPv87VILQf_P18is_epn50,7238
5
- sibi_dst/df_helper/_df_helper.py,sha256=uXG7Ku8ttHuP2kVlMVilek6tkTzpKCJGhw-O0K1JS18,27550
5
+ sibi_dst/df_helper/_df_helper.py,sha256=g1ftfSMO40l60EJWRLE0DDZvbIowrqvG1GMf2zXqYGw,12957
6
6
  sibi_dst/df_helper/_parquet_artifact.py,sha256=tqYOjwxHV1MsADmn-RNFuVI_RrEvvmCJHZieRcsVXuc,12334
7
7
  sibi_dst/df_helper/_parquet_reader.py,sha256=tFq0OQVczozbKZou93vscokp2R6O2DIJ1zHbZqVjagc,3069
8
8
  sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJShqqTBQQGU_S6OOo,105
10
10
  sibi_dst/df_helper/backends/http/_http_config.py,sha256=eGPFdqZ5M3Tscqx2P93B6XoBEEzlmdt7yNg7PXUQnNQ,4726
11
- sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1uqJzvdaPNTYRb5qXTlQ,182
12
- sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=TvDf0RXta7mwJv11GNQttYJsXgFf2XDj4oLIjt4xTzA,5219
13
- sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=FusVcLVysitLoc8Ui_zU4JMhdHW1MMn4i0vnMbl2K84,12017
11
+ sibi_dst/df_helper/backends/parquet/__init__.py,sha256=0A6BGHZLwiLBmuBBaUvEHfeWTcInvy2NbymlrI_nuXE,104
12
+ sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=V6y1Vco3_uY4UBF79_JPd1CFK5DpNsnGYHCc5PDPGZo,13798
14
13
  sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=LjWm9B7CweTvlvFOgB90XjSe0lVLILAIYMWKPkFXFm8,265
15
14
  sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=R3_WY_lsQrfQwD6yAzH66MqvsgZdMd0HKcVChDQcbpM,8401
16
15
  sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py,sha256=GQwDy2JwPUx37vpwxPM5hg4ZydilPIP824y5C_clsl0,383
@@ -20,7 +19,7 @@ sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py,sha256=MHk64f5WDOKHQ_L
20
19
  sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=RiCaVPME5wzgZ9xUGY0JOs_c2C0KcDIbTeMGpPupIa0,5242
21
20
  sibi_dst/df_helper/core/__init__.py,sha256=LfmTqFh6GUZup-g95bcXgAxX7J5Hkve7ftLE_CJg_AE,409
22
21
  sibi_dst/df_helper/core/_defaults.py,sha256=9UMEMu2wXznO5UzEhnQ82f_ZazZ20JRyRXIi3HP3gDw,4043
23
- sibi_dst/df_helper/core/_filter_handler.py,sha256=CYyeSmCyy7qVw_duRfBeGzEKaSQyyM-ZN9U8KsjwxXM,14295
22
+ sibi_dst/df_helper/core/_filter_handler.py,sha256=9C30zrT8wSGy1X8ryiTWc0XfnbpeoHndHgoOcHKOPOo,19309
24
23
  sibi_dst/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxRrQKE5FQRxcEWsac,6736
25
24
  sibi_dst/df_helper/core/_query_config.py,sha256=1ApqmuSGXTC3CdF-xMsSbCa3V2Z5hOP3Wq5huhzZwqY,439
26
25
  sibi_dst/df_helper/data_cleaner.py,sha256=lkxQoXLvGzXCicFUimnA5nen5qkrO1oxgl_p2Be2o8w,5183
@@ -79,6 +78,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
79
78
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
80
79
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
81
80
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
82
- sibi_dst-2025.8.3.dist-info/METADATA,sha256=zdQXSnLpJ6bVQPpI-N4fnwB2ajCzyyRrGFzmEfpzjvk,2610
83
- sibi_dst-2025.8.3.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
84
- sibi_dst-2025.8.3.dist-info/RECORD,,
81
+ sibi_dst-2025.8.5.dist-info/METADATA,sha256=ADWrf_9UI4NiTWslrJ0LgfmHTTdxSSCIc0AaP-mqSQg,2610
82
+ sibi_dst-2025.8.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
83
+ sibi_dst-2025.8.5.dist-info/RECORD,,
@@ -1,126 +0,0 @@
1
- import dask.dataframe as dd
2
- import pandas as pd
3
-
4
- from sibi_dst.utils import Logger
5
-
6
-
7
- class ParquetFilterHandler(object):
8
- """
9
- Handles parquet filtering operations using dask dataframes.
10
-
11
- This class is designed to apply complex filtering logic on dask dataframes
12
- based on specified filter criteria. It includes support for operations such
13
- as exact matches, ranges, string pattern matches, and null checks. Additionally,
14
- it handles datetime-related field filtering including precise truncations and
15
- specific date/time attributes.
16
-
17
- :ivar logger: Logger object to handle logging within the class. Defaults to the class-level logger.
18
- :type logger: Logger
19
- """
20
- def __init__(self, logger=None, debug=False):
21
- self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
22
- self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
23
-
24
- @staticmethod
25
- def apply_filters_dask(df, filters):
26
- """
27
- Applies a set of filters to a Dask DataFrame, enabling complex filtering operations
28
- such as comparisons, ranges, string match operations, and more. Handles special
29
- cases for datetime operations, including casting and extracting specific datetime
30
- components for filtering.
31
-
32
- :param df: Dask DataFrame to which the filters will be applied.
33
- :type df: dask.dataframe.DataFrame
34
- :param filters: Dictionary defining the filtering logic, where the keys specify
35
- the column name and filter operation, and the values specify the corresponding
36
- filter values to apply.
37
- :type filters: dict
38
- :return: A filtered Dask DataFrame based on the defined logic in the filters.
39
- :rtype: dask.dataframe.DataFrame
40
- :raises ValueError: If an unsupported operation is encountered in the filters.
41
- """
42
- dt_operators = ['date', 'time']
43
- date_operators = ['year', 'month', 'day', 'hour', 'minute', 'second', 'week_day']
44
- comparison_operators = [
45
- 'gte',
46
- 'lte',
47
- 'gt',
48
- 'lt',
49
- 'exact',
50
- 'in',
51
- 'range',
52
- 'contains',
53
- 'icontains',
54
- 'startswith',
55
- 'endswith',
56
- 'isnull'
57
- ]
58
-
59
- operation_map = {
60
- 'exact': lambda col, val: col == val,
61
- 'gt': lambda col, val: col > val,
62
- 'gte': lambda col, val: col >= val,
63
- 'lt': lambda col, val: col < val,
64
- 'lte': lambda col, val: col <= val,
65
- 'in': lambda col, val: col.isin(val),
66
- 'range': lambda col, val: (col >= val[0]) & (col <= val[1]),
67
- 'contains': lambda col, val: col.str.contains(val, regex=True),
68
- 'icontains': lambda col, val: col.str.contains(val, case=False),
69
- 'startswith': lambda col, val: col.str.startswith(val),
70
- 'endswith': lambda col, val: col.str.endswith(val),
71
- 'isnull': lambda col, val: col.isnull() if val else col.notnull(),
72
- }
73
-
74
- def parse_filter_value(casting, value):
75
- """
76
- Convert filter value to appropriate type based on the casting (e.g., date).
77
- """
78
- if casting == 'date':
79
- if isinstance(value, str):
80
- return pd.Timestamp(value) # Convert to datetime64[ns]
81
- if isinstance(value, list):
82
- return [pd.Timestamp(v) for v in value] # Convert list elements
83
- return value
84
-
85
- def get_temp_col(dask_df, field_name, casting):
86
- """
87
- Handle datetime conversion and field retrieval.
88
- """
89
- temp_col = dd.to_datetime(dask_df[field_name], errors='coerce') if casting in dt_operators else dask_df[
90
- field_name]
91
- if casting == 'date':
92
- temp_col = temp_col.dt.floor('D') # Keep it as datetime64[ns] truncated to the day level
93
- elif casting in date_operators:
94
- temp_col = getattr(temp_col.dt, casting)
95
- return temp_col
96
-
97
- for key, value in filters.items():
98
- parts = key.split('__')
99
- field_name = parts[0]
100
- casting = None
101
- operation = 'exact'
102
-
103
- if len(parts) == 3:
104
- # Adjust logic based on the parts
105
- _, casting, operation = parts
106
- elif len(parts) == 2:
107
- # Could be either a casting or an operation
108
- if parts[1] in comparison_operators:
109
- operation = parts[1]
110
- elif parts[1] in dt_operators + date_operators:
111
- casting = parts[1]
112
-
113
- # Convert the filter value to the correct type
114
- parsed_value = parse_filter_value(casting, value)
115
-
116
- # Get the column to filter
117
- temp_col = get_temp_col(df, field_name, casting)
118
-
119
- if operation in operation_map:
120
- # Apply the filter operation
121
- condition = operation_map[operation](temp_col, parsed_value)
122
- df = df[condition]
123
- else:
124
- raise ValueError(f"Unsupported operation: {operation}")
125
-
126
- return df