sibi-dst 2025.9.3-py3-none-any.whl → 2025.9.5-py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- sibi_dst/__init__.py +6 -4
- sibi_dst/df_helper/__init__.py +1 -0
- sibi_dst/df_helper/_parquet_artifact.py +533 -113
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -281
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +349 -142
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -0
- sibi_dst/tests/test_baseclass.py +403 -0
- sibi_dst/utils/base.py +0 -254
- sibi_dst/utils/boilerplate/__init__.py +4 -1
- sibi_dst/utils/boilerplate/hybrid_data_loader.py +144 -0
- sibi_dst/utils/data_wrapper.py +460 -61
- sibi_dst/utils/parquet_saver.py +403 -161
- sibi_dst/utils/update_planner.py +553 -319
- sibi_dst/utils/write_gatekeeper.py +18 -0
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.5.dist-info}/METADATA +2 -2
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.5.dist-info}/RECORD +17 -14
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.5.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/parquet/_parquet_options.py

@@ -1,8 +1,7 @@
 import datetime as dt
-import logging
 import posixpath
 from pathlib import Path
-from typing import Optional, List
+from typing import Optional, List
 
 import dask.dataframe as dd
 import fsspec
@@ -286,282 +285,3 @@ class ParquetConfig(BaseModel):
         path = Path(filepath)
         return str(path.with_suffix(f".{extension}")) if path.suffix != f".{extension}" else filepath
 
-# import datetime
-# from pathlib import Path
-# from typing import Optional, List
-#
-# import dask.dataframe as dd
-# import fsspec
-# import pandas as pd
-# from pydantic import BaseModel, model_validator, ConfigDict
-#
-# from sibi_dst.df_helper.core import FilterHandler
-# from sibi_dst.utils import FilePathGenerator
-# from sibi_dst.utils import Logger
-#
-#
-# class ParquetConfig(BaseModel):
-#     """
-#     Represents configuration for managing and validating parquet file operations.
-#
-#     The `ParquetConfig` class provides attributes and methods necessary to handle operations
-#     on parquet files in a file system. It includes functionalities for ensuring file paths
-#     and extensions, validating storage paths and parameters, determining file recency,
-#     and calculating the size of parquet files. This class is designed with flexibility to handle
-#     different file systems through the integration with `fsspec` and allows storage path validations
-#     with optional logging support.
-#
-#     :ivar load_parquet: Indicates whether parquet data should be loaded based on the
-#         current configuration and validation.
-#     :type load_parquet: bool
-#     :ivar parquet_filename: The name of the parquet file, optional if folders are used.
-#     :type parquet_filename: Optional[str]
-#     :ivar parquet_storage_path: The base path for storing or retrieving parquet files.
-#     :type parquet_storage_path: Optional[str]
-#     :ivar parquet_full_path: The full path to a specific parquet file, derived from the
-#         storage path and filename when applicable.
-#     :type parquet_full_path: Optional[str]
-#     :ivar parquet_folder_list: A list of folder paths to parquet data, derived from start
-#         and end dates if specified.
-#     :type parquet_folder_list: Optional[List[str]]
-#     :ivar parquet_size_bytes: The total size of the parquet files, in bytes.
-#     :type parquet_size_bytes: int
-#     :ivar parquet_max_age_minutes: Maximum acceptable age of the most recent parquet file, in minutes.
-#     :type parquet_max_age_minutes: int
-#     :ivar parquet_is_recent: Indicates whether the parquet file is considered recent based
-#         on the `parquet_max_age_minutes` condition.
-#     :type parquet_is_recent: bool
-#     :ivar parquet_start_date: The start date for parquet file validation or file path generation.
-#     :type parquet_start_date: Optional[str]
-#     :ivar parquet_end_date: The end date for parquet file validation or file path generation.
-#     :type parquet_end_date: Optional[str]
-#     :ivar fs: The file system object used for storage operations, compliant with `fsspec`.
-#     :type fs: Optional[fsspec.spec.AbstractFileSystem]
-#     :ivar logger: A logger for handling logging operations.
-#     :type logger: Optional[Logger]
-#     """
-#     load_parquet: bool = False
-#     parquet_filename: Optional[str] = None
-#     parquet_storage_path: Optional[str] = None
-#     parquet_full_path: Optional[str] = None
-#     parquet_folder_list: Optional[List[str]] = None
-#     parquet_size_bytes: int = 0
-#     parquet_max_age_minutes: int = 0
-#     parquet_is_recent: bool = False
-#     parquet_start_date: Optional[str] = None
-#     parquet_end_date: Optional[str] = None
-#     fs: Optional[fsspec.spec.AbstractFileSystem] = None  # Your fsspec filesystem object
-#     logger: Optional[Logger] = None
-#     debug: bool = False
-#     model_config = ConfigDict(arbitrary_types_allowed=True)
-#
-#     @model_validator(mode='after')
-#     def check_parquet_params(self):
-#         """
-#         Validates and configures the parameters required for managing parquet files. This includes
-#         configuring paths through `fsspec`, identifying file storage paths, checking the validity of
-#         dates related to parquet files, ensuring proper parquet file extensions, and determining
-#         whether existing parquet files are recent and loadable.
-#
-#         :return: The current instance with validated and migrated attributes configured for
-#             handling parquet files.
-#
-#         :raises ValueError: If certain conditions are not met, such as missing or invalid
-#             `parquet_storage_path`, providing only one of
-#             `parquet_start_date` or `parquet_end_date`, or if the
-#             `parquet_end_date` is earlier than the `parquet_start_date`.
-#         """
-#         # Configure paths based on fsspec
-#         if self.logger is None:
-#             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-#         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-#         if self.fs is None:
-#             raise ValueError('Parquet Options: File system (fs) must be specified')
-#
-#         if self.parquet_storage_path is None:
-#             raise ValueError('Parquet storage path must be specified')
-#         self.parquet_storage_path = self.parquet_storage_path.rstrip('/')
-#         #if not self.fs.exists(self.parquet_storage_path):
-#         #    self.fs.mkdirs(self.parquet_storage_path, exist_ok=True)
-#         #    self.logger.debug(f'Parquet storage path {self.parquet_storage_path} does not exist')
-#         self.load_parquet = False
-#         if self.parquet_filename is not None:
-#             self.parquet_full_path = self.ensure_file_extension(
-#                 filepath=self.fs.sep.join([str(self.parquet_storage_path), str(self.parquet_filename)]),
-#                 extension='parquet'
-#             )
-#             self.parquet_is_recent = self.is_file_recent()
-#             self.load_parquet = self.parquet_is_recent and self.fs.exists(self.parquet_full_path)
-#
-#         if self.parquet_start_date is not None:
-#             if self.parquet_end_date is None:
-#                 raise ValueError('Parquet end date must be specified if start date is provided')
-#
-#             start_date = datetime.datetime.strptime(self.parquet_start_date, '%Y-%m-%d')
-#             end_date = datetime.datetime.strptime(self.parquet_end_date, '%Y-%m-%d')
-#             if end_date < start_date:
-#                 raise ValueError('Parquet end date must be greater than start date')
-#
-#             # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
-#             self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), fs=self.fs,
-#                                                          logger=self.logger).generate_file_paths(start_date, end_date)
-#
-#             self.parquet_size_bytes = self.get_parquet_size_bytes()
-#             self.load_parquet = True
-#             # self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
-#         elif self.parquet_end_date is not None:
-#             raise ValueError('Parquet start date must be specified if end date is provided')
-#
-#         return self
-#
-#     def is_file_recent(self):
-#         """
-#         Determines whether the file at the specified parquet path is considered recent
-#         based on its modification time and the maximum age limit defined.
-#
-#         The function first checks for the existence of the file at the specified
-#         `parquet_full_path`. If the file does not exist, the function will return
-#         False. If `parquet_max_age_minutes` is set to 0, it implies no maximum age
-#         limit, and the function will return True. Otherwise, it retrieves the file's
-#         last modified time and calculates the age of the file by comparing it with the
-#         current time. The function returns True if the file's age does not exceed the
-#         maximum age specified by `parquet_max_age_minutes`, otherwise it returns
-#         False.
-#
-#         :return: Whether the file is considered recent based on its existence,
-#             modification time, and maximum age limit.
-#         :rtype: bool
-#         """
-#         if not self.fs.exists(self.parquet_full_path):
-#             return False
-#         if self.parquet_max_age_minutes == 0:
-#             return True
-#         file_time = datetime.datetime.fromtimestamp(self.fs.modified(self.parquet_full_path))
-#         return (datetime.datetime.now() - file_time) <= datetime.timedelta(minutes=self.parquet_max_age_minutes)
-#
-#     def get_parquet_size_bytes(self):
-#         """
-#         Calculate the total size, in bytes, of all Parquet files within the defined
-#         folders specified by `parquet_folder_list`. The function iteratively goes
-#         through each folder in the provided list, applying a recursive wildcard
-#         search to include all levels of nested directories, and calculates the
-#         cumulative size of all found Parquet files using the file system's size
-#         retrieval method.
-#
-#         :raises AttributeError: If `fs` or `parquet_folder_list` attributes are not set
-#             or improperly configured when the method is called.
-#         :raises NotImplementedError: If the `fs.size` or `fs.glob` methods are
-#             unimplemented in the provided file system object or it otherwise lacks
-#             necessary support for these operations.
-#
-#         :return: The cumulative size of all Parquet files located in the folders
-#             defined by `parquet_folder_list`, measured in bytes.
-#         :rtype: int
-#         """
-#         total_size = 0
-#         for folder in self.parquet_folder_list:
-#             # Use a double wildcard ** to match any level of nested directories
-#             for path in self.fs.glob(f"{folder}/**/*.parquet"):
-#                 total_size += self.fs.size(path)
-#         return total_size
-#
-#     def load_files(self, **filters):
-#         """
-#         Loads parquet files into a Dask DataFrame based on the specified conditions.
-#         Supports Parquet predicate pushdown (pyarrow) + residual Dask mask.
-#         """
-#         if not self.load_parquet:
-#             self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
-#             return dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#         # Resolve paths
-#         paths_to_load = []
-#         if self.parquet_folder_list:
-#             import posixpath
-#             paths_to_load = sorted(set([posixpath.dirname(p) for p in self.parquet_folder_list]))
-#             paths_to_load = [p.rstrip("/") + "/*.parquet" for p in paths_to_load]
-#         elif self.parquet_full_path:
-#             paths_to_load = [self.parquet_full_path]
-#
-#         if not paths_to_load:
-#             self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
-#             return dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#         # Prepare filters
-#         fh = None
-#         expr = None
-#         pq_filters = None
-#         residual_filters = None
-#         if filters:
-#             fh = FilterHandler(backend="dask", debug=self.debug, logger=self.logger)
-#
-#             # Use the compiler + pushdown split so we don't double-apply
-#             try:
-#                 # If you added split_pushdown_and_residual earlier:
-#                 pq_filters, residual_filters = fh.split_pushdown_and_residual(filters)
-#                 expr = fh.compile_filters(residual_filters) if residual_filters else None
-#             except AttributeError:
-#                 # Fallback if you didn't add split_*: push everything down and also mask (redundant but correct)
-#                 expr = fh.compile_filters(filters)
-#                 pq_filters = expr.to_parquet_filters()
-#
-#         try:
-#             self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
-#
-#             # Optional: prune columns. Keep it simple unless you want to compute from filters.
-#             columns = None  # or a concrete list if you know it
-#
-#             if fh and pq_filters:
-#                 self.logger.debug(f"Applying Parquet filters: {pq_filters}")
-#                 dd_result = dd.read_parquet(
-#                     paths_to_load,
-#                     engine="pyarrow",
-#                     filesystem=self.fs,  # your fsspec filesystem (e.g., s3fs)
-#                     filters=pq_filters,
-#                     columns=columns,
-#                     gather_statistics=False,  # uncomment if you have *many* files and don't need global stats
-#                     ignore_metadata_file=True
-#                 )
-#                 # Apply only residual mask (if any)
-#                 if expr is not None:
-#                     dd_result = dd_result[expr.mask(dd_result)]
-#             else:
-#                 dd_result = dd.read_parquet(
-#                     paths_to_load,
-#                     engine="pyarrow",
-#                     filesystem=self.fs,
-#                     columns=columns,
-#                     gather_statistics=False,
-#                     ignore_metadata_file=True
-#                 )
-#                 # If we didn't push down, but have filters, apply them here
-#                 if expr is None and fh and filters:
-#                     expr = fh.compile_filters(filters)
-#                 if expr is not None:
-#                     dd_result = dd_result[expr.mask(dd_result)]
-#
-#             return dd_result
-#
-#         except FileNotFoundError as e:
-#             self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
-#             self.logger.debug("Returning empty DataFrame due to missing parquet files.")
-#             return dd.from_pandas(pd.DataFrame(), npartitions=1)
-#         except Exception as e:
-#             self.logger.debug(f"Parquet loading failed for paths {paths_to_load}: {e}")
-#             self.logger.debug("Returning empty DataFrame due to loading error.")
-#             return dd.from_pandas(pd.DataFrame(), npartitions=1)
-#
-#
-#     @staticmethod
-#     def ensure_file_extension(filepath: str, extension: str) -> str:
-#         """
-#         Ensures that the specified file has the desired extension. If the file already has the
-#         specified extension, it returns the filepath unchanged. Otherwise, it updates the file
-#         extension to the given one and returns the modified filepath.
-#
-#         :param filepath: The path to the file as a string.
-#         :param extension: The desired file extension, without the leading dot.
-#         :return: The updated file path as a string, ensuring it has the specified extension.
-#         """
-#         path = Path(filepath)
-#         return str(path.with_suffix(f".{extension}")) if path.suffix != f".{extension}" else filepath