sibi-dst 2025.9.3__py3-none-any.whl → 2025.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,7 @@
  import datetime as dt
- import logging
  import posixpath
  from pathlib import Path
- from typing import Optional, List, Tuple
+ from typing import Optional, List
 
  import dask.dataframe as dd
  import fsspec
@@ -286,282 +285,3 @@ class ParquetConfig(BaseModel):
          path = Path(filepath)
          return str(path.with_suffix(f".{extension}")) if path.suffix != f".{extension}" else filepath
 
- # import datetime
- # from pathlib import Path
- # from typing import Optional, List
- #
- # import dask.dataframe as dd
- # import fsspec
- # import pandas as pd
- # from pydantic import BaseModel, model_validator, ConfigDict
- #
- # from sibi_dst.df_helper.core import FilterHandler
- # from sibi_dst.utils import FilePathGenerator
- # from sibi_dst.utils import Logger
- #
- #
- # class ParquetConfig(BaseModel):
- #     """
- #     Represents configuration for managing and validating parquet file operations.
- #
- #     The `ParquetConfig` class provides attributes and methods necessary to handle operations
- #     on parquet files in a file system. It includes functionalities for ensuring file paths
- #     and extensions, validating storage paths and parameters, determining file recency,
- #     and calculating the size of parquet files. This class is designed with flexibility to handle
- #     different file systems through the integration with `fsspec` and allows storage path validations
- #     with optional logging support.
- #
- #     :ivar load_parquet: Indicates whether parquet data should be loaded based on the
- #         current configuration and validation.
- #     :type load_parquet: bool
- #     :ivar parquet_filename: The name of the parquet file, optional if folders are used.
- #     :type parquet_filename: Optional[str]
- #     :ivar parquet_storage_path: The base path for storing or retrieving parquet files.
- #     :type parquet_storage_path: Optional[str]
- #     :ivar parquet_full_path: The full path to a specific parquet file, derived from the
- #         storage path and filename when applicable.
- #     :type parquet_full_path: Optional[str]
- #     :ivar parquet_folder_list: A list of folder paths to parquet data, derived from start
- #         and end dates if specified.
- #     :type parquet_folder_list: Optional[List[str]]
- #     :ivar parquet_size_bytes: The total size of the parquet files, in bytes.
- #     :type parquet_size_bytes: int
- #     :ivar parquet_max_age_minutes: Maximum acceptable age of the most recent parquet file, in minutes.
- #     :type parquet_max_age_minutes: int
- #     :ivar parquet_is_recent: Indicates whether the parquet file is considered recent based
- #         on the `parquet_max_age_minutes` condition.
- #     :type parquet_is_recent: bool
- #     :ivar parquet_start_date: The start date for parquet file validation or file path generation.
- #     :type parquet_start_date: Optional[str]
- #     :ivar parquet_end_date: The end date for parquet file validation or file path generation.
- #     :type parquet_end_date: Optional[str]
- #     :ivar fs: The file system object used for storage operations, compliant with `fsspec`.
- #     :type fs: Optional[fsspec.spec.AbstractFileSystem]
- #     :ivar logger: A logger for handling logging operations.
- #     :type logger: Optional[Logger]
- #     """
- #     load_parquet: bool = False
- #     parquet_filename: Optional[str] = None
- #     parquet_storage_path: Optional[str] = None
- #     parquet_full_path: Optional[str] = None
- #     parquet_folder_list: Optional[List[str]] = None
- #     parquet_size_bytes: int = 0
- #     parquet_max_age_minutes: int = 0
- #     parquet_is_recent: bool = False
- #     parquet_start_date: Optional[str] = None
- #     parquet_end_date: Optional[str] = None
- #     fs: Optional[fsspec.spec.AbstractFileSystem] = None  # Your fsspec filesystem object
- #     logger: Optional[Logger] = None
- #     debug: bool = False
- #     model_config = ConfigDict(arbitrary_types_allowed=True)
- #
- #     @model_validator(mode='after')
- #     def check_parquet_params(self):
- #         """
- #         Validates and configures the parameters required for managing parquet files. This includes
- #         configuring paths through `fsspec`, identifying file storage paths, checking the validity of
- #         dates related to parquet files, ensuring proper parquet file extensions, and determining
- #         whether existing parquet files are recent and loadable.
- #
- #         :return: The current instance with validated and migrated attributes configured for
- #             handling parquet files.
- #
- #         :raises ValueError: If certain conditions are not met, such as missing or invalid
- #             `parquet_storage_path`, providing only one of
- #             `parquet_start_date` or `parquet_end_date`, or if the
- #             `parquet_end_date` is earlier than the `parquet_start_date`.
- #         """
- #         # Configure paths based on fsspec
- #         if self.logger is None:
- #             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
- #         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
- #         if self.fs is None:
- #             raise ValueError('Parquet Options: File system (fs) must be specified')
- #
- #         if self.parquet_storage_path is None:
- #             raise ValueError('Parquet storage path must be specified')
- #         self.parquet_storage_path = self.parquet_storage_path.rstrip('/')
- #         # if not self.fs.exists(self.parquet_storage_path):
- #         #     self.fs.mkdirs(self.parquet_storage_path, exist_ok=True)
- #         #     self.logger.debug(f'Parquet storage path {self.parquet_storage_path} does not exist')
- #         self.load_parquet = False
- #         if self.parquet_filename is not None:
- #             self.parquet_full_path = self.ensure_file_extension(
- #                 filepath=self.fs.sep.join([str(self.parquet_storage_path), str(self.parquet_filename)]),
- #                 extension='parquet'
- #             )
- #             self.parquet_is_recent = self.is_file_recent()
- #             self.load_parquet = self.parquet_is_recent and self.fs.exists(self.parquet_full_path)
- #
- #         if self.parquet_start_date is not None:
- #             if self.parquet_end_date is None:
- #                 raise ValueError('Parquet end date must be specified if start date is provided')
- #
- #             start_date = datetime.datetime.strptime(self.parquet_start_date, '%Y-%m-%d')
- #             end_date = datetime.datetime.strptime(self.parquet_end_date, '%Y-%m-%d')
- #             if end_date < start_date:
- #                 raise ValueError('Parquet end date must be greater than start date')
- #
- #             # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
- #             self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), fs=self.fs,
- #                                                          logger=self.logger).generate_file_paths(start_date, end_date)
- #
- #             self.parquet_size_bytes = self.get_parquet_size_bytes()
- #             self.load_parquet = True
- #             # self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
- #         elif self.parquet_end_date is not None:
- #             raise ValueError('Parquet start date must be specified if end date is provided')
- #
- #         return self
- #
- #     def is_file_recent(self):
- #         """
- #         Determines whether the file at the specified parquet path is considered recent
- #         based on its modification time and the maximum age limit defined.
- #
- #         The function first checks for the existence of the file at the specified
- #         `parquet_full_path`. If the file does not exist, the function will return
- #         False. If `parquet_max_age_minutes` is set to 0, it implies no maximum age
- #         limit, and the function will return True. Otherwise, it retrieves the file's
- #         last modified time and calculates the age of the file by comparing it with the
- #         current time. The function returns True if the file's age does not exceed the
- #         maximum age specified by `parquet_max_age_minutes`, otherwise it returns
- #         False.
- #
- #         :return: Whether the file is considered recent based on its existence,
- #             modification time, and maximum age limit.
- #         :rtype: bool
- #         """
- #         if not self.fs.exists(self.parquet_full_path):
- #             return False
- #         if self.parquet_max_age_minutes == 0:
- #             return True
- #         file_time = datetime.datetime.fromtimestamp(self.fs.modified(self.parquet_full_path))
- #         return (datetime.datetime.now() - file_time) <= datetime.timedelta(minutes=self.parquet_max_age_minutes)
- #
- #     def get_parquet_size_bytes(self):
- #         """
- #         Calculate the total size, in bytes, of all Parquet files within the defined
- #         folders specified by `parquet_folder_list`. The function iteratively goes
- #         through each folder in the provided list, applying a recursive wildcard
- #         search to include all levels of nested directories, and calculates the
- #         cumulative size of all found Parquet files using the file system's size
- #         retrieval method.
- #
- #         :raises AttributeError: If `fs` or `parquet_folder_list` attributes are not set
- #             or improperly configured when the method is called.
- #         :raises NotImplementedError: If the `fs.size` or `fs.glob` methods are
- #             unimplemented in the provided file system object or it otherwise lacks
- #             necessary support for these operations.
- #
- #         :return: The cumulative size of all Parquet files located in the folders
- #             defined by `parquet_folder_list`, measured in bytes.
- #         :rtype: int
- #         """
- #         total_size = 0
- #         for folder in self.parquet_folder_list:
- #             # Use a double wildcard ** to match any level of nested directories
- #             for path in self.fs.glob(f"{folder}/**/*.parquet"):
- #                 total_size += self.fs.size(path)
- #         return total_size
- #
- #     def load_files(self, **filters):
- #         """
- #         Loads parquet files into a Dask DataFrame based on the specified conditions.
- #         Supports Parquet predicate pushdown (pyarrow) + residual Dask mask.
- #         """
- #         if not self.load_parquet:
- #             self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
- #             return dd.from_pandas(pd.DataFrame(), npartitions=1)
- #
- #         # Resolve paths
- #         paths_to_load = []
- #         if self.parquet_folder_list:
- #             import posixpath
- #             paths_to_load = sorted(set([posixpath.dirname(p) for p in self.parquet_folder_list]))
- #             paths_to_load = [p.rstrip("/") + "/*.parquet" for p in paths_to_load]
- #         elif self.parquet_full_path:
- #             paths_to_load = [self.parquet_full_path]
- #
- #         if not paths_to_load:
- #             self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
- #             return dd.from_pandas(pd.DataFrame(), npartitions=1)
- #
- #         # Prepare filters
- #         fh = None
- #         expr = None
- #         pq_filters = None
- #         residual_filters = None
- #         if filters:
- #             fh = FilterHandler(backend="dask", debug=self.debug, logger=self.logger)
- #
- #             # Use the compiler + pushdown split so we don't double-apply
- #             try:
- #                 # If you added split_pushdown_and_residual earlier:
- #                 pq_filters, residual_filters = fh.split_pushdown_and_residual(filters)
- #                 expr = fh.compile_filters(residual_filters) if residual_filters else None
- #             except AttributeError:
- #                 # Fallback if you didn't add split_*: push everything down and also mask (redundant but correct)
- #                 expr = fh.compile_filters(filters)
- #                 pq_filters = expr.to_parquet_filters()
- #
- #         try:
- #             self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
- #
- #             # Optional: prune columns. Keep it simple unless you want to compute from filters.
- #             columns = None  # or a concrete list if you know it
- #
- #             if fh and pq_filters:
- #                 self.logger.debug(f"Applying Parquet filters: {pq_filters}")
- #                 dd_result = dd.read_parquet(
- #                     paths_to_load,
- #                     engine="pyarrow",
- #                     filesystem=self.fs,  # your fsspec filesystem (e.g., s3fs)
- #                     filters=pq_filters,
- #                     columns=columns,
- #                     gather_statistics=False,  # uncomment if you have *many* files and don't need global stats
- #                     ignore_metadata_file=True
- #                 )
- #                 # Apply only residual mask (if any)
- #                 if expr is not None:
- #                     dd_result = dd_result[expr.mask(dd_result)]
- #             else:
- #                 dd_result = dd.read_parquet(
- #                     paths_to_load,
- #                     engine="pyarrow",
- #                     filesystem=self.fs,
- #                     columns=columns,
- #                     gather_statistics=False,
- #                     ignore_metadata_file=True
- #                 )
- #                 # If we didn't push down, but have filters, apply them here
- #                 if expr is None and fh and filters:
- #                     expr = fh.compile_filters(filters)
- #                 if expr is not None:
- #                     dd_result = dd_result[expr.mask(dd_result)]
- #
- #             return dd_result
- #
- #         except FileNotFoundError as e:
- #             self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
- #             self.logger.debug("Returning empty DataFrame due to missing parquet files.")
- #             return dd.from_pandas(pd.DataFrame(), npartitions=1)
- #         except Exception as e:
- #             self.logger.debug(f"Parquet loading failed for paths {paths_to_load}: {e}")
- #             self.logger.debug("Returning empty DataFrame due to loading error.")
- #             return dd.from_pandas(pd.DataFrame(), npartitions=1)
- #
- #
- #     @staticmethod
- #     def ensure_file_extension(filepath: str, extension: str) -> str:
- #         """
- #         Ensures that the specified file has the desired extension. If the file already has the
- #         specified extension, it returns the filepath unchanged. Otherwise, it updates the file
- #         extension to the given one and returns the modified filepath.
- #
- #         :param filepath: The path to the file as a string.
- #         :param extension: The desired file extension, without the leading dot.
- #         :return: The updated file path as a string, ensuring it has the specified extension.
- #         """
- #         path = Path(filepath)
- #         return str(path.with_suffix(f".{extension}")) if path.suffix != f".{extension}" else filepath
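
For reference, the removed `load_files` describes a general Dask pattern: comparisons that Parquet row-group statistics can answer are pushed down to the reader via `filters=`, and any residual predicate is applied afterwards as a lazy boolean mask. Below is a minimal standalone sketch of that split, independent of sibi_dst; the path, column names, and filter values are hypothetical.

import dask.dataframe as dd

# Pushdown filters: pyarrow evaluates these against row-group statistics,
# so non-matching row groups are skipped before any data is read.
pq_filters = [("status", "==", "active"), ("event_date", ">=", "2025-01-01")]

df = dd.read_parquet(
    "data/events/*.parquet",  # hypothetical path
    filters=pq_filters,
)

# Residual mask: predicates that row-group statistics cannot express
# (e.g., substring matches) are applied lazily on the Dask DataFrame.
df = df[df["name"].str.contains("foo", na=False)]

Likewise, the retained `ensure_file_extension` is a thin wrapper over `pathlib.Path.with_suffix`: `Path("report.csv").with_suffix(".parquet")` yields `report.parquet`, while a path already ending in `.parquet` is returned unchanged.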