sibi-dst 0.3.33__py3-none-any.whl → 0.3.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -91,7 +91,7 @@ class DfHelper:
         self.filesystem_options = kwargs.pop('filesystem_options', {})
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
-        kwargs.setdefault("fs", fsspec.filesystem('file'))
+        self.fs =kwargs.setdefault("fs", fsspec.filesystem('file'))
         self.__post_init(**kwargs)

     def __str__(self):
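The new assignment works because dict.setdefault returns whichever value ends up stored under the key, so self.fs captures either the caller-supplied filesystem or the local default. A minimal sketch of that pattern outside the class (the resolve_fs helper and the "memory" override are illustrative, not part of sibi_dst):

import fsspec

def resolve_fs(**kwargs):
    # setdefault inserts the default only when "fs" is absent and always
    # returns the value now stored in the dict, so a caller-supplied
    # filesystem wins when one is provided.
    return kwargs.setdefault("fs", fsspec.filesystem("file"))

print(type(resolve_fs()).__name__)                                # LocalFileSystem
print(type(resolve_fs(fs=fsspec.filesystem("memory"))).__name__)  # MemoryFileSystem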
@@ -208,6 +208,18 @@ class DfHelper:
         return asyncio.run(self.__load_from_http(**options))

     def __load_from_sqlalchemy(self, **options):
+        """
+        Loads data from an SQLAlchemy database source into a dataframe. The method processes
+        the loaded data and applies post-processing to transform it into the desired structure.
+        If the operation fails, an empty pandas DataFrame is created as a fallback.
+
+        :param options: Additional keyword arguments to configure the data loading process.
+            These options can include configurations such as 'debug' and other parameters
+            required by the `SqlAlchemyLoadFromDb` class.
+        :type options: dict
+        :return: A dataframe containing the data loaded from the SQLAlchemy database.
+        :rtype: dask.dataframe.DataFrame
+        """
         try:
             options.setdefault("debug", self.debug)
             db_loader = SqlAlchemyLoadFromDb(
@@ -228,6 +240,17 @@ class DfHelper:
         return self.df

     def __load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+        """
+        Loads data from a Django database using a specific backend query mechanism. Processes the loaded data
+        and applies further post-processing before returning the dataframe. If the operation fails, an
+        empty dataframe with a single partition is returned instead.
+
+        :param options: Additional settings for the database loading process, which include optional configurations
+            like debug mode, among others.
+        :type options: dict
+        :return: A dataframe containing the loaded data either as a Pandas or Dask dataframe.
+        :rtype: Union[pd.DataFrame, dd.DataFrame]
+        """
         try:
             options.setdefault("debug", self.debug)
             db_loader = DjangoLoadFromDb(
@@ -248,7 +271,18 @@ class DfHelper:
         return self.df

     async def __load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        """
+        """
+        Loads data asynchronously from an HTTP source using the configured HTTP plugin.
+        If the HTTP plugin is not properly configured, this method logs a debug message and
+        returns an empty Dask DataFrame. If an exception occurs during data fetching, the error
+        is logged and an empty Dask DataFrame with one partition is returned.
+
+        :param options: Additional keyword arguments that are passed to the HTTP plugin for
+            fetching the data.
+        :returns: A DataFrame object that can either be a pandas or a Dask DataFrame. When the
+            fetching operation fails, it defaults to returning an empty Dask DataFrame
+            with a single partition.
+        """
         if not self.backend_http:
             self.logger.debug("HTTP plugin not configured properly.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
@@ -339,12 +373,45 @@ class DfHelper:

         self.logger.debug("Processing of loaded data completed.")

-    def save_to_parquet(self, parquet_filename: Optional[str] = None):
-
+    def save_to_parquet(self, parquet_filename: Optional[str] = None, **kwargs):
+        """
+        Save the dataframe result to a Parquet file using specified configurations.
+
+        This method leverages the ParquetSaver class to store the dataframe result
+        into a Parquet file. It also provides functionality for overriding the default
+        filesystem (`fs`) and storage path (`parquet_storage_path`). The method logs
+        details about the saving operation for debugging purposes.
+
+        :param parquet_filename: The name of the Parquet file to save the dataframe to.
+            If not provided, a default name will be used.
+        :param kwargs: Additional arguments to customize the saving process. These may
+            include:
+            - `fs`: Filesystem to be used for saving Parquet files. If not
+              provided, defaults to the instance's filesystem attribute.
+            - `parquet_storage_path`: The root path in the filesystem where
+              Parquet files should be saved. If not provided, defaults to
+              the instance's attribute for storage path.
+        :return: None
+        """
+        fs = kwargs.pop('fs', self.fs)
+        parquet_storage_path = kwargs.pop('parquet_storage_path', self.parquet_storage_path)
+        ps = ParquetSaver(df_result=self.df, parquet_storage_path=parquet_storage_path, logger=self.logger, fs=fs)
         ps.save_to_parquet(parquet_filename)
-        self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {
+        self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {parquet_storage_path}.")

     def save_to_clickhouse(self, **credentials):
+        """
+        Saves the current DataFrame to ClickHouse using the provided credentials. This
+        method first checks if the DataFrame is empty. If it is empty, the method logs
+        a debug message and does not proceed with saving. Otherwise, it initializes
+        a ClickHouseWriter instance and uses it to save the DataFrame to ClickHouse,
+        logging a debug message upon successful completion.
+
+        :param credentials: Credentials required to connect to ClickHouse as keyword
+            arguments.
+        :type credentials: dict
+        :return: None
+        """
         if self.df.map_partitions(len).compute().sum() == 0:
             self.logger.debug("Cannot write to clickhouse since Dataframe is empty")
             return
@@ -353,6 +420,21 @@ class DfHelper:
         self.logger.debug("Save to ClickHouse completed.")

     def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+        """
+        Loads data from parquet files into a DataFrame, applies provided filters, and handles exceptions.
+
+        This method leverages a backend-specific implementation to load data from parquet files into a
+        DataFrame. If additional options are provided and the data is successfully loaded, filters are
+        applied to the DataFrame using a filter handler. Errors during this process are handled gracefully
+        by logging the issue and returning an empty Dask DataFrame.
+
+        :param options: A dictionary of filter options to be applied to the DataFrame.
+        :type options: dict
+
+        :return: A DataFrame containing the loaded and filtered data. If the operation fails, an empty
+            Dask DataFrame is returned.
+        :rtype: Union[pd.DataFrame, dd.DataFrame]
+        """
         try:
             self.df = self.backend_parquet.load_files()
             if options and self.df is not None:
@@ -368,6 +450,27 @@ class DfHelper:
         return dd.from_pandas(pd.DataFrame(), npartitions=1)

     def load_period(self, **kwargs):
+        """
+        Loads a period with specified parameters.
+
+        This method acts as a wrapper around the private ``__load_period`` method. It
+        accepts arbitrary keyword arguments that are passed directly to the private
+        method for execution. The purpose of allowing keyword arguments is to permit
+        flexible configuration or parameterization for loading a specific period, based
+        on the internal implementation of the private ``__load_period`` method.
+
+        Note:
+            The arguments and return values are entirely determined by the private
+            method's behavior. This method is intentionally designed to mask details
+            of the internal logic behind the abstraction.
+
+        :param kwargs: Arbitrary keyword arguments to parameterize the internal logic
+            of loading a period. The specific keys and values expected by the
+            ``__load_period`` method depend on its own internal implementation.
+        :return: The result of calling the private ``__load_period`` method with the
+            provided keyword arguments. The return type is dependent on the internal
+            implementation of ``__load_period``.
+        """
         return self.__load_period(**kwargs)

     def __load_period(self, **kwargs):
sibi_dst/utils/parquet_saver.py
CHANGED
@@ -121,109 +121,3 @@ class ParquetSaver:
             write_index=False,
         )

-# from pathlib import Path
-# from typing import Optional
-#
-# import fsspec
-# import pyarrow as pa
-#
-# from sibi_dst.utils import Logger
-#
-#
-# class ParquetSaver:
-#     def __init__(self, df_result, parquet_storage_path, logger=None, fs=None):
-#         # Ensure df_result is a Dask DataFrame
-#         self.fs = fs or fsspec.filesystem("file")
-#         self.df_result = df_result
-#         self.parquet_storage_path = parquet_storage_path
-#         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-#
-#     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
-#         full_path = self._construct_full_path(parquet_filename)
-#
-#         # We cannot check for empty DataFrame directly with Dask without computation
-#         # Proceed with saving; if the DataFrame is empty, an empty Parquet file will be created
-#
-#         # Ensure directory exists and clear if necessary
-#         self._ensure_directory_exists(full_path, clear_existing=clear_existing)
-#
-#         # Define schema and save DataFrame to Parquet
-#         schema = self._define_schema()
-#         self._convert_dtypes(schema)
-#         self._save_dataframe_to_parquet(full_path, schema)
-#
-#     def _define_schema(self) -> pa.Schema:
-#         """Define a PyArrow schema dynamically based on df_result column types."""
-#         pandas_dtype_to_pa = {
-#             'object': pa.string(),
-#             'string': pa.string(),
-#             'Int64': pa.int64(),
-#             'int64': pa.int64(),
-#             'float64': pa.float64(),
-#             'float32': pa.float32(),
-#             'bool': pa.bool_(),
-#             'boolean': pa.bool_(),  # pandas nullable boolean
-#             'datetime64[ns]': pa.timestamp('ns'),
-#             'timedelta[ns]': pa.duration('ns')
-#         }
-#
-#         dtypes = self.df_result.dtypes  # No need to call .compute()
-#
-#         fields = [
-#             pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
-#             for col, dtype in dtypes.items()
-#         ]
-#         return pa.schema(fields)
-#
-#     def _convert_dtypes(self, schema: pa.Schema):
-#         """Convert DataFrame columns to match the specified schema."""
-#         dtype_mapping = {}
-#         for field in schema:
-#             col_name = field.name
-#             if col_name in self.df_result.columns:
-#                 if pa.types.is_string(field.type):
-#                     dtype_mapping[col_name] = 'string'
-#                 elif pa.types.is_int64(field.type):
-#                     dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
-#                 elif pa.types.is_float64(field.type):
-#                     dtype_mapping[col_name] = 'float64'
-#                 elif pa.types.is_float32(field.type):
-#                     dtype_mapping[col_name] = 'float32'
-#                 elif pa.types.is_boolean(field.type):
-#                     dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
-#                 elif pa.types.is_timestamp(field.type):
-#                     dtype_mapping[col_name] = 'datetime64[ns]'
-#                 else:
-#                     dtype_mapping[col_name] = 'object'  # Fallback to object
-#         # Convert dtypes
-#         self.df_result = self.df_result.astype(dtype_mapping)
-#
-#     def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
-#         """Construct and return the full path for the Parquet file."""
-#         _, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
-#         parquet_filename = parquet_filename or "default.parquet"
-#         return Path(base_path) / parquet_filename
-#
-#     @staticmethod
-#     def _ensure_directory_exists(full_path: Path, clear_existing=False):
-#         """Ensure that the directory for the path exists, clearing it if specified."""
-#         fs, _ = fsspec.core.url_to_fs(str(full_path))
-#         directory = str(full_path.parent)
-#
-#         if fs.exists(directory):
-#             if clear_existing:
-#                 fs.rm(directory, recursive=True)
-#         else:
-#             fs.mkdirs(directory, exist_ok=True)
-#
-#     def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
-#         """Save the DataFrame to Parquet using the specified schema."""
-#         fs, _ = fsspec.core.url_to_fs(str(full_path))
-#         print(f"Saving to {str(full_path)}")
-#         if fs.exists(str(full_path)):
-#             fs.rm(str(full_path), recursive=True)
-#
-#         # Save the Dask DataFrame to Parquet
-#         self.df_result.to_parquet(
-#             str(full_path), engine="pyarrow", schema=schema, write_index=False
-#         )
sibi_dst-0.3.33.dist-info/RECORD → sibi_dst-0.3.34.dist-info/RECORD
CHANGED
@@ -1,6 +1,6 @@
 sibi_dst/__init__.py,sha256=CLHfzrFNqklNx5uMKAPtbZfkbBbVYR5qsiMro0RTfmA,252
 sibi_dst/df_helper/__init__.py,sha256=5yzslP6zYYOHsTtAzHnNDXHYjf_T6yW7baxwgtduWqQ,292
-sibi_dst/df_helper/_df_helper.py,sha256=
+sibi_dst/df_helper/_df_helper.py,sha256=NRiLdHHO45SPwhif5JIQpfj56iC8HcffaRAyT7-TC2w,29585
 sibi_dst/df_helper/_parquet_artifact.py,sha256=K9FnKjXDmkqCzYqv5weS9scLHsPGyj0UUUoVzOtWv30,8858
 sibi_dst/df_helper/_parquet_reader.py,sha256=HhzhKtV_7qABHJvmpU2CssjNLgQHUB07eF0CqqzmkOs,3654
 sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -48,8 +48,8 @@ sibi_dst/utils/df_utils.py,sha256=OFEtcwVKIilvf9qVf-IfIOHp4jcFAHX5l2IDGudhPZg,10
 sibi_dst/utils/file_utils.py,sha256=JpsybYj3XvVJisSBeVU6YSaZnYRm4_6YWTI3TLnnY4Y,1257
 sibi_dst/utils/filepath_generator.py,sha256=volVm0SSlBrtZp1RpTHxyui5rj5asNcVsWEBRY5FOUQ,6673
 sibi_dst/utils/log_utils.py,sha256=XUbeXa1JsOlcEJyW8jnBlWo295rLUnuYi-HMzyhHwJg,3145
-sibi_dst/utils/parquet_saver.py,sha256=
+sibi_dst/utils/parquet_saver.py,sha256=FmSTOVhKruGw6r5G1sH3kKqsP0tCuU32KTlyQBLpXos,5092
 sibi_dst/utils/storage_manager.py,sha256=qHo5vTv-dr1roRr_mOcprSTdlAfH4Q2Dy5tQUz06Pnk,4228
-sibi_dst-0.3.
-sibi_dst-0.3.
-sibi_dst-0.3.
+sibi_dst-0.3.34.dist-info/METADATA,sha256=ewd8lmlRjJg0lEeEI0ju5g20zGk7Lk1bdgBxunNpf3s,2564
+sibi_dst-0.3.34.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-0.3.34.dist-info/RECORD,,
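The sha256= values in RECORD follow the wheel metadata format: a SHA-256 digest of the file, urlsafe-base64 encoded with the trailing padding stripped, followed by the file size in bytes. A sketch of how one of those entries could be recomputed for verification (the path is a placeholder for a file inside an unpacked wheel):

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

# Example usage against an unpacked wheel:
# print(record_entry("sibi_dst/utils/parquet_saver.py"))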