sibi-dst 0.3.29.tar.gz → 0.3.31.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/PKG-INFO +2 -1
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/pyproject.toml +2 -1
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/_df_helper.py +4 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/_parquet_artifact.py +8 -6
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +14 -9
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/data_wrapper.py +31 -8
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/filepath_generator.py +1 -1
- sibi_dst-0.3.31/sibi_dst/utils/parquet_saver.py +228 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/storage_manager.py +7 -3
- sibi_dst-0.3.29/sibi_dst/utils/parquet_saver.py +0 -104
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/README.md +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/airflow_manager.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/log_utils.py +0 -0

{sibi_dst-0.3.29 → sibi_dst-0.3.31}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.29
+Version: 0.3.31
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com

@@ -26,6 +26,7 @@ Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: mysqlclient (>=2.2.6,<3.0.0)
 Requires-Dist: nltk (>=3.9.1,<4.0.0)
 Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
+Requires-Dist: osmnx (>=2.0.1,<3.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: paramiko (>=3.5.0,<4.0.0)
 Requires-Dist: psutil (>=6.1.0,<7.0.0)
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.29"
+version = "0.3.31"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"

@@ -40,6 +40,7 @@ s3fs = "^2024.12.0"
 nltk = "^3.9.1"
 folium = "^0.19.4"
 geopandas = "^1.0.1"
+osmnx = "^2.0.1"


 [build-system]
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/_df_helper.py

@@ -9,6 +9,7 @@ import dask.dataframe as dd
 from dask import delayed, compute
 import pandas as pd
 from pydantic import BaseModel
+import fsspec

 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
 from sibi_dst.utils import Logger

@@ -86,8 +87,11 @@ class DfHelper:
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
         self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
+        self.filesystem = kwargs.pop('filesystem', 'file')
+        self.filesystem_options = kwargs.pop('filesystem_options', {})
         kwargs.setdefault("live", True)
         kwargs.setdefault("logger", self.logger)
+        kwargs.setdefault("fs", fsspec.filesystem('file'))
         self.__post_init(**kwargs)

     def __str__(self):
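The net effect of these two hunks is that DfHelper now accepts `filesystem`, `filesystem_options` and `fs` keyword arguments and defaults `fs` to a local fsspec filesystem. A minimal sketch of how a caller might pass a shared filesystem through the new keywords (the bucket, credentials and backend settings below are hypothetical, not taken from the package):

import fsspec
from sibi_dst.df_helper import DfHelper  # import path assumed from the package layout

# Build one fsspec filesystem and hand it to the helper; kwargs.setdefault("fs", ...)
# keeps a caller-supplied instance instead of the local-filesystem default.
s3 = fsspec.filesystem("s3", key="ACCESS_KEY", secret="SECRET_KEY")  # placeholder credentials
helper = DfHelper(
    backend="parquet",
    parquet_storage_path="s3://example-bucket/data",  # placeholder path
    filesystem="s3",
    filesystem_options={"key": "ACCESS_KEY", "secret": "SECRET_KEY"},
    fs=s3,
)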
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/_parquet_artifact.py

@@ -13,7 +13,7 @@ class ParquetArtifact(DfHelper):
         'backend': 'parquet'
     }

-    def __init__(self, data_wrapper_class,
+    def __init__(self, data_wrapper_class, **kwargs):
         self.config = {
             **self.DEFAULT_CONFIG,
             **kwargs,

@@ -39,13 +39,14 @@ class ParquetArtifact(DfHelper):
             raise ValueError('parquet_end_date must be set')

         # Filesystem setup
-        self.filesystem_type = filesystem_type
-        self.filesystem_options = filesystem_options
-        self.fs =
-
+        self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
+        self.filesystem_options = self.config.setdefault('filesystem_options', {})
+        self.fs = self.config.setdefault('fs', None)
+        if self.fs is None:
+            self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+        self.config.setdefault('fs', self.fs)
         # Ensure the directory exists
         self.ensure_directory_exists(self.parquet_storage_path)
-
         super().__init__(**self.config)

     def load(self, **kwargs):

@@ -97,6 +98,7 @@ class ParquetArtifact(DfHelper):
             'history_days_threshold': kwargs.pop('history_days_threshold', 30),
             'max_age_minutes': kwargs.pop('max_age_minutes', 10),
             'show_progress': kwargs.pop('show_progress', False),
+            'fs': self.fs,
             'filesystem_type': self.filesystem_type,
             'filesystem_options': self.filesystem_options,
         }
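With these changes ParquetArtifact pulls `filesystem_type`, `filesystem_options` and `fs` out of its merged config, builds an fsspec filesystem only when none was supplied, and forwards the same `fs` instance to the DataWrapper parameters shown above. A rough usage sketch under assumed names (the wrapper class, dates, storage path and import path are placeholders):

import fsspec
from sibi_dst.df_helper import ParquetArtifact  # import path assumed

fs = fsspec.filesystem("s3", anon=False)  # reuse one filesystem across the artifact
artifact = ParquetArtifact(
    MyDataWrapper,                                        # placeholder data_wrapper_class
    parquet_storage_path="s3://example-bucket/artifact",  # placeholder path
    parquet_start_date="2024-01-01",                      # assumed parameter names
    parquet_end_date="2024-12-31",
    filesystem_type="s3",
    filesystem_options={"anon": False},
    fs=fs,                                                # skips the fsspec.filesystem(...) fallback
)
df = artifact.load()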
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/df_helper/backends/parquet/_parquet_options.py

@@ -13,8 +13,8 @@ from sibi_dst.utils import Logger
 class ParquetConfig(BaseModel):
     load_parquet: bool = False
     parquet_filename: Optional[str] = None
-    parquet_storage_path: Optional[
-    parquet_full_path: Optional[
+    parquet_storage_path: Optional[str] = None
+    parquet_full_path: Optional[str] = None
     parquet_folder_list: Optional[List[str]] = None
     parquet_size_bytes: int = 0
     parquet_max_age_minutes: int = 0

@@ -30,14 +30,17 @@ class ParquetConfig(BaseModel):
         # Configure paths based on fsspec
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-        self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
-            str(self.parquet_storage_path).split("://")[0])
-
+        #self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
+        #    str(self.parquet_storage_path).split("://")[0])
         # Validation for parquet path
+
+
         if self.parquet_storage_path is None:
             raise ValueError('Parquet storage path must be specified')
+        self.parquet_storage_path = self.parquet_storage_path.rstrip('/')
         if not self.fs.exists(self.parquet_storage_path):
-            raise ValueError('Parquet storage path does not exist')
+            self.fs.mkdirs(self.parquet_storage_path, exist_ok=True)
+            #raise ValueError('Parquet storage path does not exist')
         self.load_parquet = False
         if self.parquet_filename is not None:
             self.parquet_full_path = self.ensure_file_extension(

@@ -57,8 +60,9 @@ class ParquetConfig(BaseModel):
                 raise ValueError('Parquet end date must be greater than start date')

             # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
-            self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path),
+            self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), fs=self.fs,
                                                          logger=self.logger).generate_file_paths(start_date, end_date)
+
             self.parquet_size_bytes = self.get_parquet_size_bytes()
             self.load_parquet = True
             # self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0

@@ -84,11 +88,12 @@ class ParquetConfig(BaseModel):
         return total_size

     def load_files(self):
+
         if self.load_parquet:
             if self.parquet_folder_list:
-                return dd.read_parquet(self.parquet_folder_list, engine="pyarrow")
+                return dd.read_parquet(self.parquet_folder_list, engine="pyarrow", filesystem=self.fs)
             else:
-                return dd.read_parquet(self.parquet_full_path, engine="pyarrow")
+                return dd.read_parquet(self.parquet_full_path, engine="pyarrow", filesystem=self.fs)

     @staticmethod
     def ensure_file_extension(filepath: str, extension: str) -> str:
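Passing `filesystem=self.fs` hands the already-configured fsspec filesystem straight to Dask instead of letting `dd.read_parquet` re-infer one from the path strings. The same pattern in isolation (the bucket and folder names are made up, and it assumes a Dask version whose `read_parquet` accepts the `filesystem` argument):

import dask.dataframe as dd
import fsspec

fs = fsspec.filesystem("s3", anon=True)  # hypothetical public bucket
folders = [
    "s3://example-bucket/data/2024-01-01",
    "s3://example-bucket/data/2024-01-02",
]
ddf = dd.read_parquet(folders, engine="pyarrow", filesystem=fs)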
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/data_wrapper.py

@@ -22,6 +22,7 @@ class DataWrapper:
                  parquet_filename: str,
                  start_date: Any,
                  end_date: Any,
+                 fs: Optional[fsspec.AbstractFileSystem] = None,
                  filesystem_type: str = "file",
                  filesystem_options: Optional[Dict] = None,
                  verbose: bool = False,

@@ -41,7 +42,7 @@ class DataWrapper:
         self.parquet_filename = parquet_filename
         self.filesystem_type = filesystem_type
         self.filesystem_options = filesystem_options or {}
-        self.fs = fsspec.filesystem(filesystem_type, **self.filesystem_options)
+        self.fs = fs or fsspec.filesystem(filesystem_type, **self.filesystem_options)
         self.verbose = verbose
         self.class_params = class_params or {}
         self.load_params = load_params or {}

@@ -129,23 +130,45 @@ class DataWrapper:
     def is_file_older_than(self, file_path: str) -> bool:
         """
         Check if a file is older than the specified max_age_minutes.
+
+        :param file_path: Path to the file.
+        :return: True if the file is older than max_age_minutes, False otherwise.
         """
         try:
+            # Get file info
             info = self.fs.info(file_path)
-
-
-
-
+
+            # Determine the modification time from available keys
+            file_modification_time = None
+            if "mtime" in info:  # Local filesystem
+                file_modification_time = info["mtime"]
+                file_modification_datetime = datetime.datetime.fromtimestamp(
+                    file_modification_time, tz=datetime.timezone.utc
+                )
+            elif "LastModified" in info:  # S3-compatible filesystem
+                file_modification_datetime = (
+                    info["LastModified"] if isinstance(info["LastModified"], datetime.datetime)
+                    else datetime.datetime.strptime(info["LastModified"], "%Y-%m-%dT%H:%M:%S.%fZ")
+                )
+            else:
+                self.logger.warning(f"Modification time not available for {file_path}.")
+                return True  # Assume file is too old if we cannot determine its age
+
+            # Compare file age
             current_time = datetime.datetime.now(datetime.timezone.utc)
             file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
             self.logger.info(
                 f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
                 f"(threshold: {self.max_age_minutes} minutes)"
             )
-
             return file_age_minutes > self.max_age_minutes
+
         except FileNotFoundError:
-
+            self.logger.warning(f"File {file_path} not found.")
+            return True  # File is considered old if it doesn't exist
+        except Exception as e:
+            self.logger.error(f"Error checking file age for {file_path}: {str(e)}")
+            return True  #

     def process_date(self, date: datetime.date):
         """Process a specific date by regenerating data as necessary."""

@@ -162,7 +185,7 @@ class DataWrapper:
             self.logger.error("No data found for the specified date.")
             return

-        parquet_saver = ParquetSaver(df, folder, self.logger)
+        parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
         parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)

         end_time = datetime.datetime.now()
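The rewritten `is_file_older_than` resolves the modification time from whichever key the backend exposes (`mtime` for local files, `LastModified` for S3-style stores) and treats missing or unknown-age files as stale. A self-contained sketch of that idea using only fsspec (the file path is a throwaway local example):

import datetime
import fsspec

def is_older_than(fs, path, max_age_minutes):
    """Sketch of the age check: True when the file is older than the threshold."""
    try:
        info = fs.info(path)
    except FileNotFoundError:
        return True  # missing files count as stale
    if "mtime" in info:  # local filesystem reports a POSIX timestamp
        modified = datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
    elif "LastModified" in info:  # S3-compatible backends report a datetime
        modified = info["LastModified"]
    else:
        return True  # unknown age: assume it needs regenerating
    age_minutes = (datetime.datetime.now(datetime.timezone.utc) - modified).total_seconds() / 60
    return age_minutes > max_age_minutes

fs = fsspec.filesystem("file")
fs.touch("/tmp/example.parquet")  # throwaway file so the call succeeds
print(is_older_than(fs, "/tmp/example.parquet", max_age_minutes=10))  # freshly touched -> False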
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/filepath_generator.py

@@ -91,7 +91,7 @@ class FilePathGenerator:
         if engine == 'dask':
             # Collect individual file paths
             file_pattern = f"{base_dir}/**/*.{self.file_extension}"
-            all_paths = self.fs.glob(file_pattern
+            all_paths = self.fs.glob(file_pattern)

             if not all_paths and self.debug:
                 self.logger.debug(f"No files found with pattern: {file_pattern}")
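For reference, `fs.glob` expands the recursive pattern against whatever filesystem `fs` wraps and returns an empty list when nothing matches; a tiny illustration with an assumed directory and extension:

import fsspec

fs = fsspec.filesystem("file")
file_pattern = "/tmp/parquet-data/**/*.parquet"  # hypothetical base_dir and file_extension
all_paths = fs.glob(file_pattern)                # [] if the tree is empty or missing
print(all_paths)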
sibi_dst-0.3.31/sibi_dst/utils/parquet_saver.py (new file)

@@ -0,0 +1,228 @@
+from pathlib import Path
+from typing import Optional
+
+import pyarrow as pa
+import fsspec
+
+from sibi_dst.utils import Logger
+
+
+class ParquetSaver:
+    def __init__(self, df_result, parquet_storage_path, logger=None, fs=None):
+        """
+        Initialize ParquetSaver.
+        :param df_result: Dask DataFrame to save.
+        :param parquet_storage_path: Base storage path (e.g., "s3://bucket-name/path/").
+        :param logger: Logger instance for logging messages.
+        :param fs: Pre-initialized fsspec filesystem instance. Defaults to 'file' if None.
+        """
+        self.df_result = df_result
+        self.parquet_storage_path = parquet_storage_path.rstrip("/")
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+
+        # Default to the local filesystem if `fs` is not provided
+        self.fs = fs or fsspec.filesystem("file")
+
+    def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
+        """
+        Save the DataFrame to Parquet format.
+        :param parquet_filename: Filename for the Parquet file.
+        :param clear_existing: Whether to clear existing files in the target directory.
+        """
+        full_path = self._construct_full_path(parquet_filename)
+
+        # Ensure directory exists and clear if necessary
+        self._ensure_directory_exists(full_path, clear_existing=clear_existing)
+
+        # Define schema and save DataFrame to Parquet
+        schema = self._define_schema()
+        self._convert_dtypes(schema)
+        self._save_dataframe_to_parquet(full_path, schema)
+
+    def _define_schema(self) -> pa.Schema:
+        """Define a PyArrow schema dynamically based on df_result column types."""
+        pandas_dtype_to_pa = {
+            "object": pa.string(),
+            "string": pa.string(),
+            "Int64": pa.int64(),
+            "int64": pa.int64(),
+            "float64": pa.float64(),
+            "float32": pa.float32(),
+            "bool": pa.bool_(),
+            "boolean": pa.bool_(),  # pandas nullable boolean
+            "datetime64[ns]": pa.timestamp("ns"),
+            "timedelta[ns]": pa.duration("ns"),
+        }
+
+        dtypes = self.df_result.dtypes
+
+        fields = [
+            pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
+            for col, dtype in dtypes.items()
+        ]
+        return pa.schema(fields)
+
+    def _convert_dtypes(self, schema: pa.Schema):
+        """Convert DataFrame columns to match the specified schema."""
+        dtype_mapping = {}
+        for field in schema:
+            col_name = field.name
+            if col_name in self.df_result.columns:
+                if pa.types.is_string(field.type):
+                    dtype_mapping[col_name] = "string"
+                elif pa.types.is_int64(field.type):
+                    dtype_mapping[col_name] = "Int64"
+                elif pa.types.is_float64(field.type):
+                    dtype_mapping[col_name] = "float64"
+                elif pa.types.is_float32(field.type):
+                    dtype_mapping[col_name] = "float32"
+                elif pa.types.is_boolean(field.type):
+                    dtype_mapping[col_name] = "boolean"
+                elif pa.types.is_timestamp(field.type):
+                    dtype_mapping[col_name] = "datetime64[ns]"
+                else:
+                    dtype_mapping[col_name] = "object"
+        self.df_result = self.df_result.astype(dtype_mapping)
+
+    def _construct_full_path(self, parquet_filename: Optional[str]) -> str:
+        """Construct and return the full path for the Parquet file."""
+        parquet_filename = parquet_filename or "default.parquet"
+        return f"{self.parquet_storage_path}/{parquet_filename}"
+
+    def _ensure_directory_exists(self, full_path: str, clear_existing=False):
+        """
+        Ensure that the directory for the path exists, clearing it if specified.
+        :param full_path: Full path for the target file.
+        :param clear_existing: Whether to clear existing files/directories.
+        """
+        directory = "/".join(full_path.split("/")[:-1])
+
+        if self.fs.exists(directory):
+            if clear_existing:
+                self.logger.info(f"Clearing existing directory: {directory}")
+                self.fs.rm(directory, recursive=True)
+        else:
+            self.logger.info(f"Creating directory: {directory}")
+            self.fs.mkdirs(directory, exist_ok=True)
+
+    def _save_dataframe_to_parquet(self, full_path: str, schema: pa.Schema):
+        """Save the DataFrame to Parquet using the specified schema."""
+        if self.fs.exists(full_path):
+            self.logger.info(f"Overwriting existing file: {full_path}")
+            self.fs.rm(full_path, recursive=True)
+
+        self.logger.info(f"Saving Parquet file to: {full_path}")
+        self.df_result.to_parquet(
+            full_path,
+            engine="pyarrow",
+            schema=schema,
+            storage_options=self.fs.storage_options if hasattr(self.fs, "storage_options") else None,
+            write_index=False,
+        )
+
+# from pathlib import Path
+# from typing import Optional
+#
+# import fsspec
+# import pyarrow as pa
+#
+# from sibi_dst.utils import Logger
+#
+#
+# class ParquetSaver:
+#     def __init__(self, df_result, parquet_storage_path, logger=None, fs=None):
+#         # Ensure df_result is a Dask DataFrame
+#         self.fs = fs or fsspec.filesystem("file")
+#         self.df_result = df_result
+#         self.parquet_storage_path = parquet_storage_path
+#         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+#
+#     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
+#         full_path = self._construct_full_path(parquet_filename)
+#
+#         # We cannot check for empty DataFrame directly with Dask without computation
+#         # Proceed with saving; if the DataFrame is empty, an empty Parquet file will be created
+#
+#         # Ensure directory exists and clear if necessary
+#         self._ensure_directory_exists(full_path, clear_existing=clear_existing)
+#
+#         # Define schema and save DataFrame to Parquet
+#         schema = self._define_schema()
+#         self._convert_dtypes(schema)
+#         self._save_dataframe_to_parquet(full_path, schema)
+#
+#     def _define_schema(self) -> pa.Schema:
+#         """Define a PyArrow schema dynamically based on df_result column types."""
+#         pandas_dtype_to_pa = {
+#             'object': pa.string(),
+#             'string': pa.string(),
+#             'Int64': pa.int64(),
+#             'int64': pa.int64(),
+#             'float64': pa.float64(),
+#             'float32': pa.float32(),
+#             'bool': pa.bool_(),
+#             'boolean': pa.bool_(),  # pandas nullable boolean
+#             'datetime64[ns]': pa.timestamp('ns'),
+#             'timedelta[ns]': pa.duration('ns')
+#         }
+#
+#         dtypes = self.df_result.dtypes  # No need to call .compute()
+#
+#         fields = [
+#             pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
+#             for col, dtype in dtypes.items()
+#         ]
+#         return pa.schema(fields)
+#
+#     def _convert_dtypes(self, schema: pa.Schema):
+#         """Convert DataFrame columns to match the specified schema."""
+#         dtype_mapping = {}
+#         for field in schema:
+#             col_name = field.name
+#             if col_name in self.df_result.columns:
+#                 if pa.types.is_string(field.type):
+#                     dtype_mapping[col_name] = 'string'
+#                 elif pa.types.is_int64(field.type):
+#                     dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
+#                 elif pa.types.is_float64(field.type):
+#                     dtype_mapping[col_name] = 'float64'
+#                 elif pa.types.is_float32(field.type):
+#                     dtype_mapping[col_name] = 'float32'
+#                 elif pa.types.is_boolean(field.type):
+#                     dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
+#                 elif pa.types.is_timestamp(field.type):
+#                     dtype_mapping[col_name] = 'datetime64[ns]'
+#                 else:
+#                     dtype_mapping[col_name] = 'object'  # Fallback to object
+#         # Convert dtypes
+#         self.df_result = self.df_result.astype(dtype_mapping)
+#
+#     def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
+#         """Construct and return the full path for the Parquet file."""
+#         _, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
+#         parquet_filename = parquet_filename or "default.parquet"
+#         return Path(base_path) / parquet_filename
+#
+#     @staticmethod
+#     def _ensure_directory_exists(full_path: Path, clear_existing=False):
+#         """Ensure that the directory for the path exists, clearing it if specified."""
+#         fs, _ = fsspec.core.url_to_fs(str(full_path))
+#         directory = str(full_path.parent)
+#
+#         if fs.exists(directory):
+#             if clear_existing:
+#                 fs.rm(directory, recursive=True)
+#         else:
+#             fs.mkdirs(directory, exist_ok=True)
+#
+#     def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
+#         """Save the DataFrame to Parquet using the specified schema."""
+#         fs, _ = fsspec.core.url_to_fs(str(full_path))
+#         print(f"Saving to {str(full_path)}")
+#         if fs.exists(str(full_path)):
+#             fs.rm(str(full_path), recursive=True)
+#
+#         # Save the Dask DataFrame to Parquet
+#         self.df_result.to_parquet(
+#             str(full_path), engine="pyarrow", schema=schema, write_index=False
+#         )
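A rough end-to-end sketch of driving the new class (the toy DataFrame, target directory and filename are invented, and the import path is assumed from the package layout):

import dask.dataframe as dd
import fsspec
import pandas as pd
from sibi_dst.utils import ParquetSaver  # import path assumed

ddf = dd.from_pandas(pd.DataFrame({"id": [1, 2], "name": ["a", "b"]}), npartitions=1)
fs = fsspec.filesystem("file")  # any fsspec filesystem (e.g. s3fs) can be passed instead
saver = ParquetSaver(ddf, parquet_storage_path="/tmp/sibi-dst-demo", fs=fs)
saver.save_to_parquet("example.parquet", clear_existing=True)

Because the writer forwards `self.fs.storage_options` to Dask, the same call pattern can target remote stores as long as the filesystem was created with the right credentials.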
{sibi_dst-0.3.29 → sibi_dst-0.3.31}/sibi_dst/utils/storage_manager.py

@@ -7,11 +7,12 @@ class StorageManager:
     def __init__(self, storage_path, fs_type="file", fs_options=None):
         """
         Initializes the StorageManager with the base storage path and file system settings.
-        :param storage_path: Base path for the storage.
+        :param storage_path: Base path for the storage (e.g., "s3://my-bucket").
         :param fs_type: File system type (e.g., "file", "s3").
         :param fs_options: Dictionary of options for fsspec file system (e.g., credentials).
         """
-
+        # Ensure the storage_path ends with a slash for consistency
+        self.storage_path = storage_path.rstrip("/")
         self.fs_type = fs_type
         self.fs_options = fs_options or {}
         self.fs = fsspec.filesystem(fs_type, **self.fs_options)

@@ -33,6 +34,7 @@ class StorageManager:
         :param dirs_to_create: List of subdirectories to create.
         :param clear_existing: Whether to clear existing directories.
         """
+        print(f"Setting up directories under: {base_path}")
         if clear_existing:
             print(f"Warning: All existing contents in {base_path} will be removed.")
             if self.fs.exists(base_path):

@@ -44,6 +46,7 @@ class StorageManager:
         # Create subdirectories
         for sub_directory in dirs_to_create:
             sub_path = self.join_paths(base_path, sub_directory)
+            print(f"Creating directory: {sub_path}")
             if clear_existing and self.fs.exists(sub_path):
                 self.fs.rm(sub_path, recursive=True)
             self.fs.mkdirs(sub_path, exist_ok=True)

@@ -59,6 +62,7 @@ class StorageManager:
         # Ensure directories exist (optionally clear existing ones)
         for depot, sub_directories in depots.items():
             depot_path = self.join_paths(self.storage_path, depot)
+            print(f"Rebuilding depot at: {depot_path}")
             self.setup_directories(depot_path, sub_directories, clear_existing=clear_existing)

         # Generate depot_paths dictionary

@@ -86,4 +90,4 @@ class StorageManager:
         """
         print("Rebuilding depot structure...")
         self.rebuild_depot_paths(depots, clear_existing=clear_existing)
-        print("Rebuild complete.")
+        print("Rebuild complete.")
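The docstring tweak and `rstrip("/")` mostly affect how depot paths are joined, while the added prints trace each directory as it is (re)built. A short hedged sketch of exercising it (the bucket, depot layout and import path are invented or assumed):

from sibi_dst.utils import StorageManager  # import path assumed

manager = StorageManager("s3://example-bucket/warehouse/", fs_type="s3",
                         fs_options={"anon": False})       # trailing slash is stripped
depots = {"sales": ["bronze", "silver", "gold"]}            # hypothetical depot layout
manager.rebuild_depot_paths(depots, clear_existing=False)   # prints each path it touches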
sibi_dst-0.3.29/sibi_dst/utils/parquet_saver.py (deleted file)

@@ -1,104 +0,0 @@
-from pathlib import Path
-from typing import Optional
-
-import fsspec
-import pyarrow as pa
-
-from sibi_dst.utils import Logger
-
-
-class ParquetSaver:
-    def __init__(self, df_result, parquet_storage_path, logger=None):
-        # Ensure df_result is a Dask DataFrame
-        self.df_result = df_result
-        self.parquet_storage_path = parquet_storage_path
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-
-    def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
-        full_path = self._construct_full_path(parquet_filename)
-
-        # We cannot check for empty DataFrame directly with Dask without computation
-        # Proceed with saving; if the DataFrame is empty, an empty Parquet file will be created
-
-        # Ensure directory exists and clear if necessary
-        self._ensure_directory_exists(full_path, clear_existing=clear_existing)
-
-        # Define schema and save DataFrame to Parquet
-        schema = self._define_schema()
-        self._convert_dtypes(schema)
-        self._save_dataframe_to_parquet(full_path, schema)
-
-    def _define_schema(self) -> pa.Schema:
-        """Define a PyArrow schema dynamically based on df_result column types."""
-        pandas_dtype_to_pa = {
-            'object': pa.string(),
-            'string': pa.string(),
-            'Int64': pa.int64(),
-            'int64': pa.int64(),
-            'float64': pa.float64(),
-            'float32': pa.float32(),
-            'bool': pa.bool_(),
-            'boolean': pa.bool_(),  # pandas nullable boolean
-            'datetime64[ns]': pa.timestamp('ns'),
-            'timedelta[ns]': pa.duration('ns')
-        }
-
-        dtypes = self.df_result.dtypes  # No need to call .compute()
-
-        fields = [
-            pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
-            for col, dtype in dtypes.items()
-        ]
-        return pa.schema(fields)
-
-    def _convert_dtypes(self, schema: pa.Schema):
-        """Convert DataFrame columns to match the specified schema."""
-        dtype_mapping = {}
-        for field in schema:
-            col_name = field.name
-            if col_name in self.df_result.columns:
-                if pa.types.is_string(field.type):
-                    dtype_mapping[col_name] = 'string'
-                elif pa.types.is_int64(field.type):
-                    dtype_mapping[col_name] = 'Int64'  # pandas nullable integer
-                elif pa.types.is_float64(field.type):
-                    dtype_mapping[col_name] = 'float64'
-                elif pa.types.is_float32(field.type):
-                    dtype_mapping[col_name] = 'float32'
-                elif pa.types.is_boolean(field.type):
-                    dtype_mapping[col_name] = 'boolean'  # pandas nullable boolean
-                elif pa.types.is_timestamp(field.type):
-                    dtype_mapping[col_name] = 'datetime64[ns]'
-                else:
-                    dtype_mapping[col_name] = 'object'  # Fallback to object
-        # Convert dtypes
-        self.df_result = self.df_result.astype(dtype_mapping)
-
-    def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
-        """Construct and return the full path for the Parquet file."""
-        _, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
-        parquet_filename = parquet_filename or "default.parquet"
-        return Path(base_path) / parquet_filename
-
-    @staticmethod
-    def _ensure_directory_exists(full_path: Path, clear_existing=False):
-        """Ensure that the directory for the path exists, clearing it if specified."""
-        fs, _ = fsspec.core.url_to_fs(str(full_path))
-        directory = str(full_path.parent)
-
-        if fs.exists(directory):
-            if clear_existing:
-                fs.rm(directory, recursive=True)
-        else:
-            fs.mkdirs(directory, exist_ok=True)
-
-    def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
-        """Save the DataFrame to Parquet using the specified schema."""
-        fs, _ = fsspec.core.url_to_fs(str(full_path))
-        if fs.exists(str(full_path)):
-            fs.rm(str(full_path), recursive=True)
-
-        # Save the Dask DataFrame to Parquet
-        self.df_result.to_parquet(
-            str(full_path), engine="pyarrow", schema=schema, write_index=False
-        )