sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
  3. sibi_dst/df_helper/_df_helper.py +417 -117
  4. sibi_dst/df_helper/_parquet_artifact.py +255 -283
  5. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  6. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  7. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  8. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  9. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  10. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  12. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  13. sibi_dst/utils/base.py +302 -96
  14. sibi_dst/utils/clickhouse_writer.py +472 -206
  15. sibi_dst/utils/data_utils.py +139 -186
  16. sibi_dst/utils/data_wrapper.py +317 -73
  17. sibi_dst/utils/date_utils.py +1 -0
  18. sibi_dst/utils/df_utils.py +193 -213
  19. sibi_dst/utils/file_utils.py +3 -2
  20. sibi_dst/utils/filepath_generator.py +314 -152
  21. sibi_dst/utils/log_utils.py +581 -242
  22. sibi_dst/utils/manifest_manager.py +60 -76
  23. sibi_dst/utils/parquet_saver.py +33 -27
  24. sibi_dst/utils/phone_formatter.py +88 -95
  25. sibi_dst/utils/update_planner.py +180 -178
  26. sibi_dst/utils/webdav_client.py +116 -166
  27. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
  28. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +29 -27
  29. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_parquet_artifact.py
@@ -1,328 +1,300 @@
 from __future__ import annotations
-import datetime
+
+import datetime as dt
 import threading
-from typing import Optional, Any, Dict, ClassVar
+from functools import cached_property
+from typing import Any, Dict, Type, TypeVar

-import dask.dataframe as dd
-import fsspec
+from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner, ManagedResource
+from sibi_dst.utils import MissingManifestManager, Logger

-from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner
-from sibi_dst.utils import MissingManifestManager
+T = TypeVar("T")


-class ParquetArtifact(DfHelper):
+class ParquetArtifact(ManagedResource):
     """
-    Class designed to manage Parquet data storage and retrieval using a specified
-    DataWrapper class for data processing. It provides functionality for loading,
-    updating, rebuilding, and generating Parquet files within a configurable
-    storage filesystem. The class ensures that all essential configurations and
-    filesystems are properly set up before operations.
-
-    Detailed functionality includes support for dynamically managing and generating
-    Parquet files based on time periods, with customizable options for paths,
-    filenames, date fields, and more. It is an abstraction for efficiently handling
-    storage tasks related to distributed or local file systems.
-
-    :ivar config: Configuration dictionary containing all configurable parameters
-        for managing Parquet data storage, such as paths, filenames,
-        and date ranges.
-    :type config: dict
-    :ivar df: Cached Dask DataFrame used to store and manipulate data loaded
-        from the Parquet file.
-    :type df: Optional[dask.dataframe.DataFrame]
-    :ivar data_wrapper_class: Class responsible for abstracting data processing
-        operations required for Parquet file generation.
-    :type data_wrapper_class: type
-    :ivar date_field: Name of the field used to identify and process data by date.
-    :type date_field: Optional[str]
-    :ivar parquet_storage_path: Filesystem path to store Parquet files.
-    :type parquet_storage_path: Optional[str]
-    :ivar parquet_filename: Name of the Parquet file to be generated and managed.
-    :type parquet_filename: Optional[str]
-    :ivar parquet_start_date: Date string specifying the start date for data range
-        processing.
-    :type parquet_start_date: Optional[str]
-    :ivar parquet_end_date: Date string specifying the end date for data range
-        processing.
-    :type parquet_end_date: Optional[str]
-    :ivar filesystem_type: Type of the filesystem used for managing storage
-        operations (e.g., `file`, `s3`, etc.).
-    :type filesystem_type: str
-    :ivar filesystem_options: Additional options for configuring the filesystem.
-    :type filesystem_options: dict
-    :ivar fs: Filesystem object used for storage operations.
-    :type fs: fsspec.AbstractFileSystem
+    Orchestrates a single dataset:
+      - Builds/uses MissingManifestManager
+      - Plans work with UpdatePlanner
+      - Executes with DataWrapper (threaded) saving Dask Parquet
+      - Prevents duplicate concurrent runs per (storage_path, filename)
+      - Forwards retry/backoff knobs to DataWrapper.process()
     """
-    DEFAULT_CONFIG: ClassVar[Dict[str, str]] = {
-        'backend': 'parquet'
-    }

+    _global_lock = threading.RLock()
+    _active_runs: set[tuple[str, str]] = set()

-    def __init__(self, data_wrapper_class, **kwargs):
-        """
-        Initializes an instance of the class with given configuration and validates
-        required parameters. Sets up the filesystem to handle storage, ensuring
-        necessary directories exist. The configuration supports a variety of options
-        to manage parquet storage requirements, including paths, filenames, and date
-        ranges.
-
-        :param data_wrapper_class: The class responsible for wrapping data to be managed
-            by this instance.
-        :type data_wrapper_class: type
-        :param kwargs: Arbitrary keyword arguments to override default configuration.
-            Includes settings for `date_field`, `parquet_storage_path`,
-            `parquet_filename`, `parquet_start_date`, `parquet_end_date`,
-            `filesystem_type`, `filesystem_options`, and `fs`.
-        :type kwargs: dict
-
-        :raises ValueError: If any of the required configuration options
-            (`date_field`, `parquet_storage_path`,
-            `parquet_filename`, `parquet_start_date`,
-            or `parquet_end_date`) are missing or not set properly.
-        """
+    def __init__(self, **kwargs: Any):
+        # Merge defaults from ManagedResource and caller kwargs
+        self.all_kwargs: Dict[str, Any] = {**kwargs}
+        super().__init__(**self.all_kwargs)

-        """Initialize with config, validate required fields, and setup filesystem."""
-        self._lock = threading.Lock()
-        self.config = {
-            **self.DEFAULT_CONFIG,
-            **kwargs,
-        }
-        self.df: Optional[dd.DataFrame] = None
-        super().__init__(**self.config)
-        self.data_wrapper_class = data_wrapper_class
-
-        self.date_field = self._validate_required('date_field')
-        self.parquet_storage_path = self._validate_required('parquet_storage_path')
-        self.parquet_filename = self._validate_required('parquet_filename')
-        self.parquet_start_date = self._validate_required('parquet_start_date')
-        self.parquet_end_date = self._validate_required('parquet_end_date')
-
-        self.class_params = self.config.pop('class_params', {
-            'debug': self.debug,
-            'logger': self.logger,
-            'fs': self.fs,
-            'verbose': self.verbose,
-        })
-        # Populate parameters to pass to load method of DataWrapper class
-        self.load_params = self.config.setdefault('load_params', {})
-        # Ensure the directory exists
-        self.ensure_directory_exists(self.parquet_storage_path)
-        #super().__init__(**self.config)
-        self.update_planner_params = {}
-        self.datawrapper_params = {}
-
-    def _validate_required(self, key: str) -> Any:
-        """Validate required configuration fields."""
-        value = self.config.setdefault(key, None)
-        if value is None:
-            raise ValueError(f'{key} must be set')
-        return value
-
-    def _setup_manifest(self, overwrite: bool = False, ignore_missing: bool = False):
-        self.skipped = []
-        self.missing_manifest_path = f"{self.parquet_storage_path}_manifests/missing.parquet"
-        self.mmanifest = MissingManifestManager(
+        # Persist the minimal config we depend on frequently
+        self._lock = threading.RLock()
+
+        # Required knobs
+        self._storage_path: str = self.all_kwargs["parquet_storage_path"]
+        self._parquet_filename: str = self.all_kwargs["parquet_filename"]
+        self._data_wrapper_class = self.all_kwargs.get("data_wrapper_class")
+
+    # ---------- helpers ----------
+    def _invalidate_cached(self, *names: str) -> None:
+        for n in names:
+            self.__dict__.pop(n, None)
+
+    def _build_manifest_path(self) -> str:
+        base = f"{self._storage_path}".rstrip("/") + "/"
+        return f"{base}_manifests/missing.parquet"
+
+    # ---------- lazy members ----------
+    @cached_property
+    def mmanifest(self) -> MissingManifestManager:
+        self.logger.info("Initializing MissingManifestManager...")
+        manifest_path = self._build_manifest_path()
+
+        # ensure manifest directory exists
+        manifest_dir = manifest_path.rsplit("/", 1)[0] if "/" in manifest_path else manifest_path
+        self.ensure_directory_exists(manifest_dir)
+
+        mgr = MissingManifestManager(
             fs=self.fs,
-            manifest_path=self.missing_manifest_path,
-            clear_existing=overwrite,
-            debug= self.debug,
-            logger=self.logger
+            manifest_path=manifest_path,
+            clear_existing=self.all_kwargs.get("overwrite", False),
+            debug=self.debug,
+            logger=self.logger,
+            overwrite=self.all_kwargs.get("overwrite", False),
         )

-        # Initialize skipped files
-        manifest_exists = self.mmanifest._safe_exists(self.missing_manifest_path)
-        if not manifest_exists:
-            self.logger.info(f"Creating new manifest at {self.missing_manifest_path}")
-            self.mmanifest.save()
-            #self.mmanifest.cleanup_temp_manifests()
+        if not mgr._safe_exists(mgr.manifest_path):
+            self.logger.info(f"Creating new manifest at {mgr.manifest_path}")
+            mgr.save()
         else:
-            self.logger.info(f"Manifest already exists at {self.missing_manifest_path}")
+            self.logger.info(f"Manifest already exists at {mgr.manifest_path}")
+
+        return mgr
+
+    @cached_property
+    def update_planner(self) -> UpdatePlanner:
+        self.logger.info("Initializing UpdatePlanner...")
+        skipped_files = self.mmanifest.load_existing() or []
+
+        cfg = {
+            **self.all_kwargs,
+            "fs": self.fs,
+            "debug": self.debug,
+            "logger": self.logger,
+            "description": getattr(self._data_wrapper_class, "__name__", "DataWrapper"),
+            "skipped": list(skipped_files),
+            "mmanifest": self.mmanifest,
+        }
+        return UpdatePlanner(**cfg)

-        # Load skipped files if manifest exists and ignore_missing is True
-        self.skipped = self.mmanifest.load_existing() # if ignore_missing and manifest_exists else []
-        self.logger.info(f"Skipped: {self.skipped}")
-        if overwrite:
-            self.skipped = []
-            self.ignore_missing = False
+    @cached_property
+    def data_wrapper(self) -> DataWrapper:
+        self.logger.info("Initializing DataWrapper...")

-    def _setup_update_planner(self, **kwargs) -> None:
-        self._prepare_update_params(**kwargs)
-        self.update_planner = UpdatePlanner(**self.update_planner_params)
-        self.update_planner.generate_plan(start=self.start_date, end=self.end_date)
+        # Ensure the planner has a plan
+        if getattr(self.update_planner, "plan", None) is None:
+            self.update_planner.generate_plan()

-    def load(self, **kwargs):
-        with self._lock:
-            self.df = super().load(**kwargs)
-            return self.df
+        class_params = {
+            "debug": self.debug,
+            "logger": self.logger,
+            "fs": self.fs,
+            "verbose": self.verbose,
+        }

-    def generate_parquet(self, **kwargs) -> None:
-        """
-        Generate a Parquet file using the configured DataWrapper class.
-        """
-        with self._lock:
-            overwrite = kwargs.get('overwrite', False)
-            ignore_missing = kwargs.get('ignore_missing', False)
-            self._setup_manifest(overwrite, ignore_missing)
-            self._setup_update_planner(**kwargs)
-            params = self.datawrapper_params.copy()
-            params.update({
-                'mmanifest': self.mmanifest,
-                'update_planner': self.update_planner
-            })
-
-            with DataWrapper(self.data_wrapper_class, **params) as dw:
-                dw.process()
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        try:
-            if self.mmanifest and self.mmanifest._new_records:
-                self.mmanifest.save()
-        except Exception as e:
-            self.logger.warning(f"Error closing filesystem: {e}")
-        finally:
-            super().__exit__(exc_type, exc_value, traceback)
-            # return False so exceptions aren’t suppressed
-            return False
+        cfg = {
+            "data_path": self._storage_path,
+            "parquet_filename": self._parquet_filename,
+            "fs": self.fs,
+            "debug": self.debug,
+            "logger": self.logger,
+            "verbose": self.verbose,
+            "dataclass": self._data_wrapper_class,
+            "class_params": class_params,
+            "load_params": self.all_kwargs.get("load_params", {}) or {},
+            "mmanifest": self.mmanifest,
+            "update_planner": self.update_planner,
+            "date_field": self.all_kwargs.get("date_field"),
+            # pipeline execution knobs
+            "show_progress": bool(self.all_kwargs.get("show_progress", False)),
+            "timeout": float(self.all_kwargs.get("timeout", 30.0)),
+            "max_threads": int(self.all_kwargs.get("max_threads", 3)),
+        }
+        return DataWrapper(**cfg)

-    def get_size_estimate(self, **kwargs) -> int:
+    # ---------- public API ----------
+    def load(self, **kwargs: Any):
         """
-        Synchronously estimates artifact size for use in multi-threaded environments.
-
-        This method safely executes asynchronous I/O operations from a synchronous
-        context, handling variations in fsspec filesystem implementations.
+        Direct load using the configured data_wrapper_class (no planner/manifest round-trip).
+        Expected to return a Dask DataFrame from the loader.
         """
+        self.logger.info(f"Loading data from {self._storage_path}")
+
+        if not self._data_wrapper_class:
+            raise ValueError("data_wrapper_class is not configured.")
+
+        params = {
+            "backend": "parquet",
+            "fs": self.fs,
+            "logger": self.logger,
+            "debug": self.debug,
+            "parquet_storage_path": self._storage_path,
+            "parquet_filename": self._parquet_filename,
+            "parquet_start_date": self.all_kwargs.get("parquet_start_date"),
+            "parquet_end_date": self.all_kwargs.get("parquet_end_date"),
+            **(self.all_kwargs.get("class_params") or {}),
+        }

-        async def _get_total_bytes_async():
-            """A helper async coroutine to perform the I/O."""
-            import asyncio
-
-            files = await self.fs._glob(f"{self.parquet_storage_path}/*.parquet")
-            if not files:
-                return 0
+        cls = self._data_wrapper_class
+        with cls(**params) as instance:
+            return instance.load(**kwargs)

-            size_tasks = [self.fs._size(f) for f in files]
-            sizes = await asyncio.gather(*size_tasks)
-            return sum(s for s in sizes if s is not None)
+    def generate_parquet(self, **kwargs: Any) -> None:
+        """
+        Generate or update Parquet according to the plan.
+        - Merges runtime kwargs
+        - Invalidates dependent caches
+        - Guards against duplicate concurrent runs
+        - Forwards retry/backoff to DataWrapper.process()
+        """
+        # Merge and invalidate caches that depend on runtime changes
+        self.all_kwargs.update(kwargs)
+        self._invalidate_cached("update_planner", "data_wrapper")
+        if "overwrite" in kwargs:
+            self._invalidate_cached("mmanifest")
+
+        # Global de-dupe guard
+        key = (self._storage_path, self._parquet_filename)
+        with ParquetArtifact._global_lock:
+            if key in ParquetArtifact._active_runs:
+                self.logger.info(
+                    f"Run already in progress for {key}; skipping this invocation."
+                )
+                return
+            ParquetArtifact._active_runs.add(key)

         try:
-            # Attempt the standard fsspec method first
-            total_bytes = self.fs.sync(_get_total_bytes_async())
-        except AttributeError:
-            # fallback for filesystems like s3fs that lack .sync()
-            total_bytes = self.fs.loop.run_until_complete(_get_total_bytes_async())
+            self.ensure_directory_exists(self._storage_path)
+
+            self.update_planner.generate_plan()
+            plan = getattr(self.update_planner, "plan", None)
+            if plan is None or (hasattr(plan, "empty") and plan.empty):
+                # Planning uses Pandas; this is safe to check.
+                self.logger.info("No updates needed. Skipping Parquet generation.")
+                return
+
+            # Print plan once per run
+            if (
+                getattr(self.update_planner, "show_progress", False)
+                and not getattr(self.update_planner, "_printed_this_run", False)
+            ):
+                self.update_planner.show_update_plan()
+                setattr(self.update_planner, "_printed_this_run", True)
+
+            # ---- forward retry/backoff knobs to DataWrapper.process() ----
+            dw_retry_kwargs = {
+                k: self.all_kwargs[k]
+                for k in ("max_retries", "backoff_base", "backoff_jitter", "backoff_max")
+                if k in self.all_kwargs
+            }

-        # Convert to megabytes, ensuring a minimum of 1
-        return max(1, int(total_bytes / (1024 ** 2)))
+            with self._lock:
+                dw = self.data_wrapper # single cached_property access
+                if hasattr(dw, "process"):
+                    dw.process(**dw_retry_kwargs)
+                if getattr(self.update_planner, "show_progress", False) and hasattr(
+                    dw, "show_benchmark_summary"
+                ):
+                    dw.show_benchmark_summary()

-    def update_parquet(self, period: str = 'today', **kwargs) -> None:
-        """Update the Parquet file with data from a specific period."""
+        finally:
+            with ParquetArtifact._global_lock:
+                ParquetArtifact._active_runs.discard(key)
+
+    def update_parquet(self, period: str = "today", **kwargs: Any) -> None:
+        """
+        High-level entry point to update Parquet for a given period:
+          - 'today', 'yesterday', 'last_7_days', etc. via DateUtils.parse_period
+          - 'ytd'
+          - 'itd' (requires history_begins_on)
+          - 'custom' (requires start_on / end_on)
+        Also accepts retry/backoff knobs which flow to DataWrapper.process().
+        """
+        final_kwargs = {**self.all_kwargs, **kwargs}

         def itd_config():
-            try:
-                start_date = kwargs.pop('history_begins_on')
-            except KeyError:
-                raise ValueError("For period 'itd', you must provide 'history_begins_on' in kwargs.")
-            return {'parquet_start_date': start_date, 'parquet_end_date': datetime.date.today().strftime('%Y-%m-%d')}
+            start_date = final_kwargs.get("history_begins_on")
+            if not start_date:
+                raise ValueError(
+                    "For period 'itd', 'history_begins_on' must be configured."
+                )
+            return {
+                "parquet_start_date": start_date,
+                "parquet_end_date": dt.date.today(),
+            }

         def ytd_config():
             return {
-                'parquet_start_date': datetime.date(datetime.date.today().year, 1, 1).strftime('%Y-%m-%d'),
-                'parquet_end_date': datetime.date.today().strftime('%Y-%m-%d')
+                "parquet_start_date": dt.date(dt.date.today().year, 1, 1),
+                "parquet_end_date": dt.date.today(),
             }

         def custom_config():
-            try:
-                start_date = kwargs.pop('start_on')
-                end_date = kwargs.pop('end_on')
-            except KeyError:
-                raise ValueError("For period 'custom', you must provide 'start_on' in kwargs.")
+            if "start_on" not in final_kwargs or "end_on" not in final_kwargs:
+                raise ValueError(
+                    "For period 'custom', provide 'start_on' and 'end_on'."
+                )
             return {
-                'parquet_start_date': start_date,
-                'parquet_end_date': end_date
+                "parquet_start_date": final_kwargs["start_on"],
+                "parquet_end_date": final_kwargs["end_on"],
             }

-        config_map = {
-            'itd': itd_config,
-            'ytd': ytd_config,
-            'custom': custom_config,
-        }
-
-        if period in config_map:
-            kwargs.update(config_map[period]())
+        if period == "itd":
+            period_params = itd_config()
+        elif period == "ytd":
+            period_params = ytd_config()
+        elif period == "custom":
+            period_params = custom_config()
         else:
-            kwargs.update(self.parse_parquet_period(period=period))
-        self.logger.debug(f"kwargs passed to update parquet: {kwargs}")
-        self.generate_parquet(**kwargs)
-
-    def rebuild_parquet(self, **kwargs) -> None:
-        """Rebuild the Parquet file from the start to end date."""
-        kwargs.update(self._get_rebuild_params(kwargs))
-        self.generate_parquet(**kwargs)
-
-    def _get_rebuild_params(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
-        """Prepare parameters for rebuilding the Parquet file."""
-        return {
-            'overwrite': True,
-            'reverse_order': True,
-            'start_date': kwargs.get('parquet_start_date', self.parquet_start_date),
-            'end_date': kwargs.get('parquet_end_date', self.parquet_end_date),
-        }
+            start_date, end_date = DateUtils.parse_period(period=period)
+            period_params = {
+                "parquet_start_date": start_date,
+                "parquet_end_date": end_date,
+            }

-    def _prepare_update_params(self, **kwargs) -> Dict[str, Any]:
-        self.reverse_order = kwargs.pop('reverse_order', True)
-        self.overwrite = kwargs.pop('overwrite', False)
-        self.ignore_missing = kwargs.pop('ignore_missing', False)
-        self.history_days_threshold = kwargs.pop('history_days_threshold', 30)
-        self.max_age_minutes = kwargs.pop('max_age_minutes', 10)
-        self.show_progress = kwargs.pop('show_progress', False)
-        self.start_date = kwargs.pop('parquet_start_date', self.parquet_start_date)
-        self.end_date = kwargs.pop('parquet_end_date', self.parquet_end_date)
-        self.parquet_filename = kwargs.pop('parquet_filename', self.parquet_filename)
-        self.verbose = kwargs.pop('verbose', False)
-
-        self.update_planner_params.update({
-            'filename': self.parquet_filename,
-            'data_path': self.parquet_storage_path,
-            'fs': self.fs,
-            'debug': self.debug,
-            'logger': self.logger,
-            'reverse_order': self.reverse_order,
-            'overwrite': self.overwrite,
-            'ignore_missing': self.ignore_missing,
-            'history_days_threshold': self.history_days_threshold,
-            'max_age_minutes': self.max_age_minutes,
-            'show_progress': self.show_progress,
-            'description': f"{self.data_wrapper_class.__name__}",
-            'skipped': self.skipped,
-            'verbose': self.verbose,
-        })
-
-        self.datawrapper_params = {
-            'parquet_filename': self.parquet_filename,
-            'data_path': self.parquet_storage_path,
-            'fs': self.fs,
-            'debug': self.debug,
-            'logger': self.logger,
-            'class_params': self.class_params,
-            'date_field': self.date_field,
-            'load_params': self.load_params,
-            'verbose': self.verbose
-        }
+        final_kwargs.update(period_params)
+        self.logger.debug(
+            f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}"
+        )

-    def parse_parquet_period(self, **kwargs):
-        start_date, end_date = DateUtils.parse_period(**kwargs)
-        self.parquet_start_date = start_date.strftime('%Y-%m-%d')
-        self.parquet_end_date = end_date.strftime('%Y-%m-%d')
-        return {
-            'parquet_start_date': self.parquet_start_date,
-            'parquet_end_date': self.parquet_end_date,
-        }
+        # Delegate to generator (handles cache invalidation + forwarding knobs)
+        self.generate_parquet(**final_kwargs)

+    # ---------- utils ----------
     def ensure_directory_exists(self, path: str) -> None:
-        """Ensure the directory exists in the specified filesystem."""
+        """Ensure the directory exists across fsspec backends."""
         with self._lock:
-            try:
-                self.fs.makedirs(path, exist_ok=True)
-            except Exception as e:
-                raise ValueError(f"Error creating directory {path} in filesystem {self.filesystem_type}: {e}")
+            if not self.fs.exists(path):
+                self.logger.info(f"Creating directory: {path}")
+                try:
+                    self.fs.makedirs(path, exist_ok=True)
+                except TypeError:
+                    try:
+                        self.fs.makedirs(path)
+                    except FileExistsError:
+                        pass
+
+    def _cleanup(self):
+        """Clean up resources upon exit."""
+        try:
+            if "mmanifest" in self.__dict__ and getattr(
+                self.mmanifest, "_new_records", None
+            ):
+                if self.mmanifest._new_records:
+                    self.mmanifest.save()
+            if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
+                self.data_wrapper.close()
+        except Exception as e:
+            self.logger.warning(f"Error during resource cleanup: {e}")
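The hunk above rewrites ParquetArtifact from a DfHelper subclass into a ManagedResource-based orchestrator with lazily cached manifest, planner, and wrapper members. A minimal usage sketch follows; only the keyword names are taken from the new __init__, update_parquet, and generate_parquet code, while the import path, the keywords accepted by ManagedResource (fs, debug), the local paths, and MyDatasetHelper (a stand-in for a project-specific loader class) are assumptions.

    # Hedged sketch, not sibi-dst documentation: MyDatasetHelper and the paths are hypothetical.
    import fsspec
    from sibi_dst.df_helper import ParquetArtifact  # import path assumed

    artifact = ParquetArtifact(
        data_wrapper_class=MyDatasetHelper,        # hypothetical loader class
        parquet_storage_path="/data/orders/",      # hypothetical storage location
        parquet_filename="orders.parquet",
        date_field="order_date",
        parquet_start_date="2025-01-01",
        parquet_end_date="2025-08-01",
        fs=fsspec.filesystem("file"),              # assumed to be forwarded to ManagedResource
        show_progress=True,
        max_threads=3,
        # retry/backoff knobs are forwarded to DataWrapper.process()
        max_retries=3,
        backoff_base=2.0,
    )

    artifact.update_parquet(period="ytd")   # plan and write year-to-date partitions
    ddf = artifact.load()                   # read the stored dataset back as a Dask DataFrame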
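The refactor also leans on functools.cached_property for mmanifest, update_planner, and data_wrapper, with _invalidate_cached popping names out of the instance __dict__ so the next access rebuilds them. A self-contained illustration of that mechanism (the class and attribute names here are invented for the demo):

    from functools import cached_property

    class Example:
        def __init__(self) -> None:
            self.builds = 0

        @cached_property
        def planner(self) -> int:
            # stands in for constructing UpdatePlanner/DataWrapper
            self.builds += 1
            return self.builds

        def invalidate(self, *names: str) -> None:
            for name in names:
                self.__dict__.pop(name, None)  # same trick as _invalidate_cached()

    obj = Example()
    assert obj.planner == 1 and obj.planner == 1  # built once, then served from __dict__
    obj.invalidate("planner")
    assert obj.planner == 2                       # rebuilt after invalidation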
sibi_dst/df_helper/backends/parquet/_parquet_options.py
@@ -202,16 +202,20 @@ class ParquetConfig(BaseModel):
 
     try:
         self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
-        return dd.read_parquet(
+        dd_result=dd.read_parquet(
             paths_to_load,
             engine="pyarrow",
             filesystem=self.fs,
             exclude=["_*", ".*"]
         )
+        return dd_result
+    except FileNotFoundError as e:
+        self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
+        self.logger.debug("Returning empty DataFrame due to missing parquet files.")
+        return dd.from_pandas(pd.DataFrame(), npartitions=1)
     except Exception as e:
-        # This robust error handling is excellent.
-        self.logger.error(f"Parquet loading failed for paths {paths_to_load}: {e}", exc_info=True)
-        self.logger.warning("Returning empty DataFrame due to loading error.")
+        self.logger.debug(f"Parquet loading failed for paths {paths_to_load}: {e}")
+        self.logger.debug("Returning empty DataFrame due to loading error.")
         return dd.from_pandas(pd.DataFrame(), npartitions=1)
 
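The second hunk changes ParquetConfig's Parquet loading to catch FileNotFoundError and quietly hand back an empty Dask DataFrame, and to demote the remaining failure logging from error/warning to debug. A standalone sketch of the same degrade-to-empty pattern, independent of sibi-dst (read_parquet_or_empty is an invented helper, and which exception a given fsspec backend raises for a missing path is an assumption):

    import dask.dataframe as dd
    import pandas as pd

    def read_parquet_or_empty(path: str) -> dd.DataFrame:
        """Read Parquet with Dask; degrade to an empty DataFrame when nothing is readable."""
        try:
            return dd.read_parquet(path, engine="pyarrow")
        except FileNotFoundError:
            # Missing partitions are treated as "no data yet", mirroring the hunk above.
            return dd.from_pandas(pd.DataFrame(), npartitions=1)
        except Exception:
            # Any other read failure also degrades to an empty frame instead of raising.
            return dd.from_pandas(pd.DataFrame(), npartitions=1)

    ddf = read_parquet_or_empty("/tmp/does-not-exist.parquet")
    print(len(ddf.columns))  # 0 -> an empty frame came back instead of an exception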