sibi-dst 2025.8.5__py3-none-any.whl → 2025.8.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +40 -6
- sibi_dst/utils/async_utils.py +12 -0
- sibi_dst/utils/clickhouse_writer.py +4 -241
- sibi_dst/utils/storage_config.py +2 -2
- sibi_dst/utils/storage_hive.py +195 -0
- sibi_dst/utils/storage_manager.py +3 -2
- {sibi_dst-2025.8.5.dist-info → sibi_dst-2025.8.7.dist-info}/METADATA +1 -1
- {sibi_dst-2025.8.5.dist-info → sibi_dst-2025.8.7.dist-info}/RECORD +9 -7
- {sibi_dst-2025.8.5.dist-info → sibi_dst-2025.8.7.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 from typing import Any, Dict, Optional, TypeVar, Union
 
 import dask.dataframe as dd
@@ -104,7 +105,6 @@ class HttpBackend(BaseBackend):
         return self.total_records, result
 
 
-# ---- Main DfHelper ----
 class DfHelper(ManagedResource):
     _BACKEND_STRATEGIES = {
         "sqlalchemy": SqlAlchemyBackend,
@@ -198,6 +198,37 @@ class DfHelper(ManagedResource):
         df = df.persist() if persist else df
         return df.compute() if as_pandas else df
 
+    async def load_async(
+        self,
+        *,
+        persist: bool = False,
+        as_pandas: bool = False,
+        prefer_native: bool = False,
+        **options,
+    ):
+        """
+        Async load that prefers native async backends when available,
+        otherwise runs the sync `load()` in a worker thread via asyncio.to_thread.
+
+        Args:
+            persist: same as `load`
+            as_pandas: same as `load`
+            prefer_native: if True and the backend overrides `aload`, use it.
+                otherwise force thread offload of `load()`.
+            **options: forwarded to `load` / `aload`
+        """
+        # If the backend provided an override for `aload`, use it
+        if prefer_native and type(self.backend_strategy).aload is not BaseBackend.aload:
+            return await self.aload(persist=persist, as_pandas=as_pandas, **options)
+
+        # Fall back to offloading the sync path to a thread
+        return await asyncio.to_thread(
+            self.load,
+            persist=persist,
+            as_pandas=as_pandas,
+            **options,
+        )
+
     # ---------- dataframe post-processing ----------
     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
         self.logger.debug("Post-processing DataFrame.")
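A minimal consumption sketch for the new coroutine; constructing a `DfHelper` needs backend configuration that is outside this diff, so the instance is taken as given here:

```python
# Hedged usage sketch: callers await load_async() instead of blocking the event loop.
import asyncio

async def fetch(helper) -> None:
    # Default path: load() is offloaded to a worker thread via asyncio.to_thread.
    ddf = await helper.load_async(persist=False, as_pandas=False)
    # Native path: only taken when the backend actually overrides BaseBackend.aload.
    pdf = await helper.load_async(as_pandas=True, prefer_native=True)
    print(type(ddf), type(pdf))

# asyncio.run(fetch(my_helper))  # my_helper: a configured DfHelper instance (hypothetical)
```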
@@ -240,9 +271,12 @@ class DfHelper(ManagedResource):
         return df
 
     # ---------- sinks ----------
-    def save_to_parquet(self, df: dd.DataFrame,
-        fs: AbstractFileSystem = kwargs.
-        path: str = kwargs.
+    def save_to_parquet(self, df: dd.DataFrame, **kwargs):
+        fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
+        path: str = kwargs.pop("parquet_storage_path", self.backend_parquet.parquet_storage_path if self.backend_parquet else None)
+        parquet_filename = kwargs.pop("parquet_filename" or self._backend_params.parquet_filename if self.backend_parquet else None)
+        if not parquet_filename:
+            raise ValueError("A 'parquet_filename' keyword argument must be provided.")
         if not fs:
             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
         if not path:
@@ -268,11 +302,11 @@ class DfHelper(ManagedResource):
         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
             return
-        with ClickHouseWriter(debug=self.debug, logger=self.logger,
+        with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
             writer.save_to_clickhouse(df)
         self.logger.debug("Save to ClickHouse completed.")
 
-    # ----------
+    # ---------- period loaders ----------
     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
         return self.load(**final_kwargs)
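A hedged sketch tying the period loader to the ClickHouse sink above; the `dt_field` value and the credential keyword names are illustrative (the defaults are borrowed from the legacy writer that this release deletes), not confirmed by this diff:

```python
# Sketch only: load a date window, then push it through the context-managed writer.
def sync_week_to_clickhouse(helper):
    weekly = helper.load_period(dt_field="created_at", start="2025-08-01", end="2025-08-07")
    # Credential kwargs (host/port/database/...) are assumed to be forwarded to ClickHouseWriter.
    helper.save_to_clickhouse(weekly, host="localhost", port=8123, database="sibi_data")
```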
sibi_dst/utils/async_utils.py
ADDED
@@ -0,0 +1,12 @@
+import asyncio
+import dask.dataframe as dd
+
+
+def is_dask_dataframe(df):
+    """Check if the given object is a Dask DataFrame."""
+    return isinstance(df, dd.DataFrame)
+
+async def to_thread(func, *args, **kwargs):
+    """Explicit helper to keep code clear where we hop off the event loop."""
+    return await asyncio.to_thread(func, *args, **kwargs)
+
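A minimal sketch of the two new helpers in use; `blocking_load` is a hypothetical stand-in for any synchronous Dask producer:

```python
import asyncio
import pandas as pd
import dask.dataframe as dd

from sibi_dst.utils.async_utils import is_dask_dataframe, to_thread

def blocking_load() -> dd.DataFrame:
    # Stand-in for slow, blocking I/O that should not run on the event loop.
    return dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=1)

async def main() -> None:
    df = await to_thread(blocking_load)  # hop off the event loop for blocking work
    assert is_dask_dataframe(df)         # True for dask.dataframe.DataFrame objects
    print(df.compute())

asyncio.run(main())
```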
sibi_dst/utils/clickhouse_writer.py
CHANGED
@@ -91,7 +91,7 @@ class ClickHouseWriter(ManagedResource):
             return
 
         # lazily fill missing values per-partition (no global compute)
-        df = df.map_partitions(self._fill_missing_partition, meta=df)
+        df = df.map_partitions(type(self)._fill_missing_partition, meta=df._meta)
 
         # (re)create table
         ow = self.overwrite if overwrite is None else bool(overwrite)
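Why this one-line change helps (an interpretation, shown with illustrative class names rather than the package's own): accessing the filler through `type(self)` hands `map_partitions` a plain function, so the writer instance and its thread-local client never get pickled into the task graph, and `meta=df._meta` supplies the zero-row prototype frame instead of the whole collection:

```python
# Generic sketch of the same pattern; Writer here is illustrative, not sibi_dst code.
import dask.dataframe as dd
import pandas as pd

class Writer:
    @staticmethod
    def _fill_missing_partition(pdf: pd.DataFrame) -> pd.DataFrame:
        return pdf.fillna(0)

    def prepare(self, df: dd.DataFrame) -> dd.DataFrame:
        # type(self)._fill_missing_partition resolves to the bare staticmethod function,
        # so no `self` is captured; meta=df._meta lets Dask infer the output schema lazily.
        return df.map_partitions(type(self)._fill_missing_partition, meta=df._meta)

ddf = dd.from_pandas(pd.DataFrame({"a": [1.0, None, 3.0]}), npartitions=2)
print(Writer().prepare(ddf).compute())
```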
@@ -201,23 +201,21 @@ class ClickHouseWriter(ManagedResource):
 
     # ------------- missing values (lazy) -------------
 
-
-
+    @staticmethod
+    def _fill_missing_partition(pdf: pd.DataFrame) -> pd.DataFrame:
+        # (unchanged body)
         for col in pdf.columns:
             s = pdf[col]
             if pd.api.types.is_integer_dtype(s.dtype):
-                # pandas nullable IntX supports NA → fill where needed
                 if pd.api.types.is_extension_array_dtype(s.dtype):
                     pdf[col] = s.fillna(pd.NA)
                 else:
                     pdf[col] = s.fillna(0)
             elif pd.api.types.is_bool_dtype(s.dtype):
-                # boolean pandas extension supports NA, ClickHouse uses UInt8; keep NA → Nullable
                 pdf[col] = s.fillna(pd.NA)
             elif pd.api.types.is_float_dtype(s.dtype):
                 pdf[col] = s.fillna(0.0)
             elif pd.api.types.is_datetime64_any_dtype(s.dtype):
-                # keep NaT; ClickHouse Nullable(DateTime) will take NULL
                 pass
             else:
                 pdf[col] = s.fillna("")
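The retained fill rules, demonstrated on a toy pandas frame (illustrative only, not package code):

```python
import pandas as pd

pdf = pd.DataFrame({
    "qty":   pd.Series([1, None, 3], dtype="Int64"),           # extension int -> fillna(pd.NA), stays nullable
    "price": pd.Series([9.5, None, 7.0]),                       # float64       -> fillna(0.0)
    "flag":  pd.Series([True, None, False], dtype="boolean"),   # boolean ext.  -> fillna(pd.NA)
    "ts":    pd.to_datetime(pd.Series(["2025-08-01", None, "2025-08-03"])),  # NaT kept -> Nullable(DateTime)
    "name":  pd.Series(["a", None, "c"]),                       # object        -> fillna("")
})
pdf["qty"] = pdf["qty"].fillna(pd.NA)     # effectively a no-op: NA survives
pdf["price"] = pdf["price"].fillna(0.0)
pdf["flag"] = pdf["flag"].fillna(pd.NA)   # effectively a no-op: NA survives
pdf["name"] = pdf["name"].fillna("")
print(pdf.dtypes)
print(pdf)
```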
@@ -264,238 +262,3 @@ class ClickHouseWriter(ManagedResource):
         if hasattr(self._tlocal, "client"):
             delattr(self._tlocal, "client")
 
-# from concurrent.futures import ThreadPoolExecutor
-# from typing import ClassVar, Dict
-#
-# import clickhouse_connect
-# import pandas as pd
-# from clickhouse_driver import Client
-# import dask.dataframe as dd
-#
-# from . import ManagedResource
-#
-#
-# class ClickHouseWriter(ManagedResource):
-#     """
-#     Provides functionality to write a Dask DataFrame to a ClickHouse database using
-#     a specified schema. This class handles the creation of tables, schema generation,
-#     data transformation, and data insertion. It ensures compatibility between Dask
-#     data types and ClickHouse types.
-#
-#     :ivar clickhouse_host: Host address of the ClickHouse database.
-#     :type clickhouse_host: str
-#     :ivar clickhouse_port: Port of the ClickHouse database.
-#     :type clickhouse_port: int
-#     :ivar clickhouse_dbname: Name of the database to connect to in ClickHouse.
-#     :type clickhouse_dbname: str
-#     :ivar clickhouse_user: Username for database authentication.
-#     :type clickhouse_user: str
-#     :ivar clickhouse_password: Password for database authentication.
-#     :type clickhouse_password: str
-#     :ivar clickhouse_table: Name of the table to store the data in.
-#     :type clickhouse_table: str
-#     :ivar logger: Logger instance for logging messages.
-#     :type logger: logging.Logger
-#     :ivar client: Instance of the ClickHouse database client.
-#     :type client: clickhouse_connect.Client or None
-#     :ivar df: Dask DataFrame to be written into ClickHouse.
-#     :type df: dask.dataframe.DataFrame
-#     :ivar order_by: Field or column name to use for table ordering.
-#     :type order_by: str
-#     """
-#     dtype_to_clickhouse: ClassVar[Dict[str, str]] = {
-#         'int64': 'Int64',
-#         'int32': 'Int32',
-#         'float64': 'Float64',
-#         'float32': 'Float32',
-#         'bool': 'UInt8',
-#         'datetime64[ns]': 'DateTime',
-#         'object': 'String',
-#         'category': 'String',
-#     }
-#     df: dd.DataFrame
-#
-#     def __init__(self, **kwargs):
-#         super().__init__(**kwargs)
-#         self.clickhouse_host = kwargs.setdefault('host', "localhost")
-#         self.clickhouse_port = kwargs.setdefault('port', 8123)
-#         self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
-#         self.clickhouse_user = kwargs.setdefault('user', 'default')
-#         self.clickhouse_password = kwargs.setdefault('password', '')
-#         self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')
-#
-#         #self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-#         self.client = None
-#         self.order_by = kwargs.setdefault('order_by', 'id')
-#
-#     def save_to_clickhouse(self, df, **kwargs):
-#         self.df = df.copy()
-#         self.order_by = kwargs.setdefault('order_by', self.order_by)
-#         if len(self.df.head().index) == 0:
-#             self.logger.debug("Dataframe is empty")
-#             return
-#         self._handle_missing_values()
-#         self._connect()
-#         self._drop_table()
-#         self._create_table_from_dask()
-#         self._write_data()
-#
-#     def _connect(self):
-#         try:
-#             self.client = clickhouse_connect.get_client(
-#                 host=self.clickhouse_host,
-#                 port=self.clickhouse_port,
-#                 database=self.clickhouse_dbname,
-#                 user=self.clickhouse_user,
-#                 password=self.clickhouse_password
-#             )
-#             self.logger.debug("Connected to ClickHouse")
-#         except Exception as e:
-#             self.logger.error(e)
-#             raise
-#
-#     @staticmethod
-#     def _generate_clickhouse_schema(dask_dtypes, dtype_map):
-#         schema = []
-#         for col, dtype in dask_dtypes.items():
-#             # Handle pandas nullable types explicitly
-#             if isinstance(dtype, pd.Int64Dtype):  # pandas nullable Int64
-#                 clickhouse_type = 'Int64'
-#             elif isinstance(dtype, pd.Float64Dtype):  # pandas nullable Float64
-#                 clickhouse_type = 'Float64'
-#             elif isinstance(dtype, pd.BooleanDtype):  # pandas nullable Boolean
-#                 clickhouse_type = 'UInt8'
-#             elif isinstance(dtype, pd.DatetimeTZDtype) or 'datetime' in str(dtype):  # Nullable datetime
-#                 clickhouse_type = 'Nullable(DateTime)'
-#             elif isinstance(dtype, pd.StringDtype):  # pandas nullable String
-#                 clickhouse_type = 'String'
-#             else:
-#                 # Default mapping using the provided dtype_map
-#                 clickhouse_type = dtype_map.get(str(dtype), 'String')
-#             schema.append(f"`{col}` {clickhouse_type}")
-#         return ', '.join(schema)
-#
-#     def _drop_table(self):
-#         if self.client:
-#             self.client.command('DROP TABLE IF EXISTS {}'.format(self.clickhouse_table))
-#             self.logger.debug(f"Dropped table {self.clickhouse_table}")
-#
-#     def _create_table_from_dask(self, engine=None):
-#         if engine is None:
-#             engine = f"ENGINE = MergeTree() order by {self.order_by}"
-#         dtypes = self.df.dtypes
-#         clickhouse_schema = self._generate_clickhouse_schema(dtypes, self.dtype_to_clickhouse)
-#         create_table_sql = f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
-#         self.logger.debug(f"Creating table SQL:{create_table_sql}")
-#         if self.client:
-#             self.client.command(create_table_sql)
-#             self.logger.debug("Created table '{}'".format(self.clickhouse_table))
-#
-#     def _handle_missing_values(self):
-#         """
-#         Handle missing values in the Dask DataFrame before writing to ClickHouse.
-#         """
-#         self.logger.debug("Checking for missing values...")
-#         missing_counts = self.df.isnull().sum().compute()
-#         self.logger.debug(f"Missing values per column:\n{missing_counts}")
-#
-#         # Replace missing values based on column types
-#         def replace_missing_values(df):
-#             for col in df.columns:
-#                 if pd.api.types.is_integer_dtype(df[col]):
-#                     df[col] = df[col].fillna(0)  # Replace NA with 0 for integers
-#                 elif pd.api.types.is_float_dtype(df[col]):
-#                     df[col] = df[col].fillna(0.0)  # Replace NA with 0.0 for floats
-#                 elif pd.api.types.is_bool_dtype(df[col]):
-#                     df[col] = df[col].fillna(False)  # Replace NA with False for booleans
-#                 else:
-#                     df[col] = df[col].fillna('')  # Replace NA with empty string for other types
-#             return df
-#
-#         # Apply replacement
-#         self.df = replace_missing_values(self.df)
-#         self.logger.debug("Missing values replaced.")
-#
-#     def _write_data(self):
-#         """
-#         Writes the Dask DataFrame to a ClickHouse table partition by partition.
-#         """
-#         if len(self.df.index) == 0:
-#             self.logger.debug("No data found. Nothing written.")
-#             return
-#
-#         for i, partition in enumerate(self.df.to_delayed()):
-#             try:
-#                 # Compute the current partition into a pandas DataFrame
-#                 df = partition.compute()
-#
-#                 if df.empty:
-#                     self.logger.debug(f"Partition {i} is empty. Skipping...")
-#                     continue
-#
-#                 self.logger.debug(f"Writing partition {i} with {len(df)} rows to ClickHouse.")
-#
-#                 # Write the partition to the ClickHouse table
-#                 self.client.insert_df(self.clickhouse_table, df)
-#             except Exception as e:
-#                 self.logger.error(f"Error writing partition {i}: {e}")
-#
-#     def _write_data_multi_not_working_yet(self):
-#         """
-#         Writes the Dask DataFrame to a ClickHouse table partition by partition.
-#         Ensures a separate client instance is used per thread to avoid session conflicts.
-#         """
-#         if len(self.df.index) == 0:
-#             self.logger.debug("No data found. Nothing written.")
-#             return
-#
-#         def create_client():
-#             client = Client(
-#                 host=self.clickhouse_host,
-#                 port=self.clickhouse_port,
-#                 database=self.clickhouse_dbname,
-#                 user=self.clickhouse_user,
-#                 password=self.clickhouse_password
-#             )
-#             """
-#             Create a new instance of the ClickHouse client for each thread.
-#             This avoids session conflicts during concurrent writes.
-#             """
-#             return client
-#
-#         def write_partition(partition, index):
-#             """
-#             Write a single partition to ClickHouse using a separate client instance.
-#             """
-#             try:
-#                 self.logger.debug(f"Starting to process partition {index}")
-#                 client = create_client()  # Create a new client for the thread
-#
-#                 # Compute the Dask partition into a Pandas DataFrame
-#                 df = partition.compute()
-#                 if df.empty:
-#                     self.logger.debug(f"Partition {index} is empty. Skipping...")
-#                     return
-#
-#                 # Convert DataFrame to list of tuples
-#                 data = [tuple(row) for row in df.to_numpy()]
-#                 columns = df.columns.tolist()
-#
-#                 # Perform the insert
-#                 self.logger.debug(f"Writing partition {index} with {len(df)} rows to ClickHouse.")
-#                 client.execute(f"INSERT INTO {self.clickhouse_table} ({', '.join(columns)}) VALUES", data)
-#
-#             except Exception as e:
-#                 self.logger.error(f"Error writing partition {index}: {e}")
-#             finally:
-#                 if 'client' in locals() and hasattr(client, 'close'):
-#                     client.close()
-#                     self.logger.debug(f"Closed client for partition {index}")
-#
-#         try:
-#             # Get delayed partitions and enumerate them
-#             partitions = self.df.to_delayed()
-#             with ThreadPoolExecutor() as executor:
-#                 executor.map(write_partition, partitions, range(len(partitions)))
-#         except Exception as e:
-#             self.logger.error(f"Error during multi-partition write: {e}")
sibi_dst/utils/storage_config.py
CHANGED
@@ -6,13 +6,13 @@ from .storage_manager import StorageManager
 from .credentials import ConfigManager
 
 class StorageConfig:
-    def __init__(self, config:ConfigManager, depots:dict=None):
+    def __init__(self, config:ConfigManager, depots:dict=None, clear_existing=False, write_mode="full-access"):
         self.conf = config
         self.depots = depots
         self._initialize_storage()
         self.storage_manager = StorageManager(self.base_storage, self.filesystem_type, self.filesystem_options)
         if self.depots is not None:
-            self.depot_paths, self.depot_names = self.storage_manager.rebuild_depot_paths(depots)
+            self.depot_paths, self.depot_names = self.storage_manager.rebuild_depot_paths(depots, clear_existing=clear_existing, write_mode=write_mode)
         else:
             self.depot_paths = None
             self.depot_names = None
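A hedged sketch of the widened constructor; how a `ConfigManager` is built and what the `depots` mapping contains are assumptions, not shown in this diff:

```python
from sibi_dst.utils.storage_config import StorageConfig

def build_storage(conf, depots):  # conf: a ConfigManager; depots: e.g. {"orders": [...]} (hypothetical)
    return StorageConfig(
        conf,
        depots=depots,
        clear_existing=False,
        write_mode="read-only",  # any value other than "full-access" now skips directory setup
    )
```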
sibi_dst/utils/storage_hive.py
ADDED
@@ -0,0 +1,195 @@
+from __future__ import annotations
+import pandas as pd
+import dask.dataframe as dd
+from typing import Iterable, Optional, List, Tuple, Union
+import fsspec
+
+DNFFilter = List[List[Tuple[str, str, Union[str, int]]]]
+
+
+class HiveDatePartitionedStore:
+    """
+    Dask-only Parquet store with Hive-style yyyy=…/mm=…/dd=… partitions.
+
+    - `write(...)` safely "overwrites" S3 prefixes via per-object deletes (no bulk DeleteObjects).
+    - `read_range(...)` builds DNF filters and auto-matches partition types (string vs int).
+    """
+
+    def __init__(
+        self,
+        path: str,
+        *,
+        filesystem=None,  # fsspec filesystem or None to infer from path
+        date_col: str = "tracking_dt",
+        compression: str = "zstd",
+        partition_values_as_strings: bool = True,  # keep mm=07, dd=01 folder names
+        logger=None,
+    ) -> None:
+        self.path = path
+        self.fs = filesystem or fsspec.open(path).fs
+        self.date_col = date_col
+        self.compression = compression
+        self.partition_values_as_strings = partition_values_as_strings
+        self.log = logger
+
+    # ----------------- public API -----------------
+
+    def write(
+        self,
+        df: dd.DataFrame,
+        *,
+        repartition: Optional[int] = None,
+        overwrite: bool = False,
+    ) -> None:
+        """Write Dask DataFrame to Hive-style yyyy/mm/dd partitions."""
+        self._require_col(df, self.date_col)
+        ser = dd.to_datetime(df[self.date_col], errors="coerce")
+
+        if self.partition_values_as_strings:
+            parts = {
+                "yyyy": ser.dt.strftime("%Y"),
+                "mm": ser.dt.strftime("%m"),
+                "dd": ser.dt.strftime("%d"),
+            }
+        else:
+            parts = {
+                "yyyy": ser.dt.year.astype("int32"),
+                "mm": ser.dt.month.astype("int8"),
+                "dd": ser.dt.day.astype("int8"),
+            }
+
+        df = df.assign(**{self.date_col: ser}, **parts)
+
+        if repartition:
+            df = df.repartition(npartitions=repartition)
+
+        if overwrite:
+            self._safe_rm_prefix(self.path)
+
+        if self.log:
+            self.log.info(f"Writing parquet to {self.path} (hive yyyy/mm/dd)…")
+
+        df.to_parquet(
+            self.path,
+            engine="pyarrow",
+            write_index=False,
+            filesystem=self.fs,
+            partition_on=["yyyy", "mm", "dd"],
+            compression=self.compression,
+            overwrite=False,  # we pre-cleaned if overwrite=True
+        )
+
+    def read_range(
+        self,
+        start: Union[str, pd.Timestamp],
+        end: Union[str, pd.Timestamp],
+        *,
+        columns: Optional[Iterable[str]] = None,
+    ) -> dd.DataFrame:
+        """
+        Read a date window with partition pruning. Tries string filters first,
+        falls back to integer filters if Arrow infers partition types as ints.
+        """
+        str_filters = self._dnf_filters_for_range_str(start, end)
+        try:
+            return dd.read_parquet(
+                self.path,
+                engine="pyarrow",
+                filesystem=self.fs,
+                columns=list(columns) if columns else None,
+                filters=str_filters,
+            )
+        except Exception:
+            int_filters = self._dnf_filters_for_range_int(start, end)
+            return dd.read_parquet(
+                self.path,
+                engine="pyarrow",
+                filesystem=self.fs,
+                columns=list(columns) if columns else None,
+                filters=int_filters,
+            )
+
+    # Convenience: full month / single day
+    def read_month(self, year: int, month: int, *, columns=None) -> dd.DataFrame:
+        start = pd.Timestamp(year=year, month=month, day=1)
+        end = (start + pd.offsets.MonthEnd(0))
+        return self.read_range(start, end, columns=columns)
+
+    def read_day(self, year: int, month: int, day: int, *, columns=None) -> dd.DataFrame:
+        ts = pd.Timestamp(year=year, month=month, day=day)
+        return self.read_range(ts, ts, columns=columns)
+
+    # ----------------- internals -----------------
+
+    @staticmethod
+    def _pad2(n: int) -> str:
+        return f"{n:02d}"
+
+    def _safe_rm_prefix(self, path: str) -> None:
+        """Per-object delete to avoid S3 bulk DeleteObjects (and Content-MD5 issues)."""
+        if not self.fs.exists(path):
+            return
+        if self.log:
+            self.log.info(f"Cleaning prefix (safe delete): {path}")
+        for k in self.fs.find(path):
+            try:
+                (self.fs.rm_file(k) if hasattr(self.fs, "rm_file") else self.fs.rm(k, recursive=False))
+            except Exception as e:
+                if self.log:
+                    self.log.warning(f"Could not delete {k}: {e}")
+
+    @staticmethod
+    def _require_col(df: dd.DataFrame, col: str) -> None:
+        if col not in df.columns:
+            raise KeyError(f"'{col}' not in DataFrame")
+
+    # ---- DNF builders (string vs int) ----
+    def _dnf_filters_for_range_str(self, start, end) -> DNFFilter:
+        s, e = pd.Timestamp(start), pd.Timestamp(end)
+        if s > e:
+            raise ValueError("start > end")
+        sY, sM, sD = s.year, s.month, s.day
+        eY, eM, eD = e.year, e.month, e.day
+        p2 = self._pad2
+        if sY == eY and sM == eM:
+            return [[("yyyy","==",str(sY)),("mm","==",p2(sM)),("dd",">=",p2(sD)),("dd","<=",p2(eD))]]
+        clauses: DNFFilter = [
+            [("yyyy","==",str(sY)),("mm","==",p2(sM)),("dd",">=",p2(sD))],
+            [("yyyy","==",str(eY)),("mm","==",p2(eM)),("dd","<=",p2(eD))]
+        ]
+        if sY == eY:
+            for m in range(sM+1, eM):
+                clauses.append([("yyyy","==",str(sY)),("mm","==",p2(m))])
+            return clauses
+        for m in range(sM+1, 13):
+            clauses.append([("yyyy","==",str(sY)),("mm","==",p2(m))])
+        for y in range(sY+1, eY):
+            clauses.append([("yyyy","==",str(y))])
+        for m in range(1, eM):
+            clauses.append([("yyyy","==",str(eY)),("mm","==",p2(m))])
+        return clauses
+
+    @staticmethod
+    def _dnf_filters_for_range_int(start, end) -> DNFFilter:
+        s, e = pd.Timestamp(start), pd.Timestamp(end)
+        if s > e:
+            raise ValueError("start > end")
+        sY, sM, sD = s.year, s.month, s.day
+        eY, eM, eD = e.year, e.month, e.day
+        if sY == eY and sM == eM:
+            return [[("yyyy","==",sY),("mm","==",sM),("dd",">=",sD),("dd","<=",eD)]]
+        clauses: DNFFilter = [
+            [("yyyy","==",sY),("mm","==",sM),("dd",">=",sD)],
+            [("yyyy","==",eY),("mm","==",eM),("dd","<=",eD)],
+        ]
+        if sY == eY:
+            for m in range(sM+1, eM):
+                clauses.append([("yyyy","==",sY),("mm","==",m)])
+            return clauses
+        for m in range(sM+1, 13):
+            clauses.append([("yyyy","==",sY),("mm","==",m)])
+        for y in range(sY+1, eY):
+            clauses.append([("yyyy","==",y)])
+        for m in range(1, eM):
+            clauses.append([("yyyy","==",eY),("mm","==",m)])
+        return clauses
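A usage sketch for the new store; the local path and column data are illustrative (any fsspec-backed location such as s3://… works the same way), and the comment spells out the DNF clauses the string builder produces for a cross-month window:

```python
import pandas as pd
import dask.dataframe as dd
from sibi_dst.utils.storage_hive import HiveDatePartitionedStore

store = HiveDatePartitionedStore("/tmp/events_hive", date_col="tracking_dt")

pdf = pd.DataFrame({
    "tracking_dt": pd.to_datetime(["2025-06-28", "2025-07-03"]),
    "value": [1, 2],
})
store.write(dd.from_pandas(pdf, npartitions=1), overwrite=True)

# For 2025-06-28 → 2025-07-03 the string DNF builder prunes partitions with two clauses:
#   [("yyyy","==","2025"), ("mm","==","06"), ("dd",">=","28")]  OR
#   [("yyyy","==","2025"), ("mm","==","07"), ("dd","<=","03")]
window = store.read_range("2025-06-28", "2025-07-03", columns=["tracking_dt", "value"])
print(window.compute())
```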
sibi_dst/utils/storage_manager.py
CHANGED
@@ -83,7 +83,7 @@ class StorageManager:
             self.fs.rm(sub_path, recursive=True)
             self.fs.mkdirs(sub_path, exist_ok=True)
 
-    def rebuild_depot_paths(self, depots, clear_existing=False):
+    def rebuild_depot_paths(self, depots, clear_existing=False, write_mode="full-access"):
        """
        Rebuilds depot_paths (dictionary) and depot_name (SimpleNamespace).
        Handles clear_existing scenario by resetting directories when required.
@@ -96,7 +96,8 @@ class StorageManager:
             depot_path = self.join_paths(self.storage_path, depot)
             if self.debug:
                 print(f"Rebuilding depot at: {depot_path}")
-
+            if write_mode == "full-access":
+                self.setup_directories(depot_path, sub_directories, clear_existing=clear_existing)
 
         # Generate depot_paths dictionary
         self.depot_paths = {
{sibi_dst-2025.8.5.dist-info → sibi_dst-2025.8.7.dist-info}/RECORD
CHANGED
@@ -2,7 +2,7 @@ sibi_dst/__init__.py,sha256=D01Z2Ds4zES8uz5Zp7qOWD0EcfCllWgew7AWt2X1SQg,445
 sibi_dst/df_helper/__init__.py,sha256=CyDXtFhRnMrycktxNO8jGGkP0938QiScl56kMZS1Sf8,578
 sibi_dst/df_helper/_artifact_updater_async.py,sha256=0lUwel-IkmKewRnmMv9GtuT-P6SivkIKtgOHvKchHlc,8462
 sibi_dst/df_helper/_artifact_updater_threaded.py,sha256=M5GNZismOqMmBrcyfolP1DPv87VILQf_P18is_epn50,7238
-sibi_dst/df_helper/_df_helper.py,sha256=
+sibi_dst/df_helper/_df_helper.py,sha256=IqlfTPnbXyaLLkwn8iaulHLuJ6LlBB3hSR3e5O8ixQ0,14360
 sibi_dst/df_helper/_parquet_artifact.py,sha256=tqYOjwxHV1MsADmn-RNFuVI_RrEvvmCJHZieRcsVXuc,12334
 sibi_dst/df_helper/_parquet_reader.py,sha256=tFq0OQVczozbKZou93vscokp2R6O2DIJ1zHbZqVjagc,3069
 sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -37,9 +37,10 @@ sibi_dst/osmnx_helper/utils.py,sha256=HfxrmXVPq3akf68SiwncbAp7XI1ER-zp8YN_doh7Ya
 sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
 sibi_dst/utils/__init__.py,sha256=vShNCOMPw8KKwlb4tq5XGrpjqakJ_OE8YDc_xDAWAxI,1302
+sibi_dst/utils/async_utils.py,sha256=53aywfgq1Q6-0OVr9qR1Sf6g7Qv3I9qunAAR4fjFXBE,351
 sibi_dst/utils/base.py,sha256=IyObjZ7AaE-YjVU0RLIXNCnQKWwzi5NH2I6D1KfcIyk,8716
 sibi_dst/utils/business_days.py,sha256=dP0Xj4FhTBIvZZrZYLOHZl5zOpDAgWkD4p_1a7BOT7I,8461
-sibi_dst/utils/clickhouse_writer.py,sha256=
+sibi_dst/utils/clickhouse_writer.py,sha256=NngJyJpx2PjUQWsX0YmwCuGdeViK77Wi3HmYqHz3jTc,9544
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
 sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
 sibi_dst/utils/data_utils.py,sha256=7bLidEjppieNoozDFb6OuRY0W995cxg4tiGAlkGfePI,7768
@@ -54,8 +55,9 @@ sibi_dst/utils/manifest_manager.py,sha256=9y4cV-Ig8O-ekhApp_UObTY-cTsl-bGnvKIThI
 sibi_dst/utils/parquet_saver.py,sha256=aYBlijqPAn-yuJXhmaRIteAN_IAQZvPh8I8Os2TLGgI,4861
 sibi_dst/utils/periods.py,sha256=8eTGi-bToa6_a8Vwyg4fkBPryyzft9Nzy-3ToxjqC8c,1434
 sibi_dst/utils/phone_formatter.py,sha256=oeM22nLjhObENrpItCNeVpkYS4pXRm5hSxdk0M4nvwU,4580
-sibi_dst/utils/storage_config.py,sha256=
-sibi_dst/utils/
+sibi_dst/utils/storage_config.py,sha256=DLtP5jKVM0mdFdgRw6LQfRqyavMjJcCVU7GhsUCRH78,4427
+sibi_dst/utils/storage_hive.py,sha256=FCF6zSTM_VWBEvSuTjn2bmb69oqsYjSS6nvnSZrJRFY,7123
+sibi_dst/utils/storage_manager.py,sha256=La1NY79bhRAmHWXp7QcXJZtbHoRboJMgoXOSXbIl1SA,6643
 sibi_dst/utils/update_planner.py,sha256=smlMHpr1p8guZnP5SyzCe6RsC-XkPOJWIsdeospUyb0,11471
 sibi_dst/utils/webdav_client.py,sha256=D9J5d1f1qQwHGm5FE5AMVpOPwcU5oD7K8JZoKGP8NpM,5811
 sibi_dst/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -78,6 +80,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.8.
-sibi_dst-2025.8.
-sibi_dst-2025.8.
+sibi_dst-2025.8.7.dist-info/METADATA,sha256=6sDcEFzHqZK8J1kSjtOCT_m-e5peFg4gFHpAGfeZWRw,2610
+sibi_dst-2025.8.7.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.8.7.dist-info/RECORD,,
{sibi_dst-2025.8.5.dist-info → sibi_dst-2025.8.7.dist-info}/WHEEL
File without changes