sibi-dst 2025.9.6-py3-none-any.whl → 2025.9.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sibi_dst/utils/boilerplate/__init__.py

@@ -3,6 +3,7 @@ from .base_data_cube import BaseDataCube
 from .base_attacher import make_attacher
 from .base_parquet_reader import BaseParquetReader
 from .hybrid_data_loader import HybridDataLoader
+from .base_pipeline import BasePipeline

 __all__ = [
     "BaseDataCube",
@@ -10,5 +11,6 @@ __all__ = [
     "make_attacher",
     "BaseParquetReader",
     "HybridDataLoader",
+    "BasePipeline",
 ]

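With BasePipeline re-exported from the boilerplate package, downstream code can import it next to the existing helpers. A one-line example, using the import path implied by the package layout in the RECORD section below:

    from sibi_dst.utils.boilerplate import BasePipeline, HybridDataLoader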
sibi_dst/utils/boilerplate/base_pipeline.py (new file)

@@ -0,0 +1,178 @@
+from __future__ import annotations
+
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from typing import Type, Any, Callable, List
+
+import pandas as pd
+import dask.dataframe as dd
+
+from sibi_dst.utils import ManagedResource, ParquetSaver
+from sibi_dst.df_helper import ParquetReader
+from sibi_dst.utils.dask_utils import dask_is_empty
+
+
+class DateRangeHelper:
+    @staticmethod
+    def generate_daily_ranges(start_date: str, end_date: str, date_format: str = "%Y-%m-%d") -> List[str]:
+        start = pd.to_datetime(start_date)
+        end = pd.to_datetime(end_date)
+        return [d.strftime(date_format) for d in pd.date_range(start, end, freq="D")]
+
+    @staticmethod
+    def generate_monthly_ranges(start_date: str, end_date: str, date_format: str = "%Y-%m-%d") -> List[tuple[str, str]]:
+        """
+        Generate (start_date, end_date) tuples for each calendar month in range.
+        Always includes the first and last month, even if partial.
+        """
+        start = pd.to_datetime(start_date)
+        end = pd.to_datetime(end_date)
+        ranges = []
+        current = start.replace(day=1)
+        while current <= end:
+            month_end = (current + pd.offsets.MonthEnd(0)).normalize()
+            ranges.append((
+                current.strftime(date_format),
+                min(month_end, end).strftime(date_format)
+            ))
+            current += pd.DateOffset(months=1)
+        return ranges
+
+class BasePipeline(ManagedResource):
+    def __init__(
+        self,
+        start_date: str,
+        end_date: str,
+        dataset_cls: Type,
+        parquet_storage_path: str,
+        *,
+        fs: Any,
+        filename: str = "dataset",
+        date_field: str = "date",
+        max_workers: int = 4,
+        dataset_kwargs: dict = None,
+        **kwargs,
+    ):
+        kwargs["fs"] = fs
+        super().__init__(**kwargs)
+
+        self.start_date = start_date
+        self.end_date = end_date
+        self.fs = fs
+        self.filename = filename
+        self.date_field = date_field
+        self.max_workers = max_workers
+        self.storage_path = parquet_storage_path.rstrip("/")
+        self.df: dd.DataFrame | None = None
+
+        self.ds = dataset_cls(
+            start_date=self.start_date,
+            end_date=self.end_date,
+            debug=self.debug,
+            logger=self.logger,
+            **(dataset_kwargs or {}),
+        )
+
+    def _get_storage_path_for_date(self, date: pd.Timestamp) -> str:
+        return f"{self.storage_path}/{date.year}/{date.month:02d}/{date.day:02d}"
+
+    def _get_output_filename(self, fmt: str = "parquet") -> str:
+        return f"{self.filename}.{fmt}"
+
+    async def aload(self, **kwargs) -> dd.DataFrame:
+        await self.emit("status", message="Loading dataset...", progress=5)
+        self.df = await self.ds.aload(**kwargs)
+        return self.df
+
+    async def to_parquet(self, **kwargs) -> None:
+        df = await self.aload(**kwargs)
+        if dask_is_empty(df):
+            self.logger.warning("No data to save.")
+            return
+
+        df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
+        dates = DateRangeHelper.generate_daily_ranges(self.start_date, self.end_date)
+
+        tasks = []
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            for date_str in dates:
+                date_obj = pd.to_datetime(date_str).date()
+                df_day = df[df[self.date_field].dt.date == date_obj]
+                if dask_is_empty(df_day):
+                    self.logger.info(f"No data for {date_obj}, skipping.")
+                    continue
+
+                path = self._get_storage_path_for_date(pd.Timestamp(date_obj))
+                await self.emit("status", message=f"Saving data for {date_obj}")
+
+                saver = ParquetSaver(
+                    df_result=df_day,
+                    parquet_storage_path=path,
+                    fs=self.fs,
+                    debug=self.debug,
+                    logger=self.logger,
+                )
+
+                tasks.append(
+                    asyncio.get_running_loop().run_in_executor(
+                        executor, saver.save_to_parquet, self._get_output_filename()
+                    )
+                )
+
+            await asyncio.gather(*tasks)
+        await self.emit("complete", message="All partitions written.")
+
+    async def from_parquet(self, **kwargs) -> dd.DataFrame:
+        reader = ParquetReader(
+            parquet_start_date=self.start_date,
+            parquet_end_date=self.end_date,
+            parquet_storage_path=self.storage_path,
+            parquet_filename=self._get_output_filename(),
+            fs=self.fs,
+            debug=self.debug,
+            logger=self.logger,
+        )
+        return await reader.aload(**kwargs)
+
+    async def to_clickhouse(self, clk_conf: dict, **kwargs):
+        """
+        Writes daily-partitioned data to ClickHouse using concurrent threads.
+        """
+        from sibi_dst.utils import ClickHouseWriter
+
+        df = await self.from_parquet(**kwargs)
+        if dask_is_empty(df):
+            self.logger.warning("No data to write to ClickHouse.")
+            return
+
+        df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
+        df = df.persist()
+
+        unique_dates = df[self.date_field].dt.date.dropna().unique().compute()
+        if len(unique_dates) == 0:
+            self.logger.warning("No valid dates found for partitioning.")
+            return
+
+        clk = ClickHouseWriter(**clk_conf)
+        loop = asyncio.get_running_loop()
+        tasks = []
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            for date in unique_dates:
+                df_day = df[df[self.date_field].dt.date == date]
+                if dask_is_empty(df_day):
+                    self.logger.info(f"[ClickHouse] No data for {date}, skipping.")
+                    continue
+
+                self.logger.info(f"[ClickHouse] Writing {len(df_day)} rows for {date}")
+
+                tasks.append(
+                    loop.run_in_executor(executor, clk.save_to_clickhouse, df_day)
+                )
+
+            await asyncio.gather(*tasks)
+
+        self.logger.info(f"ClickHouse write complete for {len(unique_dates)} daily partitions.")
+
+
+__all__ = ["BasePipeline"]
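The new module is self-contained enough to sketch end to end. In the example below, MyDataset, the local fsspec filesystem, and the storage path are illustrative assumptions; only BasePipeline, its constructor signature, and the to_parquet()/from_parquet() methods come from the diff above.

    import asyncio

    import dask.dataframe as dd
    import fsspec
    import pandas as pd

    from sibi_dst.utils.boilerplate import BasePipeline


    class MyDataset:
        # Hypothetical dataset: anything that accepts start_date/end_date/debug/logger
        # and exposes an async aload() returning a Dask DataFrame works here.
        def __init__(self, *, start_date, end_date, debug=False, logger=None, **kwargs):
            self.start_date, self.end_date = start_date, end_date

        async def aload(self, **kwargs):
            pdf = pd.DataFrame(
                {"date": pd.to_datetime(["2025-01-01", "2025-01-02", "2025-01-03"]),
                 "qty": [1, 2, 3]}
            )
            return dd.from_pandas(pdf, npartitions=1)


    async def main():
        pipeline = BasePipeline(
            start_date="2025-01-01",
            end_date="2025-01-03",
            dataset_cls=MyDataset,
            parquet_storage_path="/tmp/my_dataset",  # illustrative path
            fs=fsspec.filesystem("file"),            # any fsspec-compatible filesystem
            filename="my_dataset",
            date_field="date",
            max_workers=4,
        )
        await pipeline.to_parquet()         # one my_dataset.parquet per YYYY/MM/DD directory
        df = await pipeline.from_parquet()  # read the daily partitions back as a Dask DataFrame


    asyncio.run(main())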
sibi_dst/utils/clickhouse_writer.py

@@ -90,10 +90,14 @@ class ClickHouseWriter(ManagedResource):

         # one client per thread to avoid session contention
         self._tlocal = threading.local()
+        ow = self.overwrite
+        if ow:
+            self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
+            self.logger.info(f"Dropped table {self.table} (overwrite=True)")

     # ------------- public -------------

-    def save_to_clickhouse(self, df: dd.DataFrame, *, overwrite: Optional[bool] = None) -> None:
+    def save_to_clickhouse(self, df: dd.DataFrame) -> None:
         """
         Persist a Dask DataFrame into ClickHouse.

@@ -118,15 +122,10 @@ class ClickHouseWriter(ManagedResource):
         )

         # (re)create table
-        ow = self.overwrite if overwrite is None else bool(overwrite)
         dtypes = df._meta_nonempty.dtypes  # metadata-only types (no compute)
         schema_sql = self._generate_clickhouse_schema(dtypes)
         engine_sql = self._default_engine_sql() if not self.engine else self.engine

-        if ow:
-            self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
-            self.logger.info(f"Dropped table {self.table} (overwrite=True)")
-
         create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql}"
         self._command(create_sql)
         self.logger.info(f"Ensured table {self.table} exists")
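Net effect of this hunk: the DROP TABLE for overwrite mode now runs once, in __init__, driven by the writer's own overwrite setting, and save_to_clickhouse() no longer accepts a per-call override. A sketch of the call-shape change follows; connection and table settings are omitted, and overwrite as a constructor keyword is an assumption inferred from self.overwrite being read during __init__, since the rest of the constructor is outside this diff.

    # df: a dask.dataframe.DataFrame prepared elsewhere.

    # 2025.9.6 — the override could be made per call (signature removed above):
    #     writer.save_to_clickhouse(df, overwrite=True)

    # 2025.9.8 — overwrite is fixed when the writer is built; the table is dropped
    # once at construction time and every subsequent save call behaves the same.
    writer = ClickHouseWriter(overwrite=True)  # assumed kwarg, plus connection settings
    writer.save_to_clickhouse(df)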
sibi_dst-2025.9.6.dist-info/METADATA → sibi_dst-2025.9.8.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.9.6
+Version: 2025.9.8
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
sibi_dst-2025.9.6.dist-info/RECORD → sibi_dst-2025.9.8.dist-info/RECORD

@@ -40,14 +40,15 @@ sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUH
 sibi_dst/utils/__init__.py,sha256=vShNCOMPw8KKwlb4tq5XGrpjqakJ_OE8YDc_xDAWAxI,1302
 sibi_dst/utils/async_utils.py,sha256=53aywfgq1Q6-0OVr9qR1Sf6g7Qv3I9qunAAR4fjFXBE,351
 sibi_dst/utils/base.py,sha256=sFngliI7Ku8bZMz0YdVhppuaPNZ0dvqRwCsPe9XdF1A,16256
-sibi_dst/utils/boilerplate/__init__.py,sha256=zgkQ50-cKmRugOz1bHqhjVXb3Hb8rsIwN7d5-kVsRls,370
+sibi_dst/utils/boilerplate/__init__.py,sha256=Zi4jHfYm_fGsXwG6TVxUUPjWQMYgZS-HsGcva7QxosU,430
 sibi_dst/utils/boilerplate/base_attacher.py,sha256=JRAyvfljQjKVD5BJDDd09cBY9pGPIe8LQp0aUv_xJs0,736
 sibi_dst/utils/boilerplate/base_data_cube.py,sha256=ErKTM2kT8LsSXADcyYvT436O_Mp0J2hm8xs1IUircb4,2760
 sibi_dst/utils/boilerplate/base_parquet_artifact.py,sha256=oqPbjHFfChA9j1WL-eDAh7XLA3zmf-Rq7s_kzITVniA,3753
 sibi_dst/utils/boilerplate/base_parquet_reader.py,sha256=3kN9_bbxyX-WuJLMBWejeApW2V_BDArSljhSUOAOhVU,692
+sibi_dst/utils/boilerplate/base_pipeline.py,sha256=R9_mMEn8gCtfTS7c3DyzWMf_oQjCSL_O7CR8z_t3nmc,6323
 sibi_dst/utils/boilerplate/hybrid_data_loader.py,sha256=Tazn7QL3FmWKVMXxzkvxPrG_2ucsPHvSotIW9dBLoNc,6018
 sibi_dst/utils/business_days.py,sha256=dP0Xj4FhTBIvZZrZYLOHZl5zOpDAgWkD4p_1a7BOT7I,8461
-sibi_dst/utils/clickhouse_writer.py,sha256=AOv0bYFzAI0u4dEkEBoUqtHekwPMISdNT5-ywCtDe4I,17049
+sibi_dst/utils/clickhouse_writer.py,sha256=IQJ_rgd7VuF-g-aPbo9TfqZi0EB_3evCFTzcCNHSmpw,16969
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
 sibi_dst/utils/dask_utils.py,sha256=QhFcmpH4fXAy6b3DugIX5JvH4h-P3M3hXKnBYTLRkq0,1991
 sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
@@ -93,6 +94,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.9.6.dist-info/METADATA,sha256=e9vt1wbHivyTJhyubiEjJcMFBNDF1m9nERTlBgYvq9o,2710
-sibi_dst-2025.9.6.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-sibi_dst-2025.9.6.dist-info/RECORD,,
+sibi_dst-2025.9.8.dist-info/METADATA,sha256=rQ9QLcSm_bvFK2KOgi1ZmIgVZMwixMWvXT9SNmBU6fg,2710
+sibi_dst-2025.9.8.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.9.8.dist-info/RECORD,,