sibi-dst 2025.9.6-py3-none-any.whl → 2025.9.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sibi_dst/utils/boilerplate/__init__.py

@@ -3,6 +3,7 @@ from .base_data_cube import BaseDataCube
 from .base_attacher import make_attacher
 from .base_parquet_reader import BaseParquetReader
 from .hybrid_data_loader import HybridDataLoader
+from .base_pipeline import BasePipeline

 __all__ = [
     "BaseDataCube",
@@ -10,5 +11,6 @@ __all__ = [
     "make_attacher",
     "BaseParquetReader",
     "HybridDataLoader",
+    "BasePipeline",
 ]

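With BasePipeline re-exported from the boilerplate package, downstream code can import it next to the existing helpers. A one-line example, using the import path implied by the package layout in the RECORD section below:

    from sibi_dst.utils.boilerplate import BasePipeline, HybridDataLoader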
sibi_dst/utils/boilerplate/base_pipeline.py (new file)

@@ -0,0 +1,178 @@
+from __future__ import annotations
+
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from typing import Type, Any, Callable, List
+
+import pandas as pd
+import dask.dataframe as dd
+
+from sibi_dst.utils import ManagedResource, ParquetSaver
+from sibi_dst.df_helper import ParquetReader
+from sibi_dst.utils.dask_utils import dask_is_empty
+
+
+class DateRangeHelper:
+    @staticmethod
+    def generate_daily_ranges(start_date: str, end_date: str, date_format: str = "%Y-%m-%d") -> List[str]:
+        start = pd.to_datetime(start_date)
+        end = pd.to_datetime(end_date)
+        return [d.strftime(date_format) for d in pd.date_range(start, end, freq="D")]
+
+    @staticmethod
+    def generate_monthly_ranges(start_date: str, end_date: str, date_format: str = "%Y-%m-%d") -> List[tuple[str, str]]:
+        """
+        Generate (start_date, end_date) tuples for each calendar month in range.
+        Always includes the first and last month, even if partial.
+        """
+        start = pd.to_datetime(start_date)
+        end = pd.to_datetime(end_date)
+        ranges = []
+        current = start.replace(day=1)
+        while current <= end:
+            month_end = (current + pd.offsets.MonthEnd(0)).normalize()
+            ranges.append((
+                current.strftime(date_format),
+                min(month_end, end).strftime(date_format)
+            ))
+            current += pd.DateOffset(months=1)
+        return ranges
+
+class BasePipeline(ManagedResource):
+    def __init__(
+        self,
+        start_date: str,
+        end_date: str,
+        dataset_cls: Type,
+        parquet_storage_path: str,
+        *,
+        fs: Any,
+        filename: str = "dataset",
+        date_field: str = "date",
+        max_workers: int = 4,
+        dataset_kwargs: dict = None,
+        **kwargs,
+    ):
+        kwargs["fs"] = fs
+        super().__init__(**kwargs)
+
+        self.start_date = start_date
+        self.end_date = end_date
+        self.fs = fs
+        self.filename = filename
+        self.date_field = date_field
+        self.max_workers = max_workers
+        self.storage_path = parquet_storage_path.rstrip("/")
+        self.df: dd.DataFrame | None = None
+
+        self.ds = dataset_cls(
+            start_date=self.start_date,
+            end_date=self.end_date,
+            debug=self.debug,
+            logger=self.logger,
+            **(dataset_kwargs or {}),
+        )
+
+    def _get_storage_path_for_date(self, date: pd.Timestamp) -> str:
+        return f"{self.storage_path}/{date.year}/{date.month:02d}/{date.day:02d}"
+
+    def _get_output_filename(self, fmt: str = "parquet") -> str:
+        return f"{self.filename}.{fmt}"
+
+    async def aload(self, **kwargs) -> dd.DataFrame:
+        await self.emit("status", message="Loading dataset...", progress=5)
+        self.df = await self.ds.aload(**kwargs)
+        return self.df
+
+    async def to_parquet(self, **kwargs) -> None:
+        df = await self.aload(**kwargs)
+        if dask_is_empty(df):
+            self.logger.warning("No data to save.")
+            return
+
+        df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
+        dates = DateRangeHelper.generate_daily_ranges(self.start_date, self.end_date)
+
+        tasks = []
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            for date_str in dates:
+                date_obj = pd.to_datetime(date_str).date()
+                df_day = df[df[self.date_field].dt.date == date_obj]
+                if dask_is_empty(df_day):
+                    self.logger.info(f"No data for {date_obj}, skipping.")
+                    continue
+
+                path = self._get_storage_path_for_date(pd.Timestamp(date_obj))
+                await self.emit("status", message=f"Saving data for {date_obj}")
+
+                saver = ParquetSaver(
+                    df_result=df_day,
+                    parquet_storage_path=path,
+                    fs=self.fs,
+                    debug=self.debug,
+                    logger=self.logger,
+                )
+
+                tasks.append(
+                    asyncio.get_running_loop().run_in_executor(
+                        executor, saver.save_to_parquet, self._get_output_filename()
+                    )
+                )
+
+            await asyncio.gather(*tasks)
+        await self.emit("complete", message="All partitions written.")
+
+    async def from_parquet(self, **kwargs) -> dd.DataFrame:
+        reader = ParquetReader(
+            parquet_start_date=self.start_date,
+            parquet_end_date=self.end_date,
+            parquet_storage_path=self.storage_path,
+            parquet_filename=self._get_output_filename(),
+            fs=self.fs,
+            debug=self.debug,
+            logger=self.logger,
+        )
+        return await reader.aload(**kwargs)
+
+    async def to_clickhouse(self, clk_conf: dict, **kwargs):
+        """
+        Writes daily-partitioned data to ClickHouse using concurrent threads.
+        """
+        from sibi_dst.utils import ClickHouseWriter
+
+        df = await self.from_parquet(**kwargs)
+        if dask_is_empty(df):
+            self.logger.warning("No data to write to ClickHouse.")
+            return
+
+        df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
+        df = df.persist()
+
+        unique_dates = df[self.date_field].dt.date.dropna().unique().compute()
+        if len(unique_dates) == 0:
+            self.logger.warning("No valid dates found for partitioning.")
+            return
+
+        clk = ClickHouseWriter(**clk_conf)
+        loop = asyncio.get_running_loop()
+        tasks = []
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            for date in unique_dates:
+                df_day = df[df[self.date_field].dt.date == date]
+                if dask_is_empty(df_day):
+                    self.logger.info(f"[ClickHouse] No data for {date}, skipping.")
+                    continue
+
+                self.logger.info(f"[ClickHouse] Writing {len(df_day)} rows for {date}")
+
+                tasks.append(
+                    loop.run_in_executor(executor, clk.save_to_clickhouse, df_day)
+                )
+
+            await asyncio.gather(*tasks)
+
+        self.logger.info(f"ClickHouse write complete for {len(unique_dates)} daily partitions.")
+
+
+__all__ = ["BasePipeline"]
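The new module is self-contained enough to sketch end to end. In the example below, MyDataset, the local fsspec filesystem, and the storage path are illustrative assumptions; only BasePipeline, its constructor signature, and the to_parquet()/from_parquet() methods come from the diff above.

    import asyncio

    import dask.dataframe as dd
    import fsspec
    import pandas as pd

    from sibi_dst.utils.boilerplate import BasePipeline


    class MyDataset:
        # Hypothetical dataset: anything that accepts start_date/end_date/debug/logger
        # and exposes an async aload() returning a Dask DataFrame works here.
        def __init__(self, *, start_date, end_date, debug=False, logger=None, **kwargs):
            self.start_date, self.end_date = start_date, end_date

        async def aload(self, **kwargs):
            pdf = pd.DataFrame(
                {"date": pd.to_datetime(["2025-01-01", "2025-01-02", "2025-01-03"]),
                 "qty": [1, 2, 3]}
            )
            return dd.from_pandas(pdf, npartitions=1)


    async def main():
        pipeline = BasePipeline(
            start_date="2025-01-01",
            end_date="2025-01-03",
            dataset_cls=MyDataset,
            parquet_storage_path="/tmp/my_dataset",  # illustrative path
            fs=fsspec.filesystem("file"),            # any fsspec-compatible filesystem
            filename="my_dataset",
            date_field="date",
            max_workers=4,
        )
        await pipeline.to_parquet()         # one my_dataset.parquet per YYYY/MM/DD directory
        df = await pipeline.from_parquet()  # read the daily partitions back as a Dask DataFrame


    asyncio.run(main())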
sibi_dst/utils/clickhouse_writer.py

@@ -90,10 +90,14 @@ class ClickHouseWriter(ManagedResource):

         # one client per thread to avoid session contention
         self._tlocal = threading.local()
+        ow = self.overwrite
+        if ow:
+            self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
+            self.logger.info(f"Dropped table {self.table} (overwrite=True)")

     # ------------- public -------------

-    def save_to_clickhouse(self, df: dd.DataFrame, *, overwrite: Optional[bool] = None) -> None:
+    def save_to_clickhouse(self, df: dd.DataFrame) -> None:
         """
         Persist a Dask DataFrame into ClickHouse.

@@ -118,15 +122,10 @@ class ClickHouseWriter(ManagedResource):
         )

         # (re)create table
-        ow = self.overwrite if overwrite is None else bool(overwrite)
         dtypes = df._meta_nonempty.dtypes  # metadata-only types (no compute)
         schema_sql = self._generate_clickhouse_schema(dtypes)
         engine_sql = self._default_engine_sql() if not self.engine else self.engine

-        if ow:
-            self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
-            self.logger.info(f"Dropped table {self.table} (overwrite=True)")
-
         create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql}"
         self._command(create_sql)
         self.logger.info(f"Ensured table {self.table} exists")
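Net effect of this hunk: the DROP TABLE for overwrite mode now runs once, in __init__, driven by the writer's own overwrite setting, and save_to_clickhouse() no longer accepts a per-call override. A sketch of the call-shape change follows; connection and table settings are omitted, and overwrite as a constructor keyword is an assumption inferred from self.overwrite being read during __init__, since the rest of the constructor is outside this diff.

    # df: a dask.dataframe.DataFrame prepared elsewhere.

    # 2025.9.6 — the override could be made per call (signature removed above):
    #     writer.save_to_clickhouse(df, overwrite=True)

    # 2025.9.8 — overwrite is fixed when the writer is built; the table is dropped
    # once at construction time and every subsequent save call behaves the same.
    writer = ClickHouseWriter(overwrite=True)  # assumed kwarg, plus connection settings
    writer.save_to_clickhouse(df)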
sibi_dst-2025.9.6.dist-info/METADATA → sibi_dst-2025.9.8.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.9.6
+Version: 2025.9.8
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
sibi_dst-2025.9.6.dist-info/RECORD → sibi_dst-2025.9.8.dist-info/RECORD

@@ -40,14 +40,15 @@ sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUH
 sibi_dst/utils/__init__.py,sha256=vShNCOMPw8KKwlb4tq5XGrpjqakJ_OE8YDc_xDAWAxI,1302
 sibi_dst/utils/async_utils.py,sha256=53aywfgq1Q6-0OVr9qR1Sf6g7Qv3I9qunAAR4fjFXBE,351
 sibi_dst/utils/base.py,sha256=sFngliI7Ku8bZMz0YdVhppuaPNZ0dvqRwCsPe9XdF1A,16256
-sibi_dst/utils/boilerplate/__init__.py,sha256=zgkQ50-cKmRugOz1bHqhjVXb3Hb8rsIwN7d5-kVsRls,370
+sibi_dst/utils/boilerplate/__init__.py,sha256=Zi4jHfYm_fGsXwG6TVxUUPjWQMYgZS-HsGcva7QxosU,430
 sibi_dst/utils/boilerplate/base_attacher.py,sha256=JRAyvfljQjKVD5BJDDd09cBY9pGPIe8LQp0aUv_xJs0,736
 sibi_dst/utils/boilerplate/base_data_cube.py,sha256=ErKTM2kT8LsSXADcyYvT436O_Mp0J2hm8xs1IUircb4,2760
 sibi_dst/utils/boilerplate/base_parquet_artifact.py,sha256=oqPbjHFfChA9j1WL-eDAh7XLA3zmf-Rq7s_kzITVniA,3753
 sibi_dst/utils/boilerplate/base_parquet_reader.py,sha256=3kN9_bbxyX-WuJLMBWejeApW2V_BDArSljhSUOAOhVU,692
+sibi_dst/utils/boilerplate/base_pipeline.py,sha256=R9_mMEn8gCtfTS7c3DyzWMf_oQjCSL_O7CR8z_t3nmc,6323
 sibi_dst/utils/boilerplate/hybrid_data_loader.py,sha256=Tazn7QL3FmWKVMXxzkvxPrG_2ucsPHvSotIW9dBLoNc,6018
 sibi_dst/utils/business_days.py,sha256=dP0Xj4FhTBIvZZrZYLOHZl5zOpDAgWkD4p_1a7BOT7I,8461
-sibi_dst/utils/clickhouse_writer.py,sha256=AOv0bYFzAI0u4dEkEBoUqtHekwPMISdNT5-ywCtDe4I,17049
+sibi_dst/utils/clickhouse_writer.py,sha256=IQJ_rgd7VuF-g-aPbo9TfqZi0EB_3evCFTzcCNHSmpw,16969
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
 sibi_dst/utils/dask_utils.py,sha256=QhFcmpH4fXAy6b3DugIX5JvH4h-P3M3hXKnBYTLRkq0,1991
 sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
@@ -93,6 +94,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.9.6.dist-info/METADATA,sha256=e9vt1wbHivyTJhyubiEjJcMFBNDF1m9nERTlBgYvq9o,2710
-sibi_dst-2025.9.6.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-sibi_dst-2025.9.6.dist-info/RECORD,,
+sibi_dst-2025.9.8.dist-info/METADATA,sha256=rQ9QLcSm_bvFK2KOgi1ZmIgVZMwixMWvXT9SNmBU6fg,2710
+sibi_dst-2025.9.8.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.9.8.dist-info/RECORD,,