sibi-dst 2025.9.6-py3-none-any.whl → 2025.9.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/utils/boilerplate/__init__.py +2 -0
- sibi_dst/utils/boilerplate/base_pipeline.py +178 -0
- {sibi_dst-2025.9.6.dist-info → sibi_dst-2025.9.7.dist-info}/METADATA +1 -1
- {sibi_dst-2025.9.6.dist-info → sibi_dst-2025.9.7.dist-info}/RECORD +5 -4
- {sibi_dst-2025.9.6.dist-info → sibi_dst-2025.9.7.dist-info}/WHEEL +0 -0
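In short, this release adds a new `BasePipeline` boilerplate class in `sibi_dst/utils/boilerplate/base_pipeline.py` and re-exports it from `sibi_dst.utils.boilerplate`; the remaining changes are the routine METADATA and RECORD updates that accompany the version bump.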
sibi_dst/utils/boilerplate/__init__.py

```diff
@@ -3,6 +3,7 @@ from .base_data_cube import BaseDataCube
 from .base_attacher import make_attacher
 from .base_parquet_reader import BaseParquetReader
 from .hybrid_data_loader import HybridDataLoader
+from .base_pipeline import BasePipeline

 __all__ = [
     "BaseDataCube",
@@ -10,5 +11,6 @@ __all__ = [
     "make_attacher",
     "BaseParquetReader",
     "HybridDataLoader",
+    "BasePipeline",
 ]
```
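With this re-export in place, downstream code can import the new class straight from the boilerplate package, as the added `__all__` entry confirms:

```python
from sibi_dst.utils.boilerplate import BasePipeline
```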
sibi_dst/utils/boilerplate/base_pipeline.py (new file, +178 lines)

```python
from __future__ import annotations

import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import Type, Any, Callable, List

import pandas as pd
import dask.dataframe as dd

from sibi_dst.utils import ManagedResource, ParquetSaver
from sibi_dst.df_helper import ParquetReader
from sibi_dst.utils.dask_utils import dask_is_empty


class DateRangeHelper:
    @staticmethod
    def generate_daily_ranges(start_date: str, end_date: str, date_format: str = "%Y-%m-%d") -> List[str]:
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date)
        return [d.strftime(date_format) for d in pd.date_range(start, end, freq="D")]

    @staticmethod
    def generate_monthly_ranges(start_date: str, end_date: str, date_format: str = "%Y-%m-%d") -> List[tuple[str, str]]:
        """
        Generate (start_date, end_date) tuples for each calendar month in range.
        Always includes the first and last month, even if partial.
        """
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date)
        ranges = []
        current = start.replace(day=1)
        while current <= end:
            month_end = (current + pd.offsets.MonthEnd(0)).normalize()
            ranges.append((
                current.strftime(date_format),
                min(month_end, end).strftime(date_format)
            ))
            current += pd.DateOffset(months=1)
        return ranges


class BasePipeline(ManagedResource):
    def __init__(
        self,
        start_date: str,
        end_date: str,
        dataset_cls: Type,
        parquet_storage_path: str,
        *,
        fs: Any,
        filename: str = "dataset",
        date_field: str = "date",
        max_workers: int = 4,
        dataset_kwargs: dict = None,
        **kwargs,
    ):
        kwargs["fs"] = fs
        super().__init__(**kwargs)

        self.start_date = start_date
        self.end_date = end_date
        self.fs = fs
        self.filename = filename
        self.date_field = date_field
        self.max_workers = max_workers
        self.storage_path = parquet_storage_path.rstrip("/")
        self.df: dd.DataFrame | None = None

        self.ds = dataset_cls(
            start_date=self.start_date,
            end_date=self.end_date,
            debug=self.debug,
            logger=self.logger,
            **(dataset_kwargs or {}),
        )

    def _get_storage_path_for_date(self, date: pd.Timestamp) -> str:
        return f"{self.storage_path}/{date.year}/{date.month:02d}/{date.day:02d}"

    def _get_output_filename(self, fmt: str = "parquet") -> str:
        return f"{self.filename}.{fmt}"

    async def aload(self, **kwargs) -> dd.DataFrame:
        await self.emit("status", message="Loading dataset...", progress=5)
        self.df = await self.ds.aload(**kwargs)
        return self.df

    async def to_parquet(self, **kwargs) -> None:
        df = await self.aload(**kwargs)
        if dask_is_empty(df):
            self.logger.warning("No data to save.")
            return

        df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
        dates = DateRangeHelper.generate_daily_ranges(self.start_date, self.end_date)

        tasks = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            for date_str in dates:
                date_obj = pd.to_datetime(date_str).date()
                df_day = df[df[self.date_field].dt.date == date_obj]
                if dask_is_empty(df_day):
                    self.logger.info(f"No data for {date_obj}, skipping.")
                    continue

                path = self._get_storage_path_for_date(pd.Timestamp(date_obj))
                await self.emit("status", message=f"Saving data for {date_obj}")

                saver = ParquetSaver(
                    df_result=df_day,
                    parquet_storage_path=path,
                    fs=self.fs,
                    debug=self.debug,
                    logger=self.logger,
                )

                tasks.append(
                    asyncio.get_running_loop().run_in_executor(
                        executor, saver.save_to_parquet, self._get_output_filename()
                    )
                )

            await asyncio.gather(*tasks)
        await self.emit("complete", message="All partitions written.")

    async def from_parquet(self, **kwargs) -> dd.DataFrame:
        reader = ParquetReader(
            parquet_start_date=self.start_date,
            parquet_end_date=self.end_date,
            parquet_storage_path=self.storage_path,
            parquet_filename=self._get_output_filename(),
            fs=self.fs,
            debug=self.debug,
            logger=self.logger,
        )
        return await reader.aload(**kwargs)

    async def to_clickhouse(self, clk_conf: dict, **kwargs):
        """
        Writes daily-partitioned data to ClickHouse using concurrent threads.
        """
        from sibi_dst.utils import ClickHouseWriter

        df = await self.from_parquet(**kwargs)
        if dask_is_empty(df):
            self.logger.warning("No data to write to ClickHouse.")
            return

        df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
        df = df.persist()

        unique_dates = df[self.date_field].dt.date.dropna().unique().compute()
        if len(unique_dates) == 0:
            self.logger.warning("No valid dates found for partitioning.")
            return

        clk = ClickHouseWriter(**clk_conf)
        loop = asyncio.get_running_loop()
        tasks = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            for date in unique_dates:
                df_day = df[df[self.date_field].dt.date == date]
                if dask_is_empty(df_day):
                    self.logger.info(f"[ClickHouse] No data for {date}, skipping.")
                    continue

                self.logger.info(f"[ClickHouse] Writing {len(df_day)} rows for {date}")

                tasks.append(
                    loop.run_in_executor(executor, clk.save_to_clickhouse, df_day)
                )

            await asyncio.gather(*tasks)

        self.logger.info(f"ClickHouse write complete for {len(unique_dates)} daily partitions.")


__all__ = ["BasePipeline"]
```
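For orientation, here is a minimal usage sketch of the new class. It assumes a hypothetical `DailyOrdersDataset` exposing an `aload()` coroutine (any class passed as `dataset_cls`), an fsspec-style filesystem object for `fs`, and illustrative paths and dates; none of these names come from the package itself, and `ManagedResource` may accept further keyword arguments not shown here.

```python
import asyncio

import fsspec  # assumption: `fs` is an fsspec-compatible filesystem object

from sibi_dst.utils.boilerplate import BasePipeline
from myproject.datasets import DailyOrdersDataset  # hypothetical dataset class with an aload() coroutine


async def main() -> None:
    fs = fsspec.filesystem("file")  # swap for s3/gcs/etc. in real deployments
    pipeline = BasePipeline(
        start_date="2025-09-01",
        end_date="2025-09-07",
        dataset_cls=DailyOrdersDataset,
        parquet_storage_path="/data/lake/daily_orders",
        fs=fs,
        filename="daily_orders",
        date_field="order_date",
        max_workers=4,
    )

    # Loads the dataset via dataset_cls.aload() and writes one parquet file per day
    # under <storage_path>/<YYYY>/<MM>/<DD>/daily_orders.parquet.
    await pipeline.to_parquet()

    # Reads the daily partitions back for the same date range as a Dask DataFrame.
    df = await pipeline.from_parquet()
    print(df.head())


asyncio.run(main())
```

The sketch only exercises `to_parquet()` and `from_parquet()`; `to_clickhouse()` additionally needs a `clk_conf` dict accepted by `ClickHouseWriter`, which is outside the scope of this diff.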
{sibi_dst-2025.9.6.dist-info → sibi_dst-2025.9.7.dist-info}/RECORD

```diff
@@ -40,11 +40,12 @@ sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUH
 sibi_dst/utils/__init__.py,sha256=vShNCOMPw8KKwlb4tq5XGrpjqakJ_OE8YDc_xDAWAxI,1302
 sibi_dst/utils/async_utils.py,sha256=53aywfgq1Q6-0OVr9qR1Sf6g7Qv3I9qunAAR4fjFXBE,351
 sibi_dst/utils/base.py,sha256=sFngliI7Ku8bZMz0YdVhppuaPNZ0dvqRwCsPe9XdF1A,16256
-sibi_dst/utils/boilerplate/__init__.py,sha256=
+sibi_dst/utils/boilerplate/__init__.py,sha256=Zi4jHfYm_fGsXwG6TVxUUPjWQMYgZS-HsGcva7QxosU,430
 sibi_dst/utils/boilerplate/base_attacher.py,sha256=JRAyvfljQjKVD5BJDDd09cBY9pGPIe8LQp0aUv_xJs0,736
 sibi_dst/utils/boilerplate/base_data_cube.py,sha256=ErKTM2kT8LsSXADcyYvT436O_Mp0J2hm8xs1IUircb4,2760
 sibi_dst/utils/boilerplate/base_parquet_artifact.py,sha256=oqPbjHFfChA9j1WL-eDAh7XLA3zmf-Rq7s_kzITVniA,3753
 sibi_dst/utils/boilerplate/base_parquet_reader.py,sha256=3kN9_bbxyX-WuJLMBWejeApW2V_BDArSljhSUOAOhVU,692
+sibi_dst/utils/boilerplate/base_pipeline.py,sha256=R9_mMEn8gCtfTS7c3DyzWMf_oQjCSL_O7CR8z_t3nmc,6323
 sibi_dst/utils/boilerplate/hybrid_data_loader.py,sha256=Tazn7QL3FmWKVMXxzkvxPrG_2ucsPHvSotIW9dBLoNc,6018
 sibi_dst/utils/business_days.py,sha256=dP0Xj4FhTBIvZZrZYLOHZl5zOpDAgWkD4p_1a7BOT7I,8461
 sibi_dst/utils/clickhouse_writer.py,sha256=AOv0bYFzAI0u4dEkEBoUqtHekwPMISdNT5-ywCtDe4I,17049
@@ -93,6 +94,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.9.
-sibi_dst-2025.9.
-sibi_dst-2025.9.
+sibi_dst-2025.9.7.dist-info/METADATA,sha256=oQWEtZDzZysq1YKGV8h5siw_DNhiuZvojgcmlx9BM8k,2710
+sibi_dst-2025.9.7.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.9.7.dist-info/RECORD,,
```

{sibi_dst-2025.9.6.dist-info → sibi_dst-2025.9.7.dist-info}/WHEEL: file without changes.