sibi-dst 2025.8.6-py3-none-any.whl → 2025.8.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +111 -61
- sibi_dst/df_helper/_parquet_artifact.py +11 -10
- sibi_dst/df_helper/_parquet_reader.py +4 -0
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +504 -214
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +11 -10
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +9 -8
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +4 -76
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -104
- sibi_dst/utils/async_utils.py +12 -0
- sibi_dst/utils/boilerplate/__init__.py +6 -0
- sibi_dst/utils/boilerplate/base_data_artifact.py +110 -0
- sibi_dst/utils/boilerplate/base_data_cube.py +79 -0
- sibi_dst/utils/data_wrapper.py +22 -263
- sibi_dst/utils/iceberg_saver.py +126 -0
- sibi_dst/utils/log_utils.py +0 -346
- sibi_dst/utils/parquet_saver.py +110 -9
- sibi_dst/utils/progress/__init__.py +5 -0
- sibi_dst/utils/progress/jobs.py +82 -0
- sibi_dst/utils/progress/sse_runner.py +82 -0
- sibi_dst/utils/storage_hive.py +232 -0
- sibi_dst/utils/update_planner.py +617 -116
- {sibi_dst-2025.8.6.dist-info → sibi_dst-2025.8.8.dist-info}/METADATA +3 -2
- {sibi_dst-2025.8.6.dist-info → sibi_dst-2025.8.8.dist-info}/RECORD +24 -15
- {sibi_dst-2025.8.6.dist-info → sibi_dst-2025.8.8.dist-info}/WHEEL +0 -0
sibi_dst/utils/storage_hive.py (new file)
@@ -0,0 +1,232 @@
+from __future__ import annotations
+
+import asyncio
+
+import pandas as pd
+import dask.dataframe as dd
+from typing import Iterable, Optional, List, Tuple, Union
+import fsspec
+
+DNFFilter = List[List[Tuple[str, str, Union[str, int]]]]
+
+
+class HiveDatePartitionedStore:
+    """
+    Dask-only Parquet store with Hive-style yyyy=…/mm=…/dd=… partitions.
+
+    - `write(...)` safely "overwrites" S3 prefixes via per-object deletes (no bulk DeleteObjects).
+    - `read_range(...)` builds DNF filters and auto-matches partition types (string vs int).
+    """
+
+    def __init__(
+        self,
+        path: str,
+        *,
+        filesystem=None,  # fsspec filesystem or None to infer from path
+        date_col: str = "tracking_dt",
+        compression: str = "zstd",
+        partition_values_as_strings: bool = True,  # keep mm=07, dd=01 folder names
+        logger=None,
+    ) -> None:
+        self.path = path
+        self.fs = filesystem or fsspec.open(path).fs
+        self.date_col = date_col
+        self.compression = compression
+        self.partition_values_as_strings = partition_values_as_strings
+        self.log = logger
+
+    # ----------------- public API -----------------
+
+    def write(
+        self,
+        df: dd.DataFrame,
+        *,
+        repartition: Optional[int] = None,
+        overwrite: bool = False,
+    ) -> None:
+        """Write Dask DataFrame to Hive-style yyyy/mm/dd partitions."""
+        self._require_col(df, self.date_col)
+        ser = dd.to_datetime(df[self.date_col], errors="coerce")
+
+        if self.partition_values_as_strings:
+            parts = {
+                "yyyy": ser.dt.strftime("%Y"),
+                "mm": ser.dt.strftime("%m"),
+                "dd": ser.dt.strftime("%d"),
+            }
+        else:
+            parts = {
+                "yyyy": ser.dt.year.astype("int32"),
+                "mm": ser.dt.month.astype("int8"),
+                "dd": ser.dt.day.astype("int8"),
+            }
+
+        df = df.assign(**{self.date_col: ser}, **parts)
+
+        if repartition:
+            df = df.repartition(npartitions=repartition)
+
+        if overwrite:
+            self._safe_rm_prefix(self.path)
+
+        if self.log:
+            self.log.info(f"Writing parquet to {self.path} (hive yyyy/mm/dd)…")
+
+        df.to_parquet(
+            self.path,
+            engine="pyarrow",
+            write_index=False,
+            filesystem=self.fs,
+            partition_on=["yyyy", "mm", "dd"],
+            compression=self.compression,
+            overwrite=False,  # we pre-cleaned if overwrite=True
+        )
+
+    def read_range(
+        self,
+        start: Union[str, pd.Timestamp],
+        end: Union[str, pd.Timestamp],
+        *,
+        columns: Optional[Iterable[str]] = None,
+    ) -> dd.DataFrame:
+        """
+        Read a date window with partition pruning. Tries string filters first,
+        falls back to integer filters if Arrow infers partition types as ints.
+        """
+        str_filters = self._dnf_filters_for_range_str(start, end)
+        try:
+            return dd.read_parquet(
+                self.path,
+                engine="pyarrow",
+                filesystem=self.fs,
+                columns=list(columns) if columns else None,
+                filters=str_filters,
+            )
+        except Exception:
+            int_filters = self._dnf_filters_for_range_int(start, end)
+            return dd.read_parquet(
+                self.path,
+                engine="pyarrow",
+                filesystem=self.fs,
+                columns=list(columns) if columns else None,
+                filters=int_filters,
+            )
+
+    # Convenience: full month / single day
+    def read_month(self, year: int, month: int, *, columns=None) -> dd.DataFrame:
+        start = pd.Timestamp(year=year, month=month, day=1)
+        end = (start + pd.offsets.MonthEnd(0))
+        return self.read_range(start, end, columns=columns)
+
+    def read_day(self, year: int, month: int, day: int, *, columns=None) -> dd.DataFrame:
+        ts = pd.Timestamp(year=year, month=month, day=day)
+        return self.read_range(ts, ts, columns=columns)
+
+    # ----------------- internals -----------------
+
+    @staticmethod
+    def _pad2(n: int) -> str:
+        return f"{n:02d}"
+
+    def _safe_rm_prefix(self, path: str) -> None:
+        """Per-object delete to avoid S3 bulk DeleteObjects (and Content-MD5 issues)."""
+        if not self.fs.exists(path):
+            return
+        if self.log:
+            self.log.info(f"Cleaning prefix (safe delete): {path}")
+        for k in self.fs.find(path):
+            try:
+                (self.fs.rm_file(k) if hasattr(self.fs, "rm_file") else self.fs.rm(k, recursive=False))
+            except Exception as e:
+                if self.log:
+                    self.log.warning(f"Could not delete {k}: {e}")
+
+    @staticmethod
+    def _require_col(df: dd.DataFrame, col: str) -> None:
+        if col not in df.columns:
+            raise KeyError(f"'{col}' not in DataFrame")
+
+    # ---- DNF builders (string vs int) ----
+    def _dnf_filters_for_range_str(self, start, end) -> DNFFilter:
+        s, e = pd.Timestamp(start), pd.Timestamp(end)
+        if s > e:
+            raise ValueError("start > end")
+        sY, sM, sD = s.year, s.month, s.day
+        eY, eM, eD = e.year, e.month, e.day
+        p2 = self._pad2
+        if sY == eY and sM == eM:
+            return [[("yyyy","==",str(sY)),("mm","==",p2(sM)),("dd",">=",p2(sD)),("dd","<=",p2(eD))]]
+        clauses: DNFFilter = [
+            [("yyyy","==",str(sY)),("mm","==",p2(sM)),("dd",">=",p2(sD))],
+            [("yyyy","==",str(eY)),("mm","==",p2(eM)),("dd","<=",p2(eD))]
+        ]
+        if sY == eY:
+            for m in range(sM+1, eM):
+                clauses.append([("yyyy","==",str(sY)),("mm","==",p2(m))])
+            return clauses
+        for m in range(sM+1, 13):
+            clauses.append([("yyyy","==",str(sY)),("mm","==",p2(m))])
+        for y in range(sY+1, eY):
+            clauses.append([("yyyy","==",str(y))])
+        for m in range(1, eM):
+            clauses.append([("yyyy","==",str(eY)),("mm","==",p2(m))])
+        return clauses
+
+    @staticmethod
+    def _dnf_filters_for_range_int(start, end) -> DNFFilter:
+        s, e = pd.Timestamp(start), pd.Timestamp(end)
+        if s > e:
+            raise ValueError("start > end")
+        sY, sM, sD = s.year, s.month, s.day
+        eY, eM, eD = e.year, e.month, e.day
+        if sY == eY and sM == eM:
+            return [[("yyyy","==",sY),("mm","==",sM),("dd",">=",sD),("dd","<=",eD)]]
+        clauses: DNFFilter = [
+            [("yyyy","==",sY),("mm","==",sM),("dd",">=",sD)],
+            [("yyyy","==",eY),("mm","==",eM),("dd","<=",eD)],
+        ]
+        if sY == eY:
+            for m in range(sM+1, eM):
+                clauses.append([("yyyy","==",sY),("mm","==",m)])
+            return clauses
+        for m in range(sM+1, 13):
+            clauses.append([("yyyy","==",sY),("mm","==",m)])
+        for y in range(sY+1, eY):
+            clauses.append([("yyyy","==",y)])
+        for m in range(1, eM):
+            clauses.append([("yyyy","==",eY),("mm","==",m)])
+        return clauses
+
+    async def write_async(
+        self,
+        df: dd.DataFrame,
+        *,
+        repartition: int | None = None,
+        overwrite: bool = False,
+        timeout: float | None = None,
+    ) -> None:
+        async def _run():
+            return await asyncio.to_thread(self.write, df, repartition=repartition, overwrite=overwrite)
+
+        return await (asyncio.wait_for(_run(), timeout) if timeout else _run())
+
+    async def read_range_async(
+        self,
+        start, end, *, columns: Iterable[str] | None = None, timeout: float | None = None
+    ) -> dd.DataFrame:
+        async def _run():
+            return await asyncio.to_thread(self.read_range, start, end, columns=columns)
+
+        return await (asyncio.wait_for(_run(), timeout) if timeout else _run())
+
+    async def read_month_async(self, year: int, month: int, *, columns=None, timeout: float | None = None):
+        async def _run():
+            return await asyncio.to_thread(self.read_month, year, month, columns=columns)
+
+        return await (asyncio.wait_for(_run(), timeout) if timeout else _run())
+
+    async def read_day_async(self, year: int, month: int, day: int, *, columns=None, timeout: float | None = None):
+        async def _run():
+            return await asyncio.to_thread(self.read_day, year, month, day, columns=columns)
+
+        return await (asyncio.wait_for(_run(), timeout) if timeout else _run())