sibi-dst 2025.8.1__py3-none-any.whl → 2025.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/__init__.py +3 -2
- sibi_dst/df_helper/_artifact_updater_async.py +238 -0
- sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
- sibi_dst/df_helper/_df_helper.py +26 -7
- sibi_dst/df_helper/_parquet_artifact.py +24 -4
- sibi_dst/df_helper/_parquet_reader.py +9 -10
- sibi_dst/df_helper/core/_filter_handler.py +116 -37
- sibi_dst/utils/__init__.py +2 -0
- sibi_dst/utils/base.py +153 -224
- sibi_dst/utils/business_days.py +248 -0
- sibi_dst/utils/data_wrapper.py +166 -106
- sibi_dst/utils/date_utils.py +711 -394
- sibi_dst/utils/file_age_checker.py +301 -0
- sibi_dst/utils/periods.py +42 -0
- sibi_dst/utils/update_planner.py +2 -2
- {sibi_dst-2025.8.1.dist-info → sibi_dst-2025.8.3.dist-info}/METADATA +1 -1
- {sibi_dst-2025.8.1.dist-info → sibi_dst-2025.8.3.dist-info}/RECORD +18 -14
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -315
- {sibi_dst-2025.8.1.dist-info → sibi_dst-2025.8.3.dist-info}/WHEEL +0 -0
sibi_dst/utils/business_days.py (new file)
@@ -0,0 +1,248 @@
+import datetime as dt
+from typing import Any, Dict, Iterable, Optional
+from sibi_dst.utils import Logger
+import numpy as np
+import pandas as pd
+import dask.dataframe as dd
+
+
+# ---------------- Vectorized helpers (used by Dask map_partitions) ----------------
+
+def _to_np_days(series: pd.Series) -> np.ndarray:
+    """Coerce to numpy datetime64[D] with NaT-safe conversion."""
+    # Use pandas for robust parsing, then cast to date-days
+    s = pd.to_datetime(series, errors="coerce")
+    # Convert to numpy datetime64[D] (day precision)
+    return s.values.astype("datetime64[D]")
+
+
+def _vectorized_busday_count(
+    part: pd.DataFrame,
+    begin_col: str,
+    end_col: str,
+    holidays: Iterable[str],
+    weekmask: Optional[str],
+    inclusive: bool,
+) -> pd.Series:
+    start = _to_np_days(part[begin_col])  # numpy datetime64[D]
+    end = _to_np_days(part[end_col])  # numpy datetime64[D]
+
+    kwargs: Dict[str, Any] = {}
+    if holidays:
+        kwargs["holidays"] = np.array(list(holidays), dtype="datetime64[D]")
+    if weekmask:
+        kwargs["weekmask"] = weekmask
+
+    end_adj = end
+    if inclusive:
+        with np.errstate(invalid="ignore"):
+            end_adj = end + np.timedelta64(1, "D")
+
+    valid = (~pd.isna(start)) & (~pd.isna(end))  # numpy bool mask
+    result = np.full(part.shape[0], np.nan, dtype="float64")
+    if valid.any():
+        counts = np.busday_count(
+            start[valid].astype("datetime64[D]"),
+            end_adj[valid].astype("datetime64[D]"),
+            **kwargs,
+        ).astype("float64")
+        result[valid] = counts
+
+    return pd.Series(result, index=part.index)
+
+
+def _vectorized_busday_offset(
+    part: pd.DataFrame,
+    start_col: str,
+    n_days_col: str,
+    holidays: Iterable[str],
+    weekmask: Optional[str],
+    roll: str,
+) -> pd.Series:
+    start = _to_np_days(part[start_col])  # numpy datetime64[D]
+    n_days = pd.to_numeric(part[n_days_col], errors="coerce").to_numpy()  # numpy float -> cast later
+
+    kwargs: Dict[str, Any] = {"roll": roll}
+    if holidays:
+        kwargs["holidays"] = np.array(list(holidays), dtype="datetime64[D]")
+    if weekmask:
+        kwargs["weekmask"] = weekmask
+
+    valid = (~pd.isna(start)) & (~pd.isna(n_days))  # numpy bool mask
+    out = np.full(part.shape[0], np.datetime64("NaT", "ns"), dtype="datetime64[ns]")
+    if valid.any():
+        offs = np.busday_offset(
+            start[valid].astype("datetime64[D]"),
+            n_days[valid].astype("int64"),
+            **kwargs,
+        ).astype("datetime64[ns]")
+        out[valid] = offs
+
+    return pd.Series(out, index=part.index)
+
+
+# ---------------- BusinessDays ----------------
+
+class BusinessDays:
+    """
+    Business day calculations with custom holidays and optional weekmask.
+
+    Features
+    - Scalar helpers:
+      - get_business_days_count(begin, end, inclusive=False) -> int
+      - add_business_days(start_date, n_days, roll='forward') -> np.datetime64
+    - Dask DataFrame helpers (vectorized via map_partitions):
+      - calc_business_days_from_df(df, begin_col, end_col, result_col='business_days', inclusive=False)
+      - calc_sla_end_date(df, start_date_col, n_days_col, result_col='sla_end_date', roll='forward')
+
+    Parameters
+    ----------
+    holiday_list : dict[str, list[str]] | Iterable[str]
+        Either a mapping of year -> [YYYY-MM-DD, ...] or a flat iterable of YYYY-MM-DD strings.
+    logger : Any
+        Logger with .debug/.info/.warning/.error.
+    weekmask : str | None
+        A numpy business day weekmask like '1111100' (Mon–Fri). None means default Mon–Fri.
+        Examples:
+        '1111100' -> Mon-Fri
+        '1111110' -> Mon-Sat
+    """
+
+    def __init__(
+        self,
+        holiday_list: Dict[str, list[str]] | Iterable[str],
+        debug: bool = False,
+        logger: Optional[Logger] = None,
+        weekmask: Optional[str] = None,
+    ) -> None:
+        self.debug = debug
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.weekmask = weekmask
+
+        # Normalize holidays to a flat, sorted tuple of 'YYYY-MM-DD'
+        if isinstance(holiday_list, dict):
+            flat = [d for _, days in sorted(holiday_list.items()) for d in days]
+        else:
+            flat = list(holiday_list)
+        # Deduplicate while preserving order
+        seen = set()
+        flat_unique = []
+        for d in flat:
+            if d not in seen:
+                seen.add(d)
+                flat_unique.append(d)
+        self.holidays: tuple[str, ...] = tuple(flat_unique)
+
+    # -------- Scalar API --------
+
+    def get_business_days_count(
+        self,
+        begin_date: str | dt.date | pd.Timestamp,
+        end_date: str | dt.date | pd.Timestamp,
+        *,
+        inclusive: bool = False,
+    ) -> int:
+        """Business days between two dates. If inclusive=True, include the end date."""
+        b = pd.to_datetime(begin_date).date()
+        e = pd.to_datetime(end_date).date()
+
+        kwargs: Dict[str, Any] = {}
+        if self.holidays:
+            kwargs["holidays"] = np.array(self.holidays, dtype="datetime64[D]")
+        if self.weekmask:
+            kwargs["weekmask"] = self.weekmask
+
+        if inclusive:
+            e_np = np.datetime64(e) + np.timedelta64(1, "D")
+        else:
+            e_np = np.datetime64(e)
+
+        val = int(np.busday_count(np.datetime64(b), e_np, **kwargs))
+        return val
+
+    def add_business_days(
+        self,
+        start_date: str | dt.date | pd.Timestamp,
+        n_days: int,
+        *,
+        roll: str = "forward",
+    ) -> np.datetime64:
+        """
+        Add (or subtract) business days to a date. Returns numpy datetime64[D].
+        roll: {'forward','backward','following','preceding','modifiedfollowing',
+               'modifiedpreceding','nat'}
+        """
+        s = pd.to_datetime(start_date).date()
+        kwargs: Dict[str, Any] = {"roll": roll}
+        if self.holidays:
+            kwargs["holidays"] = np.array(self.holidays, dtype="datetime64[D]")
+        if self.weekmask:
+            kwargs["weekmask"] = self.weekmask
+
+        return np.busday_offset(np.datetime64(s), int(n_days), **kwargs)
+
+    # -------- Dask API --------
+
+    def calc_business_days_from_df(
+        self,
+        df: dd.DataFrame,
+        begin_date_col: str,
+        end_date_col: str,
+        result_col: str = "business_days",
+        *,
+        inclusive: bool = False,
+    ) -> dd.DataFrame:
+        """
+        Vectorized business-day difference between two date columns.
+        Produces float64 (NaN where either side is missing).
+        """
+        missing = {begin_date_col, end_date_col} - set(df.columns)
+        if missing:
+            self.logger.error(f"Missing columns: {missing}")
+            raise ValueError("Required columns are missing from DataFrame")
+
+        return df.assign(
+            **{
+                result_col: df.map_partitions(
+                    _vectorized_busday_count,
+                    begin_col=begin_date_col,
+                    end_col=end_date_col,
+                    holidays=self.holidays,
+                    weekmask=self.weekmask,
+                    inclusive=inclusive,
+                    meta=(result_col, "f8"),
+                )
+            }
+        )
+
+    def calc_sla_end_date(
+        self,
+        df: dd.DataFrame,
+        start_date_col: str,
+        n_days_col: str,
+        result_col: str = "sla_end_date",
+        *,
+        roll: str = "forward",
+    ) -> dd.DataFrame:
+        """
+        Vectorized business-day offset for SLA end date.
+        Produces datetime64[ns] with NaT where invalid.
+        """
+        missing = {start_date_col, n_days_col} - set(df.columns)
+        if missing:
+            self.logger.error(f"Missing columns: {missing}")
+            raise ValueError("Required columns are missing from DataFrame")
+
+        return df.assign(
+            **{
+                result_col: df.map_partitions(
+                    _vectorized_busday_offset,
+                    start_col=start_date_col,
+                    n_days_col=n_days_col,
+                    holidays=self.holidays,
+                    weekmask=self.weekmask,
+                    roll=roll,
+                    meta=(result_col, "datetime64[ns]"),
+                )
+            }
+        )
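
For orientation, below is a minimal usage sketch of the new sibi_dst/utils/business_days.py module. It is not part of the published diff; the holiday dates, column names, and inline result comments are illustrative only, and the import path assumes the module location shown above.

import pandas as pd
import dask.dataframe as dd
from sibi_dst.utils.business_days import BusinessDays

# Illustrative holiday calendar keyed by year; weekmask '1111100' keeps Mon-Fri.
bd = BusinessDays(
    holiday_list={"2025": ["2025-01-01", "2025-12-25"]},
    weekmask="1111100",
)

# Scalar API
bd.get_business_days_count("2025-01-02", "2025-01-10", inclusive=True)  # 7 business days
bd.add_business_days("2025-01-03", 3)  # numpy.datetime64('2025-01-08')

# Dask API: append computed columns via map_partitions
pdf = pd.DataFrame({
    "opened": ["2025-01-02", "2025-01-06"],
    "closed": ["2025-01-10", None],
    "sla_days": [5, 10],
})
ddf = dd.from_pandas(pdf, npartitions=1)
ddf = bd.calc_business_days_from_df(ddf, "opened", "closed")  # float64 'business_days' (NaN where 'closed' is missing)
ddf = bd.calc_sla_end_date(ddf, "opened", "sla_days")  # datetime64[ns] 'sla_end_date'
print(ddf.compute())

The meta hints passed to map_partitions ("f8" and "datetime64[ns]") tell Dask the output dtypes up front, which matches the float64/NaN and datetime64/NaT behavior documented in the method docstrings.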