sibi-dst 2025.8.1__py3-none-any.whl → 2025.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/__init__.py +3 -2
- sibi_dst/df_helper/_artifact_updater_async.py +238 -0
- sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
- sibi_dst/df_helper/_df_helper.py +1 -1
- sibi_dst/df_helper/_parquet_artifact.py +24 -4
- sibi_dst/df_helper/_parquet_reader.py +9 -10
- sibi_dst/utils/__init__.py +2 -0
- sibi_dst/utils/base.py +153 -224
- sibi_dst/utils/business_days.py +248 -0
- sibi_dst/utils/data_wrapper.py +166 -106
- sibi_dst/utils/date_utils.py +711 -394
- sibi_dst/utils/file_age_checker.py +301 -0
- sibi_dst/utils/periods.py +42 -0
- sibi_dst/utils/update_planner.py +2 -2
- {sibi_dst-2025.8.1.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
- {sibi_dst-2025.8.1.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +17 -13
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -315
- {sibi_dst-2025.8.1.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
sibi_dst/utils/date_utils.py
CHANGED
@@ -1,461 +1,778 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
import datetime
|
4
|
-
|
5
|
-
|
6
|
-
import fsspec
|
7
|
-
import numpy as np
|
8
|
-
import pandas as pd
|
9
|
-
import dask.dataframe as dd
|
10
|
-
from .log_utils import Logger
|
3
|
+
import datetime as dt
|
4
|
+
import re
|
5
|
+
from typing import Callable, Union
|
11
6
|
|
12
7
|
|
13
8
|
class DateUtils:
|
14
9
|
"""
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
:
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
10
|
+
Period resolution & normalization for ETL artifacts.
|
11
|
+
|
12
|
+
Canonical periods:
|
13
|
+
- 'today'
|
14
|
+
- 'current_month'
|
15
|
+
- 'ytd'
|
16
|
+
- 'itd'
|
17
|
+
- 'custom' (requires 'start_on' and 'end_on')
|
18
|
+
|
19
|
+
Extras:
|
20
|
+
- Register named periods at runtime (register_period)
|
21
|
+
- Register regex-based periods (register_pattern)
|
22
|
+
- Recognize explicit windows: 'YYYY-MM-DD..YYYY-MM-DD'
|
23
|
+
- Accept 'last_N_days' and 'last_N_hours' via default patterns
|
24
|
+
|
25
|
+
All dynamic/custom outputs standardize on:
|
26
|
+
- date windows: 'start_on' / 'end_on' (YYYY-MM-DD or date-like)
|
27
|
+
- time windows: 'start_ts' / 'end_ts' (ISO datetimes)
|
31
28
|
"""
|
32
|
-
_PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]] = {}
|
33
29
|
|
34
|
-
|
35
|
-
|
36
|
-
|
30
|
+
# ---- Dynamic registries ----
|
31
|
+
_PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[dt.date, dt.date]]] = {}
|
32
|
+
_PERIOD_PATTERNS: List[Tuple[re.Pattern[str], Callable[[re.Match[str], dt.datetime], Dict[str, Any]]]] = []
|
37
33
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
34
|
+
_LAST_N_DAYS_RE = re.compile(r"^last_(\d+)_days$")
|
35
|
+
_WINDOW_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})\.\.(\d{4}-\d{2}-\d{2})$")
|
36
|
+
|
37
|
+
# ---------------- Core coercion helpers ----------------
|
38
|
+
|
39
|
+
@staticmethod
|
40
|
+
def _ensure_date(value: Union[str, dt.date, dt.datetime, pd.Timestamp]) -> dt.date:
|
41
|
+
"""Ensure the input is converted to a datetime.date."""
|
42
|
+
if isinstance(value, dt.date) and not isinstance(value, dt.datetime):
|
44
43
|
return value
|
45
|
-
|
44
|
+
if isinstance(value, dt.datetime):
|
46
45
|
return value.date()
|
47
|
-
|
46
|
+
if isinstance(value, pd.Timestamp):
|
48
47
|
return value.to_pydatetime().date()
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
48
|
+
if isinstance(value, str):
|
49
|
+
# Try pandas parser first (robust), then ISO date
|
50
|
+
try:
|
51
|
+
return pd.to_datetime(value, errors="raise").date() # type: ignore[return-value]
|
52
|
+
except Exception:
|
53
|
+
pass
|
54
|
+
try:
|
55
|
+
return dt.date.fromisoformat(value)
|
56
|
+
except Exception:
|
57
|
+
pass
|
58
|
+
raise ValueError(f"Unsupported date format: {value!r}")
|
59
|
+
|
60
|
+
# Public alias (used by others)
|
58
61
|
ensure_date = _ensure_date
|
59
62
|
|
63
|
+
@staticmethod
|
64
|
+
def _ensure_datetime(
|
65
|
+
value: Union[str, dt.date, dt.datetime, pd.Timestamp],
|
66
|
+
tz: dt.tzinfo = dt.timezone.utc,
|
67
|
+
) -> dt.datetime:
|
68
|
+
"""Convert input to timezone-aware datetime (defaults to UTC)."""
|
69
|
+
if isinstance(value, dt.datetime):
|
70
|
+
return value if value.tzinfo else value.replace(tzinfo=tz)
|
71
|
+
if isinstance(value, dt.date):
|
72
|
+
return dt.datetime(value.year, value.month, value.day, tzinfo=tz)
|
73
|
+
if isinstance(value, pd.Timestamp):
|
74
|
+
dtt = value.to_pydatetime()
|
75
|
+
return dtt if dtt.tzinfo else dtt.replace(tzinfo=tz)
|
76
|
+
if isinstance(value, str):
|
77
|
+
ts = pd.to_datetime(value, errors="raise", utc=False)
|
78
|
+
dtt = ts.to_pydatetime()
|
79
|
+
return dtt if getattr(dtt, "tzinfo", None) else dtt.replace(tzinfo=tz)
|
80
|
+
raise ValueError(f"Unsupported datetime format: {value!r}")
|
81
|
+
|
82
|
+
# ---------------- Week / Month / Quarter helpers ----------------
|
83
|
+
|
60
84
|
@classmethod
|
61
|
-
def calc_week_range(cls, reference_date: Union[str,
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
reference_date = cls._ensure_date(reference_date)
|
67
|
-
start = reference_date - datetime.timedelta(days=reference_date.weekday())
|
68
|
-
end = start + datetime.timedelta(days=6)
|
85
|
+
def calc_week_range(cls, reference_date: Union[str, dt.date, dt.datetime, pd.Timestamp]) -> Tuple[dt.date, dt.date]:
|
86
|
+
"""Start (Mon) and end (Sun) for the week containing reference_date."""
|
87
|
+
ref = cls._ensure_date(reference_date)
|
88
|
+
start = ref - dt.timedelta(days=ref.weekday())
|
89
|
+
end = start + dt.timedelta(days=6)
|
69
90
|
return start, end
|
70
91
|
|
71
92
|
@staticmethod
|
72
|
-
def get_year_timerange(year: int) -> Tuple[
|
73
|
-
|
74
|
-
Get the start and end dates for a given year.
|
75
|
-
"""
|
76
|
-
return datetime.date(year, 1, 1), datetime.date(year, 12, 31)
|
93
|
+
def get_year_timerange(year: int) -> Tuple[dt.date, dt.date]:
|
94
|
+
return dt.date(year, 1, 1), dt.date(year, 12, 31)
|
77
95
|
|
78
96
|
@classmethod
|
79
|
-
def get_first_day_of_the_quarter(cls, reference_date: Union[
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
"""
|
84
|
-
reference_date = cls._ensure_date(reference_date)
|
85
|
-
quarter = (reference_date.month - 1) // 3 + 1
|
86
|
-
return datetime.date(reference_date.year, 3 * quarter - 2, 1)
|
97
|
+
def get_first_day_of_the_quarter(cls, reference_date: Union[str, dt.date, dt.datetime, pd.Timestamp]) -> dt.date:
|
98
|
+
ref = cls._ensure_date(reference_date)
|
99
|
+
quarter = (ref.month - 1) // 3 + 1
|
100
|
+
return dt.date(ref.year, 3 * quarter - 2, 1)
|
87
101
|
|
88
102
|
@classmethod
|
89
|
-
def get_last_day_of_the_quarter(cls, reference_date: Union[
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
reference_date = cls._ensure_date(reference_date)
|
95
|
-
quarter = (reference_date.month - 1) // 3 + 1
|
96
|
-
first_day_of_next_quarter = datetime.date(reference_date.year, 3 * quarter + 1, 1)
|
97
|
-
return first_day_of_next_quarter - datetime.timedelta(days=1)
|
103
|
+
def get_last_day_of_the_quarter(cls, reference_date: Union[str, dt.date, dt.datetime, pd.Timestamp]) -> dt.date:
|
104
|
+
ref = cls._ensure_date(reference_date)
|
105
|
+
quarter = (ref.month - 1) // 3 + 1
|
106
|
+
first_day_next_q = dt.date(ref.year, 3 * quarter + 1, 1)
|
107
|
+
return first_day_next_q - dt.timedelta(days=1)
|
98
108
|
|
99
109
|
@classmethod
|
100
|
-
def get_month_range(cls, n: int = 0) -> Tuple[
|
110
|
+
def get_month_range(cls, n: int = 0) -> Tuple[dt.date, dt.date]:
|
101
111
|
"""
|
102
|
-
|
112
|
+
Range for current month (n=0) or +/- n months relative to today.
|
113
|
+
If n == 0, end is today. Otherwise end is calendar month end.
|
103
114
|
"""
|
104
|
-
today =
|
115
|
+
today = dt.date.today()
|
105
116
|
target_month = (today.month - 1 + n) % 12 + 1
|
106
117
|
target_year = today.year + (today.month - 1 + n) // 12
|
107
|
-
start =
|
118
|
+
start = dt.date(target_year, target_month, 1)
|
108
119
|
if n == 0:
|
109
120
|
return start, today
|
110
121
|
next_month = (target_month % 12) + 1
|
111
122
|
next_year = target_year + (target_month == 12)
|
112
|
-
end =
|
123
|
+
end = dt.date(next_year, next_month, 1) - dt.timedelta(days=1)
|
113
124
|
return start, end
|
114
125
|
|
126
|
+
# ---------------- Period registration ----------------
|
127
|
+
|
115
128
|
@classmethod
|
116
|
-
def register_period(cls, name: str, func: Callable[[], Tuple[
|
129
|
+
def register_period(cls, name: str, func: Callable[[], Tuple[dt.date, dt.date]]) -> None:
|
117
130
|
"""
|
118
|
-
Dynamically register a new period
|
131
|
+
Dynamically register a new named period.
|
132
|
+
The callable must return (start_date, end_date) as datetime.date values.
|
119
133
|
"""
|
120
134
|
cls._PERIOD_FUNCTIONS[name] = func
|
121
135
|
|
122
136
|
@classmethod
|
123
|
-
def
|
137
|
+
def register_pattern(
|
138
|
+
cls,
|
139
|
+
pattern: str | re.Pattern[str],
|
140
|
+
resolver: Callable[[re.Match[str], dt.datetime], Dict[str, Any]],
|
141
|
+
) -> None:
|
124
142
|
"""
|
125
|
-
|
143
|
+
Register a regex-based dynamic period.
|
144
|
+
|
145
|
+
The resolver receives:
|
146
|
+
- match: regex match object
|
147
|
+
- now: timezone-aware datetime (UTC by default)
|
148
|
+
|
149
|
+
It must return a dict with optional keys:
|
150
|
+
- 'canonical' : str (defaults to 'custom')
|
151
|
+
- 'start_on'/'end_on' : ISO date strings (YYYY-MM-DD) OR
|
152
|
+
- 'start_ts'/'end_ts' : ISO datetime strings
|
153
|
+
- any additional per-period params
|
126
154
|
"""
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
raise ValueError(f"Unknown period '{period}'. Available periods: {list(period_functions.keys())}")
|
132
|
-
return period_functions[period]()
|
155
|
+
compiled = re.compile(pattern) if isinstance(pattern, str) else pattern
|
156
|
+
cls._PERIOD_PATTERNS.append((compiled, resolver))
|
157
|
+
|
158
|
+
# ---------------- Default named periods ----------------
|
133
159
|
|
134
160
|
@classmethod
|
135
|
-
def _get_default_periods(cls) -> Dict[str, Callable[[], Tuple[
|
136
|
-
|
137
|
-
Get default period functions.
|
138
|
-
"""
|
139
|
-
today = datetime.date.today
|
161
|
+
def _get_default_periods(cls) -> Dict[str, Callable[[], Tuple[dt.date, dt.date]]]:
|
162
|
+
today = dt.date.today
|
140
163
|
return {
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
cls.get_first_day_of_the_quarter(today()),
|
151
|
-
|
164
|
+
"today": lambda: (today(), today()),
|
165
|
+
"yesterday": lambda: (today() - dt.timedelta(days=1), today() - dt.timedelta(days=1)),
|
166
|
+
"current_week": lambda: cls.calc_week_range(today()),
|
167
|
+
"last_week": lambda: cls.calc_week_range(today() - dt.timedelta(days=7)),
|
168
|
+
"current_month": lambda: cls.get_month_range(n=0),
|
169
|
+
"last_month": lambda: cls.get_month_range(n=-1),
|
170
|
+
"current_year": lambda: cls.get_year_timerange(today().year),
|
171
|
+
"last_year": lambda: cls.get_year_timerange(today().year - 1),
|
172
|
+
"current_quarter": lambda: (
|
173
|
+
cls.get_first_day_of_the_quarter(today()),
|
174
|
+
cls.get_last_day_of_the_quarter(today()),
|
175
|
+
),
|
176
|
+
"ytd": lambda: (dt.date(today().year, 1, 1), today()),
|
177
|
+
"itd": lambda: (dt.date(1900, 1, 1), today()),
|
152
178
|
}
|
153
179
|
|
180
|
+
@classmethod
|
181
|
+
def period_keys(cls) -> Iterable[str]:
|
182
|
+
"""List available named periods (defaults + registered)."""
|
183
|
+
d = dict(cls._get_default_periods())
|
184
|
+
d.update(cls._PERIOD_FUNCTIONS)
|
185
|
+
return d.keys()
|
154
186
|
|
155
|
-
|
156
|
-
def __init__(self, debug=False, logger=None):
|
157
|
-
self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
158
|
-
self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
|
159
|
-
def is_file_older_than(
|
160
|
-
self,
|
161
|
-
file_path: str,
|
162
|
-
max_age_minutes: int,
|
163
|
-
fs: Optional[fsspec.AbstractFileSystem] = None,
|
164
|
-
ignore_missing: bool = False,
|
165
|
-
verbose: bool = False,
|
166
|
-
) -> bool:
|
167
|
-
"""
|
168
|
-
Check if a file or directory is older than the specified max_age_minutes.
|
169
|
-
|
170
|
-
:param file_path: Path to the file or directory.
|
171
|
-
:param max_age_minutes: Maximum allowed age in minutes.
|
172
|
-
:param fs: Filesystem object. Defaults to local filesystem.
|
173
|
-
:param ignore_missing: Treat missing paths as not old if True.
|
174
|
-
:param verbose: Enable detailed logging.
|
175
|
-
:return: True if older than max_age_minutes, False otherwise.
|
176
|
-
"""
|
177
|
-
fs = fs or fsspec.filesystem("file")
|
178
|
-
self.logger.debug(f"Checking age for {file_path}...")
|
179
|
-
|
180
|
-
try:
|
181
|
-
if not fs.exists(file_path):
|
182
|
-
self.logger.debug(f"Path not found: {file_path}.")
|
183
|
-
return not ignore_missing
|
184
|
-
|
185
|
-
if fs.isdir(file_path):
|
186
|
-
self.logger.debug(f"Found directory: {file_path}")
|
187
|
-
age = self._get_directory_age_minutes(file_path, fs, verbose)
|
188
|
-
elif fs.isfile(file_path):
|
189
|
-
age = self._get_file_age_minutes(file_path, fs, verbose)
|
190
|
-
else:
|
191
|
-
self.logger.warning(f"Path {file_path} is neither file nor directory.")
|
192
|
-
return True
|
193
|
-
|
194
|
-
return age > max_age_minutes
|
195
|
-
|
196
|
-
except Exception as e:
|
197
|
-
self.logger.warning(f"Error checking {file_path}: {str(e)}")
|
198
|
-
return True
|
199
|
-
|
200
|
-
def get_file_or_dir_age_minutes(
|
201
|
-
self,
|
202
|
-
file_path: str,
|
203
|
-
fs: Optional[fsspec.AbstractFileSystem] = None,
|
204
|
-
) -> float:
|
205
|
-
"""
|
206
|
-
Get age of file/directory in minutes. Returns infinity for errors/missing paths.
|
187
|
+
# ---------------- Flexible resolver ----------------
|
207
188
|
|
208
|
-
|
209
|
-
|
210
|
-
|
189
|
+
@classmethod
|
190
|
+
def resolve_period(
|
191
|
+
cls,
|
192
|
+
period: Optional[str] = None,
|
193
|
+
*,
|
194
|
+
now: Optional[dt.datetime] = None,
|
195
|
+
tz: dt.tzinfo = dt.timezone.utc,
|
196
|
+
**overrides: Any,
|
197
|
+
) -> Tuple[str, Dict[str, Any]]:
|
211
198
|
"""
|
212
|
-
|
213
|
-
try:
|
214
|
-
if not fs.exists(file_path):
|
215
|
-
self.logger.debug(f"Path not found: {file_path}")
|
216
|
-
return float("inf")
|
217
|
-
|
218
|
-
if fs.isdir(file_path):
|
219
|
-
return self._get_directory_age_minutes(file_path, fs, verbose=False)
|
220
|
-
if fs.isfile(file_path):
|
221
|
-
return self._get_file_age_minutes(file_path, fs, verbose=False)
|
222
|
-
|
223
|
-
self.logger.warning(f"Invalid path type: {file_path}")
|
224
|
-
return float("inf")
|
225
|
-
|
226
|
-
except Exception as e:
|
227
|
-
self.logger.warning(f"Error getting age for {file_path}: {str(e)}")
|
228
|
-
return float("inf")
|
229
|
-
|
230
|
-
def _get_directory_age_minutes(
|
231
|
-
self,
|
232
|
-
dir_path: str,
|
233
|
-
fs: fsspec.AbstractFileSystem,
|
234
|
-
verbose: bool,
|
235
|
-
) -> float:
|
236
|
-
"""Calculate age of oldest file in directory."""
|
237
|
-
try:
|
238
|
-
all_files = fs.ls(dir_path)
|
239
|
-
except Exception as e:
|
240
|
-
self.logger.warning(f"Error listing {dir_path}: {str(e)}")
|
241
|
-
return float("inf")
|
242
|
-
|
243
|
-
if not all_files:
|
244
|
-
self.logger.debug(f"Empty directory: {dir_path}")
|
245
|
-
return float("inf")
|
246
|
-
|
247
|
-
modification_times = []
|
248
|
-
for file in all_files:
|
249
|
-
try:
|
250
|
-
info = fs.info(file)
|
251
|
-
mod_time = self._get_modification_time(info, file)
|
252
|
-
modification_times.append(mod_time)
|
253
|
-
except Exception as e:
|
254
|
-
self.logger.warning(f"Skipping {file}: {str(e)}")
|
255
|
-
|
256
|
-
if not modification_times:
|
257
|
-
self.logger.warning(f"No valid files in {dir_path}")
|
258
|
-
return float("inf")
|
259
|
-
|
260
|
-
oldest = min(modification_times)
|
261
|
-
age = (datetime.datetime.now(datetime.timezone.utc) - oldest).total_seconds() / 60
|
262
|
-
self.logger.debug(f"Oldest in {dir_path}: {age:.2f} minutes")
|
263
|
-
|
264
|
-
return age
|
265
|
-
|
266
|
-
def _get_file_age_minutes(
|
267
|
-
self,
|
268
|
-
file_path: str,
|
269
|
-
fs: fsspec.AbstractFileSystem,
|
270
|
-
verbose: bool,
|
271
|
-
) -> float:
|
272
|
-
"""Calculate file age in minutes."""
|
273
|
-
try:
|
274
|
-
info = fs.info(file_path)
|
275
|
-
mod_time = self._get_modification_time(info, file_path)
|
276
|
-
age = (datetime.datetime.now(datetime.timezone.utc) - mod_time).total_seconds() / 60
|
277
|
-
|
278
|
-
if verbose:
|
279
|
-
self.logger.debug(f"{file_path} info: {info}")
|
280
|
-
self.logger.debug(f"File age: {age:.2f} minutes")
|
281
|
-
|
282
|
-
return age
|
283
|
-
|
284
|
-
except Exception as e:
|
285
|
-
self.logger.warning(f"Error processing {file_path}: {str(e)}")
|
286
|
-
return float("inf")
|
287
|
-
|
288
|
-
def _get_modification_time(self, info: Dict, file_path: str) -> datetime.datetime:
|
289
|
-
"""Extract modification time from filesystem info with timezone awareness."""
|
290
|
-
try:
|
291
|
-
if "LastModified" in info: # S3-like
|
292
|
-
lm = info["LastModified"]
|
293
|
-
return lm if isinstance(lm, datetime.datetime) else datetime.datetime.fromisoformat(
|
294
|
-
lm[:-1]).astimezone()
|
295
|
-
|
296
|
-
if "mtime" in info: # Local filesystem
|
297
|
-
return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
|
298
|
-
|
299
|
-
if "modified" in info: # FTP/SSH
|
300
|
-
return datetime.datetime.strptime(
|
301
|
-
info["modified"], "%Y-%m-%d %H:%M:%S"
|
302
|
-
).replace(tzinfo=datetime.timezone.utc)
|
303
|
-
|
304
|
-
raise KeyError("No valid modification time key found")
|
305
|
-
|
306
|
-
except (KeyError, ValueError) as e:
|
307
|
-
self.logger.warning(f"Invalid mod time for {file_path}: {str(e)}")
|
308
|
-
raise ValueError(f"Unsupported modification time format for {file_path}") from e
|
309
|
-
|
310
|
-
|
311
|
-
# --- Vectorized Helper Functions ---
|
312
|
-
|
313
|
-
def _vectorized_busday_count(partition, begin_col, end_col, holidays):
|
314
|
-
"""
|
315
|
-
Calculates the number of business days between a start and end date.
|
316
|
-
"""
|
317
|
-
# Extract the raw columns
|
318
|
-
start_dates_raw = partition[begin_col]
|
319
|
-
end_dates_raw = partition[end_col]
|
320
|
-
|
199
|
+
Resolve a period into (canonical_key, params).
|
321
200
|
|
322
|
-
|
323
|
-
|
201
|
+
Priority:
|
202
|
+
1) exact named period (default + registered)
|
203
|
+
2) registered regex patterns (e.g., 'last_7_days', 'last_36_hours')
|
204
|
+
3) explicit window 'YYYY-MM-DD..YYYY-MM-DD'
|
205
|
+
4) fallback: pass the period verbatim with just overrides
|
324
206
|
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
# Perform the vectorized calculation only on the valid subset
|
332
|
-
# Convert to NumPy arrays of date type for the calculation
|
333
|
-
result.loc[valid_mask] = np.busday_count(
|
334
|
-
start_dates[valid_mask].values.astype('datetime64[D]'),
|
335
|
-
end_dates[valid_mask].values.astype('datetime64[D]'),
|
336
|
-
holidays=holidays
|
337
|
-
)
|
338
|
-
|
339
|
-
return result
|
207
|
+
Returns:
|
208
|
+
- canonical_key: e.g., 'today', 'current_month', or 'custom'
|
209
|
+
- params: dict containing computed keys and merged overrides
|
210
|
+
"""
|
211
|
+
key = (period or "today").strip()
|
212
|
+
now = (now or dt.datetime.now(tz)).astimezone(tz)
|
340
213
|
|
214
|
+
# 1) named periods
|
215
|
+
period_functions = cls._get_default_periods()
|
216
|
+
period_functions.update(cls._PERIOD_FUNCTIONS)
|
217
|
+
if key in period_functions:
|
218
|
+
start, end = period_functions[key]()
|
219
|
+
params = {"start_on": start.isoformat(), "end_on": end.isoformat()}
|
220
|
+
params.update(overrides)
|
221
|
+
return key, params
|
222
|
+
|
223
|
+
# 2) regex patterns (user-registered)
|
224
|
+
for patt, resolver in cls._PERIOD_PATTERNS:
|
225
|
+
m = patt.fullmatch(key)
|
226
|
+
if m:
|
227
|
+
out = resolver(m, now)
|
228
|
+
canonical = out.get("canonical", "custom")
|
229
|
+
params = {k: v for k, v in out.items() if k != "canonical"}
|
230
|
+
params.update(overrides)
|
231
|
+
return canonical, params
|
232
|
+
|
233
|
+
# 2b) default 'last_N_days'
|
234
|
+
m = cls._LAST_N_DAYS_RE.match(key)
|
235
|
+
if m:
|
236
|
+
days = int(m.group(1))
|
237
|
+
end = now.date()
|
238
|
+
start = (now - dt.timedelta(days=days)).date()
|
239
|
+
params = {"start_on": start.isoformat(), "end_on": end.isoformat()}
|
240
|
+
params.update(overrides)
|
241
|
+
return "custom", params
|
242
|
+
|
243
|
+
# 3) explicit date window: YYYY-MM-DD..YYYY-MM-DD
|
244
|
+
m2 = cls._WINDOW_RE.fullmatch(key)
|
245
|
+
if m2:
|
246
|
+
start_on, end_on = m2.group(1), m2.group(2)
|
247
|
+
params = {"start_on": start_on, "end_on": end_on}
|
248
|
+
params.update(overrides)
|
249
|
+
return "custom", params
|
250
|
+
|
251
|
+
# 4) fallback (unknown key)
|
252
|
+
return key, dict(overrides)
|
253
|
+
|
254
|
+
# ---------------- Backward-compatible API ----------------
|
341
255
|
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
# Extract the relevant columns as pandas Series
|
347
|
-
start_dates_raw = partition[start_col]
|
348
|
-
sla_days = partition[n_days_col]
|
256
|
+
@classmethod
|
257
|
+
def parse_period(cls, **kwargs: Any) -> Tuple[dt.date, dt.date]:
|
258
|
+
"""
|
259
|
+
Return (start_date, end_date) as datetime.date.
|
349
260
|
|
261
|
+
Accepts:
|
262
|
+
- period='today' | 'current_month' | 'last_7_days' | 'YYYY-MM-DD..YYYY-MM-DD' | ...
|
263
|
+
- optional overrides (e.g., start_on/end_on for 'custom')
|
264
|
+
"""
|
265
|
+
period = kwargs.setdefault("period", "today")
|
350
266
|
|
351
|
-
|
267
|
+
# Try named periods first
|
268
|
+
period_functions = cls._get_default_periods()
|
269
|
+
period_functions.update(cls._PERIOD_FUNCTIONS)
|
270
|
+
if period in period_functions:
|
271
|
+
return period_functions[period]()
|
352
272
|
|
353
|
-
|
354
|
-
|
273
|
+
# Otherwise, resolve and coerce
|
274
|
+
canonical, params = cls.resolve_period(period, **kwargs)
|
355
275
|
|
356
|
-
|
357
|
-
|
276
|
+
if "start_on" in params and "end_on" in params:
|
277
|
+
start = cls._ensure_date(params["start_on"])
|
278
|
+
end = cls._ensure_date(params["end_on"])
|
279
|
+
return start, end
|
358
280
|
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
sla_days[valid_mask].astype(int), # Ensure days are integers
|
364
|
-
roll='forward',
|
365
|
-
holidays=holidays
|
366
|
-
)
|
281
|
+
if "start_ts" in params and "end_ts" in params:
|
282
|
+
sdt = cls._ensure_datetime(params["start_ts"]).date()
|
283
|
+
edt = cls._ensure_datetime(params["end_ts"]).date()
|
284
|
+
return sdt, edt
|
367
285
|
|
368
|
-
|
286
|
+
raise ValueError(
|
287
|
+
f"Could not derive date range from period '{period}' (canonical='{canonical}'). "
|
288
|
+
f"Params: {params}"
|
289
|
+
)
|
369
290
|
|
370
291
|
|
371
|
-
#
|
292
|
+
# ---------------- Default dynamic patterns registration ----------------
|
372
293
|
|
373
|
-
|
294
|
+
def _register_default_patterns() -> None:
|
374
295
|
"""
|
375
|
-
|
376
|
-
|
296
|
+
Register common dynamic patterns:
|
297
|
+
- last_{n}_hours (ISO datetimes; useful for freshness windows)
|
377
298
|
"""
|
378
299
|
|
379
|
-
def
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
end_date: str | datetime.date | pd.Timestamp,
|
391
|
-
) -> int:
|
392
|
-
"""Scalar method to count business days between two dates."""
|
393
|
-
begin = pd.to_datetime(begin_date)
|
394
|
-
end = pd.to_datetime(end_date)
|
395
|
-
return int(np.busday_count(begin.date(), end.date(), holidays=list(self.holidays)))
|
396
|
-
|
397
|
-
def calc_business_days_from_df(
|
398
|
-
self,
|
399
|
-
df: dd.DataFrame,
|
400
|
-
begin_date_col: str,
|
401
|
-
end_date_col: str,
|
402
|
-
result_col: str = "business_days",
|
403
|
-
) -> dd.DataFrame:
|
404
|
-
"""Calculates business days between two columns in a Dask DataFrame."""
|
405
|
-
missing = {begin_date_col, end_date_col} - set(df.columns)
|
406
|
-
if missing:
|
407
|
-
self.logger.error(f"Missing columns: {missing}")
|
408
|
-
raise ValueError("Required columns are missing from DataFrame")
|
409
|
-
|
410
|
-
return df.assign(
|
411
|
-
**{result_col: df.map_partitions(
|
412
|
-
_vectorized_busday_count,
|
413
|
-
begin_col=begin_date_col,
|
414
|
-
end_col=end_date_col,
|
415
|
-
holidays=list(self.holidays),
|
416
|
-
meta=(result_col, 'f8') # f8 is float64
|
417
|
-
)}
|
418
|
-
)
|
419
|
-
|
420
|
-
def add_business_days(
|
421
|
-
self,
|
422
|
-
start_date: str | datetime.date | pd.Timestamp,
|
423
|
-
n_days: int,
|
424
|
-
) -> np.datetime64:
|
425
|
-
"""Scalar method to add N business days to a start date."""
|
426
|
-
start = pd.to_datetime(start_date)
|
427
|
-
return np.busday_offset(
|
428
|
-
start.date(),
|
429
|
-
n_days,
|
430
|
-
roll='forward',
|
431
|
-
holidays=list(self.holidays),
|
432
|
-
)
|
433
|
-
|
434
|
-
def calc_sla_end_date(
|
435
|
-
self,
|
436
|
-
df: dd.DataFrame,
|
437
|
-
start_date_col: str,
|
438
|
-
n_days_col: str,
|
439
|
-
result_col: str = "sla_end_date",
|
440
|
-
) -> dd.DataFrame:
|
441
|
-
"""Calculates an SLA end date column for a Dask DataFrame."""
|
442
|
-
missing = {start_date_col, n_days_col} - set(df.columns)
|
443
|
-
if missing:
|
444
|
-
self.logger.error(f"Missing columns: {missing}")
|
445
|
-
raise ValueError("Required columns are missing from DataFrame")
|
446
|
-
|
447
|
-
return df.assign(
|
448
|
-
**{result_col: df.map_partitions(
|
449
|
-
_vectorized_sla_end_date,
|
450
|
-
start_col=start_date_col,
|
451
|
-
n_days_col=n_days_col,
|
452
|
-
holidays=list(self.holidays),
|
453
|
-
meta=(result_col, 'datetime64[ns]')
|
454
|
-
)}
|
455
|
-
)
|
300
|
+
def last_x_hours(match: re.Match[str], now: dt.datetime) -> Dict[str, Any]:
    """Resolve a ``last_<N>_hours`` pattern into a custom start/end window.

    The window ends at *now* and starts N hours earlier; both bounds are
    returned as ISO-8601 strings. A freshness default is derived from the
    window size (10 minutes per hour, clamped to the range [15, 240]).
    """
    window_hours = int(match.group(1))
    window_start = now - dt.timedelta(hours=window_hours)
    # Sensible default that callers can override:
    freshness_minutes = max(15, min(window_hours * 10, 240))
    return {
        "canonical": "custom",
        "start_ts": window_start.isoformat(),
        "end_ts": now.isoformat(),
        "max_age_minutes": freshness_minutes,
    }
|
456
311
|
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
#
|
461
|
-
|
312
|
+
DateUtils.register_pattern(r"last_(\d+)_hours", last_x_hours)
|
313
|
+
|
314
|
+
|
315
|
+
# Register defaults at import time
|
316
|
+
_register_default_patterns()
|
317
|
+
|
318
|
+
# from __future__ import annotations
|
319
|
+
#
|
320
|
+
# import datetime
|
321
|
+
# from typing import Union, Tuple, Callable, Dict, Optional
|
322
|
+
#
|
323
|
+
# import fsspec
|
324
|
+
# import numpy as np
|
325
|
+
# import pandas as pd
|
326
|
+
# import dask.dataframe as dd
|
327
|
+
# from .log_utils import Logger
|
328
|
+
#
|
329
|
+
#
|
330
|
+
# class DateUtils:
|
331
|
+
# """
|
332
|
+
# Utility class for date-related operations.
|
333
|
+
#
|
334
|
+
# The DateUtils class provides a variety of operations to manipulate and retrieve
|
335
|
+
# information about dates, such as calculating week ranges, determining start or
|
336
|
+
# end dates for specific periods (quarters, months, years), and dynamically
|
337
|
+
# registering custom time period functions. It also supports parsing specific
|
338
|
+
# periods for date range computations and ensuring the input date is correctly
|
339
|
+
# converted to the desired format.
|
340
|
+
#
|
341
|
+
# :ivar logger: Logger instance used for logging messages. Defaults to the logger
|
342
|
+
# for the current class if not provided.
|
343
|
+
# :type logger: Logger
|
344
|
+
#
|
345
|
+
# :ivar _PERIOD_FUNCTIONS: Stores dynamically registered period functions that
|
346
|
+
# return start and end dates.
|
347
|
+
# :type _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]]
|
348
|
+
# """
|
349
|
+
# _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]] = {}
|
350
|
+
#
|
351
|
+
# def __init__(self, logger=None, debug=False):
|
352
|
+
# self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
353
|
+
# self.debug = debug
|
354
|
+
#
|
355
|
+
# @classmethod
|
356
|
+
# def _ensure_date(cls, value: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
|
357
|
+
# """
|
358
|
+
# Ensure the input is converted to a datetime.date object.
|
359
|
+
# """
|
360
|
+
# if isinstance(value, datetime.date) and not isinstance(value, datetime.datetime):
|
361
|
+
# return value
|
362
|
+
# elif isinstance(value, datetime.datetime):
|
363
|
+
# return value.date()
|
364
|
+
# elif isinstance(value, pd.Timestamp):
|
365
|
+
# return value.to_pydatetime().date()
|
366
|
+
# elif isinstance(value, str):
|
367
|
+
# for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
|
368
|
+
# try:
|
369
|
+
# return datetime.datetime.strptime(value, fmt).date()
|
370
|
+
# except ValueError:
|
371
|
+
# continue
|
372
|
+
# raise ValueError(f"Unsupported date format: {value}")
|
373
|
+
#
|
374
|
+
# # Public alias to access _ensure_date from other classes
|
375
|
+
# ensure_date = _ensure_date
|
376
|
+
#
|
377
|
+
# @classmethod
|
378
|
+
# def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[
|
379
|
+
# datetime.date, datetime.date]:
|
380
|
+
# """
|
381
|
+
# Calculate the start and end of the week for a given reference date.
|
382
|
+
# """
|
383
|
+
# reference_date = cls._ensure_date(reference_date)
|
384
|
+
# start = reference_date - datetime.timedelta(days=reference_date.weekday())
|
385
|
+
# end = start + datetime.timedelta(days=6)
|
386
|
+
# return start, end
|
387
|
+
#
|
388
|
+
# @staticmethod
|
389
|
+
# def get_year_timerange(year: int) -> Tuple[datetime.date, datetime.date]:
|
390
|
+
# """
|
391
|
+
# Get the start and end dates for a given year.
|
392
|
+
# """
|
393
|
+
# return datetime.date(year, 1, 1), datetime.date(year, 12, 31)
|
394
|
+
#
|
395
|
+
# @classmethod
|
396
|
+
# def get_first_day_of_the_quarter(cls, reference_date: Union[
|
397
|
+
# str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
|
398
|
+
# """
|
399
|
+
# Get the first day of the quarter for a given date.
|
400
|
+
# """
|
401
|
+
# reference_date = cls._ensure_date(reference_date)
|
402
|
+
# quarter = (reference_date.month - 1) // 3 + 1
|
403
|
+
# return datetime.date(reference_date.year, 3 * quarter - 2, 1)
|
404
|
+
#
|
405
|
+
# @classmethod
|
406
|
+
# def get_last_day_of_the_quarter(cls, reference_date: Union[
|
407
|
+
# str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
|
408
|
+
# """
|
409
|
+
# Get the last day of the quarter for a given date.
|
410
|
+
# """
|
411
|
+
# reference_date = cls._ensure_date(reference_date)
|
412
|
+
# quarter = (reference_date.month - 1) // 3 + 1
|
413
|
+
# first_day_of_next_quarter = datetime.date(reference_date.year, 3 * quarter + 1, 1)
|
414
|
+
# return first_day_of_next_quarter - datetime.timedelta(days=1)
|
415
|
+
#
|
416
|
+
# @classmethod
|
417
|
+
# def get_month_range(cls, n: int = 0) -> Tuple[datetime.date, datetime.date]:
|
418
|
+
# """
|
419
|
+
# Get the date range for the current month or the month `n` months in the past or future.
|
420
|
+
# """
|
421
|
+
# today = datetime.date.today()
|
422
|
+
# target_month = (today.month - 1 + n) % 12 + 1
|
423
|
+
# target_year = today.year + (today.month - 1 + n) // 12
|
424
|
+
# start = datetime.date(target_year, target_month, 1)
|
425
|
+
# if n == 0:
|
426
|
+
# return start, today
|
427
|
+
# next_month = (target_month % 12) + 1
|
428
|
+
# next_year = target_year + (target_month == 12)
|
429
|
+
# end = datetime.date(next_year, next_month, 1) - datetime.timedelta(days=1)
|
430
|
+
# return start, end
|
431
|
+
#
|
432
|
+
# @classmethod
|
433
|
+
# def register_period(cls, name: str, func: Callable[[], Tuple[datetime.date, datetime.date]]):
|
434
|
+
# """
|
435
|
+
# Dynamically register a new period function.
|
436
|
+
# """
|
437
|
+
# cls._PERIOD_FUNCTIONS[name] = func
|
438
|
+
#
|
439
|
+
# @classmethod
|
440
|
+
# def parse_period(cls, **kwargs) -> Tuple[datetime.date, datetime.date]:
|
441
|
+
# """
|
442
|
+
# Parse the period keyword to determine the start and end date for date range operations.
|
443
|
+
# """
|
444
|
+
# period = kwargs.setdefault('period', 'today')
|
445
|
+
# period_functions = cls._get_default_periods()
|
446
|
+
# period_functions.update(cls._PERIOD_FUNCTIONS)
|
447
|
+
# if period not in period_functions:
|
448
|
+
# raise ValueError(f"Unknown period '{period}'. Available periods: {list(period_functions.keys())}")
|
449
|
+
# return period_functions[period]()
|
450
|
+
#
|
451
|
+
# @classmethod
|
452
|
+
# def _get_default_periods(cls) -> Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]]:
|
453
|
+
# """
|
454
|
+
# Get default period functions.
|
455
|
+
# """
|
456
|
+
# today = datetime.date.today
|
457
|
+
# return {
|
458
|
+
# 'today': lambda: (today(), today()),
|
459
|
+
# 'yesterday': lambda: (today() - datetime.timedelta(days=1), today() - datetime.timedelta(days=1)),
|
460
|
+
# 'current_week': lambda: cls.calc_week_range(today()),
|
461
|
+
# 'last_week': lambda: cls.calc_week_range(today() - datetime.timedelta(days=7)),
|
462
|
+
# 'current_month': lambda: cls.get_month_range(n=0),
|
463
|
+
# 'last_month': lambda: cls.get_month_range(n=-1),
|
464
|
+
# 'current_year': lambda: cls.get_year_timerange(today().year),
|
465
|
+
# 'last_year': lambda: cls.get_year_timerange(today().year - 1),
|
466
|
+
# 'current_quarter': lambda: (
|
467
|
+
# cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
|
468
|
+
# 'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
|
469
|
+
# }
|
470
|
+
#
|
471
|
+
#
|
472
|
+
# class FileAgeChecker:
|
473
|
+
# def __init__(self, debug=False, logger=None):
|
474
|
+
# self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
475
|
+
# self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
|
476
|
+
# def is_file_older_than(
|
477
|
+
# self,
|
478
|
+
# file_path: str,
|
479
|
+
# max_age_minutes: int,
|
480
|
+
# fs: Optional[fsspec.AbstractFileSystem] = None,
|
481
|
+
# ignore_missing: bool = False,
|
482
|
+
# verbose: bool = False,
|
483
|
+
# ) -> bool:
|
484
|
+
# """
|
485
|
+
# Check if a file or directory is older than the specified max_age_minutes.
|
486
|
+
#
|
487
|
+
# :param file_path: Path to the file or directory.
|
488
|
+
# :param max_age_minutes: Maximum allowed age in minutes.
|
489
|
+
# :param fs: Filesystem object. Defaults to local filesystem.
|
490
|
+
# :param ignore_missing: Treat missing paths as not old if True.
|
491
|
+
# :param verbose: Enable detailed logging.
|
492
|
+
# :return: True if older than max_age_minutes, False otherwise.
|
493
|
+
# """
|
494
|
+
# fs = fs or fsspec.filesystem("file")
|
495
|
+
# self.logger.debug(f"Checking age for {file_path}...")
|
496
|
+
#
|
497
|
+
# try:
|
498
|
+
# if not fs.exists(file_path):
|
499
|
+
# self.logger.debug(f"Path not found: {file_path}.")
|
500
|
+
# return not ignore_missing
|
501
|
+
#
|
502
|
+
# if fs.isdir(file_path):
|
503
|
+
# self.logger.debug(f"Found directory: {file_path}")
|
504
|
+
# age = self._get_directory_age_minutes(file_path, fs, verbose)
|
505
|
+
# elif fs.isfile(file_path):
|
506
|
+
# age = self._get_file_age_minutes(file_path, fs, verbose)
|
507
|
+
# else:
|
508
|
+
# self.logger.warning(f"Path {file_path} is neither file nor directory.")
|
509
|
+
# return True
|
510
|
+
#
|
511
|
+
# return age > max_age_minutes
|
512
|
+
#
|
513
|
+
# except Exception as e:
|
514
|
+
# self.logger.warning(f"Error checking {file_path}: {str(e)}")
|
515
|
+
# return True
|
516
|
+
#
|
517
|
+
# def get_file_or_dir_age_minutes(
|
518
|
+
# self,
|
519
|
+
# file_path: str,
|
520
|
+
# fs: Optional[fsspec.AbstractFileSystem] = None,
|
521
|
+
# ) -> float:
|
522
|
+
# """
|
523
|
+
# Get age of file/directory in minutes. Returns infinity for errors/missing paths.
|
524
|
+
#
|
525
|
+
# :param file_path: Path to check.
|
526
|
+
# :param fs: Filesystem object. Defaults to local filesystem.
|
527
|
+
# :return: Age in minutes or infinity if unavailable.
|
528
|
+
# """
|
529
|
+
# fs = fs or fsspec.filesystem("file")
|
530
|
+
# try:
|
531
|
+
# if not fs.exists(file_path):
|
532
|
+
# self.logger.debug(f"Path not found: {file_path}")
|
533
|
+
# return float("inf")
|
534
|
+
#
|
535
|
+
# if fs.isdir(file_path):
|
536
|
+
# return self._get_directory_age_minutes(file_path, fs, verbose=False)
|
537
|
+
# if fs.isfile(file_path):
|
538
|
+
# return self._get_file_age_minutes(file_path, fs, verbose=False)
|
539
|
+
#
|
540
|
+
# self.logger.warning(f"Invalid path type: {file_path}")
|
541
|
+
# return float("inf")
|
542
|
+
#
|
543
|
+
# except Exception as e:
|
544
|
+
# self.logger.warning(f"Error getting age for {file_path}: {str(e)}")
|
545
|
+
# return float("inf")
|
546
|
+
#
|
547
|
+
# def _get_directory_age_minutes(
|
548
|
+
# self,
|
549
|
+
# dir_path: str,
|
550
|
+
# fs: fsspec.AbstractFileSystem,
|
551
|
+
# verbose: bool,
|
552
|
+
# ) -> float:
|
553
|
+
# """Calculate age of oldest file in directory."""
|
554
|
+
# try:
|
555
|
+
# all_files = fs.ls(dir_path)
|
556
|
+
# except Exception as e:
|
557
|
+
# self.logger.warning(f"Error listing {dir_path}: {str(e)}")
|
558
|
+
# return float("inf")
|
559
|
+
#
|
560
|
+
# if not all_files:
|
561
|
+
# self.logger.debug(f"Empty directory: {dir_path}")
|
562
|
+
# return float("inf")
|
563
|
+
#
|
564
|
+
# modification_times = []
|
565
|
+
# for file in all_files:
|
566
|
+
# try:
|
567
|
+
# info = fs.info(file)
|
568
|
+
# mod_time = self._get_modification_time(info, file)
|
569
|
+
# modification_times.append(mod_time)
|
570
|
+
# except Exception as e:
|
571
|
+
# self.logger.warning(f"Skipping {file}: {str(e)}")
|
572
|
+
#
|
573
|
+
# if not modification_times:
|
574
|
+
# self.logger.warning(f"No valid files in {dir_path}")
|
575
|
+
# return float("inf")
|
576
|
+
#
|
577
|
+
# oldest = min(modification_times)
|
578
|
+
# age = (datetime.datetime.now(datetime.timezone.utc) - oldest).total_seconds() / 60
|
579
|
+
# self.logger.debug(f"Oldest in {dir_path}: {age:.2f} minutes")
|
580
|
+
#
|
581
|
+
# return age
|
582
|
+
#
|
583
|
+
# def _get_file_age_minutes(
|
584
|
+
# self,
|
585
|
+
# file_path: str,
|
586
|
+
# fs: fsspec.AbstractFileSystem,
|
587
|
+
# verbose: bool,
|
588
|
+
# ) -> float:
|
589
|
+
# """Calculate file age in minutes."""
|
590
|
+
# try:
|
591
|
+
# info = fs.info(file_path)
|
592
|
+
# mod_time = self._get_modification_time(info, file_path)
|
593
|
+
# age = (datetime.datetime.now(datetime.timezone.utc) - mod_time).total_seconds() / 60
|
594
|
+
#
|
595
|
+
# if verbose:
|
596
|
+
# self.logger.debug(f"{file_path} info: {info}")
|
597
|
+
# self.logger.debug(f"File age: {age:.2f} minutes")
|
598
|
+
#
|
599
|
+
# return age
|
600
|
+
#
|
601
|
+
# except Exception as e:
|
602
|
+
# self.logger.warning(f"Error processing {file_path}: {str(e)}")
|
603
|
+
# return float("inf")
|
604
|
+
#
|
605
|
+
# def _get_modification_time(self, info: Dict, file_path: str) -> datetime.datetime:
|
606
|
+
# """Extract modification time from filesystem info with timezone awareness."""
|
607
|
+
# try:
|
608
|
+
# if "LastModified" in info: # S3-like
|
609
|
+
# lm = info["LastModified"]
|
610
|
+
# return lm if isinstance(lm, datetime.datetime) else datetime.datetime.fromisoformat(
|
611
|
+
# lm[:-1]).astimezone()
|
612
|
+
#
|
613
|
+
# if "mtime" in info: # Local filesystem
|
614
|
+
# return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
|
615
|
+
#
|
616
|
+
# if "modified" in info: # FTP/SSH
|
617
|
+
# return datetime.datetime.strptime(
|
618
|
+
# info["modified"], "%Y-%m-%d %H:%M:%S"
|
619
|
+
# ).replace(tzinfo=datetime.timezone.utc)
|
620
|
+
#
|
621
|
+
# raise KeyError("No valid modification time key found")
|
622
|
+
#
|
623
|
+
# except (KeyError, ValueError) as e:
|
624
|
+
# self.logger.warning(f"Invalid mod time for {file_path}: {str(e)}")
|
625
|
+
# raise ValueError(f"Unsupported modification time format for {file_path}") from e
|
626
|
+
#
|
627
|
+
#
|
628
|
+
# # --- Vectorized Helper Functions ---
|
629
|
+
#
|
630
|
+
# def _vectorized_busday_count(partition, begin_col, end_col, holidays):
|
631
|
+
# """
|
632
|
+
# Calculates the number of business days between a start and end date.
|
633
|
+
# """
|
634
|
+
# # Extract the raw columns
|
635
|
+
# start_dates_raw = partition[begin_col]
|
636
|
+
# end_dates_raw = partition[end_col]
|
637
|
+
#
|
638
|
+
#
|
639
|
+
# start_dates = pd.to_datetime(start_dates_raw, errors='coerce')
|
640
|
+
# end_dates = pd.to_datetime(end_dates_raw, errors='coerce')
|
641
|
+
#
|
642
|
+
# # Initialize the result Series with NaN, as the output is a number
|
643
|
+
# result = pd.Series(np.nan, index=partition.index)
|
644
|
+
#
|
645
|
+
# # Create a mask for rows where both start and end dates are valid
|
646
|
+
# valid_mask = pd.notna(start_dates) & pd.notna(end_dates)
|
647
|
+
#
|
648
|
+
# # Perform the vectorized calculation only on the valid subset
|
649
|
+
# # Convert to NumPy arrays of date type for the calculation
|
650
|
+
# result.loc[valid_mask] = np.busday_count(
|
651
|
+
# start_dates[valid_mask].values.astype('datetime64[D]'),
|
652
|
+
# end_dates[valid_mask].values.astype('datetime64[D]'),
|
653
|
+
# holidays=holidays
|
654
|
+
# )
|
655
|
+
#
|
656
|
+
# return result
|
657
|
+
#
|
658
|
+
#
|
659
|
+
# def _vectorized_sla_end_date(partition, start_col, n_days_col, holidays):
|
660
|
+
# """
|
661
|
+
# Calculates the end date of an SLA, skipping weekends and holidays.
|
662
|
+
# """
|
663
|
+
# # Extract the relevant columns as pandas Series
|
664
|
+
# start_dates_raw = partition[start_col]
|
665
|
+
# sla_days = partition[n_days_col]
|
666
|
+
#
|
667
|
+
#
|
668
|
+
# start_dates = pd.to_datetime(start_dates_raw, errors='coerce')
|
669
|
+
#
|
670
|
+
# # Initialize the result Series with NaT (Not a Time)
|
671
|
+
# result = pd.Series(pd.NaT, index=partition.index, dtype='datetime64[ns]')
|
672
|
+
#
|
673
|
+
# # Create a mask for rows that have valid start dates and SLA days
|
674
|
+
# valid_mask = pd.notna(start_dates) & pd.notna(sla_days)
|
675
|
+
#
|
676
|
+
# # Perform the vectorized calculation only on the valid subset
|
677
|
+
# # Note: np.busday_offset requires a NumPy array, so we use .values
|
678
|
+
# result.loc[valid_mask] = np.busday_offset(
|
679
|
+
# start_dates[valid_mask].values.astype('datetime64[D]'), # Convert to numpy array of dates
|
680
|
+
# sla_days[valid_mask].astype(int), # Ensure days are integers
|
681
|
+
# roll='forward',
|
682
|
+
# holidays=holidays
|
683
|
+
# )
|
684
|
+
#
|
685
|
+
# return result
|
686
|
+
#
|
687
|
+
#
|
688
|
+
# # --- Refactored BusinessDays Class ---
|
689
|
+
#
|
690
|
+
# class BusinessDays:
|
691
|
+
# """
|
692
|
+
# Business days calculations with a custom holiday list.
|
693
|
+
# Supports scalar and efficient, vectorized Dask DataFrame operations.
|
694
|
+
# """
|
695
|
+
#
|
696
|
+
# def __init__(self, holiday_list: dict[str, list[str]], logger) -> None:
|
697
|
+
# self.logger = logger
|
698
|
+
# self.HOLIDAY_LIST = holiday_list
|
699
|
+
#
|
700
|
+
# # Flatten and store as tuple for determinism
|
701
|
+
# bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
|
702
|
+
# self.holidays = tuple(bd_holidays)
|
703
|
+
#
|
704
|
+
# def get_business_days_count(
|
705
|
+
# self,
|
706
|
+
# begin_date: str | datetime.date | pd.Timestamp,
|
707
|
+
# end_date: str | datetime.date | pd.Timestamp,
|
708
|
+
# ) -> int:
|
709
|
+
# """Scalar method to count business days between two dates."""
|
710
|
+
# begin = pd.to_datetime(begin_date)
|
711
|
+
# end = pd.to_datetime(end_date)
|
712
|
+
# return int(np.busday_count(begin.date(), end.date(), holidays=list(self.holidays)))
|
713
|
+
#
|
714
|
+
# def calc_business_days_from_df(
|
715
|
+
# self,
|
716
|
+
# df: dd.DataFrame,
|
717
|
+
# begin_date_col: str,
|
718
|
+
# end_date_col: str,
|
719
|
+
# result_col: str = "business_days",
|
720
|
+
# ) -> dd.DataFrame:
|
721
|
+
# """Calculates business days between two columns in a Dask DataFrame."""
|
722
|
+
# missing = {begin_date_col, end_date_col} - set(df.columns)
|
723
|
+
# if missing:
|
724
|
+
# self.logger.error(f"Missing columns: {missing}")
|
725
|
+
# raise ValueError("Required columns are missing from DataFrame")
|
726
|
+
#
|
727
|
+
# return df.assign(
|
728
|
+
# **{result_col: df.map_partitions(
|
729
|
+
# _vectorized_busday_count,
|
730
|
+
# begin_col=begin_date_col,
|
731
|
+
# end_col=end_date_col,
|
732
|
+
# holidays=list(self.holidays),
|
733
|
+
# meta=(result_col, 'f8') # f8 is float64
|
734
|
+
# )}
|
735
|
+
# )
|
736
|
+
#
|
737
|
+
# def add_business_days(
|
738
|
+
# self,
|
739
|
+
# start_date: str | datetime.date | pd.Timestamp,
|
740
|
+
# n_days: int,
|
741
|
+
# ) -> np.datetime64:
|
742
|
+
# """Scalar method to add N business days to a start date."""
|
743
|
+
# start = pd.to_datetime(start_date)
|
744
|
+
# return np.busday_offset(
|
745
|
+
# start.date(),
|
746
|
+
# n_days,
|
747
|
+
# roll='forward',
|
748
|
+
# holidays=list(self.holidays),
|
749
|
+
# )
|
750
|
+
#
|
751
|
+
# def calc_sla_end_date(
|
752
|
+
# self,
|
753
|
+
# df: dd.DataFrame,
|
754
|
+
# start_date_col: str,
|
755
|
+
# n_days_col: str,
|
756
|
+
# result_col: str = "sla_end_date",
|
757
|
+
# ) -> dd.DataFrame:
|
758
|
+
# """Calculates an SLA end date column for a Dask DataFrame."""
|
759
|
+
# missing = {start_date_col, n_days_col} - set(df.columns)
|
760
|
+
# if missing:
|
761
|
+
# self.logger.error(f"Missing columns: {missing}")
|
762
|
+
# raise ValueError("Required columns are missing from DataFrame")
|
763
|
+
#
|
764
|
+
# return df.assign(
|
765
|
+
# **{result_col: df.map_partitions(
|
766
|
+
# _vectorized_sla_end_date,
|
767
|
+
# start_col=start_date_col,
|
768
|
+
# n_days_col=n_days_col,
|
769
|
+
# holidays=list(self.holidays),
|
770
|
+
# meta=(result_col, 'datetime64[ns]')
|
771
|
+
# )}
|
772
|
+
# )
|
773
|
+
#
|
774
|
+
# # Class enhancements
|
775
|
+
# # DateUtils.register_period('next_week', lambda: (datetime.date.today() + datetime.timedelta(days=7),
|
776
|
+
# # datetime.date.today() + datetime.timedelta(days=13)))
|
777
|
+
# # start, end = DateUtils.parse_period(period='next_week')
|
778
|
+
# # print(f"Next Week: {start} to {end}")
|