sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/__init__.py +3 -2
  3. sibi_dst/df_helper/_artifact_updater_async.py +238 -0
  4. sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
  5. sibi_dst/df_helper/_df_helper.py +418 -118
  6. sibi_dst/df_helper/_parquet_artifact.py +275 -283
  7. sibi_dst/df_helper/_parquet_reader.py +9 -10
  8. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  9. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  10. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  12. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  13. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  14. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  15. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  16. sibi_dst/utils/__init__.py +2 -0
  17. sibi_dst/utils/base.py +235 -100
  18. sibi_dst/utils/business_days.py +248 -0
  19. sibi_dst/utils/clickhouse_writer.py +472 -206
  20. sibi_dst/utils/data_utils.py +139 -186
  21. sibi_dst/utils/data_wrapper.py +392 -88
  22. sibi_dst/utils/date_utils.py +711 -393
  23. sibi_dst/utils/df_utils.py +193 -213
  24. sibi_dst/utils/file_age_checker.py +301 -0
  25. sibi_dst/utils/file_utils.py +3 -2
  26. sibi_dst/utils/filepath_generator.py +314 -152
  27. sibi_dst/utils/log_utils.py +581 -242
  28. sibi_dst/utils/manifest_manager.py +60 -76
  29. sibi_dst/utils/parquet_saver.py +33 -27
  30. sibi_dst/utils/periods.py +42 -0
  31. sibi_dst/utils/phone_formatter.py +88 -95
  32. sibi_dst/utils/update_planner.py +180 -178
  33. sibi_dst/utils/webdav_client.py +116 -166
  34. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
  35. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
  36. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
  37. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
sibi_dst/utils/date_utils.py
@@ -1,460 +1,778 @@
1
1
  from __future__ import annotations
2
2
 
3
- import datetime
4
- from typing import Union, Tuple, Callable, Dict, Optional
5
-
6
- import fsspec
7
- import numpy as np
8
- import pandas as pd
9
- import dask.dataframe as dd
10
- from .log_utils import Logger
3
+ import datetime as dt
4
+ import re
5
+ from typing import Callable, Union
11
6
 
12
7
 
13
8
  class DateUtils:
14
9
  """
15
- Utility class for date-related operations.
16
-
17
- The DateUtils class provides a variety of operations to manipulate and retrieve
18
- information about dates, such as calculating week ranges, determining start or
19
- end dates for specific periods (quarters, months, years), and dynamically
20
- registering custom time period functions. It also supports parsing specific
21
- periods for date range computations and ensuring the input date is correctly
22
- converted to the desired format.
23
-
24
- :ivar logger: Logger instance used for logging messages. Defaults to the logger
25
- for the current class if not provided.
26
- :type logger: Logger
27
-
28
- :ivar _PERIOD_FUNCTIONS: Stores dynamically registered period functions that
29
- return start and end dates.
30
- :type _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]]
10
+ Period resolution & normalization for ETL artifacts.
11
+
12
+ Canonical periods:
13
+ - 'today'
14
+ - 'current_month'
15
+ - 'ytd'
16
+ - 'itd'
17
+ - 'custom' (requires 'start_on' and 'end_on')
18
+
19
+ Extras:
20
+ - Register named periods at runtime (register_period)
21
+ - Register regex-based periods (register_pattern)
22
+ - Recognize explicit windows: 'YYYY-MM-DD..YYYY-MM-DD'
23
+ - Accept 'last_N_days' and 'last_N_hours' via default patterns
24
+
25
+ All dynamic/custom outputs standardize on:
26
+ - date windows: 'start_on' / 'end_on' (YYYY-MM-DD or date-like)
27
+ - time windows: 'start_ts' / 'end_ts' (ISO datetimes)
31
28
  """
32
- _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]] = {}
33
29
 
34
- def __init__(self, logger=None, debug=False):
35
- self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
36
- self.debug = debug
30
+ # ---- Dynamic registries ----
31
+ _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[dt.date, dt.date]]] = {}
32
+ _PERIOD_PATTERNS: List[Tuple[re.Pattern[str], Callable[[re.Match[str], dt.datetime], Dict[str, Any]]]] = []
37
33
 
38
- @classmethod
39
- def _ensure_date(cls, value: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
40
- """
41
- Ensure the input is converted to a datetime.date object.
42
- """
43
- if isinstance(value, datetime.date) and not isinstance(value, datetime.datetime):
34
+ _LAST_N_DAYS_RE = re.compile(r"^last_(\d+)_days$")
35
+ _WINDOW_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})\.\.(\d{4}-\d{2}-\d{2})$")
36
+
37
+ # ---------------- Core coercion helpers ----------------
38
+
39
+ @staticmethod
40
+ def _ensure_date(value: Union[str, dt.date, dt.datetime, pd.Timestamp]) -> dt.date:
41
+ """Ensure the input is converted to a datetime.date."""
42
+ if isinstance(value, dt.date) and not isinstance(value, dt.datetime):
44
43
  return value
45
- elif isinstance(value, datetime.datetime):
44
+ if isinstance(value, dt.datetime):
46
45
  return value.date()
47
- elif isinstance(value, pd.Timestamp):
46
+ if isinstance(value, pd.Timestamp):
48
47
  return value.to_pydatetime().date()
49
- elif isinstance(value, str):
50
- for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
51
- try:
52
- return datetime.datetime.strptime(value, fmt).date()
53
- except ValueError:
54
- continue
55
- raise ValueError(f"Unsupported date format: {value}")
56
-
57
- # Public alias to access _ensure_date from other classes
48
+ if isinstance(value, str):
49
+ # Try pandas parser first (robust), then ISO date
50
+ try:
51
+ return pd.to_datetime(value, errors="raise").date() # type: ignore[return-value]
52
+ except Exception:
53
+ pass
54
+ try:
55
+ return dt.date.fromisoformat(value)
56
+ except Exception:
57
+ pass
58
+ raise ValueError(f"Unsupported date format: {value!r}")
59
+
60
+ # Public alias (used by others)
58
61
  ensure_date = _ensure_date
59
62
 
63
+ @staticmethod
64
+ def _ensure_datetime(
65
+ value: Union[str, dt.date, dt.datetime, pd.Timestamp],
66
+ tz: dt.tzinfo = dt.timezone.utc,
67
+ ) -> dt.datetime:
68
+ """Convert input to timezone-aware datetime (defaults to UTC)."""
69
+ if isinstance(value, dt.datetime):
70
+ return value if value.tzinfo else value.replace(tzinfo=tz)
71
+ if isinstance(value, dt.date):
72
+ return dt.datetime(value.year, value.month, value.day, tzinfo=tz)
73
+ if isinstance(value, pd.Timestamp):
74
+ dtt = value.to_pydatetime()
75
+ return dtt if dtt.tzinfo else dtt.replace(tzinfo=tz)
76
+ if isinstance(value, str):
77
+ ts = pd.to_datetime(value, errors="raise", utc=False)
78
+ dtt = ts.to_pydatetime()
79
+ return dtt if getattr(dtt, "tzinfo", None) else dtt.replace(tzinfo=tz)
80
+ raise ValueError(f"Unsupported datetime format: {value!r}")
81
+
82
+ # ---------------- Week / Month / Quarter helpers ----------------
83
+
60
84
  @classmethod
61
- def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[
62
- datetime.date, datetime.date]:
63
- """
64
- Calculate the start and end of the week for a given reference date.
65
- """
66
- reference_date = cls._ensure_date(reference_date)
67
- start = reference_date - datetime.timedelta(days=reference_date.weekday())
68
- end = start + datetime.timedelta(days=6)
85
+ def calc_week_range(cls, reference_date: Union[str, dt.date, dt.datetime, pd.Timestamp]) -> Tuple[dt.date, dt.date]:
86
+ """Start (Mon) and end (Sun) for the week containing reference_date."""
87
+ ref = cls._ensure_date(reference_date)
88
+ start = ref - dt.timedelta(days=ref.weekday())
89
+ end = start + dt.timedelta(days=6)
69
90
  return start, end
70
91
 
71
92
  @staticmethod
72
- def get_year_timerange(year: int) -> Tuple[datetime.date, datetime.date]:
73
- """
74
- Get the start and end dates for a given year.
75
- """
76
- return datetime.date(year, 1, 1), datetime.date(year, 12, 31)
93
+ def get_year_timerange(year: int) -> Tuple[dt.date, dt.date]:
94
+ return dt.date(year, 1, 1), dt.date(year, 12, 31)
77
95
 
78
96
  @classmethod
79
- def get_first_day_of_the_quarter(cls, reference_date: Union[
80
- str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
81
- """
82
- Get the first day of the quarter for a given date.
83
- """
84
- reference_date = cls._ensure_date(reference_date)
85
- quarter = (reference_date.month - 1) // 3 + 1
86
- return datetime.date(reference_date.year, 3 * quarter - 2, 1)
97
+ def get_first_day_of_the_quarter(cls, reference_date: Union[str, dt.date, dt.datetime, pd.Timestamp]) -> dt.date:
98
+ ref = cls._ensure_date(reference_date)
99
+ quarter = (ref.month - 1) // 3 + 1
100
+ return dt.date(ref.year, 3 * quarter - 2, 1)
87
101
 
88
102
  @classmethod
89
- def get_last_day_of_the_quarter(cls, reference_date: Union[
90
- str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
91
- """
92
- Get the last day of the quarter for a given date.
93
- """
94
- reference_date = cls._ensure_date(reference_date)
95
- quarter = (reference_date.month - 1) // 3 + 1
96
- first_day_of_next_quarter = datetime.date(reference_date.year, 3 * quarter + 1, 1)
97
- return first_day_of_next_quarter - datetime.timedelta(days=1)
103
+ def get_last_day_of_the_quarter(cls, reference_date: Union[str, dt.date, dt.datetime, pd.Timestamp]) -> dt.date:
104
+ ref = cls._ensure_date(reference_date)
105
+ quarter = (ref.month - 1) // 3 + 1
106
+ first_day_next_q = dt.date(ref.year, 3 * quarter + 1, 1)
107
+ return first_day_next_q - dt.timedelta(days=1)
98
108
 
99
109
  @classmethod
100
- def get_month_range(cls, n: int = 0) -> Tuple[datetime.date, datetime.date]:
110
+ def get_month_range(cls, n: int = 0) -> Tuple[dt.date, dt.date]:
101
111
  """
102
- Get the date range for the current month or the month `n` months in the past or future.
112
+ Range for current month (n=0) or +/- n months relative to today.
113
+ If n == 0, end is today. Otherwise end is calendar month end.
103
114
  """
104
- today = datetime.date.today()
115
+ today = dt.date.today()
105
116
  target_month = (today.month - 1 + n) % 12 + 1
106
117
  target_year = today.year + (today.month - 1 + n) // 12
107
- start = datetime.date(target_year, target_month, 1)
118
+ start = dt.date(target_year, target_month, 1)
108
119
  if n == 0:
109
120
  return start, today
110
121
  next_month = (target_month % 12) + 1
111
122
  next_year = target_year + (target_month == 12)
112
- end = datetime.date(next_year, next_month, 1) - datetime.timedelta(days=1)
123
+ end = dt.date(next_year, next_month, 1) - dt.timedelta(days=1)
113
124
  return start, end
114
125
 
126
+ # ---------------- Period registration ----------------
127
+
115
128
  @classmethod
116
- def register_period(cls, name: str, func: Callable[[], Tuple[datetime.date, datetime.date]]):
129
+ def register_period(cls, name: str, func: Callable[[], Tuple[dt.date, dt.date]]) -> None:
117
130
  """
118
- Dynamically register a new period function.
131
+ Dynamically register a new named period.
132
+ The callable must return (start_date, end_date) as datetime.date values.
119
133
  """
120
134
  cls._PERIOD_FUNCTIONS[name] = func
121
135
 
122
136
  @classmethod
123
- def parse_period(cls, **kwargs) -> Tuple[datetime.date, datetime.date]:
137
+ def register_pattern(
138
+ cls,
139
+ pattern: str | re.Pattern[str],
140
+ resolver: Callable[[re.Match[str], dt.datetime], Dict[str, Any]],
141
+ ) -> None:
124
142
  """
125
- Parse the period keyword to determine the start and end date for date range operations.
143
+ Register a regex-based dynamic period.
144
+
145
+ The resolver receives:
146
+ - match: regex match object
147
+ - now: timezone-aware datetime (UTC by default)
148
+
149
+ It must return a dict with optional keys:
150
+ - 'canonical' : str (defaults to 'custom')
151
+ - 'start_on'/'end_on' : ISO date strings (YYYY-MM-DD) OR
152
+ - 'start_ts'/'end_ts' : ISO datetime strings
153
+ - any additional per-period params
126
154
  """
127
- period = kwargs.setdefault('period', 'today')
128
- period_functions = cls._get_default_periods()
129
- period_functions.update(cls._PERIOD_FUNCTIONS)
130
- if period not in period_functions:
131
- raise ValueError(f"Unknown period '{period}'. Available periods: {list(period_functions.keys())}")
132
- return period_functions[period]()
155
+ compiled = re.compile(pattern) if isinstance(pattern, str) else pattern
156
+ cls._PERIOD_PATTERNS.append((compiled, resolver))
157
+
158
+ # ---------------- Default named periods ----------------
133
159
 
134
160
  @classmethod
135
- def _get_default_periods(cls) -> Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]]:
136
- """
137
- Get default period functions.
138
- """
139
- today = datetime.date.today
161
+ def _get_default_periods(cls) -> Dict[str, Callable[[], Tuple[dt.date, dt.date]]]:
162
+ today = dt.date.today
140
163
  return {
141
- 'today': lambda: (today(), today()),
142
- 'yesterday': lambda: (today() - datetime.timedelta(days=1), today() - datetime.timedelta(days=1)),
143
- 'current_week': lambda: cls.calc_week_range(today()),
144
- 'last_week': lambda: cls.calc_week_range(today() - datetime.timedelta(days=7)),
145
- 'current_month': lambda: cls.get_month_range(n=0),
146
- 'last_month': lambda: cls.get_month_range(n=-1),
147
- 'current_year': lambda: cls.get_year_timerange(today().year),
148
- 'current_quarter': lambda: (
149
- cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
150
- 'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
164
+ "today": lambda: (today(), today()),
165
+ "yesterday": lambda: (today() - dt.timedelta(days=1), today() - dt.timedelta(days=1)),
166
+ "current_week": lambda: cls.calc_week_range(today()),
167
+ "last_week": lambda: cls.calc_week_range(today() - dt.timedelta(days=7)),
168
+ "current_month": lambda: cls.get_month_range(n=0),
169
+ "last_month": lambda: cls.get_month_range(n=-1),
170
+ "current_year": lambda: cls.get_year_timerange(today().year),
171
+ "last_year": lambda: cls.get_year_timerange(today().year - 1),
172
+ "current_quarter": lambda: (
173
+ cls.get_first_day_of_the_quarter(today()),
174
+ cls.get_last_day_of_the_quarter(today()),
175
+ ),
176
+ "ytd": lambda: (dt.date(today().year, 1, 1), today()),
177
+ "itd": lambda: (dt.date(1900, 1, 1), today()),
151
178
  }
152
179
 
180
+ @classmethod
181
+ def period_keys(cls) -> Iterable[str]:
182
+ """List available named periods (defaults + registered)."""
183
+ d = dict(cls._get_default_periods())
184
+ d.update(cls._PERIOD_FUNCTIONS)
185
+ return d.keys()
153
186
 
154
- class FileAgeChecker:
155
- def __init__(self, debug=False, logger=None):
156
- self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
157
- self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
158
- def is_file_older_than(
159
- self,
160
- file_path: str,
161
- max_age_minutes: int,
162
- fs: Optional[fsspec.AbstractFileSystem] = None,
163
- ignore_missing: bool = False,
164
- verbose: bool = False,
165
- ) -> bool:
166
- """
167
- Check if a file or directory is older than the specified max_age_minutes.
168
-
169
- :param file_path: Path to the file or directory.
170
- :param max_age_minutes: Maximum allowed age in minutes.
171
- :param fs: Filesystem object. Defaults to local filesystem.
172
- :param ignore_missing: Treat missing paths as not old if True.
173
- :param verbose: Enable detailed logging.
174
- :return: True if older than max_age_minutes, False otherwise.
175
- """
176
- fs = fs or fsspec.filesystem("file")
177
- self.logger.debug(f"Checking age for {file_path}...")
178
-
179
- try:
180
- if not fs.exists(file_path):
181
- self.logger.debug(f"Path not found: {file_path}.")
182
- return not ignore_missing
183
-
184
- if fs.isdir(file_path):
185
- self.logger.debug(f"Found directory: {file_path}")
186
- age = self._get_directory_age_minutes(file_path, fs, verbose)
187
- elif fs.isfile(file_path):
188
- age = self._get_file_age_minutes(file_path, fs, verbose)
189
- else:
190
- self.logger.warning(f"Path {file_path} is neither file nor directory.")
191
- return True
192
-
193
- return age > max_age_minutes
194
-
195
- except Exception as e:
196
- self.logger.warning(f"Error checking {file_path}: {str(e)}")
197
- return True
198
-
199
- def get_file_or_dir_age_minutes(
200
- self,
201
- file_path: str,
202
- fs: Optional[fsspec.AbstractFileSystem] = None,
203
- ) -> float:
204
- """
205
- Get age of file/directory in minutes. Returns infinity for errors/missing paths.
187
+ # ---------------- Flexible resolver ----------------
206
188
 
207
- :param file_path: Path to check.
208
- :param fs: Filesystem object. Defaults to local filesystem.
209
- :return: Age in minutes or infinity if unavailable.
189
+ @classmethod
190
+ def resolve_period(
191
+ cls,
192
+ period: Optional[str] = None,
193
+ *,
194
+ now: Optional[dt.datetime] = None,
195
+ tz: dt.tzinfo = dt.timezone.utc,
196
+ **overrides: Any,
197
+ ) -> Tuple[str, Dict[str, Any]]:
210
198
  """
211
- fs = fs or fsspec.filesystem("file")
212
- try:
213
- if not fs.exists(file_path):
214
- self.logger.debug(f"Path not found: {file_path}")
215
- return float("inf")
216
-
217
- if fs.isdir(file_path):
218
- return self._get_directory_age_minutes(file_path, fs, verbose=False)
219
- if fs.isfile(file_path):
220
- return self._get_file_age_minutes(file_path, fs, verbose=False)
221
-
222
- self.logger.warning(f"Invalid path type: {file_path}")
223
- return float("inf")
224
-
225
- except Exception as e:
226
- self.logger.warning(f"Error getting age for {file_path}: {str(e)}")
227
- return float("inf")
228
-
229
- def _get_directory_age_minutes(
230
- self,
231
- dir_path: str,
232
- fs: fsspec.AbstractFileSystem,
233
- verbose: bool,
234
- ) -> float:
235
- """Calculate age of oldest file in directory."""
236
- try:
237
- all_files = fs.ls(dir_path)
238
- except Exception as e:
239
- self.logger.warning(f"Error listing {dir_path}: {str(e)}")
240
- return float("inf")
241
-
242
- if not all_files:
243
- self.logger.debug(f"Empty directory: {dir_path}")
244
- return float("inf")
245
-
246
- modification_times = []
247
- for file in all_files:
248
- try:
249
- info = fs.info(file)
250
- mod_time = self._get_modification_time(info, file)
251
- modification_times.append(mod_time)
252
- except Exception as e:
253
- self.logger.warning(f"Skipping {file}: {str(e)}")
254
-
255
- if not modification_times:
256
- self.logger.warning(f"No valid files in {dir_path}")
257
- return float("inf")
258
-
259
- oldest = min(modification_times)
260
- age = (datetime.datetime.now(datetime.timezone.utc) - oldest).total_seconds() / 60
261
- self.logger.debug(f"Oldest in {dir_path}: {age:.2f} minutes")
262
-
263
- return age
264
-
265
- def _get_file_age_minutes(
266
- self,
267
- file_path: str,
268
- fs: fsspec.AbstractFileSystem,
269
- verbose: bool,
270
- ) -> float:
271
- """Calculate file age in minutes."""
272
- try:
273
- info = fs.info(file_path)
274
- mod_time = self._get_modification_time(info, file_path)
275
- age = (datetime.datetime.now(datetime.timezone.utc) - mod_time).total_seconds() / 60
276
-
277
- if verbose:
278
- self.logger.debug(f"{file_path} info: {info}")
279
- self.logger.debug(f"File age: {age:.2f} minutes")
280
-
281
- return age
282
-
283
- except Exception as e:
284
- self.logger.warning(f"Error processing {file_path}: {str(e)}")
285
- return float("inf")
286
-
287
- def _get_modification_time(self, info: Dict, file_path: str) -> datetime.datetime:
288
- """Extract modification time from filesystem info with timezone awareness."""
289
- try:
290
- if "LastModified" in info: # S3-like
291
- lm = info["LastModified"]
292
- return lm if isinstance(lm, datetime.datetime) else datetime.datetime.fromisoformat(
293
- lm[:-1]).astimezone()
294
-
295
- if "mtime" in info: # Local filesystem
296
- return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
297
-
298
- if "modified" in info: # FTP/SSH
299
- return datetime.datetime.strptime(
300
- info["modified"], "%Y-%m-%d %H:%M:%S"
301
- ).replace(tzinfo=datetime.timezone.utc)
302
-
303
- raise KeyError("No valid modification time key found")
304
-
305
- except (KeyError, ValueError) as e:
306
- self.logger.warning(f"Invalid mod time for {file_path}: {str(e)}")
307
- raise ValueError(f"Unsupported modification time format for {file_path}") from e
308
-
309
-
310
- # --- Vectorized Helper Functions ---
311
-
312
- def _vectorized_busday_count(partition, begin_col, end_col, holidays):
313
- """
314
- Calculates the number of business days between a start and end date.
315
- """
316
- # Extract the raw columns
317
- start_dates_raw = partition[begin_col]
318
- end_dates_raw = partition[end_col]
319
-
199
+ Resolve a period into (canonical_key, params).
320
200
 
321
- start_dates = pd.to_datetime(start_dates_raw, errors='coerce')
322
- end_dates = pd.to_datetime(end_dates_raw, errors='coerce')
201
+ Priority:
202
+ 1) exact named period (default + registered)
203
+ 2) registered regex patterns (e.g., 'last_7_days', 'last_36_hours')
204
+ 3) explicit window 'YYYY-MM-DD..YYYY-MM-DD'
205
+ 4) fallback: pass the period verbatim with just overrides
323
206
 
324
- # Initialize the result Series with NaN, as the output is a number
325
- result = pd.Series(np.nan, index=partition.index)
326
-
327
- # Create a mask for rows where both start and end dates are valid
328
- valid_mask = pd.notna(start_dates) & pd.notna(end_dates)
329
-
330
- # Perform the vectorized calculation only on the valid subset
331
- # Convert to NumPy arrays of date type for the calculation
332
- result.loc[valid_mask] = np.busday_count(
333
- start_dates[valid_mask].values.astype('datetime64[D]'),
334
- end_dates[valid_mask].values.astype('datetime64[D]'),
335
- holidays=holidays
336
- )
337
-
338
- return result
207
+ Returns:
208
+ - canonical_key: e.g., 'today', 'current_month', or 'custom'
209
+ - params: dict containing computed keys and merged overrides
210
+ """
211
+ key = (period or "today").strip()
212
+ now = (now or dt.datetime.now(tz)).astimezone(tz)
339
213
 
214
+ # 1) named periods
215
+ period_functions = cls._get_default_periods()
216
+ period_functions.update(cls._PERIOD_FUNCTIONS)
217
+ if key in period_functions:
218
+ start, end = period_functions[key]()
219
+ params = {"start_on": start.isoformat(), "end_on": end.isoformat()}
220
+ params.update(overrides)
221
+ return key, params
222
+
223
+ # 2) regex patterns (user-registered)
224
+ for patt, resolver in cls._PERIOD_PATTERNS:
225
+ m = patt.fullmatch(key)
226
+ if m:
227
+ out = resolver(m, now)
228
+ canonical = out.get("canonical", "custom")
229
+ params = {k: v for k, v in out.items() if k != "canonical"}
230
+ params.update(overrides)
231
+ return canonical, params
232
+
233
+ # 2b) default 'last_N_days'
234
+ m = cls._LAST_N_DAYS_RE.match(key)
235
+ if m:
236
+ days = int(m.group(1))
237
+ end = now.date()
238
+ start = (now - dt.timedelta(days=days)).date()
239
+ params = {"start_on": start.isoformat(), "end_on": end.isoformat()}
240
+ params.update(overrides)
241
+ return "custom", params
242
+
243
+ # 3) explicit date window: YYYY-MM-DD..YYYY-MM-DD
244
+ m2 = cls._WINDOW_RE.fullmatch(key)
245
+ if m2:
246
+ start_on, end_on = m2.group(1), m2.group(2)
247
+ params = {"start_on": start_on, "end_on": end_on}
248
+ params.update(overrides)
249
+ return "custom", params
250
+
251
+ # 4) fallback (unknown key)
252
+ return key, dict(overrides)
253
+
254
+ # ---------------- Backward-compatible API ----------------
340
255
 
341
- def _vectorized_sla_end_date(partition, start_col, n_days_col, holidays):
342
- """
343
- Calculates the end date of an SLA, skipping weekends and holidays.
344
- """
345
- # Extract the relevant columns as pandas Series
346
- start_dates_raw = partition[start_col]
347
- sla_days = partition[n_days_col]
256
+ @classmethod
257
+ def parse_period(cls, **kwargs: Any) -> Tuple[dt.date, dt.date]:
258
+ """
259
+ Return (start_date, end_date) as datetime.date.
348
260
 
261
+ Accepts:
262
+ - period='today' | 'current_month' | 'last_7_days' | 'YYYY-MM-DD..YYYY-MM-DD' | ...
263
+ - optional overrides (e.g., start_on/end_on for 'custom')
264
+ """
265
+ period = kwargs.setdefault("period", "today")
349
266
 
350
- start_dates = pd.to_datetime(start_dates_raw, errors='coerce')
267
+ # Try named periods first
268
+ period_functions = cls._get_default_periods()
269
+ period_functions.update(cls._PERIOD_FUNCTIONS)
270
+ if period in period_functions:
271
+ return period_functions[period]()
351
272
 
352
- # Initialize the result Series with NaT (Not a Time)
353
- result = pd.Series(pd.NaT, index=partition.index, dtype='datetime64[ns]')
273
+ # Otherwise, resolve and coerce
274
+ canonical, params = cls.resolve_period(period, **kwargs)
354
275
 
355
- # Create a mask for rows that have valid start dates and SLA days
356
- valid_mask = pd.notna(start_dates) & pd.notna(sla_days)
276
+ if "start_on" in params and "end_on" in params:
277
+ start = cls._ensure_date(params["start_on"])
278
+ end = cls._ensure_date(params["end_on"])
279
+ return start, end
357
280
 
358
- # Perform the vectorized calculation only on the valid subset
359
- # Note: np.busday_offset requires a NumPy array, so we use .values
360
- result.loc[valid_mask] = np.busday_offset(
361
- start_dates[valid_mask].values.astype('datetime64[D]'), # Convert to numpy array of dates
362
- sla_days[valid_mask].astype(int), # Ensure days are integers
363
- roll='forward',
364
- holidays=holidays
365
- )
281
+ if "start_ts" in params and "end_ts" in params:
282
+ sdt = cls._ensure_datetime(params["start_ts"]).date()
283
+ edt = cls._ensure_datetime(params["end_ts"]).date()
284
+ return sdt, edt
366
285
 
367
- return result
286
+ raise ValueError(
287
+ f"Could not derive date range from period '{period}' (canonical='{canonical}'). "
288
+ f"Params: {params}"
289
+ )
368
290
 
369
291
 
370
- # --- Refactored BusinessDays Class ---
292
+ # ---------------- Default dynamic patterns registration ----------------
371
293
 
372
- class BusinessDays:
294
+ def _register_default_patterns() -> None:
373
295
  """
374
- Business days calculations with a custom holiday list.
375
- Supports scalar and efficient, vectorized Dask DataFrame operations.
296
+ Register common dynamic patterns:
297
+ - last_{n}_hours (ISO datetimes; useful for freshness windows)
376
298
  """
377
299
 
378
- def __init__(self, holiday_list: dict[str, list[str]], logger) -> None:
379
- self.logger = logger
380
- self.HOLIDAY_LIST = holiday_list
381
-
382
- # Flatten and store as tuple for determinism
383
- bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
384
- self.holidays = tuple(bd_holidays)
385
-
386
- def get_business_days_count(
387
- self,
388
- begin_date: str | datetime.date | pd.Timestamp,
389
- end_date: str | datetime.date | pd.Timestamp,
390
- ) -> int:
391
- """Scalar method to count business days between two dates."""
392
- begin = pd.to_datetime(begin_date)
393
- end = pd.to_datetime(end_date)
394
- return int(np.busday_count(begin.date(), end.date(), holidays=list(self.holidays)))
395
-
396
- def calc_business_days_from_df(
397
- self,
398
- df: dd.DataFrame,
399
- begin_date_col: str,
400
- end_date_col: str,
401
- result_col: str = "business_days",
402
- ) -> dd.DataFrame:
403
- """Calculates business days between two columns in a Dask DataFrame."""
404
- missing = {begin_date_col, end_date_col} - set(df.columns)
405
- if missing:
406
- self.logger.error(f"Missing columns: {missing}")
407
- raise ValueError("Required columns are missing from DataFrame")
408
-
409
- return df.assign(
410
- **{result_col: df.map_partitions(
411
- _vectorized_busday_count,
412
- begin_col=begin_date_col,
413
- end_col=end_date_col,
414
- holidays=list(self.holidays),
415
- meta=(result_col, 'f8') # f8 is float64
416
- )}
417
- )
418
-
419
- def add_business_days(
420
- self,
421
- start_date: str | datetime.date | pd.Timestamp,
422
- n_days: int,
423
- ) -> np.datetime64:
424
- """Scalar method to add N business days to a start date."""
425
- start = pd.to_datetime(start_date)
426
- return np.busday_offset(
427
- start.date(),
428
- n_days,
429
- roll='forward',
430
- holidays=list(self.holidays),
431
- )
432
-
433
- def calc_sla_end_date(
434
- self,
435
- df: dd.DataFrame,
436
- start_date_col: str,
437
- n_days_col: str,
438
- result_col: str = "sla_end_date",
439
- ) -> dd.DataFrame:
440
- """Calculates an SLA end date column for a Dask DataFrame."""
441
- missing = {start_date_col, n_days_col} - set(df.columns)
442
- if missing:
443
- self.logger.error(f"Missing columns: {missing}")
444
- raise ValueError("Required columns are missing from DataFrame")
445
-
446
- return df.assign(
447
- **{result_col: df.map_partitions(
448
- _vectorized_sla_end_date,
449
- start_col=start_date_col,
450
- n_days_col=n_days_col,
451
- holidays=list(self.holidays),
452
- meta=(result_col, 'datetime64[ns]')
453
- )}
454
- )
300
+ def last_x_hours(match: re.Match[str], now: dt.datetime) -> Dict[str, Any]:
301
+ hours = int(match.group(1))
302
+ end_ts = now
303
+ start_ts = now - dt.timedelta(hours=hours)
304
+ return {
305
+ "canonical": "custom",
306
+ "start_ts": start_ts.isoformat(),
307
+ "end_ts": end_ts.isoformat(),
308
+ # Sensible default that callers can override:
309
+ "max_age_minutes": max(15, min(hours * 10, 240)),
310
+ }
455
311
 
456
- # Class enhancements
457
- # DateUtils.register_period('next_week', lambda: (datetime.date.today() + datetime.timedelta(days=7),
458
- # datetime.date.today() + datetime.timedelta(days=13)))
459
- # start, end = DateUtils.parse_period(period='next_week')
460
- # print(f"Next Week: {start} to {end}")
312
+ DateUtils.register_pattern(r"last_(\d+)_hours", last_x_hours)
313
+
314
+
315
+ # Register defaults at import time
316
+ _register_default_patterns()
317
+
318
+ # from __future__ import annotations
319
+ #
320
+ # import datetime
321
+ # from typing import Union, Tuple, Callable, Dict, Optional
322
+ #
323
+ # import fsspec
324
+ # import numpy as np
325
+ # import pandas as pd
326
+ # import dask.dataframe as dd
327
+ # from .log_utils import Logger
328
+ #
329
+ #
330
+ # class DateUtils:
331
+ # """
332
+ # Utility class for date-related operations.
333
+ #
334
+ # The DateUtils class provides a variety of operations to manipulate and retrieve
335
+ # information about dates, such as calculating week ranges, determining start or
336
+ # end dates for specific periods (quarters, months, years), and dynamically
337
+ # registering custom time period functions. It also supports parsing specific
338
+ # periods for date range computations and ensuring the input date is correctly
339
+ # converted to the desired format.
340
+ #
341
+ # :ivar logger: Logger instance used for logging messages. Defaults to the logger
342
+ # for the current class if not provided.
343
+ # :type logger: Logger
344
+ #
345
+ # :ivar _PERIOD_FUNCTIONS: Stores dynamically registered period functions that
346
+ # return start and end dates.
347
+ # :type _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]]
348
+ # """
349
+ # _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]] = {}
350
+ #
351
+ # def __init__(self, logger=None, debug=False):
352
+ # self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
353
+ # self.debug = debug
354
+ #
355
+ # @classmethod
356
+ # def _ensure_date(cls, value: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
357
+ # """
358
+ # Ensure the input is converted to a datetime.date object.
359
+ # """
360
+ # if isinstance(value, datetime.date) and not isinstance(value, datetime.datetime):
361
+ # return value
362
+ # elif isinstance(value, datetime.datetime):
363
+ # return value.date()
364
+ # elif isinstance(value, pd.Timestamp):
365
+ # return value.to_pydatetime().date()
366
+ # elif isinstance(value, str):
367
+ # for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
368
+ # try:
369
+ # return datetime.datetime.strptime(value, fmt).date()
370
+ # except ValueError:
371
+ # continue
372
+ # raise ValueError(f"Unsupported date format: {value}")
373
+ #
374
+ # # Public alias to access _ensure_date from other classes
375
+ # ensure_date = _ensure_date
376
+ #
377
+ # @classmethod
378
+ # def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[
379
+ # datetime.date, datetime.date]:
380
+ # """
381
+ # Calculate the start and end of the week for a given reference date.
382
+ # """
383
+ # reference_date = cls._ensure_date(reference_date)
384
+ # start = reference_date - datetime.timedelta(days=reference_date.weekday())
385
+ # end = start + datetime.timedelta(days=6)
386
+ # return start, end
387
+ #
388
+ # @staticmethod
389
+ # def get_year_timerange(year: int) -> Tuple[datetime.date, datetime.date]:
390
+ # """
391
+ # Get the start and end dates for a given year.
392
+ # """
393
+ # return datetime.date(year, 1, 1), datetime.date(year, 12, 31)
394
+ #
395
+ # @classmethod
396
+ # def get_first_day_of_the_quarter(cls, reference_date: Union[
397
+ # str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
398
+ # """
399
+ # Get the first day of the quarter for a given date.
400
+ # """
401
+ # reference_date = cls._ensure_date(reference_date)
402
+ # quarter = (reference_date.month - 1) // 3 + 1
403
+ # return datetime.date(reference_date.year, 3 * quarter - 2, 1)
404
+ #
405
+ # @classmethod
406
+ # def get_last_day_of_the_quarter(cls, reference_date: Union[
407
+ # str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
408
+ # """
409
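+ # Get the last day of the quarter for a given date. NOTE: for fourth-quarter dates the code below builds datetime.date(year, 13, 1), which raises ValueError.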
410
+ # """
411
+ # reference_date = cls._ensure_date(reference_date)
412
+ # quarter = (reference_date.month - 1) // 3 + 1
413
+ # first_day_of_next_quarter = datetime.date(reference_date.year, 3 * quarter + 1, 1)
414
+ # return first_day_of_next_quarter - datetime.timedelta(days=1)
415
+ #
416
+ # @classmethod
417
+ # def get_month_range(cls, n: int = 0) -> Tuple[datetime.date, datetime.date]:
418
+ # """
419
+ # Get the date range for the month `n` months away from the current one (negative `n` = past). For `n == 0` the range ends today (month-to-date), not at the month's last day.
420
+ # """
421
+ # today = datetime.date.today()
422
+ # target_month = (today.month - 1 + n) % 12 + 1
423
+ # target_year = today.year + (today.month - 1 + n) // 12
424
+ # start = datetime.date(target_year, target_month, 1)
425
+ # if n == 0:
426
+ # return start, today
427
+ # next_month = (target_month % 12) + 1
428
+ # next_year = target_year + (target_month == 12)
429
+ # end = datetime.date(next_year, next_month, 1) - datetime.timedelta(days=1)
430
+ # return start, end
431
+ #
432
+ # @classmethod
433
+ # def register_period(cls, name: str, func: Callable[[], Tuple[datetime.date, datetime.date]]):
434
+ # """
435
+ # Dynamically register a new period function.
436
+ # """
437
+ # cls._PERIOD_FUNCTIONS[name] = func
438
+ #
439
+ # @classmethod
440
+ # def parse_period(cls, **kwargs) -> Tuple[datetime.date, datetime.date]:
441
+ # """
442
+ # Parse the period keyword to determine the start and end date for date range operations.
443
+ # """
444
+ # period = kwargs.setdefault('period', 'today')
445
+ # period_functions = cls._get_default_periods()
446
+ # period_functions.update(cls._PERIOD_FUNCTIONS)
447
+ # if period not in period_functions:
448
+ # raise ValueError(f"Unknown period '{period}'. Available periods: {list(period_functions.keys())}")
449
+ # return period_functions[period]()
450
+ #
451
+ # @classmethod
452
+ # def _get_default_periods(cls) -> Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]]:
453
+ # """
454
+ # Get default period functions.
455
+ # """
456
+ # today = datetime.date.today
457
+ # return {
458
+ # 'today': lambda: (today(), today()),
459
+ # 'yesterday': lambda: (today() - datetime.timedelta(days=1), today() - datetime.timedelta(days=1)),
460
+ # 'current_week': lambda: cls.calc_week_range(today()),
461
+ # 'last_week': lambda: cls.calc_week_range(today() - datetime.timedelta(days=7)),
462
+ # 'current_month': lambda: cls.get_month_range(n=0),
463
+ # 'last_month': lambda: cls.get_month_range(n=-1),
464
+ # 'current_year': lambda: cls.get_year_timerange(today().year),
465
+ # 'last_year': lambda: cls.get_year_timerange(today().year - 1),
466
+ # 'current_quarter': lambda: (
467
+ # cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
468
+ # 'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
469
+ # }
470
+ #
471
+ #
472
+ # class FileAgeChecker:
473
+ # def __init__(self, debug=False, logger=None):
474
+ # self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
475
+ # self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
476
+ # def is_file_older_than(
477
+ # self,
478
+ # file_path: str,
479
+ # max_age_minutes: int,
480
+ # fs: Optional[fsspec.AbstractFileSystem] = None,
481
+ # ignore_missing: bool = False,
482
+ # verbose: bool = False,
483
+ # ) -> bool:
484
+ # """
485
+ # Check if a file or directory is older than the specified max_age_minutes.
486
+ #
487
+ # :param file_path: Path to the file or directory.
488
+ # :param max_age_minutes: Maximum allowed age in minutes.
489
+ # :param fs: Filesystem object. Defaults to local filesystem.
490
+ # :param ignore_missing: Treat missing paths as not old if True.
491
+ # :param verbose: Enable detailed logging.
492
+ # :return: True if older than max_age_minutes, False otherwise.
493
+ # """
494
+ # fs = fs or fsspec.filesystem("file")
495
+ # self.logger.debug(f"Checking age for {file_path}...")
496
+ #
497
+ # try:
498
+ # if not fs.exists(file_path):
499
+ # self.logger.debug(f"Path not found: {file_path}.")
500
+ # return not ignore_missing
501
+ #
502
+ # if fs.isdir(file_path):
503
+ # self.logger.debug(f"Found directory: {file_path}")
504
+ # age = self._get_directory_age_minutes(file_path, fs, verbose)
505
+ # elif fs.isfile(file_path):
506
+ # age = self._get_file_age_minutes(file_path, fs, verbose)
507
+ # else:
508
+ # self.logger.warning(f"Path {file_path} is neither file nor directory.")
509
+ # return True
510
+ #
511
+ # return age > max_age_minutes
512
+ #
513
+ # except Exception as e:
514
+ # self.logger.warning(f"Error checking {file_path}: {str(e)}")
515
+ # return True
516
+ #
517
+ # def get_file_or_dir_age_minutes(
518
+ # self,
519
+ # file_path: str,
520
+ # fs: Optional[fsspec.AbstractFileSystem] = None,
521
+ # ) -> float:
522
+ # """
523
+ # Get age of file/directory in minutes. Returns infinity for errors/missing paths.
524
+ #
525
+ # :param file_path: Path to check.
526
+ # :param fs: Filesystem object. Defaults to local filesystem.
527
+ # :return: Age in minutes or infinity if unavailable.
528
+ # """
529
+ # fs = fs or fsspec.filesystem("file")
530
+ # try:
531
+ # if not fs.exists(file_path):
532
+ # self.logger.debug(f"Path not found: {file_path}")
533
+ # return float("inf")
534
+ #
535
+ # if fs.isdir(file_path):
536
+ # return self._get_directory_age_minutes(file_path, fs, verbose=False)
537
+ # if fs.isfile(file_path):
538
+ # return self._get_file_age_minutes(file_path, fs, verbose=False)
539
+ #
540
+ # self.logger.warning(f"Invalid path type: {file_path}")
541
+ # return float("inf")
542
+ #
543
+ # except Exception as e:
544
+ # self.logger.warning(f"Error getting age for {file_path}: {str(e)}")
545
+ # return float("inf")
546
+ #
547
+ # def _get_directory_age_minutes(
548
+ # self,
549
+ # dir_path: str,
550
+ # fs: fsspec.AbstractFileSystem,
551
+ # verbose: bool,
552
+ # ) -> float:
553
+ # """Calculate age of oldest file in directory."""
554
+ # try:
555
+ # all_files = fs.ls(dir_path)
556
+ # except Exception as e:
557
+ # self.logger.warning(f"Error listing {dir_path}: {str(e)}")
558
+ # return float("inf")
559
+ #
560
+ # if not all_files:
561
+ # self.logger.debug(f"Empty directory: {dir_path}")
562
+ # return float("inf")
563
+ #
564
+ # modification_times = []
565
+ # for file in all_files:
566
+ # try:
567
+ # info = fs.info(file)
568
+ # mod_time = self._get_modification_time(info, file)
569
+ # modification_times.append(mod_time)
570
+ # except Exception as e:
571
+ # self.logger.warning(f"Skipping {file}: {str(e)}")
572
+ #
573
+ # if not modification_times:
574
+ # self.logger.warning(f"No valid files in {dir_path}")
575
+ # return float("inf")
576
+ #
577
+ # oldest = min(modification_times)
578
+ # age = (datetime.datetime.now(datetime.timezone.utc) - oldest).total_seconds() / 60
579
+ # self.logger.debug(f"Oldest in {dir_path}: {age:.2f} minutes")
580
+ #
581
+ # return age
582
+ #
583
+ # def _get_file_age_minutes(
584
+ # self,
585
+ # file_path: str,
586
+ # fs: fsspec.AbstractFileSystem,
587
+ # verbose: bool,
588
+ # ) -> float:
589
+ # """Calculate file age in minutes."""
590
+ # try:
591
+ # info = fs.info(file_path)
592
+ # mod_time = self._get_modification_time(info, file_path)
593
+ # age = (datetime.datetime.now(datetime.timezone.utc) - mod_time).total_seconds() / 60
594
+ #
595
+ # if verbose:
596
+ # self.logger.debug(f"{file_path} info: {info}")
597
+ # self.logger.debug(f"File age: {age:.2f} minutes")
598
+ #
599
+ # return age
600
+ #
601
+ # except Exception as e:
602
+ # self.logger.warning(f"Error processing {file_path}: {str(e)}")
603
+ # return float("inf")
604
+ #
605
+ # def _get_modification_time(self, info: Dict, file_path: str) -> datetime.datetime:
606
+ # """Extract modification time from filesystem info with timezone awareness."""
607
+ # try:
608
+ # if "LastModified" in info: # S3-like
609
+ # lm = info["LastModified"]
610
+ # return lm if isinstance(lm, datetime.datetime) else datetime.datetime.fromisoformat(
611
+ # lm[:-1]).astimezone()
612
+ #
613
+ # if "mtime" in info: # Local filesystem
614
+ # return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
615
+ #
616
+ # if "modified" in info: # FTP/SSH
617
+ # return datetime.datetime.strptime(
618
+ # info["modified"], "%Y-%m-%d %H:%M:%S"
619
+ # ).replace(tzinfo=datetime.timezone.utc)
620
+ #
621
+ # raise KeyError("No valid modification time key found")
622
+ #
623
+ # except (KeyError, ValueError) as e:
624
+ # self.logger.warning(f"Invalid mod time for {file_path}: {str(e)}")
625
+ # raise ValueError(f"Unsupported modification time format for {file_path}") from e
626
+ #
627
+ #
628
+ # # --- Vectorized Helper Functions ---
629
+ #
630
+ # def _vectorized_busday_count(partition, begin_col, end_col, holidays):
631
+ # """
632
+ # Calculates the number of business days between a start and end date.
633
+ # """
634
+ # # Extract the raw columns
635
+ # start_dates_raw = partition[begin_col]
636
+ # end_dates_raw = partition[end_col]
637
+ #
638
+ #
639
+ # start_dates = pd.to_datetime(start_dates_raw, errors='coerce')
640
+ # end_dates = pd.to_datetime(end_dates_raw, errors='coerce')
641
+ #
642
+ # # Initialize the result Series with NaN, as the output is a number
643
+ # result = pd.Series(np.nan, index=partition.index)
644
+ #
645
+ # # Create a mask for rows where both start and end dates are valid
646
+ # valid_mask = pd.notna(start_dates) & pd.notna(end_dates)
647
+ #
648
+ # # Perform the vectorized calculation only on the valid subset
649
+ # # Convert to NumPy arrays of date type for the calculation
650
+ # result.loc[valid_mask] = np.busday_count(
651
+ # start_dates[valid_mask].values.astype('datetime64[D]'),
652
+ # end_dates[valid_mask].values.astype('datetime64[D]'),
653
+ # holidays=holidays
654
+ # )
655
+ #
656
+ # return result
657
+ #
658
+ #
659
+ # def _vectorized_sla_end_date(partition, start_col, n_days_col, holidays):
660
+ # """
661
+ # Calculates the end date of an SLA, skipping weekends and holidays.
662
+ # """
663
+ # # Extract the relevant columns as pandas Series
664
+ # start_dates_raw = partition[start_col]
665
+ # sla_days = partition[n_days_col]
666
+ #
667
+ #
668
+ # start_dates = pd.to_datetime(start_dates_raw, errors='coerce')
669
+ #
670
+ # # Initialize the result Series with NaT (Not a Time)
671
+ # result = pd.Series(pd.NaT, index=partition.index, dtype='datetime64[ns]')
672
+ #
673
+ # # Create a mask for rows that have valid start dates and SLA days
674
+ # valid_mask = pd.notna(start_dates) & pd.notna(sla_days)
675
+ #
676
+ # # Perform the vectorized calculation only on the valid subset
677
+ # Note: the start dates are converted via .values to a NumPy datetime64[D] array, the date dtype np.busday_offset works on
678
+ # result.loc[valid_mask] = np.busday_offset(
679
+ # start_dates[valid_mask].values.astype('datetime64[D]'), # Convert to numpy array of dates
680
+ # sla_days[valid_mask].astype(int), # Ensure days are integers
681
+ # roll='forward',
682
+ # holidays=holidays
683
+ # )
684
+ #
685
+ # return result
686
+ #
687
+ #
688
+ # # --- Refactored BusinessDays Class ---
689
+ #
690
+ # class BusinessDays:
691
+ # """
692
+ # Business days calculations with a custom holiday list.
693
+ # Supports scalar and efficient, vectorized Dask DataFrame operations.
694
+ # """
695
+ #
696
+ # def __init__(self, holiday_list: dict[str, list[str]], logger) -> None:
697
+ # self.logger = logger
698
+ # self.HOLIDAY_LIST = holiday_list
699
+ #
700
+ # # Flatten and store as tuple for determinism
701
+ # bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
702
+ # self.holidays = tuple(bd_holidays)
703
+ #
704
+ # def get_business_days_count(
705
+ # self,
706
+ # begin_date: str | datetime.date | pd.Timestamp,
707
+ # end_date: str | datetime.date | pd.Timestamp,
708
+ # ) -> int:
709
+ # """Scalar method to count business days between two dates."""
710
+ # begin = pd.to_datetime(begin_date)
711
+ # end = pd.to_datetime(end_date)
712
+ # return int(np.busday_count(begin.date(), end.date(), holidays=list(self.holidays)))
713
+ #
714
+ # def calc_business_days_from_df(
715
+ # self,
716
+ # df: dd.DataFrame,
717
+ # begin_date_col: str,
718
+ # end_date_col: str,
719
+ # result_col: str = "business_days",
720
+ # ) -> dd.DataFrame:
721
+ # """Calculates business days between two columns in a Dask DataFrame."""
722
+ # missing = {begin_date_col, end_date_col} - set(df.columns)
723
+ # if missing:
724
+ # self.logger.error(f"Missing columns: {missing}")
725
+ # raise ValueError("Required columns are missing from DataFrame")
726
+ #
727
+ # return df.assign(
728
+ # **{result_col: df.map_partitions(
729
+ # _vectorized_busday_count,
730
+ # begin_col=begin_date_col,
731
+ # end_col=end_date_col,
732
+ # holidays=list(self.holidays),
733
+ # meta=(result_col, 'f8') # f8 is float64
734
+ # )}
735
+ # )
736
+ #
737
+ # def add_business_days(
738
+ # self,
739
+ # start_date: str | datetime.date | pd.Timestamp,
740
+ # n_days: int,
741
+ # ) -> np.datetime64:
742
+ # """Scalar method to add N business days to a start date."""
743
+ # start = pd.to_datetime(start_date)
744
+ # return np.busday_offset(
745
+ # start.date(),
746
+ # n_days,
747
+ # roll='forward',
748
+ # holidays=list(self.holidays),
749
+ # )
750
+ #
751
+ # def calc_sla_end_date(
752
+ # self,
753
+ # df: dd.DataFrame,
754
+ # start_date_col: str,
755
+ # n_days_col: str,
756
+ # result_col: str = "sla_end_date",
757
+ # ) -> dd.DataFrame:
758
+ # """Calculates an SLA end date column for a Dask DataFrame."""
759
+ # missing = {start_date_col, n_days_col} - set(df.columns)
760
+ # if missing:
761
+ # self.logger.error(f"Missing columns: {missing}")
762
+ # raise ValueError("Required columns are missing from DataFrame")
763
+ #
764
+ # return df.assign(
765
+ # **{result_col: df.map_partitions(
766
+ # _vectorized_sla_end_date,
767
+ # start_col=start_date_col,
768
+ # n_days_col=n_days_col,
769
+ # holidays=list(self.holidays),
770
+ # meta=(result_col, 'datetime64[ns]')
771
+ # )}
772
+ # )
773
+ #
774
+ # # Class enhancements
775
+ # # DateUtils.register_period('next_week', lambda: (datetime.date.today() + datetime.timedelta(days=7),
776
+ # # datetime.date.today() + datetime.timedelta(days=13)))
777
+ # # start, end = DateUtils.parse_period(period='next_week')
778
+ # # print(f"Next Week: {start} to {end}")