sibi-dst 2025.9.9-py3-none-any.whl → 2025.9.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_artifact_updater_async.py +191 -137
- sibi_dst/df_helper/_parquet_artifact.py +6 -326
- sibi_dst/df_helper/_parquet_reader.py +2 -1
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +24 -2
- sibi_dst/utils/boilerplate/__init__.py +5 -3
- sibi_dst/utils/boilerplate/base_pipeline.py +14 -29
- sibi_dst/utils/business_days.py +19 -51
- sibi_dst/utils/clickhouse_writer.py +1 -1
- sibi_dst/utils/data_wrapper.py +46 -312
- sibi_dst/utils/filepath_generator.py +1 -154
- sibi_dst/utils/parquet_saver.py +29 -16
- sibi_dst/utils/progress/sse_runner.py +39 -11
- sibi_dst/utils/update_planner.py +161 -805
- {sibi_dst-2025.9.9.dist-info → sibi_dst-2025.9.11.dist-info}/METADATA +2 -1
- {sibi_dst-2025.9.9.dist-info → sibi_dst-2025.9.11.dist-info}/RECORD +16 -16
- {sibi_dst-2025.9.9.dist-info → sibi_dst-2025.9.11.dist-info}/WHEEL +0 -0
sibi_dst/utils/business_days.py
CHANGED
@@ -1,19 +1,22 @@
+from __future__ import annotations
+
 import datetime as dt
 from typing import Any, Dict, Iterable, Optional
-
+
+import dask.dataframe as dd
 import numpy as np
 import pandas as pd
-
+
+from sibi_dst.utils import Logger
 
 
 # ---------------- Vectorized helpers (used by Dask map_partitions) ----------------
 
 def _to_np_days(series: pd.Series) -> np.ndarray:
     """Coerce to numpy datetime64[D] with NaT-safe conversion."""
-    # Use pandas for robust parsing, then cast to date-days
     s = pd.to_datetime(series, errors="coerce")
-    #
-    return s.
+    # Return day precision array directly
+    return s.dt.floor("D").to_numpy(dtype="datetime64[D]")
 
 
 def _vectorized_busday_count(
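
For reference, the new return path can be exercised on its own with plain pandas/NumPy. A minimal sketch of the same conversion (the wrapper name and sample values below are illustrative, not sibi_dst API):

```python
import numpy as np
import pandas as pd

def to_np_days(series: pd.Series) -> np.ndarray:
    # Parse leniently (invalid values become NaT), drop the time component,
    # and return a day-precision datetime64[D] array, as _to_np_days now does.
    s = pd.to_datetime(series, errors="coerce")
    return s.dt.floor("D").to_numpy(dtype="datetime64[D]")

print(to_np_days(pd.Series(["2025-09-10 13:45", None, "not a date"])))
# expected: ['2025-09-10' 'NaT' 'NaT']
```
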
@@ -24,8 +27,8 @@ def _vectorized_busday_count(
     weekmask: Optional[str],
     inclusive: bool,
 ) -> pd.Series:
-    start = _to_np_days(part[begin_col])
-    end = _to_np_days(part[end_col])
+    start = _to_np_days(part[begin_col])
+    end = _to_np_days(part[end_col])
 
     kwargs: Dict[str, Any] = {}
     if holidays:
@@ -38,7 +41,7 @@ def _vectorized_busday_count(
     with np.errstate(invalid="ignore"):
         end_adj = end + np.timedelta64(1, "D")
 
-    valid = (~pd.isna(start)) & (~pd.isna(end))
+    valid = (~pd.isna(start)) & (~pd.isna(end))
     result = np.full(part.shape[0], np.nan, dtype="float64")
     if valid.any():
         counts = np.busday_count(
@@ -59,8 +62,8 @@ def _vectorized_busday_offset(
     weekmask: Optional[str],
     roll: str,
 ) -> pd.Series:
-    start = _to_np_days(part[start_col])
-    n_days = pd.to_numeric(part[n_days_col], errors="coerce").to_numpy()
+    start = _to_np_days(part[start_col])
+    n_days = pd.to_numeric(part[n_days_col], errors="coerce").to_numpy()
 
     kwargs: Dict[str, Any] = {"roll": roll}
     if holidays:
@@ -68,7 +71,7 @@ def _vectorized_busday_offset(
     if weekmask:
         kwargs["weekmask"] = weekmask
 
-    valid = (~pd.isna(start)) & (~pd.isna(n_days))
+    valid = (~pd.isna(start)) & (~pd.isna(n_days))
     out = np.full(part.shape[0], np.datetime64("NaT", "ns"), dtype="datetime64[ns]")
     if valid.any():
         offs = np.busday_offset(
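
Both partition helpers ultimately delegate to NumPy's business-day routines, passing holidays and weekmask only when set and adding one day to the end date for the inclusive case. A standalone NumPy illustration of that call pattern (the dates and the holiday below are made up):

```python
import numpy as np

start = np.array(["2025-09-01", "2025-09-05"], dtype="datetime64[D]")
end = np.array(["2025-09-12", "2025-09-05"], dtype="datetime64[D]")
kwargs = {
    "holidays": np.array(["2025-09-08"], dtype="datetime64[D]"),
    "weekmask": "1111100",  # Mon-Fri
}

# Exclusive of the end date (the default branch in the helper).
print(np.busday_count(start, end, **kwargs))                           # expected: [8 0]

# Inclusive variant: shift the end date by one day, as the helper does.
print(np.busday_count(start, end + np.timedelta64(1, "D"), **kwargs))  # expected: [9 1]
```
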
@@ -86,26 +89,6 @@ def _vectorized_busday_offset(
 class BusinessDays:
     """
     Business day calculations with custom holidays and optional weekmask.
-
-    Features
-    - Scalar helpers:
-      - get_business_days_count(begin, end, inclusive=False) -> int
-      - add_business_days(start_date, n_days, roll='forward') -> np.datetime64
-    - Dask DataFrame helpers (vectorized via map_partitions):
-      - calc_business_days_from_df(df, begin_col, end_col, result_col='business_days', inclusive=False)
-      - calc_sla_end_date(df, start_date_col, n_days_col, result_col='sla_end_date', roll='forward')
-
-    Parameters
-    ----------
-    holiday_list : dict[str, list[str]] | Iterable[str]
-        Either a mapping of year -> [YYYY-MM-DD, ...] or a flat iterable of YYYY-MM-DD strings.
-    logger : Any
-        Logger with .debug/.info/.warning/.error.
-    weekmask : str | None
-        A numpy business day weekmask like '1111100' (Mon–Fri). None means default Mon–Fri.
-        Examples:
-        '1111100' -> Mon-Fri
-        '1111110' -> Mon-Sat
     """
 
     def __init__(
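
The docstring removed above was the only in-code description of the constructor contract: holiday_list as a year → dates mapping or a flat list of 'YYYY-MM-DD' strings, plus an optional NumPy weekmask such as '1111110' for Mon-Sat. A usage sketch based on that description; the import path is assumed from the module location and the keyword names are taken from the removed docstring:

```python
from sibi_dst.utils.business_days import BusinessDays  # import path assumed from the file location

holidays = {
    "2025": ["2025-01-01", "2025-12-25"],
    "2026": ["2026-01-01"],
}

# '1111110' treats Saturday as a working day (Mon-Sat), per the removed docstring.
bd = BusinessDays(holiday_list=holidays, logger=None, weekmask="1111110")

# Wed 2025-12-24 .. Mon 2025-12-29, counting the end date:
# 12-24, 12-26, Sat 12-27 and 12-29 count; 12-25 is a holiday, 12-28 is a Sunday.
print(bd.get_business_days_count("2025-12-24", "2025-12-29", inclusive=True))  # expected: 4
```
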
@@ -119,12 +102,11 @@ class BusinessDays:
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.weekmask = weekmask
 
-        # Normalize holidays to a flat, sorted tuple of 'YYYY-MM-DD'
         if isinstance(holiday_list, dict):
             flat = [d for _, days in sorted(holiday_list.items()) for d in days]
         else:
             flat = list(holiday_list)
-
+
         seen = set()
         flat_unique = []
         for d in flat:
@@ -142,7 +124,6 @@ class BusinessDays:
         *,
         inclusive: bool = False,
     ) -> int:
-        """Business days between two dates. If inclusive=True, include the end date."""
         b = pd.to_datetime(begin_date).date()
         e = pd.to_datetime(end_date).date()
 
@@ -153,11 +134,11 @@ class BusinessDays:
             kwargs["weekmask"] = self.weekmask
 
         if inclusive:
-            e_np = np.datetime64(e) + np.timedelta64(1, "D")
+            e_np = np.datetime64(e, "D") + np.timedelta64(1, "D")
         else:
-            e_np = np.datetime64(e)
+            e_np = np.datetime64(e, "D")
 
-        val = int(np.busday_count(np.datetime64(b), e_np, **kwargs))
+        val = int(np.busday_count(np.datetime64(b, "D"), e_np, **kwargs))
         return val
 
     def add_business_days(
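
The only behavioural change in this hunk is that the NumPy scalars are now built with an explicit day unit before being handed to np.busday_count. The equivalent standalone pattern (plain NumPy/pandas; the dates are illustrative):

```python
import numpy as np
import pandas as pd

b = pd.to_datetime("2025-09-01").date()  # a Monday
e = pd.to_datetime("2025-09-05").date()  # the following Friday

begin = np.datetime64(b, "D")                          # day-unit scalar, as the method now builds it
end = np.datetime64(e, "D") + np.timedelta64(1, "D")   # +1 day for the inclusive branch

print(int(np.busday_count(begin, end)))  # expected: 5 (a full Mon-Fri week, no holidays)
```
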
@@ -167,11 +148,6 @@ class BusinessDays:
         *,
         roll: str = "forward",
     ) -> np.datetime64:
-        """
-        Add (or subtract) business days to a date. Returns numpy datetime64[D].
-        roll: {'forward','backward','following','preceding','modifiedfollowing',
-               'modifiedpreceding','nat'}
-        """
         s = pd.to_datetime(start_date).date()
         kwargs: Dict[str, Any] = {"roll": roll}
         if self.holidays:
@@ -179,7 +155,7 @@ class BusinessDays:
         if self.weekmask:
             kwargs["weekmask"] = self.weekmask
 
-        return np.busday_offset(np.datetime64(s), int(n_days), **kwargs)
+        return np.busday_offset(np.datetime64(s, "D"), int(n_days), **kwargs)
 
     # -------- Dask API --------
 
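
The docstring removed in the previous hunk listed the accepted roll values ('forward', 'backward', 'following', 'preceding', 'modifiedfollowing', 'modifiedpreceding', 'nat'); roll only matters when the start date itself is not a valid business day. A standalone NumPy illustration of the call the method now makes (the anchor date and holiday are made up):

```python
import numpy as np

anchor = np.datetime64("2025-09-06", "D")                   # a Saturday
holidays = np.array(["2025-09-08"], dtype="datetime64[D]")  # the following Monday

# roll="forward" first moves the anchor to the next valid business day, then applies the offset.
print(np.busday_offset(anchor, 0, roll="forward"))                     # expected: 2025-09-08
print(np.busday_offset(anchor, 3, roll="forward", holidays=holidays))  # expected: 2025-09-12
```
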
@@ -192,10 +168,6 @@ class BusinessDays:
         *,
         inclusive: bool = False,
     ) -> dd.DataFrame:
-        """
-        Vectorized business-day difference between two date columns.
-        Produces float64 (NaN where either side is missing).
-        """
         missing = {begin_date_col, end_date_col} - set(df.columns)
         if missing:
             self.logger.error(f"Missing columns: {missing}")
@@ -224,10 +196,6 @@ class BusinessDays:
         *,
         roll: str = "forward",
     ) -> dd.DataFrame:
-        """
-        Vectorized business-day offset for SLA end date.
-        Produces datetime64[ns] with NaT where invalid.
-        """
         missing = {start_date_col, n_days_col} - set(df.columns)
         if missing:
             self.logger.error(f"Missing columns: {missing}")
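
The two docstrings removed above were the in-code reference for the Dask helpers' signatures: calc_business_days_from_df(df, begin_col, end_col, result_col='business_days', inclusive=False), producing float64 with NaN for missing dates, and calc_sla_end_date(df, start_date_col, n_days_col, result_col='sla_end_date', roll='forward'), producing datetime64[ns] with NaT. A usage sketch under those signatures; the import path, column names, and keyword layout are assumptions carried over from that removed text:

```python
import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils.business_days import BusinessDays  # import path assumed from the file location

pdf = pd.DataFrame({
    "opened": pd.to_datetime(["2025-09-01", "2025-09-03"]),
    "closed": pd.to_datetime(["2025-09-10", None]),  # missing end date -> NaN business_days
    "sla_days": [5, 10],
})
ddf = dd.from_pandas(pdf, npartitions=1)

bd = BusinessDays(holiday_list=["2025-09-08"], logger=None)

ddf = bd.calc_business_days_from_df(ddf, "opened", "closed", result_col="business_days")
ddf = bd.calc_sla_end_date(ddf, "opened", "sla_days", result_col="sla_end_date")
print(ddf.compute())
```
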

sibi_dst/utils/clickhouse_writer.py
CHANGED
@@ -224,7 +224,7 @@ class ClickHouseWriter(ManagedResource):
     def _default_engine_sql(self) -> str:
         # minimal MergeTree clause; quote order_by safely
         ob = self.order_by if self.order_by.startswith("(") else f"(`{self.order_by}`)"
-        return f"ENGINE = MergeTree ORDER BY {ob}"
+        return f"ENGINE = MergeTree ORDER BY {ob} SETTINGS allow_nullable_key = 1"
 
     # ------------- partition write -------------
 
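
For context, allow_nullable_key is the ClickHouse setting that permits Nullable columns inside a MergeTree ORDER BY (primary key), which would otherwise be rejected at CREATE TABLE time. A standalone sketch of the clause the writer now emits (the free function name and example column are illustrative, mirroring the one-line change above):

```python
def default_engine_sql(order_by: str) -> str:
    # Quote a single column; pass an explicit "(col_a, col_b)" tuple through untouched.
    ob = order_by if order_by.startswith("(") else f"(`{order_by}`)"
    return f"ENGINE = MergeTree ORDER BY {ob} SETTINGS allow_nullable_key = 1"

print(default_engine_sql("order_date"))
# ENGINE = MergeTree ORDER BY (`order_date`) SETTINGS allow_nullable_key = 1
```
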
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -37,7 +37,6 @@ class DataWrapper(ManagedResource):
         dataclass: Type,
         date_field: str,
         data_path: str,
-        parquet_filename: str,
         class_params: Optional[Dict] = None,
         load_params: Optional[Dict] = None,
         show_progress: bool = False,
@@ -50,7 +49,7 @@ class DataWrapper(ManagedResource):
         self.dataclass: Type = dataclass
         self.date_field: str = date_field
         self.data_path: str = self._ensure_forward_slash(data_path)
-        self.
+        self.partition_on_date: bool = True  # Assume Hive-style date partitioning by default
 
         if self.fs is None:
             raise ValueError("DataWrapper requires a File system (fs) to be provided.")
@@ -282,16 +281,23 @@ class DataWrapper(ManagedResource):
     def _process_single_date(self, date: datetime.date):
         """Process a single date: load, save to Parquet."""
         # --- 1. Setup paths and logging ---
-        path =
-
-
-
+        path = self.data_path.rstrip("/")+"/"
+        if not self.partition_on_date:
+            # not a Hive-style partitioned path
+            path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
+            log_extra = self._log_extra(date_context=date.isoformat())
+            self.logger.debug(f"Processing date {date.isoformat()} for legacy {path}", extra=log_extra)
+        else :
+            # Hive-style partitioned path
+            log_extra = self._log_extra(date_context=date.isoformat(), partition_on=self.date_field)
+            self.logger.debug(f"Processing date {date.isoformat()} for partitioned {self.data_path} with hive-style partitions", extra=log_extra)
         # --- 2. Check if date/path should be skipped ---
         if (self.update_planner and path in self.update_planner.skipped and
                 getattr(self.update_planner, 'ignore_missing', False)):
             self.logger.debug(f"Skipping {date} as it exists in the skipped list", extra=log_extra)
             return
-
+
+        self.logger.debug(f"Processing date {date.isoformat()} for {path}", extra=log_extra)
 
         # --- 3. Timing ---
         overall_start = time.perf_counter()
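
The branch above only decides where a day's data lands. A small sketch of the two layouts produced for the same date (the storage root is made up; the Hive-style directory name comes from the partition_date column added in the next hunk):

```python
import datetime

data_path = "s3://warehouse/orders/"  # illustrative storage root
date = datetime.date(2025, 9, 10)

# partition_on_date=True (the new default): write to the dataset root; the partition
# column added at save time yields .../partition_date=2025-09-10/ subdirectories.
hive_path = data_path.rstrip("/") + "/"

# partition_on_date=False: the legacy year/month/day directory per date.
legacy_path = f"{data_path}{date.year}/{date.month:02d}/{date.day:02d}/"

print(hive_path)    # s3://warehouse/orders/
print(legacy_path)  # s3://warehouse/orders/2025/09/10/
```
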
@@ -326,39 +332,44 @@
                         self.mmanifest.record(full_path=path)
                     except Exception as e:
                         self.logger.error(f"Failed to record missing path {path}: {e}", extra=log_extra)
-                self.logger.info(f"No data found for {
+                self.logger.info(f"No data found for {path}. Logged to missing manifest.", extra=log_extra)
                 return # Done for this date
 
             if total_records < 0:
-                self.logger.warning(f"Negative record count ({total_records}) for {
+                self.logger.warning(f"Negative record count ({total_records}) for {path}. Proceeding.", extra=log_extra)
                 # Continue processing even with negative count
 
-            [… 26 removed lines, collapsed in the diff view: the previous Parquet-save, benchmarking and success-logging code that the added block below replaces …]
+            # --- 6. Save to Parquet ---
+            save_start = time.perf_counter()
+
+
+            parquet_params = {
+                "df_result": df,
+                "parquet_storage_path": path,
+                "fs": self.fs,
+                "logger": self.logger,
+                "debug": self.debug,
+                "verbose": self.verbose,
+            }
+            if self.partition_on_date:
+                df["partition_date"] = df[self.date_field].dt.date.astype(str)
+                parquet_params["partition_on"] = ["partition_date"]
+            self.logger.debug(f"{self.dataclass.__name__} saving to parquet started...", extra=log_extra)
+            with ParquetSaver(**parquet_params) as ps:
+                ps.save_to_parquet()
+            save_time = time.perf_counter() - save_start
+            self.logger.debug(f"Parquet saving for {date} completed in {save_time:.2f}s", extra=log_extra)
+
+            # --- 7. Benchmarking ---
+            total_time = time.perf_counter() - overall_start
+            self.benchmarks[date] = {
+                "load_duration": load_time,
+                "save_duration": save_time,
+                "total_duration": total_time,
+            }
+
+            # --- 8. Log Success ---
+            self._log_success(date, total_time, path)
 
         except Exception as e:
             # --- 9. Handle Errors ---
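
Because the frame is now written with partition_on=["partition_date"], downstream readers can prune on that column. A hedged sketch using plain Dask (standard dd.read_parquet, not sibi_dst API; the dataset root is illustrative):

```python
import dask.dataframe as dd

# Read only one day's partition from a dataset written by the new save path.
ddf = dd.read_parquet(
    "s3://warehouse/orders/",                          # illustrative dataset root
    filters=[("partition_date", "==", "2025-09-10")],  # prunes Hive-style partition directories
)
print(ddf.head())
```
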
@@ -397,280 +408,3 @@
         except Exception as e:
             self.logger.error(f"Error generating benchmark summary: {e}", extra=self.logger_extra)
 
-# import datetime
-# import random
-# import threading
-# import time
-# from concurrent.futures import ThreadPoolExecutor, as_completed
-# from typing import Type, Any, Dict, Optional, Union, List, ClassVar
-#
-# import pandas as pd
-# from tqdm import tqdm
-#
-# from . import ManagedResource
-# from .parquet_saver import ParquetSaver
-#
-#
-# class DataWrapper(ManagedResource):
-[… roughly 260 more removed lines: the rest of this commented-out legacy DataWrapper
-   implementation (DEFAULT_PRIORITY_MAP, __init__, process(), _execute_task_batch(),
-   _process_date_with_retry(), _process_single_date(), _log_success/_log_failure,
-   show_benchmark_summary), all deleted in 2025.9.11 …]