sibi-dst 2025.9.3-py3-none-any.whl → 2025.9.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +6 -4
- sibi_dst/df_helper/__init__.py +1 -0
- sibi_dst/df_helper/_parquet_artifact.py +533 -113
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -281
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +349 -142
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -0
- sibi_dst/utils/data_wrapper.py +460 -61
- sibi_dst/utils/parquet_saver.py +403 -161
- sibi_dst/utils/update_planner.py +553 -319
- sibi_dst/utils/write_gatekeeper.py +18 -0
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.4.dist-info}/METADATA +2 -2
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.4.dist-info}/RECORD +13 -12
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.4.dist-info}/WHEEL +0 -0
sibi_dst/utils/data_wrapper.py
CHANGED
```diff
@@ -1,9 +1,12 @@
+# data_wrapper.py
+from __future__ import annotations
+
 import datetime
 import random
 import threading
 import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Type, Any, Dict, Optional, Union, List, ClassVar
+from concurrent.futures import ThreadPoolExecutor, as_completed, Future
+from typing import Type, Any, Dict, Optional, Union, List, ClassVar, Callable
 
 import pandas as pd
 from tqdm import tqdm
@@ -13,6 +16,10 @@ from .parquet_saver import ParquetSaver
 
 
 class DataWrapper(ManagedResource):
+    """
+    Manages the concurrent processing of data for multiple dates based on an update plan.
+    Orchestrates loading data via a dataclass, processing it, and saving it to Parquet.
+    """
     DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
         "overwrite": 1,
         "missing_in_history": 2,
@@ -23,7 +30,7 @@ class DataWrapper(ManagedResource):
     DEFAULT_MAX_AGE_MINUTES: int = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
 
-    logger_extra = {"sibi_dst_component": __name__}
+    logger_extra = {"sibi_dst_component": "warehouse.data_wrapper"}
 
     def __init__(
         self,
```
```diff
@@ -39,57 +46,87 @@
         **kwargs: Any,
     ):
         super().__init__(**kwargs)
-        self.dataclass = dataclass
-        self.date_field = date_field
-        self.data_path = self._ensure_forward_slash(data_path)
-        self.parquet_filename = parquet_filename
+        # ---- Core Configuration ----
+        self.dataclass: Type = dataclass
+        self.date_field: str = date_field
+        self.data_path: str = self._ensure_forward_slash(data_path)
+        self.parquet_filename: str = parquet_filename
+
         if self.fs is None:
             raise ValueError("DataWrapper requires a File system (fs) to be provided.")
-        self.show_progress = show_progress
-        self.timeout = timeout
-        self.max_threads = max_threads
-        self.class_params = class_params or {
+
+        # ---- Execution Parameters ----
+        self.show_progress: bool = show_progress
+        self.timeout: float = timeout
+        self.max_threads: int = max_threads
+
+        # ---- Parameters for Dataclass Instantiation ----
+        self.class_params: Dict[str, Any] = class_params or {
             "debug": self.debug,
             "logger": self.logger,
             "fs": self.fs,
             "verbose": self.verbose,
         }
-        self.load_params = load_params or {}
+        self.load_params: Dict[str, Any] = load_params or {}
 
+        # ---- Internal State & Coordination ----
         self._lock = threading.Lock()
         self.processed_dates: List[datetime.date] = []
         self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
+
+        # ---- External Dependencies ----
         self.mmanifest = kwargs.get("mmanifest", None)
         self.update_planner = kwargs.get("update_planner", None)
 
-        # --- NEW: stop gate tripped during cleanup/interrupt to block further scheduling/retries
+        # ---- Shutdown Coordination ----
+        # Stop gate to block further scheduling/retries during cleanup/interrupt
         self._stop_event = threading.Event()
-        self.logger_extra.update({"action_module_name": "data_wrapper", "dataclass": self.dataclass.__name__})
 
-    # ensure manifest is saved on context exit
+        # Update logger extra with specific context
+        self.logger_extra.update({
+            "action_module_name": "data_wrapper",
+            "dataclass": self.dataclass.__name__
+        })
+
+    # --------------------- Context Management ---------------------
     def __exit__(self, exc_type, exc_val, exc_tb):
+        """Ensure manifest is saved and resources are cleaned up on context exit."""
         if self.mmanifest:
-            self.mmanifest.save()
-        super().__exit__(exc_type, exc_val, exc_tb)
-        return False
+            try:
+                self.mmanifest.save()
+            except Exception as e:
+                self.logger.error(f"Failed to save manifest in __exit__: {e}", extra=self.logger_extra)
+        # Call parent's __exit__ which triggers _cleanup
+        return super().__exit__(exc_type, exc_val, exc_tb)
 
-    # --- NEW: trip stop gate during class-specific cleanup (close/aclose/finalizer path)
+    # --------------------- Cleanup ---------------------
     def _cleanup(self) -> None:
+        """Signal shutdown during class-specific cleanup."""
         self._stop_event.set()
 
+    # --------------------- Utilities ---------------------
     @staticmethod
     def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
+        """Convert a string or date object to a datetime.date."""
         if isinstance(date, datetime.date):
             return date
         try:
             return pd.to_datetime(date).date()
         except ValueError as e:
-            raise ValueError(f"Error converting {date} to datetime: {e}")
+            raise ValueError(f"Error converting {date} to datetime: {e}") from e
 
     @staticmethod
     def _ensure_forward_slash(path: str) -> str:
+        """Ensure the path ends with a forward slash."""
         return path.rstrip("/") + "/"
 
+    def _log_extra(self, **overrides) -> Dict[str, Any]:
+        """Generate consistent logger extra context."""
+        base = self.logger_extra.copy()
+        base.update(overrides)
+        return base
+
+    # --------------------- Core Public API ---------------------
     def process(
         self,
         max_retries: int = 3,
```
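The rewritten `__exit__` now reports manifest-save failures instead of letting them mask the original exception, and returns whatever `super().__exit__` returns. Below is a minimal sketch of the stop-gate pattern this hunk introduces, assuming only that the `ManagedResource` base class invokes the subclass `_cleanup()` hook on exit (that contract is not visible in this diff):

```python
import threading

class StopGatedResource:
    """Minimal sketch: context exit trips an Event that worker loops check."""

    def __init__(self):
        self._stop_event = threading.Event()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Stand-in for ManagedResource.__exit__, assumed to call _cleanup().
        self._cleanup()
        return False  # do not swallow exceptions

    def _cleanup(self) -> None:
        self._stop_event.set()

    def work(self, items):
        for item in items:
            if self._stop_event.is_set():
                break  # further scheduling stops once cleanup has begun
            print(f"processing {item}")

with StopGatedResource() as res:
    res.work(["2025-09-01", "2025-09-02"])
```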
```diff
@@ -98,21 +135,22 @@
         backoff_max: float = 60.0,
     ):
         """
-        Execute the update plan with concurrency, retries and exponential backoff.
+        Execute the update plan with concurrency, retries, and exponential backoff.
         Stops scheduling immediately if closed or interrupted (Ctrl-C).
         """
         overall_start = time.perf_counter()
-        tasks = list(self.update_planner.get_tasks_by_priority())
+        tasks = list(self.update_planner.get_tasks_by_priority()) if self.update_planner else []
         if not tasks:
-            self.logger.info("No updates required based on the current plan.")
+            self.logger.info("No updates required based on the current plan.", extra=self.logger_extra)
             return
 
-        if self.update_planner.show_progress:
+        if self.update_planner and self.update_planner.show_progress:
             self.update_planner.show_update_plan()
 
         try:
             for priority, dates in tasks:
                 if self._stop_event.is_set():
+                    self.logger.info("Stop event set, halting processing of remaining task batches.", extra=self.logger_extra)
                     break
                 self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
         except KeyboardInterrupt:
@@ -123,10 +161,15 @@
             total_time = time.perf_counter() - overall_start
             if self.processed_dates:
                 count = len(self.processed_dates)
-                self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)", extra=self.logger_extra)
-                if self.update_planner.show_progress:
+                avg_time = total_time / count if count > 0 else 0
+                self.logger.info(
+                    f"Processed {count} dates in {total_time:.1f}s (avg {avg_time:.1f}s/date)",
+                    extra=self.logger_extra
+                )
+                if self.update_planner and self.update_planner.show_progress:
                     self.show_benchmark_summary()
 
+    # --------------------- Task Execution ---------------------
     def _execute_task_batch(
         self,
         priority: int,
```
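For reference, `process()` consumes `(priority, [dates])` pairs from `update_planner.get_tasks_by_priority()`. A hypothetical sketch of that shape, bucketing dates with `DEFAULT_PRIORITY_MAP`; the real planner lives in `sibi_dst/utils/update_planner.py` and is not shown in this hunk:

```python
import datetime
from collections import defaultdict

# Illustrative only: map each date's update category to a priority bucket,
# then iterate buckets lowest-priority-number first, as process() does.
priority_map = {"overwrite": 1, "missing_in_history": 2}
plan = {
    datetime.date(2025, 9, 1): "overwrite",
    datetime.date(2025, 9, 2): "missing_in_history",
    datetime.date(2025, 9, 3): "overwrite",
}

buckets = defaultdict(list)
for day, category in plan.items():
    buckets[priority_map[category]].append(day)

tasks = sorted(buckets.items())  # [(1, [...]), (2, [...])]
for priority, dates in tasks:
    print(priority, [d.isoformat() for d in sorted(dates)])
```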
```diff
@@ -136,42 +179,70 @@
         backoff_jitter: float,
         backoff_max: float,
     ):
+        """Execute a batch of tasks (dates) with a given priority concurrently."""
         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
         max_thr = min(len(dates), self.max_threads)
-        self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.", extra=self.logger_extra)
+        self.logger.info(
+            f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.",
+            extra=self.logger_extra
+        )
 
-        # Use explicit try/finally so we can request cancel of queued tasks on teardown
+        # Use explicit try/finally for executor shutdown control
         executor = ThreadPoolExecutor(max_workers=max_thr, thread_name_prefix="datawrapper")
         try:
-            futures = {}
+            futures_to_dates: Dict[Future, datetime.date] = {}
+            submitted_count = 0
+
             for date in dates:
                 if self._stop_event.is_set():
+                    self.logger.debug(f"Stop event set, halting submission of new tasks in batch {priority}.", extra=self.logger_extra)
                     break
                 try:
-                    fut = executor.submit(
-                        self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
+                    future = executor.submit(
+                        self._process_date_with_retry,
+                        date,
+                        max_retries,
+                        backoff_base,
+                        backoff_jitter,
+                        backoff_max
                     )
-                    futures[fut] = date
+                    futures_to_dates[future] = date
+                    submitted_count += 1
                 except RuntimeError as e:
-                    # tolerate race: executor shutting down
+                    # Tolerate race: executor shutting down
                     if "cannot schedule new futures after shutdown" in str(e).lower():
-                        self.logger.warning("Executor is shutting down; halting new submissions for this batch.", extra=self.logger_extra)
+                        self.logger.warning(
+                            "Executor is shutting down; halting new submissions for this batch.",
+                            extra=self.logger_extra
+                        )
                         break
-                    raise
+                    else:
+                        # Re-raise unexpected RuntimeErrors
+                        raise
+
+            self.logger.debug(f"Submitted {submitted_count} tasks for priority {priority}.", extra=self.logger_extra)
 
-            iterator = as_completed(futures)
+            # Use as_completed for processing results as they finish
+            iterator = as_completed(futures_to_dates)
             if self.show_progress:
-                iterator = tqdm(iterator, total=len(futures), desc=desc)
+                iterator = tqdm(iterator, total=len(futures_to_dates), desc=desc, leave=False)
 
             for future in iterator:
+                date = futures_to_dates[future]
                 try:
+                    # Get the result, respecting the overall timeout
                     future.result(timeout=self.timeout)
                 except Exception as e:
-                    self.logger.error(f"Permanent failure for {futures[future]}: {e}", extra=self.logger_extra)
+                    # Log errors for individual date processing failures
+                    self.logger.error(f"Permanent failure for {date}: {e}", extra=self.logger_extra)
         finally:
             # Python 3.9+: cancel_futures prevents queued tasks from starting
+            # Tasks already running will still complete.
+            # shutdown(wait=True) ensures running tasks finish before returning.
             executor.shutdown(wait=True, cancel_futures=True)
+            self.logger.debug(f"Executor for priority {priority} shut down.", extra=self.logger_extra)
 
+    # --------------------- Date Processing ---------------------
     def _process_date_with_retry(
         self,
         date: datetime.date,
```
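This batch executor follows the standard submit/`as_completed` pattern, now with futures keyed back to their dates. A self-contained sketch of the same pattern outside the class (names here are illustrative, not part of sibi_dst):

```python
import threading
from concurrent.futures import Future, ThreadPoolExecutor, as_completed

stop_event = threading.Event()

def work(n: int) -> int:
    if stop_event.is_set():
        raise RuntimeError("shutting_down")
    return n * n

executor = ThreadPoolExecutor(max_workers=3, thread_name_prefix="demo")
try:
    # Key each Future back to its argument so failures can be attributed.
    futures_to_args: dict[Future, int] = {}
    for n in range(5):
        if stop_event.is_set():
            break
        futures_to_args[executor.submit(work, n)] = n

    # Consume results in completion order, not submission order.
    for future in as_completed(futures_to_args):
        n = futures_to_args[future]
        try:
            print(n, "->", future.result(timeout=30))
        except Exception as exc:
            print(f"permanent failure for {n}: {exc}")
finally:
    # Python 3.9+: cancel_futures drops queued tasks; running ones finish.
    executor.shutdown(wait=True, cancel_futures=True)
```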
```diff
@@ -180,62 +251,89 @@
         backoff_jitter: float,
         backoff_max: float,
     ):
+        """Process a single date with retry logic and exponential backoff."""
         for attempt in range(max_retries):
-            # --- NEW: bail out quickly if shutdown/interrupt began
+            # Bail out quickly if shutdown/interrupt began
             if self._stop_event.is_set():
+                self.logger.debug(f"Stop event set, aborting retries for {date} (attempt {attempt + 1}).", extra=self.logger_extra)
                 raise RuntimeError("shutting_down")
 
             try:
                 self._process_single_date(date)
-                return
+                return  # Success, exit retry loop
             except Exception as e:
                 if attempt < max_retries - 1 and not self._stop_event.is_set():
+                    # Calculate delay with exponential backoff and jitter
                     base_delay = min(backoff_base ** attempt, backoff_max)
-                    delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
+                    jitter_amount = random.uniform(0.0, max(0.0, backoff_jitter))
+                    delay = base_delay * (1 + jitter_amount)
                     self.logger.warning(
                         f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)",
                         extra=self.logger_extra
                     )
-                    time.sleep(delay)
+                    # Respect stop event even during sleep
+                    if self._stop_event.wait(timeout=delay):
+                        self.logger.debug(f"Stop event set during retry sleep for {date}.", extra=self.logger_extra)
+                        raise RuntimeError("shutting_down") from e
                 else:
                     self.logger.error(f"Failed processing {date} after {max_retries} attempts.", extra=self.logger_extra)
-                    raise
+                    raise  # Re-raise the last exception after max retries
 
     def _process_single_date(self, date: datetime.date):
+        """Process a single date: load, save to Parquet."""
+        # --- 1. Setup paths and logging ---
         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
-        self.logger.debug(f"Processing date {date.isoformat()} for {path}", extra=self.logger_extra)
-        if path in self.update_planner.skipped and self.update_planner.ignore_missing:
-            self.logger.debug(f"Skipping {date} as it exists in the skipped list", extra=self.logger_extra)
+        log_extra = self._log_extra(date_context=date.isoformat())
+        self.logger.debug(f"Processing date {date.isoformat()} for {path}", extra=log_extra)
+
+        # --- 2. Check if date/path should be skipped ---
+        if (self.update_planner and path in self.update_planner.skipped and
+                getattr(self.update_planner, 'ignore_missing', False)):
+            self.logger.debug(f"Skipping {date} as it exists in the skipped list", extra=log_extra)
             return
         full_path = f"{path}{self.parquet_filename}"
 
+        # --- 3. Timing ---
         overall_start = time.perf_counter()
+
         try:
+            # --- 4. Load Data ---
             load_start = time.perf_counter()
-            date_filter = {f"{self.date_field}__date": {date.isoformat()}}
-            self.logger.debug(f"{self.dataclass.__name__} is loading data for {date} with filter: {date_filter}", extra=self.logger_extra)
+            date_filter = {f"{self.date_field}__date": date.isoformat()}
+            self.logger.debug(f"{self.dataclass.__name__} is loading data for {date} with filter: {date_filter}", extra=log_extra)
 
+            # Prepare load parameters
             local_load_params = self.load_params.copy()
             local_load_params.update(date_filter)
 
+            # Instantiate and use the dataclass (e.g., Etl...Dc) within a context manager
             with self.dataclass(**self.class_params) as local_class_instance:
-                df = local_class_instance.load(**local_load_params)  # expected to be Dask
+                df = local_class_instance.load(**local_load_params)  # Expected to return Dask DataFrame
+
             load_time = time.perf_counter() - load_start
+            self.logger.debug(f"{self.dataclass.__name__} data loading for {date} completed in {load_time:.2f}s", extra=log_extra)
 
+            # --- 5. Handle Record Count ---
+            total_records = -1
             if hasattr(local_class_instance, "total_records"):
-                total_records = int(local_class_instance.total_records)
-                self.logger.debug(f"Total records loaded: {total_records}", extra=self.logger_extra)
+                total_records = int(getattr(local_class_instance, "total_records", -1))
+                self.logger.debug(f"{self.dataclass.__name__} total records loaded: {total_records}", extra=log_extra)
 
             if total_records == 0:
+                # No data found, log to manifest if available
                 if self.mmanifest:
-                    self.mmanifest.record(full_path=path)
-                self.logger.info(f"No data found for {full_path}. Logged to missing manifest.", extra=self.logger_extra)
-                return
+                    try:
+                        self.mmanifest.record(full_path=path)
+                    except Exception as e:
+                        self.logger.error(f"Failed to record missing path {path}: {e}", extra=log_extra)
+                self.logger.info(f"No data found for {full_path}. Logged to missing manifest.", extra=log_extra)
+                return  # Done for this date
 
             if total_records < 0:
-                self.logger.warning(f"Negative record count ({total_records}) for {full_path}.", extra=self.logger_extra)
-                return
+                self.logger.warning(f"Negative record count ({total_records}) for {full_path}. Proceeding.", extra=log_extra)
+                # Continue processing even with negative count
 
+            # --- 6. Save to Parquet ---
             save_start = time.perf_counter()
             parquet_params = {
                 "df_result": df,
```
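The notable behavioral change in the retry loop is replacing `time.sleep(delay)` with `self._stop_event.wait(timeout=delay)`, so a shutdown interrupts the backoff sleep immediately. A standalone sketch of that interruptible backoff (helper names here are hypothetical):

```python
import random
import threading

stop_event = threading.Event()

def backoff_delay(attempt: int, base: float = 2.0, jitter: float = 0.1,
                  cap: float = 60.0) -> float:
    # Same formula as above: min(base**attempt, cap) scaled by 1 + U(0, jitter).
    return min(base ** attempt, cap) * (1 + random.uniform(0.0, max(0.0, jitter)))

def retry(fn, max_retries: int = 3):
    for attempt in range(max_retries):
        if stop_event.is_set():
            raise RuntimeError("shutting_down")
        try:
            return fn()
        except Exception as e:
            if attempt < max_retries - 1 and not stop_event.is_set():
                # Event.wait returns True only if the event was set during the
                # sleep, so shutdown interrupts the backoff immediately.
                if stop_event.wait(timeout=backoff_delay(attempt)):
                    raise RuntimeError("shutting_down") from e
            else:
                raise

attempts = {"n": 0}

def flaky():
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise ValueError("transient")
    return "ok"

print(retry(flaky))  # succeeds on the third attempt, after ~1s + ~2s of backoff
```

With the defaults above (`backoff_base=2.0`, `backoff_max=60.0`), the pre-jitter delays for attempts 0, 1, 2 are 1s, 2s, and 4s, capped at 60s.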
```diff
@@ -243,35 +341,336 @@
             "fs": self.fs,
             "logger": self.logger,
             "debug": self.debug,
+            "verbose": self.verbose,
         }
+        self.logger.debug(f"{self.dataclass.__name__} saving to parquet started...", extra=log_extra)
         with ParquetSaver(**parquet_params) as ps:
             ps.save_to_parquet(self.parquet_filename, overwrite=True)
         save_time = time.perf_counter() - save_start
+        self.logger.debug(f"Parquet saving for {date} completed in {save_time:.2f}s", extra=log_extra)
 
+            # --- 7. Benchmarking ---
             total_time = time.perf_counter() - overall_start
             self.benchmarks[date] = {
                 "load_duration": load_time,
                 "save_duration": save_time,
                 "total_duration": total_time,
             }
+
+            # --- 8. Log Success ---
             self._log_success(date, total_time, full_path)
 
         except Exception as e:
+            # --- 9. Handle Errors ---
             self._log_failure(date, e)
-            raise
+            raise  # Re-raise to trigger retry logic
 
+    # --------------------- Logging / Benchmarking ---------------------
     def _log_success(self, date: datetime.date, duration: float, path: str):
+        """Log a successful date processing."""
         self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}", extra=self.logger_extra)
-        self.processed_dates.append(date)
+        with self._lock:  # Protect the shared list
+            self.processed_dates.append(date)
 
     def _log_failure(self, date: datetime.date, error: Exception):
+        """Log a failed date processing."""
         self.logger.error(f"Failed processing {date}: {error}", extra=self.logger_extra)
 
     def show_benchmark_summary(self):
+        """Display a summary of processing times."""
         if not self.benchmarks:
             self.logger.info("No benchmarking data to show", extra=self.logger_extra)
             return
-        df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
-        df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
-        self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string(), extra=self.logger_extra)
 
+        try:
+            df_bench = pd.DataFrame.from_records(
+                [{"date": d, **m} for d, m in self.benchmarks.items()]
+            )
+            if not df_bench.empty:
+                df_bench = df_bench.set_index("date").sort_index(
+                    ascending=not (self.update_planner.reverse_order if self.update_planner else False)
+                )
+                summary_str = df_bench.to_string()
+                self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n{summary_str}", extra=self.logger_extra)
+            else:
+                self.logger.info("Benchmark DataFrame is empty.", extra=self.logger_extra)
+        except Exception as e:
+            self.logger.error(f"Error generating benchmark summary: {e}", extra=self.logger_extra)
+
+# import datetime
+# import random
+# import threading
+# import time
+# from concurrent.futures import ThreadPoolExecutor, as_completed
+# from typing import Type, Any, Dict, Optional, Union, List, ClassVar
+#
+# import pandas as pd
+# from tqdm import tqdm
+#
+# from . import ManagedResource
+# from .parquet_saver import ParquetSaver
+#
+#
+# class DataWrapper(ManagedResource):
+#     DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
+#         "overwrite": 1,
+#         "missing_in_history": 2,
+#         "existing_but_stale": 3,
+#         "missing_outside_history": 4,
+#         "file_is_recent": 0,
+#     }
+#     DEFAULT_MAX_AGE_MINUTES: int = 1440
+#     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
+#
+#     logger_extra = {"sibi_dst_component": __name__}
+#
+#     def __init__(
+#         self,
+#         dataclass: Type,
+#         date_field: str,
+#         data_path: str,
+#         parquet_filename: str,
+#         class_params: Optional[Dict] = None,
+#         load_params: Optional[Dict] = None,
+#         show_progress: bool = False,
+#         timeout: float = 30,
+#         max_threads: int = 3,
+#         **kwargs: Any,
+#     ):
+#         super().__init__(**kwargs)
+#         self.dataclass = dataclass
+#         self.date_field = date_field
+#         self.data_path = self._ensure_forward_slash(data_path)
+#         self.parquet_filename = parquet_filename
+#         if self.fs is None:
+#             raise ValueError("DataWrapper requires a File system (fs) to be provided.")
+#         self.show_progress = show_progress
+#         self.timeout = timeout
+#         self.max_threads = max_threads
+#         self.class_params = class_params or {
+#             "debug": self.debug,
+#             "logger": self.logger,
+#             "fs": self.fs,
+#             "verbose": self.verbose,
+#         }
+#         self.load_params = load_params or {}
+#
+#         self._lock = threading.Lock()
+#         self.processed_dates: List[datetime.date] = []
+#         self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
+#         self.mmanifest = kwargs.get("mmanifest", None)
+#         self.update_planner = kwargs.get("update_planner", None)
+#
+#         # --- NEW: stop gate tripped during cleanup/interrupt to block further scheduling/retries
+#         self._stop_event = threading.Event()
+#         self.logger_extra.update({"action_module_name": "data_wrapper", "dataclass": self.dataclass.__name__})
+#
+#     # ensure manifest is saved on context exit
+#     def __exit__(self, exc_type, exc_val, exc_tb):
+#         if self.mmanifest:
+#             self.mmanifest.save()
+#         super().__exit__(exc_type, exc_val, exc_tb)
+#         return False
+#
+#     # --- NEW: trip stop gate during class-specific cleanup (close/aclose/finalizer path)
+#     def _cleanup(self) -> None:
+#         self._stop_event.set()
+#
+#     @staticmethod
+#     def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
+#         if isinstance(date, datetime.date):
+#             return date
+#         try:
+#             return pd.to_datetime(date).date()
+#         except ValueError as e:
+#             raise ValueError(f"Error converting {date} to datetime: {e}")
+#
+#     @staticmethod
+#     def _ensure_forward_slash(path: str) -> str:
+#         return path.rstrip("/") + "/"
+#
+#     def process(
+#         self,
+#         max_retries: int = 3,
+#         backoff_base: float = 2.0,
+#         backoff_jitter: float = 0.1,
+#         backoff_max: float = 60.0,
+#     ):
+#         """
+#         Execute the update plan with concurrency, retries and exponential backoff.
+#         Stops scheduling immediately if closed or interrupted (Ctrl-C).
+#         """
+#         overall_start = time.perf_counter()
+#         tasks = list(self.update_planner.get_tasks_by_priority())
+#         if not tasks:
+#             self.logger.info("No updates required based on the current plan.")
+#             return
+#
+#         if self.update_planner.show_progress:
+#             self.update_planner.show_update_plan()
+#
+#         try:
+#             for priority, dates in tasks:
+#                 if self._stop_event.is_set():
+#                     break
+#                 self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
+#         except KeyboardInterrupt:
+#             self.logger.warning("KeyboardInterrupt received — stopping scheduling and shutting down.", extra=self.logger_extra)
+#             self._stop_event.set()
+#             raise
+#         finally:
+#             total_time = time.perf_counter() - overall_start
+#             if self.processed_dates:
+#                 count = len(self.processed_dates)
+#                 self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)", extra=self.logger_extra)
+#                 if self.update_planner.show_progress:
+#                     self.show_benchmark_summary()
+#
+#     def _execute_task_batch(
+#         self,
+#         priority: int,
+#         dates: List[datetime.date],
+#         max_retries: int,
+#         backoff_base: float,
+#         backoff_jitter: float,
+#         backoff_max: float,
+#     ):
+#         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
+#         max_thr = min(len(dates), self.max_threads)
+#         self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.", extra=self.logger_extra)
+#
+#         # Use explicit try/finally so we can request cancel of queued tasks on teardown
+#         executor = ThreadPoolExecutor(max_workers=max_thr, thread_name_prefix="datawrapper")
+#         try:
+#             futures = {}
+#             for date in dates:
+#                 if self._stop_event.is_set():
+#                     break
+#                 try:
+#                     fut = executor.submit(
+#                         self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
+#                     )
+#                     futures[fut] = date
+#                 except RuntimeError as e:
+#                     # tolerate race: executor shutting down
+#                     if "cannot schedule new futures after shutdown" in str(e).lower():
+#                         self.logger.warning("Executor is shutting down; halting new submissions for this batch.", extra=self.logger_extra)
+#                         break
+#                     raise
+#
+#             iterator = as_completed(futures)
+#             if self.show_progress:
+#                 iterator = tqdm(iterator, total=len(futures), desc=desc)
+#
+#             for future in iterator:
+#                 try:
+#                     future.result(timeout=self.timeout)
+#                 except Exception as e:
+#                     self.logger.error(f"Permanent failure for {futures[future]}: {e}", extra=self.logger_extra)
+#         finally:
+#             # Python 3.9+: cancel_futures prevents queued tasks from starting
+#             executor.shutdown(wait=True, cancel_futures=True)
+#
+#     def _process_date_with_retry(
+#         self,
+#         date: datetime.date,
+#         max_retries: int,
+#         backoff_base: float,
+#         backoff_jitter: float,
+#         backoff_max: float,
+#     ):
+#         for attempt in range(max_retries):
+#             # --- NEW: bail out quickly if shutdown/interrupt began
+#             if self._stop_event.is_set():
+#                 raise RuntimeError("shutting_down")
+#
+#             try:
+#                 self._process_single_date(date)
+#                 return
+#             except Exception as e:
+#                 if attempt < max_retries - 1 and not self._stop_event.is_set():
+#                     base_delay = min(backoff_base ** attempt, backoff_max)
+#                     delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
+#                     self.logger.warning(
+#                         f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)",
+#                         extra=self.logger_extra
+#                     )
+#                     time.sleep(delay)
+#                 else:
+#                     self.logger.error(f"Failed processing {date} after {max_retries} attempts.", extra=self.logger_extra)
+#                     raise
+#
+#     def _process_single_date(self, date: datetime.date):
+#         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
+#         self.logger.debug(f"Processing date {date.isoformat()} for {path}", extra=self.logger_extra)
+#         if path in self.update_planner.skipped and self.update_planner.ignore_missing:
+#             self.logger.debug(f"Skipping {date} as it exists in the skipped list", extra=self.logger_extra)
+#             return
+#         full_path = f"{path}{self.parquet_filename}"
+#
+#         overall_start = time.perf_counter()
+#         try:
+#             load_start = time.perf_counter()
+#             date_filter = {f"{self.date_field}__date": {date.isoformat()}}
+#             self.logger.debug(f"{self.dataclass.__name__} is loading data for {date} with filter: {date_filter}", extra=self.logger_extra)
+#
+#             local_load_params = self.load_params.copy()
+#             local_load_params.update(date_filter)
+#
+#             with self.dataclass(**self.class_params) as local_class_instance:
+#                 df = local_class_instance.load(**local_load_params)  # expected to be Dask
+#             load_time = time.perf_counter() - load_start
+#
+#             if hasattr(local_class_instance, "total_records"):
+#                 total_records = int(local_class_instance.total_records)
+#                 self.logger.debug(f"Total records loaded: {total_records}", extra=self.logger_extra)
+#
+#             if total_records == 0:
+#                 if self.mmanifest:
+#                     self.mmanifest.record(full_path=path)
+#                 self.logger.info(f"No data found for {full_path}. Logged to missing manifest.", extra=self.logger_extra)
+#                 return
+#
+#             if total_records < 0:
+#                 self.logger.warning(f"Negative record count ({total_records}) for {full_path}.", extra=self.logger_extra)
+#                 return
+#
+#             save_start = time.perf_counter()
+#             parquet_params = {
+#                 "df_result": df,
+#                 "parquet_storage_path": path,
+#                 "fs": self.fs,
+#                 "logger": self.logger,
+#                 "debug": self.debug,
+#             }
+#             with ParquetSaver(**parquet_params) as ps:
+#                 ps.save_to_parquet(self.parquet_filename, overwrite=True)
+#             save_time = time.perf_counter() - save_start
+#
+#             total_time = time.perf_counter() - overall_start
+#             self.benchmarks[date] = {
+#                 "load_duration": load_time,
+#                 "save_duration": save_time,
+#                 "total_duration": total_time,
+#             }
+#             self._log_success(date, total_time, full_path)
+#
+#         except Exception as e:
+#             self._log_failure(date, e)
+#             raise
+#
+#     def _log_success(self, date: datetime.date, duration: float, path: str):
+#         self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}", extra=self.logger_extra)
+#         self.processed_dates.append(date)
+#
+#     def _log_failure(self, date: datetime.date, error: Exception):
+#         self.logger.error(f"Failed processing {date}: {error}", extra=self.logger_extra)
+#
+#     def show_benchmark_summary(self):
+#         if not self.benchmarks:
+#             self.logger.info("No benchmarking data to show", extra=self.logger_extra)
+#             return
+#         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
+#         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
+#         self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string(), extra=self.logger_extra)
+#
```