sibi-dst 2025.9.3__py3-none-any.whl → 2025.9.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,12 @@
1
+ # data_wrapper.py
2
+ from __future__ import annotations
3
+
1
4
  import datetime
2
5
  import random
3
6
  import threading
4
7
  import time
5
- from concurrent.futures import ThreadPoolExecutor, as_completed
6
- from typing import Type, Any, Dict, Optional, Union, List, ClassVar
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed, Future
9
+ from typing import Type, Any, Dict, Optional, Union, List, ClassVar, Callable
7
10
 
8
11
  import pandas as pd
9
12
  from tqdm import tqdm
@@ -13,6 +16,10 @@ from .parquet_saver import ParquetSaver
13
16
 
14
17
 
15
18
  class DataWrapper(ManagedResource):
19
+ """
20
+ Manages the concurrent processing of data for multiple dates based on an update plan.
21
+ Orchestrates loading data via a dataclass, processing it, and saving it to Parquet.
22
+ """
16
23
  DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
17
24
  "overwrite": 1,
18
25
  "missing_in_history": 2,
@@ -23,7 +30,7 @@ class DataWrapper(ManagedResource):
23
30
  DEFAULT_MAX_AGE_MINUTES: int = 1440
24
31
  DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
25
32
 
26
- logger_extra = {"sibi_dst_component": __name__}
33
+ logger_extra = {"sibi_dst_component": "warehouse.data_wrapper"}
27
34
 
28
35
  def __init__(
29
36
  self,
@@ -39,57 +46,87 @@ class DataWrapper(ManagedResource):
39
46
  **kwargs: Any,
40
47
  ):
41
48
  super().__init__(**kwargs)
42
- self.dataclass = dataclass
43
- self.date_field = date_field
44
- self.data_path = self._ensure_forward_slash(data_path)
45
- self.parquet_filename = parquet_filename
49
+ # ---- Core Configuration ----
50
+ self.dataclass: Type = dataclass
51
+ self.date_field: str = date_field
52
+ self.data_path: str = self._ensure_forward_slash(data_path)
53
+ self.parquet_filename: str = parquet_filename
54
+
46
55
  if self.fs is None:
47
56
  raise ValueError("DataWrapper requires a File system (fs) to be provided.")
48
- self.show_progress = show_progress
49
- self.timeout = timeout
50
- self.max_threads = max_threads
51
- self.class_params = class_params or {
57
+
58
+ # ---- Execution Parameters ----
59
+ self.show_progress: bool = show_progress
60
+ self.timeout: float = timeout
61
+ self.max_threads: int = max_threads
62
+
63
+ # ---- Parameters for Dataclass Instantiation ----
64
+ self.class_params: Dict[str, Any] = class_params or {
52
65
  "debug": self.debug,
53
66
  "logger": self.logger,
54
67
  "fs": self.fs,
55
68
  "verbose": self.verbose,
56
69
  }
57
- self.load_params = load_params or {}
70
+ self.load_params: Dict[str, Any] = load_params or {}
58
71
 
72
+ # ---- Internal State & Coordination ----
59
73
  self._lock = threading.Lock()
60
74
  self.processed_dates: List[datetime.date] = []
61
75
  self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
76
+
77
+ # ---- External Dependencies ----
62
78
  self.mmanifest = kwargs.get("mmanifest", None)
63
79
  self.update_planner = kwargs.get("update_planner", None)
64
80
 
65
- # --- NEW: stop gate tripped during cleanup/interrupt to block further scheduling/retries
81
+ # ---- Shutdown Coordination ----
82
+ # Stop gate to block further scheduling/retries during cleanup/interrupt
66
83
  self._stop_event = threading.Event()
67
- self.logger_extra.update({"action_module_name": "data_wrapper", "dataclass": self.dataclass.__name__})
68
84
 
69
- # ensure manifest is saved on context exit
85
+ # Update logger extra with specific context
86
+ self.logger_extra.update({
87
+ "action_module_name": "data_wrapper",
88
+ "dataclass": self.dataclass.__name__
89
+ })
90
+
91
+ # --------------------- Context Management ---------------------
70
92
  def __exit__(self, exc_type, exc_val, exc_tb):
93
+ """Ensure manifest is saved and resources are cleaned up on context exit."""
71
94
  if self.mmanifest:
72
- self.mmanifest.save()
73
- super().__exit__(exc_type, exc_val, exc_tb)
74
- return False
95
+ try:
96
+ self.mmanifest.save()
97
+ except Exception as e:
98
+ self.logger.error(f"Failed to save manifest in __exit__: {e}", extra=self.logger_extra)
99
+ # Call parent's __exit__ which triggers _cleanup
100
+ return super().__exit__(exc_type, exc_val, exc_tb)
75
101
 
76
- # --- NEW: trip stop gate during class-specific cleanup (close/aclose/finalizer path)
102
+ # --------------------- Cleanup ---------------------
77
103
  def _cleanup(self) -> None:
104
+ """Signal shutdown during class-specific cleanup."""
78
105
  self._stop_event.set()
79
106
 
107
+ # --------------------- Utilities ---------------------
80
108
  @staticmethod
81
109
  def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
110
+ """Convert a string or date object to a datetime.date."""
82
111
  if isinstance(date, datetime.date):
83
112
  return date
84
113
  try:
85
114
  return pd.to_datetime(date).date()
86
115
  except ValueError as e:
87
- raise ValueError(f"Error converting {date} to datetime: {e}")
116
+ raise ValueError(f"Error converting {date} to datetime: {e}") from e
88
117
 
89
118
  @staticmethod
90
119
  def _ensure_forward_slash(path: str) -> str:
120
+ """Ensure the path ends with a forward slash."""
91
121
  return path.rstrip("/") + "/"
92
122
 
123
+ def _log_extra(self, **overrides) -> Dict[str, Any]:
124
+ """Generate consistent logger extra context."""
125
+ base = self.logger_extra.copy()
126
+ base.update(overrides)
127
+ return base
128
+
129
+ # --------------------- Core Public API ---------------------
93
130
  def process(
94
131
  self,
95
132
  max_retries: int = 3,
@@ -98,21 +135,22 @@ class DataWrapper(ManagedResource):
98
135
  backoff_max: float = 60.0,
99
136
  ):
100
137
  """
101
- Execute the update plan with concurrency, retries and exponential backoff.
138
+ Execute the update plan with concurrency, retries, and exponential backoff.
102
139
  Stops scheduling immediately if closed or interrupted (Ctrl-C).
103
140
  """
104
141
  overall_start = time.perf_counter()
105
- tasks = list(self.update_planner.get_tasks_by_priority())
142
+ tasks = list(self.update_planner.get_tasks_by_priority()) if self.update_planner else []
106
143
  if not tasks:
107
- self.logger.info("No updates required based on the current plan.")
144
+ self.logger.info("No updates required based on the current plan.", extra=self.logger_extra)
108
145
  return
109
146
 
110
- if self.update_planner.show_progress:
147
+ if self.update_planner and self.update_planner.show_progress:
111
148
  self.update_planner.show_update_plan()
112
149
 
113
150
  try:
114
151
  for priority, dates in tasks:
115
152
  if self._stop_event.is_set():
153
+ self.logger.info("Stop event set, halting processing of remaining task batches.", extra=self.logger_extra)
116
154
  break
117
155
  self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
118
156
  except KeyboardInterrupt:
@@ -123,10 +161,15 @@ class DataWrapper(ManagedResource):
123
161
  total_time = time.perf_counter() - overall_start
124
162
  if self.processed_dates:
125
163
  count = len(self.processed_dates)
126
- self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)", extra=self.logger_extra)
127
- if self.update_planner.show_progress:
164
+ avg_time = total_time / count if count > 0 else 0
165
+ self.logger.info(
166
+ f"Processed {count} dates in {total_time:.1f}s (avg {avg_time:.1f}s/date)",
167
+ extra=self.logger_extra
168
+ )
169
+ if self.update_planner and self.update_planner.show_progress:
128
170
  self.show_benchmark_summary()
129
171
 
172
+ # --------------------- Task Execution ---------------------
130
173
  def _execute_task_batch(
131
174
  self,
132
175
  priority: int,
@@ -136,42 +179,70 @@ class DataWrapper(ManagedResource):
136
179
  backoff_jitter: float,
137
180
  backoff_max: float,
138
181
  ):
182
+ """Execute a batch of tasks (dates) with a given priority concurrently."""
139
183
  desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
140
184
  max_thr = min(len(dates), self.max_threads)
141
- self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.", extra=self.logger_extra)
185
+ self.logger.info(
186
+ f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.",
187
+ extra=self.logger_extra
188
+ )
142
189
 
143
- # Use explicit try/finally so we can request cancel of queued tasks on teardown
190
+ # Use explicit try/finally for executor shutdown control
144
191
  executor = ThreadPoolExecutor(max_workers=max_thr, thread_name_prefix="datawrapper")
145
192
  try:
146
- futures = {}
193
+ futures_to_dates: Dict[Future, datetime.date] = {}
194
+ submitted_count = 0
195
+
147
196
  for date in dates:
148
197
  if self._stop_event.is_set():
198
+ self.logger.debug(f"Stop event set, halting submission of new tasks in batch {priority}.", extra=self.logger_extra)
149
199
  break
150
200
  try:
151
- fut = executor.submit(
152
- self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
201
+ future = executor.submit(
202
+ self._process_date_with_retry,
203
+ date,
204
+ max_retries,
205
+ backoff_base,
206
+ backoff_jitter,
207
+ backoff_max
153
208
  )
154
- futures[fut] = date
209
+ futures_to_dates[future] = date
210
+ submitted_count += 1
155
211
  except RuntimeError as e:
156
- # tolerate race: executor shutting down
212
+ # Tolerate race: executor shutting down
157
213
  if "cannot schedule new futures after shutdown" in str(e).lower():
158
- self.logger.warning("Executor is shutting down; halting new submissions for this batch.", extra=self.logger_extra)
214
+ self.logger.warning(
215
+ "Executor is shutting down; halting new submissions for this batch.",
216
+ extra=self.logger_extra
217
+ )
159
218
  break
160
- raise
219
+ else:
220
+ # Re-raise unexpected RuntimeErrors
221
+ raise
222
+
223
+ self.logger.debug(f"Submitted {submitted_count} tasks for priority {priority}.", extra=self.logger_extra)
161
224
 
162
- iterator = as_completed(futures)
225
+ # Use as_completed for processing results as they finish
226
+ iterator = as_completed(futures_to_dates)
163
227
  if self.show_progress:
164
- iterator = tqdm(iterator, total=len(futures), desc=desc)
228
+ iterator = tqdm(iterator, total=len(futures_to_dates), desc=desc, leave=False)
165
229
 
166
230
  for future in iterator:
231
+ date = futures_to_dates[future]
167
232
  try:
233
+ # Get the result, respecting the overall timeout
168
234
  future.result(timeout=self.timeout)
169
235
  except Exception as e:
170
- self.logger.error(f"Permanent failure for {futures[future]}: {e}", extra=self.logger_extra)
236
+ # Log errors for individual date processing failures
237
+ self.logger.error(f"Permanent failure for {date}: {e}", extra=self.logger_extra)
171
238
  finally:
172
239
  # Python 3.9+: cancel_futures prevents queued tasks from starting
240
+ # Tasks already running will still complete.
241
+ # shutdown(wait=True) ensures running tasks finish before returning.
173
242
  executor.shutdown(wait=True, cancel_futures=True)
243
+ self.logger.debug(f"Executor for priority {priority} shut down.", extra=self.logger_extra)
174
244
 
245
+ # --------------------- Date Processing ---------------------
175
246
  def _process_date_with_retry(
176
247
  self,
177
248
  date: datetime.date,
@@ -180,62 +251,89 @@ class DataWrapper(ManagedResource):
180
251
  backoff_jitter: float,
181
252
  backoff_max: float,
182
253
  ):
254
+ """Process a single date with retry logic and exponential backoff."""
183
255
  for attempt in range(max_retries):
184
- # --- NEW: bail out quickly if shutdown/interrupt began
256
+ # Bail out quickly if shutdown/interrupt began
185
257
  if self._stop_event.is_set():
258
+ self.logger.debug(f"Stop event set, aborting retries for {date} (attempt {attempt + 1}).", extra=self.logger_extra)
186
259
  raise RuntimeError("shutting_down")
187
260
 
188
261
  try:
189
262
  self._process_single_date(date)
190
- return
263
+ return # Success, exit retry loop
191
264
  except Exception as e:
192
265
  if attempt < max_retries - 1 and not self._stop_event.is_set():
266
+ # Calculate delay with exponential backoff and jitter
193
267
  base_delay = min(backoff_base ** attempt, backoff_max)
194
- delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
268
+ jitter_amount = random.uniform(0.0, max(0.0, backoff_jitter))
269
+ delay = base_delay * (1 + jitter_amount)
195
270
  self.logger.warning(
196
271
  f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)",
197
272
  extra=self.logger_extra
198
273
  )
199
- time.sleep(delay)
274
+ # Respect stop event even during sleep
275
+ if self._stop_event.wait(timeout=delay):
276
+ self.logger.debug(f"Stop event set during retry sleep for {date}.", extra=self.logger_extra)
277
+ raise RuntimeError("shutting_down") from e
200
278
  else:
201
279
  self.logger.error(f"Failed processing {date} after {max_retries} attempts.", extra=self.logger_extra)
202
- raise
280
+ raise # Re-raise the last exception after max retries
203
281
 
204
282
  def _process_single_date(self, date: datetime.date):
283
+ """Process a single date: load, save to Parquet."""
284
+ # --- 1. Setup paths and logging ---
205
285
  path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
206
- self.logger.debug(f"Processing date {date.isoformat()} for {path}", extra=self.logger_extra)
207
- if path in self.update_planner.skipped and self.update_planner.ignore_missing:
208
- self.logger.debug(f"Skipping {date} as it exists in the skipped list", extra=self.logger_extra)
286
+ log_extra = self._log_extra(date_context=date.isoformat())
287
+ self.logger.debug(f"Processing date {date.isoformat()} for {path}", extra=log_extra)
288
+
289
+ # --- 2. Check if date/path should be skipped ---
290
+ if (self.update_planner and path in self.update_planner.skipped and
291
+ getattr(self.update_planner, 'ignore_missing', False)):
292
+ self.logger.debug(f"Skipping {date} as it exists in the skipped list", extra=log_extra)
209
293
  return
210
294
  full_path = f"{path}{self.parquet_filename}"
211
295
 
296
+ # --- 3. Timing ---
212
297
  overall_start = time.perf_counter()
298
+
213
299
  try:
300
+ # --- 4. Load Data ---
214
301
  load_start = time.perf_counter()
215
- date_filter = {f"{self.date_field}__date": {date.isoformat()}}
216
- self.logger.debug(f"{self.dataclass.__name__} is loading data for {date} with filter: {date_filter}", extra=self.logger_extra)
302
+ date_filter = {f"{self.date_field}__date": date.isoformat()}
303
+ self.logger.debug(f"{self.dataclass.__name__} is loading data for {date} with filter: {date_filter}", extra=log_extra)
217
304
 
305
+ # Prepare load parameters
218
306
  local_load_params = self.load_params.copy()
219
307
  local_load_params.update(date_filter)
220
308
 
309
+ # Instantiate and use the dataclass (e.g., Etl...Dc) within a context manager
221
310
  with self.dataclass(**self.class_params) as local_class_instance:
222
- df = local_class_instance.load(**local_load_params) # expected to be Dask
311
+ df = local_class_instance.load(**local_load_params) # Expected to return Dask DataFrame
312
+
223
313
  load_time = time.perf_counter() - load_start
314
+ self.logger.debug(f"{self.dataclass.__name__} data loading for {date} completed in {load_time:.2f}s", extra=log_extra)
224
315
 
316
+ # --- 5. Handle Record Count ---
317
+ total_records = -1
225
318
  if hasattr(local_class_instance, "total_records"):
226
- total_records = int(local_class_instance.total_records)
227
- self.logger.debug(f"Total records loaded: {total_records}", extra=self.logger_extra)
319
+ total_records = int(getattr(local_class_instance, "total_records", -1))
320
+ self.logger.debug(f"{self.dataclass.__name__} total records loaded: {total_records}", extra=log_extra)
228
321
 
229
322
  if total_records == 0:
323
+ # No data found, log to manifest if available
230
324
  if self.mmanifest:
231
- self.mmanifest.record(full_path=path)
232
- self.logger.info(f"No data found for {full_path}. Logged to missing manifest.", extra=self.logger_extra)
233
- return
325
+ try:
326
+ self.mmanifest.record(full_path=path)
327
+ except Exception as e:
328
+ self.logger.error(f"Failed to record missing path {path}: {e}", extra=log_extra)
329
+ self.logger.info(f"No data found for {full_path}. Logged to missing manifest.", extra=log_extra)
330
+ return # Done for this date
234
331
 
235
332
  if total_records < 0:
236
- self.logger.warning(f"Negative record count ({total_records}) for {full_path}.", extra=self.logger_extra)
237
- return
333
+ self.logger.warning(f"Negative record count ({total_records}) for {full_path}. Proceeding.", extra=log_extra)
334
+ # Continue processing even with negative count
238
335
 
336
+ # --- 6. Save to Parquet ---
239
337
  save_start = time.perf_counter()
240
338
  parquet_params = {
241
339
  "df_result": df,
@@ -243,35 +341,336 @@ class DataWrapper(ManagedResource):
243
341
  "fs": self.fs,
244
342
  "logger": self.logger,
245
343
  "debug": self.debug,
344
+ "verbose": self.verbose,
246
345
  }
346
+ self.logger.debug(f"{self.dataclass.__name__} saving to parquet started...", extra=log_extra)
247
347
  with ParquetSaver(**parquet_params) as ps:
248
348
  ps.save_to_parquet(self.parquet_filename, overwrite=True)
249
349
  save_time = time.perf_counter() - save_start
350
+ self.logger.debug(f"Parquet saving for {date} completed in {save_time:.2f}s", extra=log_extra)
250
351
 
352
+ # --- 7. Benchmarking ---
251
353
  total_time = time.perf_counter() - overall_start
252
354
  self.benchmarks[date] = {
253
355
  "load_duration": load_time,
254
356
  "save_duration": save_time,
255
357
  "total_duration": total_time,
256
358
  }
359
+
360
+ # --- 8. Log Success ---
257
361
  self._log_success(date, total_time, full_path)
258
362
 
259
363
  except Exception as e:
364
+ # --- 9. Handle Errors ---
260
365
  self._log_failure(date, e)
261
- raise
366
+ raise # Re-raise to trigger retry logic
262
367
 
368
+ # --------------------- Logging / Benchmarking ---------------------
263
369
  def _log_success(self, date: datetime.date, duration: float, path: str):
370
+ """Log a successful date processing."""
264
371
  self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}", extra=self.logger_extra)
265
- self.processed_dates.append(date)
372
+ with self._lock: # Protect the shared list
373
+ self.processed_dates.append(date)
266
374
 
267
375
  def _log_failure(self, date: datetime.date, error: Exception):
376
+ """Log a failed date processing."""
268
377
  self.logger.error(f"Failed processing {date}: {error}", extra=self.logger_extra)
269
378
 
270
379
  def show_benchmark_summary(self):
380
+ """Display a summary of processing times."""
271
381
  if not self.benchmarks:
272
382
  self.logger.info("No benchmarking data to show", extra=self.logger_extra)
273
383
  return
274
- df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
275
- df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
276
- self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string(), extra=self.logger_extra)
277
384
 
385
+ try:
386
+ df_bench = pd.DataFrame.from_records(
387
+ [{"date": d, **m} for d, m in self.benchmarks.items()]
388
+ )
389
+ if not df_bench.empty:
390
+ df_bench = df_bench.set_index("date").sort_index(
391
+ ascending=not (self.update_planner.reverse_order if self.update_planner else False)
392
+ )
393
+ summary_str = df_bench.to_string()
394
+ self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n{summary_str}", extra=self.logger_extra)
395
+ else:
396
+ self.logger.info("Benchmark DataFrame is empty.", extra=self.logger_extra)
397
+ except Exception as e:
398
+ self.logger.error(f"Error generating benchmark summary: {e}", extra=self.logger_extra)
399
+
400
+ # import datetime
401
+ # import random
402
+ # import threading
403
+ # import time
404
+ # from concurrent.futures import ThreadPoolExecutor, as_completed
405
+ # from typing import Type, Any, Dict, Optional, Union, List, ClassVar
406
+ #
407
+ # import pandas as pd
408
+ # from tqdm import tqdm
409
+ #
410
+ # from . import ManagedResource
411
+ # from .parquet_saver import ParquetSaver
412
+ #
413
+ #
414
+ # class DataWrapper(ManagedResource):
415
+ # DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
416
+ # "overwrite": 1,
417
+ # "missing_in_history": 2,
418
+ # "existing_but_stale": 3,
419
+ # "missing_outside_history": 4,
420
+ # "file_is_recent": 0,
421
+ # }
422
+ # DEFAULT_MAX_AGE_MINUTES: int = 1440
423
+ # DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
424
+ #
425
+ # logger_extra = {"sibi_dst_component": __name__}
426
+ #
427
+ # def __init__(
428
+ # self,
429
+ # dataclass: Type,
430
+ # date_field: str,
431
+ # data_path: str,
432
+ # parquet_filename: str,
433
+ # class_params: Optional[Dict] = None,
434
+ # load_params: Optional[Dict] = None,
435
+ # show_progress: bool = False,
436
+ # timeout: float = 30,
437
+ # max_threads: int = 3,
438
+ # **kwargs: Any,
439
+ # ):
440
+ # super().__init__(**kwargs)
441
+ # self.dataclass = dataclass
442
+ # self.date_field = date_field
443
+ # self.data_path = self._ensure_forward_slash(data_path)
444
+ # self.parquet_filename = parquet_filename
445
+ # if self.fs is None:
446
+ # raise ValueError("DataWrapper requires a File system (fs) to be provided.")
447
+ # self.show_progress = show_progress
448
+ # self.timeout = timeout
449
+ # self.max_threads = max_threads
450
+ # self.class_params = class_params or {
451
+ # "debug": self.debug,
452
+ # "logger": self.logger,
453
+ # "fs": self.fs,
454
+ # "verbose": self.verbose,
455
+ # }
456
+ # self.load_params = load_params or {}
457
+ #
458
+ # self._lock = threading.Lock()
459
+ # self.processed_dates: List[datetime.date] = []
460
+ # self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
461
+ # self.mmanifest = kwargs.get("mmanifest", None)
462
+ # self.update_planner = kwargs.get("update_planner", None)
463
+ #
464
+ # # --- NEW: stop gate tripped during cleanup/interrupt to block further scheduling/retries
465
+ # self._stop_event = threading.Event()
466
+ # self.logger_extra.update({"action_module_name": "data_wrapper", "dataclass": self.dataclass.__name__})
467
+ #
468
+ # # ensure manifest is saved on context exit
469
+ # def __exit__(self, exc_type, exc_val, exc_tb):
470
+ # if self.mmanifest:
471
+ # self.mmanifest.save()
472
+ # super().__exit__(exc_type, exc_val, exc_tb)
473
+ # return False
474
+ #
475
+ # # --- NEW: trip stop gate during class-specific cleanup (close/aclose/finalizer path)
476
+ # def _cleanup(self) -> None:
477
+ # self._stop_event.set()
478
+ #
479
+ # @staticmethod
480
+ # def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
481
+ # if isinstance(date, datetime.date):
482
+ # return date
483
+ # try:
484
+ # return pd.to_datetime(date).date()
485
+ # except ValueError as e:
486
+ # raise ValueError(f"Error converting {date} to datetime: {e}")
487
+ #
488
+ # @staticmethod
489
+ # def _ensure_forward_slash(path: str) -> str:
490
+ # return path.rstrip("/") + "/"
491
+ #
492
+ # def process(
493
+ # self,
494
+ # max_retries: int = 3,
495
+ # backoff_base: float = 2.0,
496
+ # backoff_jitter: float = 0.1,
497
+ # backoff_max: float = 60.0,
498
+ # ):
499
+ # """
500
+ # Execute the update plan with concurrency, retries and exponential backoff.
501
+ # Stops scheduling immediately if closed or interrupted (Ctrl-C).
502
+ # """
503
+ # overall_start = time.perf_counter()
504
+ # tasks = list(self.update_planner.get_tasks_by_priority())
505
+ # if not tasks:
506
+ # self.logger.info("No updates required based on the current plan.")
507
+ # return
508
+ #
509
+ # if self.update_planner.show_progress:
510
+ # self.update_planner.show_update_plan()
511
+ #
512
+ # try:
513
+ # for priority, dates in tasks:
514
+ # if self._stop_event.is_set():
515
+ # break
516
+ # self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
517
+ # except KeyboardInterrupt:
518
+ # self.logger.warning("KeyboardInterrupt received — stopping scheduling and shutting down.", extra=self.logger_extra)
519
+ # self._stop_event.set()
520
+ # raise
521
+ # finally:
522
+ # total_time = time.perf_counter() - overall_start
523
+ # if self.processed_dates:
524
+ # count = len(self.processed_dates)
525
+ # self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)", extra=self.logger_extra)
526
+ # if self.update_planner.show_progress:
527
+ # self.show_benchmark_summary()
528
+ #
529
+ # def _execute_task_batch(
530
+ # self,
531
+ # priority: int,
532
+ # dates: List[datetime.date],
533
+ # max_retries: int,
534
+ # backoff_base: float,
535
+ # backoff_jitter: float,
536
+ # backoff_max: float,
537
+ # ):
538
+ # desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
539
+ # max_thr = min(len(dates), self.max_threads)
540
+ # self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.", extra=self.logger_extra)
541
+ #
542
+ # # Use explicit try/finally so we can request cancel of queued tasks on teardown
543
+ # executor = ThreadPoolExecutor(max_workers=max_thr, thread_name_prefix="datawrapper")
544
+ # try:
545
+ # futures = {}
546
+ # for date in dates:
547
+ # if self._stop_event.is_set():
548
+ # break
549
+ # try:
550
+ # fut = executor.submit(
551
+ # self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
552
+ # )
553
+ # futures[fut] = date
554
+ # except RuntimeError as e:
555
+ # # tolerate race: executor shutting down
556
+ # if "cannot schedule new futures after shutdown" in str(e).lower():
557
+ # self.logger.warning("Executor is shutting down; halting new submissions for this batch.", extra=self.logger_extra)
558
+ # break
559
+ # raise
560
+ #
561
+ # iterator = as_completed(futures)
562
+ # if self.show_progress:
563
+ # iterator = tqdm(iterator, total=len(futures), desc=desc)
564
+ #
565
+ # for future in iterator:
566
+ # try:
567
+ # future.result(timeout=self.timeout)
568
+ # except Exception as e:
569
+ # self.logger.error(f"Permanent failure for {futures[future]}: {e}", extra=self.logger_extra)
570
+ # finally:
571
+ # # Python 3.9+: cancel_futures prevents queued tasks from starting
572
+ # executor.shutdown(wait=True, cancel_futures=True)
573
+ #
574
+ # def _process_date_with_retry(
575
+ # self,
576
+ # date: datetime.date,
577
+ # max_retries: int,
578
+ # backoff_base: float,
579
+ # backoff_jitter: float,
580
+ # backoff_max: float,
581
+ # ):
582
+ # for attempt in range(max_retries):
583
+ # # --- NEW: bail out quickly if shutdown/interrupt began
584
+ # if self._stop_event.is_set():
585
+ # raise RuntimeError("shutting_down")
586
+ #
587
+ # try:
588
+ # self._process_single_date(date)
589
+ # return
590
+ # except Exception as e:
591
+ # if attempt < max_retries - 1 and not self._stop_event.is_set():
592
+ # base_delay = min(backoff_base ** attempt, backoff_max)
593
+ # delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
594
+ # self.logger.warning(
595
+ # f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)",
596
+ # extra=self.logger_extra
597
+ # )
598
+ # time.sleep(delay)
599
+ # else:
600
+ # self.logger.error(f"Failed processing {date} after {max_retries} attempts.", extra=self.logger_extra)
601
+ # raise
602
+ #
603
+ # def _process_single_date(self, date: datetime.date):
604
+ # path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
605
+ # self.logger.debug(f"Processing date {date.isoformat()} for {path}", extra=self.logger_extra)
606
+ # if path in self.update_planner.skipped and self.update_planner.ignore_missing:
607
+ # self.logger.debug(f"Skipping {date} as it exists in the skipped list", extra=self.logger_extra)
608
+ # return
609
+ # full_path = f"{path}{self.parquet_filename}"
610
+ #
611
+ # overall_start = time.perf_counter()
612
+ # try:
613
+ # load_start = time.perf_counter()
614
+ # date_filter = {f"{self.date_field}__date": {date.isoformat()}}
615
+ # self.logger.debug(f"{self.dataclass.__name__} is loading data for {date} with filter: {date_filter}", extra=self.logger_extra)
616
+ #
617
+ # local_load_params = self.load_params.copy()
618
+ # local_load_params.update(date_filter)
619
+ #
620
+ # with self.dataclass(**self.class_params) as local_class_instance:
621
+ # df = local_class_instance.load(**local_load_params) # expected to be Dask
622
+ # load_time = time.perf_counter() - load_start
623
+ #
624
+ # if hasattr(local_class_instance, "total_records"):
625
+ # total_records = int(local_class_instance.total_records)
626
+ # self.logger.debug(f"Total records loaded: {total_records}", extra=self.logger_extra)
627
+ #
628
+ # if total_records == 0:
629
+ # if self.mmanifest:
630
+ # self.mmanifest.record(full_path=path)
631
+ # self.logger.info(f"No data found for {full_path}. Logged to missing manifest.", extra=self.logger_extra)
632
+ # return
633
+ #
634
+ # if total_records < 0:
635
+ # self.logger.warning(f"Negative record count ({total_records}) for {full_path}.", extra=self.logger_extra)
636
+ # return
637
+ #
638
+ # save_start = time.perf_counter()
639
+ # parquet_params = {
640
+ # "df_result": df,
641
+ # "parquet_storage_path": path,
642
+ # "fs": self.fs,
643
+ # "logger": self.logger,
644
+ # "debug": self.debug,
645
+ # }
646
+ # with ParquetSaver(**parquet_params) as ps:
647
+ # ps.save_to_parquet(self.parquet_filename, overwrite=True)
648
+ # save_time = time.perf_counter() - save_start
649
+ #
650
+ # total_time = time.perf_counter() - overall_start
651
+ # self.benchmarks[date] = {
652
+ # "load_duration": load_time,
653
+ # "save_duration": save_time,
654
+ # "total_duration": total_time,
655
+ # }
656
+ # self._log_success(date, total_time, full_path)
657
+ #
658
+ # except Exception as e:
659
+ # self._log_failure(date, e)
660
+ # raise
661
+ #
662
+ # def _log_success(self, date: datetime.date, duration: float, path: str):
663
+ # self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}", extra=self.logger_extra)
664
+ # self.processed_dates.append(date)
665
+ #
666
+ # def _log_failure(self, date: datetime.date, error: Exception):
667
+ # self.logger.error(f"Failed processing {date}: {error}", extra=self.logger_extra)
668
+ #
669
+ # def show_benchmark_summary(self):
670
+ # if not self.benchmarks:
671
+ # self.logger.info("No benchmarking data to show", extra=self.logger_extra)
672
+ # return
673
+ # df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
674
+ # df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
675
+ # self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string(), extra=self.logger_extra)
676
+ #