sibi-dst 2025.8.1__py3-none-any.whl → 2025.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,10 @@
 import datetime
+import random
 import threading
 import time
-import random
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Type, Any, Dict, Optional, Union, List, ClassVar
 
-import dask.dataframe as dd
 import pandas as pd
 from tqdm import tqdm
 
@@ -61,12 +60,21 @@ class DataWrapper(ManagedResource):
         self.mmanifest = kwargs.get("mmanifest", None)
         self.update_planner = kwargs.get("update_planner", None)
 
+        # --- NEW: stop gate tripped during cleanup/interrupt to block further scheduling/retries
+        self._stop_event = threading.Event()
+        self.extra_logger = {"action_module_name": "data_wrapper", "dataclass": self.dataclass.__name__}
+
+    # ensure manifest is saved on context exit
     def __exit__(self, exc_type, exc_val, exc_tb):
         if self.mmanifest:
             self.mmanifest.save()
         super().__exit__(exc_type, exc_val, exc_tb)
         return False
 
+    # --- NEW: trip stop gate during class-specific cleanup (close/aclose/finalizer path)
+    def _cleanup(self) -> None:
+        self._stop_event.set()
+
     @staticmethod
     def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
         if isinstance(date, datetime.date):
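
The `_stop_event` added above is a cooperative-cancellation gate: a single shared `threading.Event` that cleanup and interrupt paths set once, and that every scheduling/retry loop polls before doing more work. A minimal standalone sketch of the pattern (the `Worker` class below is illustrative, not sibi-dst API):

```python
import threading
import time

class Worker:
    """Illustrative stop-gate pattern; not part of sibi-dst."""

    def __init__(self):
        # Set once by any cleanup/interrupt path; polled by worker loops.
        self._stop_event = threading.Event()

    def _cleanup(self) -> None:
        # Event.set() is idempotent and thread-safe, so this is safe to call
        # from __exit__, finalizers, or signal handlers.
        self._stop_event.set()

    def run(self, items) -> int:
        done = 0
        for _ in items:
            if self._stop_event.is_set():  # bail out between units of work
                break
            time.sleep(0.01)  # stand-in for real per-item work
            done += 1
        return done

worker = Worker()
t = threading.Thread(target=worker.run, args=(range(1000),))
t.start()
time.sleep(0.05)
worker._cleanup()  # trips the gate; run() exits at its next poll
t.join()
```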
@@ -89,12 +97,7 @@ class DataWrapper(ManagedResource):
     ):
         """
         Execute the update plan with concurrency, retries and exponential backoff.
-
-        Args:
-            max_retries: attempts per date.
-            backoff_base: base for exponential backoff (delay = base**attempt).
-            backoff_jitter: multiplicative jitter factor in [0, backoff_jitter].
-            backoff_max: maximum backoff seconds per attempt (before jitter).
+        Stops scheduling immediately if closed or interrupted (Ctrl-C).
         """
         overall_start = time.perf_counter()
         tasks = list(self.update_planner.get_tasks_by_priority())
@@ -105,15 +108,22 @@ class DataWrapper(ManagedResource):
         if self.update_planner.show_progress:
             self.update_planner.show_update_plan()
 
-        for priority, dates in tasks:
-            self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
-
-        total_time = time.perf_counter() - overall_start
-        if self.processed_dates:
-            count = len(self.processed_dates)
-            self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
-            if self.update_planner.show_progress:
-                self.show_benchmark_summary()
+        try:
+            for priority, dates in tasks:
+                if self._stop_event.is_set():
+                    break
+                self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
+        except KeyboardInterrupt:
+            self.logger.warning("KeyboardInterrupt received; stopping scheduling and shutting down.")
+            self._stop_event.set()
+            raise
+        finally:
+            total_time = time.perf_counter() - overall_start
+            if self.processed_dates:
+                count = len(self.processed_dates)
+                self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
+                if self.update_planner.show_progress:
+                    self.show_benchmark_summary()
 
     def _execute_task_batch(
         self,
@@ -126,15 +136,27 @@ class DataWrapper(ManagedResource):
     ):
         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
         max_thr = min(len(dates), self.max_threads)
-        self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
-
-        with ThreadPoolExecutor(max_workers=max_thr) as executor:
-            futures = {
-                executor.submit(
-                    self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
-                ): date
-                for date in dates
-            }
+        self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.", extra=self.extra_logger)
+
+        # Use explicit try/finally so we can request cancel of queued tasks on teardown
+        executor = ThreadPoolExecutor(max_workers=max_thr, thread_name_prefix="datawrapper")
+        try:
+            futures = {}
+            for date in dates:
+                if self._stop_event.is_set():
+                    break
+                try:
+                    fut = executor.submit(
+                        self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
+                    )
+                    futures[fut] = date
+                except RuntimeError as e:
+                    # tolerate race: executor shutting down
+                    if "cannot schedule new futures after shutdown" in str(e).lower():
+                        self.logger.warning("Executor is shutting down; halting new submissions for this batch.")
+                        break
+                    raise
+
             iterator = as_completed(futures)
             if self.show_progress:
                 iterator = tqdm(iterator, total=len(futures), desc=desc)
@@ -143,7 +165,10 @@ class DataWrapper(ManagedResource):
                 try:
                     future.result(timeout=self.timeout)
                 except Exception as e:
-                    self.logger.error(f"Permanent failure for {futures[future]}: {e}")
+                    self.logger.error(f"Permanent failure for {futures[future]}: {e}", extra=self.extra_logger)
+        finally:
+            # Python 3.9+: cancel_futures prevents queued tasks from starting
+            executor.shutdown(wait=True, cancel_futures=True)
 
     def _process_date_with_retry(
         self,
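
Two `concurrent.futures` details carry the hunks above: `submit()` on an executor that has begun shutting down raises `RuntimeError("cannot schedule new futures after shutdown")`, which is exactly the race the inner `except RuntimeError` tolerates, and `shutdown(wait=True, cancel_futures=True)` (Python 3.9+) cancels queued-but-unstarted futures instead of draining them. A small standalone sketch of that teardown behavior:

```python
import time
from concurrent.futures import ThreadPoolExecutor

def slow(n: int) -> int:
    time.sleep(0.2)  # stand-in for real work
    return n

executor = ThreadPoolExecutor(max_workers=1)
futures = [executor.submit(slow, i) for i in range(10)]
try:
    print(futures[0].result())  # the running task completes normally
finally:
    # Python 3.9+: queued tasks are cancelled rather than executed.
    executor.shutdown(wait=True, cancel_futures=True)

print(sum(f.cancelled() for f in futures))  # most queued futures report cancelled
try:
    executor.submit(slow, 99)
except RuntimeError as e:
    print(e)  # "cannot schedule new futures after shutdown"
```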
@@ -154,11 +179,15 @@ class DataWrapper(ManagedResource):
         backoff_max: float,
     ):
         for attempt in range(max_retries):
+            # --- NEW: bail out quickly if shutdown/interrupt began
+            if self._stop_event.is_set():
+                raise RuntimeError("shutting_down")
+
             try:
                 self._process_single_date(date)
                 return
             except Exception as e:
-                if attempt < max_retries - 1:
+                if attempt < max_retries - 1 and not self._stop_event.is_set():
                     base_delay = min(backoff_base ** attempt, backoff_max)
                     delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
                     self.logger.warning(
@@ -166,7 +195,8 @@ class DataWrapper(ManagedResource):
                     )
                     time.sleep(delay)
                 else:
-                    self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
+                    self.logger.error(f"Failed processing {date} after {max_retries} attempts.", extra=self.extra_logger)
+                    raise
 
     def _process_single_date(self, date: datetime.date):
         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
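
For reference, the retry delay used above is `min(backoff_base ** attempt, backoff_max) * (1 + U)` with `U` drawn uniformly from `[0, backoff_jitter]`; with the defaults elsewhere in this diff (base 2.0, jitter 0.1, max 60.0) the pre-jitter sequence is 1s, 2s, 4s, ..., capped at 60s. The same computation in isolation:

```python
import random

def backoff_delay(attempt: int, backoff_base: float = 2.0,
                  backoff_jitter: float = 0.1, backoff_max: float = 60.0) -> float:
    # Exponential growth, capped *before* jitter is applied.
    base_delay = min(backoff_base ** attempt, backoff_max)
    # Multiplicative jitter in [1, 1 + jitter] de-synchronizes concurrent retries.
    return base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))

for attempt in range(8):
    print(attempt, f"{backoff_delay(attempt):.2f}s")  # 0 -> ~1s ... 7 -> <= 66s
```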
@@ -228,26 +258,28 @@ class DataWrapper(ManagedResource):
             raise
 
     def _log_success(self, date: datetime.date, duration: float, path: str):
-        self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}")
+        self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}", extra=self.extra_logger)
         self.processed_dates.append(date)
 
     def _log_failure(self, date: datetime.date, error: Exception):
-        self.logger.error(f"Failed processing {date}: {error}")
+        self.logger.error(f"Failed processing {date}: {error}", extra=self.extra_logger)
 
     def show_benchmark_summary(self):
         if not self.benchmarks:
-            self.logger.info("No benchmarking data to show")
+            self.logger.info("No benchmarking data to show", extra=self.extra_logger)
             return
         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
-        self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
+        self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string(), extra=self.extra_logger)
 
 # import datetime
 # import threading
 # import time
+# import random
 # from concurrent.futures import ThreadPoolExecutor, as_completed
 # from typing import Type, Any, Dict, Optional, Union, List, ClassVar
 #
+# import dask.dataframe as dd
 # import pandas as pd
 # from tqdm import tqdm
 #
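
The `extra=self.extra_logger` argument added throughout this hunk threads structured context into each record via the stdlib `logging` `extra` mechanism: the dict keys become attributes on the emitted `LogRecord`, available to formatters and filters. A minimal sketch with a plain stdlib logger (the key names mirror this diff; the formatter is an assumption):

```python
import logging

logging.basicConfig(
    # Any attribute injected via `extra` can appear in the format string.
    format="%(levelname)s %(action_module_name)s/%(dataclass)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("data_wrapper")

extra_logger = {"action_module_name": "data_wrapper", "dataclass": "MyDataClass"}
logger.info("Completed 2025-08-01 in 1.2s", extra=extra_logger)
# -> INFO data_wrapper/MyDataClass: Completed 2025-08-01 in 1.2s
```

Note that a formatter referencing these keys requires every record passing through it to supply them, which is why the diff adds `extra=` at each call site rather than to a single logger.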
@@ -261,23 +293,23 @@ class DataWrapper(ManagedResource):
 #         "missing_in_history": 2,
 #         "existing_but_stale": 3,
 #         "missing_outside_history": 4,
-#         "file_is_recent": 0
+#         "file_is_recent": 0,
 #     }
 #     DEFAULT_MAX_AGE_MINUTES: int = 1440
 #     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
 #
 #     def __init__(
-#             self,
-#             dataclass: Type,
-#             date_field: str,
-#             data_path: str,
-#             parquet_filename: str,
-#             class_params: Optional[Dict] = None,
-#             load_params: Optional[Dict] = None,
-#             show_progress: bool = False,
-#             timeout: float = 30,
-#             max_threads: int = 3,
-#             **kwargs: Any,
+#         self,
+#         dataclass: Type,
+#         date_field: str,
+#         data_path: str,
+#         parquet_filename: str,
+#         class_params: Optional[Dict] = None,
+#         load_params: Optional[Dict] = None,
+#         show_progress: bool = False,
+#         timeout: float = 30,
+#         max_threads: int = 3,
+#         **kwargs: Any,
 #     ):
 #         super().__init__(**kwargs)
 #         self.dataclass = dataclass
@@ -285,15 +317,15 @@ class DataWrapper(ManagedResource):
 #         self.data_path = self._ensure_forward_slash(data_path)
 #         self.parquet_filename = parquet_filename
 #         if self.fs is None:
-#             raise ValueError("Datawrapper requires a File system (fs) to be provided .")
+#             raise ValueError("DataWrapper requires a File system (fs) to be provided.")
 #         self.show_progress = show_progress
 #         self.timeout = timeout
 #         self.max_threads = max_threads
 #         self.class_params = class_params or {
-#             'debug': self.debug,
-#             'logger': self.logger,
-#             'fs': self.fs,
-#             'verbose': self.verbose,
+#             "debug": self.debug,
+#             "logger": self.logger,
+#             "fs": self.fs,
+#             "verbose": self.verbose,
 #         }
 #         self.load_params = load_params or {}
 #
@@ -304,7 +336,6 @@ class DataWrapper(ManagedResource):
 #         self.update_planner = kwargs.get("update_planner", None)
 #
 #     def __exit__(self, exc_type, exc_val, exc_tb):
-#         """Context manager exit"""
 #         if self.mmanifest:
 #             self.mmanifest.save()
 #         super().__exit__(exc_type, exc_val, exc_tb)
@@ -321,10 +352,24 @@ class DataWrapper(ManagedResource):
 #
 #     @staticmethod
 #     def _ensure_forward_slash(path: str) -> str:
-#         return path.rstrip('/') + '/'
+#         return path.rstrip("/") + "/"
 #
-#     def process(self, max_retries: int = 3):
-#         """Process updates with priority-based execution, retries, benchmarking and progress updates"""
+#     def process(
+#         self,
+#         max_retries: int = 3,
+#         backoff_base: float = 2.0,
+#         backoff_jitter: float = 0.1,
+#         backoff_max: float = 60.0,
+#     ):
+#         """
+#         Execute the update plan with concurrency, retries and exponential backoff.
+#
+#         Args:
+#             max_retries: attempts per date.
+#             backoff_base: base for exponential backoff (delay = base**attempt).
+#             backoff_jitter: multiplicative jitter factor in [0, backoff_jitter].
+#             backoff_max: maximum backoff seconds per attempt (before jitter).
+#         """
 #         overall_start = time.perf_counter()
 #         tasks = list(self.update_planner.get_tasks_by_priority())
 #         if not tasks:
@@ -335,7 +380,7 @@ class DataWrapper(ManagedResource):
 #             self.update_planner.show_update_plan()
 #
 #         for priority, dates in tasks:
-#             self._execute_task_batch(priority, dates, max_retries)
+#             self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
 #
 #         total_time = time.perf_counter() - overall_start
 #         if self.processed_dates:
@@ -344,14 +389,26 @@ class DataWrapper(ManagedResource):
 #             if self.update_planner.show_progress:
 #                 self.show_benchmark_summary()
 #
-#     def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
-#         """Executes a single batch of tasks (dates) using a thread pool."""
+#     def _execute_task_batch(
+#         self,
+#         priority: int,
+#         dates: List[datetime.date],
+#         max_retries: int,
+#         backoff_base: float,
+#         backoff_jitter: float,
+#         backoff_max: float,
+#     ):
 #         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
 #         max_thr = min(len(dates), self.max_threads)
 #         self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
 #
 #         with ThreadPoolExecutor(max_workers=max_thr) as executor:
-#             futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
+#             futures = {
+#                 executor.submit(
+#                     self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
+#                 ): date
+#                 for date in dates
+#             }
 #             iterator = as_completed(futures)
 #             if self.show_progress:
 #                 iterator = tqdm(iterator, total=len(futures), desc=desc)
@@ -362,22 +419,30 @@ class DataWrapper(ManagedResource):
 #                 except Exception as e:
 #                     self.logger.error(f"Permanent failure for {futures[future]}: {e}")
 #
-#     def _process_date_with_retry(self, date: datetime.date, max_retries: int):
-#         """Wrapper to apply retry logic to single date processing."""
+#     def _process_date_with_retry(
+#         self,
+#         date: datetime.date,
+#         max_retries: int,
+#         backoff_base: float,
+#         backoff_jitter: float,
+#         backoff_max: float,
+#     ):
 #         for attempt in range(max_retries):
 #             try:
 #                 self._process_single_date(date)
 #                 return
 #             except Exception as e:
 #                 if attempt < max_retries - 1:
-#                     self.logger.warning(f"Retry {attempt + 1}/{max_retries} for {date}: {e}")
-#                     time.sleep(2 ** attempt)  # Exponential backoff
+#                     base_delay = min(backoff_base ** attempt, backoff_max)
+#                     delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
+#                     self.logger.warning(
+#                         f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)"
+#                     )
+#                     time.sleep(delay)
 #                 else:
 #                     self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
-#                     # raise
 #
 #     def _process_single_date(self, date: datetime.date):
-#         """Core date processing logic with load/save timing and thread reporting"""
 #         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
 #         self.logger.debug(f"Processing date {date.isoformat()} for {path}")
 #         if path in self.update_planner.skipped and self.update_planner.ignore_missing:
@@ -385,74 +450,69 @@ class DataWrapper(ManagedResource):
 #             return
 #         full_path = f"{path}{self.parquet_filename}"
 #
-#         # thread_name = threading.current_thread().name
-#         # self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
-#
 #         overall_start = time.perf_counter()
 #         try:
 #             load_start = time.perf_counter()
 #             date_filter = {f"{self.date_field}__date": {date.isoformat()}}
 #             self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
-#             # Load data using the dataclass with the provided date filter
-#             # Create a copy to avoid mutating the shared instance dictionary
+#
 #             local_load_params = self.load_params.copy()
 #             local_load_params.update(date_filter)
+#
 #             with self.dataclass(**self.class_params) as local_class_instance:
-#                 df = local_class_instance.load(**local_load_params)
+#                 df = local_class_instance.load(**local_load_params)  # expected to be Dask
 #                 load_time = time.perf_counter() - load_start
 #
 #                 if hasattr(local_class_instance, "total_records"):
-#                     self.logger.debug(
-#                         f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
-#                     if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
+#                     total_records = int(local_class_instance.total_records)
+#                     self.logger.debug(f"Total records loaded: {total_records}")
+#
+#                     if total_records == 0:
 #                         if self.mmanifest:
-#                             self.mmanifest.record(
-#                                 full_path=path
-#                             )
+#                             self.mmanifest.record(full_path=path)
 #                         self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
-#                     elif int(local_class_instance.total_records) < 0:
-#                         self.logger.warning(
-#                             f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
-#                             "This may indicate an error in the data loading process."
-#                         )
-#                     else:
-#                         save_start = time.perf_counter()
-#                         parquet_params ={
-#                             "df_result": df,
-#                             "parquet_storage_path": path,
-#                             "fs": self.fs,
-#                             "logger": self.logger,
-#                             "debug": self.debug,
-#                         }
-#                         with ParquetSaver(**parquet_params) as ps:
-#                             ps.save_to_parquet(self.parquet_filename, overwrite=True)
-#                         save_time = time.perf_counter() - save_start
-#
-#                         total_time = time.perf_counter() - overall_start
-#                         self.benchmarks[date] = {
-#                             "load_duration": load_time,
-#                             "save_duration": save_time,
-#                             "total_duration": total_time
-#                         }
-#                         self._log_success(date, total_time, full_path)
+#                         return
+#
+#                     if total_records < 0:
+#                         self.logger.warning(f"Negative record count ({total_records}) for {full_path}.")
+#                         return
+#
+#                 save_start = time.perf_counter()
+#                 parquet_params = {
+#                     "df_result": df,
+#                     "parquet_storage_path": path,
+#                     "fs": self.fs,
+#                     "logger": self.logger,
+#                     "debug": self.debug,
+#                 }
+#                 with ParquetSaver(**parquet_params) as ps:
+#                     ps.save_to_parquet(self.parquet_filename, overwrite=True)
+#                 save_time = time.perf_counter() - save_start
+#
+#                 total_time = time.perf_counter() - overall_start
+#                 self.benchmarks[date] = {
+#                     "load_duration": load_time,
+#                     "save_duration": save_time,
+#                     "total_duration": total_time,
+#                 }
+#                 self._log_success(date, total_time, full_path)
+#
 #         except Exception as e:
 #             self._log_failure(date, e)
 #             raise
 #
 #     def _log_success(self, date: datetime.date, duration: float, path: str):
-#         msg = f"Completed {date} in {duration:.1f}s | Saved to {path}"
-#         self.logger.info(msg)
+#         self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}")
 #         self.processed_dates.append(date)
 #
 #     def _log_failure(self, date: datetime.date, error: Exception):
-#         msg = f"Failed processing {date}: {error}"
-#         self.logger.error(msg)
+#         self.logger.error(f"Failed processing {date}: {error}")
 #
 #     def show_benchmark_summary(self):
-#         """Display a summary of load/save timings per date"""
 #         if not self.benchmarks:
 #             self.logger.info("No benchmarking data to show")
 #             return
 #         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
 #         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
 #         self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
+#