sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/__init__.py +3 -2
  3. sibi_dst/df_helper/_artifact_updater_async.py +238 -0
  4. sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
  5. sibi_dst/df_helper/_df_helper.py +418 -118
  6. sibi_dst/df_helper/_parquet_artifact.py +275 -283
  7. sibi_dst/df_helper/_parquet_reader.py +9 -10
  8. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  9. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  10. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  12. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  13. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  14. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  15. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  16. sibi_dst/utils/__init__.py +2 -0
  17. sibi_dst/utils/base.py +235 -100
  18. sibi_dst/utils/business_days.py +248 -0
  19. sibi_dst/utils/clickhouse_writer.py +472 -206
  20. sibi_dst/utils/data_utils.py +139 -186
  21. sibi_dst/utils/data_wrapper.py +392 -88
  22. sibi_dst/utils/date_utils.py +711 -393
  23. sibi_dst/utils/df_utils.py +193 -213
  24. sibi_dst/utils/file_age_checker.py +301 -0
  25. sibi_dst/utils/file_utils.py +3 -2
  26. sibi_dst/utils/filepath_generator.py +314 -152
  27. sibi_dst/utils/log_utils.py +581 -242
  28. sibi_dst/utils/manifest_manager.py +60 -76
  29. sibi_dst/utils/parquet_saver.py +33 -27
  30. sibi_dst/utils/periods.py +42 -0
  31. sibi_dst/utils/phone_formatter.py +88 -95
  32. sibi_dst/utils/update_planner.py +180 -178
  33. sibi_dst/utils/webdav_client.py +116 -166
  34. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
  35. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
  36. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
  37. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
@@ -1,4 +1,5 @@
  import datetime
+ import random
  import threading
  import time
  from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -17,23 +18,23 @@ class DataWrapper(ManagedResource):
  "missing_in_history": 2,
  "existing_but_stale": 3,
  "missing_outside_history": 4,
- "file_is_recent": 0
+ "file_is_recent": 0,
  }
  DEFAULT_MAX_AGE_MINUTES: int = 1440
  DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30

  def __init__(
- self,
- dataclass: Type,
- date_field: str,
- data_path: str,
- parquet_filename: str,
- class_params: Optional[Dict] = None,
- load_params: Optional[Dict] = None,
- show_progress: bool = False,
- timeout: float = 30,
- max_threads: int = 3,
- **kwargs: Any,
+ self,
+ dataclass: Type,
+ date_field: str,
+ data_path: str,
+ parquet_filename: str,
+ class_params: Optional[Dict] = None,
+ load_params: Optional[Dict] = None,
+ show_progress: bool = False,
+ timeout: float = 30,
+ max_threads: int = 3,
+ **kwargs: Any,
  ):
  super().__init__(**kwargs)
  self.dataclass = dataclass
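
Note: the constructor body is unchanged apart from formatting. A minimal construction sketch follows, assuming a hypothetical MyDataset loader class and an fsspec filesystem; the import path for DataWrapper is assumed from the package layout, not confirmed by this diff:

import fsspec
from sibi_dst.utils import DataWrapper  # assumed export; adjust to the actual package path

fs = fsspec.filesystem("file")          # any fsspec-compatible filesystem; fs is mandatory
wrapper = DataWrapper(
    dataclass=MyDataset,                # hypothetical loader exposing load(**filters) and total_records
    date_field="created_at",
    data_path="/data/events/",
    parquet_filename="events.parquet",
    fs=fs,                              # omitting fs raises ValueError (see the check below)
    show_progress=True,
)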
@@ -41,15 +42,15 @@ class DataWrapper(ManagedResource):
  self.data_path = self._ensure_forward_slash(data_path)
  self.parquet_filename = parquet_filename
  if self.fs is None:
- raise ValueError("Datawrapper requires a File system (fs) to be provided .")
+ raise ValueError("DataWrapper requires a File system (fs) to be provided.")
  self.show_progress = show_progress
  self.timeout = timeout
  self.max_threads = max_threads
  self.class_params = class_params or {
- 'debug': self.debug,
- 'logger': self.logger,
- 'fs': self.fs,
- 'verbose': self.verbose,
+ "debug": self.debug,
+ "logger": self.logger,
+ "fs": self.fs,
+ "verbose": self.verbose,
  }
  self.load_params = load_params or {}

@@ -59,13 +60,21 @@ class DataWrapper(ManagedResource):
  self.mmanifest = kwargs.get("mmanifest", None)
  self.update_planner = kwargs.get("update_planner", None)

+ # --- NEW: stop gate tripped during cleanup/interrupt to block further scheduling/retries
+ self._stop_event = threading.Event()
+ self.extra_logger = {"action_module_name": "data_wrapper", "dataclass": self.dataclass.__name__}
+
+ # ensure manifest is saved on context exit
  def __exit__(self, exc_type, exc_val, exc_tb):
- """Context manager exit"""
  if self.mmanifest:
  self.mmanifest.save()
  super().__exit__(exc_type, exc_val, exc_tb)
  return False

+ # --- NEW: trip stop gate during class-specific cleanup (close/aclose/finalizer path)
+ def _cleanup(self) -> None:
+ self._stop_event.set()
+
  @staticmethod
  def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
  if isinstance(date, datetime.date):
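
The new _stop_event / _cleanup() pair is an ordinary threading.Event used as a stop gate: once it is set (during cleanup or after an interrupt), the scheduling and retry loops further down refuse to start new work. A minimal, self-contained sketch of the same pattern, independent of this class:

import threading
import time
from concurrent.futures import ThreadPoolExecutor

stop = threading.Event()

def task(i):
    if stop.is_set():                  # bail out before doing any work
        raise RuntimeError("shutting_down")
    time.sleep(0.1)
    return i

with ThreadPoolExecutor(max_workers=2) as pool:
    futures = [pool.submit(task, i) for i in range(4)]
    stop.set()                         # tasks that start after this point see the gate and refuse to run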
@@ -77,10 +86,19 @@ class DataWrapper(ManagedResource):

  @staticmethod
  def _ensure_forward_slash(path: str) -> str:
- return path.rstrip('/') + '/'
+ return path.rstrip("/") + "/"

- def process(self, max_retries: int = 3):
- """Process updates with priority-based execution, retries, benchmarking and progress updates"""
+ def process(
+ self,
+ max_retries: int = 3,
+ backoff_base: float = 2.0,
+ backoff_jitter: float = 0.1,
+ backoff_max: float = 60.0,
+ ):
+ """
+ Execute the update plan with concurrency, retries and exponential backoff.
+ Stops scheduling immediately if closed or interrupted (Ctrl-C).
+ """
  overall_start = time.perf_counter()
  tasks = list(self.update_planner.get_tasks_by_priority())
  if not tasks:
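
With the widened signature, callers can tune the retry schedule instead of relying on the old fixed 2 ** attempt sleep. A hedged usage sketch, continuing the hypothetical wrapper object from the earlier snippet:

# Retry each failing date up to 5 times; sleep min(2.0 ** attempt, 30.0) seconds
# plus up to 10% random jitter before each retry.
wrapper.process(
    max_retries=5,
    backoff_base=2.0,
    backoff_jitter=0.1,
    backoff_max=30.0,
)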
@@ -90,24 +108,55 @@ class DataWrapper(ManagedResource):
  if self.update_planner.show_progress:
  self.update_planner.show_update_plan()

- for priority, dates in tasks:
- self._execute_task_batch(priority, dates, max_retries)
-
- total_time = time.perf_counter() - overall_start
- if self.processed_dates:
- count = len(self.processed_dates)
- self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
- if self.update_planner.show_progress:
- self.show_benchmark_summary()
+ try:
+ for priority, dates in tasks:
+ if self._stop_event.is_set():
+ break
+ self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
+ except KeyboardInterrupt:
+ self.logger.warning("KeyboardInterrupt received stopping scheduling and shutting down.")
+ self._stop_event.set()
+ raise
+ finally:
+ total_time = time.perf_counter() - overall_start
+ if self.processed_dates:
+ count = len(self.processed_dates)
+ self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
+ if self.update_planner.show_progress:
+ self.show_benchmark_summary()

- def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
- """Executes a single batch of tasks (dates) using a thread pool."""
+ def _execute_task_batch(
+ self,
+ priority: int,
+ dates: List[datetime.date],
+ max_retries: int,
+ backoff_base: float,
+ backoff_jitter: float,
+ backoff_max: float,
+ ):
  desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
  max_thr = min(len(dates), self.max_threads)
- self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
+ self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.", extra=self.extra_logger)
+
+ # Use explicit try/finally so we can request cancel of queued tasks on teardown
+ executor = ThreadPoolExecutor(max_workers=max_thr, thread_name_prefix="datawrapper")
+ try:
+ futures = {}
+ for date in dates:
+ if self._stop_event.is_set():
+ break
+ try:
+ fut = executor.submit(
+ self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
+ )
+ futures[fut] = date
+ except RuntimeError as e:
+ # tolerate race: executor shutting down
+ if "cannot schedule new futures after shutdown" in str(e).lower():
+ self.logger.warning("Executor is shutting down; halting new submissions for this batch.")
+ break
+ raise

- with ThreadPoolExecutor(max_workers=max_thr) as executor:
- futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
  iterator = as_completed(futures)
  if self.show_progress:
  iterator = tqdm(iterator, total=len(futures), desc=desc)
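
The except RuntimeError branch above guards against a benign race: CPython's ThreadPoolExecutor refuses new work once shutdown has started, and the message it raises is exactly the string being matched. A standalone illustration:

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=1)
pool.shutdown(wait=True)
try:
    pool.submit(print, "too late")
except RuntimeError as e:
    print(e)  # cannot schedule new futures after shutdown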
@@ -116,24 +165,40 @@ class DataWrapper(ManagedResource):
  try:
  future.result(timeout=self.timeout)
  except Exception as e:
- self.logger.error(f"Permanent failure for {futures[future]}: {e}")
+ self.logger.error(f"Permanent failure for {futures[future]}: {e}", extra=self.extra_logger)
+ finally:
+ # Python 3.9+: cancel_futures prevents queued tasks from starting
+ executor.shutdown(wait=True, cancel_futures=True)

- def _process_date_with_retry(self, date: datetime.date, max_retries: int):
- """Wrapper to apply retry logic to single date processing."""
+ def _process_date_with_retry(
+ self,
+ date: datetime.date,
+ max_retries: int,
+ backoff_base: float,
+ backoff_jitter: float,
+ backoff_max: float,
+ ):
  for attempt in range(max_retries):
+ # --- NEW: bail out quickly if shutdown/interrupt began
+ if self._stop_event.is_set():
+ raise RuntimeError("shutting_down")
+
  try:
  self._process_single_date(date)
  return
  except Exception as e:
- if attempt < max_retries - 1:
- self.logger.warning(f"Retry {attempt + 1}/{max_retries} for {date}: {e}")
- time.sleep(2 ** attempt) # Exponential backoff
+ if attempt < max_retries - 1 and not self._stop_event.is_set():
+ base_delay = min(backoff_base ** attempt, backoff_max)
+ delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
+ self.logger.warning(
+ f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)"
+ )
+ time.sleep(delay)
  else:
- self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
- # raise
+ self.logger.error(f"Failed processing {date} after {max_retries} attempts.", extra=self.extra_logger)
+ raise

  def _process_single_date(self, date: datetime.date):
- """Core date processing logic with load/save timing and thread reporting"""
  path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
  self.logger.debug(f"Processing date {date.isoformat()} for {path}")
  if path in self.update_planner.skipped and self.update_planner.ignore_missing:
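
The retry delay now follows min(backoff_base ** attempt, backoff_max) scaled by a random factor in [1, 1 + backoff_jitter], rather than a bare 2 ** attempt. A small sketch that reproduces the formula and prints the delays for the defaults:

import random

def backoff_delay(attempt, base=2.0, jitter=0.1, cap=60.0):
    base_delay = min(base ** attempt, cap)               # exponential, capped
    return base_delay * (1 + random.uniform(0.0, max(0.0, jitter)))

for attempt in range(3):
    # attempts 0, 1, 2 -> roughly 1s, 2s, 4s before jitter
    print(attempt, round(backoff_delay(attempt), 2))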
@@ -141,74 +206,313 @@ class DataWrapper(ManagedResource):
  return
  full_path = f"{path}{self.parquet_filename}"

- # thread_name = threading.current_thread().name
- # self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
-
  overall_start = time.perf_counter()
  try:
  load_start = time.perf_counter()
  date_filter = {f"{self.date_field}__date": {date.isoformat()}}
  self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
- # Load data using the dataclass with the provided date filter
- # Create a copy to avoid mutating the shared instance dictionary
+
  local_load_params = self.load_params.copy()
  local_load_params.update(date_filter)
+
  with self.dataclass(**self.class_params) as local_class_instance:
- df = local_class_instance.load(**local_load_params)
+ df = local_class_instance.load(**local_load_params) # expected to be Dask
  load_time = time.perf_counter() - load_start

  if hasattr(local_class_instance, "total_records"):
- self.logger.debug(
- f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
- if int(local_class_instance.total_records) == 0: # If no records were loaded but not due to an error
+ total_records = int(local_class_instance.total_records)
+ self.logger.debug(f"Total records loaded: {total_records}")
+
+ if total_records == 0:
  if self.mmanifest:
- self.mmanifest.record(
- full_path=path
- )
+ self.mmanifest.record(full_path=path)
  self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
- elif int(local_class_instance.total_records) < 0:
- self.logger.warning(
- f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
- "This may indicate an error in the data loading process."
- )
- else:
- save_start = time.perf_counter()
- parquet_params ={
- "df_result": df,
- "parquet_storage_path": path,
- "fs": self.fs,
- "logger": self.logger,
- "debug": self.debug,
- }
- with ParquetSaver(**parquet_params) as ps:
- ps.save_to_parquet(self.parquet_filename, overwrite=True)
- save_time = time.perf_counter() - save_start
-
- total_time = time.perf_counter() - overall_start
- self.benchmarks[date] = {
- "load_duration": load_time,
- "save_duration": save_time,
- "total_duration": total_time
- }
- self._log_success(date, total_time, full_path)
+ return
+
+ if total_records < 0:
+ self.logger.warning(f"Negative record count ({total_records}) for {full_path}.")
+ return
+
+ save_start = time.perf_counter()
+ parquet_params = {
+ "df_result": df,
+ "parquet_storage_path": path,
+ "fs": self.fs,
+ "logger": self.logger,
+ "debug": self.debug,
+ }
+ with ParquetSaver(**parquet_params) as ps:
+ ps.save_to_parquet(self.parquet_filename, overwrite=True)
+ save_time = time.perf_counter() - save_start
+
+ total_time = time.perf_counter() - overall_start
+ self.benchmarks[date] = {
+ "load_duration": load_time,
+ "save_duration": save_time,
+ "total_duration": total_time,
+ }
+ self._log_success(date, total_time, full_path)
+
  except Exception as e:
  self._log_failure(date, e)
  raise

  def _log_success(self, date: datetime.date, duration: float, path: str):
- msg = f"Completed {date} in {duration:.1f}s | Saved to {path}"
- self.logger.info(msg)
+ self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}", extra=self.extra_logger)
  self.processed_dates.append(date)

  def _log_failure(self, date: datetime.date, error: Exception):
- msg = f"Failed processing {date}: {error}"
- self.logger.error(msg)
+ self.logger.error(f"Failed processing {date}: {error}", extra=self.extra_logger)

  def show_benchmark_summary(self):
- """Display a summary of load/save timings per date"""
  if not self.benchmarks:
- self.logger.info("No benchmarking data to show")
+ self.logger.info("No benchmarking data to show", extra=self.extra_logger)
  return
  df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
  df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
- self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
+ self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string(), extra=self.extra_logger)
+
+ # import datetime
+ # import threading
+ # import time
+ # import random
+ # from concurrent.futures import ThreadPoolExecutor, as_completed
+ # from typing import Type, Any, Dict, Optional, Union, List, ClassVar
+ #
+ # import dask.dataframe as dd
+ # import pandas as pd
+ # from tqdm import tqdm
+ #
+ # from . import ManagedResource
+ # from .parquet_saver import ParquetSaver
+ #
+ #
+ # class DataWrapper(ManagedResource):
+ # DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
+ # "overwrite": 1,
+ # "missing_in_history": 2,
+ # "existing_but_stale": 3,
+ # "missing_outside_history": 4,
+ # "file_is_recent": 0,
+ # }
+ # DEFAULT_MAX_AGE_MINUTES: int = 1440
+ # DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
+ #
+ # def __init__(
+ # self,
+ # dataclass: Type,
+ # date_field: str,
+ # data_path: str,
+ # parquet_filename: str,
+ # class_params: Optional[Dict] = None,
+ # load_params: Optional[Dict] = None,
+ # show_progress: bool = False,
+ # timeout: float = 30,
+ # max_threads: int = 3,
+ # **kwargs: Any,
+ # ):
+ # super().__init__(**kwargs)
+ # self.dataclass = dataclass
+ # self.date_field = date_field
+ # self.data_path = self._ensure_forward_slash(data_path)
+ # self.parquet_filename = parquet_filename
+ # if self.fs is None:
+ # raise ValueError("DataWrapper requires a File system (fs) to be provided.")
+ # self.show_progress = show_progress
+ # self.timeout = timeout
+ # self.max_threads = max_threads
+ # self.class_params = class_params or {
+ # "debug": self.debug,
+ # "logger": self.logger,
+ # "fs": self.fs,
+ # "verbose": self.verbose,
+ # }
+ # self.load_params = load_params or {}
+ #
+ # self._lock = threading.Lock()
+ # self.processed_dates: List[datetime.date] = []
+ # self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
+ # self.mmanifest = kwargs.get("mmanifest", None)
+ # self.update_planner = kwargs.get("update_planner", None)
+ #
+ # def __exit__(self, exc_type, exc_val, exc_tb):
+ # if self.mmanifest:
+ # self.mmanifest.save()
+ # super().__exit__(exc_type, exc_val, exc_tb)
+ # return False
+ #
+ # @staticmethod
+ # def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
+ # if isinstance(date, datetime.date):
+ # return date
+ # try:
+ # return pd.to_datetime(date).date()
+ # except ValueError as e:
+ # raise ValueError(f"Error converting {date} to datetime: {e}")
+ #
+ # @staticmethod
+ # def _ensure_forward_slash(path: str) -> str:
+ # return path.rstrip("/") + "/"
+ #
+ # def process(
+ # self,
+ # max_retries: int = 3,
+ # backoff_base: float = 2.0,
+ # backoff_jitter: float = 0.1,
+ # backoff_max: float = 60.0,
+ # ):
+ # """
+ # Execute the update plan with concurrency, retries and exponential backoff.
+ #
+ # Args:
+ # max_retries: attempts per date.
+ # backoff_base: base for exponential backoff (delay = base**attempt).
+ # backoff_jitter: multiplicative jitter factor in [0, backoff_jitter].
+ # backoff_max: maximum backoff seconds per attempt (before jitter).
+ # """
+ # overall_start = time.perf_counter()
+ # tasks = list(self.update_planner.get_tasks_by_priority())
+ # if not tasks:
+ # self.logger.info("No updates required based on the current plan.")
+ # return
+ #
+ # if self.update_planner.show_progress:
+ # self.update_planner.show_update_plan()
+ #
+ # for priority, dates in tasks:
+ # self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
+ #
+ # total_time = time.perf_counter() - overall_start
+ # if self.processed_dates:
+ # count = len(self.processed_dates)
+ # self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
+ # if self.update_planner.show_progress:
+ # self.show_benchmark_summary()
+ #
+ # def _execute_task_batch(
+ # self,
+ # priority: int,
+ # dates: List[datetime.date],
+ # max_retries: int,
+ # backoff_base: float,
+ # backoff_jitter: float,
+ # backoff_max: float,
+ # ):
+ # desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
+ # max_thr = min(len(dates), self.max_threads)
+ # self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
+ #
+ # with ThreadPoolExecutor(max_workers=max_thr) as executor:
+ # futures = {
+ # executor.submit(
+ # self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
+ # ): date
+ # for date in dates
+ # }
+ # iterator = as_completed(futures)
+ # if self.show_progress:
+ # iterator = tqdm(iterator, total=len(futures), desc=desc)
+ #
+ # for future in iterator:
+ # try:
+ # future.result(timeout=self.timeout)
+ # except Exception as e:
+ # self.logger.error(f"Permanent failure for {futures[future]}: {e}")
+ #
+ # def _process_date_with_retry(
+ # self,
+ # date: datetime.date,
+ # max_retries: int,
+ # backoff_base: float,
+ # backoff_jitter: float,
+ # backoff_max: float,
+ # ):
+ # for attempt in range(max_retries):
+ # try:
+ # self._process_single_date(date)
+ # return
+ # except Exception as e:
+ # if attempt < max_retries - 1:
+ # base_delay = min(backoff_base ** attempt, backoff_max)
+ # delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
+ # self.logger.warning(
+ # f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)"
+ # )
+ # time.sleep(delay)
+ # else:
+ # self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
+ #
+ # def _process_single_date(self, date: datetime.date):
+ # path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
+ # self.logger.debug(f"Processing date {date.isoformat()} for {path}")
+ # if path in self.update_planner.skipped and self.update_planner.ignore_missing:
+ # self.logger.debug(f"Skipping {date} as it exists in the skipped list")
+ # return
+ # full_path = f"{path}{self.parquet_filename}"
+ #
+ # overall_start = time.perf_counter()
+ # try:
+ # load_start = time.perf_counter()
+ # date_filter = {f"{self.date_field}__date": {date.isoformat()}}
+ # self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
+ #
+ # local_load_params = self.load_params.copy()
+ # local_load_params.update(date_filter)
+ #
+ # with self.dataclass(**self.class_params) as local_class_instance:
+ # df = local_class_instance.load(**local_load_params) # expected to be Dask
+ # load_time = time.perf_counter() - load_start
+ #
+ # if hasattr(local_class_instance, "total_records"):
+ # total_records = int(local_class_instance.total_records)
+ # self.logger.debug(f"Total records loaded: {total_records}")
+ #
+ # if total_records == 0:
+ # if self.mmanifest:
+ # self.mmanifest.record(full_path=path)
+ # self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
+ # return
+ #
+ # if total_records < 0:
+ # self.logger.warning(f"Negative record count ({total_records}) for {full_path}.")
+ # return
+ #
+ # save_start = time.perf_counter()
+ # parquet_params = {
+ # "df_result": df,
+ # "parquet_storage_path": path,
+ # "fs": self.fs,
+ # "logger": self.logger,
+ # "debug": self.debug,
+ # }
+ # with ParquetSaver(**parquet_params) as ps:
+ # ps.save_to_parquet(self.parquet_filename, overwrite=True)
+ # save_time = time.perf_counter() - save_start
+ #
+ # total_time = time.perf_counter() - overall_start
+ # self.benchmarks[date] = {
+ # "load_duration": load_time,
+ # "save_duration": save_time,
+ # "total_duration": total_time,
+ # }
+ # self._log_success(date, total_time, full_path)
+ #
+ # except Exception as e:
+ # self._log_failure(date, e)
+ # raise
+ #
+ # def _log_success(self, date: datetime.date, duration: float, path: str):
+ # self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}")
+ # self.processed_dates.append(date)
+ #
+ # def _log_failure(self, date: datetime.date, error: Exception):
+ # self.logger.error(f"Failed processing {date}: {error}")
+ #
+ # def show_benchmark_summary(self):
+ # if not self.benchmarks:
+ # self.logger.info("No benchmarking data to show")
+ # return
+ # df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
+ # df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
+ # self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
+ #
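
Taken together, the class is meant to be driven as a context manager so that __exit__ saves the missing-file manifest even when processing aborts. A hedged end-to-end sketch reusing the hypothetical objects from the earlier snippets (fs, MyDataset) plus caller-supplied planner and manifest objects whose exact types are not shown in this diff:

with DataWrapper(
    dataclass=MyDataset,
    date_field="created_at",
    data_path="/data/events/",
    parquet_filename="events.parquet",
    fs=fs,
    update_planner=planner,   # expected to expose get_tasks_by_priority(), show_progress, skipped, ignore_missing, reverse_order
    mmanifest=manifest,       # its save() is called automatically on context exit
) as wrapper:
    wrapper.process(max_retries=3)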