sibi-dst 2025.9.9__py3-none-any.whl → 2025.9.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,19 +1,22 @@
+ from __future__ import annotations
+
  import datetime as dt
  from typing import Any, Dict, Iterable, Optional
- from sibi_dst.utils import Logger
+
+ import dask.dataframe as dd
  import numpy as np
  import pandas as pd
- import dask.dataframe as dd
+
+ from sibi_dst.utils import Logger
 
 
  # ---------------- Vectorized helpers (used by Dask map_partitions) ----------------
 
  def _to_np_days(series: pd.Series) -> np.ndarray:
  """Coerce to numpy datetime64[D] with NaT-safe conversion."""
- # Use pandas for robust parsing, then cast to date-days
  s = pd.to_datetime(series, errors="coerce")
- # Convert to numpy datetime64[D] (day precision)
- return s.values.astype("datetime64[D]")
+ # Return day precision array directly
+ return s.dt.floor("D").to_numpy(dtype="datetime64[D]")
 
 
  def _vectorized_busday_count(
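
For context on the `_to_np_days` change above: both the old `.values.astype("datetime64[D]")` path and the new `.dt.floor("D").to_numpy(dtype="datetime64[D]")` path yield a day-precision array with NaT preserved; the new form stays inside the pandas datetime accessor instead of casting the raw `.values` buffer. A minimal standalone sketch with made-up values:

    import numpy as np
    import pandas as pd

    s = pd.Series(["2025-09-10 13:45", None, "2025-09-12"])
    parsed = pd.to_datetime(s, errors="coerce")  # datetime64[ns], NaT for the None

    old_days = parsed.values.astype("datetime64[D]")                 # 2025.9.9 path
    new_days = parsed.dt.floor("D").to_numpy(dtype="datetime64[D]")  # 2025.9.11 path

    assert (pd.isna(old_days) == pd.isna(new_days)).all()
    print(new_days)  # ['2025-09-10' 'NaT' '2025-09-12']
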
@@ -24,8 +27,8 @@ def _vectorized_busday_count(
  weekmask: Optional[str],
  inclusive: bool,
  ) -> pd.Series:
- start = _to_np_days(part[begin_col]) # numpy datetime64[D]
- end = _to_np_days(part[end_col]) # numpy datetime64[D]
+ start = _to_np_days(part[begin_col])
+ end = _to_np_days(part[end_col])
 
  kwargs: Dict[str, Any] = {}
  if holidays:
@@ -38,7 +41,7 @@ def _vectorized_busday_count(
  with np.errstate(invalid="ignore"):
  end_adj = end + np.timedelta64(1, "D")
 
- valid = (~pd.isna(start)) & (~pd.isna(end)) # numpy bool mask
+ valid = (~pd.isna(start)) & (~pd.isna(end))
  result = np.full(part.shape[0], np.nan, dtype="float64")
  if valid.any():
  counts = np.busday_count(
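
The `end_adj = end + np.timedelta64(1, "D")` line exists because `np.busday_count` treats the end date as exclusive. A small sketch with made-up dates and a made-up holiday showing the exclusive count, the inclusive adjustment, and the effect of `holidays`/`weekmask`:

    import numpy as np

    start = np.array(["2025-09-01"], dtype="datetime64[D]")  # a Monday
    end = np.array(["2025-09-05"], dtype="datetime64[D]")    # the same week's Friday

    exclusive = np.busday_count(start, end)                           # Mon..Thu -> [4]
    inclusive = np.busday_count(start, end + np.timedelta64(1, "D"))  # Mon..Fri -> [5]
    with_holiday = np.busday_count(
        start,
        end + np.timedelta64(1, "D"),
        holidays=["2025-09-03"],  # skip the Wednesday
        weekmask="1111100",       # Mon-Fri working week
    )                             # -> [4]
    print(exclusive, inclusive, with_holiday)
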
@@ -59,8 +62,8 @@ def _vectorized_busday_offset(
  weekmask: Optional[str],
  roll: str,
  ) -> pd.Series:
- start = _to_np_days(part[start_col]) # numpy datetime64[D]
- n_days = pd.to_numeric(part[n_days_col], errors="coerce").to_numpy() # numpy float -> cast later
+ start = _to_np_days(part[start_col])
+ n_days = pd.to_numeric(part[n_days_col], errors="coerce").to_numpy()
 
  kwargs: Dict[str, Any] = {"roll": roll}
  if holidays:
@@ -68,7 +71,7 @@ def _vectorized_busday_offset(
  if weekmask:
  kwargs["weekmask"] = weekmask
 
- valid = (~pd.isna(start)) & (~pd.isna(n_days)) # numpy bool mask
+ valid = (~pd.isna(start)) & (~pd.isna(n_days))
  out = np.full(part.shape[0], np.datetime64("NaT", "ns"), dtype="datetime64[ns]")
  if valid.any():
  offs = np.busday_offset(
@@ -86,26 +89,6 @@ def _vectorized_busday_offset(
  class BusinessDays:
  """
  Business day calculations with custom holidays and optional weekmask.
-
- Features
- - Scalar helpers:
- - get_business_days_count(begin, end, inclusive=False) -> int
- - add_business_days(start_date, n_days, roll='forward') -> np.datetime64
- - Dask DataFrame helpers (vectorized via map_partitions):
- - calc_business_days_from_df(df, begin_col, end_col, result_col='business_days', inclusive=False)
- - calc_sla_end_date(df, start_date_col, n_days_col, result_col='sla_end_date', roll='forward')
-
- Parameters
- ----------
- holiday_list : dict[str, list[str]] | Iterable[str]
- Either a mapping of year -> [YYYY-MM-DD, ...] or a flat iterable of YYYY-MM-DD strings.
- logger : Any
- Logger with .debug/.info/.warning/.error.
- weekmask : str | None
- A numpy business day weekmask like '1111100' (Mon–Fri). None means default Mon–Fri.
- Examples:
- '1111100' -> Mon-Fri
- '1111110' -> Mon-Sat
  """
 
  def __init__(
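
The removed docstring is still the best summary of the public surface: scalar helpers `get_business_days_count(...)` and `add_business_days(...)`, plus the Dask helpers `calc_business_days_from_df(...)` and `calc_sla_end_date(...)`. A hypothetical usage sketch based on that docstring; the keyword names follow the documented parameters and may not match the constructor exactly, and the import path is not shown in this diff:

    # Import of BusinessDays omitted: its module path is not visible in this diff.
    holidays = {"2025": ["2025-09-03", "2025-12-25"]}  # year -> list of YYYY-MM-DD strings

    bd = BusinessDays(holiday_list=holidays, logger=None, weekmask="1111100")  # Mon-Fri

    bd.get_business_days_count("2025-09-01", "2025-09-05", inclusive=True)  # 4 (Wed 2025-09-03 is a holiday)
    bd.add_business_days("2025-09-01", 3)  # numpy.datetime64('2025-09-05'), the holiday is skipped
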
@@ -119,12 +102,11 @@ class BusinessDays:
  self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
  self.weekmask = weekmask
 
- # Normalize holidays to a flat, sorted tuple of 'YYYY-MM-DD'
  if isinstance(holiday_list, dict):
  flat = [d for _, days in sorted(holiday_list.items()) for d in days]
  else:
  flat = list(holiday_list)
- # Deduplicate while preserving order
+
  seen = set()
  flat_unique = []
  for d in flat:
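
The constructor accepts either a year-keyed mapping or a flat iterable of date strings, flattens it, and de-duplicates while preserving order. The loop body after `for d in flat:` lies outside this hunk, so the sketch below assumes the usual seen-set pattern implied by `seen` and `flat_unique`:

    holiday_list = {"2024": ["2024-12-25"], "2025": ["2025-01-01", "2024-12-25"]}

    if isinstance(holiday_list, dict):
        flat = [d for _, days in sorted(holiday_list.items()) for d in days]
    else:
        flat = list(holiday_list)

    seen = set()
    flat_unique = []
    for d in flat:
        if d not in seen:  # keep only the first occurrence, preserving order
            seen.add(d)
            flat_unique.append(d)

    print(flat_unique)  # ['2024-12-25', '2025-01-01']
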
@@ -142,7 +124,6 @@ class BusinessDays:
  *,
  inclusive: bool = False,
  ) -> int:
- """Business days between two dates. If inclusive=True, include the end date."""
  b = pd.to_datetime(begin_date).date()
  e = pd.to_datetime(end_date).date()
 
@@ -153,11 +134,11 @@ class BusinessDays:
  kwargs["weekmask"] = self.weekmask
 
  if inclusive:
- e_np = np.datetime64(e) + np.timedelta64(1, "D")
+ e_np = np.datetime64(e, "D") + np.timedelta64(1, "D")
  else:
- e_np = np.datetime64(e)
+ e_np = np.datetime64(e, "D")
 
- val = int(np.busday_count(np.datetime64(b), e_np, **kwargs))
+ val = int(np.busday_count(np.datetime64(b, "D"), e_np, **kwargs))
  return val
 
  def add_business_days(
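
The only change in the scalar path is the explicit `"D"` unit passed to `np.datetime64`. For a `datetime.date` the inferred unit is already days, so behaviour should be unchanged; a quick standalone check:

    import datetime as dt
    import numpy as np

    d = dt.date(2025, 9, 1)
    print(np.datetime64(d).dtype)       # datetime64[D], unit inferred from the date object
    print(np.datetime64(d, "D").dtype)  # datetime64[D], unit stated explicitly

    # busday_count operates on day-resolution dates either way:
    print(int(np.busday_count(np.datetime64(d, "D"), np.datetime64("2025-09-06", "D"))))  # 5
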
@@ -167,11 +148,6 @@ class BusinessDays:
  *,
  roll: str = "forward",
  ) -> np.datetime64:
- """
- Add (or subtract) business days to a date. Returns numpy datetime64[D].
- roll: {'forward','backward','following','preceding','modifiedfollowing',
- 'modifiedpreceding','nat'}
- """
  s = pd.to_datetime(start_date).date()
  kwargs: Dict[str, Any] = {"roll": roll}
  if self.holidays:
@@ -179,7 +155,7 @@ class BusinessDays:
  if self.weekmask:
  kwargs["weekmask"] = self.weekmask
 
- return np.busday_offset(np.datetime64(s), int(n_days), **kwargs)
+ return np.busday_offset(np.datetime64(s, "D"), int(n_days), **kwargs)
 
  # -------- Dask API --------
 
@@ -192,10 +168,6 @@ class BusinessDays:
  *,
  inclusive: bool = False,
  ) -> dd.DataFrame:
- """
- Vectorized business-day difference between two date columns.
- Produces float64 (NaN where either side is missing).
- """
  missing = {begin_date_col, end_date_col} - set(df.columns)
  if missing:
  self.logger.error(f"Missing columns: {missing}")
@@ -224,10 +196,6 @@ class BusinessDays:
  *,
  roll: str = "forward",
  ) -> dd.DataFrame:
- """
- Vectorized business-day offset for SLA end date.
- Produces datetime64[ns] with NaT where invalid.
- """
  missing = {start_date_col, n_days_col} - set(df.columns)
  if missing:
  self.logger.error(f"Missing columns: {missing}")
@@ -224,7 +224,7 @@ class ClickHouseWriter(ManagedResource):
  def _default_engine_sql(self) -> str:
  # minimal MergeTree clause; quote order_by safely
  ob = self.order_by if self.order_by.startswith("(") else f"(`{self.order_by}`)"
- return f"ENGINE = MergeTree ORDER BY {ob}"
+ return f"ENGINE = MergeTree ORDER BY {ob} SETTINGS allow_nullable_key = 1"
 
  # ------------- partition write -------------
 
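ClickHouse refuses a MergeTree sorting key that contains Nullable columns unless `allow_nullable_key` is enabled, which is what the appended `SETTINGS` clause addresses. A sketch of the clause the method now produces, mirrored as a standalone function with a hypothetical `order_by` value:

    def default_engine_sql(order_by: str) -> str:
        # mirrors ClickHouseWriter._default_engine_sql in 2025.9.11
        ob = order_by if order_by.startswith("(") else f"(`{order_by}`)"
        return f"ENGINE = MergeTree ORDER BY {ob} SETTINGS allow_nullable_key = 1"

    print(default_engine_sql("id"))
    # ENGINE = MergeTree ORDER BY (`id`) SETTINGS allow_nullable_key = 1
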
@@ -37,7 +37,6 @@ class DataWrapper(ManagedResource):
  dataclass: Type,
  date_field: str,
  data_path: str,
- parquet_filename: str,
  class_params: Optional[Dict] = None,
  load_params: Optional[Dict] = None,
  show_progress: bool = False,
@@ -50,7 +49,7 @@ class DataWrapper(ManagedResource):
  self.dataclass: Type = dataclass
  self.date_field: str = date_field
  self.data_path: str = self._ensure_forward_slash(data_path)
- self.parquet_filename: str = parquet_filename
+ self.partition_on_date: bool = True # Assume Hive-style date partitioning by default
 
  if self.fs is None:
  raise ValueError("DataWrapper requires a File system (fs) to be provided.")
@@ -282,16 +281,23 @@ class DataWrapper(ManagedResource):
  def _process_single_date(self, date: datetime.date):
  """Process a single date: load, save to Parquet."""
  # --- 1. Setup paths and logging ---
- path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
- log_extra = self._log_extra(date_context=date.isoformat())
- self.logger.debug(f"Processing date {date.isoformat()} for {path}", extra=log_extra)
-
+ path = self.data_path.rstrip("/")+"/"
+ if not self.partition_on_date:
+ # not a Hive-style partitioned path
+ path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
+ log_extra = self._log_extra(date_context=date.isoformat())
+ self.logger.debug(f"Processing date {date.isoformat()} for legacy {path}", extra=log_extra)
+ else :
+ # Hive-style partitioned path
+ log_extra = self._log_extra(date_context=date.isoformat(), partition_on=self.date_field)
+ self.logger.debug(f"Processing date {date.isoformat()} for partitioned {self.data_path} with hive-style partitions", extra=log_extra)
  # --- 2. Check if date/path should be skipped ---
  if (self.update_planner and path in self.update_planner.skipped and
  getattr(self.update_planner, 'ignore_missing', False)):
  self.logger.debug(f"Skipping {date} as it exists in the skipped list", extra=log_extra)
  return
- full_path = f"{path}{self.parquet_filename}"
+
+ self.logger.debug(f"Processing date {date.isoformat()} for {path}", extra=log_extra)
 
  # --- 3. Timing ---
  overall_start = time.perf_counter()
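
With `partition_on_date` left at its new default of True, every date is written under the dataset root and the date itself becomes a Hive-style `partition_date=...` directory during the save step shown in the next hunk; setting it to False keeps the old `YYYY/MM/DD/` layout. A sketch of the two layouts for a hypothetical `data_path`:

    import datetime

    data_path = "s3://bucket/dataset/"  # hypothetical root
    date = datetime.date(2025, 9, 10)

    legacy_path = f"{data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
    hive_partition = f"{data_path.rstrip('/')}/partition_date={date.isoformat()}/"

    print(legacy_path)     # s3://bucket/dataset/2025/09/10/
    print(hive_partition)  # s3://bucket/dataset/partition_date=2025-09-10/  (written by the partitioned save)
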
@@ -326,39 +332,44 @@ class DataWrapper(ManagedResource):
  self.mmanifest.record(full_path=path)
  except Exception as e:
  self.logger.error(f"Failed to record missing path {path}: {e}", extra=log_extra)
- self.logger.info(f"No data found for {full_path}. Logged to missing manifest.", extra=log_extra)
+ self.logger.info(f"No data found for {path}. Logged to missing manifest.", extra=log_extra)
  return # Done for this date
 
  if total_records < 0:
- self.logger.warning(f"Negative record count ({total_records}) for {full_path}. Proceeding.", extra=log_extra)
+ self.logger.warning(f"Negative record count ({total_records}) for {path}. Proceeding.", extra=log_extra)
  # Continue processing even with negative count
 
- # --- 6. Save to Parquet ---
- save_start = time.perf_counter()
- parquet_params = {
- "df_result": df,
- "parquet_storage_path": path,
- "fs": self.fs,
- "logger": self.logger,
- "debug": self.debug,
- "verbose": self.verbose,
- }
- self.logger.debug(f"{self.dataclass.__name__} saving to parquet started...", extra=log_extra)
- with ParquetSaver(**parquet_params) as ps:
- ps.save_to_parquet(self.parquet_filename, overwrite=True)
- save_time = time.perf_counter() - save_start
- self.logger.debug(f"Parquet saving for {date} completed in {save_time:.2f}s", extra=log_extra)
-
- # --- 7. Benchmarking ---
- total_time = time.perf_counter() - overall_start
- self.benchmarks[date] = {
- "load_duration": load_time,
- "save_duration": save_time,
- "total_duration": total_time,
- }
-
- # --- 8. Log Success ---
- self._log_success(date, total_time, full_path)
+ # --- 6. Save to Parquet ---
+ save_start = time.perf_counter()
+
+
+ parquet_params = {
+ "df_result": df,
+ "parquet_storage_path": path,
+ "fs": self.fs,
+ "logger": self.logger,
+ "debug": self.debug,
+ "verbose": self.verbose,
+ }
+ if self.partition_on_date:
+ df["partition_date"] = df[self.date_field].dt.date.astype(str)
+ parquet_params["partition_on"] = ["partition_date"]
+ self.logger.debug(f"{self.dataclass.__name__} saving to parquet started...", extra=log_extra)
+ with ParquetSaver(**parquet_params) as ps:
+ ps.save_to_parquet()
+ save_time = time.perf_counter() - save_start
+ self.logger.debug(f"Parquet saving for {date} completed in {save_time:.2f}s", extra=log_extra)
+
+ # --- 7. Benchmarking ---
+ total_time = time.perf_counter() - overall_start
+ self.benchmarks[date] = {
+ "load_duration": load_time,
+ "save_duration": save_time,
+ "total_duration": total_time,
+ }
+
+ # --- 8. Log Success ---
+ self._log_success(date, total_time, path)
 
  except Exception as e:
  # --- 9. Handle Errors ---
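
The new save path adds a string `partition_date` column and passes `partition_on=["partition_date"]` to ParquetSaver. Assuming ParquetSaver forwards that option to Dask's `to_parquet` (its internals are not part of this diff), the write is equivalent in spirit to this standalone sketch:

    import dask.dataframe as dd
    import pandas as pd

    pdf = pd.DataFrame({
        "order_id": [1, 2, 3],
        "created_at": pd.to_datetime(["2025-09-10 08:00", "2025-09-10 17:30", "2025-09-11 09:15"]),
    })
    df = dd.from_pandas(pdf, npartitions=1)

    # Same transformation as above (here date_field == "created_at"):
    df["partition_date"] = df["created_at"].dt.date.astype(str)

    # Hive-style output: one partition_date=YYYY-MM-DD directory per day under the root.
    df.to_parquet("out/dataset/", partition_on=["partition_date"], write_index=False)
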
@@ -397,280 +408,3 @@ class DataWrapper(ManagedResource):
  except Exception as e:
  self.logger.error(f"Error generating benchmark summary: {e}", extra=self.logger_extra)
 
- # import datetime
- # import random
- # import threading
- # import time
- # from concurrent.futures import ThreadPoolExecutor, as_completed
- # from typing import Type, Any, Dict, Optional, Union, List, ClassVar
- #
- # import pandas as pd
- # from tqdm import tqdm
- #
- # from . import ManagedResource
- # from .parquet_saver import ParquetSaver
- #
- #
- # class DataWrapper(ManagedResource):
- # DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
- # "overwrite": 1,
- # "missing_in_history": 2,
- # "existing_but_stale": 3,
- # "missing_outside_history": 4,
- # "file_is_recent": 0,
- # }
- # DEFAULT_MAX_AGE_MINUTES: int = 1440
- # DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
- #
- # logger_extra = {"sibi_dst_component": __name__}
- #
- # def __init__(
- # self,
- # dataclass: Type,
- # date_field: str,
- # data_path: str,
- # parquet_filename: str,
- # class_params: Optional[Dict] = None,
- # load_params: Optional[Dict] = None,
- # show_progress: bool = False,
- # timeout: float = 30,
- # max_threads: int = 3,
- # **kwargs: Any,
- # ):
- # super().__init__(**kwargs)
- # self.dataclass = dataclass
- # self.date_field = date_field
- # self.data_path = self._ensure_forward_slash(data_path)
- # self.parquet_filename = parquet_filename
- # if self.fs is None:
- # raise ValueError("DataWrapper requires a File system (fs) to be provided.")
- # self.show_progress = show_progress
- # self.timeout = timeout
- # self.max_threads = max_threads
- # self.class_params = class_params or {
- # "debug": self.debug,
- # "logger": self.logger,
- # "fs": self.fs,
- # "verbose": self.verbose,
- # }
- # self.load_params = load_params or {}
- #
- # self._lock = threading.Lock()
- # self.processed_dates: List[datetime.date] = []
- # self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
- # self.mmanifest = kwargs.get("mmanifest", None)
- # self.update_planner = kwargs.get("update_planner", None)
- #
- # # --- NEW: stop gate tripped during cleanup/interrupt to block further scheduling/retries
- # self._stop_event = threading.Event()
- # self.logger_extra.update({"action_module_name": "data_wrapper", "dataclass": self.dataclass.__name__})
- #
- # # ensure manifest is saved on context exit
- # def __exit__(self, exc_type, exc_val, exc_tb):
- # if self.mmanifest:
- # self.mmanifest.save()
- # super().__exit__(exc_type, exc_val, exc_tb)
- # return False
- #
- # # --- NEW: trip stop gate during class-specific cleanup (close/aclose/finalizer path)
- # def _cleanup(self) -> None:
- # self._stop_event.set()
- #
- # @staticmethod
- # def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
- # if isinstance(date, datetime.date):
- # return date
- # try:
- # return pd.to_datetime(date).date()
- # except ValueError as e:
- # raise ValueError(f"Error converting {date} to datetime: {e}")
- #
- # @staticmethod
- # def _ensure_forward_slash(path: str) -> str:
- # return path.rstrip("/") + "/"
- #
- # def process(
- # self,
- # max_retries: int = 3,
- # backoff_base: float = 2.0,
- # backoff_jitter: float = 0.1,
- # backoff_max: float = 60.0,
- # ):
- # """
- # Execute the update plan with concurrency, retries and exponential backoff.
- # Stops scheduling immediately if closed or interrupted (Ctrl-C).
- # """
- # overall_start = time.perf_counter()
- # tasks = list(self.update_planner.get_tasks_by_priority())
- # if not tasks:
- # self.logger.info("No updates required based on the current plan.")
- # return
- #
- # if self.update_planner.show_progress:
- # self.update_planner.show_update_plan()
- #
- # try:
- # for priority, dates in tasks:
- # if self._stop_event.is_set():
- # break
- # self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
- # except KeyboardInterrupt:
- # self.logger.warning("KeyboardInterrupt received — stopping scheduling and shutting down.", extra=self.logger_extra)
- # self._stop_event.set()
- # raise
- # finally:
- # total_time = time.perf_counter() - overall_start
- # if self.processed_dates:
- # count = len(self.processed_dates)
- # self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)", extra=self.logger_extra)
- # if self.update_planner.show_progress:
- # self.show_benchmark_summary()
- #
- # def _execute_task_batch(
- # self,
- # priority: int,
- # dates: List[datetime.date],
- # max_retries: int,
- # backoff_base: float,
- # backoff_jitter: float,
- # backoff_max: float,
- # ):
- # desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
- # max_thr = min(len(dates), self.max_threads)
- # self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.", extra=self.logger_extra)
- #
- # # Use explicit try/finally so we can request cancel of queued tasks on teardown
- # executor = ThreadPoolExecutor(max_workers=max_thr, thread_name_prefix="datawrapper")
- # try:
- # futures = {}
- # for date in dates:
- # if self._stop_event.is_set():
- # break
- # try:
- # fut = executor.submit(
- # self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
- # )
- # futures[fut] = date
- # except RuntimeError as e:
- # # tolerate race: executor shutting down
- # if "cannot schedule new futures after shutdown" in str(e).lower():
- # self.logger.warning("Executor is shutting down; halting new submissions for this batch.", extra=self.logger_extra)
- # break
- # raise
- #
- # iterator = as_completed(futures)
- # if self.show_progress:
- # iterator = tqdm(iterator, total=len(futures), desc=desc)
- #
- # for future in iterator:
- # try:
- # future.result(timeout=self.timeout)
- # except Exception as e:
- # self.logger.error(f"Permanent failure for {futures[future]}: {e}", extra=self.logger_extra)
- # finally:
- # # Python 3.9+: cancel_futures prevents queued tasks from starting
- # executor.shutdown(wait=True, cancel_futures=True)
- #
- # def _process_date_with_retry(
- # self,
- # date: datetime.date,
- # max_retries: int,
- # backoff_base: float,
- # backoff_jitter: float,
- # backoff_max: float,
- # ):
- # for attempt in range(max_retries):
- # # --- NEW: bail out quickly if shutdown/interrupt began
- # if self._stop_event.is_set():
- # raise RuntimeError("shutting_down")
- #
- # try:
- # self._process_single_date(date)
- # return
- # except Exception as e:
- # if attempt < max_retries - 1 and not self._stop_event.is_set():
- # base_delay = min(backoff_base ** attempt, backoff_max)
- # delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
- # self.logger.warning(
- # f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)",
- # extra=self.logger_extra
- # )
- # time.sleep(delay)
- # else:
- # self.logger.error(f"Failed processing {date} after {max_retries} attempts.", extra=self.logger_extra)
- # raise
- #
- # def _process_single_date(self, date: datetime.date):
- # path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
- # self.logger.debug(f"Processing date {date.isoformat()} for {path}", extra=self.logger_extra)
- # if path in self.update_planner.skipped and self.update_planner.ignore_missing:
- # self.logger.debug(f"Skipping {date} as it exists in the skipped list", extra=self.logger_extra)
- # return
- # full_path = f"{path}{self.parquet_filename}"
- #
- # overall_start = time.perf_counter()
- # try:
- # load_start = time.perf_counter()
- # date_filter = {f"{self.date_field}__date": {date.isoformat()}}
- # self.logger.debug(f"{self.dataclass.__name__} is loading data for {date} with filter: {date_filter}", extra=self.logger_extra)
- #
- # local_load_params = self.load_params.copy()
- # local_load_params.update(date_filter)
- #
- # with self.dataclass(**self.class_params) as local_class_instance:
- # df = local_class_instance.load(**local_load_params) # expected to be Dask
- # load_time = time.perf_counter() - load_start
- #
- # if hasattr(local_class_instance, "total_records"):
- # total_records = int(local_class_instance.total_records)
- # self.logger.debug(f"Total records loaded: {total_records}", extra=self.logger_extra)
- #
- # if total_records == 0:
- # if self.mmanifest:
- # self.mmanifest.record(full_path=path)
- # self.logger.info(f"No data found for {full_path}. Logged to missing manifest.", extra=self.logger_extra)
- # return
- #
- # if total_records < 0:
- # self.logger.warning(f"Negative record count ({total_records}) for {full_path}.", extra=self.logger_extra)
- # return
- #
- # save_start = time.perf_counter()
- # parquet_params = {
- # "df_result": df,
- # "parquet_storage_path": path,
- # "fs": self.fs,
- # "logger": self.logger,
- # "debug": self.debug,
- # }
- # with ParquetSaver(**parquet_params) as ps:
- # ps.save_to_parquet(self.parquet_filename, overwrite=True)
- # save_time = time.perf_counter() - save_start
- #
- # total_time = time.perf_counter() - overall_start
- # self.benchmarks[date] = {
- # "load_duration": load_time,
- # "save_duration": save_time,
- # "total_duration": total_time,
- # }
- # self._log_success(date, total_time, full_path)
- #
- # except Exception as e:
- # self._log_failure(date, e)
- # raise
- #
- # def _log_success(self, date: datetime.date, duration: float, path: str):
- # self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}", extra=self.logger_extra)
- # self.processed_dates.append(date)
- #
- # def _log_failure(self, date: datetime.date, error: Exception):
- # self.logger.error(f"Failed processing {date}: {error}", extra=self.logger_extra)
- #
- # def show_benchmark_summary(self):
- # if not self.benchmarks:
- # self.logger.info("No benchmarking data to show", extra=self.logger_extra)
- # return
- # df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
- # df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
- # self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string(), extra=self.logger_extra)
- #