sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/__init__.py +3 -2
- sibi_dst/df_helper/_artifact_updater_async.py +238 -0
- sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
- sibi_dst/df_helper/_df_helper.py +418 -118
- sibi_dst/df_helper/_parquet_artifact.py +275 -283
- sibi_dst/df_helper/_parquet_reader.py +9 -10
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/route_path_builder.py +45 -46
- sibi_dst/utils/__init__.py +2 -0
- sibi_dst/utils/base.py +235 -100
- sibi_dst/utils/business_days.py +248 -0
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +392 -88
- sibi_dst/utils/date_utils.py +711 -393
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_age_checker.py +301 -0
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/periods.py +42 -0
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
sibi_dst/utils/data_wrapper.py
CHANGED
```diff
@@ -1,4 +1,5 @@
 import datetime
+import random
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -17,23 +18,23 @@ class DataWrapper(ManagedResource):
         "missing_in_history": 2,
         "existing_but_stale": 3,
         "missing_outside_history": 4,
-        "file_is_recent": 0
+        "file_is_recent": 0,
     }
     DEFAULT_MAX_AGE_MINUTES: int = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
 
     def __init__(
-        … (11 removed lines not rendered in the source diff view)
+        self,
+        dataclass: Type,
+        date_field: str,
+        data_path: str,
+        parquet_filename: str,
+        class_params: Optional[Dict] = None,
+        load_params: Optional[Dict] = None,
+        show_progress: bool = False,
+        timeout: float = 30,
+        max_threads: int = 3,
+        **kwargs: Any,
     ):
         super().__init__(**kwargs)
         self.dataclass = dataclass
@@ -41,15 +42,15 @@ class DataWrapper(ManagedResource):
         self.data_path = self._ensure_forward_slash(data_path)
         self.parquet_filename = parquet_filename
         if self.fs is None:
-            raise ValueError("
+            raise ValueError("DataWrapper requires a File system (fs) to be provided.")
         self.show_progress = show_progress
         self.timeout = timeout
         self.max_threads = max_threads
         self.class_params = class_params or {
-            … (4 removed lines not rendered in the source diff view)
+            "debug": self.debug,
+            "logger": self.logger,
+            "fs": self.fs,
+            "verbose": self.verbose,
         }
         self.load_params = load_params or {}
 
@@ -59,13 +60,21 @@ class DataWrapper(ManagedResource):
         self.mmanifest = kwargs.get("mmanifest", None)
         self.update_planner = kwargs.get("update_planner", None)
 
+        # --- NEW: stop gate tripped during cleanup/interrupt to block further scheduling/retries
+        self._stop_event = threading.Event()
+        self.extra_logger = {"action_module_name": "data_wrapper", "dataclass": self.dataclass.__name__}
+
+    # ensure manifest is saved on context exit
     def __exit__(self, exc_type, exc_val, exc_tb):
-        """Context manager exit"""
         if self.mmanifest:
             self.mmanifest.save()
         super().__exit__(exc_type, exc_val, exc_tb)
         return False
 
+    # --- NEW: trip stop gate during class-specific cleanup (close/aclose/finalizer path)
+    def _cleanup(self) -> None:
+        self._stop_event.set()
+
     @staticmethod
     def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
         if isinstance(date, datetime.date):
@@ -77,10 +86,19 @@ class DataWrapper(ManagedResource):
 
     @staticmethod
     def _ensure_forward_slash(path: str) -> str:
-        return path.rstrip(
+        return path.rstrip("/") + "/"
 
-    def process(
-        … (1 removed line not rendered in the source diff view)
+    def process(
+        self,
+        max_retries: int = 3,
+        backoff_base: float = 2.0,
+        backoff_jitter: float = 0.1,
+        backoff_max: float = 60.0,
+    ):
+        """
+        Execute the update plan with concurrency, retries and exponential backoff.
+        Stops scheduling immediately if closed or interrupted (Ctrl-C).
+        """
         overall_start = time.perf_counter()
         tasks = list(self.update_planner.get_tasks_by_priority())
         if not tasks:
@@ -90,24 +108,55 @@ class DataWrapper(ManagedResource):
         if self.update_planner.show_progress:
             self.update_planner.show_update_plan()
 
-        … (9 removed lines not fully rendered in the source diff view)
+        try:
+            for priority, dates in tasks:
+                if self._stop_event.is_set():
+                    break
+                self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
+        except KeyboardInterrupt:
+            self.logger.warning("KeyboardInterrupt received — stopping scheduling and shutting down.")
+            self._stop_event.set()
+            raise
+        finally:
+            total_time = time.perf_counter() - overall_start
+            if self.processed_dates:
+                count = len(self.processed_dates)
+                self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
+                if self.update_planner.show_progress:
+                    self.show_benchmark_summary()
 
-    def _execute_task_batch(
-        … (1 removed line not rendered in the source diff view)
+    def _execute_task_batch(
+        self,
+        priority: int,
+        dates: List[datetime.date],
+        max_retries: int,
+        backoff_base: float,
+        backoff_jitter: float,
+        backoff_max: float,
+    ):
         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
         max_thr = min(len(dates), self.max_threads)
-        self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
+        self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.", extra=self.extra_logger)
+
+        # Use explicit try/finally so we can request cancel of queued tasks on teardown
+        executor = ThreadPoolExecutor(max_workers=max_thr, thread_name_prefix="datawrapper")
+        try:
+            futures = {}
+            for date in dates:
+                if self._stop_event.is_set():
+                    break
+                try:
+                    fut = executor.submit(
+                        self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
+                    )
+                    futures[fut] = date
+                except RuntimeError as e:
+                    # tolerate race: executor shutting down
+                    if "cannot schedule new futures after shutdown" in str(e).lower():
+                        self.logger.warning("Executor is shutting down; halting new submissions for this batch.")
+                        break
+                    raise
 
-        with ThreadPoolExecutor(max_workers=max_thr) as executor:
-            futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
             iterator = as_completed(futures)
             if self.show_progress:
                 iterator = tqdm(iterator, total=len(futures), desc=desc)
@@ -116,24 +165,40 @@ class DataWrapper(ManagedResource):
                 try:
                     future.result(timeout=self.timeout)
                 except Exception as e:
-                    self.logger.error(f"Permanent failure for {futures[future]}: {e}")
+                    self.logger.error(f"Permanent failure for {futures[future]}: {e}", extra=self.extra_logger)
+        finally:
+            # Python 3.9+: cancel_futures prevents queued tasks from starting
+            executor.shutdown(wait=True, cancel_futures=True)
 
-    def _process_date_with_retry(
-        … (1 removed line not rendered in the source diff view)
+    def _process_date_with_retry(
+        self,
+        date: datetime.date,
+        max_retries: int,
+        backoff_base: float,
+        backoff_jitter: float,
+        backoff_max: float,
+    ):
         for attempt in range(max_retries):
+            # --- NEW: bail out quickly if shutdown/interrupt began
+            if self._stop_event.is_set():
+                raise RuntimeError("shutting_down")
+
             try:
                 self._process_single_date(date)
                 return
             except Exception as e:
-                if attempt < max_retries - 1:
-                    … (2 removed lines not rendered in the source diff view)
+                if attempt < max_retries - 1 and not self._stop_event.is_set():
+                    base_delay = min(backoff_base ** attempt, backoff_max)
+                    delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
+                    self.logger.warning(
+                        f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)"
+                    )
+                    time.sleep(delay)
                 else:
-                    self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
-                    … (1 removed line not rendered in the source diff view)
+                    self.logger.error(f"Failed processing {date} after {max_retries} attempts.", extra=self.extra_logger)
+                    raise
 
     def _process_single_date(self, date: datetime.date):
-        """Core date processing logic with load/save timing and thread reporting"""
         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
         self.logger.debug(f"Processing date {date.isoformat()} for {path}")
         if path in self.update_planner.skipped and self.update_planner.ignore_missing:
@@ -141,74 +206,313 @@ class DataWrapper(ManagedResource):
             return
         full_path = f"{path}{self.parquet_filename}"
 
-        # thread_name = threading.current_thread().name
-        # self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
-
         overall_start = time.perf_counter()
         try:
             load_start = time.perf_counter()
             date_filter = {f"{self.date_field}__date": {date.isoformat()}}
             self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
-
-            # Create a copy to avoid mutating the shared instance dictionary
+
             local_load_params = self.load_params.copy()
             local_load_params.update(date_filter)
+
             with self.dataclass(**self.class_params) as local_class_instance:
-                df = local_class_instance.load(**local_load_params)
+                df = local_class_instance.load(**local_load_params)  # expected to be Dask
                 load_time = time.perf_counter() - load_start
 
                 if hasattr(local_class_instance, "total_records"):
-                    … (3 removed lines not rendered in the source diff view)
+                    total_records = int(local_class_instance.total_records)
+                    self.logger.debug(f"Total records loaded: {total_records}")
+
+                    if total_records == 0:
                         if self.mmanifest:
-                            self.mmanifest.record(
-                                full_path=path
-                            )
+                            self.mmanifest.record(full_path=path)
                         self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
-                    … (25 removed lines not rendered in the source diff view)
+                        return
+
+                    if total_records < 0:
+                        self.logger.warning(f"Negative record count ({total_records}) for {full_path}.")
+                        return
+
+                    save_start = time.perf_counter()
+                    parquet_params = {
+                        "df_result": df,
+                        "parquet_storage_path": path,
+                        "fs": self.fs,
+                        "logger": self.logger,
+                        "debug": self.debug,
+                    }
+                    with ParquetSaver(**parquet_params) as ps:
+                        ps.save_to_parquet(self.parquet_filename, overwrite=True)
+                    save_time = time.perf_counter() - save_start
+
+                    total_time = time.perf_counter() - overall_start
+                    self.benchmarks[date] = {
+                        "load_duration": load_time,
+                        "save_duration": save_time,
+                        "total_duration": total_time,
+                    }
+                    self._log_success(date, total_time, full_path)
+
         except Exception as e:
             self._log_failure(date, e)
             raise
 
     def _log_success(self, date: datetime.date, duration: float, path: str):
-        … (1 removed line not rendered in the source diff view)
-        self.logger.info(msg)
+        self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}", extra=self.extra_logger)
        self.processed_dates.append(date)
 
     def _log_failure(self, date: datetime.date, error: Exception):
-        … (1 removed line not rendered in the source diff view)
-        self.logger.error(msg)
+        self.logger.error(f"Failed processing {date}: {error}", extra=self.extra_logger)
 
     def show_benchmark_summary(self):
-        """Display a summary of load/save timings per date"""
         if not self.benchmarks:
-            self.logger.info("No benchmarking data to show")
+            self.logger.info("No benchmarking data to show", extra=self.extra_logger)
             return
         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
-        self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
+        self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string(), extra=self.extra_logger)
+
```

The remaining added lines of the last hunk (new lines 275–518) append a commented-out copy of the previous `DataWrapper` implementation (imports, class body and all methods, each line prefixed with `#`) to the end of the file.
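The retry and shutdown behaviour added above boils down to two pieces: a jittered exponential backoff, `delay = min(backoff_base ** attempt, backoff_max) * (1 + uniform(0, backoff_jitter))`, and a stop gate (`threading.Event`) checked before scheduling and before each retry, combined with `ThreadPoolExecutor.shutdown(wait=True, cancel_futures=True)` so queued-but-unstarted work is cancelled on teardown. Below is a minimal, standalone sketch of that pattern; it is not sibi-dst API, and names such as `process_one`, `process_with_retry`, and `backoff_delay` are illustrative stand-ins for the wrapped per-date work.

```python
import random
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

stop_event = threading.Event()  # tripped on cleanup / Ctrl-C to block new scheduling and retries


def backoff_delay(attempt: int, base: float = 2.0, jitter: float = 0.1, cap: float = 60.0) -> float:
    """Exponential backoff capped at `cap`, with a multiplicative jitter factor in [0, jitter]."""
    return min(base ** attempt, cap) * (1 + random.uniform(0.0, max(0.0, jitter)))


def process_one(item: int) -> int:
    """Stand-in for the per-date unit of work (fails transiently ~30% of the time)."""
    if random.random() < 0.3:
        raise RuntimeError(f"transient failure on {item}")
    return item


def process_with_retry(item: int, max_retries: int = 3) -> int:
    for attempt in range(max_retries):
        if stop_event.is_set():  # bail out quickly once shutdown has begun
            raise RuntimeError("shutting_down")
        try:
            return process_one(item)
        except Exception:
            if attempt < max_retries - 1 and not stop_event.is_set():
                time.sleep(backoff_delay(attempt))  # back off before the next attempt
            else:
                raise


if __name__ == "__main__":
    executor = ThreadPoolExecutor(max_workers=3, thread_name_prefix="datawrapper")
    try:
        futures = {executor.submit(process_with_retry, i): i for i in range(10)}
        for fut in as_completed(futures):
            try:
                print("done:", fut.result(timeout=30))
            except Exception as exc:
                print("permanent failure:", futures[fut], exc)
    except KeyboardInterrupt:
        stop_event.set()  # stop further scheduling/retries, then cancel queued work
        raise
    finally:
        # Python 3.9+: cancel_futures prevents queued (not yet started) tasks from running
        executor.shutdown(wait=True, cancel_futures=True)
```

The jitter term spreads retries from concurrent workers so they do not hit the backend in lockstep, and capping the delay before applying jitter keeps the worst-case sleep bounded near the configured maximum.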