sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
- sibi_dst/df_helper/_df_helper.py +417 -117
- sibi_dst/df_helper/_parquet_artifact.py +255 -283
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/route_path_builder.py +45 -46
- sibi_dst/utils/base.py +302 -96
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +317 -73
- sibi_dst/utils/date_utils.py +1 -0
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +29 -27
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -1,9 +1,11 @@
 import datetime
 import threading
 import time
+import random
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Type, Any, Dict, Optional, Union, List, ClassVar
 
+import dask.dataframe as dd
 import pandas as pd
 from tqdm import tqdm
 
@@ -17,23 +19,23 @@ class DataWrapper(ManagedResource):
         "missing_in_history": 2,
         "existing_but_stale": 3,
         "missing_outside_history": 4,
-        "file_is_recent": 0
+        "file_is_recent": 0,
     }
     DEFAULT_MAX_AGE_MINUTES: int = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
 
     def __init__(
-        self,
-        dataclass: Type,
-        date_field: str,
-        data_path: str,
-        parquet_filename: str,
-        class_params: Optional[Dict] = None,
-        load_params: Optional[Dict] = None,
-        show_progress: bool = False,
-        timeout: float = 30,
-        max_threads: int = 3,
-        **kwargs: Any,
+        self,
+        dataclass: Type,
+        date_field: str,
+        data_path: str,
+        parquet_filename: str,
+        class_params: Optional[Dict] = None,
+        load_params: Optional[Dict] = None,
+        show_progress: bool = False,
+        timeout: float = 30,
+        max_threads: int = 3,
+        **kwargs: Any,
     ):
         super().__init__(**kwargs)
         self.dataclass = dataclass
@@ -41,15 +43,15 @@ class DataWrapper(ManagedResource):
         self.data_path = self._ensure_forward_slash(data_path)
         self.parquet_filename = parquet_filename
         if self.fs is None:
-            raise ValueError("Datawrapper requires a File system (fs) to be provided .")
+            raise ValueError("DataWrapper requires a File system (fs) to be provided.")
         self.show_progress = show_progress
         self.timeout = timeout
         self.max_threads = max_threads
         self.class_params = class_params or {
-            'debug': self.debug,
-            'logger': self.logger,
-            'fs': self.fs,
-            'verbose': self.verbose,
+            "debug": self.debug,
+            "logger": self.logger,
+            "fs": self.fs,
+            "verbose": self.verbose,
         }
         self.load_params = load_params or {}
 
@@ -60,7 +62,6 @@ class DataWrapper(ManagedResource):
         self.update_planner = kwargs.get("update_planner", None)
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        """Context manager exit"""
         if self.mmanifest:
             self.mmanifest.save()
         super().__exit__(exc_type, exc_val, exc_tb)
@@ -77,10 +78,24 @@ class DataWrapper(ManagedResource):
 
     @staticmethod
     def _ensure_forward_slash(path: str) -> str:
-        return path.rstrip('/') + '/'
+        return path.rstrip("/") + "/"
 
-    def process(self, max_retries: int = 3):
-        """Process updates with priority-based execution, retries, benchmarking and progress updates"""
+    def process(
+        self,
+        max_retries: int = 3,
+        backoff_base: float = 2.0,
+        backoff_jitter: float = 0.1,
+        backoff_max: float = 60.0,
+    ):
+        """
+        Execute the update plan with concurrency, retries and exponential backoff.
+
+        Args:
+            max_retries: attempts per date.
+            backoff_base: base for exponential backoff (delay = base**attempt).
+            backoff_jitter: multiplicative jitter factor in [0, backoff_jitter].
+            backoff_max: maximum backoff seconds per attempt (before jitter).
+        """
         overall_start = time.perf_counter()
         tasks = list(self.update_planner.get_tasks_by_priority())
         if not tasks:
@@ -91,7 +106,7 @@ class DataWrapper(ManagedResource):
             self.update_planner.show_update_plan()
 
         for priority, dates in tasks:
-            self._execute_task_batch(priority, dates, max_retries)
+            self._execute_task_batch(priority, dates, max_retries, backoff_base, backoff_jitter, backoff_max)
 
         total_time = time.perf_counter() - overall_start
         if self.processed_dates:
@@ -100,14 +115,26 @@ class DataWrapper(ManagedResource):
             if self.update_planner.show_progress:
                 self.show_benchmark_summary()
 
-    def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
-        """Executes a single batch of tasks (dates) using a thread pool."""
+    def _execute_task_batch(
+        self,
+        priority: int,
+        dates: List[datetime.date],
+        max_retries: int,
+        backoff_base: float,
+        backoff_jitter: float,
+        backoff_max: float,
+    ):
         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
         max_thr = min(len(dates), self.max_threads)
         self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
 
         with ThreadPoolExecutor(max_workers=max_thr) as executor:
-            futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
+            futures = {
+                executor.submit(
+                    self._process_date_with_retry, date, max_retries, backoff_base, backoff_jitter, backoff_max
+                ): date
+                for date in dates
+            }
             iterator = as_completed(futures)
             if self.show_progress:
                 iterator = tqdm(iterator, total=len(futures), desc=desc)
@@ -118,22 +145,30 @@ class DataWrapper(ManagedResource):
                 except Exception as e:
                     self.logger.error(f"Permanent failure for {futures[future]}: {e}")
 
-    def _process_date_with_retry(self, date: datetime.date, max_retries: int):
-        """Wrapper to apply retry logic to single date processing."""
+    def _process_date_with_retry(
+        self,
+        date: datetime.date,
+        max_retries: int,
+        backoff_base: float,
+        backoff_jitter: float,
+        backoff_max: float,
+    ):
         for attempt in range(max_retries):
             try:
                 self._process_single_date(date)
                 return
             except Exception as e:
                 if attempt < max_retries - 1:
-                    self.logger.warning(f"Retry {attempt + 1}/{max_retries} for {date}: {e}")
-                    time.sleep(2 ** attempt)  # Exponential backoff
+                    base_delay = min(backoff_base ** attempt, backoff_max)
+                    delay = base_delay * (1 + random.uniform(0.0, max(0.0, backoff_jitter)))
+                    self.logger.warning(
+                        f"Retry {attempt + 1}/{max_retries} for {date}: {e} (sleep {delay:.2f}s)"
+                    )
+                    time.sleep(delay)
                 else:
                     self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
-                    # raise
 
     def _process_single_date(self, date: datetime.date):
-        """Core date processing logic with load/save timing and thread reporting"""
         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
         self.logger.debug(f"Processing date {date.isoformat()} for {path}")
         if path in self.update_planner.skipped and self.update_planner.ignore_missing:
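Note on the retry hunk above: the rewritten _process_date_with_retry replaces the old fixed time.sleep(2 ** attempt) with a capped, jittered delay driven by the new backoff_base, backoff_jitter and backoff_max parameters. A minimal standalone sketch of that arithmetic (illustrative only; backoff_delay is not a sibi_dst function, and the defaults simply mirror the new process() signature):

import random

def backoff_delay(attempt: int, base: float = 2.0, jitter: float = 0.1, max_delay: float = 60.0) -> float:
    # Exponential growth capped at max_delay, then up to `jitter` of proportional noise on top.
    capped = min(base ** attempt, max_delay)
    return capped * (1 + random.uniform(0.0, max(0.0, jitter)))

# With the defaults, attempts 0, 1 and 2 sleep roughly 1s, 2s and 4s before jitter.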
@@ -141,74 +176,283 @@ class DataWrapper(ManagedResource):
             return
         full_path = f"{path}{self.parquet_filename}"
 
-        # thread_name = threading.current_thread().name
-        # self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
-
         overall_start = time.perf_counter()
         try:
             load_start = time.perf_counter()
             date_filter = {f"{self.date_field}__date": {date.isoformat()}}
             self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
-            # Load data using the dataclass with the provided date filter
-            # Create a copy to avoid mutating the shared instance dictionary
+
             local_load_params = self.load_params.copy()
             local_load_params.update(date_filter)
+
             with self.dataclass(**self.class_params) as local_class_instance:
-                df = local_class_instance.load(**local_load_params)
+                df = local_class_instance.load(**local_load_params)  # expected to be Dask
             load_time = time.perf_counter() - load_start
 
             if hasattr(local_class_instance, "total_records"):
-                self.logger.debug(
-                    f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
-                if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
+                total_records = int(local_class_instance.total_records)
+                self.logger.debug(f"Total records loaded: {total_records}")
+
+                if total_records == 0:
                     if self.mmanifest:
-                        self.mmanifest.record(
-                            full_path=path
-                        )
+                        self.mmanifest.record(full_path=path)
                     self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
-                elif int(local_class_instance.total_records) < 0:
-                    self.logger.warning(
-                        f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
-                        "This may indicate an error in the data loading process."
-                    )
-                else:
-                    save_start = time.perf_counter()
-                    parquet_params ={
-                        "df_result": df,
-                        "parquet_storage_path": path,
-                        "fs": self.fs,
-                        "logger": self.logger,
-                        "debug": self.debug,
-                    }
-                    with ParquetSaver(**parquet_params) as ps:
-                        ps.save_to_parquet(self.parquet_filename, overwrite=True)
-                    save_time = time.perf_counter() - save_start
-
-                    total_time = time.perf_counter() - overall_start
-                    self.benchmarks[date] = {
-                        "load_duration": load_time,
-                        "save_duration": save_time,
-                        "total_duration": total_time
-                    }
-                    self._log_success(date, total_time, full_path)
+                    return
+
+                if total_records < 0:
+                    self.logger.warning(f"Negative record count ({total_records}) for {full_path}.")
+                    return
+
+            save_start = time.perf_counter()
+            parquet_params = {
+                "df_result": df,
+                "parquet_storage_path": path,
+                "fs": self.fs,
+                "logger": self.logger,
+                "debug": self.debug,
+            }
+            with ParquetSaver(**parquet_params) as ps:
+                ps.save_to_parquet(self.parquet_filename, overwrite=True)
+            save_time = time.perf_counter() - save_start
+
+            total_time = time.perf_counter() - overall_start
+            self.benchmarks[date] = {
+                "load_duration": load_time,
+                "save_duration": save_time,
+                "total_duration": total_time,
+            }
+            self._log_success(date, total_time, full_path)
+
         except Exception as e:
             self._log_failure(date, e)
             raise
 
     def _log_success(self, date: datetime.date, duration: float, path: str):
-        msg = f"Completed {date} in {duration:.1f}s | Saved to {path}"
-        self.logger.info(msg)
+        self.logger.info(f"Completed {date} in {duration:.1f}s | Saved to {path}")
         self.processed_dates.append(date)
 
     def _log_failure(self, date: datetime.date, error: Exception):
-        msg = f"Failed processing {date}: {error}"
-        self.logger.error(msg)
+        self.logger.error(f"Failed processing {date}: {error}")
 
     def show_benchmark_summary(self):
-        """Display a summary of load/save timings per date"""
         if not self.benchmarks:
             self.logger.info("No benchmarking data to show")
             return
         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
         self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
+
+# import datetime
+# import threading
+# import time
+# from concurrent.futures import ThreadPoolExecutor, as_completed
+# from typing import Type, Any, Dict, Optional, Union, List, ClassVar
+#
+# import pandas as pd
+# from tqdm import tqdm
+#
+# from . import ManagedResource
+# from .parquet_saver import ParquetSaver
+#
+#
+# class DataWrapper(ManagedResource):
+#     DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
+#         "overwrite": 1,
+#         "missing_in_history": 2,
+#         "existing_but_stale": 3,
+#         "missing_outside_history": 4,
+#         "file_is_recent": 0
+#     }
+#     DEFAULT_MAX_AGE_MINUTES: int = 1440
+#     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
+#
+#     def __init__(
+#         self,
+#         dataclass: Type,
+#         date_field: str,
+#         data_path: str,
+#         parquet_filename: str,
+#         class_params: Optional[Dict] = None,
+#         load_params: Optional[Dict] = None,
+#         show_progress: bool = False,
+#         timeout: float = 30,
+#         max_threads: int = 3,
+#         **kwargs: Any,
+#     ):
+#         super().__init__(**kwargs)
+#         self.dataclass = dataclass
+#         self.date_field = date_field
+#         self.data_path = self._ensure_forward_slash(data_path)
+#         self.parquet_filename = parquet_filename
+#         if self.fs is None:
+#             raise ValueError("Datawrapper requires a File system (fs) to be provided .")
+#         self.show_progress = show_progress
+#         self.timeout = timeout
+#         self.max_threads = max_threads
+#         self.class_params = class_params or {
+#             'debug': self.debug,
+#             'logger': self.logger,
+#             'fs': self.fs,
+#             'verbose': self.verbose,
+#         }
+#         self.load_params = load_params or {}
+#
+#         self._lock = threading.Lock()
+#         self.processed_dates: List[datetime.date] = []
+#         self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
+#         self.mmanifest = kwargs.get("mmanifest", None)
+#         self.update_planner = kwargs.get("update_planner", None)
+#
+#     def __exit__(self, exc_type, exc_val, exc_tb):
+#         """Context manager exit"""
+#         if self.mmanifest:
+#             self.mmanifest.save()
+#         super().__exit__(exc_type, exc_val, exc_tb)
+#         return False
+#
+#     @staticmethod
+#     def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
+#         if isinstance(date, datetime.date):
+#             return date
+#         try:
+#             return pd.to_datetime(date).date()
+#         except ValueError as e:
+#             raise ValueError(f"Error converting {date} to datetime: {e}")
+#
+#     @staticmethod
+#     def _ensure_forward_slash(path: str) -> str:
+#         return path.rstrip('/') + '/'
+#
+#     def process(self, max_retries: int = 3):
+#         """Process updates with priority-based execution, retries, benchmarking and progress updates"""
+#         overall_start = time.perf_counter()
+#         tasks = list(self.update_planner.get_tasks_by_priority())
+#         if not tasks:
+#             self.logger.info("No updates required based on the current plan.")
+#             return
+#
+#         if self.update_planner.show_progress:
+#             self.update_planner.show_update_plan()
+#
+#         for priority, dates in tasks:
+#             self._execute_task_batch(priority, dates, max_retries)
+#
+#         total_time = time.perf_counter() - overall_start
+#         if self.processed_dates:
+#             count = len(self.processed_dates)
+#             self.logger.info(f"Processed {count} dates in {total_time:.1f}s (avg {total_time / count:.1f}s/date)")
+#             if self.update_planner.show_progress:
+#                 self.show_benchmark_summary()
+#
+#     def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
+#         """Executes a single batch of tasks (dates) using a thread pool."""
+#         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
+#         max_thr = min(len(dates), self.max_threads)
+#         self.logger.info(f"Executing {len(dates)} tasks with priority {priority} using {max_thr} threads.")
+#
+#         with ThreadPoolExecutor(max_workers=max_thr) as executor:
+#             futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
+#             iterator = as_completed(futures)
+#             if self.show_progress:
+#                 iterator = tqdm(iterator, total=len(futures), desc=desc)
+#
+#             for future in iterator:
+#                 try:
+#                     future.result(timeout=self.timeout)
+#                 except Exception as e:
+#                     self.logger.error(f"Permanent failure for {futures[future]}: {e}")
+#
+#     def _process_date_with_retry(self, date: datetime.date, max_retries: int):
+#         """Wrapper to apply retry logic to single date processing."""
+#         for attempt in range(max_retries):
+#             try:
+#                 self._process_single_date(date)
+#                 return
+#             except Exception as e:
+#                 if attempt < max_retries - 1:
+#                     self.logger.warning(f"Retry {attempt + 1}/{max_retries} for {date}: {e}")
+#                     time.sleep(2 ** attempt)  # Exponential backoff
+#                 else:
+#                     self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
+#                     # raise
+#
+#     def _process_single_date(self, date: datetime.date):
+#         """Core date processing logic with load/save timing and thread reporting"""
+#         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
+#         self.logger.debug(f"Processing date {date.isoformat()} for {path}")
+#         if path in self.update_planner.skipped and self.update_planner.ignore_missing:
+#             self.logger.debug(f"Skipping {date} as it exists in the skipped list")
+#             return
+#         full_path = f"{path}{self.parquet_filename}"
+#
+#         # thread_name = threading.current_thread().name
+#         # self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
+#
+#         overall_start = time.perf_counter()
+#         try:
+#             load_start = time.perf_counter()
+#             date_filter = {f"{self.date_field}__date": {date.isoformat()}}
+#             self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
+#             # Load data using the dataclass with the provided date filter
+#             # Create a copy to avoid mutating the shared instance dictionary
+#             local_load_params = self.load_params.copy()
+#             local_load_params.update(date_filter)
+#             with self.dataclass(**self.class_params) as local_class_instance:
+#                 df = local_class_instance.load(**local_load_params)
+#             load_time = time.perf_counter() - load_start
+#
+#             if hasattr(local_class_instance, "total_records"):
+#                 self.logger.debug(
+#                     f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
+#                 if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
+#                     if self.mmanifest:
+#                         self.mmanifest.record(
+#                             full_path=path
+#                         )
+#                     self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
+#                 elif int(local_class_instance.total_records) < 0:
+#                     self.logger.warning(
+#                         f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
+#                         "This may indicate an error in the data loading process."
+#                     )
+#                 else:
+#                     save_start = time.perf_counter()
+#                     parquet_params ={
+#                         "df_result": df,
+#                         "parquet_storage_path": path,
+#                         "fs": self.fs,
+#                         "logger": self.logger,
+#                         "debug": self.debug,
+#                     }
+#                     with ParquetSaver(**parquet_params) as ps:
+#                         ps.save_to_parquet(self.parquet_filename, overwrite=True)
+#                     save_time = time.perf_counter() - save_start
+#
+#                     total_time = time.perf_counter() - overall_start
+#                     self.benchmarks[date] = {
+#                         "load_duration": load_time,
+#                         "save_duration": save_time,
+#                         "total_duration": total_time
+#                     }
+#                     self._log_success(date, total_time, full_path)
+#         except Exception as e:
+#             self._log_failure(date, e)
+#             raise
+#
+#     def _log_success(self, date: datetime.date, duration: float, path: str):
+#         msg = f"Completed {date} in {duration:.1f}s | Saved to {path}"
+#         self.logger.info(msg)
+#         self.processed_dates.append(date)
+#
+#     def _log_failure(self, date: datetime.date, error: Exception):
+#         msg = f"Failed processing {date}: {error}"
+#         self.logger.error(msg)
+#
+#     def show_benchmark_summary(self):
+#         """Display a summary of load/save timings per date"""
+#         if not self.benchmarks:
+#             self.logger.info("No benchmarking data to show")
+#             return
+#         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
+#         df_bench = df_bench.set_index("date").sort_index(ascending=not self.update_planner.reverse_order)
+#         self.logger.info(f"Benchmark Summary:\n {self.dataclass.__name__}\n" + df_bench.to_string())
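For reference, the per-date output path assembled in _process_single_date keeps the year/month/day layout from the previous version. A small runnable sketch of the same formatting (build_output_path and the example root and filename are made-up illustrations, not sibi_dst APIs):

import datetime

def build_output_path(data_path: str, parquet_filename: str, date: datetime.date) -> str:
    # Mirrors the f-strings in _process_single_date: {root}{YYYY}/{MM}/{DD}/{filename}
    return f"{data_path}{date.year}/{date.month:02d}/{date.day:02d}/{parquet_filename}"

print(build_output_path("s3://bucket/dataset/", "data.parquet", datetime.date(2025, 8, 1)))
# s3://bucket/dataset/2025/08/01/data.parquet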
sibi_dst/utils/date_utils.py
CHANGED
@@ -145,6 +145,7 @@ class DateUtils:
             'current_month': lambda: cls.get_month_range(n=0),
             'last_month': lambda: cls.get_month_range(n=-1),
             'current_year': lambda: cls.get_year_timerange(today().year),
+            'last_year': lambda: cls.get_year_timerange(today().year - 1),
             'current_quarter': lambda: (
                 cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
             'ytd': lambda: (datetime.date(today().year, 1, 1), today()),