sibi-dst 2025.1.7__py3-none-any.whl → 2025.1.8__py3-none-any.whl
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -1,16 +1,13 @@
|
|
1
1
|
import datetime
|
2
|
-
import logging
|
3
2
|
import threading
|
4
3
|
import time
|
5
4
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
6
5
|
from typing import Type, Any, Dict, Optional, Union, List, ClassVar
|
7
6
|
|
8
|
-
import fsspec
|
9
7
|
import pandas as pd
|
10
8
|
from tqdm import tqdm
|
11
9
|
|
12
10
|
from . import ManagedResource
|
13
|
-
from .log_utils import Logger
|
14
11
|
from .parquet_saver import ParquetSaver
|
15
12
|
|
16
13
|
|
@@ -60,8 +57,7 @@ class DataWrapper(ManagedResource):
|
|
60
57
|
self.processed_dates: List[datetime.date] = []
|
61
58
|
self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
|
62
59
|
self.mmanifest = kwargs.get("mmanifest", None)
|
63
|
-
self.update_planner=kwargs.get("update_planner", None)
|
64
|
-
|
60
|
+
self.update_planner = kwargs.get("update_planner", None)
|
65
61
|
|
66
62
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
67
63
|
"""Context manager exit"""
|
@@ -104,7 +100,6 @@ class DataWrapper(ManagedResource):
|
|
104
100
|
if self.update_planner.show_progress:
|
105
101
|
self.show_benchmark_summary()
|
106
102
|
|
107
|
-
|
108
103
|
def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
|
109
104
|
"""Executes a single batch of tasks (dates) using a thread pool."""
|
110
105
|
desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
|
@@ -135,7 +130,7 @@ class DataWrapper(ManagedResource):
|
|
135
130
|
time.sleep(2 ** attempt) # Exponential backoff
|
136
131
|
else:
|
137
132
|
self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
|
138
|
-
#raise
|
133
|
+
# raise
|
139
134
|
|
140
135
|
def _process_single_date(self, date: datetime.date):
|
141
136
|
"""Core date processing logic with load/save timing and thread reporting"""
|
@@ -146,8 +141,8 @@ class DataWrapper(ManagedResource):
|
|
146
141
|
return
|
147
142
|
full_path = f"{path}{self.parquet_filename}"
|
148
143
|
|
149
|
-
#thread_name = threading.current_thread().name
|
150
|
-
#self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
|
144
|
+
# thread_name = threading.current_thread().name
|
145
|
+
# self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
|
151
146
|
|
152
147
|
overall_start = time.perf_counter()
|
153
148
|
try:
|
@@ -159,35 +154,43 @@ class DataWrapper(ManagedResource):
|
|
159
154
|
local_load_params = self.load_params.copy()
|
160
155
|
local_load_params.update(date_filter)
|
161
156
|
local_class_instance = self.dataclass(**self.class_params)
|
162
|
-
df=local_class_instance.load(**local_load_params)
|
157
|
+
df = local_class_instance.load(**local_load_params)
|
163
158
|
load_time = time.perf_counter() - load_start
|
164
159
|
|
165
160
|
if hasattr(local_class_instance, "total_records"):
|
166
|
-
self.logger.debug(
|
161
|
+
self.logger.debug(
|
162
|
+
f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
|
167
163
|
if int(local_class_instance.total_records) == 0: # If no records were loaded but not due to an error
|
168
164
|
if self.mmanifest:
|
169
165
|
self.mmanifest.record(
|
170
166
|
full_path=path
|
171
167
|
)
|
172
|
-
self.logger.info(f"No data found for {
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
168
|
+
self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
|
169
|
+
elif int(local_class_instance.total_records) < 0:
|
170
|
+
self.logger.warning(
|
171
|
+
f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
|
172
|
+
"This may indicate an error in the data loading process."
|
173
|
+
)
|
174
|
+
else:
|
175
|
+
save_start = time.perf_counter()
|
176
|
+
parquet_params ={
|
177
|
+
"df_result": df,
|
178
|
+
"parquet_storage_path": path,
|
179
|
+
"fs": self.fs,
|
180
|
+
"logger": self.logger,
|
181
|
+
"debug": self.debug,
|
182
|
+
}
|
183
|
+
with ParquetSaver(**parquet_params) as ps:
|
184
|
+
ps.save_to_parquet(self.parquet_filename, overwrite=True)
|
185
|
+
save_time = time.perf_counter() - save_start
|
186
|
+
|
187
|
+
total_time = time.perf_counter() - overall_start
|
188
|
+
self.benchmarks[date] = {
|
189
|
+
"load_duration": load_time,
|
190
|
+
"save_duration": save_time,
|
191
|
+
"total_duration": total_time
|
192
|
+
}
|
193
|
+
self._log_success(date, total_time, full_path)
|
191
194
|
except Exception as e:
|
192
195
|
self._log_failure(date, e)
|
193
196
|
raise
|
sibi_dst/utils/parquet_saver.py
CHANGED
@@ -26,10 +26,6 @@ class ParquetSaver(ManagedResource):
|
|
26
26
|
super().__init__(**kwargs)
|
27
27
|
self.df_result = df_result
|
28
28
|
self.parquet_storage_path = parquet_storage_path.rstrip("/")
|
29
|
-
#self.debug = debug
|
30
|
-
#self.logger = logger or Logger.default_logger(self.__class__.__name__)
|
31
|
-
#self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
|
32
|
-
#self.fs = fs
|
33
29
|
# Determine protocol for special handling (e.g., 's3')
|
34
30
|
if not self.fs:
|
35
31
|
raise ValueError("File system (fs) must be provided to ParquetSaver.")
|
@@ -38,14 +38,14 @@ sibi_dst/utils/clickhouse_writer.py,sha256=mNUJoYOreIdRrEFv2mQ6pdtLi1Iz_2rALDyO6
|
|
38
38
|
sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
|
39
39
|
sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
|
40
40
|
sibi_dst/utils/data_utils.py,sha256=MqbwXk33BuANWeKKmsabHouhb8GZswSmbM-VetWWE-M,10357
|
41
|
-
sibi_dst/utils/data_wrapper.py,sha256=
|
41
|
+
sibi_dst/utils/data_wrapper.py,sha256=deUz2760T_v42Ni1twLUcGS4ucIQM63vJnC6p8sWsb4,9470
|
42
42
|
sibi_dst/utils/date_utils.py,sha256=8fwPpOYqSdM3nHeNykh7Ftk-uPdFa44cEAy5S8iUNw4,18667
|
43
43
|
sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11401
|
44
44
|
sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
|
45
45
|
sibi_dst/utils/filepath_generator.py,sha256=-HHO0U-PR8fysDDFwnWdHRlgqksh_RkmgBZLWv9hM7s,6669
|
46
46
|
sibi_dst/utils/log_utils.py,sha256=_YnpCnMcjT--ou3BU0EGJma1xMULrA4V5v5UU4IbjAo,14102
|
47
47
|
sibi_dst/utils/manifest_manager.py,sha256=Rw7i2phoKJjGlPHYLg_1kr40syVKxd9LJEmfxvZPeDg,8544
|
48
|
-
sibi_dst/utils/parquet_saver.py,sha256=
|
48
|
+
sibi_dst/utils/parquet_saver.py,sha256=B1ztPZMJvsulbgXMBnJdSkPhLFvdv8sRnmyqjjBBRTI,4735
|
49
49
|
sibi_dst/utils/phone_formatter.py,sha256=tsVTDamuthFYgy4-5UwmQkPQ-FGTGH7MjZyH8utAkIY,4945
|
50
50
|
sibi_dst/utils/storage_config.py,sha256=uaCBF8rgCeYkk-lxVSCjsic8O8HJKAu455MR-OBliCo,4325
|
51
51
|
sibi_dst/utils/storage_manager.py,sha256=yyZqT8XjTf4MKFrfznCmxXxOYz_TiWgtQhzqPoXR9So,6569
|
@@ -71,6 +71,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
|
|
71
71
|
sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
|
72
72
|
sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
|
73
73
|
sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
|
74
|
-
sibi_dst-2025.1.
|
75
|
-
sibi_dst-2025.1.
|
76
|
-
sibi_dst-2025.1.
|
74
|
+
sibi_dst-2025.1.8.dist-info/METADATA,sha256=DcMnd1S5S6CGThkl1-ebn4HWkmGC0ujPHZYHMPrJHd0,2610
|
75
|
+
sibi_dst-2025.1.8.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
76
|
+
sibi_dst-2025.1.8.dist-info/RECORD,,
|
File without changes
|