sibi-dst 0.3.40__py3-none-any.whl → 0.3.42__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- sibi_dst/df_helper/__init__.py +2 -0
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +257 -0
- sibi_dst/utils/__init__.py +3 -0
- sibi_dst/utils/data_utils.py +66 -25
- sibi_dst/utils/data_wrapper.py +586 -285
- sibi_dst/utils/date_utils.py +118 -113
- sibi_dst/utils/log_utils.py +57 -18
- sibi_dst/utils/phone_formatter.py +127 -0
- {sibi_dst-0.3.40.dist-info → sibi_dst-0.3.42.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.40.dist-info → sibi_dst-0.3.42.dist-info}/RECORD +11 -9
- {sibi_dst-0.3.40.dist-info → sibi_dst-0.3.42.dist-info}/WHEEL +0 -0
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -1,73 +1,25 @@
 import datetime
-from concurrent.futures import ThreadPoolExecutor
-from typing import Type, Any, Dict, Optional, Union
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Type, Any, Dict, Optional, Union, List, Tuple
 from threading import Lock
 import fsspec
 import pandas as pd
 from IPython.display import display
 from tqdm import tqdm

-from sibi_dst.utils import Logger, DateUtils
+from sibi_dst.utils import Logger, FileAgeChecker
 from sibi_dst.utils import ParquetSaver


 class DataWrapper:
-    """
-    Utility class for handling file-based operations, including processing and saving data
-    in Parquet format, while managing a hierarchy of conditions such as overwrite, history
-    threshold, and missing file detection.
-
-    This class aims to simplify the process of managing large datasets stored in a filesystem.
-    It allows for controlled updates to data files based on parameters set by the user, with
-    support for different filesystem types and options.
-
-    It also provides features like logging actions, managing processing threads, generating
-    update plans, checking file age, and dynamically creating date ranges for data operations.
-
-    The design supports flexible integration with user-defined classes (dataclasses) to define
-    custom loading and processing behavior.
-
-    :ivar dataclass: The user-defined class for data processing.
-    :type dataclass: Type
-    :ivar date_field: The name of the date field in the user-defined class.
-    :type date_field: str
-    :ivar data_path: Base path for the dataset storage.
-    :type data_path: str
-    :ivar parquet_filename: File name for the Parquet file.
-    :type parquet_filename: str
-    :ivar start_date: Start date for processing.
-    :type start_date: datetime.date
-    :ivar end_date: End date for processing.
-    :type end_date: datetime.date
-    :ivar fs: File system object for managing files.
-    :type fs: Optional[fsspec.AbstractFileSystem]
-    :ivar filesystem_type: Type of the filesystem (e.g., "file", "s3").
-    :type filesystem_type: str
-    :ivar filesystem_options: Additional options for initializing the filesystem.
-    :type filesystem_options: Optional[Dict]
-    :ivar verbose: Flag to enable verbose logging.
-    :type verbose: bool
-    :ivar class_params: Parameters to initialize the dataclass.
-    :type class_params: Optional[Dict]
-    :ivar load_params: Additional parameters for loading functions.
-    :type load_params: Optional[Dict]
-    :ivar reverse_order: Flag to reverse the order of date range generation.
-    :type reverse_order: bool
-    :ivar overwrite: Whether to overwrite all files during processing.
-    :type overwrite: bool
-    :ivar ignore_missing: Whether to ignore missing files.
-    :type ignore_missing: bool
-    :ivar logger: Logger instance for logging information.
-    :type logger: Optional[Logger]
-    :ivar max_age_minutes: Maximum file age threshold in minutes.
-    :type max_age_minutes: int
-    :ivar history_days_threshold: Number of days for the history threshold.
-    :type history_days_threshold: int
-    :ivar show_progress: Flag to enable progress display.
-    :type show_progress: bool
-    :ivar timeout: Timeout in seconds for processing tasks with threads.
-    :type timeout: Optional[int]
-    """
+    DEFAULT_PRIORITY_MAP = {
+        "overwrite": 1,
+        "missing_in_history": 2,
+        "existing_but_stale": 3,
+        "missing_outside_history": 4,
+        "file_is_recent": 0
+    }
     DEFAULT_MAX_AGE_MINUTES = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD = 30

@@ -81,6 +33,7 @@ class DataWrapper:
                  fs: Optional[fsspec.AbstractFileSystem] = None,
                  filesystem_type: str = "file",
                  filesystem_options: Optional[Dict] = None,
+                 debug: bool = False,
                  verbose: bool = False,
                  class_params: Optional[Dict] = None,
                  load_params: Optional[Dict] = None,
@@ -91,14 +44,17 @@ class DataWrapper:
                  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
                  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
                  show_progress: bool = False,
-                 timeout: float = 60):
+                 timeout: float = 60,
+                 reference_date: datetime.date = None,
+                 custom_priority_map: Dict[str, int] = None):
         self.dataclass = dataclass
         self.date_field = date_field
-        self.data_path = self.ensure_forward_slash(data_path)
+        self.data_path = self._ensure_forward_slash(data_path)
         self.parquet_filename = parquet_filename
         self.filesystem_type = filesystem_type
         self.filesystem_options = filesystem_options or {}
-        self.fs = fs
+        self.fs = fs or self._init_filesystem()
+        self.debug = debug
         self.verbose = verbose
         self.class_params = class_params or {}
         self.load_params = load_params or {}
@@ -106,23 +62,26 @@ class DataWrapper:
         self.overwrite = overwrite
         self.ignore_missing = ignore_missing
         self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
+        self.logger.set_level(logging.DEBUG if debug else logging.INFO)
         self.max_age_minutes = max_age_minutes
         self.history_days_threshold = history_days_threshold
         self.show_progress = show_progress
         self.timeout = timeout
+        self.reference_date = reference_date or datetime.date.today()
+        self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP

-        self.start_date = self.convert_to_date(start_date)
-        self.end_date = self.convert_to_date(end_date)
+        self.start_date = self._convert_to_date(start_date)
+        self.end_date = self._convert_to_date(end_date)
         self._lock = Lock()
         self.processed_dates = []
-        self.date_utils = DateUtils(logger=self.logger)
-        if self.fs is None:
-            with self._lock:
-                if self.fs is None:
-                    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+        self.age_checker = FileAgeChecker(logger=self.logger)
+
+    def _init_filesystem(self) -> fsspec.AbstractFileSystem:
+        with self._lock:
+            return fsspec.filesystem(self.filesystem_type, **self.filesystem_options)

     @staticmethod
-    def convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
+    def _convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
         if isinstance(date, datetime.date):
             return date
         try:
@@ -131,234 +90,576 @@ class DataWrapper:
             raise ValueError(f"Error converting {date} to datetime: {e}")

     @staticmethod
-    def ensure_forward_slash(path: str) -> str:
-        return path if path.endswith('/') else path + '/'
+    def _ensure_forward_slash(path: str) -> str:
+        return path.rstrip('/') + '/'
+
+    def generate_date_range(self) -> List[datetime.date]:
+        """Generate ordered date range with future date handling"""
+        date_range = pd.date_range(
+            start=self.start_date,
+            end=self.end_date,
+            freq='D'
+        ).date.tolist()

-    def generate_date_range(self):
-        """Generate a range of dates between start_date and end_date."""
-        date_range = pd.date_range(self.start_date, self.end_date, freq='D')
         if self.reverse_order:
-            date_range = date_range[::-1]
-        for date in date_range:
-            yield date.date()
+            date_range.reverse()
+
+        return [
+            d for d in date_range
+            if d <= self.reference_date or self.overwrite
+        ]

     def process(self, max_retries: int = 3):
-        """
-        Processes update tasks by generating an update plan, filtering required updates, and distributing
-        the workload across threads based on priority levels.
-
-        This method operates by assessing required updates through generated conditions,
-        grouping them by priority levels, and processing them in parallel threads.
-        Each thread handles the updates for a specific priority level, ensuring a streamlined approach
-        to handling the updates efficiently.
-
-        :param max_retries: Maximum number of retries for a task after a timeout. Defaults to 3.
-        :raises TimeoutError: If a thread processing a priority level exceeds the allowed timeout duration.
-        :return: None
-        """
-        update_plan_table = self.generate_update_plan_with_conditions()
-
-        # Filter out rows that do not require updates (priority 0 means skip)
-        with self._lock:
-            update_plan_table = update_plan_table[
-                (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
-            ]
-        # Display the update plan table to the user if requested
-        if len(update_plan_table.index) == 0:
+        """Process updates with priority-based execution and retries"""
+        update_plan = self.generate_update_plan()
+
+        if update_plan.empty:
+            self.logger.info("No updates required")
             return
+        # Filter for required updates first
+        update_plan = update_plan[update_plan["update_required"] == True]
+
         if self.show_progress:
-            display(update_plan_table)
-        # Group by priority
-        with self._lock:
-            priorities = sorted(update_plan_table["update_priority"].unique())
-
-        # We will process each priority level in its own thread.
-        # Each thread will handle all dates associated with that priority.
-        def process_priority(priority):
-            # Extract dates for the current priority
-            dates_to_process = update_plan_table[
-                update_plan_table["update_priority"] == priority
-            ]["date"].tolist()
-
-            # If show_progress is True, wrap in a progress bar
-            date_iterator = dates_to_process
-            if self.show_progress:
-                date_iterator = tqdm(date_iterator,
-                                     desc=f"Processing priority {priority}:{self.dataclass.__name__}",
-                                     unit="date")
-
-            # Process each date for this priority
-            for current_date in date_iterator:
-                self.process_date(current_date)
-
-        # Launch a separate thread for each priority
-        with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
-            futures = {executor.submit(process_priority, p): p for p in priorities}
-            retries = {p: 0 for p in priorities}  # Track retry counts for each priority
-
-            while futures:
-                for future in list(futures.keys()):
-                    try:
-                        future.result(timeout=self.timeout)
-                        del futures[future]  # Remove completed future
-                    except TimeoutError:
-                        priority = futures[future]
-                        retries[priority] += 1
-
-                        if retries[priority] <= max_retries:
-                            self.logger.warning(
-                                f"Thread for priority {priority} timed out. Retrying ({retries[priority]}/{max_retries})..."
-                            )
-                            new_future = executor.submit(process_priority, priority)
-                            futures[new_future] = priority
-                        else:
-                            self.logger.error(
-                                f"Thread for priority {priority} timed out. Max retries ({max_retries}) exceeded. Skipping."
-                            )
-                            del futures[future]  # Remove the timed-out future
-                    except Exception as e:
-                        self.logger.error(f"Error processing priority {futures[future]}: {e}")
-                        del futures[future]  # Remove the failed future
-
-    def process_date(self, date: datetime.date):
-        """
-        Processes data for a given date and saves it as a Parquet file.
-
-        This method processes data for the specified date by loading the data
-        corresponding to that day, saving it into a structured storage format
-        (Parquet), and logging relevant information such as processing time
-        and errors that may occur during the process. It uses provided
-        dataclass and parameters to operate and ensures the data is stored
-        in a structured folder hierarchy.
-
-        :param date: The specific date for which data processing and saving should occur
-        :type date: datetime.date
-        :return: None
-        """
-        folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
-        full_parquet_filename = f"{folder}{self.parquet_filename}"
+            #display(self._enhanced_display_table(update_plan))
+            display(update_plan)
+
+        for priority in sorted(update_plan["update_priority"].unique()):
+            self._process_priority_group(update_plan, priority, max_retries)
+
+    def _process_priority_group(self,
+                                update_plan: pd.DataFrame,
+                                priority: int,
+                                max_retries: int):
+        """Process a single priority group with parallel execution"""
+        dates = update_plan[update_plan["update_priority"] == priority]["date"].tolist()
+        if not dates:
+            return

-        start_time = datetime.datetime.now()
-        self.logger.info(f"Processing date: {date}")
-        self.logger.info(f"Processing {full_parquet_filename}...")
+        desc = f"Processing {self.dataclass.__name__}, task: {self._priority_label(priority)}"
+        self.logger.info(f"Starting {desc.lower()}")

-        data_object = self.dataclass(**self.class_params)
-        df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
+        with ThreadPoolExecutor() as executor:
+            futures = {
+                executor.submit(self._process_date_with_retry, date, max_retries): date
+                for date in dates
+            }

-        if len(df.index) == 0:
-            self.logger.error("No data found for the specified date.")
-            return
+            for future in tqdm(as_completed(futures),
+                               total=len(futures),
+                               desc=desc,
+                               disable=not self.show_progress):
+                date = futures[future]
+                try:
+                    future.result(timeout=self.timeout)
+                except Exception as e:
+                    self.logger.error(f"Permanent failure processing {date}: {str(e)}")
+
+    def _priority_label(self, priority: int) -> str:
+        """Get human-readable label for priority level"""
+        return next(
+            (k for k, v in self.priority_map.items() if v == priority),
+            f"Unknown Priority {priority}"
+        )
+
+    def _enhanced_display_table(self, df: pd.DataFrame) -> pd.DataFrame.style:
+        """Format the update plan table for better readability"""
+        return df.style \
+            .bar(subset=["file_age_minutes"], color="#5fba7d") \
+            .background_gradient(subset=["update_priority"], cmap="YlOrBr") \
+            .set_caption(f"Update Plan: {self.dataclass.__name__}")
+
+    def generate_update_plan(self) -> pd.DataFrame:
+        """Generate update plan with parallel file status checks"""
+        dates = self.generate_date_range()
+        history_start = self.reference_date - datetime.timedelta(days=self.history_days_threshold)
+        rows = []

-        with self._lock:
-            parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
-            parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
+        with ThreadPoolExecutor() as executor:
+            future_to_date = {
+                executor.submit(self._get_file_status, date): date
+                for date in dates
+            }

-        end_time = datetime.datetime.now()
-        duration_seconds = (end_time - start_time).total_seconds()
-        self.logger.info(
-            f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
-        )
+            for future in tqdm(as_completed(future_to_date),
+                               total=len(future_to_date),
+                               desc=f"Analyzing files for {self.dataclass.__name__} ",
+                               disable=not self.show_progress):
+                current_date = future_to_date[future]
+                file_exists, file_age = future.result()
+                rows.append(self._create_plan_row(
+                    current_date,
+                    history_start,
+                    file_exists,
+                    file_age
+                ))
+
+        return pd.DataFrame(rows).sort_values("update_priority")
+
+    def _get_file_status(self, date: datetime.date) -> Tuple[bool, float]:
+        """Get file existence and age with error handling"""
+        path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/{self.parquet_filename}"
+        try:
+            exists = self.fs.exists(path)
+            age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
+            return exists, age
+        except Exception as e:
+            self.logger.warning(f"Error checking {path}: {str(e)}")
+            return False, None
+
+    def _create_plan_row(self,
+                         date: datetime.date,
+                         history_start: datetime.date,
+                         file_exists: bool,
+                         file_age: float) -> dict:
+        """Create a row for the update plan DataFrame"""
+        within_history = history_start <= date <= self.reference_date
+        category, update_required = "file_is_recent", False
+
+        if self.overwrite:
+            category, update_required = "overwrite", True
+        elif within_history:
+            if not file_exists:
+                category, update_required = "missing_in_history", True
+            elif file_age > self.max_age_minutes:
+                category, update_required = "existing_but_stale", True
+        elif not file_exists and not self.ignore_missing:
+            category, update_required = "missing_outside_history", True
+
+        return {
+            "date": date,
+            "file_exists": file_exists,
+            "file_age_minutes": file_age,
+            "age_threshold": self.max_age_minutes,
+            "within_history": within_history,
+            "ignore_missing": self.ignore_missing,
+            "update_category": category,
+            "update_priority": self.priority_map[category],
+            "update_required": update_required,
+            "class": self.dataclass.__name__
+        }

-        self.processed_dates.append(date)
-        self.logger.info(f"Finished processing date: {date}")
+    def _process_date_with_retry(self, date: datetime.date, max_retries: int):
+        """Process a date with retry logic"""
+        for attempt in range(1, max_retries + 1):
+            try:
+                self._process_single_date(date)
+                return
+            except Exception as e:
+                if attempt < max_retries:
+                    self.logger.warning(f"Retry {attempt}/{max_retries} for {date}: {str(e)}")
+                else:
+                    raise RuntimeError(f"Failed processing {date} after {max_retries} attempts") from e

-    def generate_update_plan_with_conditions(self):
-        """
-        Generates an update plan for data files based on specific conditions. The function evaluates the need for updating or
-        overwriting data files for a given date range. Conditions include file existence, whether the file falls within a
-        specified historical threshold, and the necessity to overwrite or handle missing files. A priority map is utilized to
-        assign priority levels to update categories.
+    def _process_single_date(self, date: datetime.date):
+        """Core date processing logic"""
+        path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
+        full_path = f"{path}{self.parquet_filename}"

-        :raises FileNotFoundError: If any file is referenced that does not exist and the ``ignore_missing`` property is set to False.
-        :raises AttributeError: If any required attribute like ``fs``, ``dataclass``, or others are not properly set or initialized.
+        self.logger.info(f"Processing {date} ({full_path})")
+        start_time = datetime.datetime.now()

-        :return: A Pandas DataFrame representing the update plan, where each row contains information about a date, the conditions
-        evaluated for that date, and the determined update priority.
-        :rtype: pandas.DataFrame
-        """
-        rows = []
+        try:
+            data = self.dataclass(**self.class_params)
+            df = data.load_period(
+                dt_field=self.date_field,
+                start=date,
+                end=date,
+                **self.load_params
+            )

-        today = datetime.date.today()
-        history_start_date = today - datetime.timedelta(days=self.history_days_threshold)
-        priority_map = {
-            "file is recent":0,
-            "overwrite": 1,
-            "history_days": 2,
-            "missing_files": 3
-        }
-        date_range = self.generate_date_range()
-        if self.show_progress:
-            date_range = tqdm(date_range, desc=f"Evaluating update plan:{self.dataclass.__name__}", unit="date")
-
-        for current_date in date_range:
-            folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
-            full_parquet_filename = f"{folder}{self.parquet_filename}"
-
-            file_exists = self.fs.exists(full_parquet_filename)
-            within_history = history_start_date <= current_date <= today
-            missing_file = not file_exists and not self.ignore_missing
-            category = None
-            update_required = False
-
-            # Hierarchy 1: Overwrite
-            if self.overwrite:
-                category = "overwrite"
-                update_required = True
-            elif missing_file and current_date < today:
-                category = "missing_files"
-                update_required = True
-
-            elif within_history:
-                if file_exists:
-                    if self.date_utils.is_file_older_than(
-                        full_parquet_filename,
-                        max_age_minutes=self.max_age_minutes,
-                        fs=self.fs,
-                        ignore_missing=self.ignore_missing,
-                        verbose=self.verbose
-                    ):
-                        category = "history_days"
-                        update_required = True
-                    else:
-                        category = "file is recent"
-                        update_required = False
-                else:
-                    category = "missing_files"
-                    update_required = True
-            else:
-                category = "No Update Required"
-                update_required = False
-
-            # Collect condition descriptions for the update plan table
-            row = {
-                "date": current_date,
-                "file_exists": file_exists,
-                "within_history": within_history,
-                "missing_file": missing_file,
-                "update_required": update_required,
-                "update_category": category,
-                "datawrapper class": self.dataclass.__name__,
-                "update_priority": priority_map.get(category, 0)
-            }
-            rows.append(row)
-
-        update_plan_table = pd.DataFrame(rows)
-        return update_plan_table
-
-# # wrapper.process()
-# # wrapper = DataWrapper(
-# #     dataclass=YourDataClass,
-# #     date_field="created_at",
-# #     data_path="s3://your-bucket-name/path/to/data",
-# #     parquet_filename="data.parquet",
-# #     start_date="2022-01-01",
-# #     end_date="2022-12-31",
-# #     filesystem_type="s3",
-# #     filesystem_options={
-# #         "key": "your_aws_access_key",
-# #         "secret": "your_aws_secret_key",
-# #         "client_kwargs": {"endpoint_url": "https://s3.amazonaws.com"}
-# #     },
-# #     verbose=True
-# # )
-# # wrapper.process()
+            if len(df.index)==0:
+                self.logger.warning(f"No data found for {date}")
+                return
+
+            with self._lock:
+                ParquetSaver(
+                    df_result=df,
+                    parquet_storage_path=path,
+                    fs=self.fs,
+                    logger=self.logger
+                ).save_to_parquet(self.parquet_filename)
+
+            duration = (datetime.datetime.now() - start_time).total_seconds()
+            self._log_success(date, duration, full_path)
+
+        except Exception as e:
+            self._log_failure(date, e)
+            raise
+
+    def _log_success(self, date: datetime.date, duration: float, path: str):
+        """Handle successful processing logging"""
+        msg = f"Completed {date} in {duration:.1f}s | Saved to {path}"
+        self.logger.info(msg)
+        self.processed_dates.append(date)
+
+    def _log_failure(self, date: datetime.date, error: Exception):
+        """Handle error logging"""
+        msg = f"Failed processing {date}: {str(error)}"
+        self.logger.error(msg)
+
+    # Helper functions omitted for brevity (date conversion, path normalization)
+
+    # (New file lines 302-665 retain the entire 0.3.40 implementation, shown as
+    #  deletions above, verbatim as commented-out code: the old imports, class
+    #  docstring, process/process_date/generate_update_plan_with_conditions
+    #  methods, and the commented S3 usage example.)