sibi-dst 0.3.38__py3-none-any.whl → 0.3.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -98,7 +98,7 @@ class DataWrapper:
|
|
98
98
|
self.parquet_filename = parquet_filename
|
99
99
|
self.filesystem_type = filesystem_type
|
100
100
|
self.filesystem_options = filesystem_options or {}
|
101
|
-
self.fs = fs
|
101
|
+
self.fs = fs
|
102
102
|
self.verbose = verbose
|
103
103
|
self.class_params = class_params or {}
|
104
104
|
self.load_params = load_params or {}
|
@@ -116,6 +116,10 @@ class DataWrapper:
|
|
116
116
|
self._lock = Lock()
|
117
117
|
self.processed_dates = []
|
118
118
|
self.date_utils = DateUtils(logger=self.logger)
|
119
|
+
if self.fs is None:
|
120
|
+
with self._lock:
|
121
|
+
if self.fs is None:
|
122
|
+
self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
|
119
123
|
|
120
124
|
@staticmethod
|
121
125
|
def convert_to_date(date: Union[datetime.date, str]) -> datetime.date:
|
@@ -172,21 +176,20 @@ class DataWrapper:
|
|
172
176
|
# Each thread will handle all dates associated with that priority.
|
173
177
|
def process_priority(priority):
|
174
178
|
# Extract dates for the current priority
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
]["date"].tolist()
|
179
|
+
dates_to_process = update_plan_table[
|
180
|
+
update_plan_table["update_priority"] == priority
|
181
|
+
]["date"].tolist()
|
179
182
|
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
183
|
+
# If show_progress is True, wrap in a progress bar
|
184
|
+
date_iterator = dates_to_process
|
185
|
+
if self.show_progress:
|
186
|
+
date_iterator = tqdm(date_iterator,
|
187
|
+
desc=f"Processing priority {priority}:{self.dataclass.__name__}",
|
188
|
+
unit="date")
|
186
189
|
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
+
# Process each date for this priority
|
191
|
+
for current_date in date_iterator:
|
192
|
+
self.process_date(current_date)
|
190
193
|
|
191
194
|
# Launch a separate thread for each priority
|
192
195
|
with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
|
@@ -232,21 +235,21 @@ class DataWrapper:
|
|
232
235
|
:type date: datetime.date
|
233
236
|
:return: None
|
234
237
|
"""
|
235
|
-
|
236
|
-
|
237
|
-
full_parquet_filename = f"{folder}{self.parquet_filename}"
|
238
|
+
folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
|
239
|
+
full_parquet_filename = f"{folder}{self.parquet_filename}"
|
238
240
|
|
239
|
-
|
240
|
-
|
241
|
-
|
241
|
+
start_time = datetime.datetime.now()
|
242
|
+
self.logger.info(f"Processing date: {date}")
|
243
|
+
self.logger.info(f"Processing {full_parquet_filename}...")
|
242
244
|
|
243
|
-
|
244
|
-
|
245
|
+
data_object = self.dataclass(**self.class_params)
|
246
|
+
df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
|
245
247
|
|
246
|
-
|
247
|
-
|
248
|
-
|
248
|
+
if len(df.index) == 0:
|
249
|
+
self.logger.error("No data found for the specified date.")
|
250
|
+
return
|
249
251
|
|
252
|
+
with self._lock:
|
250
253
|
parquet_saver = ParquetSaver(df, parquet_storage_path=folder, logger=self.logger, fs=self.fs)
|
251
254
|
parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
|
252
255
|
|
@@ -255,8 +258,9 @@ class DataWrapper:
|
|
255
258
|
self.logger.info(
|
256
259
|
f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
|
257
260
|
)
|
261
|
+
|
258
262
|
self.processed_dates.append(date)
|
259
|
-
|
263
|
+
self.logger.info(f"Finished processing date: {date}")
|
260
264
|
|
261
265
|
def generate_update_plan_with_conditions(self):
|
262
266
|
"""
|
@@ -301,18 +305,23 @@ class DataWrapper:
|
|
301
305
|
update_required = True
|
302
306
|
# Hierarchy 2: History threshold evaluation
|
303
307
|
elif within_history:
|
304
|
-
if
|
308
|
+
if file_exists:
|
309
|
+
if self.date_utils.is_file_older_than(
|
305
310
|
full_parquet_filename,
|
306
311
|
max_age_minutes=self.max_age_minutes,
|
307
312
|
fs=self.fs,
|
308
313
|
ignore_missing=self.ignore_missing,
|
309
314
|
verbose=self.verbose
|
310
|
-
|
311
|
-
|
312
|
-
|
315
|
+
):
|
316
|
+
category = "history_days"
|
317
|
+
update_required = True
|
318
|
+
else:
|
319
|
+
category = "file is recent"
|
320
|
+
update_required = False
|
313
321
|
else:
|
314
|
-
category = "
|
315
|
-
update_required =
|
322
|
+
category = "missing_files"
|
323
|
+
update_required = True
|
324
|
+
|
316
325
|
# Hierarchy 3: Missing files
|
317
326
|
elif missing_file and current_date <= today:
|
318
327
|
category = "missing_files"
|
@@ -334,12 +343,6 @@ class DataWrapper:
|
|
334
343
|
}
|
335
344
|
rows.append(row)
|
336
345
|
|
337
|
-
|
338
|
-
for row in rows:
|
339
|
-
category = row.get("update_category")
|
340
|
-
# Default to None if no category assigned (no update required)
|
341
|
-
row["update_priority"] = priority_map.get(category, 0)
|
342
|
-
|
343
346
|
update_plan_table = pd.DataFrame(rows)
|
344
347
|
return update_plan_table
|
345
348
|
|
@@ -42,7 +42,7 @@ sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWd
|
|
42
42
|
sibi_dst/utils/clickhouse_writer.py,sha256=syXGN9NG1FS8soHuMj6QNRqTRWi-thuYUF-_BWDc_KI,9883
|
43
43
|
sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
|
44
44
|
sibi_dst/utils/data_utils.py,sha256=j-lEKt6EJL2fm0z7adcjtVG7yFYLRpQL8xSgh2CVmJg,8769
|
45
|
-
sibi_dst/utils/data_wrapper.py,sha256=
|
45
|
+
sibi_dst/utils/data_wrapper.py,sha256=SMmr4hwnUvCrN6nouI8N7aRiczuatjSMdYzNOBcDnr8,16283
|
46
46
|
sibi_dst/utils/date_utils.py,sha256=UppOs1vfm41Si9JITAM7Qn9qqOi9yb6ukYz1E2mnA1I,19214
|
47
47
|
sibi_dst/utils/df_utils.py,sha256=OFEtcwVKIilvf9qVf-IfIOHp4jcFAHX5l2IDGudhPZg,10989
|
48
48
|
sibi_dst/utils/file_utils.py,sha256=JpsybYj3XvVJisSBeVU6YSaZnYRm4_6YWTI3TLnnY4Y,1257
|
@@ -50,6 +50,6 @@ sibi_dst/utils/filepath_generator.py,sha256=volVm0SSlBrtZp1RpTHxyui5rj5asNcVsWEB
|
|
50
50
|
sibi_dst/utils/log_utils.py,sha256=XUbeXa1JsOlcEJyW8jnBlWo295rLUnuYi-HMzyhHwJg,3145
|
51
51
|
sibi_dst/utils/parquet_saver.py,sha256=_QkXL3IiC2b4m7sxHpCSeqPwBWxXeiP5sH_WheSMEm4,8042
|
52
52
|
sibi_dst/utils/storage_manager.py,sha256=-zlMrRo_6o6mCd_OHknKqNQl7m0I9VW89grDAUO1V5c,4229
|
53
|
-
sibi_dst-0.3.
|
54
|
-
sibi_dst-0.3.
|
55
|
-
sibi_dst-0.3.
|
53
|
+
sibi_dst-0.3.39.dist-info/METADATA,sha256=SZJ04u1MedYe00Z3h_YexNMEzB2Nfd9BFyTdDXVX6ZM,2564
|
54
|
+
sibi_dst-0.3.39.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
55
|
+
sibi_dst-0.3.39.dist-info/RECORD,,
|
File without changes
|