sibi-dst 0.3.58__py3-none-any.whl → 0.3.59__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +211 -233
- sibi_dst/df_helper/_df_helper.py +7 -3
- sibi_dst/df_helper/_parquet_artifact.py +143 -52
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +3 -3
- sibi_dst/utils/__init__.py +3 -2
- sibi_dst/utils/data_wrapper.py +149 -140
- sibi_dst/utils/date_utils.py +8 -8
- sibi_dst/utils/log_utils.py +1 -1
- sibi_dst/utils/manifest_manager.py +154 -0
- sibi_dst/utils/update_planner.py +96 -85
- {sibi_dst-0.3.58.dist-info → sibi_dst-0.3.59.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.58.dist-info → sibi_dst-0.3.59.dist-info}/RECORD +13 -12
- {sibi_dst-0.3.58.dist-info → sibi_dst-0.3.59.dist-info}/WHEEL +0 -0
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -1,17 +1,16 @@
 import datetime
 import logging
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Type, Any, Dict, Optional, Union, List, Tuple
 import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Type, Any, Dict, Optional, Union, List
+
 import fsspec
 import pandas as pd
-from IPython.display import display
 from tqdm import tqdm

 from .log_utils import Logger
-from .date_utils import FileAgeChecker
 from .parquet_saver import ParquetSaver
-from .update_planner import UpdatePlanner


 class DataWrapper:
@@ -25,77 +24,80 @@ class DataWrapper:
     DEFAULT_MAX_AGE_MINUTES = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD = 30

-    def __init__(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def __init__(
+            self,
+            dataclass: Type,
+            date_field: str,
+            data_path: str,
+            parquet_filename: str,
+            #start_date: Any,
+            #end_date: Any,
+            fs: Optional[fsspec.AbstractFileSystem] = None,
+            #filesystem_type: str = "file",
+            #filesystem_options: Optional[Dict] = None,
+            debug: bool = False,
+            verbose: bool = False,
+            class_params: Optional[Dict] = None,
+            load_params: Optional[Dict] = None,
+            #reverse_order: bool = False,
+            #overwrite: bool = False,
+            #ignore_missing: bool = False,
+            logger: Logger = None,
+            #max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
+            #history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
+            show_progress: bool = False,
+            timeout: float = 60,
+            #reference_date: datetime.date = None,
+            #custom_priority_map: Dict[str, int] = None,
+            max_threads: int = 3,
+            **kwargs: Any,
+    ):
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self._ensure_forward_slash(data_path)
         self.parquet_filename = parquet_filename
-        self.filesystem_type = filesystem_type
-        self.filesystem_options = filesystem_options or {}
-        self.fs = fs or
+        #self.filesystem_type = filesystem_type
+        #self.filesystem_options = filesystem_options or {}
+        self.fs = fs or None
         self.debug = debug
         self.verbose = verbose
-        self.
-        self.
-        self.
-        self.overwrite = overwrite
-        self.ignore_missing = ignore_missing
+        # self.reverse_order = reverse_order
+        # self.overwrite = overwrite
+        # self.ignore_missing = ignore_missing
         self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
         self.logger.set_level(logging.DEBUG if debug else logging.INFO)
-        self.max_age_minutes = max_age_minutes
-        self.history_days_threshold = history_days_threshold
+        # self.max_age_minutes = max_age_minutes
+        # self.history_days_threshold = history_days_threshold
         self.show_progress = show_progress
        self.timeout = timeout
-        self.reference_date = reference_date or datetime.date.today()
-        self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
+        #self.reference_date = reference_date or datetime.date.today()
+        #self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
         self.max_threads = max_threads
-
-
-
-
-
-        self.age_checker = FileAgeChecker(logger=self.logger)
-
-        self.update_planner_params = {
-            "data_path": self.data_path,
-            "filename": self.parquet_filename,
-            "fs": self.fs,
-            "debug": self.debug,
-            "logger": self.logger,
-            "reverse_order": self.reverse_order,
-            "overwrite": self.overwrite,
-            "ignore_missing": self.ignore_missing,
-            "history_days_threshold": history_days_threshold,
-            "max_age_minutes": max_age_minutes,
-            "show_progress": self.show_progress,
-            "description": f"{self.dataclass.__name__}"
+        self.class_params = class_params or {
+            'debug': self.debug,
+            'logger': self.logger,
+            'fs': self.fs,
+            'verbose': self.verbose,
         }
-        self.
+        self.load_params = load_params or {}

+        self._lock = threading.Lock()
+        self.processed_dates: List[datetime.date] = []
+        self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
+        self.mmanifest = kwargs.get("mmanifest", None)
+        self.update_planner=kwargs.get("update_planner", None)
+
+    def __enter__(self):
+        """Context manager entry"""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit"""
+        if self.mmanifest and self.mmanifest._new_records:
+            self.mmanifest.save()
+        if exc_type is not None:
+            self.logger.error(f"Exception occurred: {exc_val}")
+        return False

     def _init_filesystem(self) -> fsspec.AbstractFileSystem:
         with self._lock:
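The constructor above no longer builds its own planner, age checker or filesystem options; the update planner and the missing-files manifest are now injected through **kwargs, and the class doubles as a context manager that flushes the manifest on exit. A minimal usage sketch under those assumptions (MyDataset, planner and manifest are illustrative names, not identifiers defined in this diff):

import fsspec

fs = fsspec.filesystem("file")
wrapper = DataWrapper(
    dataclass=MyDataset,            # illustrative: must support `with MyDataset(**class_params) as data:` and data.load_period(...)
    date_field="created_at",
    data_path="/data/mydataset/",
    parquet_filename="mydataset.parquet",
    fs=fs,
    show_progress=True,
    mmanifest=manifest,             # MissingManifestManager instance, picked up via **kwargs
    update_planner=planner,         # UpdatePlanner instance, picked up via **kwargs
)
with wrapper:                       # __exit__ saves the manifest when new records were collected
    wrapper.process(max_retries=3)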
@@ -114,100 +116,86 @@ class DataWrapper:
     def _ensure_forward_slash(path: str) -> str:
         return path.rstrip('/') + '/'

-    def generate_date_range(self) -> List[datetime.date]:
-        """Generate ordered date range with future date handling"""
-        date_range = pd.date_range(
-            start=self.start_date,
-            end=self.end_date,
-            freq='D'
-        ).date.tolist()
-
-        if self.reverse_order:
-            date_range.reverse()
-
-        return [
-            d for d in date_range
-            if d <= self.reference_date or self.overwrite
-        ]
-
     def process(self, max_retries: int = 3):
-        """Process updates with priority-based execution and
-
-
-
+        """Process updates with priority-based execution, retries, benchmarking and progress updates"""
+        overall_start = time.perf_counter()
+        plan = self.update_planner.plan
+        # Use len(plan.index) instead of plan.empty for Dask compatibility
+        plan_count = len(plan.index)
+        if plan_count == 0:
             self.logger.info("No updates required")
             return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self.logger.info(f"Update plan for {self.dataclass.__name__} includes {plan_count} items for update")
+
+        if self.verbose:
+            self.update_planner.show_update_plan()
+
+        for priority in sorted(plan["update_priority"].unique()):
+            self._process_priority_group(plan, priority, max_retries)
+
+        total_time = time.perf_counter() - overall_start
+        processed = len(self.processed_dates)
+        if processed:
+            self.logger.info(
+                f"Processed {processed} dates in {total_time:.1f}s "
+                f"(avg {total_time / processed:.1f}s per date)"
+            )
+        if self.show_progress or self.verbose:
+            self.show_benchmark_summary()
+
+    def _process_priority_group(
+            self,
+            plan: pd.DataFrame,
+            priority: int,
+            max_retries: int
+    ):
+        """Process a single priority group with parallel execution and timing"""
+        dates = plan[plan["update_priority"] == priority]["date"].tolist()
         if not dates:
             return
-
-        desc = f"Processing {self.dataclass.__name__}, task: {self._priority_label(priority)}"
+        desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
         self.logger.debug(f"Starting {desc.lower()}")
-
-
-
-
-
-
-        }
-
-            for future in tqdm(as_completed(futures),
-                               total=len(futures),
-                               desc=desc,
-                               disable=not self.show_progress):
+        group_start = time.perf_counter()
+        max_thr = min(len(dates), self.max_threads)
+        self.logger.debug(f"Max threads for priority {priority}: {max_thr}")
+        with ThreadPoolExecutor(max_workers=max_thr) as executor:
+            futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
+            for future in tqdm(as_completed(futures), total=len(futures), desc=desc, disable=not self.show_progress):
                 date = futures[future]
                 try:
                     future.result(timeout=self.timeout)
                 except Exception as e:
-                    self.logger.error(f"Permanent failure processing {date}: {
-
-
-        """Get human-readable label for priority level"""
-        return next(
-            (k for k, v in self.priority_map.items() if v == priority),
-            f"Unknown Priority {priority}"
-        )
+                    self.logger.error(f"Permanent failure processing {date}: {e}")
+        group_time = time.perf_counter() - group_start
+        self.logger.info(f"Priority {priority} group processed {len(dates)} dates in {group_time:.1f}s")

     def _process_date_with_retry(self, date: datetime.date, max_retries: int):
-        """Process a date with retry logic"""
         for attempt in range(1, max_retries + 1):
             try:
                 self._process_single_date(date)
                 return
             except Exception as e:
                 if attempt < max_retries:
-                    self.logger.warning(f"Retry {attempt}/{max_retries} for {date}: {
+                    self.logger.warning(f"Retry {attempt}/{max_retries} for {date}: {e}")
                 else:
                     raise RuntimeError(f"Failed processing {date} after {max_retries} attempts") from e

     def _process_single_date(self, date: datetime.date):
-        """Core date processing logic"""
+        """Core date processing logic with load/save timing and thread reporting"""
         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
+        self.logger.info(f"Processing date {date.isoformat()} for {path}")
+        # self.logger.info(f"Path {path} in {self.skipped}: {path in self.skipped}")
+        #if path in self.skipped:
+        #    self.logger.info(f"Skipping {date} as it exists in the skipped list")
+        #    return
         full_path = f"{path}{self.parquet_filename}"

-
-
+        thread_name = threading.current_thread().name
+        self.logger.info(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")

+        overall_start = time.perf_counter()
         try:
-
-            self.logger.debug(f"Load Params: {self.load_params}")
-
-            df = pd.DataFrame()
+            load_start = time.perf_counter()
             with self.dataclass(**self.class_params) as data:
                 df = data.load_period(
                     dt_field=self.date_field,
@@ -215,11 +203,22 @@ class DataWrapper:
                     end=date,
                     **self.load_params
                 )
-
-
-
+            load_time = time.perf_counter() - load_start
+
+            if df.head(1, compute=True).empty:
+                if self.mmanifest:
+                    schema = df._meta.dtypes.astype(str).to_dict()
+                    self.mmanifest.record(
+                        full_path=path
+                    )
+                self.logger.info(f"No data found for {date}. Logged to missing manifest.")
                 return
+            # Dask-compatible empty check
+            # if len(df.index) == 0:
+            #     self.logger.warning(f"No data found for {date}")
+            #     return

+            save_start = time.perf_counter()
             with self._lock:
                 ParquetSaver(
                     df_result=df,
@@ -227,23 +226,33 @@ class DataWrapper:
                     fs=self.fs,
                     logger=self.logger
                 ).save_to_parquet(self.parquet_filename)
+            save_time = time.perf_counter() - save_start

-
-            self.
-
+            total_time = time.perf_counter() - overall_start
+            self.benchmarks[date] = {
+                "load_duration": load_time,
+                "save_duration": save_time,
+                "total_duration": total_time
+            }
+            self._log_success(date, total_time, full_path)
         except Exception as e:
             self._log_failure(date, e)
             raise

     def _log_success(self, date: datetime.date, duration: float, path: str):
-        """Handle successful processing logging"""
         msg = f"Completed {date} in {duration:.1f}s | Saved to {path}"
         self.logger.info(msg)
         self.processed_dates.append(date)

     def _log_failure(self, date: datetime.date, error: Exception):
-        "
-        msg = f"Failed processing {date}: {str(error)}"
+        msg = f"Failed processing {date}: {error}"
         self.logger.error(msg)

-
+    def show_benchmark_summary(self):
+        """Display a summary of load/save timings per date"""
+        if not self.benchmarks:
+            self.logger.info("No benchmarking data to show")
+            return
+        df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
+        df_bench = df_bench.set_index("date").sort_index(ascending=not self.reverse_order)
+        self.logger.info("Benchmark Summary:\n" + df_bench.to_string())
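Two behavioural details in the hunks above are worth calling out: emptiness is now tested with df.head(1, compute=True).empty, which works for Dask as well as pandas-style frames, and every processed date records load/save/total durations in self.benchmarks for the new show_benchmark_summary(). A small standalone sketch of the emptiness test, assuming load_period() returns a Dask DataFrame:

import dask.dataframe as dd
import pandas as pd

# Empty frame standing in for a partition that yielded no rows.
ddf = dd.from_pandas(pd.DataFrame({"path": pd.Series(dtype=str)}), npartitions=1)

# head(1, compute=True) materialises at most one row, so the check stays cheap
# even when computing the full frame would be expensive.
if ddf.head(1, compute=True).empty:
    print("no rows for this date")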
sibi_dst/utils/date_utils.py
CHANGED
@@ -149,9 +149,9 @@ class DateUtils:


 class FileAgeChecker:
-    def __init__(self, logger=None):
+    def __init__(self, debug=False, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-
+        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
     def is_file_older_than(
         self,
         file_path: str,
@@ -171,15 +171,15 @@ class FileAgeChecker:
         :return: True if older than max_age_minutes, False otherwise.
         """
         fs = fs or fsspec.filesystem("file")
-        self.logger.
+        self.logger.debug(f"Checking age for {file_path}...")

         try:
             if not fs.exists(file_path):
-                self.logger.
+                self.logger.debug(f"Path not found: {file_path}.")
                 return not ignore_missing

             if fs.isdir(file_path):
-                self.logger.
+                self.logger.debug(f"Found directory: {file_path}")
                 age = self._get_directory_age_minutes(file_path, fs, verbose)
             elif fs.isfile(file_path):
                 age = self._get_file_age_minutes(file_path, fs, verbose)
@@ -208,7 +208,7 @@ class FileAgeChecker:
         fs = fs or fsspec.filesystem("file")
         try:
             if not fs.exists(file_path):
-                self.logger.
+                self.logger.debug(f"Path not found: {file_path}")
                 return float("inf")

             if fs.isdir(file_path):
@@ -237,7 +237,7 @@ class FileAgeChecker:
             return float("inf")

         if not all_files:
-            self.logger.
+            self.logger.debug(f"Empty directory: {dir_path}")
             return float("inf")

         modification_times = []
@@ -255,7 +255,7 @@ class FileAgeChecker:

         oldest = min(modification_times)
         age = (datetime.datetime.now(datetime.timezone.utc) - oldest).total_seconds() / 60
-        self.logger.
+        self.logger.debug(f"Oldest in {dir_path}: {age:.2f} minutes")

         return age

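FileAgeChecker now takes a debug flag and routes its per-path messages through logger.debug, so age checks stay quiet unless debugging is enabled. A hypothetical call, assuming the keyword arguments beyond file_path follow the docstring above (max_age_minutes is mentioned there; the exact signature is not shown in this diff):

from sibi_dst.utils.date_utils import FileAgeChecker

checker = FileAgeChecker(debug=True)
# True when the file or directory is older than max_age_minutes;
# the keyword name here is an assumption based on the docstring.
is_stale = checker.is_file_older_than(
    "/data/mydataset/2024/01/01/mydataset.parquet",   # illustrative path
    max_age_minutes=1440,
)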
sibi_dst/utils/log_utils.py
CHANGED
@@ -77,7 +77,7 @@ class Logger:
         formatter.converter = time.localtime  # << Set local time explicitly

         # Create a file handler
-        file_handler = logging.FileHandler(log_file_path)
+        file_handler = logging.FileHandler(log_file_path, delay=True)
         file_handler.setFormatter(formatter)
         self.logger.addHandler(file_handler)

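The only change here is delay=True, which is standard logging.FileHandler behaviour: the target file is not opened (or created) until the first record is actually emitted, so loggers that never write no longer leave empty log files behind. A stdlib-only illustration:

import logging
import os
import tempfile

log_path = os.path.join(tempfile.mkdtemp(), "example.log")
handler = logging.FileHandler(log_path, delay=True)   # file not created yet

logger = logging.getLogger("delay_demo")
logger.addHandler(handler)

print(os.path.exists(log_path))    # False: opening is deferred
logger.warning("first record")     # the handler opens the file here
print(os.path.exists(log_path))    # True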
sibi_dst/utils/manifest_manager.py
ADDED
@@ -0,0 +1,154 @@
+import pandas as pd
+import fsspec
+import threading
+import uuid
+from typing import List, Optional, Set, Dict, Any
+
+from sibi_dst.utils import Logger
+
+
+class MissingManifestManager:
+    """
+    Thread-safe manager for a “missing-partitions” manifest (Parquet file).
+    """
+
+    def __init__(
+            self,
+            fs: fsspec.AbstractFileSystem,
+            manifest_path: str,
+            clear_existing: bool = False,
+            **kwargs: Any,
+    ):
+        self.fs = fs
+        self.manifest_path = manifest_path.rstrip("/")
+        self.clear_existing = clear_existing
+
+        self.debug: bool = kwargs.get("debug", False)
+        self.logger = kwargs.get(
+            "logger",
+            Logger.default_logger(logger_name="missing_manifest_manager")
+        )
+        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+
+        # In-memory list for new paths
+        self._new_records: List[Dict[str, str]] = []
+        # Cached set of existing paths
+        self._loaded_paths: Optional[Set[str]] = None
+
+        # Use a reentrant lock so save() can call load_existing() safely
+        self._lock = threading.RLock()
+
+    def _safe_exists(self, path: str) -> bool:
+        try:
+            return self.fs.exists(path)
+        except PermissionError:
+            if self.debug:
+                self.logger.debug(f"Permission denied checking existence of '{path}'")
+            return False
+        except Exception as e:
+            self.logger.warning(f"Error checking existence of '{path}': {e}")
+            return False
+
+    def load_existing(self) -> Set[str]:
+        """
+        Load and cache existing manifest paths.
+        """
+        with self._lock:
+            if self._loaded_paths is not None:
+                return self._loaded_paths
+
+            if not self._safe_exists(self.manifest_path):
+                self._loaded_paths = set()
+                return self._loaded_paths
+
+            try:
+                df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
+                paths = (
+                    df.get("path", pd.Series(dtype=str))
+                    .dropna().astype(str)
+                    .loc[lambda s: s.str.strip().astype(bool)]
+                )
+                self._loaded_paths = set(paths.tolist())
+            except Exception as e:
+                self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
+                self._loaded_paths = set()
+
+            return self._loaded_paths
+
+    def record(self, full_path: str) -> None:
+        """
+        Register a missing file path.
+        """
+        if not full_path or not isinstance(full_path, str):
+            return
+        with self._lock:
+            self._new_records.append({"path": full_path})
+
+    def save(self) -> None:
+        """
+        Merge new records into the manifest and write it out atomically.
+        """
+        with self._lock:
+            # Build DataFrame of new entries
+            new_df = pd.DataFrame(self._new_records)
+            should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
+            if new_df.empty and not should_overwrite:
+                return
+
+            # Clean new_df
+            new_df = (
+                new_df.get("path", pd.Series(dtype=str))
+                .dropna().astype(str)
+                .loc[lambda s: s.str.strip().astype(bool)]
+                .to_frame()
+            )
+
+            # Merge or overwrite
+            if should_overwrite:
+                out_df = new_df
+            else:
+                try:
+                    old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
+                    old_paths = (
+                        old_df.get("path", pd.Series(dtype=str))
+                        .dropna().astype(str)
+                        .loc[lambda s: s.str.strip().astype(bool)]
+                        .to_frame()
+                    )
+                    out_df = pd.concat([old_paths, new_df], ignore_index=True)
+                except Exception as e:
+                    self.logger.warning(f"Could not merge manifest, overwriting: {e}")
+                    out_df = new_df
+
+            out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
+
+            # Ensure parent dir
+            parent = self.manifest_path.rsplit("/", 1)[0]
+            try:
+                self.fs.makedirs(parent, exist_ok=True)
+            except Exception as e:
+                self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
+
+            # Write atomically: temp file + rename
+            temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
+            try:
+                out_df.to_parquet(
+                    temp_path,
+                    filesystem=self.fs,
+                    index=False
+                )
+                # rename into place (atomic in most filesystems)
+                self.fs.mv(temp_path, self.manifest_path, recursive=False)
+            except Exception as e:
+                self.logger.error(f"Failed to write or rename manifest: {e}")
+                # Clean up temp if it exists
+                try:
+                    if self.fs.exists(temp_path):
+                        self.fs.rm(temp_path, recursive=True)
+                except Exception:
+                    pass
+                raise
+
+            # Reset memory & cache
+            self._new_records.clear()
+            self._loaded_paths = set(out_df["path"].tolist())