sibi-dst 2025.1.3__py3-none-any.whl → 2025.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +4 -1
- sibi_dst/df_helper/__init__.py +2 -2
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +355 -163
- sibi_dst/df_helper/_df_helper.py +47 -30
- sibi_dst/df_helper/_parquet_artifact.py +57 -47
- sibi_dst/df_helper/_parquet_reader.py +9 -13
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +15 -11
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +23 -16
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -11
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +1 -103
- sibi_dst/utils/__init__.py +3 -2
- sibi_dst/utils/base.py +97 -0
- sibi_dst/utils/clickhouse_writer.py +5 -4
- sibi_dst/utils/data_wrapper.py +69 -84
- sibi_dst/utils/date_utils.py +2 -1
- sibi_dst/utils/log_utils.py +309 -77
- sibi_dst/utils/manifest_manager.py +96 -375
- sibi_dst/utils/parquet_saver.py +98 -173
- sibi_dst/utils/storage_config.py +6 -0
- sibi_dst/utils/storage_manager.py +2 -1
- sibi_dst/utils/update_planner.py +72 -22
- {sibi_dst-2025.1.3.dist-info → sibi_dst-2025.1.5.dist-info}/METADATA +3 -1
- {sibi_dst-2025.1.3.dist-info → sibi_dst-2025.1.5.dist-info}/RECORD +24 -27
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +0 -91
- {sibi_dst-2025.1.3.dist-info → sibi_dst-2025.1.5.dist-info}/WHEEL +0 -0
@@ -2,17 +2,26 @@ import pandas as pd
|
|
2
2
|
import fsspec
|
3
3
|
import threading
|
4
4
|
import uuid
|
5
|
-
import hashlib
|
6
|
-
import base64
|
7
|
-
import json
|
8
5
|
from typing import List, Optional, Set, Dict, Any
|
9
|
-
|
6
|
+
import json, base64, hashlib
|
10
7
|
from sibi_dst.utils import Logger
|
11
8
|
|
12
9
|
|
13
10
|
class MissingManifestManager:
|
14
11
|
"""
|
15
|
-
|
12
|
+
A thread-safe manager for a Parquet file manifest.
|
13
|
+
|
14
|
+
This class handles creating, reading, and appending to a Parquet manifest file
|
15
|
+
that tracks a list of paths. It is designed to be resilient, using atomic
|
16
|
+
file operations to prevent data corruption during writes, and can clean up
|
17
|
+
orphaned temporary files from previous runs.
|
18
|
+
|
19
|
+
Attributes:
|
20
|
+
fs (fsspec.AbstractFileSystem): The filesystem object to interact with.
|
21
|
+
manifest_path (str): The full path to the manifest file.
|
22
|
+
clear_existing (bool): If True, any existing manifest will be overwritten
|
23
|
+
on the first save operation of this instance's lifecycle.
|
24
|
+
logger (Logger): A logger instance for logging messages.
|
16
25
|
"""
|
17
26
|
|
18
27
|
def __init__(
|
@@ -22,12 +31,12 @@ class MissingManifestManager:
|
|
22
31
|
clear_existing: bool = False,
|
23
32
|
**kwargs: Any,
|
24
33
|
):
|
25
|
-
self.fs = fs
|
26
|
-
self.manifest_path = manifest_path.rstrip("/")
|
27
|
-
self.clear_existing = clear_existing
|
34
|
+
self.fs: fsspec.AbstractFileSystem = fs
|
35
|
+
self.manifest_path: str = manifest_path.rstrip("/")
|
36
|
+
self.clear_existing: bool = clear_existing
|
28
37
|
|
29
38
|
self.debug: bool = kwargs.get("debug", False)
|
30
|
-
self.logger = kwargs.get(
|
39
|
+
self.logger: Logger = kwargs.get(
|
31
40
|
"logger",
|
32
41
|
Logger.default_logger(logger_name="missing_manifest_manager")
|
33
42
|
)
|
@@ -35,9 +44,13 @@ class MissingManifestManager:
|
|
35
44
|
|
36
45
|
self._new_records: List[Dict[str, str]] = []
|
37
46
|
self._loaded_paths: Optional[Set[str]] = None
|
38
|
-
self._lock = threading.
|
47
|
+
self._lock = threading.Lock() # A standard Lock is sufficient
|
48
|
+
|
49
|
+
# Clean up any orphaned temp files from previous failed runs
|
50
|
+
self._cleanup_orphaned_files()
|
39
51
|
|
40
52
|
def _safe_exists(self, path: str) -> bool:
|
53
|
+
"""Safely check if a path exists, handling potential exceptions."""
|
41
54
|
try:
|
42
55
|
return self.fs.exists(path)
|
43
56
|
except Exception as e:
|
@@ -45,6 +58,15 @@ class MissingManifestManager:
|
|
45
58
|
return False
|
46
59
|
|
47
60
|
def load_existing(self) -> Set[str]:
|
61
|
+
"""
|
62
|
+
Loads the set of paths from the existing manifest file.
|
63
|
+
|
64
|
+
The result is cached in memory. If the manifest does not exist or fails
|
65
|
+
to load, an empty set is returned. This operation is thread-safe.
|
66
|
+
|
67
|
+
Returns:
|
68
|
+
A set of strings, where each string is a path from the manifest.
|
69
|
+
"""
|
48
70
|
with self._lock:
|
49
71
|
if self._loaded_paths is not None:
|
50
72
|
return self._loaded_paths
|
@@ -55,6 +77,7 @@ class MissingManifestManager:
|
|
55
77
|
|
56
78
|
try:
|
57
79
|
df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
|
80
|
+
# Robustly extract non-empty, non-null paths
|
58
81
|
paths = (
|
59
82
|
df.get("path", pd.Series(dtype=str))
|
60
83
|
.dropna().astype(str)
|
@@ -62,104 +85,117 @@ class MissingManifestManager:
|
|
62
85
|
)
|
63
86
|
self._loaded_paths = set(paths.tolist())
|
64
87
|
except Exception as e:
|
65
|
-
self.logger.warning(
|
88
|
+
self.logger.warning(
|
89
|
+
f"Failed to load manifest '{self.manifest_path}', "
|
90
|
+
f"treating as empty. Error: {e}"
|
91
|
+
)
|
66
92
|
self._loaded_paths = set()
|
67
93
|
|
68
94
|
return self._loaded_paths
|
69
95
|
|
70
96
|
def record(self, full_path: str) -> None:
|
97
|
+
"""
|
98
|
+
Records a new path to be added to the manifest upon the next save.
|
99
|
+
|
100
|
+
Args:
|
101
|
+
full_path: The path to record.
|
102
|
+
"""
|
71
103
|
if not full_path or not isinstance(full_path, str):
|
72
104
|
return
|
73
105
|
with self._lock:
|
74
106
|
self._new_records.append({"path": full_path})
|
75
107
|
|
76
108
|
def save(self) -> None:
|
109
|
+
"""
|
110
|
+
Saves all new records to the manifest file.
|
111
|
+
|
112
|
+
This method merges new records with existing ones (unless `clear_existing`
|
113
|
+
is True), removes duplicates, and writes the result back to the manifest.
|
114
|
+
The write operation is performed atomically by writing to a temporary file
|
115
|
+
first, then renaming or copying it to the final destination.
|
116
|
+
"""
|
77
117
|
with self._lock:
|
78
|
-
|
79
|
-
|
80
|
-
if new_df.empty and not should_overwrite:
|
118
|
+
if not self._new_records and not self.clear_existing:
|
119
|
+
self.logger.debug("Manifest Manager: No new records to save.")
|
81
120
|
return
|
82
121
|
|
122
|
+
new_df = pd.DataFrame(self._new_records)
|
83
123
|
new_df = (
|
84
124
|
new_df.get("path", pd.Series(dtype=str))
|
85
125
|
.dropna().astype(str)
|
86
126
|
.loc[lambda s: s.str.strip().astype(bool)]
|
87
|
-
.to_frame()
|
127
|
+
.to_frame(name="path")
|
88
128
|
)
|
89
129
|
|
130
|
+
# Determine the final DataFrame to be written
|
131
|
+
should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
|
90
132
|
if should_overwrite:
|
91
133
|
out_df = new_df
|
92
134
|
else:
|
93
135
|
try:
|
94
136
|
old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
|
95
|
-
|
96
|
-
old_df.get("path", pd.Series(dtype=str))
|
97
|
-
.dropna().astype(str)
|
98
|
-
.loc[lambda s: s.str.strip().astype(bool)]
|
99
|
-
.to_frame()
|
100
|
-
)
|
101
|
-
out_df = pd.concat([old_paths, new_df], ignore_index=True)
|
137
|
+
out_df = pd.concat([old_df, new_df], ignore_index=True)
|
102
138
|
except Exception as e:
|
103
|
-
self.logger.warning(f"Could not
|
139
|
+
self.logger.warning(f"Could not read existing manifest to merge, overwriting. Error: {e}")
|
104
140
|
out_df = new_df
|
105
141
|
|
106
142
|
out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
|
107
143
|
|
144
|
+
# Ensure parent directory exists
|
108
145
|
parent = self.manifest_path.rsplit("/", 1)[0]
|
109
|
-
|
110
|
-
self.fs.makedirs(parent, exist_ok=True)
|
111
|
-
except Exception as e:
|
112
|
-
self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
|
146
|
+
self.fs.makedirs(parent, exist_ok=True)
|
113
147
|
|
148
|
+
# Perform an atomic write using a temporary file
|
114
149
|
temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
|
115
150
|
try:
|
116
|
-
out_df.to_parquet(
|
117
|
-
temp_path,
|
118
|
-
filesystem=self.fs,
|
119
|
-
index=False
|
120
|
-
)
|
151
|
+
out_df.to_parquet(temp_path, filesystem=self.fs, index=False)
|
121
152
|
self.fs.copy(temp_path, self.manifest_path)
|
153
|
+
self.fs.rm_file(temp_path)
|
122
154
|
self.logger.info(f"Copied manifest to {self.manifest_path} (temp: {temp_path})")
|
123
155
|
except Exception as e:
|
124
|
-
self.logger.error(f"Failed to write or
|
125
|
-
raise
|
126
|
-
|
127
|
-
|
156
|
+
self.logger.error(f"Failed to write or move manifest: {e}")
|
157
|
+
# Re-raise so the caller knows the save operation failed
|
158
|
+
#raise
|
159
|
+
finally:
|
160
|
+
# CRITICAL: Always clean up the temporary file
|
161
|
+
if self._safe_exists(temp_path):
|
162
|
+
try:
|
163
|
+
self._cleanup_orphaned_files()
|
164
|
+
except Exception as e:
|
165
|
+
self.logger.error(f"Failed to remove temporary file '{temp_path}': {e}")
|
166
|
+
|
167
|
+
# Reset internal state
|
128
168
|
self._new_records.clear()
|
129
169
|
self._loaded_paths = set(out_df["path"].tolist())
|
170
|
+
# After the first successful save, disable clear_existing behavior
|
171
|
+
self.clear_existing = False
|
130
172
|
|
131
|
-
def
|
173
|
+
def _cleanup_orphaned_files(self) -> None:
|
174
|
+
"""Finds and removes any orphaned temporary manifest files from prior runs."""
|
175
|
+
self.logger.debug("Checking for orphaned temporary files...")
|
132
176
|
if not hasattr(self.fs, "s3"):
|
133
177
|
self.logger.info("Filesystem is not s3fs; skipping temp cleanup.")
|
134
178
|
return
|
135
|
-
|
136
179
|
try:
|
137
|
-
bucket, prefix = self._parse_s3_path(self.manifest_path.rsplit("/", 1)[0])
|
138
|
-
files = self.fs.ls(f"s3://{bucket}/{prefix}", detail=True)
|
139
|
-
temp_files = [
|
140
|
-
f for f in files
|
141
|
-
if f["name"].endswith(".parquet") and ".tmp-" in f["name"]
|
142
|
-
]
|
143
|
-
if not temp_files:
|
144
|
-
return
|
145
180
|
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
"Quiet": True
|
150
|
-
}
|
181
|
+
# Use glob to find all files matching the temp pattern in a filesystem-agnostic way
|
182
|
+
temp_file_pattern = f"{self.manifest_path}.tmp-*"
|
183
|
+
orphaned_files = self.fs.glob(temp_file_pattern)
|
151
184
|
|
152
|
-
|
153
|
-
|
185
|
+
if not orphaned_files:
|
186
|
+
self.logger.debug("No orphaned files found.")
|
187
|
+
return
|
154
188
|
|
155
|
-
self.
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
189
|
+
self.logger.info(f"Found {orphaned_files} orphaned temp manifest(s). Cleaning up...")
|
190
|
+
for f_path in orphaned_files:
|
191
|
+
try:
|
192
|
+
self.fs.rm_file(f_path)
|
193
|
+
self.logger.info(f"Deleted orphaned file: {f_path}")
|
194
|
+
except Exception as e:
|
195
|
+
self.logger.warning(f"Failed to delete orphaned temp file '{f_path}': {e}")
|
161
196
|
except Exception as e:
|
162
|
-
|
197
|
+
# This is a non-critical operation, so we just log the error
|
198
|
+
self.logger.error(f"An unexpected error occurred during temp file cleanup: {e}")
|
163
199
|
|
164
200
|
@staticmethod
|
165
201
|
def _parse_s3_path(s3_path: str):
|
@@ -169,318 +205,3 @@ class MissingManifestManager:
|
|
169
205
|
bucket_name = path_parts[0]
|
170
206
|
prefix = path_parts[1] if len(path_parts) > 1 else ""
|
171
207
|
return bucket_name, prefix
|
172
|
-
|
173
|
-
# import pandas as pd
|
174
|
-
# import fsspec
|
175
|
-
# import threading
|
176
|
-
# import uuid
|
177
|
-
# from typing import List, Optional, Set, Dict, Any
|
178
|
-
#
|
179
|
-
# from sibi_dst.utils import Logger
|
180
|
-
#
|
181
|
-
#
|
182
|
-
# class MissingManifestManager:
|
183
|
-
# """
|
184
|
-
# Thread-safe manager for a “missing-partitions” manifest (Parquet file).
|
185
|
-
# """
|
186
|
-
#
|
187
|
-
# def __init__(
|
188
|
-
# self,
|
189
|
-
# fs: fsspec.AbstractFileSystem,
|
190
|
-
# manifest_path: str,
|
191
|
-
# clear_existing: bool = False,
|
192
|
-
# **kwargs: Any,
|
193
|
-
# ):
|
194
|
-
# self.fs = fs
|
195
|
-
# self.manifest_path = manifest_path.rstrip("/")
|
196
|
-
# self.clear_existing = clear_existing
|
197
|
-
#
|
198
|
-
# self.debug: bool = kwargs.get("debug", False)
|
199
|
-
# self.logger = kwargs.get(
|
200
|
-
# "logger",
|
201
|
-
# Logger.default_logger(logger_name="missing_manifest_manager")
|
202
|
-
# )
|
203
|
-
# self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
|
204
|
-
#
|
205
|
-
# # In-memory list for new paths
|
206
|
-
# self._new_records: List[Dict[str, str]] = []
|
207
|
-
# # Cached set of existing paths
|
208
|
-
# self._loaded_paths: Optional[Set[str]] = None
|
209
|
-
#
|
210
|
-
# # Use a reentrant lock so save() can call load_existing() safely
|
211
|
-
# self._lock = threading.RLock()
|
212
|
-
#
|
213
|
-
# def _safe_exists(self, path: str) -> bool:
|
214
|
-
# try:
|
215
|
-
# return self.fs.exists(path)
|
216
|
-
# except PermissionError:
|
217
|
-
# if self.debug:
|
218
|
-
# self.logger.debug(f"Permission denied checking existence of '{path}'")
|
219
|
-
# return False
|
220
|
-
# except Exception as e:
|
221
|
-
# self.logger.warning(f"Error checking existence of '{path}': {e}")
|
222
|
-
# return False
|
223
|
-
#
|
224
|
-
# def load_existing(self) -> Set[str]:
|
225
|
-
# """
|
226
|
-
# Load and cache existing manifest paths.
|
227
|
-
# """
|
228
|
-
# with self._lock:
|
229
|
-
# if self._loaded_paths is not None:
|
230
|
-
# return self._loaded_paths
|
231
|
-
#
|
232
|
-
# if not self._safe_exists(self.manifest_path):
|
233
|
-
# self._loaded_paths = set()
|
234
|
-
# return self._loaded_paths
|
235
|
-
#
|
236
|
-
# try:
|
237
|
-
# df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
|
238
|
-
# paths = (
|
239
|
-
# df.get("path", pd.Series(dtype=str))
|
240
|
-
# .dropna().astype(str)
|
241
|
-
# .loc[lambda s: s.str.strip().astype(bool)]
|
242
|
-
# )
|
243
|
-
# self._loaded_paths = set(paths.tolist())
|
244
|
-
# except Exception as e:
|
245
|
-
# self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
|
246
|
-
# self._loaded_paths = set()
|
247
|
-
#
|
248
|
-
# return self._loaded_paths
|
249
|
-
#
|
250
|
-
# def record(self, full_path: str) -> None:
|
251
|
-
# """
|
252
|
-
# Register a missing file path.
|
253
|
-
# """
|
254
|
-
# if not full_path or not isinstance(full_path, str):
|
255
|
-
# return
|
256
|
-
# with self._lock:
|
257
|
-
# self._new_records.append({"path": full_path})
|
258
|
-
#
|
259
|
-
# def save(self) -> None:
|
260
|
-
# """
|
261
|
-
# Merge new records into the manifest and write it out atomically.
|
262
|
-
# """
|
263
|
-
# with self._lock:
|
264
|
-
# # Build DataFrame of new entries
|
265
|
-
# new_df = pd.DataFrame(self._new_records)
|
266
|
-
# should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
|
267
|
-
# if new_df.empty and not should_overwrite:
|
268
|
-
# return
|
269
|
-
#
|
270
|
-
# # Clean new_df
|
271
|
-
# new_df = (
|
272
|
-
# new_df.get("path", pd.Series(dtype=str))
|
273
|
-
# .dropna().astype(str)
|
274
|
-
# .loc[lambda s: s.str.strip().astype(bool)]
|
275
|
-
# .to_frame()
|
276
|
-
# )
|
277
|
-
#
|
278
|
-
# # Merge or overwrite
|
279
|
-
# if should_overwrite:
|
280
|
-
# out_df = new_df
|
281
|
-
# else:
|
282
|
-
# try:
|
283
|
-
# old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
|
284
|
-
# old_paths = (
|
285
|
-
# old_df.get("path", pd.Series(dtype=str))
|
286
|
-
# .dropna().astype(str)
|
287
|
-
# .loc[lambda s: s.str.strip().astype(bool)]
|
288
|
-
# .to_frame()
|
289
|
-
# )
|
290
|
-
# out_df = pd.concat([old_paths, new_df], ignore_index=True)
|
291
|
-
# except Exception as e:
|
292
|
-
# self.logger.warning(f"Could not merge manifest, overwriting: {e}")
|
293
|
-
# out_df = new_df
|
294
|
-
#
|
295
|
-
# out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
|
296
|
-
#
|
297
|
-
# # Ensure parent dir
|
298
|
-
# parent = self.manifest_path.rsplit("/", 1)[0]
|
299
|
-
# try:
|
300
|
-
# self.fs.makedirs(parent, exist_ok=True)
|
301
|
-
# except Exception as e:
|
302
|
-
# self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
|
303
|
-
#
|
304
|
-
# # Write atomically: temp file + rename
|
305
|
-
# temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
|
306
|
-
# try:
|
307
|
-
# out_df.to_parquet(
|
308
|
-
# temp_path,
|
309
|
-
# filesystem=self.fs,
|
310
|
-
# index=False
|
311
|
-
# )
|
312
|
-
# # rename into place (atomic in most filesystems)
|
313
|
-
# #self.fs.mv(temp_path, self.manifest_path, recursive=False)
|
314
|
-
# try:
|
315
|
-
# self.fs.copy(temp_path, self.manifest_path)
|
316
|
-
# self.fs.rm(temp_path)
|
317
|
-
# except Exception as e:
|
318
|
-
# self.logger.error(f"Failed to copy or delete manifest: {e}")
|
319
|
-
# raise
|
320
|
-
# except Exception as e:
|
321
|
-
# self.logger.error(f"Failed to write or rename manifest: {e}")
|
322
|
-
# # Clean up temp if it exists
|
323
|
-
# try:
|
324
|
-
# if self.fs.exists(temp_path):
|
325
|
-
# self.fs.rm(temp_path, recursive=True)
|
326
|
-
# except Exception:
|
327
|
-
# pass
|
328
|
-
# raise
|
329
|
-
#
|
330
|
-
# # Reset memory & cache
|
331
|
-
# self._new_records.clear()
|
332
|
-
# self._loaded_paths = set(out_df["path"].tolist())
|
333
|
-
# import pandas as pd
|
334
|
-
# import fsspec
|
335
|
-
# import threading
|
336
|
-
# import uuid
|
337
|
-
# from typing import List, Optional, Set, Dict, Any
|
338
|
-
#
|
339
|
-
# from sibi_dst.utils import Logger
|
340
|
-
#
|
341
|
-
#
|
342
|
-
# class MissingManifestManager:
|
343
|
-
# """
|
344
|
-
# Thread-safe manager for a “missing-partitions” manifest (Parquet file).
|
345
|
-
# """
|
346
|
-
#
|
347
|
-
# def __init__(
|
348
|
-
# self,
|
349
|
-
# fs: fsspec.AbstractFileSystem,
|
350
|
-
# manifest_path: str,
|
351
|
-
# clear_existing: bool = False,
|
352
|
-
# **kwargs: Any,
|
353
|
-
# ):
|
354
|
-
# self.fs = fs
|
355
|
-
# self.manifest_path = manifest_path.rstrip("/")
|
356
|
-
# self.clear_existing = clear_existing
|
357
|
-
#
|
358
|
-
# self.debug: bool = kwargs.get("debug", False)
|
359
|
-
# self.logger = kwargs.get(
|
360
|
-
# "logger",
|
361
|
-
# Logger.default_logger(logger_name="missing_manifest_manager")
|
362
|
-
# )
|
363
|
-
# self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
|
364
|
-
#
|
365
|
-
# # In-memory list for new paths
|
366
|
-
# self._new_records: List[Dict[str, str]] = []
|
367
|
-
# # Cached set of existing paths
|
368
|
-
# self._loaded_paths: Optional[Set[str]] = None
|
369
|
-
#
|
370
|
-
# # Use a reentrant lock so save() can call load_existing() safely
|
371
|
-
# self._lock = threading.RLock()
|
372
|
-
#
|
373
|
-
# def _safe_exists(self, path: str) -> bool:
|
374
|
-
# try:
|
375
|
-
# return self.fs.exists(path)
|
376
|
-
# except PermissionError:
|
377
|
-
# if self.debug:
|
378
|
-
# self.logger.debug(f"Permission denied checking existence of '{path}'")
|
379
|
-
# return False
|
380
|
-
# except Exception as e:
|
381
|
-
# self.logger.warning(f"Error checking existence of '{path}': {e}")
|
382
|
-
# return False
|
383
|
-
#
|
384
|
-
# def load_existing(self) -> Set[str]:
|
385
|
-
# """
|
386
|
-
# Load and cache existing manifest paths.
|
387
|
-
# """
|
388
|
-
# with self._lock:
|
389
|
-
# if self._loaded_paths is not None:
|
390
|
-
# return self._loaded_paths
|
391
|
-
#
|
392
|
-
# if not self._safe_exists(self.manifest_path):
|
393
|
-
# self._loaded_paths = set()
|
394
|
-
# return self._loaded_paths
|
395
|
-
#
|
396
|
-
# try:
|
397
|
-
# df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
|
398
|
-
# paths = (
|
399
|
-
# df.get("path", pd.Series(dtype=str))
|
400
|
-
# .dropna().astype(str)
|
401
|
-
# .loc[lambda s: s.str.strip().astype(bool)]
|
402
|
-
# )
|
403
|
-
# self._loaded_paths = set(paths.tolist())
|
404
|
-
# except Exception as e:
|
405
|
-
# self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
|
406
|
-
# self._loaded_paths = set()
|
407
|
-
#
|
408
|
-
# return self._loaded_paths
|
409
|
-
#
|
410
|
-
# def record(self, full_path: str) -> None:
|
411
|
-
# """
|
412
|
-
# Register a missing file path.
|
413
|
-
# """
|
414
|
-
# if not full_path or not isinstance(full_path, str):
|
415
|
-
# return
|
416
|
-
# with self._lock:
|
417
|
-
# self._new_records.append({"path": full_path})
|
418
|
-
#
|
419
|
-
# def save(self) -> None:
|
420
|
-
# """
|
421
|
-
# Merge new records into the manifest and write it out atomically.
|
422
|
-
# """
|
423
|
-
# with self._lock:
|
424
|
-
# # Build DataFrame of new entries
|
425
|
-
# new_df = pd.DataFrame(self._new_records)
|
426
|
-
# should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
|
427
|
-
# if new_df.empty and not should_overwrite:
|
428
|
-
# return
|
429
|
-
#
|
430
|
-
# # Clean new_df
|
431
|
-
# new_df = (
|
432
|
-
# new_df.get("path", pd.Series(dtype=str))
|
433
|
-
# .dropna().astype(str)
|
434
|
-
# .loc[lambda s: s.str.strip().astype(bool)]
|
435
|
-
# .to_frame()
|
436
|
-
# )
|
437
|
-
#
|
438
|
-
# # Merge or overwrite
|
439
|
-
# if should_overwrite:
|
440
|
-
# out_df = new_df
|
441
|
-
# else:
|
442
|
-
# try:
|
443
|
-
# old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
|
444
|
-
# old_paths = (
|
445
|
-
# old_df.get("path", pd.Series(dtype=str))
|
446
|
-
# .dropna().astype(str)
|
447
|
-
# .loc[lambda s: s.str.strip().astype(bool)]
|
448
|
-
# .to_frame()
|
449
|
-
# )
|
450
|
-
# out_df = pd.concat([old_paths, new_df], ignore_index=True)
|
451
|
-
# except Exception as e:
|
452
|
-
# self.logger.warning(f"Could not merge manifest, overwriting: {e}")
|
453
|
-
# out_df = new_df
|
454
|
-
#
|
455
|
-
# out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
|
456
|
-
#
|
457
|
-
# # Ensure parent dir
|
458
|
-
# parent = self.manifest_path.rsplit("/", 1)[0]
|
459
|
-
# try:
|
460
|
-
# self.fs.makedirs(parent, exist_ok=True)
|
461
|
-
# except Exception as e:
|
462
|
-
# self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
|
463
|
-
#
|
464
|
-
# # Write atomically: temp file + rename
|
465
|
-
# temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
|
466
|
-
# try:
|
467
|
-
# out_df.to_parquet(
|
468
|
-
# temp_path,
|
469
|
-
# filesystem=self.fs,
|
470
|
-
# index=False
|
471
|
-
# )
|
472
|
-
# # rename into place (atomic in most filesystems)
|
473
|
-
# self.fs.mv(temp_path, self.manifest_path, recursive=False)
|
474
|
-
# except Exception as e:
|
475
|
-
# self.logger.error(f"Failed to write or rename manifest: {e}")
|
476
|
-
# # Clean up temp if it exists
|
477
|
-
# try:
|
478
|
-
# if self.fs.exists(temp_path):
|
479
|
-
# self.fs.rm(temp_path, recursive=True)
|
480
|
-
# except Exception:
|
481
|
-
# pass
|
482
|
-
# raise
|
483
|
-
#
|
484
|
-
# # Reset memory & cache
|
485
|
-
# self._new_records.clear()
|
486
|
-
# self._loaded_paths = set(out_df["path"].tolist())
|