sibi-dst 2025.1.4__py3-none-any.whl → 2025.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,17 +2,26 @@ import pandas as pd
2
2
  import fsspec
3
3
  import threading
4
4
  import uuid
5
- import hashlib
6
- import base64
7
- import json
8
5
  from typing import List, Optional, Set, Dict, Any
9
-
6
+ import json, base64, hashlib
10
7
  from sibi_dst.utils import Logger
11
8
 
12
9
 
13
10
  class MissingManifestManager:
14
11
  """
15
- Thread-safe manager for a “missing-partitions” manifest (Parquet file).
12
+ A thread-safe manager for a Parquet file manifest.
13
+
14
+ This class handles creating, reading, and appending to a Parquet manifest file
15
+ that tracks a list of paths. It is designed to be resilient, using atomic
16
+ file operations to prevent data corruption during writes, and can clean up
17
+ orphaned temporary files from previous runs.
18
+
19
+ Attributes:
20
+ fs (fsspec.AbstractFileSystem): The filesystem object to interact with.
21
+ manifest_path (str): The full path to the manifest file.
22
+ clear_existing (bool): If True, any existing manifest will be overwritten
23
+ on the first save operation of this instance's lifecycle.
24
+ logger (Logger): A logger instance for logging messages.
16
25
  """
17
26
 
18
27
  def __init__(
@@ -22,12 +31,12 @@ class MissingManifestManager:
22
31
  clear_existing: bool = False,
23
32
  **kwargs: Any,
24
33
  ):
25
- self.fs = fs
26
- self.manifest_path = manifest_path.rstrip("/")
27
- self.clear_existing = clear_existing
34
+ self.fs: fsspec.AbstractFileSystem = fs
35
+ self.manifest_path: str = manifest_path.rstrip("/")
36
+ self.clear_existing: bool = clear_existing
28
37
 
29
38
  self.debug: bool = kwargs.get("debug", False)
30
- self.logger = kwargs.get(
39
+ self.logger: Logger = kwargs.get(
31
40
  "logger",
32
41
  Logger.default_logger(logger_name="missing_manifest_manager")
33
42
  )
@@ -35,9 +44,13 @@ class MissingManifestManager:
35
44
 
36
45
  self._new_records: List[Dict[str, str]] = []
37
46
  self._loaded_paths: Optional[Set[str]] = None
38
- self._lock = threading.RLock()
47
+ self._lock = threading.Lock() # A standard Lock is sufficient
48
+
49
+ # Clean up any orphaned temp files from previous failed runs
50
+ self._cleanup_orphaned_files()
39
51
 
40
52
  def _safe_exists(self, path: str) -> bool:
53
+ """Safely check if a path exists, handling potential exceptions."""
41
54
  try:
42
55
  return self.fs.exists(path)
43
56
  except Exception as e:
@@ -45,6 +58,15 @@ class MissingManifestManager:
45
58
  return False
46
59
 
47
60
  def load_existing(self) -> Set[str]:
61
+ """
62
+ Loads the set of paths from the existing manifest file.
63
+
64
+ The result is cached in memory. If the manifest does not exist or fails
65
+ to load, an empty set is returned. This operation is thread-safe.
66
+
67
+ Returns:
68
+ A set of strings, where each string is a path from the manifest.
69
+ """
48
70
  with self._lock:
49
71
  if self._loaded_paths is not None:
50
72
  return self._loaded_paths
@@ -55,6 +77,7 @@ class MissingManifestManager:
55
77
 
56
78
  try:
57
79
  df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
80
+ # Robustly extract non-empty, non-null paths
58
81
  paths = (
59
82
  df.get("path", pd.Series(dtype=str))
60
83
  .dropna().astype(str)
@@ -62,24 +85,41 @@ class MissingManifestManager:
62
85
  )
63
86
  self._loaded_paths = set(paths.tolist())
64
87
  except Exception as e:
65
- self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
88
+ self.logger.warning(
89
+ f"Failed to load manifest '{self.manifest_path}', "
90
+ f"treating as empty. Error: {e}"
91
+ )
66
92
  self._loaded_paths = set()
67
93
 
68
94
  return self._loaded_paths
69
95
 
70
96
  def record(self, full_path: str) -> None:
97
+ """
98
+ Records a new path to be added to the manifest upon the next save.
99
+
100
+ Args:
101
+ full_path: The path to record.
102
+ """
71
103
  if not full_path or not isinstance(full_path, str):
72
104
  return
73
105
  with self._lock:
74
106
  self._new_records.append({"path": full_path})
75
107
 
76
108
  def save(self) -> None:
109
+ """
110
+ Saves all new records to the manifest file.
111
+
112
+ This method merges new records with existing ones (unless `clear_existing`
113
+ is True), removes duplicates, and writes the result back to the manifest.
114
+ The write operation is performed atomically by writing to a temporary file
115
+ first, then renaming or copying it to the final destination.
116
+ """
77
117
  with self._lock:
78
- new_df = pd.DataFrame(self._new_records)
79
- should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
80
- if new_df.empty and not should_overwrite:
118
+ if not self._new_records and not self.clear_existing:
119
+ self.logger.debug("Manifest Manager: No new records to save.")
81
120
  return
82
121
 
122
+ new_df = pd.DataFrame(self._new_records)
83
123
  new_df = (
84
124
  new_df.get("path", pd.Series(dtype=str))
85
125
  .dropna().astype(str)
@@ -87,79 +127,75 @@ class MissingManifestManager:
87
127
  .to_frame(name="path")
88
128
  )
89
129
 
130
+ # Determine the final DataFrame to be written
131
+ should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
90
132
  if should_overwrite:
91
133
  out_df = new_df
92
134
  else:
93
135
  try:
94
136
  old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
95
- old_paths = (
96
- old_df.get("path", pd.Series(dtype=str))
97
- .dropna().astype(str)
98
- .loc[lambda s: s.str.strip().astype(bool)]
99
- .to_frame(name="path")
100
- )
101
- out_df = pd.concat([old_paths, new_df], ignore_index=True)
137
+ out_df = pd.concat([old_df, new_df], ignore_index=True)
102
138
  except Exception as e:
103
- self.logger.warning(f"Could not merge manifest, overwriting: {e}")
139
+ self.logger.warning(f"Could not read existing manifest to merge, overwriting. Error: {e}")
104
140
  out_df = new_df
105
141
 
106
142
  out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
107
143
 
144
+ # Ensure parent directory exists
108
145
  parent = self.manifest_path.rsplit("/", 1)[0]
109
- try:
110
- self.fs.makedirs(parent, exist_ok=True)
111
- except Exception as e:
112
- self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
146
+ self.fs.makedirs(parent, exist_ok=True)
113
147
 
148
+ # Perform an atomic write using a temporary file
114
149
  temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
115
150
  try:
116
- out_df.to_parquet(
117
- temp_path,
118
- filesystem=self.fs,
119
- index=False
120
- )
151
+ out_df.to_parquet(temp_path, filesystem=self.fs, index=False)
121
152
  self.fs.copy(temp_path, self.manifest_path)
153
+ self.fs.rm_file(temp_path)
122
154
  self.logger.info(f"Copied manifest to {self.manifest_path} (temp: {temp_path})")
123
155
  except Exception as e:
124
- self.logger.error(f"Failed to write or copy manifest: {e}")
156
+ self.logger.error(f"Failed to write or move manifest: {e}")
157
+ # Re-raise so the caller knows the save operation failed
125
158
  #raise
126
-
127
- self.logger.debug(f"Temp file left behind: {temp_path}")
159
+ finally:
160
+ # CRITICAL: Always clean up the temporary file
161
+ if self._safe_exists(temp_path):
162
+ try:
163
+ self._cleanup_orphaned_files()
164
+ except Exception as e:
165
+ self.logger.error(f"Failed to remove temporary file '{temp_path}': {e}")
166
+
167
+ # Reset internal state
128
168
  self._new_records.clear()
129
169
  self._loaded_paths = set(out_df["path"].tolist())
170
+ # After the first successful save, disable clear_existing behavior
171
+ self.clear_existing = False
130
172
 
131
- def cleanup_temp_manifests(self) -> None:
173
+ def _cleanup_orphaned_files(self) -> None:
174
+ """Finds and removes any orphaned temporary manifest files from prior runs."""
175
+ self.logger.debug("Checking for orphaned temporary files...")
132
176
  if not hasattr(self.fs, "s3"):
133
177
  self.logger.info("Filesystem is not s3fs; skipping temp cleanup.")
134
178
  return
135
-
136
179
  try:
137
- bucket, prefix = self._parse_s3_path(self.manifest_path.rsplit("/", 1)[0])
138
- files = self.fs.ls(f"s3://{bucket}/{prefix}", detail=True)
139
- temp_files = [
140
- f for f in files
141
- if f["name"].endswith(".parquet") and ".tmp-" in f["name"]
142
- ]
143
- if not temp_files:
144
- return
145
180
 
146
- objects = [{"Key": f["name"].replace(f"{bucket}/", "", 1)} for f in temp_files]
147
- delete_payload = {
148
- "Objects": objects,
149
- "Quiet": True
150
- }
181
+ # Use glob to find all files matching the temp pattern in a filesystem-agnostic way
182
+ temp_file_pattern = f"{self.manifest_path}.tmp-*"
183
+ orphaned_files = self.fs.glob(temp_file_pattern)
151
184
 
152
- json_payload = json.dumps(delete_payload).encode("utf-8")
153
- content_md5 = base64.b64encode(hashlib.md5(json_payload).digest()).decode("utf-8")
185
+ if not orphaned_files:
186
+ self.logger.debug("No orphaned files found.")
187
+ return
154
188
 
155
- self.fs.s3.meta.client.delete_objects(
156
- Bucket=bucket,
157
- Delete=delete_payload,
158
- ContentMD5=content_md5
159
- )
160
- self.logger.info(f"Deleted {len(objects)} temp manifest files in s3://{bucket}/{prefix}")
189
+ self.logger.info(f"Found {orphaned_files} orphaned temp manifest(s). Cleaning up...")
190
+ for f_path in orphaned_files:
191
+ try:
192
+ self.fs.rm_file(f_path)
193
+ self.logger.info(f"Deleted orphaned file: {f_path}")
194
+ except Exception as e:
195
+ self.logger.warning(f"Failed to delete orphaned temp file '{f_path}': {e}")
161
196
  except Exception as e:
162
- self.logger.error(f"Failed to cleanup temp manifest files: {e}")
197
+ # This is a non-critical operation, so we just log the error
198
+ self.logger.error(f"An unexpected error occurred during temp file cleanup: {e}")
163
199
 
164
200
  @staticmethod
165
201
  def _parse_s3_path(s3_path: str):
@@ -169,318 +205,3 @@ class MissingManifestManager:
169
205
  bucket_name = path_parts[0]
170
206
  prefix = path_parts[1] if len(path_parts) > 1 else ""
171
207
  return bucket_name, prefix
172
-
173
- # import pandas as pd
174
- # import fsspec
175
- # import threading
176
- # import uuid
177
- # from typing import List, Optional, Set, Dict, Any
178
- #
179
- # from sibi_dst.utils import Logger
180
- #
181
- #
182
- # class MissingManifestManager:
183
- # """
184
- # Thread-safe manager for a “missing-partitions” manifest (Parquet file).
185
- # """
186
- #
187
- # def __init__(
188
- # self,
189
- # fs: fsspec.AbstractFileSystem,
190
- # manifest_path: str,
191
- # clear_existing: bool = False,
192
- # **kwargs: Any,
193
- # ):
194
- # self.fs = fs
195
- # self.manifest_path = manifest_path.rstrip("/")
196
- # self.clear_existing = clear_existing
197
- #
198
- # self.debug: bool = kwargs.get("debug", False)
199
- # self.logger = kwargs.get(
200
- # "logger",
201
- # Logger.default_logger(logger_name="missing_manifest_manager")
202
- # )
203
- # self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
204
- #
205
- # # In-memory list for new paths
206
- # self._new_records: List[Dict[str, str]] = []
207
- # # Cached set of existing paths
208
- # self._loaded_paths: Optional[Set[str]] = None
209
- #
210
- # # Use a reentrant lock so save() can call load_existing() safely
211
- # self._lock = threading.RLock()
212
- #
213
- # def _safe_exists(self, path: str) -> bool:
214
- # try:
215
- # return self.fs.exists(path)
216
- # except PermissionError:
217
- # if self.debug:
218
- # self.logger.debug(f"Permission denied checking existence of '{path}'")
219
- # return False
220
- # except Exception as e:
221
- # self.logger.warning(f"Error checking existence of '{path}': {e}")
222
- # return False
223
- #
224
- # def load_existing(self) -> Set[str]:
225
- # """
226
- # Load and cache existing manifest paths.
227
- # """
228
- # with self._lock:
229
- # if self._loaded_paths is not None:
230
- # return self._loaded_paths
231
- #
232
- # if not self._safe_exists(self.manifest_path):
233
- # self._loaded_paths = set()
234
- # return self._loaded_paths
235
- #
236
- # try:
237
- # df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
238
- # paths = (
239
- # df.get("path", pd.Series(dtype=str))
240
- # .dropna().astype(str)
241
- # .loc[lambda s: s.str.strip().astype(bool)]
242
- # )
243
- # self._loaded_paths = set(paths.tolist())
244
- # except Exception as e:
245
- # self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
246
- # self._loaded_paths = set()
247
- #
248
- # return self._loaded_paths
249
- #
250
- # def record(self, full_path: str) -> None:
251
- # """
252
- # Register a missing file path.
253
- # """
254
- # if not full_path or not isinstance(full_path, str):
255
- # return
256
- # with self._lock:
257
- # self._new_records.append({"path": full_path})
258
- #
259
- # def save(self) -> None:
260
- # """
261
- # Merge new records into the manifest and write it out atomically.
262
- # """
263
- # with self._lock:
264
- # # Build DataFrame of new entries
265
- # new_df = pd.DataFrame(self._new_records)
266
- # should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
267
- # if new_df.empty and not should_overwrite:
268
- # return
269
- #
270
- # # Clean new_df
271
- # new_df = (
272
- # new_df.get("path", pd.Series(dtype=str))
273
- # .dropna().astype(str)
274
- # .loc[lambda s: s.str.strip().astype(bool)]
275
- # .to_frame()
276
- # )
277
- #
278
- # # Merge or overwrite
279
- # if should_overwrite:
280
- # out_df = new_df
281
- # else:
282
- # try:
283
- # old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
284
- # old_paths = (
285
- # old_df.get("path", pd.Series(dtype=str))
286
- # .dropna().astype(str)
287
- # .loc[lambda s: s.str.strip().astype(bool)]
288
- # .to_frame()
289
- # )
290
- # out_df = pd.concat([old_paths, new_df], ignore_index=True)
291
- # except Exception as e:
292
- # self.logger.warning(f"Could not merge manifest, overwriting: {e}")
293
- # out_df = new_df
294
- #
295
- # out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
296
- #
297
- # # Ensure parent dir
298
- # parent = self.manifest_path.rsplit("/", 1)[0]
299
- # try:
300
- # self.fs.makedirs(parent, exist_ok=True)
301
- # except Exception as e:
302
- # self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
303
- #
304
- # # Write atomically: temp file + rename
305
- # temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
306
- # try:
307
- # out_df.to_parquet(
308
- # temp_path,
309
- # filesystem=self.fs,
310
- # index=False
311
- # )
312
- # # rename into place (atomic in most filesystems)
313
- # #self.fs.mv(temp_path, self.manifest_path, recursive=False)
314
- # try:
315
- # self.fs.copy(temp_path, self.manifest_path)
316
- # self.fs.rm(temp_path)
317
- # except Exception as e:
318
- # self.logger.error(f"Failed to copy or delete manifest: {e}")
319
- # raise
320
- # except Exception as e:
321
- # self.logger.error(f"Failed to write or rename manifest: {e}")
322
- # # Clean up temp if it exists
323
- # try:
324
- # if self.fs.exists(temp_path):
325
- # self.fs.rm(temp_path, recursive=True)
326
- # except Exception:
327
- # pass
328
- # raise
329
- #
330
- # # Reset memory & cache
331
- # self._new_records.clear()
332
- # self._loaded_paths = set(out_df["path"].tolist())
333
- # import pandas as pd
334
- # import fsspec
335
- # import threading
336
- # import uuid
337
- # from typing import List, Optional, Set, Dict, Any
338
- #
339
- # from sibi_dst.utils import Logger
340
- #
341
- #
342
- # class MissingManifestManager:
343
- # """
344
- # Thread-safe manager for a “missing-partitions” manifest (Parquet file).
345
- # """
346
- #
347
- # def __init__(
348
- # self,
349
- # fs: fsspec.AbstractFileSystem,
350
- # manifest_path: str,
351
- # clear_existing: bool = False,
352
- # **kwargs: Any,
353
- # ):
354
- # self.fs = fs
355
- # self.manifest_path = manifest_path.rstrip("/")
356
- # self.clear_existing = clear_existing
357
- #
358
- # self.debug: bool = kwargs.get("debug", False)
359
- # self.logger = kwargs.get(
360
- # "logger",
361
- # Logger.default_logger(logger_name="missing_manifest_manager")
362
- # )
363
- # self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
364
- #
365
- # # In-memory list for new paths
366
- # self._new_records: List[Dict[str, str]] = []
367
- # # Cached set of existing paths
368
- # self._loaded_paths: Optional[Set[str]] = None
369
- #
370
- # # Use a reentrant lock so save() can call load_existing() safely
371
- # self._lock = threading.RLock()
372
- #
373
- # def _safe_exists(self, path: str) -> bool:
374
- # try:
375
- # return self.fs.exists(path)
376
- # except PermissionError:
377
- # if self.debug:
378
- # self.logger.debug(f"Permission denied checking existence of '{path}'")
379
- # return False
380
- # except Exception as e:
381
- # self.logger.warning(f"Error checking existence of '{path}': {e}")
382
- # return False
383
- #
384
- # def load_existing(self) -> Set[str]:
385
- # """
386
- # Load and cache existing manifest paths.
387
- # """
388
- # with self._lock:
389
- # if self._loaded_paths is not None:
390
- # return self._loaded_paths
391
- #
392
- # if not self._safe_exists(self.manifest_path):
393
- # self._loaded_paths = set()
394
- # return self._loaded_paths
395
- #
396
- # try:
397
- # df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
398
- # paths = (
399
- # df.get("path", pd.Series(dtype=str))
400
- # .dropna().astype(str)
401
- # .loc[lambda s: s.str.strip().astype(bool)]
402
- # )
403
- # self._loaded_paths = set(paths.tolist())
404
- # except Exception as e:
405
- # self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
406
- # self._loaded_paths = set()
407
- #
408
- # return self._loaded_paths
409
- #
410
- # def record(self, full_path: str) -> None:
411
- # """
412
- # Register a missing file path.
413
- # """
414
- # if not full_path or not isinstance(full_path, str):
415
- # return
416
- # with self._lock:
417
- # self._new_records.append({"path": full_path})
418
- #
419
- # def save(self) -> None:
420
- # """
421
- # Merge new records into the manifest and write it out atomically.
422
- # """
423
- # with self._lock:
424
- # # Build DataFrame of new entries
425
- # new_df = pd.DataFrame(self._new_records)
426
- # should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
427
- # if new_df.empty and not should_overwrite:
428
- # return
429
- #
430
- # # Clean new_df
431
- # new_df = (
432
- # new_df.get("path", pd.Series(dtype=str))
433
- # .dropna().astype(str)
434
- # .loc[lambda s: s.str.strip().astype(bool)]
435
- # .to_frame()
436
- # )
437
- #
438
- # # Merge or overwrite
439
- # if should_overwrite:
440
- # out_df = new_df
441
- # else:
442
- # try:
443
- # old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
444
- # old_paths = (
445
- # old_df.get("path", pd.Series(dtype=str))
446
- # .dropna().astype(str)
447
- # .loc[lambda s: s.str.strip().astype(bool)]
448
- # .to_frame()
449
- # )
450
- # out_df = pd.concat([old_paths, new_df], ignore_index=True)
451
- # except Exception as e:
452
- # self.logger.warning(f"Could not merge manifest, overwriting: {e}")
453
- # out_df = new_df
454
- #
455
- # out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
456
- #
457
- # # Ensure parent dir
458
- # parent = self.manifest_path.rsplit("/", 1)[0]
459
- # try:
460
- # self.fs.makedirs(parent, exist_ok=True)
461
- # except Exception as e:
462
- # self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
463
- #
464
- # # Write atomically: temp file + rename
465
- # temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
466
- # try:
467
- # out_df.to_parquet(
468
- # temp_path,
469
- # filesystem=self.fs,
470
- # index=False
471
- # )
472
- # # rename into place (atomic in most filesystems)
473
- # self.fs.mv(temp_path, self.manifest_path, recursive=False)
474
- # except Exception as e:
475
- # self.logger.error(f"Failed to write or rename manifest: {e}")
476
- # # Clean up temp if it exists
477
- # try:
478
- # if self.fs.exists(temp_path):
479
- # self.fs.rm(temp_path, recursive=True)
480
- # except Exception:
481
- # pass
482
- # raise
483
- #
484
- # # Reset memory & cache
485
- # self._new_records.clear()
486
- # self._loaded_paths = set(out_df["path"].tolist())