sibi-dst 0.3.60__py3-none-any.whl → 0.3.61__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,9 @@ import pandas as pd
2
2
  import fsspec
3
3
  import threading
4
4
  import uuid
5
+ import hashlib
6
+ import base64
7
+ import json
5
8
  from typing import List, Optional, Set, Dict, Any
6
9
 
7
10
  from sibi_dst.utils import Logger
@@ -13,11 +16,11 @@ class MissingManifestManager:
13
16
  """
14
17
 
15
18
  def __init__(
16
- self,
17
- fs: fsspec.AbstractFileSystem,
18
- manifest_path: str,
19
- clear_existing: bool = False,
20
- **kwargs: Any,
19
+ self,
20
+ fs: fsspec.AbstractFileSystem,
21
+ manifest_path: str,
22
+ clear_existing: bool = False,
23
+ **kwargs: Any,
21
24
  ):
22
25
  self.fs = fs
23
26
  self.manifest_path = manifest_path.rstrip("/")
@@ -30,29 +33,18 @@ class MissingManifestManager:
30
33
  )
31
34
  self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
32
35
 
33
- # In-memory list for new paths
34
36
  self._new_records: List[Dict[str, str]] = []
35
- # Cached set of existing paths
36
37
  self._loaded_paths: Optional[Set[str]] = None
37
-
38
- # Use a reentrant lock so save() can call load_existing() safely
39
38
  self._lock = threading.RLock()
40
39
 
41
40
  def _safe_exists(self, path: str) -> bool:
42
41
  try:
43
42
  return self.fs.exists(path)
44
- except PermissionError:
45
- if self.debug:
46
- self.logger.debug(f"Permission denied checking existence of '{path}'")
47
- return False
48
43
  except Exception as e:
49
44
  self.logger.warning(f"Error checking existence of '{path}': {e}")
50
45
  return False
51
46
 
52
47
  def load_existing(self) -> Set[str]:
53
- """
54
- Load and cache existing manifest paths.
55
- """
56
48
  with self._lock:
57
49
  if self._loaded_paths is not None:
58
50
  return self._loaded_paths
@@ -65,8 +57,8 @@ class MissingManifestManager:
65
57
  df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
66
58
  paths = (
67
59
  df.get("path", pd.Series(dtype=str))
68
- .dropna().astype(str)
69
- .loc[lambda s: s.str.strip().astype(bool)]
60
+ .dropna().astype(str)
61
+ .loc[lambda s: s.str.strip().astype(bool)]
70
62
  )
71
63
  self._loaded_paths = set(paths.tolist())
72
64
  except Exception as e:
@@ -76,34 +68,25 @@ class MissingManifestManager:
76
68
  return self._loaded_paths
77
69
 
78
70
  def record(self, full_path: str) -> None:
79
- """
80
- Register a missing file path.
81
- """
82
71
  if not full_path or not isinstance(full_path, str):
83
72
  return
84
73
  with self._lock:
85
74
  self._new_records.append({"path": full_path})
86
75
 
87
76
  def save(self) -> None:
88
- """
89
- Merge new records into the manifest and write it out atomically.
90
- """
91
77
  with self._lock:
92
- # Build DataFrame of new entries
93
78
  new_df = pd.DataFrame(self._new_records)
94
79
  should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
95
80
  if new_df.empty and not should_overwrite:
96
81
  return
97
82
 
98
- # Clean new_df
99
83
  new_df = (
100
84
  new_df.get("path", pd.Series(dtype=str))
101
- .dropna().astype(str)
102
- .loc[lambda s: s.str.strip().astype(bool)]
103
- .to_frame()
85
+ .dropna().astype(str)
86
+ .loc[lambda s: s.str.strip().astype(bool)]
87
+ .to_frame()
104
88
  )
105
89
 
106
- # Merge or overwrite
107
90
  if should_overwrite:
108
91
  out_df = new_df
109
92
  else:
@@ -111,9 +94,9 @@ class MissingManifestManager:
111
94
  old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
112
95
  old_paths = (
113
96
  old_df.get("path", pd.Series(dtype=str))
114
- .dropna().astype(str)
115
- .loc[lambda s: s.str.strip().astype(bool)]
116
- .to_frame()
97
+ .dropna().astype(str)
98
+ .loc[lambda s: s.str.strip().astype(bool)]
99
+ .to_frame()
117
100
  )
118
101
  out_df = pd.concat([old_paths, new_df], ignore_index=True)
119
102
  except Exception as e:
@@ -122,14 +105,12 @@ class MissingManifestManager:
122
105
 
123
106
  out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
124
107
 
125
- # Ensure parent dir
126
108
  parent = self.manifest_path.rsplit("/", 1)[0]
127
109
  try:
128
110
  self.fs.makedirs(parent, exist_ok=True)
129
111
  except Exception as e:
130
112
  self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
131
113
 
132
- # Write atomically: temp file + rename
133
114
  temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
134
115
  try:
135
116
  out_df.to_parquet(
@@ -137,18 +118,369 @@ class MissingManifestManager:
137
118
  filesystem=self.fs,
138
119
  index=False
139
120
  )
140
- # rename into place (atomic in most filesystems)
141
- self.fs.mv(temp_path, self.manifest_path, recursive=False)
121
+ self.fs.copy(temp_path, self.manifest_path)
122
+ self.logger.info(f"Copied manifest to {self.manifest_path} (temp: {temp_path})")
142
123
  except Exception as e:
143
- self.logger.error(f"Failed to write or rename manifest: {e}")
144
- # Clean up temp if it exists
145
- try:
146
- if self.fs.exists(temp_path):
147
- self.fs.rm(temp_path, recursive=True)
148
- except Exception:
149
- pass
124
+ self.logger.error(f"Failed to write or copy manifest: {e}")
150
125
  raise
151
126
 
152
- # Reset memory & cache
127
+ self.logger.debug(f"Temp file left behind: {temp_path}")
153
128
  self._new_records.clear()
154
- self._loaded_paths = set(out_df["path"].tolist())
129
+ self._loaded_paths = set(out_df["path"].tolist())
130
+
131
def cleanup_temp_manifests(self) -> None:
    """
    Delete leftover temporary manifest objects stored next to the manifest.

    A temp object is any entry in the manifest's parent prefix whose file
    name starts with ``<manifest file name>.tmp-`` -- the naming pattern
    ``save()`` uses for its temp path. Only this manifest's temp objects
    are removed; other files under the same prefix are left alone.

    The method is a no-op (apart from an info log) when the filesystem has
    no ``s3`` attribute. Failures are logged at error level and swallowed,
    so this never raises.
    """
    if not hasattr(self.fs, "s3"):
        self.logger.info("Filesystem is not s3fs; skipping temp cleanup.")
        return

    try:
        parent, _, manifest_name = self.manifest_path.rpartition("/")
        bucket, prefix = self._parse_s3_path(parent)

        # Temp names end with a random hex suffix, not ".parquet", so match
        # on the "<manifest name>.tmp-" prefix of the entry's base name.
        temp_marker = f"{manifest_name}.tmp-"
        entries = self.fs.ls(f"s3://{bucket}/{prefix}", detail=True)
        temp_paths = [
            entry["name"]
            for entry in entries
            if entry["name"].rsplit("/", 1)[-1].startswith(temp_marker)
        ]
        if not temp_paths:
            return

        # Delete through the filesystem API rather than a raw client call:
        # it accepts a list of paths and needs no hand-computed Content-MD5.
        self.fs.rm(temp_paths)
        self.logger.info(f"Deleted {len(temp_paths)} temp manifest files in s3://{bucket}/{prefix}")
    except Exception as e:
        self.logger.error(f"Failed to cleanup temp manifest files: {e}")
163
+
164
@staticmethod
def _parse_s3_path(s3_path: str):
    """
    Split an ``s3://bucket/key`` URL into ``(bucket_name, prefix)``.

    The prefix is everything after the first ``/`` that follows the bucket
    name, returned unchanged; it is ``""`` when the URL names only a bucket.

    Raises:
        ValueError: if ``s3_path`` does not start with ``s3://`` or has an
            empty bucket name (e.g. ``s3://`` or ``s3:///key``).
    """
    scheme = "s3://"
    if not s3_path.startswith(scheme):
        raise ValueError("Invalid S3 path. Must start with 's3://'.")
    bucket_name, _, prefix = s3_path[len(scheme):].partition("/")
    # An empty bucket would otherwise flow into listing/deleting calls.
    if not bucket_name:
        raise ValueError("Invalid S3 path. Bucket name is missing.")
    return bucket_name, prefix
172
+
173
+ # import pandas as pd
174
+ # import fsspec
175
+ # import threading
176
+ # import uuid
177
+ # from typing import List, Optional, Set, Dict, Any
178
+ #
179
+ # from sibi_dst.utils import Logger
180
+ #
181
+ #
182
+ # class MissingManifestManager:
183
+ # """
184
+ # Thread-safe manager for a “missing-partitions” manifest (Parquet file).
185
+ # """
186
+ #
187
+ # def __init__(
188
+ # self,
189
+ # fs: fsspec.AbstractFileSystem,
190
+ # manifest_path: str,
191
+ # clear_existing: bool = False,
192
+ # **kwargs: Any,
193
+ # ):
194
+ # self.fs = fs
195
+ # self.manifest_path = manifest_path.rstrip("/")
196
+ # self.clear_existing = clear_existing
197
+ #
198
+ # self.debug: bool = kwargs.get("debug", False)
199
+ # self.logger = kwargs.get(
200
+ # "logger",
201
+ # Logger.default_logger(logger_name="missing_manifest_manager")
202
+ # )
203
+ # self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
204
+ #
205
+ # # In-memory list for new paths
206
+ # self._new_records: List[Dict[str, str]] = []
207
+ # # Cached set of existing paths
208
+ # self._loaded_paths: Optional[Set[str]] = None
209
+ #
210
+ # # Use a reentrant lock so save() can call load_existing() safely
211
+ # self._lock = threading.RLock()
212
+ #
213
+ # def _safe_exists(self, path: str) -> bool:
214
+ # try:
215
+ # return self.fs.exists(path)
216
+ # except PermissionError:
217
+ # if self.debug:
218
+ # self.logger.debug(f"Permission denied checking existence of '{path}'")
219
+ # return False
220
+ # except Exception as e:
221
+ # self.logger.warning(f"Error checking existence of '{path}': {e}")
222
+ # return False
223
+ #
224
+ # def load_existing(self) -> Set[str]:
225
+ # """
226
+ # Load and cache existing manifest paths.
227
+ # """
228
+ # with self._lock:
229
+ # if self._loaded_paths is not None:
230
+ # return self._loaded_paths
231
+ #
232
+ # if not self._safe_exists(self.manifest_path):
233
+ # self._loaded_paths = set()
234
+ # return self._loaded_paths
235
+ #
236
+ # try:
237
+ # df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
238
+ # paths = (
239
+ # df.get("path", pd.Series(dtype=str))
240
+ # .dropna().astype(str)
241
+ # .loc[lambda s: s.str.strip().astype(bool)]
242
+ # )
243
+ # self._loaded_paths = set(paths.tolist())
244
+ # except Exception as e:
245
+ # self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
246
+ # self._loaded_paths = set()
247
+ #
248
+ # return self._loaded_paths
249
+ #
250
+ # def record(self, full_path: str) -> None:
251
+ # """
252
+ # Register a missing file path.
253
+ # """
254
+ # if not full_path or not isinstance(full_path, str):
255
+ # return
256
+ # with self._lock:
257
+ # self._new_records.append({"path": full_path})
258
+ #
259
+ # def save(self) -> None:
260
+ # """
261
+ # Merge new records into the manifest and write it out atomically.
262
+ # """
263
+ # with self._lock:
264
+ # # Build DataFrame of new entries
265
+ # new_df = pd.DataFrame(self._new_records)
266
+ # should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
267
+ # if new_df.empty and not should_overwrite:
268
+ # return
269
+ #
270
+ # # Clean new_df
271
+ # new_df = (
272
+ # new_df.get("path", pd.Series(dtype=str))
273
+ # .dropna().astype(str)
274
+ # .loc[lambda s: s.str.strip().astype(bool)]
275
+ # .to_frame()
276
+ # )
277
+ #
278
+ # # Merge or overwrite
279
+ # if should_overwrite:
280
+ # out_df = new_df
281
+ # else:
282
+ # try:
283
+ # old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
284
+ # old_paths = (
285
+ # old_df.get("path", pd.Series(dtype=str))
286
+ # .dropna().astype(str)
287
+ # .loc[lambda s: s.str.strip().astype(bool)]
288
+ # .to_frame()
289
+ # )
290
+ # out_df = pd.concat([old_paths, new_df], ignore_index=True)
291
+ # except Exception as e:
292
+ # self.logger.warning(f"Could not merge manifest, overwriting: {e}")
293
+ # out_df = new_df
294
+ #
295
+ # out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
296
+ #
297
+ # # Ensure parent dir
298
+ # parent = self.manifest_path.rsplit("/", 1)[0]
299
+ # try:
300
+ # self.fs.makedirs(parent, exist_ok=True)
301
+ # except Exception as e:
302
+ # self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
303
+ #
304
+ # # Write atomically: temp file + rename
305
+ # temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
306
+ # try:
307
+ # out_df.to_parquet(
308
+ # temp_path,
309
+ # filesystem=self.fs,
310
+ # index=False
311
+ # )
312
+ # # rename into place (atomic in most filesystems)
313
+ # #self.fs.mv(temp_path, self.manifest_path, recursive=False)
314
+ # try:
315
+ # self.fs.copy(temp_path, self.manifest_path)
316
+ # self.fs.rm(temp_path)
317
+ # except Exception as e:
318
+ # self.logger.error(f"Failed to copy or delete manifest: {e}")
319
+ # raise
320
+ # except Exception as e:
321
+ # self.logger.error(f"Failed to write or rename manifest: {e}")
322
+ # # Clean up temp if it exists
323
+ # try:
324
+ # if self.fs.exists(temp_path):
325
+ # self.fs.rm(temp_path, recursive=True)
326
+ # except Exception:
327
+ # pass
328
+ # raise
329
+ #
330
+ # # Reset memory & cache
331
+ # self._new_records.clear()
332
+ # self._loaded_paths = set(out_df["path"].tolist())
333
+ # import pandas as pd
334
+ # import fsspec
335
+ # import threading
336
+ # import uuid
337
+ # from typing import List, Optional, Set, Dict, Any
338
+ #
339
+ # from sibi_dst.utils import Logger
340
+ #
341
+ #
342
+ # class MissingManifestManager:
343
+ # """
344
+ # Thread-safe manager for a “missing-partitions” manifest (Parquet file).
345
+ # """
346
+ #
347
+ # def __init__(
348
+ # self,
349
+ # fs: fsspec.AbstractFileSystem,
350
+ # manifest_path: str,
351
+ # clear_existing: bool = False,
352
+ # **kwargs: Any,
353
+ # ):
354
+ # self.fs = fs
355
+ # self.manifest_path = manifest_path.rstrip("/")
356
+ # self.clear_existing = clear_existing
357
+ #
358
+ # self.debug: bool = kwargs.get("debug", False)
359
+ # self.logger = kwargs.get(
360
+ # "logger",
361
+ # Logger.default_logger(logger_name="missing_manifest_manager")
362
+ # )
363
+ # self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
364
+ #
365
+ # # In-memory list for new paths
366
+ # self._new_records: List[Dict[str, str]] = []
367
+ # # Cached set of existing paths
368
+ # self._loaded_paths: Optional[Set[str]] = None
369
+ #
370
+ # # Use a reentrant lock so save() can call load_existing() safely
371
+ # self._lock = threading.RLock()
372
+ #
373
+ # def _safe_exists(self, path: str) -> bool:
374
+ # try:
375
+ # return self.fs.exists(path)
376
+ # except PermissionError:
377
+ # if self.debug:
378
+ # self.logger.debug(f"Permission denied checking existence of '{path}'")
379
+ # return False
380
+ # except Exception as e:
381
+ # self.logger.warning(f"Error checking existence of '{path}': {e}")
382
+ # return False
383
+ #
384
+ # def load_existing(self) -> Set[str]:
385
+ # """
386
+ # Load and cache existing manifest paths.
387
+ # """
388
+ # with self._lock:
389
+ # if self._loaded_paths is not None:
390
+ # return self._loaded_paths
391
+ #
392
+ # if not self._safe_exists(self.manifest_path):
393
+ # self._loaded_paths = set()
394
+ # return self._loaded_paths
395
+ #
396
+ # try:
397
+ # df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
398
+ # paths = (
399
+ # df.get("path", pd.Series(dtype=str))
400
+ # .dropna().astype(str)
401
+ # .loc[lambda s: s.str.strip().astype(bool)]
402
+ # )
403
+ # self._loaded_paths = set(paths.tolist())
404
+ # except Exception as e:
405
+ # self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
406
+ # self._loaded_paths = set()
407
+ #
408
+ # return self._loaded_paths
409
+ #
410
+ # def record(self, full_path: str) -> None:
411
+ # """
412
+ # Register a missing file path.
413
+ # """
414
+ # if not full_path or not isinstance(full_path, str):
415
+ # return
416
+ # with self._lock:
417
+ # self._new_records.append({"path": full_path})
418
+ #
419
+ # def save(self) -> None:
420
+ # """
421
+ # Merge new records into the manifest and write it out atomically.
422
+ # """
423
+ # with self._lock:
424
+ # # Build DataFrame of new entries
425
+ # new_df = pd.DataFrame(self._new_records)
426
+ # should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
427
+ # if new_df.empty and not should_overwrite:
428
+ # return
429
+ #
430
+ # # Clean new_df
431
+ # new_df = (
432
+ # new_df.get("path", pd.Series(dtype=str))
433
+ # .dropna().astype(str)
434
+ # .loc[lambda s: s.str.strip().astype(bool)]
435
+ # .to_frame()
436
+ # )
437
+ #
438
+ # # Merge or overwrite
439
+ # if should_overwrite:
440
+ # out_df = new_df
441
+ # else:
442
+ # try:
443
+ # old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
444
+ # old_paths = (
445
+ # old_df.get("path", pd.Series(dtype=str))
446
+ # .dropna().astype(str)
447
+ # .loc[lambda s: s.str.strip().astype(bool)]
448
+ # .to_frame()
449
+ # )
450
+ # out_df = pd.concat([old_paths, new_df], ignore_index=True)
451
+ # except Exception as e:
452
+ # self.logger.warning(f"Could not merge manifest, overwriting: {e}")
453
+ # out_df = new_df
454
+ #
455
+ # out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
456
+ #
457
+ # # Ensure parent dir
458
+ # parent = self.manifest_path.rsplit("/", 1)[0]
459
+ # try:
460
+ # self.fs.makedirs(parent, exist_ok=True)
461
+ # except Exception as e:
462
+ # self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
463
+ #
464
+ # # Write atomically: temp file + rename
465
+ # temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
466
+ # try:
467
+ # out_df.to_parquet(
468
+ # temp_path,
469
+ # filesystem=self.fs,
470
+ # index=False
471
+ # )
472
+ # # rename into place (atomic in most filesystems)
473
+ # self.fs.mv(temp_path, self.manifest_path, recursive=False)
474
+ # except Exception as e:
475
+ # self.logger.error(f"Failed to write or rename manifest: {e}")
476
+ # # Clean up temp if it exists
477
+ # try:
478
+ # if self.fs.exists(temp_path):
479
+ # self.fs.rm(temp_path, recursive=True)
480
+ # except Exception:
481
+ # pass
482
+ # raise
483
+ #
484
+ # # Reset memory & cache
485
+ # self._new_records.clear()
486
+ # self._loaded_paths = set(out_df["path"].tolist())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 0.3.60
3
+ Version: 0.3.61
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -50,7 +50,7 @@ sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11
50
50
  sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
51
51
  sibi_dst/utils/filepath_generator.py,sha256=-HHO0U-PR8fysDDFwnWdHRlgqksh_RkmgBZLWv9hM7s,6669
52
52
  sibi_dst/utils/log_utils.py,sha256=77xACRagKU83H9vn7aVeBzkQjxWlbe4dg4KuxPRCgvw,4635
53
- sibi_dst/utils/manifest_manager.py,sha256=abm97TuWgJqNViPXMbpl5W7ttrg1BeiJkf2SMGc4hd8,5512
53
+ sibi_dst/utils/manifest_manager.py,sha256=eyk6Dvrn86gUpAaAsnQvNnEJn5-Tno-sDDJsDMfHtTA,18161
54
54
  sibi_dst/utils/parquet_saver.py,sha256=O62xwPfphOpKgEiHqnts20CPSU96pxs49Cg7PVetLK0,8193
55
55
  sibi_dst/utils/phone_formatter.py,sha256=tsVTDamuthFYgy4-5UwmQkPQ-FGTGH7MjZyH8utAkIY,4945
56
56
  sibi_dst/utils/storage_config.py,sha256=TE15H-7d0mqwYPSUgrdidK9U7N7p87Z8JfUQH4-jdPs,4123
@@ -77,6 +77,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
77
77
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
78
78
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
79
79
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
80
- sibi_dst-0.3.60.dist-info/METADATA,sha256=WuBvzuHMuZBUpHF2-dAO65MI1e5EhuZ1-Hvil3oQY6o,4292
81
- sibi_dst-0.3.60.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
82
- sibi_dst-0.3.60.dist-info/RECORD,,
80
+ sibi_dst-0.3.61.dist-info/METADATA,sha256=GZ-Yz9oiehgGgI2iJoCejdExgtclAlaz-N-sI5hGIi0,4292
81
+ sibi_dst-0.3.61.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
82
+ sibi_dst-0.3.61.dist-info/RECORD,,