sibi-dst 0.3.59__py3-none-any.whl → 0.3.61__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/utils/__init__.py +3 -1
- sibi_dst/utils/manifest_manager.py +378 -46
- sibi_dst/utils/storage_config.py +59 -1
- {sibi_dst-0.3.59.dist-info → sibi_dst-0.3.61.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.59.dist-info → sibi_dst-0.3.61.dist-info}/RECORD +6 -6
- {sibi_dst-0.3.59.dist-info → sibi_dst-0.3.61.dist-info}/WHEEL +0 -0
sibi_dst/utils/__init__.py
CHANGED
@@ -14,7 +14,7 @@ from .airflow_manager import AirflowDAGManager
 from .credentials import *
 from .update_planner import UpdatePlanner
 from .data_wrapper import DataWrapper
-from .storage_config import StorageConfig
+from .storage_config import StorageConfig, FsRegistry
 from .data_from_http_source import DataFromHttpSource
 from .webdav_client import WebDAVClient
 from .manifest_manager import MissingManifestManager
@@ -37,7 +37,9 @@ __all__ = [
     "ClickHouseWriter",
     "AirflowDAGManager",
     "StorageConfig",
+    "FsRegistry",
     "DataFromHttpSource",
     "WebDAVClient",
     "MissingManifestManager",
 ]
+

sibi_dst/utils/manifest_manager.py
CHANGED
@@ -2,6 +2,9 @@ import pandas as pd
 import fsspec
 import threading
 import uuid
+import hashlib
+import base64
+import json
 from typing import List, Optional, Set, Dict, Any

 from sibi_dst.utils import Logger
@@ -13,11 +16,11 @@ class MissingManifestManager:
     """

     def __init__(
-            self,
-            fs: fsspec.AbstractFileSystem,
-            manifest_path: str,
-            clear_existing: bool = False,
-            **kwargs: Any,
+        self,
+        fs: fsspec.AbstractFileSystem,
+        manifest_path: str,
+        clear_existing: bool = False,
+        **kwargs: Any,
     ):
         self.fs = fs
         self.manifest_path = manifest_path.rstrip("/")
@@ -30,29 +33,18 @@ class MissingManifestManager:
         )
         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)

-        # In-memory list for new paths
         self._new_records: List[Dict[str, str]] = []
-        # Cached set of existing paths
         self._loaded_paths: Optional[Set[str]] = None
-
-        # Use a reentrant lock so save() can call load_existing() safely
         self._lock = threading.RLock()

     def _safe_exists(self, path: str) -> bool:
         try:
             return self.fs.exists(path)
-        except PermissionError:
-            if self.debug:
-                self.logger.debug(f"Permission denied checking existence of '{path}'")
-            return False
         except Exception as e:
             self.logger.warning(f"Error checking existence of '{path}': {e}")
             return False

     def load_existing(self) -> Set[str]:
-        """
-        Load and cache existing manifest paths.
-        """
         with self._lock:
             if self._loaded_paths is not None:
                 return self._loaded_paths
@@ -65,8 +57,8 @@ class MissingManifestManager:
                 df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
                 paths = (
                     df.get("path", pd.Series(dtype=str))
-                        .dropna().astype(str)
-                        .loc[lambda s: s.str.strip().astype(bool)]
+                    .dropna().astype(str)
+                    .loc[lambda s: s.str.strip().astype(bool)]
                 )
                 self._loaded_paths = set(paths.tolist())
             except Exception as e:
@@ -76,34 +68,25 @@ class MissingManifestManager:
             return self._loaded_paths

     def record(self, full_path: str) -> None:
-        """
-        Register a missing file path.
-        """
        if not full_path or not isinstance(full_path, str):
            return
        with self._lock:
            self._new_records.append({"path": full_path})

     def save(self) -> None:
-        """
-        Merge new records into the manifest and write it out atomically.
-        """
         with self._lock:
-            # Build DataFrame of new entries
             new_df = pd.DataFrame(self._new_records)
             should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
             if new_df.empty and not should_overwrite:
                 return

-            # Clean new_df
             new_df = (
                 new_df.get("path", pd.Series(dtype=str))
-                    .dropna().astype(str)
-                    .loc[lambda s: s.str.strip().astype(bool)]
-                    .to_frame()
+                .dropna().astype(str)
+                .loc[lambda s: s.str.strip().astype(bool)]
+                .to_frame()
             )

-            # Merge or overwrite
             if should_overwrite:
                 out_df = new_df
             else:
@@ -111,9 +94,9 @@ class MissingManifestManager:
                     old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
                     old_paths = (
                         old_df.get("path", pd.Series(dtype=str))
-                            .dropna().astype(str)
-                            .loc[lambda s: s.str.strip().astype(bool)]
-                            .to_frame()
+                        .dropna().astype(str)
+                        .loc[lambda s: s.str.strip().astype(bool)]
+                        .to_frame()
                     )
                     out_df = pd.concat([old_paths, new_df], ignore_index=True)
                 except Exception as e:
@@ -122,14 +105,12 @@ class MissingManifestManager:

             out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)

-            # Ensure parent dir
             parent = self.manifest_path.rsplit("/", 1)[0]
             try:
                 self.fs.makedirs(parent, exist_ok=True)
             except Exception as e:
                 self.logger.warning(f"Could not create manifest directory '{parent}': {e}")

-            # Write atomically: temp file + rename
             temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
             try:
                 out_df.to_parquet(
@@ -137,18 +118,369 @@ class MissingManifestManager:
                     filesystem=self.fs,
                     index=False
                 )
-
-                self.fs.mv(temp_path, self.manifest_path, recursive=False)
+                self.fs.copy(temp_path, self.manifest_path)
+                self.logger.info(f"Copied manifest to {self.manifest_path} (temp: {temp_path})")
             except Exception as e:
-                self.logger.error(f"Failed to write or rename manifest: {e}")
-                # Clean up temp if it exists
-                try:
-                    if self.fs.exists(temp_path):
-                        self.fs.rm(temp_path, recursive=True)
-                except Exception:
-                    pass
+                self.logger.error(f"Failed to write or copy manifest: {e}")
                 raise

-
+            self.logger.debug(f"Temp file left behind: {temp_path}")
             self._new_records.clear()
-            self._loaded_paths = set(out_df["path"].tolist())
+            self._loaded_paths = set(out_df["path"].tolist())
+
+    def cleanup_temp_manifests(self) -> None:
+        if not hasattr(self.fs, "s3"):
+            self.logger.info("Filesystem is not s3fs; skipping temp cleanup.")
+            return
+
+        try:
+            bucket, prefix = self._parse_s3_path(self.manifest_path.rsplit("/", 1)[0])
+            files = self.fs.ls(f"s3://{bucket}/{prefix}", detail=True)
+            temp_files = [
+                f for f in files
+                if f["name"].endswith(".parquet") and ".tmp-" in f["name"]
+            ]
+            if not temp_files:
+                return
+
+            objects = [{"Key": f["name"].replace(f"{bucket}/", "", 1)} for f in temp_files]
+            delete_payload = {
+                "Objects": objects,
+                "Quiet": True
+            }
+
+            json_payload = json.dumps(delete_payload).encode("utf-8")
+            content_md5 = base64.b64encode(hashlib.md5(json_payload).digest()).decode("utf-8")
+
+            self.fs.s3.meta.client.delete_objects(
+                Bucket=bucket,
+                Delete=delete_payload,
+                ContentMD5=content_md5
+            )
+            self.logger.info(f"Deleted {len(objects)} temp manifest files in s3://{bucket}/{prefix}")
+        except Exception as e:
+            self.logger.error(f"Failed to cleanup temp manifest files: {e}")
+
+    @staticmethod
+    def _parse_s3_path(s3_path: str):
+        if not s3_path.startswith("s3://"):
+            raise ValueError("Invalid S3 path. Must start with 's3://'.")
+        path_parts = s3_path[5:].split("/", 1)
+        bucket_name = path_parts[0]
+        prefix = path_parts[1] if len(path_parts) > 1 else ""
+        return bucket_name, prefix
+
+# import pandas as pd
+# import fsspec
+# import threading
+# import uuid
+# from typing import List, Optional, Set, Dict, Any
+#
+# from sibi_dst.utils import Logger
+#
+#
+# class MissingManifestManager:
+#     """
+#     Thread-safe manager for a “missing-partitions” manifest (Parquet file).
+#     """
+#
+#     def __init__(
+#             self,
+#             fs: fsspec.AbstractFileSystem,
+#             manifest_path: str,
+#             clear_existing: bool = False,
+#             **kwargs: Any,
+#     ):
+#         self.fs = fs
+#         self.manifest_path = manifest_path.rstrip("/")
+#         self.clear_existing = clear_existing
+#
+#         self.debug: bool = kwargs.get("debug", False)
+#         self.logger = kwargs.get(
+#             "logger",
+#             Logger.default_logger(logger_name="missing_manifest_manager")
+#         )
+#         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+#
+#         # In-memory list for new paths
+#         self._new_records: List[Dict[str, str]] = []
+#         # Cached set of existing paths
+#         self._loaded_paths: Optional[Set[str]] = None
+#
+#         # Use a reentrant lock so save() can call load_existing() safely
+#         self._lock = threading.RLock()
+#
+#     def _safe_exists(self, path: str) -> bool:
+#         try:
+#             return self.fs.exists(path)
+#         except PermissionError:
+#             if self.debug:
+#                 self.logger.debug(f"Permission denied checking existence of '{path}'")
+#             return False
+#         except Exception as e:
+#             self.logger.warning(f"Error checking existence of '{path}': {e}")
+#             return False
+#
+#     def load_existing(self) -> Set[str]:
+#         """
+#         Load and cache existing manifest paths.
+#         """
+#         with self._lock:
+#             if self._loaded_paths is not None:
+#                 return self._loaded_paths
+#
+#             if not self._safe_exists(self.manifest_path):
+#                 self._loaded_paths = set()
+#                 return self._loaded_paths
+#
+#             try:
+#                 df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
+#                 paths = (
+#                     df.get("path", pd.Series(dtype=str))
+#                     .dropna().astype(str)
+#                     .loc[lambda s: s.str.strip().astype(bool)]
+#                 )
+#                 self._loaded_paths = set(paths.tolist())
+#             except Exception as e:
+#                 self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
+#                 self._loaded_paths = set()
+#
+#             return self._loaded_paths
+#
+#     def record(self, full_path: str) -> None:
+#         """
+#         Register a missing file path.
+#         """
+#         if not full_path or not isinstance(full_path, str):
+#             return
+#         with self._lock:
+#             self._new_records.append({"path": full_path})
+#
+#     def save(self) -> None:
+#         """
+#         Merge new records into the manifest and write it out atomically.
+#         """
+#         with self._lock:
+#             # Build DataFrame of new entries
+#             new_df = pd.DataFrame(self._new_records)
+#             should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
+#             if new_df.empty and not should_overwrite:
+#                 return
+#
+#             # Clean new_df
+#             new_df = (
+#                 new_df.get("path", pd.Series(dtype=str))
+#                 .dropna().astype(str)
+#                 .loc[lambda s: s.str.strip().astype(bool)]
+#                 .to_frame()
+#             )
+#
+#             # Merge or overwrite
+#             if should_overwrite:
+#                 out_df = new_df
+#             else:
+#                 try:
+#                     old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
+#                     old_paths = (
+#                         old_df.get("path", pd.Series(dtype=str))
+#                         .dropna().astype(str)
+#                         .loc[lambda s: s.str.strip().astype(bool)]
+#                         .to_frame()
+#                     )
+#                     out_df = pd.concat([old_paths, new_df], ignore_index=True)
+#                 except Exception as e:
+#                     self.logger.warning(f"Could not merge manifest, overwriting: {e}")
+#                     out_df = new_df
+#
+#             out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
+#
+#             # Ensure parent dir
+#             parent = self.manifest_path.rsplit("/", 1)[0]
+#             try:
+#                 self.fs.makedirs(parent, exist_ok=True)
+#             except Exception as e:
+#                 self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
+#
+#             # Write atomically: temp file + rename
+#             temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
+#             try:
+#                 out_df.to_parquet(
+#                     temp_path,
+#                     filesystem=self.fs,
+#                     index=False
+#                 )
+#                 # rename into place (atomic in most filesystems)
+#                 #self.fs.mv(temp_path, self.manifest_path, recursive=False)
+#                 try:
+#                     self.fs.copy(temp_path, self.manifest_path)
+#                     self.fs.rm(temp_path)
+#                 except Exception as e:
+#                     self.logger.error(f"Failed to copy or delete manifest: {e}")
+#                     raise
+#             except Exception as e:
+#                 self.logger.error(f"Failed to write or rename manifest: {e}")
+#                 # Clean up temp if it exists
+#                 try:
+#                     if self.fs.exists(temp_path):
+#                         self.fs.rm(temp_path, recursive=True)
+#                 except Exception:
+#                     pass
+#                 raise
+#
+#             # Reset memory & cache
+#             self._new_records.clear()
+#             self._loaded_paths = set(out_df["path"].tolist())
+# import pandas as pd
+# import fsspec
+# import threading
+# import uuid
+# from typing import List, Optional, Set, Dict, Any
+#
+# from sibi_dst.utils import Logger
+#
+#
+# class MissingManifestManager:
+#     """
+#     Thread-safe manager for a “missing-partitions” manifest (Parquet file).
+#     """
+#
+#     def __init__(
+#             self,
+#             fs: fsspec.AbstractFileSystem,
+#             manifest_path: str,
+#             clear_existing: bool = False,
+#             **kwargs: Any,
+#     ):
+#         self.fs = fs
+#         self.manifest_path = manifest_path.rstrip("/")
+#         self.clear_existing = clear_existing
+#
+#         self.debug: bool = kwargs.get("debug", False)
+#         self.logger = kwargs.get(
+#             "logger",
+#             Logger.default_logger(logger_name="missing_manifest_manager")
+#         )
+#         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+#
+#         # In-memory list for new paths
+#         self._new_records: List[Dict[str, str]] = []
+#         # Cached set of existing paths
+#         self._loaded_paths: Optional[Set[str]] = None
+#
+#         # Use a reentrant lock so save() can call load_existing() safely
+#         self._lock = threading.RLock()
+#
+#     def _safe_exists(self, path: str) -> bool:
+#         try:
+#             return self.fs.exists(path)
+#         except PermissionError:
+#             if self.debug:
+#                 self.logger.debug(f"Permission denied checking existence of '{path}'")
+#             return False
+#         except Exception as e:
+#             self.logger.warning(f"Error checking existence of '{path}': {e}")
+#             return False
+#
+#     def load_existing(self) -> Set[str]:
+#         """
+#         Load and cache existing manifest paths.
+#         """
+#         with self._lock:
+#             if self._loaded_paths is not None:
+#                 return self._loaded_paths
+#
+#             if not self._safe_exists(self.manifest_path):
+#                 self._loaded_paths = set()
+#                 return self._loaded_paths
+#
+#             try:
+#                 df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
+#                 paths = (
+#                     df.get("path", pd.Series(dtype=str))
+#                     .dropna().astype(str)
+#                     .loc[lambda s: s.str.strip().astype(bool)]
+#                 )
+#                 self._loaded_paths = set(paths.tolist())
+#             except Exception as e:
+#                 self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
+#                 self._loaded_paths = set()
+#
+#             return self._loaded_paths
+#
+#     def record(self, full_path: str) -> None:
+#         """
+#         Register a missing file path.
+#         """
+#         if not full_path or not isinstance(full_path, str):
+#             return
+#         with self._lock:
+#             self._new_records.append({"path": full_path})
+#
+#     def save(self) -> None:
+#         """
+#         Merge new records into the manifest and write it out atomically.
+#         """
+#         with self._lock:
+#             # Build DataFrame of new entries
+#             new_df = pd.DataFrame(self._new_records)
+#             should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
+#             if new_df.empty and not should_overwrite:
+#                 return
+#
+#             # Clean new_df
+#             new_df = (
+#                 new_df.get("path", pd.Series(dtype=str))
+#                 .dropna().astype(str)
+#                 .loc[lambda s: s.str.strip().astype(bool)]
+#                 .to_frame()
+#             )
+#
+#             # Merge or overwrite
+#             if should_overwrite:
+#                 out_df = new_df
+#             else:
+#                 try:
+#                     old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
+#                     old_paths = (
+#                         old_df.get("path", pd.Series(dtype=str))
+#                         .dropna().astype(str)
+#                         .loc[lambda s: s.str.strip().astype(bool)]
+#                         .to_frame()
+#                     )
+#                     out_df = pd.concat([old_paths, new_df], ignore_index=True)
+#                 except Exception as e:
+#                     self.logger.warning(f"Could not merge manifest, overwriting: {e}")
+#                     out_df = new_df
+#
+#             out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
+#
+#             # Ensure parent dir
+#             parent = self.manifest_path.rsplit("/", 1)[0]
+#             try:
+#                 self.fs.makedirs(parent, exist_ok=True)
+#             except Exception as e:
+#                 self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
+#
+#             # Write atomically: temp file + rename
+#             temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
+#             try:
+#                 out_df.to_parquet(
+#                     temp_path,
+#                     filesystem=self.fs,
+#                     index=False
+#                 )
+#                 # rename into place (atomic in most filesystems)
+#                 self.fs.mv(temp_path, self.manifest_path, recursive=False)
+#             except Exception as e:
+#                 self.logger.error(f"Failed to write or rename manifest: {e}")
+#                 # Clean up temp if it exists
+#                 try:
+#                     if self.fs.exists(temp_path):
+#                         self.fs.rm(temp_path, recursive=True)
+#                 except Exception:
+#                     pass
+#                 raise
+#
+#             # Reset memory & cache
+#             self._new_records.clear()
+#             self._loaded_paths = set(out_df["path"].tolist())
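
Note: the net effect of the manifest_manager.py changes is that save() no longer renames the temp file into place; it writes a .tmp-<uuid> Parquet file, copies it over the manifest, and leaves the temp file behind for a later bulk cleanup pass. A minimal usage sketch of the new surface (the filesystem choice and paths below are illustrative, not from the package docs):

    import fsspec
    from sibi_dst.utils import MissingManifestManager

    fs = fsspec.filesystem("file")  # any fsspec filesystem; s3fs in production
    manager = MissingManifestManager(
        fs=fs,
        manifest_path="/tmp/manifests/missing.parquet",  # hypothetical path
        clear_existing=False,
        debug=True,
    )

    known = manager.load_existing()                # cached Set[str] of recorded paths
    manager.record("/data/partition=2024-01-01/")  # queue a missing-partition path
    manager.save()                                 # temp write + copy; temp file is left behind
    manager.cleanup_temp_manifests()               # bulk-deletes .tmp-* files (s3fs only)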
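Note: the new hashlib/base64/json imports exist to build the Content-MD5 header that S3's DeleteObjects API requires over the request payload. A standalone sketch of the same computation as in cleanup_temp_manifests() above (the bucket and key are illustrative; boto3 can also compute this header itself):

    import base64
    import hashlib
    import json

    delete_payload = {"Objects": [{"Key": "manifests/missing.parquet.tmp-abc123"}], "Quiet": True}
    body = json.dumps(delete_payload).encode("utf-8")
    content_md5 = base64.b64encode(hashlib.md5(body).digest()).decode("utf-8")
    # then: client.delete_objects(Bucket="my-bucket", Delete=delete_payload, ContentMD5=content_md5)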
sibi_dst/utils/storage_config.py
CHANGED
@@ -1,3 +1,7 @@
+from threading import RLock
+from typing import Dict, Callable, Any
+
+from sibi_dst.utils import Logger
 from .storage_manager import StorageManager
 from .credentials import ConfigManager
 
@@ -46,4 +50,58 @@ class StorageConfig:
         # defaulting to local filesystem
         self.filesystem_type = 'file'
         self.filesystem_options = {}
-        self.filesystem_options = {k: v for k, v in self.filesystem_options.items() if v}
+        self.filesystem_options = {k: v for k, v in self.filesystem_options.items() if v}
+
+class FsRegistry:
+    def __init__(self, debug: bool = False, logger: Logger = None):
+        self._storage_registry: Dict[str, Callable[[], Any]] = {}
+        self._fs_instance_cache: Dict[str, object] = {}
+        self._lock = RLock()
+        self.debug = debug
+
+        if logger:
+            self.logger = logger
+        else:
+            self.logger = Logger.default_logger(logger_name="FsRegistry")
+        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+
+    def register(self, name: str, manager: Any):
+        """
+        Registers a filesystem manager instance with a name.
+        :param name: Name of the filesystem instance.
+        :param manager: Filesystem manager instance to register.
+        """
+        if not hasattr(manager, 'get_fs_instance'):
+            raise TypeError("Manager must have a 'get_fs_instance' method.")
+        self._storage_registry[name] = lambda: manager
+
+
+    def get_fs_instance(self, name: str = 'source') -> object:
+        """
+        Retrieve a filesystem instance from a registered storage manager.
+        Caches instances per name.
+        """
+        if name in self._fs_instance_cache:
+            return self._fs_instance_cache[name]
+
+        if name not in self._storage_registry:
+            raise ValueError(f"Storage '{name}' has not been registered.")
+
+        manager = self._storage_registry[name]()
+        fs = manager.get_fs_instance()
+        self._fs_instance_cache[name] = fs
+        return fs
+
+    def unregister_fs(self, name: str):
+        """
+        Unregister a storage and clear its cached fs instance.
+        """
+        self._storage_registry.pop(name, None)
+        self._fs_instance_cache.pop(name, None)
+
+
+    def clear_fs_cache(self):
+        """
+        Clear all cached fs instances.
+        """
+        self._fs_instance_cache.clear()
{sibi_dst-0.3.59.dist-info → sibi_dst-0.3.61.dist-info}/RECORD
CHANGED
@@ -38,7 +38,7 @@ sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=UAiijn-J-jjX4YnL0_P9SFqT
 sibi_dst/osmnx_helper/utils.py,sha256=BzuY8CtYnBAAO8UAr_M7EOk6CP1zcifNLs8pkdFZEFg,20577
 sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
-sibi_dst/utils/__init__.py,sha256=
+sibi_dst/utils/__init__.py,sha256=w0_q4rl3yD7x1Q5yWxH-GN_3Ju1XlebIzm3nJdrUeGE,1234
 sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWdQv10,7526
 sibi_dst/utils/clickhouse_writer.py,sha256=iAUe4_Kn2WR1xZjpLW2FOWCWfOTw6fCGMTUcWxIQJ60,9877
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
@@ -50,10 +50,10 @@ sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11
 sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
 sibi_dst/utils/filepath_generator.py,sha256=-HHO0U-PR8fysDDFwnWdHRlgqksh_RkmgBZLWv9hM7s,6669
 sibi_dst/utils/log_utils.py,sha256=77xACRagKU83H9vn7aVeBzkQjxWlbe4dg4KuxPRCgvw,4635
-sibi_dst/utils/manifest_manager.py,sha256=
+sibi_dst/utils/manifest_manager.py,sha256=eyk6Dvrn86gUpAaAsnQvNnEJn5-Tno-sDDJsDMfHtTA,18161
 sibi_dst/utils/parquet_saver.py,sha256=O62xwPfphOpKgEiHqnts20CPSU96pxs49Cg7PVetLK0,8193
 sibi_dst/utils/phone_formatter.py,sha256=tsVTDamuthFYgy4-5UwmQkPQ-FGTGH7MjZyH8utAkIY,4945
-sibi_dst/utils/storage_config.py,sha256=
+sibi_dst/utils/storage_config.py,sha256=TE15H-7d0mqwYPSUgrdidK9U7N7p87Z8JfUQH4-jdPs,4123
 sibi_dst/utils/storage_manager.py,sha256=btecX7ggNb7rfu5EK9Xuu2q_FZA7r_rB_tfhQ8V96qc,6567
 sibi_dst/utils/update_planner.py,sha256=dJXLC-KdbWrCs-MFe7Xa8F-ZhlNJq8P1szjLAzMJZk0,9684
 sibi_dst/utils/webdav_client.py,sha256=pYF1UsGOuxYeGLq7aBfwZFvkvD4meOcbbaiZ4d6GW9I,7107
@@ -77,6 +77,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-0.3.59.dist-info/METADATA,sha256=
-sibi_dst-0.3.59.dist-info/WHEEL,sha256=
-sibi_dst-0.3.59.dist-info/RECORD,,
+sibi_dst-0.3.61.dist-info/METADATA,sha256=GZ-Yz9oiehgGgI2iJoCejdExgtclAlaz-N-sI5hGIi0,4292
+sibi_dst-0.3.61.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-0.3.61.dist-info/RECORD,,

{sibi_dst-0.3.59.dist-info → sibi_dst-0.3.61.dist-info}/WHEEL
File without changes
|