sibi-dst 0.3.59__tar.gz → 0.3.61__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/PKG-INFO +1 -1
  2. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/pyproject.toml +1 -1
  3. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/__init__.py +3 -1
  4. sibi_dst-0.3.61/sibi_dst/utils/manifest_manager.py +486 -0
  5. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/storage_config.py +59 -1
  6. sibi_dst-0.3.59/sibi_dst/utils/manifest_manager.py +0 -154
  7. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/README.md +0 -0
  8. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/__init__.py +0 -0
  9. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/__init__.py +0 -0
  10. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -0
  11. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/_df_helper.py +0 -0
  12. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  13. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  14. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/__init__.py +0 -0
  15. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  16. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
  17. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  18. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
  19. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
  20. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  21. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  22. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  23. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  24. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  25. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  26. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  27. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
  28. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  29. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  30. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  31. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/core/__init__.py +0 -0
  32. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/core/_defaults.py +0 -0
  33. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  34. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/core/_params_config.py +0 -0
  35. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/core/_query_config.py +0 -0
  36. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/df_helper/data_cleaner.py +0 -0
  37. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/geopy_helper/__init__.py +0 -0
  38. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  39. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/geopy_helper/utils.py +0 -0
  40. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/osmnx_helper/__init__.py +0 -0
  41. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  42. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  43. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  44. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  45. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/osmnx_helper/utils.py +0 -0
  46. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/tests/__init__.py +0 -0
  47. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  48. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/airflow_manager.py +0 -0
  49. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/clickhouse_writer.py +0 -0
  50. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/credentials.py +0 -0
  51. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/data_from_http_source.py +0 -0
  52. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/data_utils.py +0 -0
  53. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/data_wrapper.py +0 -0
  54. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/date_utils.py +0 -0
  55. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/df_utils.py +0 -0
  56. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/file_utils.py +0 -0
  57. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/filepath_generator.py +0 -0
  58. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/log_utils.py +0 -0
  59. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/parquet_saver.py +0 -0
  60. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/phone_formatter.py +0 -0
  61. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/storage_manager.py +0 -0
  62. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/update_planner.py +0 -0
  63. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/utils/webdav_client.py +0 -0
  64. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/__init__.py +0 -0
  65. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/__init__.py +0 -0
  66. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  67. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  68. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  69. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  70. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  71. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  72. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  73. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  74. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  75. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  76. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  77. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  78. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  79. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  80. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  81. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  82. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/utils/__init__.py +0 -0
  83. {sibi_dst-0.3.59 → sibi_dst-0.3.61}/sibi_dst/v2/utils/log_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 0.3.59
3
+ Version: 0.3.61
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sibi-dst"
3
- version = "0.3.59"
3
+ version = "0.3.61"
4
4
  description = "Data Science Toolkit"
5
5
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
6
6
  readme = "README.md"
@@ -14,7 +14,7 @@ from .airflow_manager import AirflowDAGManager
14
14
  from .credentials import *
15
15
  from .update_planner import UpdatePlanner
16
16
  from .data_wrapper import DataWrapper
17
- from .storage_config import StorageConfig
17
+ from .storage_config import StorageConfig, FsRegistry
18
18
  from .data_from_http_source import DataFromHttpSource
19
19
  from .webdav_client import WebDAVClient
20
20
  from .manifest_manager import MissingManifestManager
@@ -37,7 +37,9 @@ __all__ = [
37
37
  "ClickHouseWriter",
38
38
  "AirflowDAGManager",
39
39
  "StorageConfig",
40
+ "FsRegistry",
40
41
  "DataFromHttpSource",
41
42
  "WebDAVClient",
42
43
  "MissingManifestManager",
43
44
  ]
45
+
@@ -0,0 +1,486 @@
1
+ import pandas as pd
2
+ import fsspec
3
+ import threading
4
+ import uuid
5
+ import hashlib
6
+ import base64
7
+ import json
8
+ from typing import List, Optional, Set, Dict, Any
9
+
10
+ from sibi_dst.utils import Logger
11
+
12
+
13
+ class MissingManifestManager:
14
+ """
15
+ Thread-safe manager for a “missing-partitions” manifest (Parquet file).
16
+ """
17
+
18
+ def __init__(
19
+ self,
20
+ fs: fsspec.AbstractFileSystem,
21
+ manifest_path: str,
22
+ clear_existing: bool = False,
23
+ **kwargs: Any,
24
+ ):
25
+ self.fs = fs
26
+ self.manifest_path = manifest_path.rstrip("/")
27
+ self.clear_existing = clear_existing
28
+
29
+ self.debug: bool = kwargs.get("debug", False)
30
+ self.logger = kwargs.get(
31
+ "logger",
32
+ Logger.default_logger(logger_name="missing_manifest_manager")
33
+ )
34
+ self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
35
+
36
+ self._new_records: List[Dict[str, str]] = []
37
+ self._loaded_paths: Optional[Set[str]] = None
38
+ self._lock = threading.RLock()
39
+
40
+ def _safe_exists(self, path: str) -> bool:
41
+ try:
42
+ return self.fs.exists(path)
43
+ except Exception as e:
44
+ self.logger.warning(f"Error checking existence of '{path}': {e}")
45
+ return False
46
+
47
+ def load_existing(self) -> Set[str]:
48
+ with self._lock:
49
+ if self._loaded_paths is not None:
50
+ return self._loaded_paths
51
+
52
+ if not self._safe_exists(self.manifest_path):
53
+ self._loaded_paths = set()
54
+ return self._loaded_paths
55
+
56
+ try:
57
+ df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
58
+ paths = (
59
+ df.get("path", pd.Series(dtype=str))
60
+ .dropna().astype(str)
61
+ .loc[lambda s: s.str.strip().astype(bool)]
62
+ )
63
+ self._loaded_paths = set(paths.tolist())
64
+ except Exception as e:
65
+ self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
66
+ self._loaded_paths = set()
67
+
68
+ return self._loaded_paths
69
+
70
+ def record(self, full_path: str) -> None:
71
+ if not full_path or not isinstance(full_path, str):
72
+ return
73
+ with self._lock:
74
+ self._new_records.append({"path": full_path})
75
+
76
+ def save(self) -> None:
77
+ with self._lock:
78
+ new_df = pd.DataFrame(self._new_records)
79
+ should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
80
+ if new_df.empty and not should_overwrite:
81
+ return
82
+
83
+ new_df = (
84
+ new_df.get("path", pd.Series(dtype=str))
85
+ .dropna().astype(str)
86
+ .loc[lambda s: s.str.strip().astype(bool)]
87
+ .to_frame()
88
+ )
89
+
90
+ if should_overwrite:
91
+ out_df = new_df
92
+ else:
93
+ try:
94
+ old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
95
+ old_paths = (
96
+ old_df.get("path", pd.Series(dtype=str))
97
+ .dropna().astype(str)
98
+ .loc[lambda s: s.str.strip().astype(bool)]
99
+ .to_frame()
100
+ )
101
+ out_df = pd.concat([old_paths, new_df], ignore_index=True)
102
+ except Exception as e:
103
+ self.logger.warning(f"Could not merge manifest, overwriting: {e}")
104
+ out_df = new_df
105
+
106
+ out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
107
+
108
+ parent = self.manifest_path.rsplit("/", 1)[0]
109
+ try:
110
+ self.fs.makedirs(parent, exist_ok=True)
111
+ except Exception as e:
112
+ self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
113
+
114
+ temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
115
+ try:
116
+ out_df.to_parquet(
117
+ temp_path,
118
+ filesystem=self.fs,
119
+ index=False
120
+ )
121
+ self.fs.copy(temp_path, self.manifest_path)
122
+ self.logger.info(f"Copied manifest to {self.manifest_path} (temp: {temp_path})")
123
+ except Exception as e:
124
+ self.logger.error(f"Failed to write or copy manifest: {e}")
125
+ raise
126
+
127
+ self.logger.debug(f"Temp file left behind: {temp_path}")
128
+ self._new_records.clear()
129
+ self._loaded_paths = set(out_df["path"].tolist())
130
+
131
+ def cleanup_temp_manifests(self) -> None:
132
+ if not hasattr(self.fs, "s3"):
133
+ self.logger.info("Filesystem is not s3fs; skipping temp cleanup.")
134
+ return
135
+
136
+ try:
137
+ bucket, prefix = self._parse_s3_path(self.manifest_path.rsplit("/", 1)[0])
138
+ files = self.fs.ls(f"s3://{bucket}/{prefix}", detail=True)
139
+ temp_files = [
140
+ f for f in files
141
+ if f["name"].endswith(".parquet") and ".tmp-" in f["name"]
142
+ ]
143
+ if not temp_files:
144
+ return
145
+
146
+ objects = [{"Key": f["name"].replace(f"{bucket}/", "", 1)} for f in temp_files]
147
+ delete_payload = {
148
+ "Objects": objects,
149
+ "Quiet": True
150
+ }
151
+
152
+ json_payload = json.dumps(delete_payload).encode("utf-8")
153
+ content_md5 = base64.b64encode(hashlib.md5(json_payload).digest()).decode("utf-8")
154
+
155
+ self.fs.s3.meta.client.delete_objects(
156
+ Bucket=bucket,
157
+ Delete=delete_payload,
158
+ ContentMD5=content_md5
159
+ )
160
+ self.logger.info(f"Deleted {len(objects)} temp manifest files in s3://{bucket}/{prefix}")
161
+ except Exception as e:
162
+ self.logger.error(f"Failed to cleanup temp manifest files: {e}")
163
+
164
+ @staticmethod
165
+ def _parse_s3_path(s3_path: str):
166
+ if not s3_path.startswith("s3://"):
167
+ raise ValueError("Invalid S3 path. Must start with 's3://'.")
168
+ path_parts = s3_path[5:].split("/", 1)
169
+ bucket_name = path_parts[0]
170
+ prefix = path_parts[1] if len(path_parts) > 1 else ""
171
+ return bucket_name, prefix
172
+
173
+ # import pandas as pd
174
+ # import fsspec
175
+ # import threading
176
+ # import uuid
177
+ # from typing import List, Optional, Set, Dict, Any
178
+ #
179
+ # from sibi_dst.utils import Logger
180
+ #
181
+ #
182
+ # class MissingManifestManager:
183
+ # """
184
+ # Thread-safe manager for a “missing-partitions” manifest (Parquet file).
185
+ # """
186
+ #
187
+ # def __init__(
188
+ # self,
189
+ # fs: fsspec.AbstractFileSystem,
190
+ # manifest_path: str,
191
+ # clear_existing: bool = False,
192
+ # **kwargs: Any,
193
+ # ):
194
+ # self.fs = fs
195
+ # self.manifest_path = manifest_path.rstrip("/")
196
+ # self.clear_existing = clear_existing
197
+ #
198
+ # self.debug: bool = kwargs.get("debug", False)
199
+ # self.logger = kwargs.get(
200
+ # "logger",
201
+ # Logger.default_logger(logger_name="missing_manifest_manager")
202
+ # )
203
+ # self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
204
+ #
205
+ # # In-memory list for new paths
206
+ # self._new_records: List[Dict[str, str]] = []
207
+ # # Cached set of existing paths
208
+ # self._loaded_paths: Optional[Set[str]] = None
209
+ #
210
+ # # Use a reentrant lock so save() can call load_existing() safely
211
+ # self._lock = threading.RLock()
212
+ #
213
+ # def _safe_exists(self, path: str) -> bool:
214
+ # try:
215
+ # return self.fs.exists(path)
216
+ # except PermissionError:
217
+ # if self.debug:
218
+ # self.logger.debug(f"Permission denied checking existence of '{path}'")
219
+ # return False
220
+ # except Exception as e:
221
+ # self.logger.warning(f"Error checking existence of '{path}': {e}")
222
+ # return False
223
+ #
224
+ # def load_existing(self) -> Set[str]:
225
+ # """
226
+ # Load and cache existing manifest paths.
227
+ # """
228
+ # with self._lock:
229
+ # if self._loaded_paths is not None:
230
+ # return self._loaded_paths
231
+ #
232
+ # if not self._safe_exists(self.manifest_path):
233
+ # self._loaded_paths = set()
234
+ # return self._loaded_paths
235
+ #
236
+ # try:
237
+ # df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
238
+ # paths = (
239
+ # df.get("path", pd.Series(dtype=str))
240
+ # .dropna().astype(str)
241
+ # .loc[lambda s: s.str.strip().astype(bool)]
242
+ # )
243
+ # self._loaded_paths = set(paths.tolist())
244
+ # except Exception as e:
245
+ # self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
246
+ # self._loaded_paths = set()
247
+ #
248
+ # return self._loaded_paths
249
+ #
250
+ # def record(self, full_path: str) -> None:
251
+ # """
252
+ # Register a missing file path.
253
+ # """
254
+ # if not full_path or not isinstance(full_path, str):
255
+ # return
256
+ # with self._lock:
257
+ # self._new_records.append({"path": full_path})
258
+ #
259
+ # def save(self) -> None:
260
+ # """
261
+ # Merge new records into the manifest and write it out atomically.
262
+ # """
263
+ # with self._lock:
264
+ # # Build DataFrame of new entries
265
+ # new_df = pd.DataFrame(self._new_records)
266
+ # should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
267
+ # if new_df.empty and not should_overwrite:
268
+ # return
269
+ #
270
+ # # Clean new_df
271
+ # new_df = (
272
+ # new_df.get("path", pd.Series(dtype=str))
273
+ # .dropna().astype(str)
274
+ # .loc[lambda s: s.str.strip().astype(bool)]
275
+ # .to_frame()
276
+ # )
277
+ #
278
+ # # Merge or overwrite
279
+ # if should_overwrite:
280
+ # out_df = new_df
281
+ # else:
282
+ # try:
283
+ # old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
284
+ # old_paths = (
285
+ # old_df.get("path", pd.Series(dtype=str))
286
+ # .dropna().astype(str)
287
+ # .loc[lambda s: s.str.strip().astype(bool)]
288
+ # .to_frame()
289
+ # )
290
+ # out_df = pd.concat([old_paths, new_df], ignore_index=True)
291
+ # except Exception as e:
292
+ # self.logger.warning(f"Could not merge manifest, overwriting: {e}")
293
+ # out_df = new_df
294
+ #
295
+ # out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
296
+ #
297
+ # # Ensure parent dir
298
+ # parent = self.manifest_path.rsplit("/", 1)[0]
299
+ # try:
300
+ # self.fs.makedirs(parent, exist_ok=True)
301
+ # except Exception as e:
302
+ # self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
303
+ #
304
+ # # Write atomically: temp file + rename
305
+ # temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
306
+ # try:
307
+ # out_df.to_parquet(
308
+ # temp_path,
309
+ # filesystem=self.fs,
310
+ # index=False
311
+ # )
312
+ # # rename into place (atomic in most filesystems)
313
+ # #self.fs.mv(temp_path, self.manifest_path, recursive=False)
314
+ # try:
315
+ # self.fs.copy(temp_path, self.manifest_path)
316
+ # self.fs.rm(temp_path)
317
+ # except Exception as e:
318
+ # self.logger.error(f"Failed to copy or delete manifest: {e}")
319
+ # raise
320
+ # except Exception as e:
321
+ # self.logger.error(f"Failed to write or rename manifest: {e}")
322
+ # # Clean up temp if it exists
323
+ # try:
324
+ # if self.fs.exists(temp_path):
325
+ # self.fs.rm(temp_path, recursive=True)
326
+ # except Exception:
327
+ # pass
328
+ # raise
329
+ #
330
+ # # Reset memory & cache
331
+ # self._new_records.clear()
332
+ # self._loaded_paths = set(out_df["path"].tolist())
333
+ # import pandas as pd
334
+ # import fsspec
335
+ # import threading
336
+ # import uuid
337
+ # from typing import List, Optional, Set, Dict, Any
338
+ #
339
+ # from sibi_dst.utils import Logger
340
+ #
341
+ #
342
+ # class MissingManifestManager:
343
+ # """
344
+ # Thread-safe manager for a “missing-partitions” manifest (Parquet file).
345
+ # """
346
+ #
347
+ # def __init__(
348
+ # self,
349
+ # fs: fsspec.AbstractFileSystem,
350
+ # manifest_path: str,
351
+ # clear_existing: bool = False,
352
+ # **kwargs: Any,
353
+ # ):
354
+ # self.fs = fs
355
+ # self.manifest_path = manifest_path.rstrip("/")
356
+ # self.clear_existing = clear_existing
357
+ #
358
+ # self.debug: bool = kwargs.get("debug", False)
359
+ # self.logger = kwargs.get(
360
+ # "logger",
361
+ # Logger.default_logger(logger_name="missing_manifest_manager")
362
+ # )
363
+ # self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
364
+ #
365
+ # # In-memory list for new paths
366
+ # self._new_records: List[Dict[str, str]] = []
367
+ # # Cached set of existing paths
368
+ # self._loaded_paths: Optional[Set[str]] = None
369
+ #
370
+ # # Use a reentrant lock so save() can call load_existing() safely
371
+ # self._lock = threading.RLock()
372
+ #
373
+ # def _safe_exists(self, path: str) -> bool:
374
+ # try:
375
+ # return self.fs.exists(path)
376
+ # except PermissionError:
377
+ # if self.debug:
378
+ # self.logger.debug(f"Permission denied checking existence of '{path}'")
379
+ # return False
380
+ # except Exception as e:
381
+ # self.logger.warning(f"Error checking existence of '{path}': {e}")
382
+ # return False
383
+ #
384
+ # def load_existing(self) -> Set[str]:
385
+ # """
386
+ # Load and cache existing manifest paths.
387
+ # """
388
+ # with self._lock:
389
+ # if self._loaded_paths is not None:
390
+ # return self._loaded_paths
391
+ #
392
+ # if not self._safe_exists(self.manifest_path):
393
+ # self._loaded_paths = set()
394
+ # return self._loaded_paths
395
+ #
396
+ # try:
397
+ # df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
398
+ # paths = (
399
+ # df.get("path", pd.Series(dtype=str))
400
+ # .dropna().astype(str)
401
+ # .loc[lambda s: s.str.strip().astype(bool)]
402
+ # )
403
+ # self._loaded_paths = set(paths.tolist())
404
+ # except Exception as e:
405
+ # self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
406
+ # self._loaded_paths = set()
407
+ #
408
+ # return self._loaded_paths
409
+ #
410
+ # def record(self, full_path: str) -> None:
411
+ # """
412
+ # Register a missing file path.
413
+ # """
414
+ # if not full_path or not isinstance(full_path, str):
415
+ # return
416
+ # with self._lock:
417
+ # self._new_records.append({"path": full_path})
418
+ #
419
+ # def save(self) -> None:
420
+ # """
421
+ # Merge new records into the manifest and write it out atomically.
422
+ # """
423
+ # with self._lock:
424
+ # # Build DataFrame of new entries
425
+ # new_df = pd.DataFrame(self._new_records)
426
+ # should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
427
+ # if new_df.empty and not should_overwrite:
428
+ # return
429
+ #
430
+ # # Clean new_df
431
+ # new_df = (
432
+ # new_df.get("path", pd.Series(dtype=str))
433
+ # .dropna().astype(str)
434
+ # .loc[lambda s: s.str.strip().astype(bool)]
435
+ # .to_frame()
436
+ # )
437
+ #
438
+ # # Merge or overwrite
439
+ # if should_overwrite:
440
+ # out_df = new_df
441
+ # else:
442
+ # try:
443
+ # old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
444
+ # old_paths = (
445
+ # old_df.get("path", pd.Series(dtype=str))
446
+ # .dropna().astype(str)
447
+ # .loc[lambda s: s.str.strip().astype(bool)]
448
+ # .to_frame()
449
+ # )
450
+ # out_df = pd.concat([old_paths, new_df], ignore_index=True)
451
+ # except Exception as e:
452
+ # self.logger.warning(f"Could not merge manifest, overwriting: {e}")
453
+ # out_df = new_df
454
+ #
455
+ # out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
456
+ #
457
+ # # Ensure parent dir
458
+ # parent = self.manifest_path.rsplit("/", 1)[0]
459
+ # try:
460
+ # self.fs.makedirs(parent, exist_ok=True)
461
+ # except Exception as e:
462
+ # self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
463
+ #
464
+ # # Write atomically: temp file + rename
465
+ # temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
466
+ # try:
467
+ # out_df.to_parquet(
468
+ # temp_path,
469
+ # filesystem=self.fs,
470
+ # index=False
471
+ # )
472
+ # # rename into place (atomic in most filesystems)
473
+ # self.fs.mv(temp_path, self.manifest_path, recursive=False)
474
+ # except Exception as e:
475
+ # self.logger.error(f"Failed to write or rename manifest: {e}")
476
+ # # Clean up temp if it exists
477
+ # try:
478
+ # if self.fs.exists(temp_path):
479
+ # self.fs.rm(temp_path, recursive=True)
480
+ # except Exception:
481
+ # pass
482
+ # raise
483
+ #
484
+ # # Reset memory & cache
485
+ # self._new_records.clear()
486
+ # self._loaded_paths = set(out_df["path"].tolist())
@@ -1,3 +1,7 @@
1
+ from threading import RLock
2
+ from typing import Dict, Callable, Any
3
+
4
+ from sibi_dst.utils import Logger
1
5
  from .storage_manager import StorageManager
2
6
  from .credentials import ConfigManager
3
7
 
@@ -46,4 +50,58 @@ class StorageConfig:
46
50
  # defaulting to local filesystem
47
51
  self.filesystem_type = 'file'
48
52
  self.filesystem_options = {}
49
- self.filesystem_options = {k: v for k, v in self.filesystem_options.items() if v}
53
+ self.filesystem_options = {k: v for k, v in self.filesystem_options.items() if v}
54
+
55
class FsRegistry:
    """
    Thread-safe registry of named storage managers with per-name caching of
    the filesystem instances they produce.
    """

    def __init__(self, debug: bool = False, logger: "Logger" = None):
        """
        :param debug: When True (and no logger is supplied), the default
            logger is set to DEBUG level.
        :param logger: Optional pre-configured Logger; a default is built
            when omitted.
        """
        self._storage_registry: Dict[str, Callable[[], Any]] = {}
        self._fs_instance_cache: Dict[str, object] = {}
        # Guards both dicts; the original created this lock but never used it.
        self._lock = RLock()
        self.debug = debug

        if logger:
            self.logger = logger
        else:
            self.logger = Logger.default_logger(logger_name="FsRegistry")
            self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)

    def register(self, name: str, manager: Any):
        """
        Registers a filesystem manager instance with a name.
        :param name: Name of the filesystem instance.
        :param manager: Filesystem manager instance to register.
        :raises TypeError: If the manager lacks a 'get_fs_instance' method.
        """
        if not hasattr(manager, 'get_fs_instance'):
            raise TypeError("Manager must have a 'get_fs_instance' method.")
        with self._lock:
            self._storage_registry[name] = lambda: manager

    def get_fs_instance(self, name: str = 'source') -> object:
        """
        Retrieve a filesystem instance from a registered storage manager.
        Caches instances per name.
        :raises ValueError: If no storage was registered under ``name``.
        """
        # Fix: the cache check-then-create was unsynchronized; hold the lock
        # across the whole lookup so concurrent callers cannot race.
        with self._lock:
            if name in self._fs_instance_cache:
                return self._fs_instance_cache[name]

            if name not in self._storage_registry:
                raise ValueError(f"Storage '{name}' has not been registered.")

            manager = self._storage_registry[name]()
            fs = manager.get_fs_instance()
            self._fs_instance_cache[name] = fs
            return fs

    def unregister_fs(self, name: str):
        """
        Unregister a storage and clear its cached fs instance.
        """
        with self._lock:
            self._storage_registry.pop(name, None)
            self._fs_instance_cache.pop(name, None)

    def clear_fs_cache(self):
        """
        Clear all cached fs instances.
        """
        with self._lock:
            self._fs_instance_cache.clear()
@@ -1,154 +0,0 @@
1
- import pandas as pd
2
- import fsspec
3
- import threading
4
- import uuid
5
- from typing import List, Optional, Set, Dict, Any
6
-
7
- from sibi_dst.utils import Logger
8
-
9
-
10
- class MissingManifestManager:
11
- """
12
- Thread-safe manager for a “missing-partitions” manifest (Parquet file).
13
- """
14
-
15
- def __init__(
16
- self,
17
- fs: fsspec.AbstractFileSystem,
18
- manifest_path: str,
19
- clear_existing: bool = False,
20
- **kwargs: Any,
21
- ):
22
- self.fs = fs
23
- self.manifest_path = manifest_path.rstrip("/")
24
- self.clear_existing = clear_existing
25
-
26
- self.debug: bool = kwargs.get("debug", False)
27
- self.logger = kwargs.get(
28
- "logger",
29
- Logger.default_logger(logger_name="missing_manifest_manager")
30
- )
31
- self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
32
-
33
- # In-memory list for new paths
34
- self._new_records: List[Dict[str, str]] = []
35
- # Cached set of existing paths
36
- self._loaded_paths: Optional[Set[str]] = None
37
-
38
- # Use a reentrant lock so save() can call load_existing() safely
39
- self._lock = threading.RLock()
40
-
41
- def _safe_exists(self, path: str) -> bool:
42
- try:
43
- return self.fs.exists(path)
44
- except PermissionError:
45
- if self.debug:
46
- self.logger.debug(f"Permission denied checking existence of '{path}'")
47
- return False
48
- except Exception as e:
49
- self.logger.warning(f"Error checking existence of '{path}': {e}")
50
- return False
51
-
52
- def load_existing(self) -> Set[str]:
53
- """
54
- Load and cache existing manifest paths.
55
- """
56
- with self._lock:
57
- if self._loaded_paths is not None:
58
- return self._loaded_paths
59
-
60
- if not self._safe_exists(self.manifest_path):
61
- self._loaded_paths = set()
62
- return self._loaded_paths
63
-
64
- try:
65
- df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
66
- paths = (
67
- df.get("path", pd.Series(dtype=str))
68
- .dropna().astype(str)
69
- .loc[lambda s: s.str.strip().astype(bool)]
70
- )
71
- self._loaded_paths = set(paths.tolist())
72
- except Exception as e:
73
- self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
74
- self._loaded_paths = set()
75
-
76
- return self._loaded_paths
77
-
78
- def record(self, full_path: str) -> None:
79
- """
80
- Register a missing file path.
81
- """
82
- if not full_path or not isinstance(full_path, str):
83
- return
84
- with self._lock:
85
- self._new_records.append({"path": full_path})
86
-
87
- def save(self) -> None:
88
- """
89
- Merge new records into the manifest and write it out atomically.
90
- """
91
- with self._lock:
92
- # Build DataFrame of new entries
93
- new_df = pd.DataFrame(self._new_records)
94
- should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
95
- if new_df.empty and not should_overwrite:
96
- return
97
-
98
- # Clean new_df
99
- new_df = (
100
- new_df.get("path", pd.Series(dtype=str))
101
- .dropna().astype(str)
102
- .loc[lambda s: s.str.strip().astype(bool)]
103
- .to_frame()
104
- )
105
-
106
- # Merge or overwrite
107
- if should_overwrite:
108
- out_df = new_df
109
- else:
110
- try:
111
- old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
112
- old_paths = (
113
- old_df.get("path", pd.Series(dtype=str))
114
- .dropna().astype(str)
115
- .loc[lambda s: s.str.strip().astype(bool)]
116
- .to_frame()
117
- )
118
- out_df = pd.concat([old_paths, new_df], ignore_index=True)
119
- except Exception as e:
120
- self.logger.warning(f"Could not merge manifest, overwriting: {e}")
121
- out_df = new_df
122
-
123
- out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
124
-
125
- # Ensure parent dir
126
- parent = self.manifest_path.rsplit("/", 1)[0]
127
- try:
128
- self.fs.makedirs(parent, exist_ok=True)
129
- except Exception as e:
130
- self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
131
-
132
- # Write atomically: temp file + rename
133
- temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
134
- try:
135
- out_df.to_parquet(
136
- temp_path,
137
- filesystem=self.fs,
138
- index=False
139
- )
140
- # rename into place (atomic in most filesystems)
141
- self.fs.mv(temp_path, self.manifest_path, recursive=False)
142
- except Exception as e:
143
- self.logger.error(f"Failed to write or rename manifest: {e}")
144
- # Clean up temp if it exists
145
- try:
146
- if self.fs.exists(temp_path):
147
- self.fs.rm(temp_path, recursive=True)
148
- except Exception:
149
- pass
150
- raise
151
-
152
- # Reset memory & cache
153
- self._new_records.clear()
154
- self._loaded_paths = set(out_df["path"].tolist())
File without changes