sibi-dst 0.3.35__py3-none-any.whl → 0.3.37__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- sibi_dst/utils/data_wrapper.py +5 -1
- sibi_dst/utils/parquet_saver.py +88 -17
- sibi_dst/utils/storage_manager.py +1 -0
- {sibi_dst-0.3.35.dist-info → sibi_dst-0.3.37.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.35.dist-info → sibi_dst-0.3.37.dist-info}/RECORD +6 -6
- {sibi_dst-0.3.35.dist-info → sibi_dst-0.3.37.dist-info}/WHEEL +0 -0
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -205,6 +205,7 @@ class DataWrapper:
|
|
205
205
|
try:
|
206
206
|
# Get file info
|
207
207
|
info = self.fs.info(file_path)
|
208
|
+
self.logger.info(f"File info for {file_path}: {info}")
|
208
209
|
|
209
210
|
# Determine the modification time from available keys
|
210
211
|
file_modification_time = None
|
@@ -218,6 +219,7 @@ class DataWrapper:
|
|
218
219
|
info["LastModified"] if isinstance(info["LastModified"], datetime.datetime)
|
219
220
|
else datetime.datetime.strptime(info["LastModified"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
220
221
|
)
|
222
|
+
self.logger.info(f"S3 File modification time: {file_modification_datetime}")
|
221
223
|
else:
|
222
224
|
self.logger.warning(f"Modification time not available for {file_path}.")
|
223
225
|
return True # Assume file is too old if we cannot determine its age
|
@@ -233,10 +235,12 @@ class DataWrapper:
|
|
233
235
|
|
234
236
|
except FileNotFoundError:
|
235
237
|
self.logger.warning(f"File {file_path} not found.")
|
238
|
+
if self.ignore_missing:
|
239
|
+
return False
|
236
240
|
return True # File is considered old if it doesn't exist
|
237
241
|
except Exception as e:
|
238
242
|
self.logger.error(f"Error checking file age for {file_path}: {str(e)}")
|
239
|
-
return True
|
243
|
+
return True
|
240
244
|
|
241
245
|
def process_date(self, date: datetime.date):
|
242
246
|
"""
|
sibi_dst/utils/parquet_saver.py
CHANGED
@@ -1,9 +1,17 @@
|
|
1
|
+
import base64
|
1
2
|
from pathlib import Path
|
2
3
|
from typing import Optional
|
3
4
|
|
4
5
|
import pyarrow as pa
|
5
6
|
import fsspec
|
7
|
+
import warnings
|
8
|
+
import hashlib
|
9
|
+
from s3fs import S3FileSystem
|
6
10
|
|
11
|
+
from fsspec import filesystem
|
12
|
+
|
13
|
+
# Suppress the specific UserWarning message
|
14
|
+
warnings.filterwarnings("ignore")
|
7
15
|
from sibi_dst.utils import Logger
|
8
16
|
|
9
17
|
|
@@ -20,8 +28,8 @@ class ParquetSaver:
|
|
20
28
|
self.parquet_storage_path = parquet_storage_path.rstrip("/")
|
21
29
|
self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
22
30
|
|
23
|
-
|
24
|
-
self.
|
31
|
+
self.fs = fs
|
32
|
+
self.protocol = self.parquet_storage_path.split(":")[0]
|
25
33
|
|
26
34
|
def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
|
27
35
|
"""
|
@@ -30,7 +38,7 @@ class ParquetSaver:
|
|
30
38
|
:param clear_existing: Whether to clear existing files in the target directory.
|
31
39
|
"""
|
32
40
|
full_path = self._construct_full_path(parquet_filename)
|
33
|
-
|
41
|
+
self.logger.info(f"Save method for :{full_path}")
|
34
42
|
# Ensure directory exists and clear if necessary
|
35
43
|
self._ensure_directory_exists(full_path, clear_existing=clear_existing)
|
36
44
|
|
@@ -99,27 +107,90 @@ class ParquetSaver:
|
|
99
107
|
:param clear_existing: Whether to clear existing files/directories.
|
100
108
|
"""
|
101
109
|
directory = "/".join(full_path.split("/")[:-1])
|
102
|
-
|
103
110
|
if self.fs.exists(directory):
|
111
|
+
self.logger.info(f"Directory already exists: {directory}")
|
104
112
|
if clear_existing:
|
105
|
-
self.
|
106
|
-
|
107
|
-
|
113
|
+
self._clear_directory(directory)
|
114
|
+
|
115
|
+
if not self.fs.exists(directory):
|
108
116
|
self.logger.info(f"Creating directory: {directory}")
|
109
117
|
self.fs.mkdirs(directory, exist_ok=True)
|
110
118
|
|
111
119
|
def _save_dataframe_to_parquet(self, full_path: str, schema: pa.Schema):
|
112
120
|
"""Save the DataFrame to Parquet using the specified schema."""
|
113
|
-
|
114
|
-
|
115
|
-
self.fs.rm(full_path, recursive=True)
|
116
|
-
|
117
|
-
self.logger.info(f"Saving Parquet file to: {full_path}")
|
121
|
+
#self._clear_directory(full_path)
|
122
|
+
self.logger.info(f"Saving/Overwrite Parquet file to: {full_path}")
|
118
123
|
self.df_result.to_parquet(
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
+
path=full_path,
|
125
|
+
engine="pyarrow",
|
126
|
+
schema=schema,
|
127
|
+
#overwrite=True,
|
128
|
+
filesystem=self.fs,
|
129
|
+
write_index=False,
|
124
130
|
)
|
125
131
|
|
132
|
+
def _clear_directory(self, directory: str):
|
133
|
+
"""
|
134
|
+
Clears the specified directory by removing all the files within it. If the
|
135
|
+
directory is not associated with the "s3" protocol, it will be removed using
|
136
|
+
the local filesystem's functionality. For directories using the "s3" protocol,
|
137
|
+
the bucket name and prefix are parsed, and files are deleted using the S3
|
138
|
+
client's `delete_objects` method.
|
139
|
+
|
140
|
+
:param directory: The directory path to clear. If the protocol is not "s3", it
|
141
|
+
represents a local filesystem path. Otherwise, it is assumed
|
142
|
+
to be an S3 path (e.g., "s3://bucket_name/prefix").
|
143
|
+
:type directory: str
|
144
|
+
"""
|
145
|
+
if self.protocol != "s3":
|
146
|
+
if self.fs.exists(directory):
|
147
|
+
self.logger.info(f"Clearing directory: {directory}")
|
148
|
+
self.fs.rm(directory, recursive=True)
|
149
|
+
return
|
150
|
+
# Parse bucket name and prefix
|
151
|
+
bucket_name, prefix = self._parse_s3_path(directory)
|
152
|
+
|
153
|
+
# List files in the directory
|
154
|
+
files = self.fs.ls(directory, detail=True)
|
155
|
+
if not files:
|
156
|
+
self.logger.info(f"No files to delete in directory: {directory}")
|
157
|
+
return
|
158
|
+
|
159
|
+
# Generate the delete payload
|
160
|
+
objects_to_delete = [{"Key": file["name"].replace(f"{bucket_name}/", "", 1)} for file in files]
|
161
|
+
delete_payload = {
|
162
|
+
"Objects": objects_to_delete,
|
163
|
+
"Quiet": True,
|
164
|
+
}
|
165
|
+
|
166
|
+
# Calculate Content-MD5
|
167
|
+
payload_bytes = str(delete_payload).encode("utf-8")
|
168
|
+
md5_hash = hashlib.md5(payload_bytes).digest()
|
169
|
+
content_md5 = base64.b64encode(md5_hash).decode("utf-8")
|
170
|
+
|
171
|
+
# Use the underlying s3 client to delete objects
|
172
|
+
try:
|
173
|
+
self.fs.s3.delete_objects(
|
174
|
+
Bucket=bucket_name,
|
175
|
+
Delete=delete_payload,
|
176
|
+
ContentMD5=content_md5,
|
177
|
+
)
|
178
|
+
self.logger.info(f"Successfully deleted files in {directory}.")
|
179
|
+
except Exception as e:
|
180
|
+
self.logger.error(f"Failed to delete objects in {directory}: {e}")
|
181
|
+
pass
|
182
|
+
|
183
|
+
@staticmethod
|
184
|
+
def _parse_s3_path(s3_path: str):
|
185
|
+
"""
|
186
|
+
Parse an S3 path into bucket name and prefix.
|
187
|
+
:param s3_path: Full S3 path (e.g., s3://bucket-name/path/).
|
188
|
+
:return: Tuple of bucket name and prefix.
|
189
|
+
"""
|
190
|
+
if not s3_path.startswith("s3://"):
|
191
|
+
raise ValueError("Invalid S3 path. Must start with 's3://'.")
|
192
|
+
path_parts = s3_path[5:].split("/", 1)
|
193
|
+
bucket_name = path_parts[0]
|
194
|
+
prefix = path_parts[1] if len(path_parts) > 1 else ""
|
195
|
+
return bucket_name, prefix
|
196
|
+
|
@@ -42,14 +42,14 @@ sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWd
|
|
42
42
|
sibi_dst/utils/clickhouse_writer.py,sha256=syXGN9NG1FS8soHuMj6QNRqTRWi-thuYUF-_BWDc_KI,9883
|
43
43
|
sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
|
44
44
|
sibi_dst/utils/data_utils.py,sha256=j-lEKt6EJL2fm0z7adcjtVG7yFYLRpQL8xSgh2CVmJg,8769
|
45
|
-
sibi_dst/utils/data_wrapper.py,sha256=
|
45
|
+
sibi_dst/utils/data_wrapper.py,sha256=V9p1CKvlmT3yyfZ5d2BIJSvz0Ee1TIA4hufpTIYo6v8,16684
|
46
46
|
sibi_dst/utils/date_utils.py,sha256=ei7WgzIUk1tRa3sHniaVm_lNmfTGq12b_HzmMV91k18,12407
|
47
47
|
sibi_dst/utils/df_utils.py,sha256=OFEtcwVKIilvf9qVf-IfIOHp4jcFAHX5l2IDGudhPZg,10989
|
48
48
|
sibi_dst/utils/file_utils.py,sha256=JpsybYj3XvVJisSBeVU6YSaZnYRm4_6YWTI3TLnnY4Y,1257
|
49
49
|
sibi_dst/utils/filepath_generator.py,sha256=volVm0SSlBrtZp1RpTHxyui5rj5asNcVsWEBRY5FOUQ,6673
|
50
50
|
sibi_dst/utils/log_utils.py,sha256=XUbeXa1JsOlcEJyW8jnBlWo295rLUnuYi-HMzyhHwJg,3145
|
51
|
-
sibi_dst/utils/parquet_saver.py,sha256=
|
52
|
-
sibi_dst/utils/storage_manager.py,sha256
|
53
|
-
sibi_dst-0.3.
|
54
|
-
sibi_dst-0.3.
|
55
|
-
sibi_dst-0.3.
|
51
|
+
sibi_dst/utils/parquet_saver.py,sha256=_QkXL3IiC2b4m7sxHpCSeqPwBWxXeiP5sH_WheSMEm4,8042
|
52
|
+
sibi_dst/utils/storage_manager.py,sha256=-zlMrRo_6o6mCd_OHknKqNQl7m0I9VW89grDAUO1V5c,4229
|
53
|
+
sibi_dst-0.3.37.dist-info/METADATA,sha256=A09_8PykbkfOjpCHIpiij4Wn-nlS2tU6gsR0-MngZUQ,2564
|
54
|
+
sibi_dst-0.3.37.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
55
|
+
sibi_dst-0.3.37.dist-info/RECORD,,
|
File without changes
|