sibi-dst 0.3.35__py3-none-any.whl → 0.3.36__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
sibi_dst/utils/data_wrapper.py

@@ -205,6 +205,7 @@ class DataWrapper:
         try:
             # Get file info
             info = self.fs.info(file_path)
+            self.logger.info(f"File info for {file_path}: {info}")

             # Determine the modification time from available keys
             file_modification_time = None
@@ -218,6 +219,7 @@ class DataWrapper:
                     info["LastModified"] if isinstance(info["LastModified"], datetime.datetime)
                     else datetime.datetime.strptime(info["LastModified"], "%Y-%m-%dT%H:%M:%S.%fZ")
                 )
+                self.logger.info(f"S3 File modification time: {file_modification_datetime}")
             else:
                 self.logger.warning(f"Modification time not available for {file_path}.")
                 return True  # Assume file is too old if we cannot determine its age
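
A note on the hunk above: fs.info() may return "LastModified" either as a datetime object (as s3fs does) or as an ISO-8601 string, which is why the code branches before calling strptime. A minimal standalone sketch of that normalization; the sample timestamp is illustrative, not taken from the package:

    import datetime

    def normalize_last_modified(value):
        # datetime objects pass through; strings like "2024-01-31T12:00:00.000Z" are parsed.
        if isinstance(value, datetime.datetime):
            return value
        return datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%fZ")

    print(normalize_last_modified("2024-01-31T12:00:00.000Z"))  # -> 2024-01-31 12:00:00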
@@ -233,10 +235,12 @@ class DataWrapper:

         except FileNotFoundError:
             self.logger.warning(f"File {file_path} not found.")
+            if self.ignore_missing:
+                return False
             return True  # File is considered old if it doesn't exist
         except Exception as e:
             self.logger.error(f"Error checking file age for {file_path}: {str(e)}")
-            return True #
+            return True

     def process_date(self, date: datetime.date):
         """
sibi_dst/utils/parquet_saver.py

@@ -1,9 +1,17 @@
+import base64
 from pathlib import Path
 from typing import Optional

 import pyarrow as pa
 import fsspec
+import warnings
+import hashlib
+from s3fs import S3FileSystem

+from fsspec import filesystem
+
+# Suppress the specific UserWarning message
+warnings.filterwarnings("ignore")
 from sibi_dst.utils import Logger

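One caveat on the import hunk: despite the comment, warnings.filterwarnings("ignore") with no further arguments suppresses every warning in the process, not one specific UserWarning. A narrower filter would look like the sketch below; the message pattern is a placeholder, since the diff does not say which warning was being silenced:

    import warnings

    # Ignore only UserWarnings whose message matches the given regex.
    warnings.filterwarnings(
        "ignore",
        message=r".*placeholder pattern.*",
        category=UserWarning,
    )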
@@ -20,8 +28,8 @@ class ParquetSaver:
         self.parquet_storage_path = parquet_storage_path.rstrip("/")
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

-        # Default to the local filesystem if `fs` is not provided
-        self.fs = fs or fsspec.filesystem("file")
+        self.fs = fs
+        self.protocol = self.parquet_storage_path.split(":")[0]

     def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
         """
@@ -30,7 +38,7 @@ class ParquetSaver:
         :param clear_existing: Whether to clear existing files in the target directory.
         """
         full_path = self._construct_full_path(parquet_filename)
-
+        self.logger.info(f"Save method for :{full_path}")
         # Ensure directory exists and clear if necessary
         self._ensure_directory_exists(full_path, clear_existing=clear_existing)

@@ -99,27 +107,90 @@ class ParquetSaver:
         :param clear_existing: Whether to clear existing files/directories.
         """
         directory = "/".join(full_path.split("/")[:-1])
-
         if self.fs.exists(directory):
+            self.logger.info(f"Directory already exists: {directory}")
             if clear_existing:
-                self.logger.info(f"Clearing existing directory: {directory}")
-                self.fs.rm(directory, recursive=True)
-        else:
+                self._clear_directory(directory)
+
+        if not self.fs.exists(directory):
             self.logger.info(f"Creating directory: {directory}")
             self.fs.mkdirs(directory, exist_ok=True)

     def _save_dataframe_to_parquet(self, full_path: str, schema: pa.Schema):
         """Save the DataFrame to Parquet using the specified schema."""
-        if self.fs.exists(full_path):
-            self.logger.info(f"Overwriting existing file: {full_path}")
-            self.fs.rm(full_path, recursive=True)
-
-        self.logger.info(f"Saving Parquet file to: {full_path}")
+        #self._clear_directory(full_path)
+        self.logger.info(f"Saving/Overwrite Parquet file to: {full_path}")
         self.df_result.to_parquet(
-            full_path,
-            engine="pyarrow",
-            schema=schema,
-            storage_options=self.fs.storage_options if hasattr(self.fs, "storage_options") else None,
-            write_index=False,
+            path=full_path,
+            engine="pyarrow",
+            schema=schema,
+            #overwrite=True,
+            filesystem=self.fs,
+            write_index=False,
         )

+    def _clear_directory(self, directory: str):
+        """
+        Clears the specified directory by removing all the files within it. If the
+        directory is not associated with the "s3" protocol, it will be removed using
+        the local filesystem's functionality. For directories using the "s3" protocol,
+        the bucket name and prefix are parsed, and files are deleted using the S3
+        client's `delete_objects` method.
+
+        :param directory: The directory path to clear. If the protocol is not "s3", it
+                          represents a local filesystem path. Otherwise, it is assumed
+                          to be an S3 path (e.g., "s3://bucket_name/prefix").
+        :type directory: str
+        """
+        if self.protocol != "s3":
+            if self.fs.exists(directory):
+                self.logger.info(f"Clearing directory: {directory}")
+                self.fs.rm(directory, recursive=True)
+            return
+        # Parse bucket name and prefix
+        bucket_name, prefix = self._parse_s3_path(directory)
+
+        # List files in the directory
+        files = self.fs.ls(directory, detail=True)
+        if not files:
+            self.logger.info(f"No files to delete in directory: {directory}")
+            return
+
+        # Generate the delete payload
+        objects_to_delete = [{"Key": file["name"].replace(f"{bucket_name}/", "", 1)} for file in files]
+        delete_payload = {
+            "Objects": objects_to_delete,
+            "Quiet": True,
+        }
+
+        # Calculate Content-MD5
+        payload_bytes = str(delete_payload).encode("utf-8")
+        md5_hash = hashlib.md5(payload_bytes).digest()
+        content_md5 = base64.b64encode(md5_hash).decode("utf-8")
+
+        # Use the underlying s3 client to delete objects
+        try:
+            self.fs.s3.delete_objects(
+                Bucket=bucket_name,
+                Delete=delete_payload,
+                ContentMD5=content_md5,
+            )
+            self.logger.info(f"Successfully deleted files in {directory}.")
+        except Exception as e:
+            self.logger.error(f"Failed to delete objects in {directory}: {e}")
+            pass
+
+    @staticmethod
+    def _parse_s3_path(s3_path: str):
+        """
+        Parse an S3 path into bucket name and prefix.
+        :param s3_path: Full S3 path (e.g., s3://bucket-name/path/).
+        :return: Tuple of bucket name and prefix.
+        """
+        if not s3_path.startswith("s3://"):
+            raise ValueError("Invalid S3 path. Must start with 's3://'.")
+        path_parts = s3_path[5:].split("/", 1)
+        bucket_name = path_parts[0]
+        prefix = path_parts[1] if len(path_parts) > 1 else ""
+        return bucket_name, prefix
+
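
For reference, _parse_s3_path splits only on the first "/" after the scheme, so nested prefixes survive intact, and the Content-MD5 value built in _clear_directory follows the standard formula S3 expects for DeleteObjects: the base64-encoded MD5 digest of the request body. A quick usage sketch of the parsing logic, with a made-up bucket and prefix:

    def parse_s3_path(s3_path: str):
        # Mirrors ParquetSaver._parse_s3_path from the hunk above.
        if not s3_path.startswith("s3://"):
            raise ValueError("Invalid S3 path. Must start with 's3://'.")
        path_parts = s3_path[5:].split("/", 1)
        bucket_name = path_parts[0]
        prefix = path_parts[1] if len(path_parts) > 1 else ""
        return bucket_name, prefix

    print(parse_s3_path("s3://my-bucket/depot/2024/01/"))
    # -> ('my-bucket', 'depot/2024/01/')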
sibi_dst/utils/storage_manager.py

@@ -15,6 +15,7 @@ class StorageManager:
         self.storage_path = storage_path.rstrip("/")
         self.fs_type = fs_type
         self.fs_options = fs_options or {}
+
         self.fs = fsspec.filesystem(fs_type, **self.fs_options)
         self.depot_paths = {}
         self.depot_name = None
sibi_dst-0.3.35.dist-info/METADATA → sibi_dst-0.3.36.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.35
+Version: 0.3.36
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
+Requires-Dist: boto3 (>=1.36.3,<2.0.0)
 Requires-Dist: chardet (>=5.2.0,<6.0.0)
 Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
sibi_dst-0.3.35.dist-info/RECORD → sibi_dst-0.3.36.dist-info/RECORD

@@ -42,14 +42,14 @@ sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWd
 sibi_dst/utils/clickhouse_writer.py,sha256=syXGN9NG1FS8soHuMj6QNRqTRWi-thuYUF-_BWDc_KI,9883
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
 sibi_dst/utils/data_utils.py,sha256=j-lEKt6EJL2fm0z7adcjtVG7yFYLRpQL8xSgh2CVmJg,8769
-sibi_dst/utils/data_wrapper.py,sha256=Ybmn9V7XYuPdUliMz1QrwrXraR_YFOzSr2zJmZOVmWM,16462
+sibi_dst/utils/data_wrapper.py,sha256=V9p1CKvlmT3yyfZ5d2BIJSvz0Ee1TIA4hufpTIYo6v8,16684
 sibi_dst/utils/date_utils.py,sha256=ei7WgzIUk1tRa3sHniaVm_lNmfTGq12b_HzmMV91k18,12407
 sibi_dst/utils/df_utils.py,sha256=OFEtcwVKIilvf9qVf-IfIOHp4jcFAHX5l2IDGudhPZg,10989
 sibi_dst/utils/file_utils.py,sha256=JpsybYj3XvVJisSBeVU6YSaZnYRm4_6YWTI3TLnnY4Y,1257
 sibi_dst/utils/filepath_generator.py,sha256=volVm0SSlBrtZp1RpTHxyui5rj5asNcVsWEBRY5FOUQ,6673
 sibi_dst/utils/log_utils.py,sha256=XUbeXa1JsOlcEJyW8jnBlWo295rLUnuYi-HMzyhHwJg,3145
-sibi_dst/utils/parquet_saver.py,sha256=Wj3jDlAXHACma6ea1zfuH0I3AJS2OVuwBG3lY2yZ27k,5238
-sibi_dst/utils/storage_manager.py,sha256=qHo5vTv-dr1roRr_mOcprSTdlAfH4Q2Dy5tQUz06Pnk,4228
-sibi_dst-0.3.35.dist-info/METADATA,sha256=emeZrFZZi6ixveE_vAwcb_ZD8AU24YnepxB6H1kMSb4,2564
-sibi_dst-0.3.35.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-sibi_dst-0.3.35.dist-info/RECORD,,
+sibi_dst/utils/parquet_saver.py,sha256=_QkXL3IiC2b4m7sxHpCSeqPwBWxXeiP5sH_WheSMEm4,8042
+sibi_dst/utils/storage_manager.py,sha256=-zlMrRo_6o6mCd_OHknKqNQl7m0I9VW89grDAUO1V5c,4229
+sibi_dst-0.3.36.dist-info/METADATA,sha256=B85r0xqRCktxcXQhMwjCzLJO9-c5G-pU7a7jfkrcqmQ,2603
+sibi_dst-0.3.36.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-0.3.36.dist-info/RECORD,,