sibi-dst 0.3.34__tar.gz → 0.3.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/PKG-INFO +2 -1
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/pyproject.toml +2 -1
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/data_wrapper.py +5 -1
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/parquet_saver.py +91 -18
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/storage_manager.py +1 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/README.md +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/_df_helper.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/airflow_manager.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/utils/log_utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sibi-dst
|
3
|
-
Version: 0.3.34
|
3
|
+
Version: 0.3.36
|
4
4
|
Summary: Data Science Toolkit
|
5
5
|
Author: Luis Valverde
|
6
6
|
Author-email: lvalverdeb@gmail.com
|
@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.12
|
11
11
|
Classifier: Programming Language :: Python :: 3.13
|
12
12
|
Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
|
13
|
+
Requires-Dist: boto3 (>=1.36.3,<2.0.0)
|
13
14
|
Requires-Dist: chardet (>=5.2.0,<6.0.0)
|
14
15
|
Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
|
15
16
|
Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "sibi-dst"
|
3
|
-
version = "0.3.34"
|
3
|
+
version = "0.3.36"
|
4
4
|
description = "Data Science Toolkit"
|
5
5
|
authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
|
6
6
|
readme = "README.md"
|
@@ -43,6 +43,7 @@ geopandas = "^1.0.1"
|
|
43
43
|
osmnx = "^2.0.1"
|
44
44
|
gunicorn = "^23.0.0"
|
45
45
|
uvicorn-worker = "^0.3.0"
|
46
|
+
boto3 = "^1.36.3"
|
46
47
|
|
47
48
|
|
48
49
|
[build-system]
|
@@ -205,6 +205,7 @@ class DataWrapper:
|
|
205
205
|
try:
|
206
206
|
# Get file info
|
207
207
|
info = self.fs.info(file_path)
|
208
|
+
self.logger.info(f"File info for {file_path}: {info}")
|
208
209
|
|
209
210
|
# Determine the modification time from available keys
|
210
211
|
file_modification_time = None
|
@@ -218,6 +219,7 @@ class DataWrapper:
|
|
218
219
|
info["LastModified"] if isinstance(info["LastModified"], datetime.datetime)
|
219
220
|
else datetime.datetime.strptime(info["LastModified"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
220
221
|
)
|
222
|
+
self.logger.info(f"S3 File modification time: {file_modification_datetime}")
|
221
223
|
else:
|
222
224
|
self.logger.warning(f"Modification time not available for {file_path}.")
|
223
225
|
return True # Assume file is too old if we cannot determine its age
|
@@ -233,10 +235,12 @@ class DataWrapper:
|
|
233
235
|
|
234
236
|
except FileNotFoundError:
|
235
237
|
self.logger.warning(f"File {file_path} not found.")
|
238
|
+
if self.ignore_missing:
|
239
|
+
return False
|
236
240
|
return True # File is considered old if it doesn't exist
|
237
241
|
except Exception as e:
|
238
242
|
self.logger.error(f"Error checking file age for {file_path}: {str(e)}")
|
239
|
-
return True
|
243
|
+
return True
|
240
244
|
|
241
245
|
def process_date(self, date: datetime.date):
|
242
246
|
"""
|
@@ -1,9 +1,17 @@
|
|
1
|
+
import base64
|
1
2
|
from pathlib import Path
|
2
3
|
from typing import Optional
|
3
4
|
|
4
5
|
import pyarrow as pa
|
5
6
|
import fsspec
|
7
|
+
import warnings
|
8
|
+
import hashlib
|
9
|
+
from s3fs import S3FileSystem
|
6
10
|
|
11
|
+
from fsspec import filesystem
|
12
|
+
|
13
|
+
# Suppress the specific UserWarning message
|
14
|
+
warnings.filterwarnings("ignore")
|
7
15
|
from sibi_dst.utils import Logger
|
8
16
|
|
9
17
|
|
@@ -20,8 +28,8 @@ class ParquetSaver:
|
|
20
28
|
self.parquet_storage_path = parquet_storage_path.rstrip("/")
|
21
29
|
self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
22
30
|
|
23
|
-
|
24
|
-
self.
|
31
|
+
self.fs = fs
|
32
|
+
self.protocol = self.parquet_storage_path.split(":")[0]
|
25
33
|
|
26
34
|
def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
|
27
35
|
"""
|
@@ -30,7 +38,7 @@ class ParquetSaver:
|
|
30
38
|
:param clear_existing: Whether to clear existing files in the target directory.
|
31
39
|
"""
|
32
40
|
full_path = self._construct_full_path(parquet_filename)
|
33
|
-
|
41
|
+
self.logger.info(f"Save method for :{full_path}")
|
34
42
|
# Ensure directory exists and clear if necessary
|
35
43
|
self._ensure_directory_exists(full_path, clear_existing=clear_existing)
|
36
44
|
|
@@ -38,7 +46,9 @@ class ParquetSaver:
|
|
38
46
|
schema = self._define_schema()
|
39
47
|
self._convert_dtypes(schema)
|
40
48
|
self._save_dataframe_to_parquet(full_path, schema)
|
41
|
-
|
49
|
+
# Close the filesystem if the close method exists
|
50
|
+
if hasattr(self.fs, 'close') and callable(getattr(self.fs, 'close', None)):
|
51
|
+
self.fs.close()
|
42
52
|
|
43
53
|
def _define_schema(self) -> pa.Schema:
|
44
54
|
"""Define a PyArrow schema dynamically based on df_result column types."""
|
@@ -97,27 +107,90 @@ class ParquetSaver:
|
|
97
107
|
:param clear_existing: Whether to clear existing files/directories.
|
98
108
|
"""
|
99
109
|
directory = "/".join(full_path.split("/")[:-1])
|
100
|
-
|
101
110
|
if self.fs.exists(directory):
|
111
|
+
self.logger.info(f"Directory already exists: {directory}")
|
102
112
|
if clear_existing:
|
103
|
-
self.
|
104
|
-
|
105
|
-
|
113
|
+
self._clear_directory(directory)
|
114
|
+
|
115
|
+
if not self.fs.exists(directory):
|
106
116
|
self.logger.info(f"Creating directory: {directory}")
|
107
117
|
self.fs.mkdirs(directory, exist_ok=True)
|
108
118
|
|
109
119
|
def _save_dataframe_to_parquet(self, full_path: str, schema: pa.Schema):
|
110
120
|
"""Save the DataFrame to Parquet using the specified schema."""
|
111
|
-
|
112
|
-
|
113
|
-
self.fs.rm(full_path, recursive=True)
|
114
|
-
|
115
|
-
self.logger.info(f"Saving Parquet file to: {full_path}")
|
121
|
+
#self._clear_directory(full_path)
|
122
|
+
self.logger.info(f"Saving/Overwrite Parquet file to: {full_path}")
|
116
123
|
self.df_result.to_parquet(
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
124
|
+
path=full_path,
|
125
|
+
engine="pyarrow",
|
126
|
+
schema=schema,
|
127
|
+
#overwrite=True,
|
128
|
+
filesystem=self.fs,
|
129
|
+
write_index=False,
|
122
130
|
)
|
123
131
|
|
132
|
+
def _clear_directory(self, directory: str):
|
133
|
+
"""
|
134
|
+
Clears the specified directory by removing all the files within it. If the
|
135
|
+
directory is not associated with the "s3" protocol, it will be removed using
|
136
|
+
the local filesystem's functionality. For directories using the "s3" protocol,
|
137
|
+
the bucket name and prefix are parsed, and files are deleted using the S3
|
138
|
+
client's `delete_objects` method.
|
139
|
+
|
140
|
+
:param directory: The directory path to clear. If the protocol is not "s3", it
|
141
|
+
represents a local filesystem path. Otherwise, it is assumed
|
142
|
+
to be an S3 path (e.g., "s3://bucket_name/prefix").
|
143
|
+
:type directory: str
|
144
|
+
"""
|
145
|
+
if self.protocol != "s3":
|
146
|
+
if self.fs.exists(directory):
|
147
|
+
self.logger.info(f"Clearing directory: {directory}")
|
148
|
+
self.fs.rm(directory, recursive=True)
|
149
|
+
return
|
150
|
+
# Parse bucket name and prefix
|
151
|
+
bucket_name, prefix = self._parse_s3_path(directory)
|
152
|
+
|
153
|
+
# List files in the directory
|
154
|
+
files = self.fs.ls(directory, detail=True)
|
155
|
+
if not files:
|
156
|
+
self.logger.info(f"No files to delete in directory: {directory}")
|
157
|
+
return
|
158
|
+
|
159
|
+
# Generate the delete payload
|
160
|
+
objects_to_delete = [{"Key": file["name"].replace(f"{bucket_name}/", "", 1)} for file in files]
|
161
|
+
delete_payload = {
|
162
|
+
"Objects": objects_to_delete,
|
163
|
+
"Quiet": True,
|
164
|
+
}
|
165
|
+
|
166
|
+
# Calculate Content-MD5
|
167
|
+
payload_bytes = str(delete_payload).encode("utf-8")
|
168
|
+
md5_hash = hashlib.md5(payload_bytes).digest()
|
169
|
+
content_md5 = base64.b64encode(md5_hash).decode("utf-8")
|
170
|
+
|
171
|
+
# Use the underlying s3 client to delete objects
|
172
|
+
try:
|
173
|
+
self.fs.s3.delete_objects(
|
174
|
+
Bucket=bucket_name,
|
175
|
+
Delete=delete_payload,
|
176
|
+
ContentMD5=content_md5,
|
177
|
+
)
|
178
|
+
self.logger.info(f"Successfully deleted files in {directory}.")
|
179
|
+
except Exception as e:
|
180
|
+
self.logger.error(f"Failed to delete objects in {directory}: {e}")
|
181
|
+
pass
|
182
|
+
|
183
|
+
@staticmethod
|
184
|
+
def _parse_s3_path(s3_path: str):
|
185
|
+
"""
|
186
|
+
Parse an S3 path into bucket name and prefix.
|
187
|
+
:param s3_path: Full S3 path (e.g., s3://bucket-name/path/).
|
188
|
+
:return: Tuple of bucket name and prefix.
|
189
|
+
"""
|
190
|
+
if not s3_path.startswith("s3://"):
|
191
|
+
raise ValueError("Invalid S3 path. Must start with 's3://'.")
|
192
|
+
path_parts = s3_path[5:].split("/", 1)
|
193
|
+
bucket_name = path_parts[0]
|
194
|
+
prefix = path_parts[1] if len(path_parts) > 1 else ""
|
195
|
+
return bucket_name, prefix
|
196
|
+
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/django/_sql_model_builder.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py
RENAMED
File without changes
|
{sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-0.3.34 → sibi_dst-0.3.36}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|